From 0a676c0ea96c7121891199001171f6b828233197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=A5=E6=B5=A9?= Date: Thu, 16 Jan 2025 16:12:49 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0bs4-script.py=EF=BC=8C?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E4=BD=BF=E7=94=A8playwright=E5=92=8Crequests?= =?UTF-8?q?=E4=B8=A4=E7=A7=8D=E6=96=B9=E5=BC=8F=E8=8E=B7=E5=8F=96=E7=BD=91?= =?UTF-8?q?=E9=A1=B5=E5=86=85=E5=AE=B9=EF=BC=8C=E5=B9=B6=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E5=91=BD=E4=BB=A4=E8=A1=8C=E5=8F=82=E6=95=B0=E8=A7=A3=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bs4-script.py | 71 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/bs4-script.py b/bs4-script.py index 57e3142..b122b05 100644 --- a/bs4-script.py +++ b/bs4-script.py @@ -9,6 +9,7 @@ import json import re import os import requests +from playwright.sync_api import sync_playwright def clean_text(text): @@ -81,8 +82,14 @@ def extract_station_info(html_content): return stations -def get_content(use_local=True, url='https://network.satnogs.org/stations/?page=3'): - """获取页面内容,可以从本地文件或网络获取""" +def get_content(use_local=True, url='https://network.satnogs.org/stations/?page=3', method='requests'): + """ + 获取页面内容 + :param use_local: 是否使用本地文件 + :param url: 要抓取的URL + :param method: 获取方式,可选值:'requests'或'playwright' + :return: 页面内容 + """ if use_local: try: with open('source/satnogs.html', 'r', encoding='utf-8') as file: @@ -92,22 +99,49 @@ def get_content(use_local=True, url='https://network.satnogs.org/stations/?page= use_local = False if not use_local: - try: - response = requests.get(url) - response.raise_for_status() - print("成功从网络获取数据") - return response.text - except requests.RequestException as e: - print(f"获取网页数据时出错: {e}") + if method == 'requests': + try: + response = requests.get(url) + response.raise_for_status() + print("成功使用requests从网络获取数据") + return response.text + except requests.RequestException as e: + print(f"使用requests获取网页数据时出错: {e}") + return None + + elif method == 'playwright': + try: + with sync_playwright() as p: + browser = p.chromium.launch(headless=False) + page = browser.new_page() + page.goto(url) + # 等待页面加载完成 + page.wait_for_selector('tr.station-row') + content = page.content() + browser.close() + print("成功使用playwright从网络获取数据") + return content + except Exception as e: + print(f"使用playwright获取网页数据时出错: {e}") + return None + + else: + print(f"不支持的获取方式: {method}") return None def main(): - # 设置数据源 (True为本地文件,False为网络请求) - use_local = False + import argparse + + # 创建命令行参数解析器 + parser = argparse.ArgumentParser(description='抓取SatNOGS网站站点信息') + parser.add_argument('--local', action='store_true', help='使用本地文件') + parser.add_argument('--method', choices=['requests', 'playwright'], default='requests', + help='选择获取内容的方式: requests或playwright') + args = parser.parse_args() # 获取内容 - content = get_content(use_local) + content = get_content(use_local=args.local, method=args.method) if not content: print("无法获取数据,程序退出") return @@ -170,6 +204,19 @@ def main(): print(f"最低海拔: {min(altitudes)}m") print(f"平均海拔: {sum(altitudes) / len(altitudes):.1f}m") + print("\n等待10秒后退出") + import time + time.sleep(10) +""" +# 使用playwright方式抓取 +python bs4-script.py --method playwright + +# 使用requests方式抓取 +python bs4-script.py --method requests + +# 使用本地文件 +python bs4-script.py --local +""" if __name__ == "__main__": main()