更新bs4-script.py,支持使用playwright和requests两种方式获取网页内容,并添加命令行参数解析
This commit is contained in:
@@ -9,6 +9,7 @@ import json
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import requests
|
import requests
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
|
|
||||||
def clean_text(text):
|
def clean_text(text):
|
||||||
@@ -81,8 +82,14 @@ def extract_station_info(html_content):
|
|||||||
return stations
|
return stations
|
||||||
|
|
||||||
|
|
||||||
def get_content(use_local=True, url='https://network.satnogs.org/stations/?page=3'):
|
def get_content(use_local=True, url='https://network.satnogs.org/stations/?page=3', method='requests'):
|
||||||
"""获取页面内容,可以从本地文件或网络获取"""
|
"""
|
||||||
|
获取页面内容
|
||||||
|
:param use_local: 是否使用本地文件
|
||||||
|
:param url: 要抓取的URL
|
||||||
|
:param method: 获取方式,可选值:'requests'或'playwright'
|
||||||
|
:return: 页面内容
|
||||||
|
"""
|
||||||
if use_local:
|
if use_local:
|
||||||
try:
|
try:
|
||||||
with open('source/satnogs.html', 'r', encoding='utf-8') as file:
|
with open('source/satnogs.html', 'r', encoding='utf-8') as file:
|
||||||
@@ -92,22 +99,49 @@ def get_content(use_local=True, url='https://network.satnogs.org/stations/?page=
|
|||||||
use_local = False
|
use_local = False
|
||||||
|
|
||||||
if not use_local:
|
if not use_local:
|
||||||
try:
|
if method == 'requests':
|
||||||
response = requests.get(url)
|
try:
|
||||||
response.raise_for_status()
|
response = requests.get(url)
|
||||||
print("成功从网络获取数据")
|
response.raise_for_status()
|
||||||
return response.text
|
print("成功使用requests从网络获取数据")
|
||||||
except requests.RequestException as e:
|
return response.text
|
||||||
print(f"获取网页数据时出错: {e}")
|
except requests.RequestException as e:
|
||||||
|
print(f"使用requests获取网页数据时出错: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
elif method == 'playwright':
|
||||||
|
try:
|
||||||
|
with sync_playwright() as p:
|
||||||
|
browser = p.chromium.launch(headless=False)
|
||||||
|
page = browser.new_page()
|
||||||
|
page.goto(url)
|
||||||
|
# 等待页面加载完成
|
||||||
|
page.wait_for_selector('tr.station-row')
|
||||||
|
content = page.content()
|
||||||
|
browser.close()
|
||||||
|
print("成功使用playwright从网络获取数据")
|
||||||
|
return content
|
||||||
|
except Exception as e:
|
||||||
|
print(f"使用playwright获取网页数据时出错: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
else:
|
||||||
|
print(f"不支持的获取方式: {method}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# 设置数据源 (True为本地文件,False为网络请求)
|
import argparse
|
||||||
use_local = False
|
|
||||||
|
# 创建命令行参数解析器
|
||||||
|
parser = argparse.ArgumentParser(description='抓取SatNOGS网站站点信息')
|
||||||
|
parser.add_argument('--local', action='store_true', help='使用本地文件')
|
||||||
|
parser.add_argument('--method', choices=['requests', 'playwright'], default='requests',
|
||||||
|
help='选择获取内容的方式: requests或playwright')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
# 获取内容
|
# 获取内容
|
||||||
content = get_content(use_local)
|
content = get_content(use_local=args.local, method=args.method)
|
||||||
if not content:
|
if not content:
|
||||||
print("无法获取数据,程序退出")
|
print("无法获取数据,程序退出")
|
||||||
return
|
return
|
||||||
@@ -170,6 +204,19 @@ def main():
|
|||||||
print(f"最低海拔: {min(altitudes)}m")
|
print(f"最低海拔: {min(altitudes)}m")
|
||||||
print(f"平均海拔: {sum(altitudes) / len(altitudes):.1f}m")
|
print(f"平均海拔: {sum(altitudes) / len(altitudes):.1f}m")
|
||||||
|
|
||||||
|
print("\n等待10秒后退出")
|
||||||
|
import time
|
||||||
|
time.sleep(10)
|
||||||
|
|
||||||
|
"""
|
||||||
|
# 使用playwright方式抓取
|
||||||
|
python bs4-script.py --method playwright
|
||||||
|
|
||||||
|
# 使用requests方式抓取
|
||||||
|
python bs4-script.py --method requests
|
||||||
|
|
||||||
|
# 使用本地文件
|
||||||
|
python bs4-script.py --local
|
||||||
|
"""
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
Reference in New Issue
Block a user