"""
Prompt: in the file result-script.py, create a Python script that scrapes every
station listed on the page together with each station's details.

with open('source/satnogs.html', 'r', encoding='utf-8') as file:
    content = file.read()
"""
import argparse
import json
import os
import re
import time
from collections import Counter

import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

# Local snapshot of the stations page, used when ``use_local`` is requested.
LOCAL_HTML_PATH = 'source/satnogs.html'
# Page fetched when no other URL is supplied.
DEFAULT_URL = 'https://network.satnogs.org/stations/?page=3'
# Where the extracted stations are written.
OUTPUT_DIR = '_tmp'
OUTPUT_PATH = '_tmp/stations.json'
# Upper bound for the HTTP request so a stalled server cannot hang the script.
REQUEST_TIMEOUT_SECONDS = 30
# Altitude is rendered after the grid locator, e.g. "EM69uf@280m"; the optional
# minus sign keeps negative altitudes from being reported as "N/A".
ALTITUDE_PATTERN = re.compile(r'@(-?\d+)m')


def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space."""
    return ' '.join(text.split())


def _tooltip_text(tag):
    """Return the stripped tooltip text of *tag*, or '' when it has none."""
    # NOTE(review): in a browser-rendered DOM (the playwright path) the
    # tooltip script may have moved ``title`` into ``data-original-title``;
    # reading both keeps the two fetch methods consistent. Confirm against
    # the live page.
    text = tag.get('title') or tag.get('data-original-title') or ''
    return text.strip()


def extract_station_info(html_content):
    """Parse the stations table out of *html_content*.

    Every ``<tr class="station-row">`` becomes one dict with the keys ``id``,
    ``name``, ``total_observations``, ``future_observations`` and ``antennas``.
    ``location`` (coordinates, grid, altitude) and ``owner`` (name,
    profile_url) are only present when the row contains them. Rows that lack
    the id span or have fewer than three cells are skipped instead of
    aborting the whole extraction.

    :param html_content: HTML text of the stations page
    :return: list of station dicts, in document order
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    stations = []

    for row in soup.find_all('tr', class_='station-row'):
        id_span = row.find('span', class_='station')
        cells = row.find_all('td')
        if id_span is None or len(cells) < 3:
            # Malformed row: id, name and location cells are all required.
            continue

        station = {
            'id': id_span.text.strip(),
            'name': cells[1].text.strip(),
        }

        location_td = cells[2]
        location_span = location_td.find('span', {'data-toggle': 'tooltip'})
        if location_span is not None:
            altitude_match = ALTITUDE_PATTERN.search(location_td.text)
            station['location'] = {
                # e.g. "39.236°, -86.305°"
                'coordinates': _tooltip_text(location_span),
                # e.g. "EM69uf" (only the part before the '@')
                'grid': location_span.text.strip().split('@')[0].strip(),
                # e.g. "280m"
                'altitude': (
                    f"{altitude_match.group(1)}m" if altitude_match else "N/A"
                ),
            }

        total_obs = row.find('a', class_='badge-success')
        station['total_observations'] = (
            total_obs.text.strip() if total_obs is not None else '0'
        )

        future_obs = row.find('a', class_='badge-info')
        station['future_observations'] = (
            future_obs.text.strip() if future_obs is not None else '0'
        )

        station['antennas'] = [
            {
                'type': antenna_span.text.strip(),
                'frequency_range': _tooltip_text(antenna_span),
            }
            for antenna_span in row.find_all('span', class_='antenna-pill')
        ]

        owner_link = cells[-1].find('a')
        if owner_link is not None:
            station['owner'] = {
                'name': owner_link.text.strip(),
                'profile_url': owner_link.get('href', ''),
            }

        stations.append(station)

    return stations


def _fetch_with_requests(url):
    """Download *url* with requests; return the body text or None on failure."""
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"使用requests获取网页数据时出错: {e}")
        return None
    print("成功使用requests从网络获取数据")
    return response.text


def _fetch_with_playwright(url):
    """Render *url* in Chromium; return the page HTML or None on failure."""
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=False)
            try:
                page = browser.new_page()
                page.goto(url)
                # Wait until at least one station row has been rendered.
                page.wait_for_selector('tr.station-row')
                content = page.content()
            finally:
                # Close the browser even when navigation or the wait fails.
                browser.close()
    except Exception as e:
        # Top-level boundary for the browser path: report and signal failure.
        print(f"使用playwright获取网页数据时出错: {e}")
        return None
    print("成功使用playwright从网络获取数据")
    return content


def get_content(use_local=True, url=DEFAULT_URL, method='requests'):
    """Return the HTML of the stations page.

    :param use_local: read the local snapshot first; if that file does not
        exist the function falls back to the network
    :param url: URL to fetch when the network is used
    :param method: how to fetch from the network, 'requests' or 'playwright'
    :return: page HTML as a string, or None when fetching failed or *method*
        is not supported
    """
    if use_local:
        try:
            with open(LOCAL_HTML_PATH, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print("本地文件不存在,将尝试从网络获取数据")

    if method == 'requests':
        return _fetch_with_requests(url)
    if method == 'playwright':
        return _fetch_with_playwright(url)

    print(f"不支持的获取方式: {method}")
    return None


def _parse_count(text):
    """Convert a badge count such as '1,234' to int; return 0 if not numeric."""
    try:
        return int(text.replace(',', '').strip())
    except ValueError:
        return 0


def _print_statistics(stations):
    """Print observation, antenna, band and altitude summaries for *stations*."""
    total_observations = sum(
        _parse_count(station['total_observations']) for station in stations
    )
    print(f"所有站点总观测数: {total_observations}")

    antenna_names = [
        antenna['type']
        for station in stations
        for antenna in station['antennas']
    ]

    print("\n天线类型统计:")
    for antenna_type, count in sorted(Counter(antenna_names).items()):
        print(f"{antenna_type}: {count}个")

    print("\n频段分布:")
    # An antenna whose type mentions both bands is counted in both.
    vhf_count = sum(1 for name in antenna_names if 'VHF' in name)
    uhf_count = sum(1 for name in antenna_names if 'UHF' in name)
    other_count = sum(
        1 for name in antenna_names
        if 'VHF' not in name and 'UHF' not in name
    )
    print(f"VHF频段天线: {vhf_count}个")
    print(f"UHF频段天线: {uhf_count}个")
    print(f"其他频段天线: {other_count}个")

    # 'location' is only present when the row had a location tooltip, so look
    # it up defensively instead of indexing.
    altitudes = []
    for station in stations:
        alt = station.get('location', {}).get('altitude', 'N/A')
        if alt != 'N/A':
            altitudes.append(int(alt[:-1]))  # drop the trailing 'm'

    if altitudes:
        print("\n海拔统计:")
        print(f"最高海拔: {max(altitudes)}m")
        print(f"最低海拔: {min(altitudes)}m")
        print(f"平均海拔: {sum(altitudes) / len(altitudes):.1f}m")


def main():
    """Fetch the stations page, save the stations as JSON and print a summary."""
    parser = argparse.ArgumentParser(description='抓取SatNOGS网站站点信息')
    parser.add_argument('--local', action='store_true', help='使用本地文件')
    parser.add_argument('--method', choices=['requests', 'playwright'],
                        default='requests',
                        help='选择获取内容的方式: requests或playwright')
    parser.add_argument('--url', default=DEFAULT_URL, help='要抓取的URL')
    args = parser.parse_args()

    content = get_content(use_local=args.local, url=args.url,
                          method=args.method)
    if not content:
        print("无法获取数据,程序退出")
        return

    stations = extract_station_info(content)

    # Make sure the output directory exists before writing the JSON file.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(stations, f, ensure_ascii=False, indent=2)

    print(f"已成功提取 {len(stations)} 个站点的信息")
    print("详细信息已保存到 _tmp/stations.json 文件中")

    _print_statistics(stations)

    # Pause before exiting so the output stays visible for a moment.
    print("\n等待10秒后退出")
    time.sleep(10)


# Usage:
#   Scrape with playwright:
#       python bs4-script.py --method playwright
#   Scrape with requests:
#       python bs4-script.py --method requests
#   Use the local file:
#       python bs4-script.py --local

if __name__ == "__main__":
    main()