Files
ScrapeGraphAI-experiments/bs4-script.py
2025-01-16 15:57:50 +08:00

176 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
提示词: 在文件result-script.py创建一个Python脚本来抓取页面上的所有站点以及站点信息
with open('source/satnogs.html', 'r', encoding='utf-8') as file:
content = file.read()
"""
from bs4 import BeautifulSoup
import json
import re
import os
import requests
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into one space.

    The text is split on whitespace and the pieces are re-joined with a
    single space, so leading and trailing whitespace disappears as well.
    """
    words = text.split()
    return ' '.join(words)
# Altitude appears after an "@" in the location cell, e.g. "@280m".
# A leading minus sign and optional whitespace are accepted so that
# below-sea-level values and loosely formatted cells are still parsed.
_ALTITUDE_RE = re.compile(r'@\s*(-?\d+)\s*m')


def extract_station_info(html_content):
    """Parse station rows out of the station-list HTML.

    Each ``<tr class="station-row">`` produces one dict with these keys:

    * ``id`` - text of the ``<span class="station">`` element.
    * ``name`` - text of the second ``<td>``.
    * ``location`` - only present when the third ``<td>`` contains a
      ``data-toggle="tooltip"`` span. Holds ``coordinates`` (the span's
      ``title`` attribute, ``''`` if absent), ``grid`` (span text before
      the ``@``) and ``altitude`` (e.g. ``"280m"``, or ``"N/A"`` when no
      altitude is found in the cell text).
    * ``total_observations`` / ``future_observations`` - text of the
      ``badge-success`` / ``badge-info`` links, ``'0'`` when missing.
    * ``antennas`` - list of ``{'type', 'frequency_range'}`` dicts, one
      per ``<span class="antenna-pill">``.
    * ``owner`` - only present when the last ``<td>`` contains a link;
      holds ``name`` and ``profile_url``.

    Rows that lack the station span or have fewer than three cells are
    skipped instead of raising.

    Args:
        html_content: HTML markup of the station list page.

    Returns:
        A list of station dicts in document order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    stations = []

    for row in soup.find_all('tr', class_='station-row'):
        # Look the cells up once; they are needed for name, location, owner.
        cells = row.find_all('td')
        id_span = row.find('span', class_='station')
        if id_span is None or len(cells) < 3:
            # Not a complete station row - nothing reliable to extract.
            continue

        station = {
            'id': id_span.text.strip(),
            'name': cells[1].text.strip(),
        }

        # Location: coordinates live in the tooltip title, grid and
        # altitude in the visible text of the cell.
        location_td = cells[2]
        location_span = location_td.find('span', {'data-toggle': 'tooltip'})
        if location_span is not None:
            altitude_match = _ALTITUDE_RE.search(location_td.text)
            station['location'] = {
                # e.g. "39.236°, -86.305°"
                'coordinates': location_span.get('title', '').strip(),
                # e.g. "EM69uf" - keep only the part before the "@"
                'grid': location_span.text.strip().split('@')[0].strip(),
                # e.g. "280m"
                'altitude': (
                    f"{altitude_match.group(1)}m" if altitude_match else "N/A"
                ),
            }

        # Observation counters are kept as text; '0' stands in for a
        # missing badge.
        total_obs = row.find('a', class_='badge-success')
        station['total_observations'] = total_obs.text.strip() if total_obs else '0'

        future_obs = row.find('a', class_='badge-info')
        station['future_observations'] = future_obs.text.strip() if future_obs else '0'

        # Antennas: pill text is the type, its title the frequency range.
        station['antennas'] = [
            {
                'type': pill.text.strip(),
                'frequency_range': pill.get('title', '').strip(),
            }
            for pill in row.find_all('span', class_='antenna-pill')
        ]

        # Owner: first link inside the last cell, if any.
        owner_link = cells[-1].find('a')
        if owner_link is not None:
            station['owner'] = {
                'name': owner_link.text.strip(),
                'profile_url': owner_link.get('href', ''),
            }

        stations.append(station)

    return stations
def get_content(use_local=True, url='https://network.satnogs.org/stations/?page=3', timeout=30):
    """Return the station page HTML from the local copy or from the network.

    Args:
        use_local: When true, read ``source/satnogs.html`` first and fall
            back to the network only if that file does not exist.
        url: Page to download when the network is used.
        timeout: Seconds passed to ``requests.get`` so that an
            unresponsive server cannot block the script forever.

    Returns:
        The HTML text, or ``None`` when the download fails.
    """
    if use_local:
        try:
            with open('source/satnogs.html', 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            # No cached copy - continue with the network request below.
            print("本地文件不存在,将尝试从网络获取数据")

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses.
        print(f"获取网页数据时出错: {e}")
        return None

    print("成功从网络获取数据")
    return response.text
def main():
    """Fetch the station page, dump the stations to JSON and print statistics.

    The parsed stations are written to ``_tmp/stations.json``. Afterwards
    the total observation count, a per-type antenna tally, a VHF/UHF/other
    breakdown and altitude min/max/mean are printed. Returns early when no
    page content could be obtained.
    """
    # Data source switch: True reads the local file, False uses the network.
    use_local = False

    content = get_content(use_local)
    if not content:
        print("无法获取数据,程序退出")
        return

    stations = extract_station_info(content)

    # Make sure the output directory exists before writing the JSON dump.
    os.makedirs('_tmp', exist_ok=True)
    with open('_tmp/stations.json', 'w', encoding='utf-8') as f:
        json.dump(stations, f, ensure_ascii=False, indent=2)

    print(f"已成功提取 {len(stations)} 个站点的信息")
    print("详细信息已保存到 _tmp/stations.json 文件中")

    # Observation counts are stored as text, so convert before summing.
    total_observations = sum(int(station['total_observations']) for station in stations)
    print(f"所有站点总观测数: {total_observations}")

    # Tally antennas by type.
    antenna_types = {}
    for station in stations:
        for antenna in station['antennas']:
            antenna_type = antenna['type']
            antenna_types[antenna_type] = antenna_types.get(antenna_type, 0) + 1

    print("\n天线类型统计:")
    for antenna_type, count in sorted(antenna_types.items()):
        print(f"{antenna_type}: {count}")

    # Band breakdown: an antenna whose type mentions both VHF and UHF is
    # counted in both buckets; one that mentions neither counts as "other".
    print("\n频段分布:")
    vhf_count = uhf_count = other_count = 0
    for station in stations:
        for antenna in station['antennas']:
            is_vhf = 'VHF' in antenna['type']
            is_uhf = 'UHF' in antenna['type']
            if is_vhf:
                vhf_count += 1
            if is_uhf:
                uhf_count += 1
            if not (is_vhf or is_uhf):
                other_count += 1

    print(f"VHF频段天线: {vhf_count}")
    print(f"UHF频段天线: {uhf_count}")
    print(f"其他频段天线: {other_count}")

    # Altitude statistics. 'location' is absent for stations without a
    # tooltip span, so treat a missing entry like an unknown altitude.
    altitudes = []
    for station in stations:
        alt = station.get('location', {}).get('altitude', 'N/A')
        if alt != 'N/A':
            altitudes.append(int(alt[:-1]))  # drop the trailing 'm'

    if altitudes:
        print("\n海拔统计:")
        print(f"最高海拔: {max(altitudes)}m")
        print(f"最低海拔: {min(altitudes)}m")
        print(f"平均海拔: {sum(altitudes) / len(altitudes):.1f}m")
# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()