bs4-script.py
@@ -4,10 +4,11 @@ ScrapeGraphAI is an AI tool for web crawling and data scraping.
 
 - https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/chinese.md
 - https://github.com/ScrapeGraphAI/ScrapegraphLib-Examples
+- https://github.com/ScrapeGraphAI/ScrapegraphLib-Examples/blob/main/extras/authenticated_playwright.py
 
 
 ## Reference
-https://www.aivi.fyi/aiagents/introduce-ScrapeGraphAI+LangChain+LangGraph#可用管道
+https://www.aivi.fyi/aiagents/introduce-ScrapeGraphAI+LangChain+LangGraph
 
 ## Dependencies
 ```
ScriptCreator.py (new file, 28 lines)
@@ -0,0 +1,28 @@
from scrapegraphai.graphs import ScriptCreatorGraph

graph_config = {
    "llm": {
        "model": "ollama/mistral-nemo:12b",
        "model_tokens": 1024000,
        "temperature": 0,
    },
    "library": "beautifulsoup4",
    # https://scrapegraph-ai.readthedocs.io/en/latest/scrapers/types.html#scriptcreatorgraph-scriptcreatormultigraph
    "verbose": True,
    # "headless": False,
}

# #############################
prompt = "创建一个Python脚本来抓取页面上的所有站点以及站点信息"
with open('source/satnogs.html', 'r', encoding='utf-8') as file:
    source = file.read()

script_creator_graph = ScriptCreatorGraph(
    prompt=prompt,
    source=source,
    config=graph_config,
    # schema=schema
)

result = script_creator_graph.run()
print(result)
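The Chinese prompt asks for "a Python script to scrape all stations on the page along with their station information", and ScriptCreatorGraph returns the generated scraper as text. A natural follow-up, sketched here rather than taken from the commit, is to write that text to result-script.py, the file name quoted in bs4-script.py's docstring:

```python
# Sketch (not part of this commit): persist the generated script for review.
# Assumes run() returns the script text; the result-script.py name follows
# the prompt recorded in bs4-script.py's docstring.
with open('result-script.py', 'w', encoding='utf-8') as f:
    f.write(result)
```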
@@ -24,25 +24,22 @@ graph_config = {
     # "headless": False,
 }
 
 
 # #############################
-prompt="List all article titles on the page"
-prompt="列出页面上的所有文章标题"
-source="https://www.aivi.fyi/"
-
-# #############################
-prompt="List all Stations on the page."
-source="https://network.satnogs.org/stations/"
+# prompt = "List all article titles on the page"
+# prompt = "列出页面上的所有文章标题"
+# source = "https://www.aivi.fyi/"
+#
+# # #############################
+# prompt = "List all Stations on the page."
+# source = "https://network.satnogs.org/stations/"
 
 # #############################
 # prompt="列出页面上的所有站点。"
-prompt="列出页面上的所有站点以及站点信息"
+prompt = "列出页面上的所有站点以及站点信息"
 # prompt="列出页面上的所有站点以及站点信息。antennas需要是一个数组。"
-source=""
 
 with open('source/satnogs.html', 'r', encoding='utf-8') as file:
     source = file.read()
 
 
 # ************************************************
 # ************************************************
 # Create the SmartScraperGraph instance and run it
 # ************************************************
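This hunk only toggles which prompt and source feed the scraper; per its closing comments, the file goes on to build and run a SmartScraperGraph. A minimal sketch of that trailing section, assuming the standard scrapegraphai API and the prompt, source, and graph_config selected above:

```python
from scrapegraphai.graphs import SmartScraperGraph

# Sketch of the code below this hunk (assumed, not shown in the diff):
# build the graph from the selected prompt/source/config and run it.
smart_scraper_graph = SmartScraperGraph(
    prompt=prompt,
    source=source,
    config=graph_config,
)
result = smart_scraper_graph.run()
print(result)
```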
bs4-script.py (new file, 175 lines)
@@ -0,0 +1,175 @@
"""
Prompt: in the file result-script.py, create a Python script that scrapes
all stations on the page along with their station information.

with open('source/satnogs.html', 'r', encoding='utf-8') as file:
    content = file.read()
"""
from bs4 import BeautifulSoup
import json
import re
import os
import requests


def clean_text(text):
    """Clean text by collapsing extra whitespace."""
    return ' '.join(text.split())


def extract_station_info(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    stations = []

    # Find all station rows
    for row in soup.find_all('tr', class_='station-row'):
        station = {}

        # Extract the station ID
        station['id'] = row.find('span', class_='station').text.strip()

        # Extract the station name
        station['name'] = row.find_all('td')[1].text.strip()

        # Extract location information
        location_td = row.find_all('td')[2]
        location_span = location_td.find('span', {'data-toggle': 'tooltip'})
        if location_span:
            # Extract the coordinates
            coordinates = location_span['title'].strip()
            # Extract the grid locator, keeping only the grid code part
            grid = location_span.text.strip().split('@')[0].strip()
            # Extract the altitude
            altitude_text = location_td.text
            altitude_match = re.search(r'@(\d+)m', altitude_text)
            altitude = f"{altitude_match.group(1)}m" if altitude_match else "N/A"

            station['location'] = {
                'coordinates': coordinates,  # e.g. "39.236°, -86.305°"
                'grid': grid,                # e.g. "EM69uf"
                'altitude': altitude         # e.g. "280m"
            }

        # Extract the total observation count
        total_obs = row.find('a', class_='badge-success')
        station['total_observations'] = total_obs.text.strip() if total_obs else '0'

        # Extract the future observation count
        future_obs = row.find('a', class_='badge-info')
        station['future_observations'] = future_obs.text.strip() if future_obs else '0'

        # Extract antenna information
        antennas = []
        for antenna_span in row.find_all('span', class_='antenna-pill'):
            freq_range = antenna_span['title'].strip()
            antenna_type = antenna_span.text.strip()
            antennas.append({
                'type': antenna_type,
                'frequency_range': freq_range
            })
        station['antennas'] = antennas

        # Extract owner information
        owner_link = row.find_all('td')[-1].find('a')
        if owner_link:
            station['owner'] = {
                'name': owner_link.text.strip(),
                'profile_url': owner_link['href']
            }

        stations.append(station)

    return stations


def get_content(use_local=True, url='https://network.satnogs.org/stations/?page=3'):
    """Fetch the page content, either from a local file or from the network."""
    if use_local:
        try:
            with open('source/satnogs.html', 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print("Local file not found; will try to fetch the data from the network")
            use_local = False

    if not use_local:
        try:
            response = requests.get(url)
            response.raise_for_status()
            print("Successfully fetched data from the network")
            return response.text
        except requests.RequestException as e:
            print(f"Error while fetching page data: {e}")
            return None


def main():
    # Choose the data source (True = local file, False = network request)
    use_local = False

    # Fetch the content
    content = get_content(use_local)
    if not content:
        print("Could not obtain any data; exiting")
        return

    # Extract the station information
    stations = extract_station_info(content)

    # Make sure the _tmp directory exists
    os.makedirs('_tmp', exist_ok=True)

    # Save the results as a JSON file
    with open('_tmp/stations.json', 'w', encoding='utf-8') as f:
        json.dump(stations, f, ensure_ascii=False, indent=2)

    # Print summary statistics
    print(f"Successfully extracted information for {len(stations)} stations")
    print("Details saved to _tmp/stations.json")

    # Print some aggregate statistics
    total_observations = sum(int(station['total_observations']) for station in stations)
    print(f"Total observations across all stations: {total_observations}")

    # Tally antenna types
    antenna_types = {}
    for station in stations:
        for antenna in station['antennas']:
            antenna_type = antenna['type']
            antenna_types[antenna_type] = antenna_types.get(antenna_type, 0) + 1

    print("\nAntenna type counts:")
    for antenna_type, count in sorted(antenna_types.items()):
        print(f"{antenna_type}: {count}")

    # Tally the frequency band distribution
    print("\nFrequency band distribution:")
    vhf_count = uhf_count = other_count = 0
    for station in stations:
        for antenna in station['antennas']:
            if 'VHF' in antenna['type']:
                vhf_count += 1
            if 'UHF' in antenna['type']:
                uhf_count += 1
            if not ('VHF' in antenna['type'] or 'UHF' in antenna['type']):
                other_count += 1

    print(f"VHF antennas: {vhf_count}")
    print(f"UHF antennas: {uhf_count}")
    print(f"Other-band antennas: {other_count}")

    # Print the altitude distribution
    altitudes = []
    for station in stations:
        alt = station['location']['altitude']
        if alt != 'N/A':
            altitudes.append(int(alt[:-1]))  # strip the trailing 'm' and convert to int

    if altitudes:
        print("\nAltitude statistics:")
        print(f"Highest altitude: {max(altitudes)}m")
        print(f"Lowest altitude: {min(altitudes)}m")
        print(f"Average altitude: {sum(altitudes) / len(altitudes):.1f}m")


if __name__ == "__main__":
    main()
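For reference, each element that extract_station_info() appends has the shape below. This is an illustrative record only: the keys come from the code above, the location values echo the examples in its comments, and every other value is invented.

```python
# Illustrative station record (hypothetical values, real key structure):
example_station = {
    'id': '1234',                      # hypothetical station ID
    'name': 'Example Ground Station',  # hypothetical name
    'location': {
        'coordinates': '39.236°, -86.305°',  # example from the code comments
        'grid': 'EM69uf',
        'altitude': '280m',
    },
    'total_observations': '1500',
    'future_observations': '3',
    'antennas': [
        {'type': 'UHF', 'frequency_range': '430 MHz - 470 MHz'},  # hypothetical
    ],
    'owner': {
        'name': 'example-user',
        'profile_url': '/users/example-user/',
    },
}
```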
scrape_stations.py (new file, 0 lines)
source/satnogs.html (1641 lines; file diff suppressed because it is too large)