"""Ask ScrapeGraphAI to generate a BeautifulSoup scraper for a saved page.

The script reads a local HTML snapshot, hands it to ``ScriptCreatorGraph``
together with a natural-language prompt, and prints whatever the graph
returns.
"""
from scrapegraphai.graphs import ScriptCreatorGraph

# Settings for the locally served Ollama model.
# NOTE(review): 1024000 looks larger than the context window usually quoted
# for this model -- confirm the intended value.
llm_settings = {
    "model": "ollama/mistral-nemo:12b",
    "model_tokens": 1024000,
    "temperature": 0,
}

# Options reference:
# https://scrapegraph-ai.readthedocs.io/en/latest/scrapers/types.html#scriptcreatorgraph-scriptcreatormultigraph
graph_config = {
    "llm": llm_settings,
    # Library the generated script should be written against.
    "library": "beautifulsoup4",
    "verbose": True,
    # "headless": False,
}

# The prompt is sent to the model verbatim.
prompt = "创建一个Python脚本来抓取页面上的所有站点以及站点信息"

# The page is read from a saved snapshot rather than fetched live.
with open('source/satnogs.html', 'r', encoding='utf-8') as html_file:
    source = html_file.read()

script_creator_graph = ScriptCreatorGraph(
    prompt=prompt,
    source=source,
    config=graph_config,
    # schema=schema
)

result = script_creator_graph.run()
print(result)
"""
Prompt: in the file result-script.py, create a Python script that scrapes
every station on the page together with its station details.

The saved page snapshot can be loaded like this:

    with open('source/satnogs.html', 'r', encoding='utf-8') as file:
        content = file.read()
"""
import json
import os
import re
from collections import Counter

# Placeholder stored wherever a location field cannot be read from the page.
_NOT_AVAILABLE = "N/A"

# Altitude follows the grid locator as "@<metres>m". The sign is optional so
# that a negative altitude is not silently reported as missing.
_ALTITUDE_RE = re.compile(r'@\s*(-?\d+)m')


def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space."""
    return ' '.join(text.split())


def parse_altitude(cell_text):
    """Return the altitude in *cell_text* as e.g. ``"280m"``, else ``"N/A"``."""
    match = _ALTITUDE_RE.search(cell_text)
    return f"{match.group(1)}m" if match else _NOT_AVAILABLE


def _to_int(text, default=0):
    """Parse an integer from badge text, tolerating thousands separators."""
    try:
        return int(str(text).replace(',', ''))
    except ValueError:
        return default


def _parse_location(location_td):
    """Build the location dict (coordinates, grid, altitude) for one cell.

    Every key is always present so that consumers never have to guard
    against a missing ``location`` entry; unknown values are ``"N/A"``.
    """
    location = {
        'coordinates': _NOT_AVAILABLE,               # e.g. "39.236°, -86.305°"
        'grid': _NOT_AVAILABLE,                      # e.g. "EM69uf"
        'altitude': parse_altitude(location_td.text),  # e.g. "280m"
    }
    tooltip = location_td.find('span', {'data-toggle': 'tooltip'})
    if tooltip is not None:
        location['coordinates'] = tooltip.get('title', _NOT_AVAILABLE).strip()
        # Keep only the grid locator that precedes the "@<altitude>" suffix.
        location['grid'] = tooltip.text.strip().split('@')[0].strip()
    return location


def _parse_antennas(row):
    """Return the list of ``{'type', 'frequency_range'}`` dicts for a row."""
    return [
        {
            'type': pill.text.strip(),
            'frequency_range': pill.get('title', '').strip(),
        }
        for pill in row.find_all('span', class_='antenna-pill')
    ]


def extract_station_info(html_content):
    """Parse the stations table out of *html_content*.

    Args:
        html_content: HTML text of a stations listing page.

    Returns:
        A list with one dict per ``tr.station-row``. Each dict has the keys
        ``id``, ``name``, ``location``, ``total_observations``,
        ``future_observations`` and ``antennas``, plus ``owner`` when the last
        cell contains a link. Rows without an id span or with fewer than
        three cells are skipped instead of aborting the whole parse.
    """
    # Imported here so the pure helpers of this module remain importable
    # when BeautifulSoup is not installed.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_content, 'html.parser')
    stations = []

    for row in soup.find_all('tr', class_='station-row'):
        cells = row.find_all('td')
        id_span = row.find('span', class_='station')
        if id_span is None or len(cells) < 3:
            continue

        total_obs = row.find('a', class_='badge-success')
        future_obs = row.find('a', class_='badge-info')

        station = {
            'id': id_span.text.strip(),
            'name': cells[1].text.strip(),
            'location': _parse_location(cells[2]),
            'total_observations': total_obs.text.strip() if total_obs else '0',
            'future_observations': future_obs.text.strip() if future_obs else '0',
            'antennas': _parse_antennas(row),
        }

        owner_link = cells[-1].find('a')
        if owner_link:
            station['owner'] = {
                'name': owner_link.text.strip(),
                'profile_url': owner_link.get('href', ''),
            }

        stations.append(station)

    return stations


def get_content(use_local=True, url='https://network.satnogs.org/stations/?page=3',
                timeout=30):
    """Return the page HTML from the local snapshot or from the network.

    Args:
        use_local: When true, read ``source/satnogs.html`` first and fall
            back to the network only if that file does not exist.
        url: Page to download when the network is used.
        timeout: Seconds to wait for the server before giving up; without
            it a stalled connection would block forever.

    Returns:
        The HTML text, or ``None`` when the download fails.
    """
    if use_local:
        try:
            with open('source/satnogs.html', 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print("本地文件不存在,将尝试从网络获取数据")

    # Imported lazily: reading the local snapshot does not need requests.
    import requests

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"获取网页数据时出错: {e}")
        return None

    print("成功从网络获取数据")
    return response.text


def summarize_stations(stations):
    """Compute the aggregate statistics that ``main`` prints.

    Args:
        stations: Station dicts as produced by ``extract_station_info``.
            A missing ``location`` or ``antennas`` entry is tolerated.

    Returns:
        A dict with ``total_observations`` (int), ``antenna_types``
        (type -> count), ``vhf``, ``uhf``, ``other`` (antenna counts; an
        antenna naming both bands is counted in both) and ``altitudes``
        (list of ints in metres, unknown altitudes omitted).
    """
    total_observations = 0
    antenna_types = Counter()
    vhf = uhf = other = 0
    altitudes = []

    for station in stations:
        total_observations += _to_int(station.get('total_observations', '0'))

        for antenna in station.get('antennas', []):
            kind = antenna['type']
            antenna_types[kind] += 1
            is_vhf = 'VHF' in kind
            is_uhf = 'UHF' in kind
            if is_vhf:
                vhf += 1
            if is_uhf:
                uhf += 1
            if not (is_vhf or is_uhf):
                other += 1

        altitude = station.get('location', {}).get('altitude', _NOT_AVAILABLE)
        if altitude != _NOT_AVAILABLE:
            try:
                altitudes.append(int(altitude[:-1]))  # drop the trailing "m"
            except ValueError:
                pass  # malformed altitude: leave it out of the statistics

    return {
        'total_observations': total_observations,
        'antenna_types': dict(antenna_types),
        'vhf': vhf,
        'uhf': uhf,
        'other': other,
        'altitudes': altitudes,
    }


def main():
    """Fetch the page, dump the stations to JSON and print summary stats."""
    # Data source: True reads the local file, False performs a web request.
    use_local = False

    content = get_content(use_local)
    if not content:
        print("无法获取数据,程序退出")
        return

    stations = extract_station_info(content)

    # Make sure the output directory exists before writing into it.
    os.makedirs('_tmp', exist_ok=True)
    with open('_tmp/stations.json', 'w', encoding='utf-8') as f:
        json.dump(stations, f, ensure_ascii=False, indent=2)

    print(f"已成功提取 {len(stations)} 个站点的信息")
    print("详细信息已保存到 _tmp/stations.json 文件中")

    summary = summarize_stations(stations)
    print(f"所有站点总观测数: {summary['total_observations']}")

    print("\n天线类型统计:")
    for antenna_type, count in sorted(summary['antenna_types'].items()):
        print(f"{antenna_type}: {count}个")

    print("\n频段分布:")
    print(f"VHF频段天线: {summary['vhf']}个")
    print(f"UHF频段天线: {summary['uhf']}个")
    print(f"其他频段天线: {summary['other']}个")

    altitudes = summary['altitudes']
    if altitudes:
        print("\n海拔统计:")
        print(f"最高海拔: {max(altitudes)}m")
        print(f"最低海拔: {min(altitudes)}m")
        print(f"平均海拔: {sum(altitudes) / len(altitudes):.1f}m")


if __name__ == "__main__":
    main()