"""Ask ScrapeGraphAI to write a BeautifulSoup scraper for the saved stations page.

The script feeds the local HTML snapshot ``source/satnogs.html`` and a
Chinese-language prompt to ``ScriptCreatorGraph``, runs the graph with the
``ollama/mistral-nemo:12b`` model configuration below, and prints whatever
the graph returns.
"""
from scrapegraphai.graphs import ScriptCreatorGraph

# Path of the HTML snapshot handed to the graph as its source document.
_SOURCE_PATH = 'source/satnogs.html'


def _read_text(path):
    """Return the full UTF-8 text content of *path*."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()


# Model settings passed through to the graph under the "llm" key.
_llm_settings = {
    "model": "ollama/mistral-nemo:12b",
    "model_tokens": 1024000,
    "temperature": 0,
}

# Graph options; see
# https://scrapegraph-ai.readthedocs.io/en/latest/scrapers/types.html#scriptcreatorgraph-scriptcreatormultigraph
# An optional "headless": False entry was left disabled by the author.
graph_config = {
    "llm": _llm_settings,
    "library": "beautifulsoup4",
    "verbose": True,
}

# Prompt (Chinese): create a Python script that scrapes every station on the
# page together with its station information.
prompt = "创建一个Python脚本来抓取页面上的所有站点以及站点信息"
source = _read_text(_SOURCE_PATH)

# No output schema is supplied; the author left `schema=schema` disabled.
script_creator_graph = ScriptCreatorGraph(
    prompt=prompt,
    source=source,
    config=graph_config,
)

result = script_creator_graph.run()
print(result)
"""Scrape SatNOGS ground stations and their details from a stations page.

Prompt that produced the first draft of this script: "in the file
result-script.py, create a Python script that scrapes every station on the
page together with its station information", with the page text read from
``source/satnogs.html``.

The HTML is taken either from the local snapshot ``source/satnogs.html`` or
from the network. Parsed stations are written to ``_tmp/stations.json`` and
a short statistical summary is printed.
"""
import json
import os
import re
from collections import Counter

# Altitude suffix inside the location cell, e.g. "@280m". The optional minus
# sign keeps below-sea-level stations from being reported as "N/A".
_ALTITUDE_IN_CELL_RE = re.compile(r'@(-?\d+)m')

# An already-extracted altitude value such as "280m" or "-3m".
_ALTITUDE_VALUE_RE = re.compile(r'(-?\d+)m')


def clean_text(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends."""
    return ' '.join(text.split())


def _to_int(text):
    """Parse a count such as ``"1,746"``; return 0 when it is not a number."""
    try:
        return int(str(text).replace(',', '').strip())
    except ValueError:
        return 0


def extract_station_info(html_content):
    """Parse the stations table in *html_content* into a list of dicts.

    Every ``<tr class="station-row">`` yields one dict with the keys ``id``,
    ``name``, ``total_observations``, ``future_observations`` (both kept as
    the text shown on the page, ``'0'`` when the badge is absent) and
    ``antennas`` (a list of ``{'type', 'frequency_range'}`` dicts).
    ``location`` (``coordinates``, ``grid``, ``altitude``) is present only
    when the location cell contains a tooltip span, and ``owner`` (``name``,
    ``profile_url``) only when the last cell contains a link.

    Rows that lack the ID span or have fewer than three cells are skipped
    rather than raising.
    """
    # Imported lazily so the pure helpers in this module stay importable when
    # the scraping dependency is not installed.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_content, 'html.parser')
    stations = []

    for row in soup.find_all('tr', class_='station-row'):
        cells = row.find_all('td')
        id_span = row.find('span', class_='station')
        if id_span is None or len(cells) < 3:
            # Not a row we can interpret as a station; skip instead of crashing.
            continue

        station = {
            'id': id_span.text.strip(),
            'name': cells[1].text.strip(),
        }

        location_td = cells[2]
        location_span = location_td.find('span', {'data-toggle': 'tooltip'})
        if location_span:
            # The altitude is searched in the whole cell text, not only in
            # the tooltip span.
            altitude_match = _ALTITUDE_IN_CELL_RE.search(location_td.text)
            station['location'] = {
                # e.g. "39.236°, -86.305°"
                'coordinates': location_span.get('title', '').strip(),
                # e.g. "EM69uf" - keep only the part before any "@"
                'grid': location_span.text.strip().split('@')[0].strip(),
                # e.g. "280m"
                'altitude': (
                    f"{altitude_match.group(1)}m" if altitude_match else "N/A"
                ),
            }

        total_obs = row.find('a', class_='badge-success')
        station['total_observations'] = (
            total_obs.text.strip() if total_obs else '0'
        )

        future_obs = row.find('a', class_='badge-info')
        station['future_observations'] = (
            future_obs.text.strip() if future_obs else '0'
        )

        station['antennas'] = [
            {
                'type': pill.text.strip(),
                'frequency_range': pill.get('title', '').strip(),
            }
            for pill in row.find_all('span', class_='antenna-pill')
        ]

        owner_link = cells[-1].find('a')
        if owner_link:
            station['owner'] = {
                'name': owner_link.text.strip(),
                'profile_url': owner_link.get('href', ''),
            }

        stations.append(station)

    return stations


def get_content(use_local=True,
                url='https://network.satnogs.org/stations/?page=3',
                timeout=30):
    """Return the page HTML from the local snapshot or from the network.

    Args:
        use_local: Try ``source/satnogs.html`` first. If that file does not
            exist, a notice is printed and the network is used instead.
        url: Page to download when the network is used.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the script indefinitely.

    Returns:
        The HTML text, or ``None`` when the download fails (the error is
        printed).
    """
    if use_local:
        try:
            with open('source/satnogs.html', 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print("本地文件不存在,将尝试从网络获取数据")

    # Imported lazily: reading the local snapshot does not need `requests`.
    import requests

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"获取网页数据时出错: {e}")
        return None

    print("成功从网络获取数据")
    return response.text


def summarize_stations(stations):
    """Compute the aggregate statistics that ``main`` prints.

    Stations without a ``location`` entry, or whose altitude is ``"N/A"``,
    are left out of the altitude list. Observation counts that are not
    numeric count as 0.

    Returns:
        A dict with ``total_observations`` (int), ``antenna_types``
        (type -> number of antennas), ``vhf_count`` / ``uhf_count`` /
        ``other_count`` (an antenna whose type mentions both VHF and UHF is
        counted in both), and ``altitudes`` (list of ints, in metres as
        parsed from values such as ``"280m"``).
    """
    antenna_types = Counter(
        antenna['type']
        for station in stations
        for antenna in station.get('antennas', [])
    )

    altitudes = []
    for station in stations:
        altitude = station.get('location', {}).get('altitude', 'N/A')
        match = _ALTITUDE_VALUE_RE.fullmatch(str(altitude))
        if match:
            altitudes.append(int(match.group(1)))

    return {
        'total_observations': sum(
            _to_int(station.get('total_observations', '0'))
            for station in stations
        ),
        'antenna_types': dict(antenna_types),
        'vhf_count': sum(
            n for kind, n in antenna_types.items() if 'VHF' in kind
        ),
        'uhf_count': sum(
            n for kind, n in antenna_types.items() if 'UHF' in kind
        ),
        'other_count': sum(
            n for kind, n in antenna_types.items()
            if 'VHF' not in kind and 'UHF' not in kind
        ),
        'altitudes': altitudes,
    }


def main():
    """Fetch the page, dump the stations to JSON and print a summary."""
    # Data source: True reads the local file, False performs a web request.
    use_local = False

    content = get_content(use_local)
    if not content:
        print("无法获取数据,程序退出")
        return

    stations = extract_station_info(content)

    # Make sure the output directory exists before writing the JSON dump.
    os.makedirs('_tmp', exist_ok=True)
    with open('_tmp/stations.json', 'w', encoding='utf-8') as f:
        json.dump(stations, f, ensure_ascii=False, indent=2)

    print(f"已成功提取 {len(stations)} 个站点的信息")
    print("详细信息已保存到 _tmp/stations.json 文件中")

    stats = summarize_stations(stations)
    print(f"所有站点总观测数: {stats['total_observations']}")

    print("\n天线类型统计:")
    for antenna_type, count in sorted(stats['antenna_types'].items()):
        print(f"{antenna_type}: {count}个")

    print("\n频段分布:")
    print(f"VHF频段天线: {stats['vhf_count']}个")
    print(f"UHF频段天线: {stats['uhf_count']}个")
    print(f"其他频段天线: {stats['other_count']}个")

    altitudes = stats['altitudes']
    if altitudes:
        print("\n海拔统计:")
        print(f"最高海拔: {max(altitudes)}m")
        print(f"最低海拔: {min(altitudes)}m")
        print(f"平均海拔: {sum(altitudes) / len(altitudes):.1f}m")


if __name__ == "__main__":
    main()
IDNameLocationTotalFutureAntennasOwner
- - - 2 - + + + + + SatNOGS Network - Ground Stations + + + + + + + + + + + + - KB9JHU - + + + + + + +
+ +
+ +
+ + +
+
+

+ Ground Stations +

+
+ +
+ - - - EM69uf - - @280m + + + + + + + Map + + +
+ +
+ + + + + + +
+
+ +
+
+
+ +
+ +
+ + + + +
+
+
+ +
+ + +
+
+
+ +
+
+
+
+ +
+
+ + + + + + + + + + + - - - - - - - - - - - + + + + + + + + - - - KM18ub - - @104m + + + + + + + + + - - - - - - - - - - - + + + + + + + + - - - IO82vx - - @100m + + + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IDNameLocationTotalFutureAntennasOwner
- - 65779 - - - - 0 - - - - - Cross Yagi (UHF) - - - - Cross Yagi (VHF) - - - - - - Corey Shields - - -
- - - 6 - - - - Apomahon - +
+ + + 207 + + + + AAUSAT GND + + + + + JO47xa + + @20m + + + + 12072 + + + + 2 + + + + + Helical (UHF) + + + + + + aausat_gnd + + +
+ + + 216 + + + + CWVRPi3uhf + + + + + EM98di + + @270m + + + + 14537 + + + + 2 + + + + + Turnstile (UHF) + + + + + + cswiger + + +
- - 353519 - - - - 147 - - - - - Patch (UHF) - - - - - - Dimitris Papadeas - - -
- - - 34 - - - - M0SZT - +
+ + + 223 + + + + W2MMD GCARC Clubhouse + + + + + FM29jr + + @10m + + + + 53764 + + + + 7 + + + + + Cross Yagi (UHF) + + + + Cross Yagi (VHF) + + + + + + Jon Pearce + + +
+ + + 255 + + + + EA5BZ - Elche + + + + + IM98pg + + @120m + + + + 10877 + + + + 0 + + + + + Quadrafilar (VHF) + + + + + + Ruben Navarro Huedo + + +
- - 4635 +
+ + + 310 + + + + CSPD-VA3TZA + + + + + FN03cf + + @92m + + + + 14074 + + + + 19 + + + + + Turnstile (UHF, L) + + + + + + Miloslav YT2CQ Miskar + + +
+ + + 329 + + + + CU2ZG + + + + + HM77fr + + @80m + + + + 19029 + + + + 2 + + + + + Vertical (VHF) + + + + Vertical (UHF) + + + + + + Pedro Sousa + + +
+ + + 331 + + + + VK5KJP-UHF + + + + + PF95gd + + @10m + + + + 17015 + + + + 10 + + + + + Yagi (UHF) + + + + + + Joe Pereira + + +
+ + + 342 + + + + EA5WA Pobla Llarga + + + + + IM99sc + + @50m + + + + 32191 + + + + 11 + + + + + Eggbeater (UHF) + + + + + + Juan Carlos EA5WA + + +
+ + + 355 + + + + Black Mensa Ground Station + + + + + JN58ug + + @490m + + + + 15718 + + + + 7 + + + + + Vertical (VHF) + + + + + + mdz + + +
+ + + 365 + + + + DS-1 (VHF) + + + + + JO22ff + + @20m + + + + 7135 + + + + 0 + + + + + Turnstile (VHF) + + + + + + dutchspace + + +
+ + + 375 + + + + AB3DC - Detroit, MI + + + + + EN82gj + + @240m + + + + 1746 + + + + 7 + + + + + Turnstile (VHF, UHF) + + + + + + Dinesh Cyanam + + +
+ + + 385 + + + + 52HancockSt + + + + + FN42jk + + @70m + + + + 6456 + + + + 2 + + + + + Turnstile (UHF) + + + + + + tony michel + + +
+ + + 430 + + + + Leonidio + + + + + KM17kd + + @20m + + + + 13411 + + + + 2 + + + + + Eggbeater (UHF) + + + + + + Manolis Surligas + + +
+ + + 431 + + + + PE0SAT-11 + + + + + JO21mr + + @5m + + + + 112733 + + + + 15 + + + + + Yagi (UHF) + + + + + + PE0SAT + + +
+ + + 432 + + + + kc1fha + + + + + FN42fu + + @73m + + + + 15568 + + + + 7 + + + + + Ground Plane (VHF) + + + + Ground Plane (ULF, VLF, LF, MF, HF, VHF, UHF) + + + + + + John O'Neil + + +
+ + + 438 + + + + Timisoara - VHF QHA + + + + + KN05or + + @80m + + + + 21890 + + + + 0 + + + + + Quadrafilar (VHF) + + + + + + Mircea + + +
+ + + 446 + + + + IK1JNS-VHF + + + + + JN44ca + + @4m + + + + 10603 + + + + 3 + + + + + Turnstile (VHF) + + + + + + Pino IK1JNS + + +
+ + + 452 + + + + OZ1SKY + + + + + JO56dg + + @100m + + + + 6191 + + + + 10 + + + + + Vertical (UHF) + + + + + + Brian + + +
+ + + 509 + + + + F4KLD + + + + + JN03rn + + @180m + + + + 8786 + + + + 6 + + + + + Quadrafilar (UHF) + + + + + + F5MDY + + +
+ + + 526 + + + + W2GRK - UHF + + + + + FN20tp + + @29m + + + + 5524 + + + + 2 + + + + + Quadrafilar (UHF) + + + + + + w2grk + + +
+ +
- - 0 + + + + + + + +
  • + 1 +
  • + + + +
  • + 2 +
  • + + + +
  • + 3 +
  • + + + +
  • + 4 +
  • + + + +
  • + 5 +
  • + + + + +
  • + ... +
  • +
  • + 172 +
  • + + + +
  • + + -
  • - - - Quadrafilar (VHF) - - - - - - Carl Plant - - -
    \ No newline at end of file + + + + + +
    + Query returned 20 stations. +
    + + + + + + + + + + + + + + + + + +