bs4-script.py

This commit is contained in:
严浩
2025-01-16 15:57:50 +08:00
parent c7f165439c
commit b6809e90ed
6 changed files with 1711 additions and 189 deletions

View File

@ -4,10 +4,11 @@ ScrapeGraphAI是一个用于网络爬虫和数据抓取的AI工具。
- https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/chinese.md - https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/chinese.md
- https://github.com/ScrapeGraphAI/ScrapegraphLib-Examples - https://github.com/ScrapeGraphAI/ScrapegraphLib-Examples
- https://github.com/ScrapeGraphAI/ScrapegraphLib-Examples/blob/main/extras/authenticated_playwright.py
## Reference ## Reference
https://www.aivi.fyi/aiagents/introduce-ScrapeGraphAI+LangChain+LangGraph#可用管道 https://www.aivi.fyi/aiagents/introduce-ScrapeGraphAI+LangChain+LangGraph
## Dependencies ## Dependencies
``` ```

28
ScriptCreator.py Normal file
View File

@ -0,0 +1,28 @@
from scrapegraphai.graphs import ScriptCreatorGraph

# Pipeline configuration: a local Ollama model plus the library the
# generated scraper should target.
# Pipeline reference:
# https://scrapegraph-ai.readthedocs.io/en/latest/scrapers/types.html#scriptcreatorgraph-scriptcreatormultigraph
graph_config = {
    "llm": {
        "model": "ollama/mistral-nemo:12b",
        "model_tokens": 1024000,
        "temperature": 0,
    },
    "library": "beautifulsoup4",
    "verbose": True,
    # "headless": False,
}

# #############################
# Instruction for the script generator (kept verbatim).
prompt = "创建一个Python脚本来抓取页面上的所有站点以及站点信息"

# Feed a locally saved snapshot of the stations page as the source.
with open('source/satnogs.html', encoding='utf-8') as file:
    source = file.read()

# Build the graph, run it, and print the generated scraping script.
script_creator_graph = ScriptCreatorGraph(
    prompt=prompt,
    source=source,
    config=graph_config,
    # schema=schema
)
print(script_creator_graph.run())

View File

@ -24,25 +24,22 @@ graph_config = {
# "headless": False, # "headless": False,
} }
# ############################# # #############################
prompt="List all article titles on the page" # prompt = "List all article titles on the page"
prompt="列出页面上的所有文章标题" # prompt = "列出页面上的所有文章标题"
source="https://www.aivi.fyi/" # source = "https://www.aivi.fyi/"
#
# ############################# # # #############################
prompt="List all Stations on the page." # prompt = "List all Stations on the page."
source="https://network.satnogs.org/stations/" # source = "https://network.satnogs.org/stations/"
# ############################# # #############################
# prompt="列出页面上的所有站点。" # prompt="列出页面上的所有站点。"
prompt="列出页面上的所有站点以及站点信息" prompt = "列出页面上的所有站点以及站点信息"
# prompt="列出页面上的所有站点以及站点信息。antennas需要是一个数组。" # prompt="列出页面上的所有站点以及站点信息。antennas需要是一个数组。"
source=""
with open('source/satnogs.html', 'r', encoding='utf-8') as file: with open('source/satnogs.html', 'r', encoding='utf-8') as file:
source = file.read() source = file.read()
# ************************************************ # ************************************************
# Create the SmartScraperGraph instance and run it # Create the SmartScraperGraph instance and run it
# ************************************************ # ************************************************

175
bs4-script.py Normal file
View File

@ -0,0 +1,175 @@
"""
提示词: 在文件result-script.py创建一个Python脚本来抓取页面上的所有站点以及站点信息
with open('source/satnogs.html', 'r', encoding='utf-8') as file:
content = file.read()
"""
import json
import os
import re
from collections import Counter

import requests
from bs4 import BeautifulSoup
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace disappears as well, because str.split()
    with no argument drops empty fields at both ends.
    """
    words = text.split()
    return " ".join(words)
def extract_station_info(html_content):
    """Parse the SatNOGS stations listing HTML into a list of station dicts.

    Each dict holds: 'id', 'name', 'total_observations',
    'future_observations', 'antennas' (list of type/frequency_range dicts),
    plus 'location' and 'owner' when present in the row.

    NOTE(review): the selectors assume the markup of
    https://network.satnogs.org/stations/ at capture time; a row missing the
    'station' span or having fewer than 3 <td> cells would raise here —
    confirm against a current snapshot.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    stations = []
    # Each station is rendered as one <tr class="station-row">.
    for row in soup.find_all('tr', class_='station-row'):
        station = {}
        # Station ID text, e.g. "1234".
        station['id'] = row.find('span', class_='station').text.strip()
        # Station name lives in the second cell.
        station['name'] = row.find_all('td')[1].text.strip()
        # Location: third cell carries a tooltip span with the coordinates.
        location_td = row.find_all('td')[2]
        location_span = location_td.find('span', {'data-toggle': 'tooltip'})
        if location_span:
            # Coordinates come from the tooltip title attribute.
            coordinates = location_span['title'].strip()
            # Grid locator is the visible text before the '@' separator.
            grid = location_span.text.strip().split('@')[0].strip()
            # Altitude appears as "@<meters>m" inside the cell text.
            altitude_text = location_td.text
            altitude_match = re.search(r'@(\d+)m', altitude_text)
            altitude = f"{altitude_match.group(1)}m" if altitude_match else "N/A"
            station['location'] = {
                'coordinates': coordinates,  # e.g. "39.236°, -86.305°"
                'grid': grid,  # e.g. "EM69uf"
                'altitude': altitude  # e.g. "280m"
            }
        # Total observation count (success badge); '0' when the badge is absent.
        total_obs = row.find('a', class_='badge-success')
        station['total_observations'] = total_obs.text.strip() if total_obs else '0'
        # Future (scheduled) observation count (info badge); '0' when absent.
        future_obs = row.find('a', class_='badge-info')
        station['future_observations'] = future_obs.text.strip() if future_obs else '0'
        # Antennas: one pill per antenna, title attribute holds the range.
        antennas = []
        for antenna_span in row.find_all('span', class_='antenna-pill'):
            freq_range = antenna_span['title'].strip()
            antenna_type = antenna_span.text.strip()
            antennas.append({
                'type': antenna_type,
                'frequency_range': freq_range
            })
        station['antennas'] = antennas
        # Owner link sits in the last cell; key omitted when not present.
        owner_link = row.find_all('td')[-1].find('a')
        if owner_link:
            station['owner'] = {
                'name': owner_link.text.strip(),
                'profile_url': owner_link['href']
            }
        stations.append(station)
    return stations
def get_content(use_local=True, url='https://network.satnogs.org/stations/?page=3', timeout=30):
    """Return the stations page HTML from a local snapshot or over HTTP.

    Args:
        use_local: Prefer reading 'source/satnogs.html'; falls back to the
            network when the file is missing.
        url: Page fetched when downloading.
        timeout: Seconds to wait for the HTTP request. Fix: the original
            call passed no timeout, so a stalled server could hang the
            script forever.

    Returns:
        The HTML text, or None when both local and network sources fail.
    """
    if use_local:
        try:
            with open('source/satnogs.html', 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print("本地文件不存在,将尝试从网络获取数据")
            use_local = False
    if not use_local:
        try:
            # Bounded wait; requests.Timeout is a RequestException subclass,
            # so a timeout lands in the handler below like any other failure.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            print("成功从网络获取数据")
            return response.text
        except requests.RequestException as e:
            print(f"获取网页数据时出错: {e}")
            return None
def main():
# 设置数据源 (True为本地文件False为网络请求)
use_local = False
# 获取内容
content = get_content(use_local)
if not content:
print("无法获取数据,程序退出")
return
# 提取站点信息
stations = extract_station_info(content)
# 确保_tmp目录存在
os.makedirs('_tmp', exist_ok=True)
# 将结果保存为JSON文件
with open('_tmp/stations.json', 'w', encoding='utf-8') as f:
json.dump(stations, f, ensure_ascii=False, indent=2)
# 打印统计信息
print(f"已成功提取 {len(stations)} 个站点的信息")
print("详细信息已保存到 _tmp/stations.json 文件中")
# 打印一些数据统计
total_observations = sum(int(station['total_observations']) for station in stations)
print(f"所有站点总观测数: {total_observations}")
# 统计天线类型
antenna_types = {}
for station in stations:
for antenna in station['antennas']:
antenna_type = antenna['type']
antenna_types[antenna_type] = antenna_types.get(antenna_type, 0) + 1
print("\n天线类型统计:")
for antenna_type, count in sorted(antenna_types.items()):
print(f"{antenna_type}: {count}")
# 统计频段分布
print("\n频段分布:")
vhf_count = uhf_count = other_count = 0
for station in stations:
for antenna in station['antennas']:
if 'VHF' in antenna['type']:
vhf_count += 1
if 'UHF' in antenna['type']:
uhf_count += 1
if not ('VHF' in antenna['type'] or 'UHF' in antenna['type']):
other_count += 1
print(f"VHF频段天线: {vhf_count}")
print(f"UHF频段天线: {uhf_count}")
print(f"其他频段天线: {other_count}")
# 打印海拔分布
altitudes = []
for station in stations:
alt = station['location']['altitude']
if alt != 'N/A':
altitudes.append(int(alt[:-1])) # 移除'm'并转换为整数
if altitudes:
print(f"\n海拔统计:")
print(f"最高海拔: {max(altitudes)}m")
print(f"最低海拔: {min(altitudes)}m")
print(f"平均海拔: {sum(altitudes) / len(altitudes):.1f}m")
if __name__ == "__main__":
main()

0
scrape_stations.py Normal file
View File

File diff suppressed because it is too large Load Diff