demo

2025-01-13 17:21:40 +08:00
parent 456f56e40d
commit 320df7e2a5
5 changed files with 186 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,70 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 dist/
 build/
 *.egg-info/
 *.egg
 # Virtual environments
 venv/
 env/
 .env/
 .venv/
 ENV/
 # IDEs and editors
 .idea/
 .vscode/
 *.swp
 *.swo
 *.swn
 *~
 # Jupyter Notebook
 .ipynb_checkpoints
 # Local development settings
 .env
 .env.local
 .env.*.local
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 .pytest_cache/
 # Logs
 *.log
 logs/
 # Database
 *.db
 *.sqlite3
 *.sqlite
 # Additional Python-specific files
 *.pyc
 *.pyo
 *.pyd
 # OS generated files
 .DS_Store
 .DS_Store?
 ._*
 .Spotlight-V100
 .Trashes
 ehthumbs.db
 Thumbs.db
--- a/README.md
+++ b/README.md
@@ -0,0 +1,28 @@
 # ScrapeGraphAI
 ScrapeGraphAI是一个用于网络爬虫和数据抓取的AI工具。
 https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/chinese.md
 ## Reference
 https://www.aivi.fyi/aiagents/introduce-ScrapeGraphAI+LangChain+LangGraph#可用管道
 ## Dependencies
 ```
 pip install scrapegraphai
 playwright install
 pip install -U duckduckgo-search
 pip install scrapegraphai'[other-language-models]'
 pip install scrapegraphai'[more-semantic-options]'
 pip install scrapegraphai'[more-browser-options]'
 ollama pull mistral-nemo
 ollama pull mistral
 ollama pull nomic-embed-text
 ollama list
 ```
 ## Tips
 - Comment
  - 调用小参数模型的 API，比调用 GPT-4o 省钱很多
  - Playwright + plugins 能解决一部分 CAPTCHA；如果再加上 LLM，基本就不是什么问题了
  - 这个 repo 本质上是给传统爬虫套了一层 AI 的壳子：数据解析部分改用 AI 来做，代替以前的硬编码（hard code）规则；反爬仍然只能通过 IP 代理（最好是住宅 IP 供应商）+ Playwright，或者用 ChromeDriver & Selenium attach 到已运行的 Chrome 进程来解决
--- a/app-ollama.py
+++ b/app-ollama.py
@@ -0,0 +1,28 @@
 from scrapegraphai.graphs import SmartScraperGraph
 # Settings for the Ollama chat model used by the scraper.
 # "ollama/mistral-nemo:latest" was tried as an alternative value for "model".
 llm_settings = {
    "model": "ollama/mistral:latest",
    # Temperature 0 so the model carries out the task more precisely.
    "temperature": 0,
    # Ollama needs the output format to be specified explicitly.
    "format": "json",
 }
 # Settings for the Ollama embedding model.
 embedding_settings = {"model": "ollama/nomic-embed-text"}
 # Complete configuration handed to the graph.
 # Optional entry currently left disabled: "headless": False
 graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
    "verbose": True,
 }
 # Scrape the article titles from the page.
 # (The same prompt can also be phrased in Chinese.)
 smart_scraper_graph = SmartScraperGraph(
    prompt="List all article titles on the page",
    source="https://www.aivi.fyi/",
    config=graph_config,
 )
 # Run the pipeline and print whatever it returns.
 result = smart_scraper_graph.run()
 print(result)
--- a/demo.py
+++ b/demo.py
@@ -0,0 +1,44 @@
 from scrapegraphai.graphs import SmartScraperGraph
 # LLM settings (author's notes, translated):
 # - The LLM (ollama/mistral-nemo:latest) interprets the prompt
 #   "List me all the projects with their descriptions" and extracts the
 #   relevant information from the page content.
 # - It is used to understand and process natural language, including the
 #   user's prompt.
 # - It analyses the page content and generates a response that meets the
 #   request, organised in a structured format (JSON here).
 # - It mainly handles the understanding and generation tasks.
 # An optional "base_url": "http://localhost:11434" entry can set the Ollama URL.
 llm_settings = {
    "model": "ollama/mistral-nemo:latest",
    # Temperature 0 so the model carries out the task more precisely.
    "temperature": 0,
    # Ollama needs the output format to be specified explicitly.
    "format": "json",
 }
 # Embeddings settings (author's notes, translated):
 # - The embeddings model (ollama/nomic-embed-text) helps the system relate the
 #   different parts of the page semantically, so that each extracted project
 #   is paired with its own description.
 # - It is used for text vectorisation: text is turned into numeric vectors so
 #   that similarity can be computed.
 # - It is used to match related content and mainly handles the semantic
 #   search and matching tasks.
 # An optional "base_url": "http://localhost:11434" entry can set the Ollama URL.
 embedding_settings = {"model": "ollama/nomic-embed-text"}
 # Benefits of this two-model setup, according to the author:
 # - Clear division of labour: each model focuses on what it does best.
 # - Better accuracy and efficiency of the system.
 # - Better understanding of the semantics and structure of the page content.
 # Optional entry currently left disabled: "headless": False
 graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
    "verbose": True,
 }
 # `source` also accepts a string containing already-downloaded HTML.
 smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their descriptions",
    source="https://perinim.github.io/projects",
    config=graph_config,
 )
 # Run the pipeline and print whatever it returns.
 result = smart_scraper_graph.run()
 print(result)
--- a/main.py
+++ b/main.py
@@ -0,0 +1,16 @@
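 # This is a sample Python script.
 # Press ⌃F5 to execute it or replace it with your code.
 # Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.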
 def print_hi(name):
    """Print a short greeting for ``name`` to standard output.

    Args:
        name: Value interpolated into the greeting after ``Hi, ``.
    """
    greeting = f'Hi, {name}'
    # Use a breakpoint on the line below to debug the script
    # (press F9 to toggle the breakpoint).
    print(greeting)
 # Press the green button in the gutter to run the script.
 if __name__ == "__main__":
    # Only greet when this file is executed directly.
    print_hi("PyCharm")
 # See PyCharm help at https://www.jetbrains.com/help/pycharm/