demo

2025-01-13 17:21:40 +08:00
parent 456f56e40d
commit 320df7e2a5
5 changed files with 186 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,70 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+*.egg
+
+# Virtual environments
+venv/
+env/
+.env/
+.venv/
+ENV/
+
+# IDEs and editors
+.idea/
+.vscode/
+*.swp
+*.swo
+*.swn
+*~
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# Local development settings
+.env
+.env.local
+.env.*.local
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.pytest_cache/
+
+# Logs
+*.log
+logs/
+
+# Database
+*.db
+*.sqlite3
+*.sqlite
+
+# Additional Python-specific files
+*.pyc
+*.pyo
+*.pyd
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
--- a/README.md
+++ b/README.md
@@ -0,0 +1,28 @@
+# ScrapeGraphAI
+
+ScrapeGraphAI是一个用于网络爬虫和数据抓取的AI工具。
+
+https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/chinese.md
+
+
+## Reference
+https://www.aivi.fyi/aiagents/introduce-ScrapeGraphAI+LangChain+LangGraph#可用管道
+
+## Dependencies
+```
+pip install scrapegraphai
+playwright install
+pip install -U duckduckgo-search
+pip install scrapegraphai'[other-language-models]'
+pip install scrapegraphai'[more-semantic-options]'
+pip install scrapegraphai'[more-browser-options]'
+
+ollama pull mistral-nemo
+ollama list
+```
+
+## Tips
+- Comment
+  - 调用小参数模型的 API，比调用 GPT-4o 省钱很多
+  - Playwright + plugins 能解决一部分 CAPTCHA；如果再加上 LLM，基本就不是什么问题了
+  - 这个 repo 就是传统爬虫套了一个 AI 的壳子：数据解析部分用 AI 来做，代替以前的 hard code；反爬只能通过 IP proxy（家宅 IP 供应商最好）+ Playwright，或者用 ChromeDriver & Selenium attach 到 Chrome 进程来解决
--- a/app-ollama.py
+++ b/app-ollama.py
@@ -0,0 +1,28 @@
+from scrapegraphai.graphs import SmartScraperGraph
+
+graph_config = {
+    "llm": {
+        # "model": "ollama/mistral-nemo:latest",
+        "model": "ollama/mistral:latest",
+        "temperature": 0,  # temperature 0 so the task is carried out more accurately
+        "format": "json",  # Ollama needs the output format specified explicitly
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+
+    },
+    "verbose": True,
+    # "headless": False,
+}
+
+# Scrape the article titles from the page
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List all article titles on the page",
+    # Alternative (the same prompt in Chinese): prompt="列出页面上的所有文章标题",
+    source="https://www.aivi.fyi/",
+    config=graph_config
+)
+
+# Run the pipeline
+result = smart_scraper_graph.run()
+print(result)
--- a/demo.py
+++ b/demo.py
@@ -0,0 +1,44 @@
+from scrapegraphai.graphs import SmartScraperGraph
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral-nemo:latest",
+        "temperature": 0,  # temperature 0 so the task is carried out more accurately
+        "format": "json",  # Ollama needs the output format specified explicitly
+        # "base_url": "http://localhost:11434",  # set the Ollama URL
+        # The LLM interprets the prompt "List me all the projects with their descriptions" and extracts the relevant information from the page content
+        # - LLM model (ollama/mistral-nemo:latest):
+        #   - understands and processes natural language
+        #   - interprets the user's prompt
+        #   - analyzes the page content and generates a response that meets the request
+        #   - organizes the information into a structured format (JSON here)
+        #   - mainly handles understanding and generation tasks
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        # "base_url": "http://localhost:11434",  # set the Ollama URL
+        # The embeddings model helps the system understand the semantic relationships between parts of the page, so that the extracted projects and descriptions correspond to each other
+        # - Embeddings model (ollama/nomic-embed-text):
+        #   - vectorizes text
+        #   - converts text into numeric vectors so that similarity can be computed
+        #   - helps the system understand semantic relationships in the text
+        #   - used for matching related content and computing similarity
+        #   - mainly handles semantic search and matching tasks
+    },
+    # - Benefits of this two-model architecture:
+    #   - clear division of labor: each model focuses on the task it does best
+    #   - improves the accuracy and efficiency of the system
+    #   - gives a better understanding of the semantics and structure of the page content
+    "verbose": True,
+    # "headless": False,
+}
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me all the projects with their descriptions",
+    # A string of already-downloaded HTML is also accepted as the source
+    source="https://perinim.github.io/projects",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
--- a/main.py
+++ b/main.py
@@ -0,0 +1,16 @@
+# This is a sample Python script.
+
+# Press ⌃F5 to execute it or replace it with your code.
+# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
+
+
+def print_hi(name):
+    # Print a greeting for `name`; use a breakpoint on the line below to debug the script.
+    print(f'Hi, {name}')  # Press F9 to toggle the breakpoint.
+
+
+# Press the green button in the gutter to run the script.
+if __name__ == '__main__':
+    print_hi('PyCharm')
+
+# See PyCharm help at https://www.jetbrains.com/help/pycharm/