demo
This commit is contained in:
70
.gitignore
vendored
Normal file
70
.gitignore
vendored
Normal file
@ -0,0 +1,70 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
dist/
|
||||
build/
|
||||
*.egg-info/
|
||||
*.egg
|
||||
|
||||
# Virtual environments
|
||||
venv/
|
||||
env/
|
||||
.env/
|
||||
.venv/
|
||||
ENV/
|
||||
|
||||
# IDEs and editors
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
*.swn
|
||||
*~
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# Local development settings
|
||||
.env
|
||||
.env.local
|
||||
.env.*.local
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
.pytest_cache/
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
logs/
|
||||
|
||||
# Database
|
||||
*.db
|
||||
*.sqlite3
|
||||
*.sqlite
|
||||
|
||||
# Additional Python-specific files
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
|
||||
# OS generated files
|
||||
.DS_Store
|
||||
.DS_Store?
|
||||
._*
|
||||
.Spotlight-V100
|
||||
.Trashes
|
||||
ehthumbs.db
|
||||
Thumbs.db
|
28
README.md
Normal file
28
README.md
Normal file
@ -0,0 +1,28 @@
|
||||
# ScrapeGraphAI
|
||||
|
||||
ScrapeGraphAI是一个用于网络爬虫和数据抓取的AI工具。
|
||||
|
||||
https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/chinese.md
|
||||
|
||||
|
||||
## Reference
|
||||
https://www.aivi.fyi/aiagents/introduce-ScrapeGraphAI+LangChain+LangGraph#可用管道
|
||||
|
||||
## Dependencies
|
||||
```
|
||||
pip install scrapegraphai
|
||||
playwright install
|
||||
pip install -U duckduckgo-search
|
||||
pip install scrapegraphai'[other-language-models]'
|
||||
pip install scrapegraphai'[more-semantic-options]'
|
||||
pip install scrapegraphai'[more-browser-options]'
|
||||
|
||||
ollama pull mistral-nemo
|
||||
ollama list
|
||||
```
|
||||
|
||||
## Tips
|
||||
- Comment
|
||||
- 小参数模型的api,比调用gpt-4o的省钱很多
|
||||
- Playwright +plugins 能解决一部分captcha。如果再加上llm,基本就不是什么问题了
|
||||
- 这个repo就是传统爬虫套了一个ai的壳子,数据解析部分用ai来做代替以前的hard code, 反爬只能通过ip proxy (家宅ip供应商最好) + playwright or chrome driver&selenium attach到 chrome进程来解决
|
28
app-ollama.py
Normal file
28
app-ollama.py
Normal file
@ -0,0 +1,28 @@
|
||||
"""Scrape article titles from a page with an Ollama-backed SmartScraperGraph.

Requires a local Ollama server with the ``mistral`` and ``nomic-embed-text``
models pulled (see README).
"""
from scrapegraphai.graphs import SmartScraperGraph

# Pipeline configuration: local Ollama models for both the LLM and embeddings.
graph_config = {
    "llm": {
        # "model": "ollama/mistral-nemo:latest",
        "model": "ollama/mistral:latest",
        "temperature": 0,   # deterministic output -> more faithful extraction
        "format": "json",   # Ollama needs the output format stated explicitly
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
    },
    "verbose": True,
    # "headless": False,
}


def main() -> None:
    """Build the scraping graph, run it and print the extracted titles."""
    # Scrape the article titles from the page.
    smart_scraper_graph = SmartScraperGraph(
        prompt="List all article titles on the page",
        source="https://www.aivi.fyi/",
        config=graph_config,
    )

    # Run the pipeline.
    result = smart_scraper_graph.run()
    print(result)


if __name__ == "__main__":
    # Guard the network-bound scrape so importing this module has no side effects.
    main()
|
44
demo.py
Normal file
44
demo.py
Normal file
@ -0,0 +1,44 @@
|
||||
"""Scrape project listings with a local-Ollama SmartScraperGraph pipeline.

The pipeline splits the work between two local Ollama models:

* the LLM (``mistral-nemo``) understands the prompt ("List me all the
  projects with their descriptions"), analyses the page content and emits
  a structured JSON answer;
* the embedding model (``nomic-embed-text``) turns text into vectors so
  the system can relate page fragments by semantic similarity, keeping
  each extracted project paired with its own description.

This two-model split keeps each model focused on what it does best, which
improves both accuracy and efficiency.
"""
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "model": "ollama/mistral-nemo:latest",
        "temperature": 0,   # deterministic output -> more faithful extraction
        "format": "json",   # Ollama needs the output format stated explicitly
        # "base_url": "http://localhost:11434",  # Ollama URL (default)
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        # "base_url": "http://localhost:11434",  # Ollama URL (default)
    },
    "verbose": True,
    # "headless": False,
}


def main() -> None:
    """Build the scraping graph, run it and print the extracted projects."""
    smart_scraper_graph = SmartScraperGraph(
        prompt="List me all the projects with their descriptions",
        # `source` also accepts a string of already-downloaded HTML.
        source="https://perinim.github.io/projects",
        config=graph_config,
    )

    result = smart_scraper_graph.run()
    print(result)


if __name__ == "__main__":
    # Guard the network-bound scrape so importing this module has no side effects.
    main()
|
16
main.py
Normal file
16
main.py
Normal file
@ -0,0 +1,16 @@
|
||||
# Sample PyCharm starter script.

# Press ⌃F5 to run it, or replace it with your own code.
# Double-press ⇧ to search classes, files, tool windows, actions and settings.


def print_hi(name):
    """Print a greeting for *name* on stdout."""
    # Put a breakpoint on the line below to debug the script (F9 toggles it).
    greeting = f'Hi, {name}'
    print(greeting)


# Click the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')

# Visit https://www.jetbrains.com/help/pycharm/ for PyCharm help
|
Reference in New Issue
Block a user