From 320df7e2a584c62ad21f2209fff1e0b08b093b70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B8=A5=E6=B5=A9?=
Date: Mon, 13 Jan 2025 17:21:40 +0800
Subject: [PATCH] demo

---
 .gitignore    | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++
 README.md     | 28 +++++++++++++++++++++++++++
 app-ollama.py | 28 +++++++++++++++++++++++++++
 demo.py       | 44 ++++++++++++++++++++++++++++++++
 main.py       | 16 ++++++++++++
 5 files changed, 186 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 app-ollama.py
 create mode 100644 demo.py
 create mode 100644 main.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4ed417e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,70 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+*.egg
+
+# Virtual environments
+venv/
+env/
+.env/
+.venv/
+ENV/
+
+# IDEs and editors
+.idea/
+.vscode/
+*.swp
+*.swo
+*.swn
+*~
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# Local development settings
+.env
+.env.local
+.env.*.local
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.pytest_cache/
+
+# Logs
+*.log
+logs/
+
+# Database
+*.db
+*.sqlite3
+*.sqlite
+
+# Additional Python-specific files
+*.pyc
+*.pyo
+*.pyd
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b701882
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# ScrapeGraphAI
+
+ScrapeGraphAI is an AI tool for web crawling and data scraping.
+
+https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/chinese.md
+
+
+## Reference
+https://www.aivi.fyi/aiagents/introduce-ScrapeGraphAI+LangChain+LangGraph#可用管道
+
+## Dependencies
+```
+pip install scrapegraphai
+playwright install
+pip install -U duckduckgo-search
+pip install scrapegraphai'[other-language-models]'
+pip install scrapegraphai'[more-semantic-options]'
+pip install scrapegraphai'[more-browser-options]'
+
+ollama pull mistral-nemo
+ollama list
+```
+
+## Tips
+- Comment
+  - APIs for small-parameter models are much cheaper than calling gpt-4o
+  - Playwright + plugins can handle a fair share of captchas; with an LLM on top, captchas are basically no longer a problem
+  - This repo is essentially a traditional crawler wrapped in an AI shell: the data-parsing step is done by the AI instead of hard-coded rules, while anti-bot measures still have to be handled with IP proxies (residential IP providers are best) plus Playwright, or by attaching Chrome Driver & Selenium to a running Chrome process
diff --git a/app-ollama.py b/app-ollama.py
new file mode 100644
index 0000000..d06c7ce
--- /dev/null
+++ b/app-ollama.py
@@ -0,0 +1,28 @@
+from scrapegraphai.graphs import SmartScraperGraph
+
+graph_config = {
+    "llm": {
+        # "model": "ollama/mistral-nemo:latest",
+        "model": "ollama/mistral:latest",
+        "temperature": 0,  # execute the task more accurately
+        "format": "json",  # Ollama needs the output format specified explicitly
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+
+    },
+    "verbose": True,
+    # "headless": False,
+}
+
+# Scrape the article titles from the page
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List all article titles on the page",
+    # prompt="列出页面上的所有文章标题",
+    source="https://www.aivi.fyi/",
+    config=graph_config
+)
+
+# Run the pipeline
+result = smart_scraper_graph.run()
+print(result)
diff --git a/demo.py b/demo.py
new file mode 100644
index 0000000..e2584c1
--- /dev/null
+++ b/demo.py
@@ -0,0 +1,44 @@
+from scrapegraphai.graphs import SmartScraperGraph
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral-nemo:latest",
+        "temperature": 0,  # execute the task more accurately
+        "format": "json",  # Ollama needs the output format specified explicitly
+        # "base_url": "http://localhost:11434",  # set the Ollama URL
+        # The LLM understands the prompt "List me all the projects with their descriptions" and extracts the relevant information from the page content
+        # - LLM model (ollama/mistral-nemo:latest):
+        #   - understands and processes natural language
+        #   - interprets the user's prompt
+        #   - analyses the page content and generates a response that meets the requirements
+        #   - organizes the information into a structured format (JSON here)
+        #   - mainly handles understanding and generation tasks
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        # "base_url": "http://localhost:11434",  # set the Ollama URL
+        # The embeddings model helps the system understand the semantic relationships between different parts of the page, ensuring that extracted projects and descriptions actually belong together
+        # - Embeddings model (ollama/nomic-embed-text):
+        #   - vectorizes text
+        #   - converts text into numeric vectors so similarity can be computed
+        #   - helps the system understand semantic relationships in the text
+        #   - used to match related content and compute similarity
+        #   - mainly handles semantic search and matching tasks
+    },
+    # - Benefits of this dual-model architecture:
+    #   - clear division of labour: each model focuses on what it does best
+    #   - better accuracy and efficiency
+    #   - better understanding of the semantics and structure of the page content
+    "verbose": True,
+    # "headless": False,
+}
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me all the projects with their descriptions",
+    # a string of already-downloaded HTML is also accepted here
+    source="https://perinim.github.io/projects",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..a444c2c
--- /dev/null
+++ b/main.py
@@ -0,0 +1,16 @@
+# This is a sample Python script.
+
+# Press ⌃F5 to execute it or replace it with your code.
+# Double-press ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
+
+
+def print_hi(name):
+    # Use a breakpoint in the code line below to debug your script.
+    print(f'Hi, {name}')  # Press F9 to toggle the breakpoint.
+
+
+# Press the green button in the gutter to run the script.
+if __name__ == '__main__':
+    print_hi('PyCharm')
+
+# See https://www.jetbrains.com/help/pycharm/ for PyCharm help