# ScrapeGraphAI-experiments/demo.py

from scrapegraphai.graphs import SmartScraperGraph

# Language-model settings (ollama/mistral-nemo:latest).
# Author's notes on the role of this model:
#   - understands and processes natural language, including the user's prompt
#     ("List me all the projects with their descriptions");
#   - analyses the page content and extracts the relevant information;
#   - organises the answer into a structured format (JSON here);
#   - mainly handles the understanding and generation tasks.
_llm_settings = {
    "model": "ollama/mistral-nemo:latest",
    "temperature": 0,  # for more accurate task execution
    "format": "json",  # Ollama requires the format to be specified explicitly
    # "base_url": "http://localhost:11434",  # set the Ollama URL
}

# Embeddings-model settings (ollama/nomic-embed-text).
# Author's notes on the role of this model:
#   - vectorises text, i.e. converts it into numeric vectors so similarity
#     can be computed;
#   - helps the system understand semantic relationships between different
#     parts of the page, so extracted projects and descriptions line up;
#   - used for matching related content and similarity calculation;
#   - mainly handles semantic search and matching tasks.
_embeddings_settings = {
    "model": "ollama/nomic-embed-text",
    # "base_url": "http://localhost:11434",  # set the Ollama URL
}

# Full graph configuration passed to the scraper.
# Author's notes on the benefits of this two-model setup:
#   - clear division of labour: each model focuses on what it does best;
#   - improved accuracy and efficiency of the system;
#   - better understanding of the semantics and structure of the page content.
graph_config = {
    "llm": _llm_settings,
    "embeddings": _embeddings_settings,
    "verbose": True,
    # "headless": False,
}

# Assemble the scraping graph from the configuration, the target page and the
# natural-language instruction describing what to extract.
smart_scraper_graph = SmartScraperGraph(
    config=graph_config,
    # Per the author's note, a string of already-downloaded HTML is accepted
    # here as well.
    source="https://perinim.github.io/projects",
    prompt="List me all the projects with their descriptions",
)

# Execute the graph and print whatever it returns.
result = smart_scraper_graph.run()
print(result)