Firecrawl 允许您将整个网站转换为 LLM 就绪的 markdown
# Install the Firecrawl Python SDK from PyPI.
pip install firecrawl-py
from firecrawl import FirecrawlApp

# Create a client authenticated with your Firecrawl API key.
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

# Crawl a website: visit up to 100 pages and return each page as both
# markdown and HTML, polling the job status every 30 seconds.
crawl_status = app.crawl_url(
    'https://firecrawl.dev',
    params={
        'limit': 100,
        'scrapeOptions': {'formats': ['markdown', 'html']},
    },
    poll_interval=30,
)
print(crawl_status)
async crawl
ID
{ "success": true, "id": "123-456-789", "url": "https://api.firecrawl.dev/v1/crawl/123-456-789" }
# Look up the current status of a previously started crawl job by its id.
crawl_status = app.check_crawl_status("<crawl_id>")
print(crawl_status)
next
{ "status": "scraping", "total": 36, "completed": 10, "creditsUsed": 10, "expiresAt": "2024-12-31T00:00:00.000Z", "next": "https://api.firecrawl.dev/v1/crawl/123-456-789?skip=10", "data": [ { "markdown": "[Firecrawl Docs home page!...", "html": "<!DOCTYPE html><html lang=\"en\" class=\"js-focus-visible lg:[--scroll-mt:9.5rem]\" data-js-focus-visible=\"\">...", "metadata": { "title": "使用 Groq Llama 3 构建 '与网站聊天' | Firecrawl", "language": "en", "sourceURL": "https://docs.firecrawl.dev/learn/rag-llama3", "description": "学习如何使用 Firecrawl、Groq Llama 3 和 Langchain 构建一个 '与您的网站聊天' 机器人。", "ogLocaleAlternate": [], "statusCode": 200 } }, ... ] }
scrape_url
from firecrawl import FirecrawlApp

# Create a client authenticated with your Firecrawl API key.
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

# Scrape a single page, requesting both markdown and HTML output.
scrape_result = app.scrape_url(
    'firecrawl.dev',
    params={'formats': ['markdown', 'html']},
)
print(scrape_result)
{ "success": true, "data" : { "markdown": "Launch Week I来了[💥免费获得两个月...", "html": "<!DOCTYPE html><html lang=\"zh\" class=\"light\" style=\"color-scheme: light;\"><body class=\"__variable_36bd41 __variable_d7dc5d font-inter ...", "metadata": { "title": "首页 - Firecrawl", "description": "Firecrawl抓取并将任何网站转换为干净的Markdown。", "language": "zh", "keywords": "Firecrawl,Markdown,Data,Mendable,Langchain", "robots": "follow, index", "ogTitle": "Firecrawl", "ogDescription": "将任何网站转换为LLM就绪数据。", "ogUrl": "https://www.firecrawl.dev/", "ogImage": "https://www.firecrawl.dev/og.png?123", "ogLocaleAlternate": [], "ogSiteName": "Firecrawl", "sourceURL": "https://firecrawl.dev", "statusCode": 200 } } }
from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field

# Initialize FirecrawlApp with your API key.
app = FirecrawlApp(api_key='your_api_key')


class ExtractSchema(BaseModel):
    """Structured fields Firecrawl should extract from the page."""
    company_mission: str
    supports_sso: bool
    is_open_source: bool
    is_in_yc: bool


# Scrape the page and ask Firecrawl for JSON matching the schema.
# Options are passed via the `params` keyword, consistent with the other
# scrape_url examples in this document (the original passed the dict
# positionally).
data = app.scrape_url(
    'https://docs.firecrawl.dev/',
    params={
        'formats': ['json'],
        'jsonOptions': {
            'schema': ExtractSchema.model_json_schema(),
        },
    },
)
print(data["json"])
{ "success": true, "data": { "json": { "company_mission": "训练一个安全的人工智能,利用您的技术资源回答客户和员工的问题,这样您的团队就不必这样做了", "supports_sso": true, "is_open_source": false, "is_in_yc": true }, "metadata": { "title": "Mendable", "description": "Mendable让您轻松构建AI聊天应用。摄取、定制,然后只需一行代码即可在您想要的任何地方部署。由SideGuide提供支持", "robots": "follow, index", "ogTitle": "Mendable", "ogDescription": "Mendable让您轻松构建AI聊天应用。摄取、定制,然后只需一行代码即可在您想要的任何地方部署。由SideGuide提供支持", "ogUrl": "https://docs.firecrawl.dev/", "ogImage": "https://docs.firecrawl.dev/mendable_new_og1.png", "ogLocaleAlternate": [], "ogSiteName": "Mendable", "sourceURL": "https://docs.firecrawl.dev/" } } }
prompt
# Scrape a page via the Firecrawl v1 REST API, using jsonOptions.prompt to
# extract the company mission without defining an explicit schema.
curl -X POST https://api.firecrawl.dev/v1/scrape \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -d '{ "url": "https://docs.firecrawl.dev/", "formats": ["json"], "jsonOptions": { "prompt": "从页面中提取公司使命。" } }'
{ "success": true, "data": { "json": { "company_mission": "训练一个安全的人工智能,使用您的技术资源回答客户和员工的问题,以便您的团队不必这样做" }, "metadata": { "title": "Mendable", "description": "Mendable 允许您轻松构建 AI 聊天应用程序。摄取、自定义,然后只需一行代码即可在您想要的任何位置部署。由 SideGuide 提供支持", "robots": "follow, index", "ogTitle": "Mendable", "ogDescription": "Mendable 允许您轻松构建 AI 聊天应用程序。摄取、自定义,然后只需一行代码即可在您想要的任何位置部署。由 SideGuide 提供支持", "ogUrl": "https://docs.firecrawl.dev/", "ogImage": "https://docs.firecrawl.dev/mendable_new_og1.png", "ogLocaleAlternate": [], "ogSiteName": "Mendable", "sourceURL": "https://docs.firecrawl.dev/" } } }
from typing import List

from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field

# Legacy v0 example. FirecrawlApp, BaseModel, Field, and List were not
# imported in the original snippet; they are required for it to run.
app = FirecrawlApp(version="v0")


class ArticleSchema(BaseModel):
    """One Hacker News story entry."""
    title: str
    points: int
    by: str
    commentsURL: str


class TopArticlesSchema(BaseModel):
    """Container holding the list of top stories."""
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")


# v0 API shape: extraction settings live under 'extractorOptions', page-level
# settings under 'pageOptions', and the result is keyed by 'llm_extraction'.
data = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})
print(data["llm_extraction"])