Firecrawl 允许您将整个网站转换为 LLM 就绪的 Markdown
pip install firecrawl-py
# Example: crawl a site and print the Markdown of each returned page.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="YOUR_API_KEY")

# 'excludes' is given the pattern 'blog/*' — presumably URL paths to leave
# out of the crawl; confirm against the SDK reference.
crawl_result = app.crawl_url(
    'docs.firecrawl.dev',
    {'crawlerOptions': {'excludes': ['blog/*']}},
)

# Get the markdown
for result in crawl_result:
    print(result['markdown'])
wait_until_done
false
{ "jobId": "1234-5678-9101" }
# Query the crawl job identified by job_id.
# NOTE(review): neither `app` nor `job_id` is defined in this fragment —
# presumably `app` is a FirecrawlApp instance and `job_id` is the "jobId"
# value from a crawl response; confirm.
status = app.check_crawl_status(job_id)
{ "status": "completed", "current": 22, "total": 22, "data": [ { "content": "Raw Content ", "markdown": "# Markdown Content", "provider": "web-scraper", "metadata": { "title": "Firecrawl | Scrape the web reliably for your LLMs", "description": "AI for CX and Sales", "language": null, "sourceURL": "https://docs.firecrawl.dev/" } } ] }
scrape_url
# Example: scrape a single URL.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="YOUR_API_KEY")

# The call's result is bound to `content`; its shape is not shown here.
content = app.scrape_url("https://docs.firecrawl.dev")
{ "success": true, "data": { "markdown": "<string>", "content": "<string>", "html": "<string>", "rawHtml": "<string>", "metadata": { "title": "<string>", "description": "<string>", "language": "<string>", "sourceURL": "<string>", "<any other metadata> ": "<string>", "pageStatusCode": 123, "pageError": "<string>" }, "llm_extraction": {}, "warning": "<string>" } }
# Example: scrape a page in 'llm-extraction' mode against a JSON schema.
# NOTE(review): this snippet uses `BaseModel`, `Field`, `List` and `app`
# without defining them — presumably pydantic's BaseModel/Field,
# typing.List, and a FirecrawlApp instance; confirm and import accordingly.


class ArticleSchema(BaseModel):
    # Fields describing one article entry.
    title: str
    points: int
    by: str
    commentsURL: str


class TopArticlesSchema(BaseModel):
    # Required list of articles, declared with max_items=5.
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")


data = app.scrape_url(
    'https://news.ycombinator.com',
    {
        'extractorOptions': {
            # The schema is passed as the dict produced by model_json_schema().
            'extractionSchema': TopArticlesSchema.model_json_schema(),
            'mode': 'llm-extraction',
        },
        'pageOptions': {
            'onlyMainContent': True,
        },
    },
)
print(data["llm_extraction"])