Scraping and Extracting Structured Data with Firecrawl

Firecrawl uses large language models (LLMs) to extract structured data from web pages efficiently. Here is how it works:

  1. Schema definition: Define the URL to scrape and the desired data schema using JSON Schema (following the OpenAI tool schema format). The schema specifies the structure of the data you want to extract from the page.

  2. Scrape endpoint: Pass the URL and the schema to the scrape endpoint (see the Scrape Endpoint Documentation).

  3. Structured data retrieval: Receive the scraped data in the schema you defined, ready to use in your application or for further processing.

This approach streamlines data extraction, reducing manual work and improving efficiency.

## Extract structured data

### /scrape (with extract) endpoint

Used to extract structured data from scraped pages.

```bash
curl -X POST https://api.firecrawl.dev/v0/scrape \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -d '{
      "url": "https://docs.firecrawl.dev/",
      "extractorOptions": {
        "mode": "llm-extraction",
        "extractionPrompt": "Based on the information on the page, extract the information from the schema. ",
        "extractionSchema": {
          "type": "object",
          "properties": {
            "company_mission": {
              "type": "string"
            },
            "supports_sso": {
              "type": "boolean"
            },
            "is_open_source": {
              "type": "boolean"
            },
            "is_in_yc": {
              "type": "boolean"
            }
          },
          "required": [
            "company_mission",
            "supports_sso",
            "is_open_source",
            "is_in_yc"
          ]
        }
      }
    }'
```

Example response:

```json
{
  "success": true,
  "data": {
    "content": "Raw Content",
    "metadata": {
      "title": "Mendable",
      "description": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide",
      "robots": "follow, index",
      "ogTitle": "Mendable",
      "ogDescription": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide",
      "ogUrl": "https://docs.firecrawl.dev/",
      "ogImage": "https://docs.firecrawl.dev/mendable_new_og1.png",
      "ogLocaleAlternate": [],
      "ogSiteName": "Mendable",
      "sourceURL": "https://docs.firecrawl.dev/"
    },
    "llm_extraction": {
      "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to",
      "supports_sso": true,
      "is_open_source": false,
      "is_in_yc": true
    }
  }
}
```
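
The `llm_extraction` field of the response carries the data in exactly the shape the schema requested, so it can be consumed directly once the request succeeds. As an illustration only (not part of the official SDKs), here is a minimal Python sketch of the same call using the `requests` library, with a trimmed-down schema and a placeholder API key:

```python
import requests

# Illustrative sketch: mirrors the cURL request above over plain HTTP,
# not the Firecrawl SDK. Replace YOUR_API_KEY with a real key.
payload = {
    "url": "https://docs.firecrawl.dev/",
    "extractorOptions": {
        "mode": "llm-extraction",
        "extractionSchema": {
            "type": "object",
            "properties": {
                "company_mission": {"type": "string"},
                "supports_sso": {"type": "boolean"},
            },
            "required": ["company_mission", "supports_sso"],
        },
    },
}

resp = requests.post(
    "https://api.firecrawl.dev/v0/scrape",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json=payload,
    timeout=120,
)
resp.raise_for_status()

# The extracted fields live under data.llm_extraction, as in the response shown above.
extracted = resp.json()["data"]["llm_extraction"]
print(extracted["company_mission"], extracted["supports_sso"])
```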

### Using the Python SDK

```python
from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from typing import List

# Initialize FirecrawlApp with your API key
app = FirecrawlApp(api_key='your_api_key', version='v0')

# Pydantic models describing the data to extract
class ArticleSchema(BaseModel):
    title: str
    points: int
    by: str
    commentsURL: str

class TopArticlesSchema(BaseModel):
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

data = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})
print(data["llm_extraction"])
```
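
Because the extraction follows `TopArticlesSchema`, the individual stories can be read directly from the result. A small follow-up sketch, assuming the call above succeeded and returned the expected `top` list:

```python
# Each entry in "top" follows ArticleSchema: title, points, by, commentsURL
for story in data["llm_extraction"]["top"]:
    print(f"{story['points']:>4} points  {story['title']}  ({story['commentsURL']})")
```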

### Using the JavaScript SDK

```javascript
import FirecrawlApp from '@mendable/firecrawl-js';
import { z } from 'zod';

const app = new FirecrawlApp({
  apiKey: 'fc-YOUR_API_KEY',
  version: 'v0',
});

// Define the schema for the content to extract
const schema = z.object({
  top: z
    .array(
      z.object({
        title: z.string(),
        points: z.number(),
        by: z.string(),
        commentsURL: z.string(),
      })
    )
    .length(5)
    .describe('Top 5 stories on Hacker News'),
});

const scrapeResult = await app.scrapeUrl('https://news.ycombinator.com', {
  extractorOptions: { extractionSchema: schema },
});

console.log(scrapeResult.data['llm_extraction']);
```

### Using the Go SDK

```go Go
package main

import (
  "fmt"
  "log"

  "github.com/mendableai/firecrawl-go"
)

func main() {
  app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY")
  if err != nil {
      log.Fatalf("Failed to initialize FirecrawlApp: %v", err)
  }

  jsonSchema := map[string]any{
    "type": "object",
    "properties": map[string]any{
      "top": map[string]any{
        "type": "array",
        "items": map[string]any{
          "type": "object",
          "properties": map[string]any{
            "title":       map[string]string{"type": "string"},
            "points":      map[string]string{"type": "number"},
            "by":          map[string]string{"type": "string"},
            "commentsURL": map[string]string{"type": "string"},
          },
          "required": []string{"title", "points", "by", "commentsURL"},
        },
        "minItems":    5,
        "maxItems":    5,
        "description": "Top 5 stories on Hacker News",
      },
    },
    "required": []string{"top"},
  }

  llmExtractionParams := map[string]any{
    "extractorOptions": firecrawl.ExtractorOptions{
      ExtractionSchema: jsonSchema,
    },
  }

  scrapeResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
  if err != nil {
    log.Fatalf("Failed to perform LLM extraction: %v", err)
  }
  fmt.Println(scrapeResult)
}
```

### Using the Rust SDK

```rust Rust
use firecrawl::FirecrawlApp;
use serde_json::json;

#[tokio::main]
async fn main() {
    // Initialize FirecrawlApp with your API key
    let api_key = "YOUR_API_KEY";
    let api_url = "https://api.firecrawl.dev";
    let app = FirecrawlApp::new(api_key, api_url).expect("Failed to initialize FirecrawlApp");

    // Define the schema for the content to extract
    let json_schema = json!({
        "type": "object",
        "properties": {
            "top": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "points": {"type": "number"},
                        "by": {"type": "string"},
                        "commentsURL": {"type": "string"}
                    },
                    "required": ["title", "points", "by", "commentsURL"]
                },
                "minItems": 5,
                "maxItems": 5,
                "description": "Top 5 stories on Hacker News"
            }
        },
        "required": ["top"]
    });

    let llm_extraction_params = json!({
        "extractorOptions": {
            "extractionSchema": json_schema,
            "mode": "llm-extraction"
        },
        "pageOptions": {
            "onlyMainContent": true
        }
    });

    let llm_extraction_result = app
        .scrape_url("https://news.ycombinator.com", Some(llm_extraction_params))
        .await;
    match llm_extraction_result {
        Ok(data) => println!("LLM Extraction Result:\n{}", data["llm_extraction"]),
        Err(e) => eprintln!("LLM Extraction failed: {}", e),
    }
}
```