refactor: Update Firecrawl to use v1 API (#12574)
Co-authored-by: Ademílson Tonato <[email protected]>
ftonato and Ademílson Tonato authored Jan 23, 2025
1 parent f565f08 commit 6024d8a
Showing 4 changed files with 81 additions and 81 deletions.
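
The diff below migrates every Firecrawl call from the v0 endpoints to their v1 equivalents. A quick sketch of the endpoint mapping implied by the changes (paths only; the default base URL https://api.firecrawl.dev is unchanged):

# Endpoint migration implied by this commit:
# POST /v0/scrape                 ->  POST /v1/scrape
# POST /v0/crawl                  ->  POST /v1/crawl
# GET  /v0/crawl/status/{job_id}  ->  GET  /v1/crawl/{job_id}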
105 changes: 55 additions & 50 deletions api/core/rag/extractor/firecrawl/firecrawl_app.py
@@ -1,6 +1,6 @@
import json
import time
from typing import cast
from typing import Any, cast

import requests

@@ -14,48 +14,47 @@ def __init__(self, api_key=None, base_url=None):
if self.api_key is None and self.base_url == "https://api.firecrawl.dev":
raise ValueError("No API key provided")

def scrape_url(self, url, params=None) -> dict:
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
json_data = {"url": url}
def scrape_url(self, url, params=None) -> dict[str, Any]:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/scrape
headers = self._prepare_headers()
json_data = {
"url": url,
"formats": ["markdown"],
"onlyMainContent": True,
"timeout": 30000,
}
if params:
json_data.update(params)
response = requests.post(f"{self.base_url}/v0/scrape", headers=headers, json=json_data)
response = self._post_request(f"{self.base_url}/v1/scrape", json_data, headers)
if response.status_code == 200:
response_data = response.json()
if response_data["success"] == True:
data = response_data["data"]
return {
"title": data.get("metadata").get("title"),
"description": data.get("metadata").get("description"),
"source_url": data.get("metadata").get("sourceURL"),
"markdown": data.get("markdown"),
}
else:
raise Exception(f"Failed to scrape URL. Error: {response_data['error']}")

elif response.status_code in {402, 409, 500}:
error_message = response.json().get("error", "Unknown error occurred")
raise Exception(f"Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}")
data = response_data["data"]
return self._extract_common_fields(data)
elif response.status_code in {402, 409, 500, 429, 408}:
self._handle_error(response, "scrape URL")
return {} # Avoid additional exception after handling error
else:
raise Exception(f"Failed to scrape URL. Status code: {response.status_code}")

def crawl_url(self, url, params=None) -> str:
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
headers = self._prepare_headers()
json_data = {"url": url}
if params:
json_data.update(params)
response = self._post_request(f"{self.base_url}/v0/crawl", json_data, headers)
response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers)
if response.status_code == 200:
job_id = response.json().get("jobId")
# There are also two other fields in the response: "success" (bool) and "url" (str)
job_id = response.json().get("id")
return cast(str, job_id)
else:
self._handle_error(response, "start crawl job")
# FIXME: unreachable code for mypy
return "" # unreachable

def check_crawl_status(self, job_id) -> dict:
def check_crawl_status(self, job_id) -> dict[str, Any]:
headers = self._prepare_headers()
response = self._get_request(f"{self.base_url}/v0/crawl/status/{job_id}", headers)
response = self._get_request(f"{self.base_url}/v1/crawl/{job_id}", headers)
if response.status_code == 200:
crawl_status_response = response.json()
if crawl_status_response.get("status") == "completed":
@@ -66,42 +65,48 @@ def check_crawl_status(self, job_id) -> dict:
url_data_list = []
for item in data:
if isinstance(item, dict) and "metadata" in item and "markdown" in item:
url_data = {
"title": item.get("metadata", {}).get("title"),
"description": item.get("metadata", {}).get("description"),
"source_url": item.get("metadata", {}).get("sourceURL"),
"markdown": item.get("markdown"),
}
url_data = self._extract_common_fields(item)
url_data_list.append(url_data)
if url_data_list:
file_key = "website_files/" + job_id + ".txt"
if storage.exists(file_key):
storage.delete(file_key)
storage.save(file_key, json.dumps(url_data_list).encode("utf-8"))
return {
"status": "completed",
"total": crawl_status_response.get("total"),
"current": crawl_status_response.get("current"),
"data": url_data_list,
}

try:
if storage.exists(file_key):
storage.delete(file_key)
storage.save(file_key, json.dumps(url_data_list).encode("utf-8"))
except Exception as e:
raise Exception(f"Error saving crawl data: {e}")
return self._format_crawl_status_response("completed", crawl_status_response, url_data_list)
else:
return {
"status": crawl_status_response.get("status"),
"total": crawl_status_response.get("total"),
"current": crawl_status_response.get("current"),
"data": [],
}

return self._format_crawl_status_response(
crawl_status_response.get("status"), crawl_status_response, []
)
else:
self._handle_error(response, "check crawl status")
# FIXME: unreachable code for mypy
return {} # unreachable
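
check_crawl_status now polls GET /v1/crawl/{job_id} and, once the job completes, caches the extracted pages under website_files/{job_id}.txt. A hedged polling sketch; the loop and five-second interval are illustrative, not part of the diff, and app and job_id come from the earlier sketches:

# Hypothetical polling loop around the updated client:
import time
status = app.check_crawl_status(job_id)
while status.get("status") != "completed":
    time.sleep(5)  # illustrative interval
    status = app.check_crawl_status(job_id)
print(len(status["data"]), "pages crawled")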

def _prepare_headers(self):
def _format_crawl_status_response(
self, status: str, crawl_status_response: dict[str, Any], url_data_list: list[dict[str, Any]]
) -> dict[str, Any]:
return {
"status": status,
"total": crawl_status_response.get("total"),
"current": crawl_status_response.get("completed"),
"data": url_data_list,
}
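
A related v1 rename is absorbed here: the status payload reports progress under "completed", which the helper re-exposes under the legacy "current" key so downstream callers keep working. An illustrative translation (values hypothetical):

# v1 payload in, legacy shape out:
# {"status": "scraping", "total": 10, "completed": 4}
#   -> {"status": "scraping", "total": 10, "current": 4, "data": []}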

def _extract_common_fields(self, item: dict[str, Any]) -> dict[str, Any]:
return {
"title": item.get("metadata", {}).get("title"),
"description": item.get("metadata", {}).get("description"),
"source_url": item.get("metadata", {}).get("sourceURL"),
"markdown": item.get("markdown"),
}

def _prepare_headers(self) -> dict[str, Any]:
return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> requests.Response:
for attempt in range(retries):
response = requests.post(url, headers=headers, json=data)
if response.status_code == 502:
@@ -110,7 +115,7 @@ def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
return response
return response

def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
def _get_request(self, url, headers, retries=3, backoff_factor=0.5) -> requests.Response:
for attempt in range(retries):
response = requests.get(url, headers=headers)
if response.status_code == 502:
@@ -119,6 +124,6 @@ def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
return response
return response
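
Both request helpers retry only on HTTP 502; the sleep between attempts is collapsed out of this view. Assuming the elided line follows the usual exponential pattern — an assumption, since it is not visible here — the defaults (retries=3, backoff_factor=0.5) would wait about 0.5 s and then 1 s before the final attempt:

# Assumed (collapsed) retry delay, not visible in this hunk:
# time.sleep(backoff_factor * (2 ** attempt))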

def _handle_error(self, response, action):
def _handle_error(self, response, action) -> None:
error_message = response.json().get("error", "Unknown error occurred")
raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}")
8 changes: 5 additions & 3 deletions api/services/auth/firecrawl/firecrawl.py
@@ -21,10 +21,12 @@ def validate_credentials(self):
headers = self._prepare_headers()
options = {
"url": "https://example.com",
"crawlerOptions": {"excludes": [], "includes": [], "limit": 1},
"pageOptions": {"onlyMainContent": True},
"excludes": [],
"includes": [],
"limit": 1,
"scrapeOptions": {"onlyMainContent": True},
}
response = self._post_request(f"{self.base_url}/v0/crawl", options, headers)
response = self._post_request(f"{self.base_url}/v1/crawl", options, headers)
if response.status_code == 200:
return True
else:
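
With v1, the options that v0 nested under crawlerOptions and pageOptions move to the top level, with page settings grouped under scrapeOptions. The validation call above therefore posts a body like the following sketch (derived from the options dict, not an authoritative schema):

# POST {base_url}/v1/crawl
# {
#   "url": "https://example.com",
#   "excludes": [], "includes": [], "limit": 1,
#   "scrapeOptions": {"onlyMainContent": true}
# }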
30 changes: 12 additions & 18 deletions api/services/website_service.py
@@ -38,30 +38,24 @@ def crawl_url(cls, args: dict) -> dict:
only_main_content = options.get("only_main_content", False)
if not crawl_sub_pages:
params = {
"crawlerOptions": {
"includes": [],
"excludes": [],
"generateImgAltText": True,
"limit": 1,
"returnOnlyUrls": False,
"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
}
"includes": [],
"excludes": [],
"generateImgAltText": True,
"limit": 1,
"scrapeOptions": {"onlyMainContent": only_main_content},
}
else:
includes = options.get("includes").split(",") if options.get("includes") else []
excludes = options.get("excludes").split(",") if options.get("excludes") else []
params = {
"crawlerOptions": {
"includes": includes,
"excludes": excludes,
"generateImgAltText": True,
"limit": options.get("limit", 1),
"returnOnlyUrls": False,
"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
}
"includes": includes,
"excludes": excludes,
"generateImgAltText": True,
"limit": options.get("limit", 1),
"scrapeOptions": {"onlyMainContent": only_main_content},
}
if options.get("max_depth"):
params["crawlerOptions"]["maxDepth"] = options.get("max_depth")
params["maxDepth"] = options.get("max_depth")
job_id = firecrawl_app.crawl_url(url, params)
website_crawl_time_cache_key = f"website_crawl_{job_id}"
time = str(datetime.datetime.now().timestamp())
@@ -228,7 +222,7 @@ def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_
# decrypt api_key
api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
params = {"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}}
params = {"onlyMainContent": only_main_content}
result = firecrawl_app.scrape_url(url, params)
return result
else:
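
The same flattening applies in the service layer: maxDepth is now written directly onto params instead of params["crawlerOptions"]. For a hypothetical options dict {"crawl_sub_pages": True, "limit": 5, "max_depth": 2, "only_main_content": True}, crawl_url would now build:

# Illustrative v1 params (values hypothetical):
# {"includes": [], "excludes": [], "generateImgAltText": True, "limit": 5,
#  "scrapeOptions": {"onlyMainContent": True}, "maxDepth": 2}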
@@ -10,19 +10,18 @@ def test_firecrawl_web_extractor_crawl_mode(mocker):
base_url = "https://api.firecrawl.dev"
firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url)
params = {
"crawlerOptions": {
"includes": [],
"excludes": [],
"generateImgAltText": True,
"maxDepth": 1,
"limit": 1,
"returnOnlyUrls": False,
}
"includes": [],
"excludes": [],
"generateImgAltText": True,
"maxDepth": 1,
"limit": 1,
}
mocked_firecrawl = {
"jobId": "test",
"id": "test",
}
mocker.patch("requests.post", return_value=_mock_response(mocked_firecrawl))
job_id = firecrawl_app.crawl_url(url, params)
print(job_id)
print(f"job_id: {job_id}")

assert job_id is not None
assert isinstance(job_id, str)
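
The test stubs requests.post with a canned 200 response whose JSON body carries the v1 "id" field. The _mock_response helper is defined elsewhere in the test module; a minimal sketch of a compatible implementation (an assumption, not the module's actual helper):

# Hypothetical stand-in for the test module's _mock_response helper:
from unittest.mock import MagicMock

def _mock_response(json_body, status_code=200):
    resp = MagicMock()
    resp.status_code = status_code
    resp.json.return_value = json_body  # mimics requests.Response.json()
    return resp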
