From 6024d8a42d1a39d3e0d6b81df806e3d624839519 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adem=C3=ADlson=20Tonato?=
Date: Thu, 23 Jan 2025 03:14:48 +0000
Subject: [PATCH] refactor: Update Firecrawl to use v1 API (#12574)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Ademílson Tonato
---
 .../rag/extractor/firecrawl/firecrawl_app.py  | 105 +++++++++---------
 api/services/auth/firecrawl/firecrawl.py      |   8 +-
 api/services/website_service.py               |  30 ++---
 .../rag/extractor/firecrawl/test_firecrawl.py |  19 ++--
 4 files changed, 81 insertions(+), 81 deletions(-)

diff --git a/api/core/rag/extractor/firecrawl/firecrawl_app.py b/api/core/rag/extractor/firecrawl/firecrawl_app.py
index eac08aeb8bd0cf..836a1398bfdad9 100644
--- a/api/core/rag/extractor/firecrawl/firecrawl_app.py
+++ b/api/core/rag/extractor/firecrawl/firecrawl_app.py
@@ -1,6 +1,6 @@
 import json
 import time
-from typing import cast
+from typing import Any, cast
 
 import requests
 
@@ -14,48 +14,47 @@ def __init__(self, api_key=None, base_url=None):
         if self.api_key is None and self.base_url == "https://api.firecrawl.dev":
             raise ValueError("No API key provided")
 
-    def scrape_url(self, url, params=None) -> dict:
-        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
-        json_data = {"url": url}
+    def scrape_url(self, url, params=None) -> dict[str, Any]:
+        # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/scrape
+        headers = self._prepare_headers()
+        json_data = {
+            "url": url,
+            "formats": ["markdown"],
+            "onlyMainContent": True,
+            "timeout": 30000,
+        }
         if params:
             json_data.update(params)
-        response = requests.post(f"{self.base_url}/v0/scrape", headers=headers, json=json_data)
+        response = self._post_request(f"{self.base_url}/v1/scrape", json_data, headers)
         if response.status_code == 200:
             response_data = response.json()
-            if response_data["success"] == True:
-                data = response_data["data"]
-                return {
-                    "title": data.get("metadata").get("title"),
-                    "description": data.get("metadata").get("description"),
-                    "source_url": data.get("metadata").get("sourceURL"),
-                    "markdown": data.get("markdown"),
-                }
-            else:
-                raise Exception(f"Failed to scrape URL. Error: {response_data['error']}")
-
-        elif response.status_code in {402, 409, 500}:
-            error_message = response.json().get("error", "Unknown error occurred")
-            raise Exception(f"Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}")
+            data = response_data["data"]
+            return self._extract_common_fields(data)
+        elif response.status_code in {402, 409, 500, 429, 408}:
+            self._handle_error(response, "scrape URL")
+            return {}  # Avoid additional exception after handling error
         else:
             raise Exception(f"Failed to scrape URL. Status code: {response.status_code}")
 
     def crawl_url(self, url, params=None) -> str:
+        # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
         headers = self._prepare_headers()
         json_data = {"url": url}
         if params:
             json_data.update(params)
-        response = self._post_request(f"{self.base_url}/v0/crawl", json_data, headers)
+        response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers)
         if response.status_code == 200:
-            job_id = response.json().get("jobId")
+            # There's also another two fields in the response: "success" (bool) and "url" (str)
+            job_id = response.json().get("id")
             return cast(str, job_id)
         else:
             self._handle_error(response, "start crawl job")
             # FIXME: unreachable code for mypy
             return ""  # unreachable
 
-    def check_crawl_status(self, job_id) -> dict:
+    def check_crawl_status(self, job_id) -> dict[str, Any]:
         headers = self._prepare_headers()
-        response = self._get_request(f"{self.base_url}/v0/crawl/status/{job_id}", headers)
+        response = self._get_request(f"{self.base_url}/v1/crawl/{job_id}", headers)
         if response.status_code == 200:
             crawl_status_response = response.json()
             if crawl_status_response.get("status") == "completed":
@@ -66,42 +65,48 @@ def check_crawl_status(self, job_id) -> dict:
                 url_data_list = []
                 for item in data:
                     if isinstance(item, dict) and "metadata" in item and "markdown" in item:
-                        url_data = {
-                            "title": item.get("metadata", {}).get("title"),
-                            "description": item.get("metadata", {}).get("description"),
-                            "source_url": item.get("metadata", {}).get("sourceURL"),
-                            "markdown": item.get("markdown"),
-                        }
+                        url_data = self._extract_common_fields(item)
                         url_data_list.append(url_data)
                 if url_data_list:
                     file_key = "website_files/" + job_id + ".txt"
-                    if storage.exists(file_key):
-                        storage.delete(file_key)
-                    storage.save(file_key, json.dumps(url_data_list).encode("utf-8"))
-                return {
-                    "status": "completed",
-                    "total": crawl_status_response.get("total"),
-                    "current": crawl_status_response.get("current"),
-                    "data": url_data_list,
-                }
-
+                    try:
+                        if storage.exists(file_key):
+                            storage.delete(file_key)
+                        storage.save(file_key, json.dumps(url_data_list).encode("utf-8"))
+                    except Exception as e:
+                        raise Exception(f"Error saving crawl data: {e}")
+                return self._format_crawl_status_response("completed", crawl_status_response, url_data_list)
             else:
-                return {
-                    "status": crawl_status_response.get("status"),
-                    "total": crawl_status_response.get("total"),
-                    "current": crawl_status_response.get("current"),
-                    "data": [],
-                }
-
+                return self._format_crawl_status_response(
+                    crawl_status_response.get("status"), crawl_status_response, []
+                )
         else:
             self._handle_error(response, "check crawl status")
             # FIXME: unreachable code for mypy
             return {}  # unreachable
 
-    def _prepare_headers(self):
+    def _format_crawl_status_response(
+        self, status: str, crawl_status_response: dict[str, Any], url_data_list: list[dict[str, Any]]
+    ) -> dict[str, Any]:
+        return {
+            "status": status,
+            "total": crawl_status_response.get("total"),
+            "current": crawl_status_response.get("completed"),
+            "data": url_data_list,
+        }
+
+    def _extract_common_fields(self, item: dict[str, Any]) -> dict[str, Any]:
+        return {
+            "title": item.get("metadata", {}).get("title"),
+            "description": item.get("metadata", {}).get("description"),
+            "source_url": item.get("metadata", {}).get("sourceURL"),
+            "markdown": item.get("markdown"),
+        }
+
+    def _prepare_headers(self) -> dict[str, Any]:
         return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
 
-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> requests.Response:
         for attempt in range(retries):
             response = requests.post(url, headers=headers, json=data)
             if response.status_code == 502:
@@ -110,7 +115,7 @@ def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
                 return response
         return response
 
-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+    def _get_request(self, url, headers, retries=3, backoff_factor=0.5) -> requests.Response:
         for attempt in range(retries):
             response = requests.get(url, headers=headers)
             if response.status_code == 502:
@@ -119,6 +124,6 @@ def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
                 return response
         return response
 
-    def _handle_error(self, response, action):
+    def _handle_error(self, response, action) -> None:
         error_message = response.json().get("error", "Unknown error occurred")
         raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}")
diff --git a/api/services/auth/firecrawl/firecrawl.py b/api/services/auth/firecrawl/firecrawl.py
index 50e4edff140346..cc6eaaa42a0611 100644
--- a/api/services/auth/firecrawl/firecrawl.py
+++ b/api/services/auth/firecrawl/firecrawl.py
@@ -21,10 +21,12 @@ def validate_credentials(self):
         headers = self._prepare_headers()
         options = {
             "url": "https://example.com",
-            "crawlerOptions": {"excludes": [], "includes": [], "limit": 1},
-            "pageOptions": {"onlyMainContent": True},
+            "excludes": [],
+            "includes": [],
+            "limit": 1,
+            "scrapeOptions": {"onlyMainContent": True},
         }
-        response = self._post_request(f"{self.base_url}/v0/crawl", options, headers)
+        response = self._post_request(f"{self.base_url}/v1/crawl", options, headers)
         if response.status_code == 200:
             return True
         else:
diff --git a/api/services/website_service.py b/api/services/website_service.py
index 1ad7d0399d6edf..b30e2205f7db20 100644
--- a/api/services/website_service.py
+++ b/api/services/website_service.py
@@ -38,30 +38,24 @@ def crawl_url(cls, args: dict) -> dict:
         only_main_content = options.get("only_main_content", False)
         if not crawl_sub_pages:
             params = {
-                "crawlerOptions": {
-                    "includes": [],
-                    "excludes": [],
-                    "generateImgAltText": True,
-                    "limit": 1,
-                    "returnOnlyUrls": False,
-                    "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
-                }
+                "includes": [],
+                "excludes": [],
+                "generateImgAltText": True,
+                "limit": 1,
+                "scrapeOptions": {"onlyMainContent": only_main_content},
             }
         else:
             includes = options.get("includes").split(",") if options.get("includes") else []
             excludes = options.get("excludes").split(",") if options.get("excludes") else []
             params = {
-                "crawlerOptions": {
-                    "includes": includes,
-                    "excludes": excludes,
-                    "generateImgAltText": True,
-                    "limit": options.get("limit", 1),
-                    "returnOnlyUrls": False,
-                    "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
-                }
+                "includes": includes,
+                "excludes": excludes,
+                "generateImgAltText": True,
+                "limit": options.get("limit", 1),
+                "scrapeOptions": {"onlyMainContent": only_main_content},
             }
             if options.get("max_depth"):
-                params["crawlerOptions"]["maxDepth"] = options.get("max_depth")
+                params["maxDepth"] = options.get("max_depth")
         job_id = firecrawl_app.crawl_url(url, params)
         website_crawl_time_cache_key = f"website_crawl_{job_id}"
         time = str(datetime.datetime.now().timestamp())
@@ -228,7 +222,7 @@ def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_
             # decrypt api_key
             api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
             firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
-            params = {"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}}
+            params = {"onlyMainContent": only_main_content}
             result = firecrawl_app.scrape_url(url, params)
             return result
         else:
diff --git a/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py b/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py
index 8fcdf2e8e5310a..120ca9c8ea9845 100644
--- a/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py
+++ b/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py
@@ -10,19 +10,18 @@ def test_firecrawl_web_extractor_crawl_mode(mocker):
     base_url = "https://api.firecrawl.dev"
     firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url)
     params = {
-        "crawlerOptions": {
-            "includes": [],
-            "excludes": [],
-            "generateImgAltText": True,
-            "maxDepth": 1,
-            "limit": 1,
-            "returnOnlyUrls": False,
-        }
+        "includes": [],
+        "excludes": [],
+        "generateImgAltText": True,
+        "maxDepth": 1,
+        "limit": 1,
     }
     mocked_firecrawl = {
-        "jobId": "test",
+        "id": "test",
     }
     mocker.patch("requests.post", return_value=_mock_response(mocked_firecrawl))
     job_id = firecrawl_app.crawl_url(url, params)
-    print(job_id)
+    print(f"job_id: {job_id}")
+
+    assert job_id is not None
     assert isinstance(job_id, str)
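
Usage note: a minimal sketch of how the refactored client is expected to be driven against the Firecrawl v1 endpoints after this patch. It is illustrative only and assumes it runs inside the api application context (where the storage extension is initialized); the API key, target URL, crawl options, and polling interval below are placeholder assumptions, not values taken from the patch.

import time

from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp

# Placeholder credentials and URLs for illustration only.
app = FirecrawlApp(api_key="fc-your-api-key", base_url="https://api.firecrawl.dev")

# Single-page scrape: v1 takes flat options (no v0 "pageOptions" wrapper).
page = app.scrape_url("https://example.com", params={"onlyMainContent": True})
print(page["title"], page["source_url"])

# Crawl: v1 returns the job id under "id" (v0 used "jobId") and is polled via
# GET /v1/crawl/{id}, which check_crawl_status() wraps.
job_id = app.crawl_url("https://example.com", params={"limit": 5, "maxDepth": 1})
while True:
    status = app.check_crawl_status(job_id)
    if status["status"] == "completed":
        for item in status["data"]:
            print(item["source_url"])
        break
    time.sleep(5)  # illustrative polling interval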