From a008d59a366962bb2385239cdf95be568d6f060e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=83=87=E3=83=AF=E3=83=B3=E3=82=B7=E3=83=A5?= <61188295+Dnouv@users.noreply.github.com> Date: Fri, 20 Dec 2024 10:34:20 +0530 Subject: [PATCH] feat: Rocket.Chat Docs Crawl (#1) * init and add spider script * add data use script --- rocket_chat_docs_spider/README.md | 76 ++++++++++++++++ rocket_chat_docs_spider/rcspider.py | 61 +++++++++++++ rocket_chat_docs_spider/read_and_send.py | 108 +++++++++++++++++++++++ rocket_chat_docs_spider/requirements.txt | 2 + 4 files changed, 247 insertions(+) create mode 100644 rocket_chat_docs_spider/README.md create mode 100644 rocket_chat_docs_spider/rcspider.py create mode 100644 rocket_chat_docs_spider/read_and_send.py create mode 100644 rocket_chat_docs_spider/requirements.txt diff --git a/rocket_chat_docs_spider/README.md b/rocket_chat_docs_spider/README.md new file mode 100644 index 0000000..5880bfd --- /dev/null +++ b/rocket_chat_docs_spider/README.md @@ -0,0 +1,76 @@ +# Rocket.Chat Documentation Spider + +A web scraping tool built with Scrapy to extract documentation content from Rocket.Chat's official documentation websites. 
+
+## Overview
+
+This spider crawls both the main Rocket.Chat documentation (`docs.rocket.chat/docs`) and developer documentation (`developer.rocket.chat/docs`) to extract:
+
+- Page titles
+- Main content
+- H2 headers
+- URLs
+
+## Prerequisites
+
+Install the required dependencies:
+
+## Create virtual environment
+```sh
+python -m venv .venv
+```
+
+## Activate virtual environment
+```sh
+source .venv/bin/activate
+```
+
+## Install dependencies
+```sh
+pip install -r requirements.txt
+```
+
+## Configuration
+The spider is configured with the following settings:
+
+- Rate limiting: 5 seconds between requests
+- Single concurrent request
+- Respects robots.txt
+- Automatic retry (3 times) for common error codes
+- Custom user agent and headers for reliability
+
+## Output
+The spider generates a JSONL (JSON Lines) file with the following structure:
+```
+{
+    "page_title": "Page Title",
+    "content": ["Paragraph 1", "Paragraph 2", ...],
+    "h2_headers": ["Header 1", "Header 2", ...],
+    "url": "https://docs.rocket.chat/..."
+}
+```
+
+## Usage
+To run the spider, execute the following command:
+
+```sh
+scrapy runspider rcspider.py -o rocket_chat_docs.jsonl
+```
+
+The spider will crawl the Rocket.Chat documentation websites and save the extracted content to `rocket_chat_docs.jsonl`.
+
+To send the output to a different file, change the filename in the `-o` argument.
+
+To read and process the JSONL file, run the following command:
+
+```sh
+python read_and_send.py
+```
+
+By default, the script reads the `rocket_chat_docs.jsonl` file and sends the extracted content to an endpoint defined in the `DocumentProcessor` class. However, you can change the endpoint or write your own custom callback function. 
+ + +## Important Notes +- The spider implements polite crawling with a 5-second delay between requests +- Only URLs from `docs.rocket.chat` and `developer.rocket.chat` domains are crawled +- Already visited URLs are tracked to prevent duplicate crawling diff --git a/rocket_chat_docs_spider/rcspider.py b/rocket_chat_docs_spider/rcspider.py new file mode 100644 index 0000000..a437040 --- /dev/null +++ b/rocket_chat_docs_spider/rcspider.py @@ -0,0 +1,61 @@ +import scrapy +import re + +class MySpider(scrapy.Spider): + name = 'myspider' + start_urls = ['https://docs.rocket.chat/docs', 'https://developer.rocket.chat/docs'] + visited_urls = set() + + custom_settings = { + 'CONCURRENT_REQUESTS': 1, # One request at a time + 'DOWNLOAD_DELAY': 5, # 5 seconds between requests + 'ROBOTSTXT_OBEY': True, + 'RETRY_TIMES': 3, # Retry failed requests + 'RETRY_HTTP_CODES': [403, 429, 500, 502, 503, 504], + 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36', + 'DEFAULT_REQUEST_HEADERS': { + 'Accept': 'text/html,application/xhtml+xml', + 'Accept-Language': 'en-US,en;q=0.9', + 'Cache-Control': 'max-age=0', + 'Connection': 'keep-alive' + }, + 'FEEDS': { + 'data/output.jsonl': { + 'format': 'jsonlines', + 'encoding': 'utf8' + } + } + } + + def start_requests(self): + headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5' + } + for url in self.start_urls: + yield scrapy.Request(url, headers=headers, callback=self.parse) + + def parse(self, response): + # Extract all links on the page + links = response.css('a::attr(href)').getall() + filtered_links = [ + response.urljoin(link) for link in links + if ('docs.rocket.chat' in response.urljoin(link) or 'developer.rocket.chat' in response.urljoin(link)) + and response.urljoin(link) not in self.visited_urls + ] + # Follow links + for link in filtered_links: + if link not in self.visited_urls: + self.visited_urls.add(link) + yield 
scrapy.Request(response.urljoin(link), callback=self.parse) + + content = response.css('.content_block div p::text').getall() + h2_headers = response.css('.content_block div h2::text').getall() + page_title = response.css('.content_block div h1::text').get() + + yield { + "page_title": page_title, + "content": content, + "h2_headers": h2_headers, + "url": response.url + } diff --git a/rocket_chat_docs_spider/read_and_send.py b/rocket_chat_docs_spider/read_and_send.py new file mode 100644 index 0000000..4a3c085 --- /dev/null +++ b/rocket_chat_docs_spider/read_and_send.py @@ -0,0 +1,108 @@ +from typing import Dict, Any, Optional, Union +import json +import requests +from datetime import datetime + +class DocumentProcessor: + def __init__( + self, + api_endpoint: str, + headers: Optional[Dict[str, str]] = None, + default_metadata: Optional[Dict[str, Any]] = None + ): + self.api_endpoint = api_endpoint + self.headers = headers or {} + self.default_metadata = default_metadata or {} + + def process_file(self, file_path: str, callback=None) -> None: + """Process JSONL file and send documents.""" + with open(file_path) as f: + for line in f: + try: + data = json.loads(line) + processed_doc = self.prepare_document(data) + + if callback: + # Allow custom handling of processed document + callback(processed_doc) + else: + # Default sending behavior + self.send_document(processed_doc) + except Exception as e: + print(f"Error processing line: {e}") + + def prepare_document(self, data: Dict[str, Any]) -> Dict[str, Any]: + """Prepare document from raw data.""" + # Extract content + raw_content = data.get('content', None) + content = self._process_content(raw_content) + + # Get title + title = data.get('page_title') + + if not content or not title: + raise ValueError("Missing required content or title") + + # Determine classification + classification = self._determine_classification(data.get('url', '')) + + # Combine with default metadata and return + metadata = { + 
**self.default_metadata, + "url": data.get('url', ''), + "h2_headers": data.get('h2_headers', []), + } + + return { + "classification": classification, + "contents": content, + "name": title, + "metadata": metadata + } + + def send_document(self, document: Dict[str, Any]) -> requests.Response: + """Send document to API endpoint.""" + response = requests.post( + self.api_endpoint, + json=document, + headers=self.headers + ) + return response + + def _process_content(self, raw_content: Union[list, str, None]) -> str: + """Process raw content into string.""" + if isinstance(raw_content, list): + return ' '.join(str(c) for c in raw_content) + elif isinstance(raw_content, str): + return raw_content + raise ValueError(f"Unexpected content type: {type(raw_content)}") + + def _determine_classification(self, url: str) -> str: + """Determine document classification based on URL.""" + return "developer" if "developer.rocket.chat" in url else "user" + +# Example usage: +if __name__ == "__main__": + # Configuration + processor = DocumentProcessor( + api_endpoint="http://{{host}}/documents", + headers={ + "Content-Type": "application/json", + # Add any other headers here + }, + default_metadata={ + "doc_expiry": "2024-12-31T23:59:59Z", + "source": "Rocket.Chat", + # Add any other default metadata here + } + ) + + # Process file + processor.process_file("rocket_chat_docs.jsonl") + + # Or with custom callback + def custom_handler(doc): + print(f"Processing document: {doc['name']}") + # Add custom logic here + + processor.process_file("rocket_chat_docs.jsonl", callback=custom_handler) \ No newline at end of file diff --git a/rocket_chat_docs_spider/requirements.txt b/rocket_chat_docs_spider/requirements.txt new file mode 100644 index 0000000..a2834c8 --- /dev/null +++ b/rocket_chat_docs_spider/requirements.txt @@ -0,0 +1,2 @@ +requests==2.32.3 +scrapy==2.12.0 \ No newline at end of file