From a008d59a366962bb2385239cdf95be568d6f060e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=83=87=E3=83=AF=E3=83=B3=E3=82=B7=E3=83=A5?= <61188295+Dnouv@users.noreply.github.com> Date: Fri, 20 Dec 2024 10:34:20 +0530 Subject: [PATCH] feat: Rocket.Chat Docs Crawl (#1) * init and add spider script * add data use script --- rocket_chat_docs_spider/README.md | 76 ++++++++++++++++ rocket_chat_docs_spider/rcspider.py | 61 +++++++++++++ rocket_chat_docs_spider/read_and_send.py | 108 +++++++++++++++++++++++ rocket_chat_docs_spider/requirements.txt | 2 + 4 files changed, 247 insertions(+) create mode 100644 rocket_chat_docs_spider/README.md create mode 100644 rocket_chat_docs_spider/rcspider.py create mode 100644 rocket_chat_docs_spider/read_and_send.py create mode 100644 rocket_chat_docs_spider/requirements.txt diff --git a/rocket_chat_docs_spider/README.md b/rocket_chat_docs_spider/README.md new file mode 100644 index 0000000..5880bfd --- /dev/null +++ b/rocket_chat_docs_spider/README.md @@ -0,0 +1,76 @@ +# Rocket.Chat Documentation Spider + +A web scraping tool built with Scrapy to extract documentation content from Rocket.Chat's official documentation websites. 
+
+## Overview
+
+This spider crawls both the main Rocket.Chat documentation (`docs.rocket.chat/docs`) and developer documentation (`developer.rocket.chat/docs`) to extract:
+
+- Page titles
+- Main content
+- H2 headers
+- URLs
+
+## Prerequisites
+
+Install the required dependencies:
+
+## Create virtual environment
+```sh
+python -m venv .venv
+```
+
+## Activate virtual environment
+```sh
+source .venv/bin/activate
+```
+
+## Install dependencies
+```sh
+pip install -r requirements.txt
+```
+
+## Configuration
+The spider is configured with the following settings:
+
+- Rate limiting: 5 seconds between requests
+- Single concurrent request
+- Respects robots.txt
+- Automatic retry (3 times) for common error codes
+- Custom user agent and headers for reliability
+
+## Output
+The spider generates a JSONL (JSON Lines) file with the following structure:
+```
+{
+    "page_title": "Page Title",
+    "content": ["Paragraph 1", "Paragraph 2", ...],
+    "h2_headers": ["Header 1", "Header 2", ...],
+    "url": "https://docs.rocket.chat/..."
+}
+```
+
+## Usage
+To run the spider, execute the following command:
+
+```sh
+scrapy runspider rcspider.py -o rocket_chat_docs.jsonl
+```
+
+The spider will crawl the Rocket.Chat documentation websites and save the extracted content to `rocket_chat_docs.jsonl`.
+
+To send the output to a different file, change the filename in the `-o` argument.
+
+To read and process the JSONL file, run the following command:
+
+```sh
+python read_and_send.py
+```
+
+By default, the script reads the `rocket_chat_docs.jsonl` file and sends the extracted content to an endpoint defined in the `DocumentProcessor` class. However, you can change the endpoint or write your own custom callback function. 
+ + +## Important Notes +- The spider implements polite crawling with a 5-second delay between requests +- Only URLs from `docs.rocket.chat` and `developer.rocket.chat` domains are crawled +- Already visited URLs are tracked to prevent duplicate crawling diff --git a/rocket_chat_docs_spider/rcspider.py b/rocket_chat_docs_spider/rcspider.py new file mode 100644 index 0000000..a437040 --- /dev/null +++ b/rocket_chat_docs_spider/rcspider.py @@ -0,0 +1,61 @@ +import scrapy +import re + +class MySpider(scrapy.Spider): + name = 'myspider' + start_urls = ['https://docs.rocket.chat/docs', 'https://developer.rocket.chat/docs'] + visited_urls = set() + + custom_settings = { + 'CONCURRENT_REQUESTS': 1, # One request at a time + 'DOWNLOAD_DELAY': 5, # 5 seconds between requests + 'ROBOTSTXT_OBEY': True, + 'RETRY_TIMES': 3, # Retry failed requests + 'RETRY_HTTP_CODES': [403, 429, 500, 502, 503, 504], + 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36', + 'DEFAULT_REQUEST_HEADERS': { + 'Accept': 'text/html,application/xhtml+xml', + 'Accept-Language': 'en-US,en;q=0.9', + 'Cache-Control': 'max-age=0', + 'Connection': 'keep-alive' + }, + 'FEEDS': { + 'data/output.jsonl': { + 'format': 'jsonlines', + 'encoding': 'utf8' + } + } + } + + def start_requests(self): + headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5' + } + for url in self.start_urls: + yield scrapy.Request(url, headers=headers, callback=self.parse) + + def parse(self, response): + # Extract all links on the page + links = response.css('a::attr(href)').getall() + filtered_links = [ + response.urljoin(link) for link in links + if ('docs.rocket.chat' in response.urljoin(link) or 'developer.rocket.chat' in response.urljoin(link)) + and response.urljoin(link) not in self.visited_urls + ] + # Follow links + for link in filtered_links: + if link not in self.visited_urls: + self.visited_urls.add(link) + yield 
scrapy.Request(response.urljoin(link), callback=self.parse) + + content = response.css('.content_block div p::text').getall() + h2_headers = response.css('.content_block div h2::text').getall() + page_title = response.css('.content_block div h1::text').get() + + yield { + "page_title": page_title, + "content": content, + "h2_headers": h2_headers, + "url": response.url + } diff --git a/rocket_chat_docs_spider/read_and_send.py b/rocket_chat_docs_spider/read_and_send.py new file mode 100644 index 0000000..4a3c085 --- /dev/null +++ b/rocket_chat_docs_spider/read_and_send.py @@ -0,0 +1,108 @@ +from typing import Dict, Any, Optional, Union +import json +import requests +from datetime import datetime + +class DocumentProcessor: + def __init__( + self, + api_endpoint: str, + headers: Optional[Dict[str, str]] = None, + default_metadata: Optional[Dict[str, Any]] = None + ): + self.api_endpoint = api_endpoint + self.headers = headers or {} + self.default_metadata = default_metadata or {} + + def process_file(self, file_path: str, callback=None) -> None: + """Process JSONL file and send documents.""" + with open(file_path) as f: + for line in f: + try: + data = json.loads(line) + processed_doc = self.prepare_document(data) + + if callback: + # Allow custom handling of processed document + callback(processed_doc) + else: + # Default sending behavior + self.send_document(processed_doc) + except Exception as e: + print(f"Error processing line: {e}") + + def prepare_document(self, data: Dict[str, Any]) -> Dict[str, Any]: + """Prepare document from raw data.""" + # Extract content + raw_content = data.get('content', None) + content = self._process_content(raw_content) + + # Get title + title = data.get('page_title') + + if not content or not title: + raise ValueError("Missing required content or title") + + # Determine classification + classification = self._determine_classification(data.get('url', '')) + + # Combine with default metadata and return + metadata = { + 
**self.default_metadata, + "url": data.get('url', ''), + "h2_headers": data.get('h2_headers', []), + } + + return { + "classification": classification, + "contents": content, + "name": title, + "metadata": metadata + } + + def send_document(self, document: Dict[str, Any]) -> requests.Response: + """Send document to API endpoint.""" + response = requests.post( + self.api_endpoint, + json=document, + headers=self.headers + ) + return response + + def _process_content(self, raw_content: Union[list, str, None]) -> str: + """Process raw content into string.""" + if isinstance(raw_content, list): + return ' '.join(str(c) for c in raw_content) + elif isinstance(raw_content, str): + return raw_content + raise ValueError(f"Unexpected content type: {type(raw_content)}") + + def _determine_classification(self, url: str) -> str: + """Determine document classification based on URL.""" + return "developer" if "developer.rocket.chat" in url else "user" + +# Example usage: +if __name__ == "__main__": + # Configuration + processor = DocumentProcessor( + api_endpoint="http://{{host}}/documents", + headers={ + "Content-Type": "application/json", + # Add any other headers here + }, + default_metadata={ + "doc_expiry": "2024-12-31T23:59:59Z", + "source": "Rocket.Chat", + # Add any other default metadata here + } + ) + + # Process file + processor.process_file("rocket_chat_docs.jsonl") + + # Or with custom callback + def custom_handler(doc): + print(f"Processing document: {doc['name']}") + # Add custom logic here + + processor.process_file("rocket_chat_docs.jsonl", callback=custom_handler) \ No newline at end of file diff --git a/rocket_chat_docs_spider/requirements.txt b/rocket_chat_docs_spider/requirements.txt new file mode 100644 index 0000000..a2834c8 --- /dev/null +++ b/rocket_chat_docs_spider/requirements.txt @@ -0,0 +1,2 @@ +requests==2.32.3 +scrapy==2.12.0 \ No newline at end of file