feat: Rocket.Chat Docs Crawl (#1)
* init and add spider script

* add data use script
Dnouv authored Dec 20, 2024
1 parent 5c92dd1 commit a008d59
Showing 4 changed files with 247 additions and 0 deletions.
76 changes: 76 additions & 0 deletions rocket_chat_docs_spider/README.md
@@ -0,0 +1,76 @@
# Rocket.Chat Documentation Spider

A web scraping tool built with Scrapy to extract documentation content from Rocket.Chat's official documentation websites.

## Overview

This spider crawls both the main Rocket.Chat documentation (`docs.rocket.chat/docs`) and developer documentation (`developer.rocket.chat/docs`) to extract:

- Page titles
- Main content
- H2 headers
- URLs

## Prerequisites

Install the required dependencies:
### Create virtual environment
```sh
python -m venv .venv
```

### Activate virtual environment
```sh
source .venv/bin/activate
```

### Install dependencies
```sh
pip install -r requirements.txt
```

## Configuration
The spider is configured with the following settings (see the excerpt after this list):

- Rate limiting: 5 seconds between requests
- Single concurrent request
- Respects robots.txt
- Automatic retry (3 times) for common error codes
- Custom user agent and headers for reliability
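
A condensed excerpt of the corresponding `custom_settings` block in `rcspider.py`:

```python
custom_settings = {
    'CONCURRENT_REQUESTS': 1,    # one request at a time
    'DOWNLOAD_DELAY': 5,         # 5 seconds between requests
    'ROBOTSTXT_OBEY': True,      # respect robots.txt
    'RETRY_TIMES': 3,            # retry failed requests
    'RETRY_HTTP_CODES': [403, 429, 500, 502, 503, 504],
    'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
}
```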

## Output
The spider generates a JSONL (JSON Lines) file with the following structure:
```json
{
"page_title": "Page Title",
"content": ["Paragraph 1", "Paragraph 2", ...],
"h2_headers": ["Header 1", "Header 2", ...],
"url": "https://docs.rocket.chat/..."
}
```
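
Since each line is a standalone JSON object, the file can be read one record at a time. A minimal sketch (`iter_docs` is a hypothetical helper, for illustration only):

```python
import json

def iter_docs(path="rocket_chat_docs.jsonl"):
    """Yield one scraped page per line of the JSONL file."""
    with open(path, encoding="utf8") as f:
        for line in f:
            yield json.loads(line)

for doc in iter_docs():
    print(doc["page_title"], "->", doc["url"])
```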

## Usage
To run the spider, execute the following command:

```sh
scrapy runspider rcspider.py -o rocket_chat_docs.jsonl
```

The spider will crawl the Rocket.Chat documentation websites and save the extracted content to `rocket_chat_docs.jsonl`.

To send the output to a different file, change the filename in the `-o` argument. Note that the spider's `FEEDS` setting also writes a copy to `data/output.jsonl`.

To read and process the JSONL file, run the bundled script:

```sh
python read_and_send.py
```

By default, the script reads the `rocket_chat_docs.jsonl` file and sends the extracted content to an endpoint defined in the `DocumentProcessor` class. However, you can change the endpoint or write your own custom callback function.
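
For instance, here is a sketch of a callback that saves processed documents locally instead of POSTing them (`save_locally` is a hypothetical name, not part of the script):

```python
import json
from read_and_send import DocumentProcessor

def save_locally(doc):
    # Hypothetical callback: append each processed document to a local file
    with open("processed_docs.jsonl", "a", encoding="utf8") as f:
        f.write(json.dumps(doc) + "\n")

# The endpoint is never contacted when a callback is supplied
processor = DocumentProcessor(api_endpoint="http://{{host}}/documents")
processor.process_file("rocket_chat_docs.jsonl", callback=save_locally)
```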


## Important Notes
- The spider implements polite crawling with a 5-second delay between requests
- Only URLs from `docs.rocket.chat` and `developer.rocket.chat` domains are crawled
- Already visited URLs are tracked to prevent duplicate crawling
61 changes: 61 additions & 0 deletions rocket_chat_docs_spider/rcspider.py
@@ -0,0 +1,61 @@
import scrapy


class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://docs.rocket.chat/docs', 'https://developer.rocket.chat/docs']
    visited_urls = set()

    custom_settings = {
        'CONCURRENT_REQUESTS': 1,  # One request at a time
        'DOWNLOAD_DELAY': 5,       # 5 seconds between requests
        'ROBOTSTXT_OBEY': True,
        'RETRY_TIMES': 3,          # Retry failed requests
        'RETRY_HTTP_CODES': [403, 429, 500, 502, 503, 504],
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'en-US,en;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive'
        },
        'FEEDS': {
            'data/output.jsonl': {
                'format': 'jsonlines',
                'encoding': 'utf8'
            }
        }
    }

    def start_requests(self):
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5'
        }
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        # Extract all links on the page, resolve them to absolute URLs, and keep
        # only unvisited links within the two documentation domains
        links = response.css('a::attr(href)').getall()
        filtered_links = [
            response.urljoin(link) for link in links
            if ('docs.rocket.chat' in response.urljoin(link) or 'developer.rocket.chat' in response.urljoin(link))
            and response.urljoin(link) not in self.visited_urls
        ]
        # Follow each new link exactly once
        for link in filtered_links:
            if link not in self.visited_urls:
                self.visited_urls.add(link)
                yield scrapy.Request(link, callback=self.parse)

        # Extract the page title, paragraphs, and H2 headers from the content block
        content = response.css('.content_block div p::text').getall()
        h2_headers = response.css('.content_block div h2::text').getall()
        page_title = response.css('.content_block div h1::text').get()

        yield {
            "page_title": page_title,
            "content": content,
            "h2_headers": h2_headers,
            "url": response.url
        }
108 changes: 108 additions & 0 deletions rocket_chat_docs_spider/read_and_send.py
@@ -0,0 +1,108 @@
from typing import Dict, Any, Optional, Union
import json
import requests


class DocumentProcessor:
    def __init__(
        self,
        api_endpoint: str,
        headers: Optional[Dict[str, str]] = None,
        default_metadata: Optional[Dict[str, Any]] = None
    ):
        self.api_endpoint = api_endpoint
        self.headers = headers or {}
        self.default_metadata = default_metadata or {}

    def process_file(self, file_path: str, callback=None) -> None:
        """Process a JSONL file and send each document."""
        with open(file_path) as f:
            for line in f:
                try:
                    data = json.loads(line)
                    processed_doc = self.prepare_document(data)

                    if callback:
                        # Allow custom handling of the processed document
                        callback(processed_doc)
                    else:
                        # Default sending behavior
                        self.send_document(processed_doc)
                except Exception as e:
                    print(f"Error processing line: {e}")

    def prepare_document(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Prepare a document from raw scraped data."""
        # Extract content
        raw_content = data.get('content', None)
        content = self._process_content(raw_content)

        # Get title
        title = data.get('page_title')

        if not content or not title:
            raise ValueError("Missing required content or title")

        # Determine classification
        classification = self._determine_classification(data.get('url', ''))

        # Combine with default metadata and return
        metadata = {
            **self.default_metadata,
            "url": data.get('url', ''),
            "h2_headers": data.get('h2_headers', []),
        }

        return {
            "classification": classification,
            "contents": content,
            "name": title,
            "metadata": metadata
        }

    def send_document(self, document: Dict[str, Any]) -> requests.Response:
        """Send a document to the API endpoint."""
        response = requests.post(
            self.api_endpoint,
            json=document,
            headers=self.headers
        )
        return response

    def _process_content(self, raw_content: Union[list, str, None]) -> str:
        """Normalize raw content into a single string."""
        if isinstance(raw_content, list):
            return ' '.join(str(c) for c in raw_content)
        elif isinstance(raw_content, str):
            return raw_content
        raise ValueError(f"Unexpected content type: {type(raw_content)}")

    def _determine_classification(self, url: str) -> str:
        """Classify a document as developer or user docs based on its URL."""
        return "developer" if "developer.rocket.chat" in url else "user"


# Example usage (running both calls below processes the file twice):
if __name__ == "__main__":
    # Configuration
    processor = DocumentProcessor(
        api_endpoint="http://{{host}}/documents",
        headers={
            "Content-Type": "application/json",
            # Add any other headers here
        },
        default_metadata={
            "doc_expiry": "2024-12-31T23:59:59Z",
            "source": "Rocket.Chat",
            # Add any other default metadata here
        }
    )

    # Process the file with the default sending behavior
    processor.process_file("rocket_chat_docs.jsonl")

    # Or with a custom callback
    def custom_handler(doc):
        print(f"Processing document: {doc['name']}")
        # Add custom logic here

    processor.process_file("rocket_chat_docs.jsonl", callback=custom_handler)
2 changes: 2 additions & 0 deletions rocket_chat_docs_spider/requirements.txt
@@ -0,0 +1,2 @@
requests==2.32.3
scrapy==2.12.0
