Commit
* init and add spider script
* add data use script
Showing 4 changed files with 247 additions and 0 deletions.
@@ -0,0 +1,76 @@
# Rocket.Chat Documentation Spider

A web scraping tool built with Scrapy to extract documentation content from Rocket.Chat's official documentation websites.

## Overview

This spider crawls both the main Rocket.Chat documentation (`docs.rocket.chat/docs`) and the developer documentation (`developer.rocket.chat/docs`) to extract:

- Page titles
- Main content
- H2 headers
- URLs

## Prerequisites

Install the required dependencies:

### Create a virtual environment

```sh
python -m venv .venv
```

### Activate the virtual environment

```sh
source .venv/bin/activate
```

### Install dependencies

```sh
pip install -r requirements.txt
```
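
The pinned dependencies (from `requirements.txt`):

```
requests==2.32.3
scrapy==2.12.0
```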

## Configuration

The spider is configured with the following settings (see the excerpt from `rcspider.py` below):

- Rate limiting: 5 seconds between requests
- Single concurrent request
- Respects robots.txt
- Automatic retry (3 times) for common error codes
- Custom user agent and headers for reliability
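
These options are set in the spider's `custom_settings` dictionary in `rcspider.py`; the relevant excerpt:

```python
custom_settings = {
    'CONCURRENT_REQUESTS': 1,  # one request at a time
    'DOWNLOAD_DELAY': 5,       # 5 seconds between requests
    'ROBOTSTXT_OBEY': True,
    'RETRY_TIMES': 3,          # retry failed requests
    'RETRY_HTTP_CODES': [403, 429, 500, 502, 503, 504],
    'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
}
```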

## Output

The spider generates a JSONL (JSON Lines) file where each line has the following structure:

```json
{
  "page_title": "Page Title",
  "content": ["Paragraph 1", "Paragraph 2", ...],
  "h2_headers": ["Header 1", "Header 2", ...],
  "url": "https://docs.rocket.chat/..."
}
```
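
A minimal sketch for reading the output with the standard library (the filename assumes the default used in the Usage section below):

```python
import json

with open("rocket_chat_docs.jsonl") as f:
    for line in f:
        record = json.loads(line)  # one JSON object per line
        print(record["page_title"], record["url"])
```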

## Usage

To run the spider, execute the following command:

```sh
scrapy runspider rcspider.py -o rocket_chat_docs.jsonl
```
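
Alternatively, the spider can be started from Python with Scrapy's `CrawlerProcess`. This is a minimal sketch, not part of the repository, and it assumes the spider module is importable as `rcspider`:

```python
from scrapy.crawler import CrawlerProcess

from rcspider import MySpider  # assumed module name, matching rcspider.py

process = CrawlerProcess()
process.crawl(MySpider)
# Blocks until the crawl finishes; output goes to data/output.jsonl via the spider's FEEDS setting.
process.start()
```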

The spider will crawl the Rocket.Chat documentation websites and save the extracted content to `rocket_chat_docs.jsonl`.

To send the output to a different file, change the filename in the `-o` argument.

To read the JSONL file and forward its contents, run the processing script:

```sh
python read_and_send.py
```

By default, the script reads `rocket_chat_docs.jsonl` and sends the extracted content to an endpoint defined in the `DocumentProcessor` class. You can change the endpoint or supply your own custom callback function.
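
For example, here is a sketch of driving `DocumentProcessor` from your own script with a custom callback; the endpoint is a placeholder and `save_locally` is a hypothetical handler:

```python
from read_and_send import DocumentProcessor

def save_locally(doc):
    # Inspect the prepared document locally instead of POSTing it to the API.
    print(doc["name"], doc["classification"], len(doc["contents"]))

processor = DocumentProcessor(api_endpoint="http://localhost:3000/documents")  # placeholder endpoint
processor.process_file("rocket_chat_docs.jsonl", callback=save_locally)
```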

## Important Notes

- The spider implements polite crawling with a 5-second delay between requests
- Only URLs from `docs.rocket.chat` and `developer.rocket.chat` domains are crawled
- Already visited URLs are tracked to prevent duplicate crawling
@@ -0,0 +1,61 @@
import scrapy


class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://docs.rocket.chat/docs', 'https://developer.rocket.chat/docs']
    visited_urls = set()

    custom_settings = {
        'CONCURRENT_REQUESTS': 1,  # One request at a time
        'DOWNLOAD_DELAY': 5,  # 5 seconds between requests
        'ROBOTSTXT_OBEY': True,
        'RETRY_TIMES': 3,  # Retry failed requests
        'RETRY_HTTP_CODES': [403, 429, 500, 502, 503, 504],
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'en-US,en;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive'
        },
        'FEEDS': {
            'data/output.jsonl': {
                'format': 'jsonlines',
                'encoding': 'utf8'
            }
        }
    }

    def start_requests(self):
        # Seed the crawl with explicit headers on the two documentation roots
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5'
        }
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        # Extract all links on the page and keep only unvisited Rocket.Chat documentation URLs
        links = response.css('a::attr(href)').getall()
        filtered_links = [
            response.urljoin(link) for link in links
            if ('docs.rocket.chat' in response.urljoin(link) or 'developer.rocket.chat' in response.urljoin(link))
            and response.urljoin(link) not in self.visited_urls
        ]
        # Follow links, marking each as visited so it is not requested twice
        for link in filtered_links:
            if link not in self.visited_urls:
                self.visited_urls.add(link)
                yield scrapy.Request(link, callback=self.parse)

        # Scrape the page title, paragraph text, and H2 headers from the content block
        content = response.css('.content_block div p::text').getall()
        h2_headers = response.css('.content_block div h2::text').getall()
        page_title = response.css('.content_block div h1::text').get()

        yield {
            "page_title": page_title,
            "content": content,
            "h2_headers": h2_headers,
            "url": response.url
        }
@@ -0,0 +1,108 @@
from typing import Any, Callable, Dict, Optional, Union
import json
import requests


class DocumentProcessor:
    def __init__(
        self,
        api_endpoint: str,
        headers: Optional[Dict[str, str]] = None,
        default_metadata: Optional[Dict[str, Any]] = None
    ):
        self.api_endpoint = api_endpoint
        self.headers = headers or {}
        self.default_metadata = default_metadata or {}

    def process_file(self, file_path: str, callback: Optional[Callable[[Dict[str, Any]], None]] = None) -> None:
        """Process a JSONL file and send each document."""
        with open(file_path) as f:
            for line in f:
                try:
                    data = json.loads(line)
                    processed_doc = self.prepare_document(data)

                    if callback:
                        # Allow custom handling of the processed document
                        callback(processed_doc)
                    else:
                        # Default behavior: POST the document to the API endpoint
                        self.send_document(processed_doc)
                except Exception as e:
                    print(f"Error processing line: {e}")

    def prepare_document(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Prepare a document from raw scraped data."""
        # Extract content
        raw_content = data.get('content', None)
        content = self._process_content(raw_content)

        # Get title
        title = data.get('page_title')

        if not content or not title:
            raise ValueError("Missing required content or title")

        # Determine classification
        classification = self._determine_classification(data.get('url', ''))

        # Combine with default metadata and return
        metadata = {
            **self.default_metadata,
            "url": data.get('url', ''),
            "h2_headers": data.get('h2_headers', []),
        }

        return {
            "classification": classification,
            "contents": content,
            "name": title,
            "metadata": metadata
        }

    def send_document(self, document: Dict[str, Any]) -> requests.Response:
        """Send a document to the API endpoint."""
        response = requests.post(
            self.api_endpoint,
            json=document,
            headers=self.headers
        )
        return response

    def _process_content(self, raw_content: Union[list, str, None]) -> str:
        """Join list content into a single string, or pass strings through."""
        if isinstance(raw_content, list):
            return ' '.join(str(c) for c in raw_content)
        elif isinstance(raw_content, str):
            return raw_content
        raise ValueError(f"Unexpected content type: {type(raw_content)}")

    def _determine_classification(self, url: str) -> str:
        """Determine document classification based on the source URL."""
        return "developer" if "developer.rocket.chat" in url else "user"


# Example usage:
if __name__ == "__main__":
    # Configuration
    processor = DocumentProcessor(
        api_endpoint="http://{{host}}/documents",
        headers={
            "Content-Type": "application/json",
            # Add any other headers here
        },
        default_metadata={
            "doc_expiry": "2024-12-31T23:59:59Z",
            "source": "Rocket.Chat",
            # Add any other default metadata here
        }
    )

    # Process the file with the default sending behavior
    processor.process_file("rocket_chat_docs.jsonl")

    # Or with a custom callback
    def custom_handler(doc):
        print(f"Processing document: {doc['name']}")
        # Add custom logic here

    processor.process_file("rocket_chat_docs.jsonl", callback=custom_handler)
@@ -0,0 +1,2 @@
requests==2.32.3
scrapy==2.12.0