Dataset Collector #1, Chudinova Alla - 19FPL2 #37

Open
wants to merge 14 commits into main
article.py: 6 changes (3 additions, 3 deletions)
@@ -46,7 +46,7 @@ def save_raw(self):
                      indent=4,
                      ensure_ascii=False,
                      separators=(',', ': '))

    @staticmethod
    def from_meta_json(json_path: str):
        """
@@ -94,13 +94,13 @@ def _get_meta(self):
            'author': self.author,
            'topics': self.topics
        }

    def _date_to_text(self):
        """
        Converts datetime object to text
        """
        return self.date.strftime("%Y-%m-%d %H:%M:%S")

    def _get_raw_text_path(self):
        """
        Returns path for requested raw article
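
A small aside on the two date formats used in this project: unify_date_format in scrapper.py (further down in this diff) parses dates as "%d.%m.%Y", while _date_to_text above serializes them as "%Y-%m-%d %H:%M:%S". A minimal sketch of that round trip, using a made-up date value:

import datetime

date = datetime.datetime.strptime('01.03.2021', "%d.%m.%Y")  # as parsed by unify_date_format
print(date.strftime("%Y-%m-%d %H:%M:%S"))  # 2021-03-01 00:00:00, the format written by _date_to_text
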
crawler_config.json: 6 changes (3 additions, 3 deletions)
@@ -1,5 +1,5 @@
{
-    "base_urls": [],
-    "total_articles_to_find_and_parse": 0,
-    "max_number_articles_to_get_from_one_seed": 0
+    "base_urls": ["https://mordovia-news.ru/"],
+    "total_articles_to_find_and_parse": 3,
+    "max_number_articles_to_get_from_one_seed": 3
}
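
For reference, a minimal sketch of how these fields are consumed (it mirrors validate_config further down in this diff; the file name and key names are taken from the config above):

import json

with open('crawler_config.json', 'r', encoding='utf-8') as file:
    settings = json.load(file)

seed_urls = settings['base_urls']                            # ["https://mordovia-news.ru/"]
max_articles = settings['total_articles_to_find_and_parse']  # 3
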
requirements.txt: 3 changes (3 additions, 0 deletions)
@@ -0,0 +1,3 @@
+requests == 2.25.1
+beautifulsoup4 == 4.9.3
+lxml == 4.6.2
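
A quick way to check that these pinned versions are the ones importable in the working environment (a sketch; the __version__ attributes are the standard ones exposed by these packages):

import requests
import bs4
from lxml import etree

# Compare the installed versions against the pins in requirements.txt
print(requests.__version__)  # expected 2.25.1
print(bs4.__version__)       # expected 4.9.3
print(etree.__version__)     # expected 4.6.2 (lxml may append a build suffix)
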
scrapper.py: 84 changes (68 additions, 16 deletions)
@@ -2,6 +2,15 @@
Crawler implementation
"""

+import re
+import os
+import json
+import datetime
+import requests
+from bs4 import BeautifulSoup
+from article import Article
+from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH


class IncorrectURLError(Exception):
"""
@@ -32,66 +41,109 @@ class Crawler:
    Crawler implementation
    """
    def __init__(self, seed_urls: list, max_articles: int):
-        pass
-
-    @staticmethod
-    def _extract_url(article_bs):
-        pass
+        self.search_urls = seed_urls
+        self.max_articles = max_articles
+        self.found_urls = []
+        self.link_pattern = r'/?news-\d+-\d+\.htm'
+
+    def _extract_url(self, article_bs):
+        links = []
+        for link in article_bs.find_all('a', href=True):
+            potential_link = re.match(self.link_pattern, link['href'])
+            if potential_link:
+                links.append(potential_link.group(0))
+        return links

    def find_articles(self):
        """
        Finds articles
        """
-        pass
+        for url in self.search_urls:
+            request = requests.get(url).content
+            soup = BeautifulSoup(request,
+                                 features='lxml')
+            for article_url in self._extract_url(soup):
+                if len(self.found_urls) != self.max_articles \
+                        and url+article_url not in self.found_urls:
+                    self.found_urls.append(url+article_url)
+        print(f'Found {len(self.found_urls)} links to articles to process')

    def get_search_urls(self):
        """
        Returns seed_urls param
        """
-        pass
+        return self.found_urls


class ArticleParser:
    """
    ArticleParser implementation
    """
    def __init__(self, full_url: str, article_id: int):
-        pass
+        self.article = Article(url=full_url, article_id=article_id)

    def _fill_article_with_text(self, article_soup):
-        pass
+        self.article.text = article_soup.find('dd', class_='text').text

    def _fill_article_with_meta_information(self, article_soup):
-        pass
+        self.article.title = article_soup.find('dd', class_='title').text.strip()
+        self.article.author = 'NOT FOUND'
+        self.article.topics = article_soup.find('span', class_='title_text').find_all('a')[1].text
+        self.article.date = self.unify_date_format(article_soup.find('span', class_='title_data').text[-10:])

    @staticmethod
    def unify_date_format(date_str):
        """
        Unifies date format
        """
-        pass
+        return datetime.datetime.strptime(date_str, "%d.%m.%Y")

    def parse(self):
        """
        Parses each article
        """
-        pass
+        request = requests.get(self.article.url).content
+        soup = BeautifulSoup(request, features='lxml')
+        self._fill_article_with_meta_information(soup)
+        self._fill_article_with_text(soup)
+        self.article.save_raw()


def prepare_environment(base_path):
    """
    Creates ASSETS_PATH folder if not created and removes existing folder
    """
-    pass
+    if not os.path.exists(base_path):
+        os.makedirs(base_path)


def validate_config(crawler_path):
    """
    Validates given config
    """
-    pass
+    with open(crawler_path, 'r', encoding='utf-8') as data:
+        settings = json.load(data)
+    url_pattern = 'https://'
+
+    for url in settings['base_urls']:
+        if url_pattern not in url:
+            raise IncorrectURLError
+
+    if not isinstance(settings['total_articles_to_find_and_parse'], int):
+        raise IncorrectNumberOfArticlesError
+
+    if settings['total_articles_to_find_and_parse'] > 100:
+        raise NumberOfArticlesOutOfRangeError
+    return settings['base_urls'], settings['total_articles_to_find_and_parse']


if __name__ == '__main__':
-    # YOUR CODE HERE
-    pass
+    urls, num_articles = validate_config(CRAWLER_CONFIG_PATH)
+    prepare_environment(ASSETS_PATH)
+
+    crawler = Crawler(seed_urls=urls, max_articles=num_articles)
+    crawler.find_articles()
+
+    for _article_id, _article_link in enumerate(crawler.get_search_urls()):
+        parser = ArticleParser(_article_link, _article_id+1)
+        parser.parse()
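
To make the link-collection step concrete, a short illustration of what Crawler._extract_url keeps with the pattern r'/?news-\d+-\d+\.htm'; the hrefs are made-up examples, not taken from the site:

import re

link_pattern = r'/?news-\d+-\d+\.htm'

# Hypothetical hrefs, for illustration only
print(re.match(link_pattern, '/news-12345-6.htm'))  # match -> relative link gets collected
print(re.match(link_pattern, 'news-777-1.htm'))     # match -> collected as well
print(re.match(link_pattern, '/about.htm'))         # None -> skipped
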
target_score.txt: 2 changes (1 addition, 1 deletion)
@@ -1,5 +1,5 @@
# Target score for scrapper.py:
-6
+8
Contributor commented:

Yes, that's a good switch.

Author commented:

No, I do not think so. It is evident I'd better switch to 4 or 6 later.


# Target score for pipeline.py:
0