Dataset Collector #1, Ganina Olesya - 19FPL1 #32

Open · wants to merge 6 commits into main
6 changes: 3 additions & 3 deletions crawler_config.json
@@ -1,5 +1,5 @@
{
    "base_urls": ["http://www.znamyatrud.ru/news-7.html"],
    "total_articles_to_find_and_parse": 3,
    "max_number_articles_to_get_from_one_seed": 3
}
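For reference, a minimal sketch of how these three fields are consumed once the config is loaded; it assumes the script runs from the repository root so the relative filename resolves, and it mirrors the tuple that validate_config in scrapper.py returns.

# Sketch: load crawler_config.json and unpack the three fields the crawler needs.
# Assumes the current working directory is the repository root.
import json

with open('crawler_config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

seed_urls = config['base_urls']                                # pages to start crawling from
max_articles = config['total_articles_to_find_and_parse']      # overall cap on collected articles
per_seed = config['max_number_articles_to_get_from_one_seed']  # cap applied to each seed page
print(seed_urls, max_articles, per_seed)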
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
requests == 2.25.1
beautifulsoup4 == 4.9.3
lxml == 4.6.2
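The pinned versions can be installed with pip from this file; the short sketch below is one way to confirm the installed versions match the pins (it assumes Python 3.8+, where importlib.metadata is in the standard library).

# Sketch: check that the installed dependency versions match the pins above.
# Requires Python 3.8+ for importlib.metadata.
from importlib.metadata import version

for package, pinned in [('requests', '2.25.1'), ('beautifulsoup4', '4.9.3'), ('lxml', '4.6.2')]:
    installed = version(package)
    print(f"{package}: installed {installed}, pinned {pinned}, match={installed == pinned}")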
80 changes: 67 additions & 13 deletions scrapper.py
@@ -1,7 +1,18 @@
"""
Crawler implementation
"""

import datetime
import json
import os
import random
import re
import shutil
from time import sleep

import requests
from bs4 import BeautifulSoup

from article import Article
from constants import ASSETS_PATH
from constants import CRAWLER_CONFIG_PATH

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'}

class IncorrectURLError(Exception):
"""
@@ -32,66 +43,109 @@ class Crawler:
Crawler implementation
"""
    def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: int):
        self.seed_urls = seed_urls
        self.max_articles = max_articles
        self.max_articles_per_seed = max_articles_per_seed
        self.urls = []

    @staticmethod
    def _extract_url(article_bs):
        return article_bs.find('a').attrs['href']

    def find_articles(self):
        """
        Finds articles
        """
        for seed_url in self.seed_urls:
            response = requests.get(seed_url, headers=headers)
            sleep(random.randrange(2, 6))
            if not response:
                raise IncorrectURLError
            b_soup = BeautifulSoup(response.content, features='lxml')
            article_soup = b_soup.find_all('h1', class_='entry-title')
            for article_bs in article_soup[:self.max_articles_per_seed]:
                if len(self.urls) < self.max_articles:
                    article_url = self._extract_url(article_bs)
                    if article_url not in self.urls:
                        self.urls.append(article_url)
        return self.urls

    def get_search_urls(self):
        """
        Returns seed_urls param
        """
        return self.seed_urls


class ArticleParser:
"""
ArticleParser implementation
"""
    def __init__(self, full_url: str, article_id: int):
        self.full_url = full_url
        self.article_id = article_id
        self.article = Article(full_url, article_id)

    def _fill_article_with_text(self, article_soup):
        self.article.text = article_soup.find(name='div', class_="onemidnew").text

    def _fill_article_with_meta_information(self, article_soup):
        self.article.title = article_soup.find('h2', class_='mnname').text.strip()
        self.article.author = 'NOT FOUND'
        for topic in article_soup.find_all('a', rel="tag"):
            self.article.topics.append(topic.text)
        self.article.date = self.unify_date_format(article_soup.find(name='div', class_='mndate').text)

    @staticmethod
    def unify_date_format(date_str):
        """
        Unifies date format
        """
        return datetime.datetime.strptime(date_str, "%d.%m.%Y")

    def parse(self):
        """
        Parses each article
        """
        article_bs = BeautifulSoup(requests.get(self.full_url, headers=headers).content, 'lxml')
        self._fill_article_with_text(article_bs)
        self._fill_article_with_meta_information(article_bs)
        self.article.save_raw()
        return self.article


def prepare_environment(base_path):
    """
    Creates ASSETS_PATH folder if not created and removes existing folder
    """
    if os.path.exists(base_path):
        shutil.rmtree(base_path)
    os.makedirs(base_path)


def validate_config(crawler_path):
    """
    Validates given config
    """
    with open(crawler_path, 'r', encoding='utf-8') as file:
        crawler_config = json.load(file)
    for base_url in crawler_config['base_urls']:
        if not re.match(r'https?://', base_url):
            raise IncorrectURLError
    if 'total_articles_to_find_and_parse' not in crawler_config or \
            not isinstance(crawler_config['total_articles_to_find_and_parse'], int):
        raise IncorrectNumberOfArticlesError
    if crawler_config['total_articles_to_find_and_parse'] > 100:
        raise NumberOfArticlesOutOfRangeError
    return crawler_config['base_urls'], crawler_config['total_articles_to_find_and_parse'], \
        crawler_config['max_number_articles_to_get_from_one_seed']


if __name__ == '__main__':
    urls, maxi_articles, maxi_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH)
    prepare_environment(ASSETS_PATH)
    crawler = Crawler(urls, maxi_articles, maxi_articles_per_seed)
    crawler.find_articles()
    for i, article_url in enumerate(crawler.urls):
        parser = ArticleParser(full_url=article_url, article_id=i + 1)
        parser.parse()
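To sanity-check the selectors used in Crawler._extract_url and ArticleParser.unify_date_format without hitting the live site, here is a small offline sketch; the HTML fragment is invented for illustration and only mimics the tags the code looks for, so the real markup of znamyatrud.ru may differ.

# Offline sketch: run the same BeautifulSoup lookups on a hand-written fragment.
# The markup below is hypothetical; it only imitates the tags used above
# ('h1.entry-title' wrapping a link, 'div.mndate' holding a DD.MM.YYYY date).
import datetime
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<h1 class="entry-title"><a href="http://www.znamyatrud.ru/news/example.html">Example headline</a></h1>
<div class="mndate">05.04.2021</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, 'lxml')
link = soup.find('h1', class_='entry-title').find('a').attrs['href']      # what _extract_url returns
date = datetime.datetime.strptime(soup.find('div', class_='mndate').text, "%d.%m.%Y")
print(link, date.date())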
5 changes: 4 additions & 1 deletion target_score.txt
@@ -1,5 +1,8 @@
# Target score for scrapper.py:
4

# Target score for pipeline.py:
0

# Skip pipeline checks:
1