Scrapper, Rostislav Hmelevski - 22FPL1 #83

Closed · wants to merge 62 commits
62 commits
5b88425
[SYNC] Aligned config with Hello, LLM! (#2)
demid5111 Mar 31, 2024
69772f0
Merge branch 'main' of https://github.com/RostislavHmelevski/2023-2-l…
RostislavHmelevski Apr 1, 2024
abc6dae
test
RostislavHmelevski Apr 24, 2024
4bf763c
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 25, 2024
ac02a85
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 29, 2024
2f0da8e
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 30, 2024
8ac5f79
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 8, 2024
52b4578
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 12, 2024
0a9e7b2
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 20, 2024
e7c3c9d
almost done with the work on '4' but help with stage 3.2 pls
RostislavHmelevski May 26, 2024
67f713e
almost done with the work on '4' but help with stage 3.2 pls
RostislavHmelevski May 26, 2024
3138d53
almost done with the work on '4' but help with stage 3.2 pls
RostislavHmelevski May 26, 2024
f163df6
some steps forward
RostislavHmelevski Jun 1, 2024
d78e557
this was awkward
RostislavHmelevski Jun 1, 2024
c3930f7
this was akward 2.0
RostislavHmelevski Jun 1, 2024
755189a
this was akward 2.0
RostislavHmelevski Jun 1, 2024
1314c1a
still trying
RostislavHmelevski Jun 1, 2024
2317fa7
still trying
RostislavHmelevski Jun 1, 2024
c4249fd
still trying
RostislavHmelevski Jun 1, 2024
0c33364
still trying
RostislavHmelevski Jun 1, 2024
8958e09
maybe
RostislavHmelevski Jun 2, 2024
76a5c01
maybe
RostislavHmelevski Jun 2, 2024
488b860
maybe
RostislavHmelevski Jun 2, 2024
7803187
maybe
RostislavHmelevski Jun 2, 2024
0b739f7
maybe
RostislavHmelevski Jun 2, 2024
dbfddab
maybe
RostislavHmelevski Jun 2, 2024
3a46d1c
maybe
RostislavHmelevski Jun 2, 2024
ed0fc11
maybe
RostislavHmelevski Jun 2, 2024
df387db
maybe
RostislavHmelevski Jun 2, 2024
42ba7bb
requirements check
RostislavHmelevski Jun 2, 2024
9bd3007
check
RostislavHmelevski Jun 2, 2024
3ea458d
check
RostislavHmelevski Jun 2, 2024
b2a9881
check
RostislavHmelevski Jun 2, 2024
7704b14
check
RostislavHmelevski Jun 2, 2024
bd80bc4
check
RostislavHmelevski Jun 2, 2024
5390e88
check
RostislavHmelevski Jun 2, 2024
80d260e
check
RostislavHmelevski Jun 2, 2024
fcf0a1f
check
RostislavHmelevski Jun 2, 2024
94b74f3
check
RostislavHmelevski Jun 2, 2024
98b7597
check
RostislavHmelevski Jun 2, 2024
c2085d9
check
RostislavHmelevski Jun 2, 2024
53a8209
please be right
RostislavHmelevski Jun 2, 2024
23a9261
checking requirements
RostislavHmelevski Jun 2, 2024
659871e
checking requirements
RostislavHmelevski Jun 2, 2024
670b566
Update
Vasilisa-Blyudova Jun 2, 2024
a91975d
for 6
RostislavHmelevski Jun 2, 2024
d0bc23f
Merge remote-tracking branch 'origin/main'
RostislavHmelevski Jun 2, 2024
c4b6be0
for 6
RostislavHmelevski Jun 2, 2024
d466f21
final hope
RostislavHmelevski Jun 2, 2024
820524e
final hope
RostislavHmelevski Jun 2, 2024
7f59eb5
final hope
RostislavHmelevski Jun 2, 2024
3ad818e
final hope
RostislavHmelevski Jun 2, 2024
b14b84c
final hope
RostislavHmelevski Jun 2, 2024
f2cb8c4
final hope
RostislavHmelevski Jun 2, 2024
d0c2e95
final hope
RostislavHmelevski Jun 2, 2024
d25f6a5
final hope
RostislavHmelevski Jun 2, 2024
373179b
final hope
RostislavHmelevski Jun 2, 2024
dee06f5
final hope
RostislavHmelevski Jun 2, 2024
7a6a77b
final hope
RostislavHmelevski Jun 2, 2024
4c28e8c
final hope
RostislavHmelevski Jun 2, 2024
e29eb75
final hope
RostislavHmelevski Jun 2, 2024
5298faf
final hope
RostislavHmelevski Jun 2, 2024
174 changes: 173 additions & 1 deletion lab_5_scrapper/scrapper.py
@@ -2,9 +2,63 @@
Crawler implementation.
"""
# pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
import datetime
import json
import pathlib
import re
import shutil
from typing import Pattern, Union

import requests
from bs4 import BeautifulSoup

from core_utils.article.article import Article
from core_utils.article.io import to_meta, to_raw
from core_utils.config_dto import ConfigDTO
from core_utils.constants import ASSETS_PATH, CRAWLER_CONFIG_PATH


class IncorrectSeedURLError(Exception):
"""
Seed URL does not match standard pattern.
"""


class IncorrectNumberOfArticlesError(Exception):
"""
Total number of articles to parse is not integer.
"""


class NumberOfArticlesOutOfRangeError(Exception):
"""
Total number of articles is out of range from 1 to 150.
"""


class IncorrectHeadersError(Exception):
"""
Headers are not in a form of dictionary.
"""


class IncorrectEncodingError(Exception):
"""
encoding must be specified as a string.
"""


class IncorrectTimeoutError(Exception):
"""
timeout value must be a positive integer less than 60.
"""


class IncorrectVerifyError(Exception):
"""
verify certificate value must either be True or False.
"""


class Config:
"""
@@ -18,6 +72,16 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
Args:
path_to_config (pathlib.Path): Path to configuration.
"""
self.path_to_config = path_to_config
self._validate_config_content()
self.config = self._extract_config_content()
self._seed_urls = self.config.seed_urls
self._num_articles = self.config.total_articles
self._headers = self.config.headers
self._encoding = self.config.encoding
self._timeout = self.config.timeout
self._should_verify_certificate = self.config.should_verify_certificate
self._headless_mode = self.config.headless_mode

def _extract_config_content(self) -> ConfigDTO:
"""
@@ -26,11 +90,50 @@ def _extract_config_content(self) -> ConfigDTO:
Returns:
ConfigDTO: Config values
"""
with open(self.path_to_config, 'r', encoding='utf-8') as file:
config = json.load(file)
return ConfigDTO(
seed_urls=config["seed_urls"],
total_articles_to_find_and_parse=config["total_articles_to_find_and_parse"],
headers=config["headers"],
encoding=config["encoding"],
timeout=config["timeout"],
should_verify_certificate=config["should_verify_certificate"],
headless_mode=config["headless_mode"]
)

def _validate_config_content(self) -> None:
"""
Ensure configuration parameters are not corrupt.
"""
with open(self.path_to_config, 'r', encoding='utf-8') as f:
config = json.load(f)

if not isinstance(config['seed_urls'], list):
raise IncorrectSeedURLError

if not all(seed_url.startswith('https://donday.ru/') for seed_url in config['seed_urls']):
raise IncorrectSeedURLError

if (not isinstance(config['total_articles_to_find_and_parse'], int) or
config['total_articles_to_find_and_parse'] <= 0):
raise IncorrectNumberOfArticlesError

if not 1 < config['total_articles_to_find_and_parse'] <= 150:
raise NumberOfArticlesOutOfRangeError

if not isinstance(config['headers'], dict):
raise IncorrectHeadersError

if not isinstance(config['encoding'], str):
raise IncorrectEncodingError

if not isinstance(config['timeout'], int) or not 0 < config['timeout'] < 60:
raise IncorrectTimeoutError

if (not isinstance(config['should_verify_certificate'], bool) or
not isinstance(config['headless_mode'], bool)):
raise IncorrectVerifyError
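# Illustrative sketch (not part of this PR's changes), assuming a config file
# written to a hypothetical path; a non-integer timeout makes the validation
# above raise IncorrectTimeoutError.
#
# import json
# import pathlib
#
# broken = {
#     "seed_urls": ["https://donday.ru/"],
#     "headers": {},
#     "total_articles_to_find_and_parse": 3,
#     "encoding": "utf-8",
#     "timeout": "ten",  # not an int in the range (0, 60)
#     "should_verify_certificate": True,
#     "headless_mode": True,
# }
# path = pathlib.Path("broken_config.json")  # hypothetical file name
# path.write_text(json.dumps(broken), encoding="utf-8")
# try:
#     Config(path)
# except IncorrectTimeoutError:
#     print("timeout must be a positive integer below 60")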

def get_seed_urls(self) -> list[str]:
"""
@@ -39,6 +142,7 @@ def get_seed_urls(self) -> list[str]:
Returns:
list[str]: Seed urls
"""
return self._seed_urls

def get_num_articles(self) -> int:
"""
@@ -47,6 +151,7 @@ def get_num_articles(self) -> int:
Returns:
int: Total number of articles to scrape
"""
return self._num_articles

def get_headers(self) -> dict[str, str]:
"""
@@ -55,6 +160,7 @@ def get_headers(self) -> dict[str, str]:
Returns:
dict[str, str]: Headers
"""
return self._headers

def get_encoding(self) -> str:
"""
@@ -63,6 +169,7 @@ def get_encoding(self) -> str:
Returns:
str: Encoding
"""
return self._encoding

def get_timeout(self) -> int:
"""
@@ -71,6 +178,7 @@ def get_timeout(self) -> int:
Returns:
int: Number of seconds to wait for response
"""
return self._timeout

def get_verify_certificate(self) -> bool:
"""
@@ -79,6 +187,7 @@ def get_verify_certificate(self) -> bool:
Returns:
bool: Whether to verify certificate or not
"""
return self._should_verify_certificate

def get_headless_mode(self) -> bool:
"""
@@ -87,6 +196,7 @@ def get_headless_mode(self) -> bool:
Returns:
bool: Whether to use headless mode or not
"""
return self._headless_mode


def make_request(url: str, config: Config) -> requests.models.Response:
@@ -100,6 +210,11 @@ def make_request(url: str, config: Config) -> requests.models.Response:
Returns:
requests.models.Response: A response from a request
"""
res = requests.get(url=url,
timeout=config.get_timeout(),
headers=config.get_headers(),
verify=config.get_verify_certificate())
return res
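# Minimal usage sketch (not part of this PR's changes); assumes a valid
# crawler configuration already exists at CRAWLER_CONFIG_PATH:
#
# configuration = Config(CRAWLER_CONFIG_PATH)
# response = make_request('https://donday.ru/', configuration)
# print(response.status_code, response.encoding)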


class Crawler:
@@ -116,6 +231,8 @@ def __init__(self, config: Config) -> None:
Args:
config (Config): Configuration
"""
self.config = config
self.urls = []

def _extract_url(self, article_bs: BeautifulSoup) -> str:
"""
@@ -128,18 +245,35 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str:
str: Url from HTML
"""

article_url = article_bs.find('a').get('href')
return article_url

def find_articles(self) -> None:
"""
Find articles.
"""

for url in self.get_search_urls():
response = make_request(url, self.config)
if not response.ok:
continue
soup = BeautifulSoup(response.text, "html.parser")
contents = soup.find_all('div', id='dle-content')
max_articles = self.config.get_num_articles()
for content in contents[:max_articles]:
for item in content.find_all('h3', class_='btl'):
url_news = self._extract_url(item)
if url_news not in self.urls:
self.urls.append(url_news)
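# Sketch of the markup this method assumes (not part of this PR's changes;
# the sample HTML below is illustrative, the real donday.ru layout may differ):
#
# sample = ('<div id="dle-content">'
#           '<h3 class="btl"><a href="https://donday.ru/example.html">Title</a></h3>'
#           '</div>')
# demo_soup = BeautifulSoup(sample, "html.parser")
# item = demo_soup.find('h3', class_='btl')
# print(item.find('a').get('href'))  # -> https://donday.ru/example.html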

def get_search_urls(self) -> list:
"""
Get seed_urls param.

Returns:
list: seed_urls param
"""
return self.config.get_seed_urls()


# 10
@@ -160,6 +294,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None:
article_id (int): Article id
config (Config): Configuration
"""
self.full_url = full_url
self.article_id = article_id
self.config = config
self.article = Article(self.full_url, self.article_id)

def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
"""
@@ -168,6 +306,11 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
allnews = article_soup.find(itemprop="articleBody")
text_split = allnews.text.replace('\n', '').split()
text = ' '.join(text_split)
clear_text = '. '.join(text.split('. ')[:-2])
self.article.text = clear_text

def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
"""
@@ -176,6 +319,14 @@ def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
title_find = article_soup.find(itemprop="headline")
self.article.title = title_find.text.replace('\n', '')
author = article_soup.find(class_="argauthor")
self.article.author = [author.text.replace('\n', '').strip()]
topics = article_soup.find(class_="argcat")
self.article.topics = topics.text.replace('\n', '')
time = article_soup.find('time', itemprop="datePublished")
self.article.time = time.text.replace('\n', '')

def unify_date_format(self, date_str: str) -> datetime.datetime:
"""
@@ -195,6 +346,13 @@ def parse(self) -> Union[Article, bool, list]:
Returns:
Union[Article, bool, list]: Article instance
"""
response = make_request(self.full_url, self.config)
if response.ok:
article_bs = BeautifulSoup(response.text, features='lxml')
self._fill_article_with_text(article_bs)
self._fill_article_with_meta_information(article_bs)

return self.article
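# unify_date_format is left unimplemented in this diff and parse() keeps the
# publication date as raw text; a sketch of one possible conversion, assuming the
# site renders dates like "2 июня 2024 12:30" (both the format and the month map
# below are assumptions, not taken from this PR):
#
# MONTHS = {"января": 1, "февраля": 2, "марта": 3, "апреля": 4, "мая": 5,
#           "июня": 6, "июля": 7, "августа": 8, "сентября": 9, "октября": 10,
#           "ноября": 11, "декабря": 12}
#
# def unify_date_format_sketch(date_str: str) -> datetime.datetime:
#     day, month, year, clock = date_str.split()
#     hours, minutes = clock.split(':')
#     return datetime.datetime(int(year), MONTHS[month], int(day),
#                              int(hours), int(minutes))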


def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
@@ -204,13 +362,27 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
Args:
base_path (Union[pathlib.Path, str]): Path where articles stores
"""
if base_path.exists():
shutil.rmtree(base_path)
base_path.mkdir(parents=True)


def main() -> None:
"""
Entrypoint for scrapper module.
"""
conf = Config(CRAWLER_CONFIG_PATH)
prepare_environment(ASSETS_PATH)
crawler = Crawler(conf)
crawler.find_articles()

for i, url in enumerate(crawler.urls, 1):
parser = HTMLParser(url, i, conf)
article = parser.parse()
to_raw(article)
to_meta(article)
print('Done')


if __name__ == "__main__":
main()
main()
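The entrypoint above recreates ASSETS_PATH from scratch and then writes one raw text file and one metadata file per parsed article via to_raw and to_meta. A quick, hypothetical post-run check (the N_raw.txt / N_meta.json naming pattern is produced by core_utils and is assumed here, not confirmed by this diff):

from core_utils.constants import ASSETS_PATH

for saved_file in sorted(ASSETS_PATH.iterdir()):
    print(saved_file.name)  # expected to look like 1_meta.json, 1_raw.txt, 2_meta.json, ...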
6 changes: 3 additions & 3 deletions lab_5_scrapper/scrapper_config.json
@@ -1,9 +1,9 @@
 {
-  "seed_urls": [],
+  "seed_urls": ["https://donday.ru/"],
   "headers": {},
-  "total_articles_to_find_and_parse": 0,
+  "total_articles_to_find_and_parse": 3,
   "encoding": "",
-  "timeout": 0,
+  "timeout": 10,
   "should_verify_certificate": true,
   "headless_mode": true
 }
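The committed config still leaves "headers" and "encoding" empty. A hypothetical, more fully populated version could be generated like this (the User-Agent string and the utf-8 encoding are illustrative assumptions, not values taken from this PR):

import json

example_config = {
    "seed_urls": ["https://donday.ru/"],
    "headers": {"User-Agent": "Mozilla/5.0 (compatible; lab-scrapper)"},  # assumed value
    "total_articles_to_find_and_parse": 3,
    "encoding": "utf-8",  # assumed value
    "timeout": 10,
    "should_verify_certificate": True,
    "headless_mode": True,
}

with open("lab_5_scrapper/scrapper_config.json", "w", encoding="utf-8") as config_file:
    json.dump(example_config, config_file, indent=2, ensure_ascii=False)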
2 changes: 1 addition & 1 deletion lab_5_scrapper/settings.json
@@ -1,3 +1,3 @@
 {
-  "target_score": 0
+  "target_score": 6
 }
1 change: 1 addition & 0 deletions lab_5_scrapper/target_score.txt
@@ -0,0 +1 @@
+6