Commit

added realisation lab5
shoodeen committed Jun 2, 2024
1 parent 36b2065 commit 2eaeb42
Showing 4 changed files with 204 additions and 9 deletions.
182 changes: 181 additions & 1 deletion lab_5_scrapper/scrapper.py
@@ -1,10 +1,64 @@
"""
Crawler implementation.
"""
import datetime
import json
# pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
import pathlib
import re
import shutil
from typing import Pattern, Union

import requests
from bs4 import BeautifulSoup

from core_utils import constants
from core_utils.article.article import Article
from core_utils.article.io import to_meta, to_raw
from core_utils.config_dto import ConfigDTO


class IncorrectSeedURLError(Exception):
"""
Seed URL does not match standard pattern
"""


class NumberOfArticlesOutOfRangeError(Exception):
"""
Total number of articles is out of range from 1 to 150
"""


class IncorrectNumberOfArticlesError(Exception):
"""
Total number of articles to parse is not a positive integer
"""


class IncorrectHeadersError(Exception):
"""
Headers are not in the form of a dictionary
"""


class IncorrectEncodingError(Exception):
"""
Encoding is not a string
"""


class IncorrectTimeoutError(Exception):
"""
Timeout value is not a positive integer less than 60
"""


class IncorrectVerifyError(Exception):
"""
Verify certificate value is not True or False
"""


class Config:
"""
@@ -18,6 +72,17 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
Args:
path_to_config (pathlib.Path): Path to configuration.
"""
self.path_to_config = path_to_config
self._validate_config_content()
self.config = self._extract_config_content()

self._seed_urls = self.config.seed_urls
self._num_articles = self.config.total_articles
self._headers = self.config.headers
self._encoding = self.config.encoding
self._timeout = self.config.timeout
self._should_verify_certificate = self.config.should_verify_certificate
self._headless_mode = self.config.headless_mode

def _extract_config_content(self) -> ConfigDTO:
"""
@@ -26,11 +91,49 @@ def _extract_config_content(self) -> ConfigDTO:
Returns:
ConfigDTO: Config values
"""
with open(self.path_to_config, "r", encoding="utf-8") as f:
conf = json.load(f)
return ConfigDTO(
seed_urls=conf["seed_urls"],
total_articles_to_find_and_parse=conf["total_articles_to_find_and_parse"],
headers=conf["headers"],
encoding=conf["encoding"],
timeout=conf["timeout"],
should_verify_certificate=conf["should_verify_certificate"],
headless_mode=conf["headless_mode"]
)

def _validate_config_content(self) -> None:
"""
Ensure configuration parameters are not corrupt.
"""
with open(self.path_to_config, 'r', encoding='utf-8') as f:
conf = json.load(f)

if not (isinstance(conf['seed_urls'], list)
and all(re.match(r"https?://(www\.)?", seed_url) for seed_url in conf['seed_urls'])):
raise IncorrectSeedURLError

num = conf['total_articles_to_find_and_parse']

if not isinstance(num, int) or (num <= 0):
raise IncorrectNumberOfArticlesError

if num < 1 or num > 150:
raise NumberOfArticlesOutOfRangeError

if not isinstance(conf['headers'], dict):
raise IncorrectHeadersError

if not isinstance(conf['encoding'], str):
raise IncorrectEncodingError

if not (isinstance(conf['timeout'], int) and (0 < conf['timeout'] < 60)):
raise IncorrectTimeoutError

if not isinstance(conf['should_verify_certificate'], bool) \
or not isinstance(conf['headless_mode'], bool):
raise IncorrectVerifyError

def get_seed_urls(self) -> list[str]:
"""
@@ -39,6 +142,7 @@ def get_seed_urls(self) -> list[str]:
Returns:
list[str]: Seed urls
"""
return self._seed_urls

def get_num_articles(self) -> int:
"""
@@ -47,6 +151,7 @@ def get_num_articles(self) -> int:
Returns:
int: Total number of articles to scrape
"""
return self._num_articles

def get_headers(self) -> dict[str, str]:
"""
@@ -55,6 +160,7 @@ def get_headers(self) -> dict[str, str]:
Returns:
dict[str, str]: Headers
"""
return self._headers

def get_encoding(self) -> str:
"""
@@ -63,6 +169,7 @@ def get_encoding(self) -> str:
Returns:
str: Encoding
"""
return self._encoding

def get_timeout(self) -> int:
"""
@@ -71,6 +178,7 @@ def get_timeout(self) -> int:
Returns:
int: Number of seconds to wait for response
"""
return self._timeout

def get_verify_certificate(self) -> bool:
"""
@@ -79,6 +187,7 @@ def get_verify_certificate(self) -> bool:
Returns:
bool: Whether to verify certificate or not
"""
return self._should_verify_certificate

def get_headless_mode(self) -> bool:
"""
@@ -87,6 +196,7 @@ def get_headless_mode(self) -> bool:
Returns:
bool: Whether to use headless mode or not
"""
return self._headless_mode


def make_request(url: str, config: Config) -> requests.models.Response:
@@ -100,6 +210,8 @@ def make_request(url: str, config: Config) -> requests.models.Response:
Returns:
requests.models.Response: A response from a request
"""
return requests.get(url=url, timeout=config.get_timeout(),
headers=config.get_headers(), verify=config.get_verify_certificate())


class Crawler:
@@ -116,6 +228,9 @@ def __init__(self, config: Config) -> None:
Args:
config (Config): Configuration
"""
self.config = config
self.urls = []
self.url_pattern = self.config.get_seed_urls()[0].split('/format')[0]

def _extract_url(self, article_bs: BeautifulSoup) -> str:
"""
@@ -127,11 +242,31 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str:
Returns:
str: Url from HTML
"""
url = ""
links = article_bs.find_all('a', class_="qZbm2")
for link in links:
url = link.get('href')
url = self.url_pattern + url[len("/text")::]
if url not in self.urls:
break

return url

def find_articles(self) -> None:
"""
Find articles.
"""
seed_urls = self.get_search_urls()

while len(self.urls) < self.config.get_num_articles():
for seed_url in seed_urls:
response = make_request(seed_url, self.config)
if not response.ok:
continue

article_bs = BeautifulSoup(response.text, "html.parser")
extracted = self._extract_url(article_bs)
self.urls.append(extracted)

def get_search_urls(self) -> list:
"""
@@ -140,6 +275,7 @@ def get_search_urls(self) -> list:
Returns:
list: seed_urls param
"""
return self.config.get_seed_urls()


# 10
@@ -160,6 +296,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None:
article_id (int): Article id
config (Config): Configuration
"""
self.full_url = full_url
self.article_id = article_id
self.config = config
self.article = Article(self.full_url, self.article_id)

def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
"""
@@ -168,6 +308,13 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
raw_text = ''
text_blocks = article_soup.find_all('div', class_='uiArticleBlockText_i9h2o')
for text_block in text_blocks:
if text_block.string:
raw_text += f'\n{text_block.string}'

self.article.text = raw_text

def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
"""
@@ -176,6 +323,14 @@ def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
headline = article_soup.find("h1", class_="title_ip27z")
self.article.title = headline.text
author = article_soup.find("div", class_="name_GQmWc")
if not author:
self.article.author = ["NOT FOUND"]

else:
self.article.author = [author.text]

def unify_date_format(self, date_str: str) -> datetime.datetime:
"""
@@ -195,6 +350,13 @@ def parse(self) -> Union[Article, bool, list]:
Returns:
Union[Article, bool, list]: Article instance
"""
response = make_request(self.full_url, self.config)
if response.ok:
article_bs = BeautifulSoup(response.text, "html.parser")
self._fill_article_with_text(article_bs)
self._fill_article_with_meta_information(article_bs)

return self.article


def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
@@ -204,13 +366,31 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
Args:
base_path (Union[pathlib.Path, str]): Path where articles are stored
"""
base_path = pathlib.Path(base_path)
if base_path.exists():
shutil.rmtree(base_path)
base_path.mkdir(parents=True)


def main() -> None:
"""
Entrypoint for scrapper module.
"""
configuration = Config(path_to_config=constants.CRAWLER_CONFIG_PATH)

prepare_environment(base_path=constants.ASSETS_PATH)

crawler = Crawler(config=configuration)
crawler.find_articles()
urls = crawler.urls

for index, url in enumerate(urls):
parser = HTMLParser(full_url=url, article_id=index + 1, config=configuration)
article = parser.parse()
if isinstance(article, Article):
to_raw(article)
to_meta(article)
print("done!")


if __name__ == "__main__":
main()
main()
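
unify_date_format is shown above only as an unchanged stub; this commit does not implement it. A minimal sketch of one possible implementation, assuming the article page exposes an ISO 8601 timestamp such as "2024-06-02T20:54:22" (an assumption about the site's markup, not something taken from this commit):

import datetime

def unify_date_format(date_str: str) -> datetime.datetime:
    # Hypothetical sketch: parse an ISO 8601 timestamp into a datetime object.
    return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S")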
24 changes: 19 additions & 5 deletions lab_5_scrapper/scrapper_config.json
@@ -1,9 +1,23 @@
{
"seed_urls": [],
"headers": {},
"total_articles_to_find_and_parse": 0,
"encoding": "",
"timeout": 0,
"seed_urls": ["https://74.ru/text/format/mention/",
"https://74.ru/text/format/mention/?page=2",
"https://74.ru/text/format/mention/?page=3",
"https://74.ru/text/format/mention/?page=4",
"https://74.ru/text/format/mention/?page=5",
"https://74.ru/text/format/mention/?page=6",
"https://74.ru/text/format/mention/?page=7",
"https://74.ru/text/format/mention/?page=8",
"https://74.ru/text/format/mention/?page=9",
"https://74.ru/text/format/mention/?page=10"],
"headers": {
"cookie" : "stg_returning_visitor=Mon%2C%2008%20Apr%202024%2009:30:34%20GMT; stg_traffic_source_priority=1; _ga_KLCW8G3CY6=GS1.1.1717361661.1.0.1717361661.0.0.0; _ga=GA1.1.562201468.1717361661; stg_last_interaction=Sun%2C%2002%20Jun%202024%2020:54:22%20GMT",
"accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language" : "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
"user-agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
},
"total_articles_to_find_and_parse": 100,
"encoding": "utf-8",
"timeout": 15,
"should_verify_certificate": true,
"headless_mode": true
}
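
A quick way to sanity-check this configuration against the Config class added above (a hypothetical usage sketch, assuming the repository root is on PYTHONPATH so that lab_5_scrapper.scrapper and the course's core_utils package resolve):

import pathlib

from lab_5_scrapper.scrapper import Config

config = Config(pathlib.Path("lab_5_scrapper/scrapper_config.json"))
print(config.get_num_articles())  # expected: 100
print(config.get_timeout())       # expected: 15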
4 changes: 2 additions & 2 deletions lab_5_scrapper/settings.json
@@ -1,3 +1,3 @@
{
"target_score": 4
}
"target_score": 6
}
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@

requests==2.31.0
beautifulsoup4==4.12.2
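
The two pinned dependencies above can be installed in the usual way:

pip install -r requirements.txt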
