Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataset Collector #1, Zelekson Daniil - 19FPL1 #43

Open
wants to merge 54 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
9803720
Starting
daniilzelekson Mar 9, 2021
bd523b5
small rework
daniilzelekson Mar 9, 2021
ca93002
Merge remote-tracking branch 'upstream/main' into HEAD
dmitry-uraev Mar 12, 2021
684b151
small rework
daniilzelekson Mar 12, 2021
d575815
small rework
daniilzelekson Mar 18, 2021
c277d90
small rework
daniilzelekson Mar 18, 2021
692c72d
small rework
daniilzelekson Mar 18, 2021
23699ec
small rework
daniilzelekson Mar 18, 2021
b47f292
small rework
daniilzelekson Mar 18, 2021
4788560
small rework
daniilzelekson Mar 18, 2021
598df03
small rework
daniilzelekson Mar 18, 2021
ed2ace3
small rework
daniilzelekson Mar 18, 2021
69bb24b
small rework
daniilzelekson Mar 18, 2021
2f8fdc4
small rework
daniilzelekson Mar 18, 2021
dc51845
small rework
daniilzelekson Mar 18, 2021
c1d3065
small rework
daniilzelekson Mar 23, 2021
170aeb6
small rework
daniilzelekson Mar 23, 2021
1955427
small rework
daniilzelekson Mar 23, 2021
d4fe999
small rework
daniilzelekson Mar 23, 2021
451c483
small rework
daniilzelekson Mar 23, 2021
b240bb5
small rework
daniilzelekson Mar 23, 2021
4c2bac2
small rework
daniilzelekson Mar 23, 2021
180e614
small rework
daniilzelekson Mar 23, 2021
5bb34c0
small rework
daniilzelekson Mar 26, 2021
d2965c3
small rework
daniilzelekson Mar 26, 2021
778a743
small rework
daniilzelekson Mar 26, 2021
261f93b
small rework
daniilzelekson Mar 26, 2021
3de6397
small rework
daniilzelekson Mar 26, 2021
1fa2747
small rework
daniilzelekson Mar 26, 2021
62619c1
small rework
daniilzelekson Mar 26, 2021
c9bb755
small rework
daniilzelekson Mar 26, 2021
0dc4249
small rework :)
daniilzelekson Mar 26, 2021
e0efdec
small rework
daniilzelekson Mar 26, 2021
3441fda
small rework
daniilzelekson Mar 26, 2021
8615b19
Merge remote-tracking branch 'upstream/main' into HEAD
dmitry-uraev Mar 26, 2021
746bb51
Merge branch 'main' of https://github.com/daniilzelekson/2020-2-level…
daniilzelekson Mar 26, 2021
853879e
very little pf work
daniilzelekson Mar 28, 2021
7b37ca1
very little pf work
daniilzelekson Mar 28, 2021
68265db
very little pf work
daniilzelekson Mar 28, 2021
d6aa3e5
very little pf work
daniilzelekson Mar 28, 2021
78665d1
small rework
daniilzelekson Apr 3, 2021
12c290b
small rework
daniilzelekson Apr 3, 2021
3b71c8b
small rework
daniilzelekson Apr 3, 2021
0814835
small rework
daniilzelekson Apr 4, 2021
cbd7e0c
small rework
daniilzelekson Apr 4, 2021
027067c
small rework
daniilzelekson Apr 4, 2021
92fb26d
small rework
daniilzelekson Apr 4, 2021
b467bd8
small rework
daniilzelekson Apr 4, 2021
8363d33
small rework
daniilzelekson Apr 4, 2021
c49e415
small rework
daniilzelekson Apr 4, 2021
178631a
small rework
daniilzelekson Apr 4, 2021
43b58f9
small rework
daniilzelekson Apr 4, 2021
0a9fbac
small rework
daniilzelekson Apr 4, 2021
076ef72
small rework
daniilzelekson Apr 4, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions article.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ class Article:
Article class implementation.
Stores article metadata and knows how to work with articles
"""
def __init__(self, url, article_id):
def __init__(self, url, article_id, save_path=''):
self.url = url
self.article_id = article_id

self.save_path = save_path
self.title = ''
self.date = None
self.author = ''
Expand All @@ -34,12 +34,11 @@ def save_raw(self):
"""
Saves raw text and article meta data
"""
article_meta_name = "{}_meta.json".format(self.article_id)

with open(self._get_raw_text_path(), 'w', encoding='utf-8') as file:
file.write(self.text)

with open(os.path.join(ASSETS_PATH, article_meta_name), "w", encoding='utf-8') as file:
with open(self._get_meta_path(), "w", encoding='utf-8') as file:
json.dump(self._get_meta(),
file,
sort_keys=False,
Expand Down Expand Up @@ -106,11 +105,26 @@ def _get_raw_text_path(self):
Returns path for requested raw article
"""
article_txt_name = "{}_raw.txt".format(self.article_id)
return os.path.join(ASSETS_PATH, article_txt_name)
if self.save_path == '':
return os.path.join(ASSETS_PATH, article_txt_name)
else:
return os.path.join(self.save_path, article_txt_name)

def _get_meta_path(self):
    """
    Returns path for the meta data file (JSON) of the requested article
    """
    article_meta_name = "{}_meta.json".format(self.article_id)
    # An unset save_path ('' by default) falls back to the shared assets folder
    target_dir = self.save_path or ASSETS_PATH
    return os.path.join(target_dir, article_meta_name)
def _get_processed_text_path(self):
"""
Returns path for requested processed article
"""
article_txt_name = "{}_processed.txt".format(self.article_id)
return os.path.join(ASSETS_PATH, article_txt_name)
if self.save_path == '':
return os.path.join(ASSETS_PATH, article_txt_name)
else:
return os.path.join(self.save_path, article_txt_name)
1 change: 1 addition & 0 deletions constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles')
CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json')
RECURSIVE = False
6 changes: 3 additions & 3 deletions crawler_config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"base_urls": [],
"total_articles_to_find_and_parse": 0,
"max_number_articles_to_get_from_one_seed": 0
"base_urls": ["https://znamia29.ru/news/17197/"],
"total_articles_to_find_and_parse": 5,
"max_number_articles_to_get_from_one_seed": 5
}
86 changes: 86 additions & 0 deletions m.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
argon2-cffi==20.1.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

????

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from pip freeze

astroid==2.4.2
async-generator==1.10
attrs==20.3.0
backcall==0.2.0
beautifulsoup4==4.9.3
bleach==3.3.0
certifi==2020.4.5.1
cffi==1.14.5
chardet==3.0.4
colorama==0.4.3
DAWG-Python==0.7.2
decorator==4.4.2
defusedxml==0.7.1
docopt==0.6.2
entrypoints==0.3
et-xmlfile==1.0.1
idna==2.9
importlib-metadata==3.7.2
ipykernel==5.5.0
ipython==7.21.0
ipython-genutils==0.2.0
ipywidgets==7.6.3
jdcal==1.4.1
jedi==0.18.0
Jinja2==2.11.3
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.11
jupyter-console==6.2.0
jupyter-core==4.7.1
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.0
lazy-object-proxy==1.4.3
lml==0.0.9
MarkupSafe==1.1.1
mistune==0.8.4
nbclient==0.5.3
nbconvert==6.0.7
nbformat==5.1.2
nest-asyncio==1.5.1
notebook==6.2.0
numpy==1.20.1
openpyxl==3.0.3
packaging==20.9
pandocfilters==1.4.3
parso==0.8.1
pickleshare==0.7.5
prometheus-client==0.9.0
prompt-toolkit==3.0.16
pycparser==2.20
pyexcel==0.5.15
pyexcel-io==0.5.20
pyexcel-xls==0.5.8
pyexcel-xlsx==0.5.8
Pygments==2.8.1
pymorphy2==0.9.1
pymorphy2-dicts-ru==2.4.417127.4579844
pymorphy2-dicts-uk==2.4.1.1.1460299261
pyparsing==2.4.7
pyrsistent==0.17.3
python-dateutil==2.8.1
pywin32==300
pywinpty==0.5.7
pyzmq==22.0.3
qtconsole==5.0.2
QtPy==1.9.0
requests==2.23.0
Send2Trash==1.5.0
six==1.15.0
soupsieve==2.2
terminado==0.9.2
testpath==0.4.4
texttable==1.6.2
tornado==6.1
traitlets==5.0.5
typed-ast==1.4.1
typing-extensions==3.7.4.3
urllib3==1.25.8
wcwidth==0.2.5
webencodings==0.5.1
widgetsnbextension==3.5.1
wrapt==1.12.1
xlrd==1.2.0
xlwt==1.3.0
zipp==3.4.1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You used all these? Will you share with me on Monday?

26 changes: 26 additions & 0 deletions mydate_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
def get_month(m):
    """
    Return the two-digit number of the Russian month mentioned in ``m``.

    The month is recognised by a characteristic substring of its name, so
    both the nominative ("Март") and the genitive ("марта") forms match.
    Matching is case-insensitive.

    :param m: text containing a Russian month name
    :return: zero-padded month number as a string ('01'..'12'),
             or '' when no month is recognised
    """
    # Order matters: 'арт' (March) must be tested before 'ма' (May),
    # because "март" also contains 'ма'.
    month_stems = (
        ('нвар', '01'),
        ('рал', '02'),
        ('арт', '03'),
        ('прел', '04'),
        ('ма', '05'),
        ('июн', '06'),
        ('июл', '07'),
        ('авгус', '08'),
        ('сент', '09'),
        ('окт', '10'),
        ('нояб', '11'),
        ('декаб', '12'),
    )
    # Lower-case once so that "Мая", "мая" and "МАЯ" are all recognised
    lowered = m.lower()
    for stem, number in month_stems:
        if stem in lowered:
            return number
    return ''
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
beautifulsoup4
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to pin exact versions here for consistency (e.g. beautifulsoup4==4.9.3, as in your pip freeze output).

lxml
requests
Loading