Laboratory Work 4 Garanina Marina #167

Open · wants to merge 17 commits into master
4 changes: 4 additions & 0 deletions data.txt
@@ -0,0 +1,4 @@
Kamal and Grant arrived. “Hi Kamal!” said Tara.
“Are you going to the Halloween disco tomorrow?”
“Yes. Hi Amy,” Kamal said, smiling.
“Do you want to come and see our paintings after school?” “I’m coming too!” Tara insisted.
30 changes: 26 additions & 4 deletions lab_1/main.py
@@ -1,8 +1,30 @@
"""
Labour work #1
Count frequencies dictionary by the given arbitrary text
"""
def calculate_frequences(text):
    if not isinstance(text, str):
        return {}
    # Keep only letters, spaces and newlines, lower-cased
    work_text = ''
    for el in text:
        if el.isalpha() or el == ' ' or el == '\n':
            work_text += el.lower()
    words = work_text.split()
    dictionary = {}
    for key in words:
        if key in dictionary:
            dictionary[key] += 1
        else:
            dictionary[key] = 1
    return dictionary

def filter_stop_words(dictionary, stopwords):
    if dictionary and stopwords is not None:
        filtered_dictionary = dictionary.copy()
        # Drop non-string keys, then remove every stop word
        for key in dictionary:
            if not isinstance(key, str):
                del filtered_dictionary[key]
        for word in stopwords:
            if word in filtered_dictionary:
                del filtered_dictionary[word]
        return filtered_dictionary
    return {}

def calculate_frequences(text: str) -> dict:
"""
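
A quick way to exercise the two new lab_1 helpers. This is a minimal sketch; the sample sentence, the stop-word list and the import path are illustrative assumptions, not part of the submission:

from lab_1.main import calculate_frequences, filter_stop_words

frequencies = calculate_frequences('Hi Kamal! Hi Amy.')
# -> {'hi': 2, 'kamal': 1, 'amy': 1}
print(filter_stop_words(frequencies, ['hi']))
# -> {'kamal': 1, 'amy': 1}
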
5 changes: 0 additions & 5 deletions lab_3/main.py
@@ -1,8 +1,3 @@
"""
Labour work #3
Building an own N-gram model
"""

import math

REFERENCE_TEXT = ''
76 changes: 70 additions & 6 deletions lab_4/main.py
@@ -5,24 +5,88 @@


def clean_tokenize_corpus(texts: list) -> list:
    pass
    corpus = []
    if not texts or not isinstance(texts, list):
        return corpus
    for text in texts:
        if not isinstance(text, str):
            continue
        clean_text = ''
        # Strip line breaks and markup, then collapse repeated spaces
        text = text.replace('\n', ' ')
        text = text.replace('<br />', ' ')
        while '  ' in text:
            text = text.replace('  ', ' ')
        for symbol in text:
            if symbol.isalpha() or symbol == ' ':
                clean_text += symbol.lower()
        clean_text = clean_text.split()
        corpus.append(clean_text)
    return corpus
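
As a rough illustration of what the tokenizer is expected to produce (the input list below is made up for the example):

sample_texts = ['Kamal and Grant arrived.<br />Hi Kamal!', 42]
print(clean_tokenize_corpus(sample_texts))
# Non-string entries are skipped; markup and punctuation are dropped:
# -> [['kamal', 'and', 'grant', 'arrived', 'hi', 'kamal']]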


class TfIdfCalculator:
    def __init__(self, corpus):
        pass
        self.corpus = corpus
        self.tf_values = []
        self.idf_values = {}
        self.tf_idf_values = []

    def calculate_tf(self):
        pass
        if not isinstance(self.corpus, list):
            return []
        for doc in self.corpus:
            if not isinstance(doc, list):
                continue
            doc_dict = {}
            cleaned_doc = []
            # Keep only string tokens, then store each token's relative frequency
            for elem in doc:
                if isinstance(elem, str):
                    cleaned_doc.append(elem)
            for word in cleaned_doc:
                if word not in doc_dict:
                    doc_dict[word] = cleaned_doc.count(word) / len(cleaned_doc)
            self.tf_values.append(doc_dict)

    def calculate_idf(self):
        pass
        if not isinstance(self.corpus, list):
            return {}
        all_words = [el for doc in self.corpus if isinstance(doc, list)
                     for el in doc if isinstance(el, str)]
        words = list(set(all_words))
        cleaned_corpus = []
        for doc in self.corpus:
            if isinstance(doc, list):
                cleaned_corpus.append(doc)
        # idf(word) = log(number of documents / number of documents containing the word)
        for word in words:
            frequency = [1 for doc in cleaned_corpus if word in doc]
            self.idf_values[word] = math.log(len(cleaned_corpus) / sum(frequency))

    def calculate(self):
        pass
        if not isinstance(self.tf_values, list):
            return []
        # tf-idf(word, doc) = tf(word, doc) * idf(word)
        for doc in self.tf_values:
            new_dict = {}
            for key in doc:
                if key in self.idf_values:
                    new_dict[key] = doc[key] * self.idf_values[key]
                else:
                    return []
            self.tf_idf_values.append(new_dict)

    def report_on(self, word, document_index):
        pass
        if self.tf_idf_values is None or document_index > len(self.tf_idf_values) - 1 or \
                word not in self.tf_idf_values[document_index]:
            return ()
        word_info = [self.tf_idf_values[document_index][word]]
        # Rank the document's words by tf-idf score and find the queried word's position
        the_most_important = list(self.tf_idf_values[document_index].items())
        the_most_important.sort(key=lambda x: x[1], reverse=True)
        ind = -1
        for position, elem in enumerate(the_most_important):
            if elem[0] == word:
                ind = position
                break
        if ind != -1:
            word_info.append(ind)
        return tuple(word_info)


if __name__ == '__main__':
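
For reference, a minimal end-to-end sketch of how the new TfIdfCalculator is meant to be driven. The two-document corpus, the import path and the printed numbers are illustrative assumptions, not output from the lab's real data:

from lab_4.main import clean_tokenize_corpus, TfIdfCalculator

corpus = clean_tokenize_corpus(['My kitten is cute', 'My dog is nice'])
calculator = TfIdfCalculator(corpus)
calculator.calculate_tf()   # e.g. tf('kitten', doc 0) = 1 / 4 = 0.25
calculator.calculate_idf()  # e.g. idf('kitten') = log(2 / 1) ~ 0.693
calculator.calculate()      # tf-idf('kitten', doc 0) ~ 0.25 * 0.693 ~ 0.173
print(calculator.report_on('kitten', 0))  # -> (tf-idf score, rank inside document 0)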