Laboratory Work 4 Garanina Marina #167

Open · wants to merge 17 commits into master
4 changes: 4 additions & 0 deletions data.txt
@@ -0,0 +1,4 @@
Kamal and Grant arrived. “Hi Kamal!” said Tara.
“Are you going to the Halloween disco tomorrow?”
“Yes. Hi Amy,” Kamal said, smiling.
“Do you want to come and see our paintings after school?” “I’m coming too!” Tara insisted.
30 changes: 26 additions & 4 deletions lab_1/main.py
@@ -1,8 +1,30 @@
"""
Labour work #1
Count frequencies dictionary by the given arbitrary text
"""
def calculate_frequences(text):
    if not isinstance(text, str):
        return {}
    # Keep only letters, spaces and newlines, lower-cased
    work_text = ''
    for el in text:
        if el.isalpha() or el == ' ' or el == '\n':
            work_text += el.lower()
    words = work_text.split()
    dictionary = {}
    for key in words:
        if key in dictionary:
            dictionary[key] += 1
        else:
            dictionary[key] = 1
    return dictionary

def filter_stop_words(dictionary, stopwords):
    if dictionary and stopwords is not None:
        filtered_dictionary = dictionary.copy()
        # Drop non-string keys, then remove every stop word
        for key in dictionary:
            if not isinstance(key, str):
                del filtered_dictionary[key]
        for word in stopwords:
            if word in filtered_dictionary:
                del filtered_dictionary[word]
        return filtered_dictionary
    return {}

def calculate_frequences(text: str) -> dict:
"""
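
A quick way to exercise the two new lab_1 helpers. This is a minimal sketch; the sample sentence, the stop-word list and the import path are illustrative assumptions, not part of the submission:

from lab_1.main import calculate_frequences, filter_stop_words

frequencies = calculate_frequences('Hi Kamal! Hi Amy.')
# -> {'hi': 2, 'kamal': 1, 'amy': 1}
print(filter_stop_words(frequencies, ['hi']))
# -> {'kamal': 1, 'amy': 1}
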
5 changes: 0 additions & 5 deletions lab_3/main.py
@@ -1,8 +1,3 @@
"""
Labour work #3
Building an own N-gram model
"""

import math

REFERENCE_TEXT = ''
76 changes: 70 additions & 6 deletions lab_4/main.py
@@ -5,24 +5,88 @@


def clean_tokenize_corpus(texts: list) -> list:
    pass
    corpus = []
    if not texts or not isinstance(texts, list):
        return corpus
    for text in texts:
        if not isinstance(text, str):
            continue
        clean_text = ''
        # Strip line breaks and markup, then collapse repeated spaces
        text = text.replace('\n', ' ')
        text = text.replace('<br />', ' ')
        while '  ' in text:
            text = text.replace('  ', ' ')
        for symbol in text:
            if symbol.isalpha() or symbol == ' ':
                clean_text += symbol.lower()
        clean_text = clean_text.split()
        corpus.append(clean_text)
    return corpus
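
As a rough illustration of what the tokenizer is expected to produce (the input list below is made up for the example):

sample_texts = ['Kamal and Grant arrived.<br />Hi Kamal!', 42]
print(clean_tokenize_corpus(sample_texts))
# Non-string entries are skipped; markup and punctuation are dropped:
# -> [['kamal', 'and', 'grant', 'arrived', 'hi', 'kamal']]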


class TfIdfCalculator:
    def __init__(self, corpus):
        pass
        self.corpus = corpus
        self.tf_values = []
        self.idf_values = {}
        self.tf_idf_values = []

    def calculate_tf(self):
        pass
        if not isinstance(self.corpus, list):
            return []
        for doc in self.corpus:
            if not isinstance(doc, list):
                continue
            doc_dict = {}
            cleaned_doc = []
            # Keep only string tokens, then store each token's relative frequency
            for elem in doc:
                if isinstance(elem, str):
                    cleaned_doc.append(elem)
            for word in cleaned_doc:
                if word not in doc_dict:
                    doc_dict[word] = cleaned_doc.count(word) / len(cleaned_doc)
            self.tf_values.append(doc_dict)

    def calculate_idf(self):
        pass
        if not isinstance(self.corpus, list):
            return {}
        all_words = [el for doc in self.corpus if isinstance(doc, list)
                     for el in doc if isinstance(el, str)]
        words = list(set(all_words))
        cleaned_corpus = []
        for doc in self.corpus:
            if isinstance(doc, list):
                cleaned_corpus.append(doc)
        # idf(word) = log(number of documents / number of documents containing the word)
        for word in words:
            frequency = [1 for doc in cleaned_corpus if word in doc]
            self.idf_values[word] = math.log(len(cleaned_corpus) / sum(frequency))

    def calculate(self):
        pass
        if not isinstance(self.tf_values, list):
            return []
        # tf-idf(word, doc) = tf(word, doc) * idf(word)
        for doc in self.tf_values:
            new_dict = {}
            for key in doc:
                if key in self.idf_values:
                    new_dict[key] = doc[key] * self.idf_values[key]
                else:
                    return []
            self.tf_idf_values.append(new_dict)

    def report_on(self, word, document_index):
        pass
        if self.tf_idf_values is None or document_index > len(self.tf_idf_values) - 1 or \
                word not in self.tf_idf_values[document_index]:
            return ()
        word_info = [self.tf_idf_values[document_index][word]]
        # Rank the document's words by tf-idf score and find the queried word's position
        the_most_important = list(self.tf_idf_values[document_index].items())
        the_most_important.sort(key=lambda x: x[1], reverse=True)
        ind = -1
        for position, elem in enumerate(the_most_important):
            if elem[0] == word:
                ind = position
                break
        if ind != -1:
            word_info.append(ind)
        return tuple(word_info)


if __name__ == '__main__':
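
For reference, a minimal end-to-end sketch of how the new TfIdfCalculator is meant to be driven. The two-document corpus, the import path and the printed numbers are illustrative assumptions, not output from the lab's real data:

from lab_4.main import clean_tokenize_corpus, TfIdfCalculator

corpus = clean_tokenize_corpus(['My kitten is cute', 'My dog is nice'])
calculator = TfIdfCalculator(corpus)
calculator.calculate_tf()   # e.g. tf('kitten', doc 0) = 1 / 4 = 0.25
calculator.calculate_idf()  # e.g. idf('kitten') = log(2 / 1) ~ 0.693
calculator.calculate()      # tf-idf('kitten', doc 0) ~ 0.25 * 0.693 ~ 0.173
print(calculator.report_on('kitten', 0))  # -> (tf-idf score, rank inside document 0)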