From 26f30fd29451fae2019bfa55b1b63b200f74508d Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Mon, 16 Sep 2019 13:12:48 +0300
Subject: [PATCH 01/14] Reword lab_1 module docstring and add trailing newline

---
 lab_1/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lab_1/main.py b/lab_1/main.py
index 498bb1a..f034fcd 100644
--- a/lab_1/main.py
+++ b/lab_1/main.py
@@ -1,6 +1,6 @@
 """
 Labour work #1
-Count frequencies dictionary by the given arbitrary text
+Count a frequencies dictionary by the given arbitrary text
 """
 
 
@@ -20,4 +20,4 @@ def get_top_n(frequencies: dict, top_n: int) -> tuple:
     """
     Takes first N popular words
     """
-    pass
\ No newline at end of file
+    pass

From 2fe60f1faa76bc0bfdb476e2ed39961f1a1bf67f Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Mon, 16 Sep 2019 13:25:53 +0300
Subject: [PATCH 02/14] Reword calculate_frequences docstring in lab_1

---
 lab_1/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_1/main.py b/lab_1/main.py
index f034fcd..81b5d50 100644
--- a/lab_1/main.py
+++ b/lab_1/main.py
@@ -6,7 +6,7 @@
 
 def calculate_frequences(text: str) -> dict:
     """
-    Calculates number of times each word appears in the text
+    Calculates number of times each word appears in this text
     """
     pass
 

From d3de4b6bc1e51ff182a52553e67188ba4c24ef47 Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Mon, 7 Oct 2019 01:40:02 +0300
Subject: [PATCH 03/14] Update main.py

---
 lab_1/main.py | 58 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/lab_1/main.py b/lab_1/main.py
index 81b5d50..01d35ce 100644
--- a/lab_1/main.py
+++ b/lab_1/main.py
@@ -1,23 +1,43 @@
-"""
-Labour work #1
-Count a frequencies dictionary by the given arbitrary text
-"""
+f = open('data.txt', 'r')
+text = f.read().strip()
+text = "".join(word for word in text if word not in ('!','.',':', ',', 'â€ś', 'â€ť', '"', '?', '?!'))
 
+stopwords = ('the', 'a', 'an', 'or', 'and', 'to', 'are', 'is',)
+top_n = 6
 
-def calculate_frequences(text: str) -> dict:
-    """
-    Calculates number of times each word appears in this text
-    """
-    pass
+def calculate_frequences(text) :
+    dictionary = {}
+    words = text.split()
+    for key in words:
+        key = key.lower()
+        if key in dictionary:
+            value = dictionary[key]
+            dictionary[key]=value+1
+        else:
+            dictionary[key]=1
+    return dictionary
+dictionary = calculate_frequences(text)
+print (dictionary)
 
-def filter_stop_words(frequencies: dict, stop_words: tuple) -> dict:
-    """
-    Removes all stop words from the given frequencies dictionary
-    """
-    pass
+def filter_stop_words(dictionary, stopwords):
+    filtered_dictionary = dictionary
+    for key in list(filtered_dictionary.keys()):
+        if key in stopwords:
+            del filtered_dictionary[key]
+    return filtered_dictionary
+filtered_dictionary = filter_stop_words(dictionary, stopwords)
+print (filtered_dictionary)
 
-def get_top_n(frequencies: dict, top_n: int) -> tuple:
-    """
-    Takes first N popular words
-    """
-    pass
+def get_top_n (dictionary, top_n):
+    list_dictionary = list(dictionary.items())
+    list_dictionary.sort(key=lambda i: i[1], reverse=True)
+    index = 0
+    toped_dictionary = list()
+    for el in list_dictionary:
+        if index < top_n:
+            toped_dictionary.append(list_dictionary[index])
+            index += 1
+    toped_dictionary = tuple(list_dictionary[:top_n])
+    return toped_dictionary
+toped_dictionary = get_top_n (dictionary, top_n)
+print (toped_dictionary)

From b470e11becdca5303aea4eaec60dd9b8df382357 Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Mon, 7 Oct 2019 01:43:00 +0300
Subject: [PATCH 04/14] Add files via upload

---
 data.txt | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 data.txt

diff --git a/data.txt b/data.txt
new file mode 100644
index 0000000..2b354d8
--- /dev/null
+++ b/data.txt
@@ -0,0 +1,4 @@
+Kamal and Grant arrived. “Hi Kamal!” said Tara. 
+“Are you going to the Halloween disco tomorrow?”
+“Yes. Hi Amy,” Kamal said, smiling. 
+“Do you want to come and see our paintings after school?” “I’m coming too!” Tara insisted.
\ No newline at end of file

From 1642d2e8b4d13fa71a61c837f136124dea95cc97 Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Mon, 7 Oct 2019 01:43:47 +0300
Subject: [PATCH 05/14] Add files via upload

---
 lab_1/data.txt | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 lab_1/data.txt

diff --git a/lab_1/data.txt b/lab_1/data.txt
new file mode 100644
index 0000000..2b354d8
--- /dev/null
+++ b/lab_1/data.txt
@@ -0,0 +1,4 @@
+Kamal and Grant arrived. “Hi Kamal!” said Tara. 
+“Are you going to the Halloween disco tomorrow?”
+“Yes. Hi Amy,” Kamal said, smiling. 
+“Do you want to come and see our paintings after school?” “I’m coming too!” Tara insisted.
\ No newline at end of file

From c964c42542a249f77ff8f5df3b13c927499d93dd Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Mon, 7 Oct 2019 13:27:54 +0300
Subject: [PATCH 06/14] added new function

---
 lab_1/main.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/lab_1/main.py b/lab_1/main.py
index 01d35ce..34cc5aa 100644
--- a/lab_1/main.py
+++ b/lab_1/main.py
@@ -4,6 +4,7 @@
 
 stopwords = ('the', 'a', 'an', 'or', 'and', 'to', 'are', 'is',)
 top_n = 6
+path_to_file = 'report.txt'
 
 def calculate_frequences(text) :
     dictionary = {}
@@ -39,5 +40,11 @@ def get_top_n (dictionary, top_n):
             index += 1
     toped_dictionary = tuple(list_dictionary[:top_n])
     return toped_dictionary
-toped_dictionary = get_top_n (dictionary, top_n)
-print (toped_dictionary)
+content = get_top_n (dictionary, top_n)
+print (content)
+
+def write_to_file (path_to_file, content):
+    with open('report.txt', 'w') as file:
+        print(content, file=file, sep='\n')
+
+write_to_file (path_to_file, content)

From 8716be5bfbefe02a4b3e205c6c60db2d55e563c4 Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Mon, 7 Oct 2019 13:28:43 +0300
Subject: [PATCH 07/14] Add files via upload

---
 lab_1/report.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 lab_1/report.txt

diff --git a/lab_1/report.txt b/lab_1/report.txt
new file mode 100644
index 0000000..fbb681a
--- /dev/null
+++ b/lab_1/report.txt
@@ -0,0 +1 @@
+(('kamal', 3), ('hi', 2), ('said', 2), ('tara', 2), ('you', 2), ('grant', 1))

From d242e65dbcd9d30a7654d051532f15907093eaf2 Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Mon, 7 Oct 2019 14:27:52 +0300
Subject: [PATCH 08/14] Remove module-level script code from lab_1/main.py

---
 lab_1/main.py | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/lab_1/main.py b/lab_1/main.py
index 34cc5aa..a939bd7 100644
--- a/lab_1/main.py
+++ b/lab_1/main.py
@@ -1,13 +1,6 @@
-f = open('data.txt', 'r')
-text = f.read().strip()
-text = "".join(word for word in text if word not in ('!','.',':', ',', 'â€ś', 'â€ť', '"', '?', '?!'))
-
-stopwords = ('the', 'a', 'an', 'or', 'and', 'to', 'are', 'is',)
-top_n = 6
-path_to_file = 'report.txt'
-
 def calculate_frequences(text) :
     dictionary = {}
+    text = "".join(word for word in text if word not in ('!', '.', ':', ',', 'â€ś', 'â€ť', '"', '?', '?!'))
     words = text.split()
     for key in words:
         key = key.lower()
@@ -17,8 +10,6 @@ def calculate_frequences(text) :
         else:
             dictionary[key]=1
     return dictionary
-dictionary = calculate_frequences(text)
-print (dictionary)
 
 def filter_stop_words(dictionary, stopwords):
     filtered_dictionary = dictionary
@@ -26,8 +17,6 @@ def filter_stop_words(dictionary, stopwords):
         if key in stopwords:
             del filtered_dictionary[key]
     return filtered_dictionary
-filtered_dictionary = filter_stop_words(dictionary, stopwords)
-print (filtered_dictionary)
 
 def get_top_n (dictionary, top_n):
     list_dictionary = list(dictionary.items())
@@ -40,11 +29,3 @@ def get_top_n (dictionary, top_n):
             index += 1
     toped_dictionary = tuple(list_dictionary[:top_n])
     return toped_dictionary
-content = get_top_n (dictionary, top_n)
-print (content)
-
-def write_to_file (path_to_file, content):
-    with open('report.txt', 'w') as file:
-        print(content, file=file, sep='\n')
-
-write_to_file (path_to_file, content)

From 30f0061696516e117b90e1ddc62ba0ffab910ad1 Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Mon, 7 Oct 2019 14:48:24 +0300
Subject: [PATCH 09/14] Strip more punctuation characters in calculate_frequences

---
 lab_1/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lab_1/main.py b/lab_1/main.py
index a939bd7..c3588c8 100644
--- a/lab_1/main.py
+++ b/lab_1/main.py
@@ -1,6 +1,6 @@
 def calculate_frequences(text) :
     dictionary = {}
-    text = "".join(word for word in text if word not in ('!', '.', ':', ',', 'â€ś', 'â€ť', '"', '?', '?!'))
+    text = "".join(word for word in text if word not in ('!', '.', ':', ';', ',', 'â€ś', 'â€ť', '"', '?', '?!', '@', '~', '$', '#', 'â„–', '%', '*', '^'))
     words = text.split()
     for key in words:
         key = key.lower()

From 8bc2a34bee94c02bfa13aac515317e9098776ba8 Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Mon, 14 Oct 2019 07:25:45 +0300
Subject: [PATCH 10/14] Validate inputs in lab_1 and add file read/write helpers

---
 lab_1/main.py | 48 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/lab_1/main.py b/lab_1/main.py
index c3588c8..5681ab7 100644
--- a/lab_1/main.py
+++ b/lab_1/main.py
@@ -1,22 +1,30 @@
 def calculate_frequences(text) :
+    if not isinstance(text, str):
+        return {}
+    work_text = ''
+    for el in str(text):
+        if el.isalpha() or el == ' ' or el == '\n':
+            work_text += el.lower()
+    words = work_text.split()
     dictionary = {}
-    text = "".join(word for word in text if word not in ('!', '.', ':', ';', ',', 'â€ś', 'â€ť', '"', '?', '?!', '@', '~', '$', '#', 'â„–', '%', '*', '^'))
-    words = text.split()
     for key in words:
-        key = key.lower()
         if key in dictionary:
-            value = dictionary[key]
-            dictionary[key]=value+1
+            dictionary[key] += 1
         else:
-            dictionary[key]=1
+            dictionary[key] = 1
     return dictionary
 
 def filter_stop_words(dictionary, stopwords):
-    filtered_dictionary = dictionary
-    for key in list(filtered_dictionary.keys()):
-        if key in stopwords:
-            del filtered_dictionary[key]
-    return filtered_dictionary
+    if dictionary and stopwords is not None:
+        filtered_dictionary = dictionary.copy()
+        for key in dictionary:
+            if not isinstance(key, str):
+                del filtered_dictionary[key]
+        for word in stopwords:
+            if word in filtered_dictionary:
+                del filtered_dictionary[word]
+        return filtered_dictionary
+    return {}
 
 def get_top_n (dictionary, top_n):
     list_dictionary = list(dictionary.items())
@@ -29,3 +37,21 @@ def get_top_n (dictionary, top_n):
             index += 1
     toped_dictionary = tuple(list_dictionary[:top_n])
     return toped_dictionary
+
+def read_from_file (path_to_file, lines_limit):
+    """Return the first lines_limit lines of path_to_file as one string."""
+    lines = []
+    with open(path_to_file, 'r') as file:  # closed even if reading fails
+        for n, line in enumerate(file):
+            # Stop at the limit instead of scanning the rest of the file.
+            if n >= lines_limit:
+                break
+            lines.append(line)
+    return ''.join(lines)
+
+def write_to_file (path_to_file, content):
+    """Write each item of content to path_to_file on its own line."""
+    with open(path_to_file, 'w') as file:  # closed even if a write fails
+        for el in content:
+            # str() lets non-string items, e.g. (word, count) pairs, be written
+            file.write(str(el) + '\n')

From b10892d0a102a1564193ff41c6fa20fe1ad3de89 Mon Sep 17 00:00:00 2001
From: Marat Fatekhov <marat.fatekhov@intel.com>
Date: Thu, 24 Oct 2019 14:11:29 +0300
Subject: [PATCH 11/14] remove redundant

---
 lab_1/data.txt   | 4 ----
 lab_1/report.txt | 1 -
 2 files changed, 5 deletions(-)
 delete mode 100644 lab_1/data.txt
 delete mode 100644 lab_1/report.txt

diff --git a/lab_1/data.txt b/lab_1/data.txt
deleted file mode 100644
index 2b354d8..0000000
--- a/lab_1/data.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-Kamal and Grant arrived. “Hi Kamal!” said Tara. 
-“Are you going to the Halloween disco tomorrow?”
-“Yes. Hi Amy,” Kamal said, smiling. 
-“Do you want to come and see our paintings after school?” “I’m coming too!” Tara insisted.
\ No newline at end of file
diff --git a/lab_1/report.txt b/lab_1/report.txt
deleted file mode 100644
index fbb681a..0000000
--- a/lab_1/report.txt
+++ /dev/null
@@ -1 +0,0 @@
-(('kamal', 3), ('hi', 2), ('said', 2), ('tara', 2), ('you', 2), ('grant', 1))

From 20365a490de268a88a8cb6c21e49ad65edd287a4 Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Tue, 26 Nov 2019 12:55:38 +0300
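Subject: [PATCH 12/14] Implement lab_2 Levenshtein distance functions

Add edit-matrix generation, initialisation and filling, find_distance, and CSV save/load helpers.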
---
 lab_2/main.py | 88 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 78 insertions(+), 10 deletions(-)

diff --git a/lab_2/main.py b/lab_2/main.py
index 37932f5..1e03713 100644
--- a/lab_2/main.py
+++ b/lab_2/main.py
@@ -1,18 +1,36 @@
-"""
-Labour work #2. Levenshtein distance.
-"""
-
-
 def generate_edit_matrix(num_rows: int, num_cols: int) -> list:
-    pass
+    edit_matrix = []
+    if not isinstance(num_rows, int) or not isinstance(num_cols, int):
+        return edit_matrix
+    if num_cols <= 0 or num_rows <= 0:
+        return edit_matrix
+    for _ in range(num_rows):
+        string = []
+        for _ in range(num_cols):
+            string.append(0)
+        edit_matrix.append(string)
+    return edit_matrix
 
 
 def initialize_edit_matrix(edit_matrix: tuple, add_weight: int, remove_weight: int) -> list:
-    pass
+    if not isinstance(edit_matrix, tuple):
+        return []
+    edit_matrix = list(edit_matrix)
+    if not isinstance(add_weight, int) or not isinstance(remove_weight, int):
+        return edit_matrix
+    if edit_matrix == [[]] * len(edit_matrix):
+        return edit_matrix
+    for i in range(1, len(edit_matrix)):
+        edit_matrix[i][0] = edit_matrix[i - 1][0] + remove_weight
+    for j in range(1, len(edit_matrix[0])):
+        edit_matrix[0][j] = edit_matrix[0][j - 1] + add_weight
+    return edit_matrix
 
 
 def minimum_value(numbers: tuple) -> int:
-    pass
+    if isinstance(numbers, tuple):
+        res = min(list(numbers))
+        return res
 
 
 def fill_edit_matrix(edit_matrix: tuple,
@@ -21,7 +39,27 @@ def fill_edit_matrix(edit_matrix: tuple,
                      substitute_weight: int,
                      original_word: str,
                      target_word: str) -> list:
-    pass
+    if not isinstance(edit_matrix, tuple):
+        return []
+    edit_matrix = list(edit_matrix)
+    if not isinstance(original_word, str) or not isinstance(target_word, str) or original_word == '' \
+            or target_word == '':
+        return edit_matrix
+    if not isinstance(add_weight, int) or not isinstance(remove_weight, int) or not isinstance(substitute_weight, int):
+        return edit_matrix
+    original_word = ' ' + original_word
+    target_word = ' ' + target_word
+    for i in range(1, len(edit_matrix)):
+        for j in range(1, len(edit_matrix[0])):
+            ad = edit_matrix[i][j - 1] + add_weight
+            re = edit_matrix[i - 1][j] + remove_weight
+            su = edit_matrix[i - 1][j - 1]
+            if original_word[i] != target_word[j]:
+                su += substitute_weight
+            edit_matrix[i][j] = minimum_value((ad, re, su))
+    return edit_matrix
+
+
 
 
 def find_distance(original_word: str,
@@ -29,4 +67,34 @@ def find_distance(original_word: str,
                   add_weight: int,
                   remove_weight: int,
                   substitute_weight: int) -> int:
-    pass
+    wrong_result = -1
+    if type(original_word) == str and type(target_word) == str and type(add_weight) == int and type(remove_weight) == int and type(substitute_weight) == int:
+        num_rows = len(original_word) + 1
+        num_cols = len(target_word) + 1
+        new_matrix = generate_edit_matrix(num_rows, num_cols)
+        matrix = initialize_edit_matrix(tuple(new_matrix), add_weight, remove_weight)
+        return fill_edit_matrix(tuple(matrix), add_weight, remove_weight, substitute_weight, original_word, target_word)[num_rows - 1][num_cols - 1]
+    else:
+        return wrong_result
+
+
+def save_to_csv(edit_matrix: tuple, path_to_file: str) -> None:
+    """Write edit_matrix to path_to_file, one comma-terminated row per line."""
+    # 'with' closes the file even if a write fails; close() itself takes
+    # no arguments, so passing it the last row raised TypeError.
+    with open(path_to_file, 'w') as save_file:
+        for string in edit_matrix:
+            # Every value is followed by ',', including the last in a row.
+            save_file.write(''.join(str(el) + ',' for el in string))
+            save_file.write('\n')
+
+def load_from_csv(path_to_file: str) -> list:
+    """Read a comma-separated file of integers into a list of rows."""
+    matrix = []
+    with open(path_to_file) as new_file:  # closed on exit, even on error
+        for string in new_file:
+            # A row ending in ',' leaves a blank last field holding only
+            # the newline; int() raises ValueError on it, so drop blanks.
+            fields = [el for el in string.split(',') if el.strip()]
+            matrix.append([int(el) for el in fields])
+    return matrix

From 60b47f386ba713ab8797c7ffceff334150669c4a Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Wed, 11 Dec 2019 14:13:48 +0300
Subject: [PATCH 13/14] Implement lab_3 n-gram model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lab_3/main.py | 121 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 107 insertions(+), 14 deletions(-)

diff --git a/lab_3/main.py b/lab_3/main.py
index 595f614..c82b5a3 100644
--- a/lab_3/main.py
+++ b/lab_3/main.py
@@ -1,8 +1,3 @@
-"""
-Labour work #3
- Building an own N-gram model
-"""
-
 import math
 
 REFERENCE_TEXT = ''
@@ -12,33 +7,131 @@
 
 
 class WordStorage:
+    def __init__(self):
+        self.storage = {}
+
     def put(self, word: str) -> int:
-        pass
+        if not isinstance(word, str):
+            return -1
+        if not self.storage:
+            self.storage[word] = 0
+        elif word not in self.storage:
+            self.storage[word] = max(self.storage.values()) + 1
+        return self.storage[word]
 
     def get_id_of(self, word: str) -> int:
-        pass
+        if word not in self.storage:
+            return -1
+        return self.storage.get(word)
+
 
     def get_original_by(self, id: int) -> str:
-        pass
+        if isinstance(id, int):
+            for key, value in self.storage.items():
+                if value == id:
+                    return key
+        else:
+            return 'UNK'
+        if id not in self.storage.values():
+            return 'UNK'
+
+
 
     def from_corpus(self, corpus: tuple):
-        pass
+        if isinstance(corpus, tuple):
+            for elem in corpus:
+                self.put(elem)
+            return self.storage
 
 
 class NGramTrie:
+    def __init__(self, n):
+        self.size = n
+        self.gram_frequencies = {}
+        self.gram_log_probabilities = {}
+
     def fill_from_sentence(self, sentence: tuple) -> str:
-        pass
+        """Count every n-gram of self.size consecutive items in sentence.
+
+        Returns 'OK' when sentence is a tuple, 'ERROR' otherwise; a
+        sentence shorter than self.size contributes no n-grams.
+        """
+        if not isinstance(sentence, tuple):
+            return 'ERROR'
+        # Count the window start positions explicitly: the negative slice
+        # sentence[:-self.size + 1] is empty when size is 1, which would
+        # silently skip every unigram.
+        starts = len(sentence) - self.size + 1
+        for i in range(starts):
+            n_gram = sentence[i:i + self.size]
+            count = self.gram_frequencies.get(n_gram, 0)
+            self.gram_frequencies[n_gram] = count + 1
+        return 'OK'
 
     def calculate_log_probabilities(self):
-        pass
+        """Store log(count(n-gram) / total count of its prefix) per n-gram."""
+        # Total the counts per prefix in one pass rather than rescanning
+        # all n-grams for each n-gram, which is quadratic.
+        totals = {}
+        for gram, freq in self.gram_frequencies.items():
+            totals[gram[:self.size - 1]] = totals.get(gram[:self.size - 1], 0) + freq
+        for gram, freq in self.gram_frequencies.items():
+            self.gram_log_probabilities[gram] = math.log(freq / totals[gram[:self.size - 1]])
 
     def predict_next_sentence(self, prefix: tuple) -> list:
-        pass
+        """Greedily extend prefix (a tuple of self.size - 1 items) with the
+        most probable next items; returns [] for any other prefix."""
+        if not isinstance(prefix, tuple) or len(prefix) + 1 != self.size:
+            return []
+        final = list(prefix)
+        seen = []
+        # A context that comes back would repeat forever, so stop on it.
+        while prefix not in seen:
+            seen.append(prefix)
+            best = None
+            for n_gram, prob in self.gram_log_probabilities.items():
+                # Only n-grams that continue this prefix may compete;
+                # '>=' lets the last of several equal candidates win.
+                if n_gram[:-1] == prefix and (best is None or prob >= best[1]):
+                    best = (n_gram, prob)
+            if best is None:
+                break
+            word = best[0][-1]
+            final.append(word)
+            prefix = prefix[1:] + (word,)
+        return final
 
 
 def encode(storage_instance, corpus) -> list:
-    pass
+    code = []
+    for sentence in corpus:
+        code1 = []
+        for element in sentence:
+            element = storage_instance.get_id_of(element)
+            code1.append(element)
+        code.append(code1)
+    return code
 
 
 def split_by_sentence(text: str) -> list:
-    pass
+    corpus = []
+    new_text = ''
+    if isinstance(text, str) and ' ' in text:
+        text = text.replace('\n', ' ')
+        while '  ' in text:
+            text = text.replace('  ', ' ')
+        text = text.replace('!', '.')
+        text = text.replace('?', '.')
+        if '.' in text:
+            for symbol in text:
+                if symbol.isalpha() or symbol == ' ' or symbol == '.':
+                    new_text += symbol.lower()
+    sentences = new_text.split('.')
+    while '' in sentences:
+        sentences.remove('')
+    for element in sentences:
+        element = element.split()
+        element.insert(0, '<s>')
+        element.append('</s>')
+        corpus.append(element)
+    return corpus

From 1b3874029b5ffba7ad9741132729d3e830c0bda0 Mon Sep 17 00:00:00 2001
From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com>
Date: Tue, 17 Dec 2019 12:04:19 +0300
Subject: [PATCH 14/14] Implement lab_4 TF-IDF calculator

Add clean_tokenize_corpus and the TfIdfCalculator methods.
---
 lab_4/main.py | 76 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 70 insertions(+), 6 deletions(-)

diff --git a/lab_4/main.py b/lab_4/main.py
index db1330d..2c719a4 100644
--- a/lab_4/main.py
+++ b/lab_4/main.py
@@ -5,24 +5,88 @@
 
 
 def clean_tokenize_corpus(texts: list) -> list:
-    pass
+    corpus = []
+    if not texts or not isinstance(texts, list):
+        return corpus
+    for text in texts:
+        if not isinstance(text, str):
+            continue
+        clean_text = ''
+        text = text.replace('\n', ' ')
+        text = text.replace('<br />', ' ')
+        while '  ' in text:
+            text = text.replace('  ', ' ')
+        for symbol in text:
+            if symbol.isalpha() or symbol == ' ':
+                clean_text += symbol.lower()
+        clean_text = clean_text.split()
+        corpus.append(clean_text)
+    return corpus
 
 
 class TfIdfCalculator:
     def __init__(self, corpus):
-        pass
+        self.corpus = corpus
+        self.tf_values = []
+        self.idf_values = {}
+        self.tf_idf_values = []
 
     def calculate_tf(self):
-        pass
+        if not isinstance(self.corpus, list):
+            return []
+        for doc in self.corpus:
+            if not isinstance(doc, list):
+                continue
+            doc_dict = {}
+            cleaned_doc = []
+            for elem in doc:
+                if isinstance(elem, str):
+                    cleaned_doc.append(elem)
+            for word in cleaned_doc:
+                if word not in doc_dict:
+                    doc_dict[word] = doc.count(word) / len(cleaned_doc)
+            self.tf_values.append(doc_dict)
 
     def calculate_idf(self):
-        pass
+        if not isinstance(self.corpus, list):
+            return {}
+        all_words = [el for doc in self.corpus if isinstance(doc, list) for el in doc if isinstance(el, str)]
+        words = list(set(all_words))
+        cleaned_corpus = []
+        for doc in self.corpus:
+            if isinstance(doc, list):
+                cleaned_corpus.append(doc)
+        for word in words:
+            frequency = [1 for doc in cleaned_corpus if isinstance(doc, list) and word in doc]
+            self.idf_values[word] = math.log(len(cleaned_corpus) / sum(frequency))
 
     def calculate(self):
-        pass
+        if not isinstance(self.tf_values, list):
+            return []
+        for doc in self.tf_values:
+            new_dict = {}
+            for key in doc:
+                if key in doc and key in self.idf_values:
+                    new_dict[key] = doc[key] * self.idf_values[key]
+                else:
+                    return []
+            self.tf_idf_values.append(new_dict)
 
     def report_on(self, word, document_index):
-        pass
+        if self.tf_idf_values is None or document_index > len(self.tf_idf_values) - 1 or \
+                word not in self.tf_idf_values[document_index]:
+            return ()
+        word_info = [self.tf_idf_values[document_index][word]]
+        the_most_important = list(self.tf_idf_values[document_index].items())
+        the_most_important.sort(key=lambda x: x[1], reverse=True)
+        ind = -1
+        for elem in the_most_important:
+            if elem[0] == word:
+                ind = the_most_important.index(elem)
+                break
+        if ind != -1:
+            word_info.append(ind)
+        return tuple(word_info)
 
 
 if __name__ == '__main__':