From 26f30fd29451fae2019bfa55b1b63b200f74508d Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Mon, 16 Sep 2019 13:12:48 +0300 Subject: [PATCH 01/14] editproba --- lab_1/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lab_1/main.py b/lab_1/main.py index 498bb1a..f034fcd 100644 --- a/lab_1/main.py +++ b/lab_1/main.py @@ -1,6 +1,6 @@ """ Labour work #1 -Count frequencies dictionary by the given arbitrary text +Count a frequencies dictionary by the given arbitrary text """ @@ -20,4 +20,4 @@ def get_top_n(frequencies: dict, top_n: int) -> tuple: """ Takes first N popular words """ - pass \ No newline at end of file + pass From 2fe60f1faa76bc0bfdb476e2ed39961f1a1bf67f Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Mon, 16 Sep 2019 13:25:53 +0300 Subject: [PATCH 02/14] editproba2 --- lab_1/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1/main.py b/lab_1/main.py index f034fcd..81b5d50 100644 --- a/lab_1/main.py +++ b/lab_1/main.py @@ -6,7 +6,7 @@ def calculate_frequences(text: str) -> dict: """ - Calculates number of times each word appears in the text + Calculates number of times each word appears in this text """ pass From d3de4b6bc1e51ff182a52553e67188ba4c24ef47 Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Mon, 7 Oct 2019 01:40:02 +0300 Subject: [PATCH 03/14] Update main.py --- lab_1/main.py | 58 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/lab_1/main.py b/lab_1/main.py index 81b5d50..01d35ce 100644 --- a/lab_1/main.py +++ b/lab_1/main.py @@ -1,23 +1,43 @@ -""" -Labour work #1 -Count a frequencies dictionary by the given arbitrary text -""" +f = open('data.txt', 'r') +text = f.read().strip() +text = "".join(word for word in text if word not in ('!','.',':', ',', 'β€œ', '”', '"', '?', '?!')) +stopwords = ('the', 'a', 'an', 'or', 'and', 'to', 'are', 'is',) +top_n = 6 -def calculate_frequences(text: str) -> dict: - """ - Calculates number of times each word appears in this text - """ - pass +def calculate_frequences(text) : + dictionary = {} + words = text.split() + for key in words: + key = key.lower() + if key in dictionary: + value = dictionary[key] + dictionary[key]=value+1 + else: + dictionary[key]=1 + return dictionary +dictionary = calculate_frequences(text) +print (dictionary) -def filter_stop_words(frequencies: dict, stop_words: tuple) -> dict: - """ - Removes all stop words from the given frequencies dictionary - """ - pass +def filter_stop_words(dictionary, stopwords): + filtered_dictionary = dictionary + for key in list(filtered_dictionary.keys()): + if key in stopwords: + del filtered_dictionary[key] + return filtered_dictionary +filtered_dictionary = filter_stop_words(dictionary, stopwords) +print (filtered_dictionary) -def get_top_n(frequencies: dict, top_n: int) -> tuple: - """ - Takes first N popular words - """ - pass +def get_top_n (dictionary, top_n): + list_dictionary = list(dictionary.items()) + list_dictionary.sort(key=lambda i: i[1], reverse=True) + index = 0 + toped_dictionary = list() + for el in list_dictionary: + if index < top_n: + toped_dictionary.append(list_dictionary[index]) + index += 1 + toped_dictionary = tuple(list_dictionary[:top_n]) + return toped_dictionary +toped_dictionary = get_top_n (dictionary, top_n) +print (toped_dictionary) From b470e11becdca5303aea4eaec60dd9b8df382357 Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Mon, 7 Oct 2019 01:43:00 +0300 Subject: [PATCH 04/14] Add files via upload --- data.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 data.txt diff --git a/data.txt b/data.txt new file mode 100644 index 0000000..2b354d8 --- /dev/null +++ b/data.txt @@ -0,0 +1,4 @@ +Kamal and Grant arrived. “Hi Kamal!” said Tara. +“Are you going to the Halloween disco tomorrow?” +“Yes. Hi Amy,” Kamal said, smiling. +“Do you want to come and see our paintings after school?” “I’m coming too!” Tara insisted. \ No newline at end of file From 1642d2e8b4d13fa71a61c837f136124dea95cc97 Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Mon, 7 Oct 2019 01:43:47 +0300 Subject: [PATCH 05/14] Add files via upload --- lab_1/data.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 lab_1/data.txt diff --git a/lab_1/data.txt b/lab_1/data.txt new file mode 100644 index 0000000..2b354d8 --- /dev/null +++ b/lab_1/data.txt @@ -0,0 +1,4 @@ +Kamal and Grant arrived. “Hi Kamal!” said Tara. +“Are you going to the Halloween disco tomorrow?” +“Yes. Hi Amy,” Kamal said, smiling. +“Do you want to come and see our paintings after school?” “I’m coming too!” Tara insisted. \ No newline at end of file From c964c42542a249f77ff8f5df3b13c927499d93dd Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Mon, 7 Oct 2019 13:27:54 +0300 Subject: [PATCH 06/14] added new function --- lab_1/main.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lab_1/main.py b/lab_1/main.py index 01d35ce..34cc5aa 100644 --- a/lab_1/main.py +++ b/lab_1/main.py @@ -4,6 +4,7 @@ stopwords = ('the', 'a', 'an', 'or', 'and', 'to', 'are', 'is',) top_n = 6 +path_to_file = 'report.txt' def calculate_frequences(text) : dictionary = {} @@ -39,5 +40,11 @@ def get_top_n (dictionary, top_n): index += 1 toped_dictionary = tuple(list_dictionary[:top_n]) return toped_dictionary -toped_dictionary = get_top_n (dictionary, top_n) -print (toped_dictionary) +content = get_top_n (dictionary, top_n) +print (content) + +def write_to_file (path_to_file, content): + with open('report.txt', 'w') as file: + print(content, file=file, sep='\n') + +write_to_file (path_to_file, content) From 8716be5bfbefe02a4b3e205c6c60db2d55e563c4 Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Mon, 7 Oct 2019 13:28:43 +0300 Subject: [PATCH 07/14] Add files via upload --- lab_1/report.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 lab_1/report.txt diff --git a/lab_1/report.txt b/lab_1/report.txt new file mode 100644 index 0000000..fbb681a --- /dev/null +++ b/lab_1/report.txt @@ -0,0 +1 @@ +(('kamal', 3), ('hi', 2), ('said', 2), ('tara', 2), ('you', 2), ('grant', 1)) From d242e65dbcd9d30a7654d051532f15907093eaf2 Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Mon, 7 Oct 2019 14:27:52 +0300 Subject: [PATCH 08/14] smth --- lab_1/main.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/lab_1/main.py b/lab_1/main.py index 34cc5aa..a939bd7 100644 --- a/lab_1/main.py +++ b/lab_1/main.py @@ -1,13 +1,6 @@ -f = open('data.txt', 'r') -text = f.read().strip() -text = "".join(word for word in text if word not in ('!','.',':', ',', 'β€œ', '”', '"', '?', '?!')) - -stopwords = ('the', 'a', 'an', 'or', 'and', 'to', 'are', 'is',) -top_n = 6 -path_to_file = 'report.txt' - def calculate_frequences(text) : dictionary = {} + text = "".join(word for word in text if word not in ('!', '.', ':', ',', 'β€œ', '”', '"', '?', '?!')) words = text.split() for key in words: key = key.lower() @@ -17,8 +10,6 @@ def calculate_frequences(text) : else: dictionary[key]=1 return dictionary -dictionary = calculate_frequences(text) -print (dictionary) def filter_stop_words(dictionary, stopwords): filtered_dictionary = dictionary @@ -26,8 +17,6 @@ def filter_stop_words(dictionary, stopwords): if key in stopwords: del filtered_dictionary[key] return filtered_dictionary -filtered_dictionary = filter_stop_words(dictionary, stopwords) -print (filtered_dictionary) def get_top_n (dictionary, top_n): list_dictionary = list(dictionary.items()) @@ -40,11 +29,3 @@ def get_top_n (dictionary, top_n): index += 1 toped_dictionary = tuple(list_dictionary[:top_n]) return toped_dictionary -content = get_top_n (dictionary, top_n) -print (content) - -def write_to_file (path_to_file, content): - with open('report.txt', 'w') as file: - print(content, file=file, sep='\n') - -write_to_file (path_to_file, content) From 30f0061696516e117b90e1ddc62ba0ffab910ad1 Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Mon, 7 Oct 2019 14:48:24 +0300 Subject: [PATCH 09/14] ssss --- lab_1/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lab_1/main.py b/lab_1/main.py index a939bd7..c3588c8 100644 --- a/lab_1/main.py +++ b/lab_1/main.py @@ -1,6 +1,6 @@ def calculate_frequences(text) : dictionary = {} - text = "".join(word for word in text if word not in ('!', '.', ':', ',', 'β€œ', '”', '"', '?', '?!')) + text = "".join(word for word in text if word not in ('!', '.', ':', ';', ',', 'β€œ', '”', '"', '?', '?!', '@', '~', '$', '#', 'β„–', '%', '*', '^')) words = text.split() for key in words: key = key.lower() From 8bc2a34bee94c02bfa13aac515317e9098776ba8 Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Mon, 14 Oct 2019 07:25:45 +0300 Subject: [PATCH 10/14] developed --- lab_1/main.py | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/lab_1/main.py b/lab_1/main.py index c3588c8..5681ab7 100644 --- a/lab_1/main.py +++ b/lab_1/main.py @@ -1,22 +1,30 @@ def calculate_frequences(text) : + if not isinstance(text, str): + return {} + work_text = '' + for el in str(text): + if el.isalpha() or el == ' ' or el == '\n': + work_text += el.lower() + words = work_text.split() dictionary = {} - text = "".join(word for word in text if word not in ('!', '.', ':', ';', ',', 'β€œ', '”', '"', '?', '?!', '@', '~', '$', '#', 'β„–', '%', '*', '^')) - words = text.split() for key in words: - key = key.lower() if key in dictionary: - value = dictionary[key] - dictionary[key]=value+1 + dictionary[key] += 1 else: - dictionary[key]=1 + dictionary[key] = 1 return dictionary def filter_stop_words(dictionary, stopwords): - filtered_dictionary = dictionary - for key in list(filtered_dictionary.keys()): - if key in stopwords: - del filtered_dictionary[key] - return filtered_dictionary + if dictionary and stopwords is not None: + filtered_dictionary = dictionary.copy() + for key in dictionary: + if not isinstance(key, str): + del filtered_dictionary[key] + for word in stopwords: + if word in filtered_dictionary: + del filtered_dictionary[word] + return filtered_dictionary + return {} def get_top_n (dictionary, top_n): list_dictionary = list(dictionary.items()) @@ -29,3 +37,21 @@ def get_top_n (dictionary, top_n): index += 1 toped_dictionary = tuple(list_dictionary[:top_n]) return toped_dictionary + +def read_from_file (path_to_file, lines_limit): + file = open(path_to_file, 'r') + n = 0 + text = '' + for line in file: + if n < lines_limit: + text += str(line) + n += 1 + file.close() + return text + +def write_to_file (path_to_file, content): + file = open(path_to_file, 'w') + for el in content: + file.write(el) + file.write('\n') + file.close() From b10892d0a102a1564193ff41c6fa20fe1ad3de89 Mon Sep 17 00:00:00 2001 From: Marat Fatekhov Date: Thu, 24 Oct 2019 14:11:29 +0300 Subject: [PATCH 11/14] remove redundant --- lab_1/data.txt | 4 ---- lab_1/report.txt | 1 - 2 files changed, 5 deletions(-) delete mode 100644 lab_1/data.txt delete mode 100644 lab_1/report.txt diff --git a/lab_1/data.txt b/lab_1/data.txt deleted file mode 100644 index 2b354d8..0000000 --- a/lab_1/data.txt +++ /dev/null @@ -1,4 +0,0 @@ -Kamal and Grant arrived. “Hi Kamal!” said Tara. -“Are you going to the Halloween disco tomorrow?” -“Yes. Hi Amy,” Kamal said, smiling. -“Do you want to come and see our paintings after school?” “I’m coming too!” Tara insisted. \ No newline at end of file diff --git a/lab_1/report.txt b/lab_1/report.txt deleted file mode 100644 index fbb681a..0000000 --- a/lab_1/report.txt +++ /dev/null @@ -1 +0,0 @@ -(('kamal', 3), ('hi', 2), ('said', 2), ('tara', 2), ('you', 2), ('grant', 1)) From 20365a490de268a88a8cb6c21e49ad65edd287a4 Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Tue, 26 Nov 2019 12:55:38 +0300 Subject: [PATCH 12/14] labba a --- lab_2/main.py | 88 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 10 deletions(-) diff --git a/lab_2/main.py b/lab_2/main.py index 37932f5..1e03713 100644 --- a/lab_2/main.py +++ b/lab_2/main.py @@ -1,18 +1,36 @@ -""" -Labour work #2. Levenshtein distance. -""" - - def generate_edit_matrix(num_rows: int, num_cols: int) -> list: - pass + edit_matrix = [] + if not isinstance(num_rows, int) or not isinstance(num_cols, int): + return edit_matrix + if num_cols <= 0 or num_rows <= 0: + return edit_matrix + for _ in range(num_rows): + string = [] + for _ in range(num_cols): + string.append(0) + edit_matrix.append(string) + return edit_matrix def initialize_edit_matrix(edit_matrix: tuple, add_weight: int, remove_weight: int) -> list: - pass + if not isinstance(edit_matrix, tuple): + return [] + edit_matrix = list(edit_matrix) + if not isinstance(add_weight, int) or not isinstance(remove_weight, int): + return edit_matrix + if edit_matrix == [[]] * len(edit_matrix): + return edit_matrix + for i in range(1, len(edit_matrix)): + edit_matrix[i][0] = edit_matrix[i - 1][0] + remove_weight + for j in range(1, len(edit_matrix[0])): + edit_matrix[0][j] = edit_matrix[0][j - 1] + add_weight + return edit_matrix def minimum_value(numbers: tuple) -> int: - pass + if isinstance(numbers, tuple): + res = min(list(numbers)) + return res def fill_edit_matrix(edit_matrix: tuple, @@ -21,7 +39,27 @@ def fill_edit_matrix(edit_matrix: tuple, substitute_weight: int, original_word: str, target_word: str) -> list: - pass + if not isinstance(edit_matrix, tuple): + return [] + edit_matrix = list(edit_matrix) + if not isinstance(original_word, str) or not isinstance(target_word, str) or original_word == '' \ + or target_word == '': + return edit_matrix + if not isinstance(add_weight, int) or not isinstance(remove_weight, int) or not isinstance(substitute_weight, int): + return edit_matrix + original_word = ' ' + original_word + target_word = ' ' + target_word + for i in range(1, len(edit_matrix)): + for j in range(1, len(edit_matrix[0])): + ad = edit_matrix[i][j - 1] + add_weight + re = edit_matrix[i - 1][j] + remove_weight + su = edit_matrix[i - 1][j - 1] + if original_word[i] != target_word[j]: + su += substitute_weight + edit_matrix[i][j] = minimum_value((ad, re, su)) + return edit_matrix + + def find_distance(original_word: str, @@ -29,4 +67,34 @@ def find_distance(original_word: str, add_weight: int, remove_weight: int, substitute_weight: int) -> int: - pass + wrong_result = -1 + if type(original_word) == str and type(target_word) == str and type(add_weight) == int and type(remove_weight) == int and type(substitute_weight) == int: + num_rows = len(original_word) + 1 + num_cols = len(target_word) + 1 + new_matrix = generate_edit_matrix(num_rows, num_cols) + matrix = initialize_edit_matrix(tuple(new_matrix), add_weight, remove_weight) + return fill_edit_matrix(tuple(matrix), add_weight, remove_weight, substitute_weight, original_word, target_word)[num_rows - 1][num_cols - 1] + else: + return wrong_result + + +def save_to_csv(edit_matrix: tuple, path_to_file: str) -> None: + save_file = open(path_to_file,'w') + for string in edit_matrix: + row = '' + for el in string: + row = str(el)+',' + save_file.write(row) + save_file.write('\n') + save_file.close(row) + +def load_from_csv(path_to_file: str) -> list: + new_file = open(path_to_file) + matrix = [] + for string in new_file: + line_with_z = string.split(',') + line =[] + for el in line_with_z: + line.append(int(el)) + matrix.append(line) + return matrix From 60b47f386ba713ab8797c7ffceff334150669c4a Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Wed, 11 Dec 2019 14:13:48 +0300 Subject: [PATCH 13/14] =?UTF-8?q?=D0=BB=D0=B0=D0=B1=D0=B1=D0=B1=D0=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_3/main.py | 121 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 107 insertions(+), 14 deletions(-) diff --git a/lab_3/main.py b/lab_3/main.py index 595f614..c82b5a3 100644 --- a/lab_3/main.py +++ b/lab_3/main.py @@ -1,8 +1,3 @@ -""" -Labour work #3 - Building an own N-gram model -""" - import math REFERENCE_TEXT = '' @@ -12,33 +7,131 @@ class WordStorage: + def __init__(self): + self.storage = {} + def put(self, word: str) -> int: - pass + if not isinstance(word, str): + return -1 + if not self.storage: + self.storage[word] = 0 + elif word not in self.storage: + self.storage[word] = max(self.storage.values()) + 1 + return self.storage[word] def get_id_of(self, word: str) -> int: - pass + if word not in self.storage: + return -1 + return self.storage.get(word) + def get_original_by(self, id: int) -> str: - pass + if isinstance(id, int): + for key, value in self.storage.items(): + if value == id: + return key + else: + return 'UNK' + if id not in self.storage.values(): + return 'UNK' + + def from_corpus(self, corpus: tuple): - pass + if isinstance(corpus, tuple): + for elem in corpus: + self.put(elem) + return self.storage class NGramTrie: + def __init__(self, n): + self.size = n + self.gram_frequencies = {} + self.gram_log_probabilities = {} + def fill_from_sentence(self, sentence: tuple) -> str: - pass + if isinstance(sentence, tuple): + new_sent = list(sentence) + for i, n in enumerate(new_sent[:-self.size + 1]): + n_gram = [] + identif = 0 + while identif < self.size: + n_gram.append(new_sent[i + identif]) + identif += 1 + n_gram = tuple(n_gram) + if n_gram in self.gram_frequencies.keys(): + self.gram_frequencies[n_gram] += 1 + else: + self.gram_frequencies[n_gram] = 1 + return 'OK' + else: + return 'ERROR' def calculate_log_probabilities(self): - pass + for pair in self.gram_frequencies: + wanted = pair[0:self.size - 1] + count = 0 + for key in self.gram_frequencies: + if wanted == key[0:self.size - 1]: + count += self.gram_frequencies[key] + prob = math.log(self.gram_frequencies[pair] / count) + self.gram_log_probabilities[pair] = prob def predict_next_sentence(self, prefix: tuple) -> list: - pass + word_1 = [] + if not isinstance(prefix, tuple) or len(prefix) + 1 != self.size: + return [] + final = list(prefix) + while True: + prob = [] + for n_gram in list(self.gram_log_probabilities.keys()): + if n_gram[:-1] == prefix: + prob.append(self.gram_log_probabilities[n_gram]) + if not prob: + break + prob.sort(reverse=True) + prob = prob[0] + for word, probability in list(self.gram_log_probabilities.items()): + if prob == probability: + word_1 = word[-1] + final.append(word_1) + pref_1 = list(prefix[1:]) + pref_1.append(word_1) + prefix = tuple(pref_1) + return final def encode(storage_instance, corpus) -> list: - pass + code = [] + for sentence in corpus: + code1 = [] + for element in sentence: + element = storage_instance.get_id_of(element) + code1.append(element) + code.append(code1) + return code def split_by_sentence(text: str) -> list: - pass + corpus = [] + new_text = '' + if isinstance(text, str) and ' ' in text: + text = text.replace('\n', ' ') + while ' ' in text: + text = text.replace(' ', ' ') + text = text.replace('!', '.') + text = text.replace('?', '.') + if '.' in text: + for symbol in text: + if symbol.isalpha() or symbol == ' ' or symbol == '.': + new_text += symbol.lower() + sentences = new_text.split('.') + while '' in sentences: + sentences.remove('') + for element in sentences: + element = element.split() + element.insert(0, '') + element.append('') + corpus.append(element) + return corpus From 1b3874029b5ffba7ad9741132729d3e830c0bda0 Mon Sep 17 00:00:00 2001 From: Marina-Garanina <55386364+Marina-Garanina@users.noreply.github.com> Date: Tue, 17 Dec 2019 12:04:19 +0300 Subject: [PATCH 14/14] labb code --- lab_4/main.py | 76 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/lab_4/main.py b/lab_4/main.py index db1330d..2c719a4 100644 --- a/lab_4/main.py +++ b/lab_4/main.py @@ -5,24 +5,88 @@ def clean_tokenize_corpus(texts: list) -> list: - pass + corpus = [] + if not texts or not isinstance(texts, list): + return corpus + for text in texts: + if not isinstance(text, str): + continue + clean_text = '' + text = text.replace('\n', ' ') + text = text.replace('
', ' ') + while ' ' in text: + text = text.replace(' ', ' ') + for symbol in text: + if symbol.isalpha() or symbol == ' ': + clean_text += symbol.lower() + clean_text = clean_text.split() + corpus.append(clean_text) + return corpus class TfIdfCalculator: def __init__(self, corpus): - pass + self.corpus = corpus + self.tf_values = [] + self.idf_values = {} + self.tf_idf_values = [] def calculate_tf(self): - pass + if not isinstance(self.corpus, list): + return [] + for doc in self.corpus: + if not isinstance(doc, list): + continue + doc_dict = {} + cleaned_doc = [] + for elem in doc: + if isinstance(elem, str): + cleaned_doc.append(elem) + for word in cleaned_doc: + if word not in doc_dict: + doc_dict[word] = doc.count(word) / len(cleaned_doc) + self.tf_values.append(doc_dict) def calculate_idf(self): - pass + if not isinstance(self.corpus, list): + return {} + all_words = [el for doc in self.corpus if isinstance(doc, list) for el in doc if isinstance(el, str)] + words = list(set(all_words)) + cleaned_corpus = [] + for doc in self.corpus: + if isinstance(doc, list): + cleaned_corpus.append(doc) + for word in words: + frequency = [1 for doc in cleaned_corpus if isinstance(doc, list) and word in doc] + self.idf_values[word] = math.log(len(cleaned_corpus) / sum(frequency)) def calculate(self): - pass + if not isinstance(self.tf_values, list): + return [] + for doc in self.tf_values: + new_dict = {} + for key in doc: + if key in doc and key in self.idf_values: + new_dict[key] = doc[key] * self.idf_values[key] + else: + return [] + self.tf_idf_values.append(new_dict) def report_on(self, word, document_index): - pass + if self.tf_idf_values is None or document_index > len(self.tf_idf_values) - 1 or \ + word not in self.tf_idf_values[document_index]: + return () + word_info = [self.tf_idf_values[document_index][word]] + the_most_important = list(self.tf_idf_values[document_index].items()) + the_most_important.sort(key=lambda x: x[1], reverse=True) + ind = -1 + for elem in the_most_important: + if elem[0] == word: + ind = the_most_important.index(elem) + break + if ind != -1: + word_info.append(ind) + return tuple(word_info) if __name__ == '__main__':