diff --git a/.DS_Store b/.DS_Store index f5eadf2..e29d23b 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/WEEK 9/.DS_Store b/WEEK 9/.DS_Store new file mode 100644 index 0000000..b2bdb79 Binary files /dev/null and b/WEEK 9/.DS_Store differ diff --git a/WEEK 9/README.md b/WEEK 9/README.md new file mode 100644 index 0000000..0564784 --- /dev/null +++ b/WEEK 9/README.md @@ -0,0 +1,17 @@ +# WEEK 9 - Hierarchical Clustering of documents + +**Cosine Similarity is used as the similarity metric** +## Available programs: + +* _document_clustering.py_ - This program reads all the documents from the __documents__ folder as specified in the file dictionary. Then it creates a vector space model from them, using the word list as specified in the code. After creating the vector space model, the distance matrix is formed and is then used to perform hierarchical clustering. +* _web_document_clustering.py_ - This program does the same as the above code but instead of using pre-downloaded files, it scrapes contents from websites as specified in the dictionary. + +## Sample output: +![document clustering output](output.png) +![document clustering dendrogram](dendrogram1.png) + +### To run the code, run the following command in a terminal opened in the current directory + +```bash +python document_clustering.py +``` \ No newline at end of file diff --git a/WEEK 9/assignment.doc b/WEEK 9/assignment.doc new file mode 100644 index 0000000..d9dadf6 Binary files /dev/null and b/WEEK 9/assignment.doc differ diff --git a/WEEK 9/dendrogram1.png b/WEEK 9/dendrogram1.png new file mode 100644 index 0000000..328f657 Binary files /dev/null and b/WEEK 9/dendrogram1.png differ diff --git a/WEEK 9/document_clustering.py b/WEEK 9/document_clustering.py new file mode 100644 index 0000000..7067fe1 --- /dev/null +++ b/WEEK 9/document_clustering.py @@ -0,0 +1,126 @@ +# Implementing Vector Space Model and performing nearest neighbourhood clustering of the documents. 

# Importing the libraries
import math
import string
from collections import Counter

import pandas as pd


class document_clustering(object):
    """Build a vector space model of documents and cluster them hierarchically.

    Parameters:
    -------------
    file_dict: dictionary
        Contains the path of the different files to be read.
        Format: {file_index: path}. The indices are formatted with ``%d``,
        so they must be integers.
    word_list: list
        Contains the list of words using which the vector space model is to be
        created. Matching against document terms is case-insensitive.

    Attributes:
    -----------
    listing_dict_: dictionary
        Contains the frequency of the words in each document as file_index as key
        and frequency list as value. Set by ``create_word_listing``.
    distance_matrix_ : pandas-dataframe
        Contains the square matrix of documents. Despite the name, each cell
        holds the square root of the cosine similarity of the two documents
        (rounded to 4 decimals), so identical documents score 1.0.
        Set by ``create_document_matrix``.
    labels_: list
        Contains the labels for document names ('doc<file_index>').
        Set by ``create_document_matrix``.
    """

    def __init__(self, file_dict, word_list):
        self.file_dict = file_dict
        self.word_list = word_list

    def tokenize_document(self, document):
        """Return the whitespace-separated terms of ``document``.

        Each term is lowercased and has leading/trailing punctuation stripped;
        punctuation inside a term (e.g. the hyphen in 'self-drive') is kept.
        """
        return [term.strip(string.punctuation) for term in document.lower().split()]

    def create_word_listing(self):
        """Count how often each word of ``word_list`` occurs in every document.

        Reads every path in ``file_dict``, stores the per-document frequency
        lists in ``listing_dict_`` and prints them.
        """
        # Dictionary to hold the frequency of words in word_list with file_index as key
        self.listing_dict_ = {}

        for file_index, path in self.file_dict.items():
            # Context manager guarantees the handle is closed even on errors.
            with open(path, 'r') as handle:
                document = handle.read()
            # One pass over the terms; missing words count as 0.
            term_counts = Counter(self.tokenize_document(document))
            self.listing_dict_[file_index] = [
                term_counts[word.lower()] for word in self.word_list
            ]

        print('Word listing of each document')
        for file_index, frequencies in self.listing_dict_.items():
            print('%d: %s' % (file_index, frequencies))

    def create_document_matrix(self):
        """Build the pairwise document matrix from ``listing_dict_``.

        Each cell is ``round(sqrt(cosine_similarity), 4)``. A document that
        contains none of the words has a zero-length vector, for which cosine
        similarity is undefined; such a document is treated as similar only
        to itself (1.0 on the diagonal, 0.0 elsewhere) instead of raising
        ``ZeroDivisionError``.

        Requires ``create_word_listing`` to have been called first.
        """
        doc_ids = list(self.file_dict)
        self.labels_ = ['doc%d' % doc_id for doc_id in doc_ids]

        # Squared vector length of every document, computed once.
        squared_norms = {
            doc_id: sum(count ** 2 for count in self.listing_dict_[doc_id])
            for doc_id in doc_ids
        }

        matrix_rows = []
        for id1 in doc_ids:
            row = []
            for id2 in doc_ids:
                denominator = (math.sqrt(squared_norms[id1])
                               * math.sqrt(squared_norms[id2]))
                if denominator:
                    dot_product = sum(
                        count1 * count2
                        for count1, count2 in zip(self.listing_dict_[id1],
                                                  self.listing_dict_[id2])
                    )
                    value = math.sqrt(dot_product / denominator)
                else:
                    # At least one all-zero vector: similarity is undefined.
                    value = 1.0 if id1 == id2 else 0.0
                row.append(round(value, 4))
            matrix_rows.append(row)

        self.distance_matrix_ = pd.DataFrame(matrix_rows, index=self.labels_,
                                             columns=self.labels_)
        print('\nDistance Matrix')
        print(self.distance_matrix_)

    def cluster(self):
        """Perform single-linkage hierarchical clustering and plot a dendrogram.

        The rows of ``distance_matrix_`` are passed to scipy's ``linkage`` as
        observation vectors with the cosine metric. Flat clusters are cut with
        ``fcluster`` at threshold 0.8 using scipy's default criterion. The
        cluster members are printed, and the dendrogram is saved to
        'dendrogram1.png' and shown.

        Requires ``create_document_matrix`` to have been called first.
        """
        # Imported here (like scipy) so the module can be imported and the
        # word listing / matrix built without a plotting backend installed.
        import matplotlib.pyplot as plt
        from scipy.cluster.hierarchy import dendrogram
        from scipy.cluster.hierarchy import fcluster
        from scipy.cluster.hierarchy import linkage

        row_cluster = linkage(self.distance_matrix_.values,
                              method='single',
                              metric='cosine')
        clusters = fcluster(row_cluster, 0.8)
        print('\nClusters Based on Cosine Similarity')
        for cluster_id in sorted(set(clusters)):
            print('Cluster %d:' % cluster_id)
            # Use the real labels: file indices need not be 1..n in order.
            for label, assigned in zip(self.labels_, clusters):
                if assigned == cluster_id:
                    print(label)
        dendrogram(row_cluster, labels=self.labels_)
        plt.ylabel('Cosine Similarity')
        plt.xticks(rotation=90)
        plt.savefig('dendrogram1.png', dpi=300)
        plt.show()


# Dictionary containing the file_index and path
file_dict = {1: 'documents/doc1.txt',
             2: 'documents/doc2.txt',
             3: 'documents/doc3.txt',
             4: 'documents/doc4.txt',
             5: 'documents/doc5.txt',
             6: 'documents/doc6.txt',
             7: 'documents/doc7.txt',
             8: 'documents/doc8.txt',
             9: 'documents/doc9.txt'}
# List containing the words using which the vector space model is to be created
word_list = ['Automotive', 'Car', 'motorcycles', 'self-drive', 'IoT', 'hire', 'Dhoni']

# Creating class instance and calling appropriate functions
document_cluster = document_clustering(file_dict=file_dict, word_list=word_list)
+document_cluster.create_word_listing() +document_cluster.create_document_matrix() +document_cluster.cluster() \ No newline at end of file diff --git a/WEEK 9/documents/doc1.txt b/WEEK 9/documents/doc1.txt new file mode 100644 index 0000000..502e06e --- /dev/null +++ b/WEEK 9/documents/doc1.txt @@ -0,0 +1 @@ +Electric automotive maker Tesla Inc. is likely to introduce its products in India sometime in the summer of 2017. \ No newline at end of file diff --git a/WEEK 9/documents/doc2.txt b/WEEK 9/documents/doc2.txt new file mode 100644 index 0000000..ea51bda --- /dev/null +++ b/WEEK 9/documents/doc2.txt @@ -0,0 +1 @@ +Automotive major Mahindra likely to introduce driverless car \ No newline at end of file diff --git a/WEEK 9/documents/doc3.txt b/WEEK 9/documents/doc3.txt new file mode 100644 index 0000000..7d6f8b1 --- /dev/null +++ b/WEEK 9/documents/doc3.txt @@ -0,0 +1 @@ +BMW plans to introduce its own motorcycles in india \ No newline at end of file diff --git a/WEEK 9/documents/doc4.txt b/WEEK 9/documents/doc4.txt new file mode 100644 index 0000000..b07e9fb --- /dev/null +++ b/WEEK 9/documents/doc4.txt @@ -0,0 +1 @@ +Just drive, a self-drive car rental firm uses smart vehicle technology based on IoT \ No newline at end of file diff --git a/WEEK 9/documents/doc5.txt b/WEEK 9/documents/doc5.txt new file mode 100644 index 0000000..59cb413 --- /dev/null +++ b/WEEK 9/documents/doc5.txt @@ -0,0 +1 @@ +Automotive industry going to hire thousands in 2018 \ No newline at end of file diff --git a/WEEK 9/documents/doc6.txt b/WEEK 9/documents/doc6.txt new file mode 100644 index 0000000..7271709 --- /dev/null +++ b/WEEK 9/documents/doc6.txt @@ -0,0 +1 @@ +Famous cricket player Dhoni brought his priced car Hummer which is an SUV \ No newline at end of file diff --git a/WEEK 9/documents/doc7.txt b/WEEK 9/documents/doc7.txt new file mode 100644 index 0000000..62fa1f9 --- /dev/null +++ b/WEEK 9/documents/doc7.txt @@ -0,0 +1 @@ +Dhoni led india to its second world cup victory \ 
No newline at end of file diff --git a/WEEK 9/documents/doc8.txt b/WEEK 9/documents/doc8.txt new file mode 100644 index 0000000..375fd63 --- /dev/null +++ b/WEEK 9/documents/doc8.txt @@ -0,0 +1 @@ +IoT in car will lead to more safety and make driverless vehicle revolution possible \ No newline at end of file diff --git a/WEEK 9/documents/doc9.txt b/WEEK 9/documents/doc9.txt new file mode 100644 index 0000000..6729f84 --- /dev/null +++ b/WEEK 9/documents/doc9.txt @@ -0,0 +1 @@ +Sachin recommended Dhoni for the indian skipper post \ No newline at end of file diff --git a/WEEK 9/output.png b/WEEK 9/output.png new file mode 100644 index 0000000..ef2858e Binary files /dev/null and b/WEEK 9/output.png differ