-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsummarize.py
60 lines (47 loc) · 2.18 KB
/
summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import spacy
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
# Load SpaCy model for sentence tokenization and word vector representation
nlp = spacy.load('en_core_web_md') # Use medium or large model for word vectors
# Function to tokenize text into sentences
def tokenize_sentences(text):
doc = nlp(text)
return [sent.text.strip() for sent in doc.sents]
# Function to compute sentence similarity matrix
def sentence_similarity(sentences):
# Compute the embeddings of each sentence
sentence_embeddings = [nlp(sent).vector for sent in sentences]
# Compute the cosine similarity matrix
sim_matrix = np.zeros((len(sentences), len(sentences)))
for i in range(len(sentences)):
for j in range(len(sentences)):
if i != j:
sim_matrix[i][j] = cosine_similarity(
sentence_embeddings[i].reshape(1, -1),
sentence_embeddings[j].reshape(1, -1)
)[0][0]
return sim_matrix
# Function to rank sentences using TextRank algorithm
def rank_sentences(sentences, similarity_matrix):
# Create a graph and apply PageRank
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)
# Sort the sentences by score
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
return ranked_sentences
# Function to summarize the text by selecting top N ranked sentences
def summarize(text, num_sentences=3):
sentences = tokenize_sentences(text)
sim_matrix = sentence_similarity(sentences)
ranked_sentences = rank_sentences(sentences, sim_matrix)
# Extract the top N sentences as summary
summary = " ".join([ranked_sentences[i][1] for i in range(num_sentences)])
return summary
# Example usage:
text = """Alice was born in London. She met Bob in Paris, where they discussed quantum mechanics.
Alice works for Google, while Bob is a researcher at MIT. Alice loves hiking in the mountains,
while Bob enjoys surfing. They both have a keen interest in AI and robotics."""
# Summarize the text
summary = summarize(text, num_sentences=2)
print("Summary:", summary)