-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathremove_common_words.py
executable file
·67 lines (46 loc) · 1.49 KB
/
remove_common_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python3
"""Remove common words from text.

Requires requests, nltk, and BeautifulSoup:
`pip install nltk requests beautifulsoup4`

The NLTK stop-word corpus must also be downloaded once:
`python -m nltk.downloader stopwords`
"""
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from collections import Counter
import string
import re
from pprint import pprint
def get_url_text(url, selector, timeout=10):
    """Download a page and return the text of the first element matching a selector.

    :param url: Address of the page to fetch.
    :param selector: CSS selector identifying the element whose text is wanted.
    :param timeout: Seconds to wait for the server before giving up; passed
        straight to ``requests.get``.
    :return: Text content of the first element matched by ``selector``.
    :raises requests.HTTPError: If the server answers with a 4xx/5xx status.
    :raises IndexError: If nothing on the page matches ``selector``.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled server.
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of scraping the error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    matches = soup.select(selector)
    if not matches:
        # Same exception type a bare ``[0]`` lookup raises, but with context.
        raise IndexError(f'No element matches selector {selector!r} at {url}')
    return matches[0].text
def remove_common_words(text):
    """Strip HTML tags and punctuation from text.

    Despite its name, this function does not filter any words itself; it only
    cleans the text so that it can be split into words afterwards.

    :param text: Raw text, possibly containing HTML markup.
    :return: ``text`` with HTML tags and every ``string.punctuation``
        character removed.
    """
    # Tags have to go first: stripping punctuation removes '<' and '>', after
    # which a tag can no longer be recognised and its name leaks into the text.
    # Requiring a tag-like character after '<' keeps plain comparisons such as
    # "3 < 5 and 7 > 2" from being swallowed as if they were markup.
    text = re.sub(r'</?[A-Za-z!?][^>]*>', '', text)
    # translate() deletes each punctuation character literally, so characters
    # that are special inside a regex character class need no escaping.
    return text.translate(str.maketrans('', '', string.punctuation))
def main():
    """Print the 20 most frequent words that are not English stop words.

    The text comes from the page at ``url`` (element chosen by ``selector``)
    when the ``'URL'`` placeholder has been replaced; otherwise a fixed
    sample sentence is used.
    """
    url = 'URL'
    selector = 'SELECTOR'

    if url == 'URL':
        # Placeholder was never filled in, so fall back to the sample text.
        print('No url provided. Using sample text.')
        raw_text = 'Need URL provided or text set.'
    else:
        raw_text = get_url_text(url=url, selector=selector)

    english_stopwords = set(stopwords.words('english'))
    cleaned = remove_common_words(raw_text)

    # Stop words are matched case-insensitively, but counted words keep their case.
    frequencies = Counter(
        word for word in cleaned.split()
        if word.lower() not in english_stopwords
    )
    pprint(frequencies.most_common(20))
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()