wwtbht_clean.py
#!/usr/bin/env python
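"""Clean up raw source texts for further processing.

Reads every file in the source_text/ directory, strips XML/HTML tags,
tokenizes with NLTK, filters out stop words, punctuation, stray single
letters, and tokens not found in the NLTK word list, then writes the
surviving tokens (one per line) to <filename>_new in the working directory.

Note: this assumes the NLTK data used below ('stopwords', 'words', and the
'punkt' tokenizer models) has already been downloaded, e.g. via nltk.download().
"""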
import os
import re

import nltk
from nltk import word_tokenize

# English stop words, punctuation/whitespace marks to strip, stray single
# letters left over from tokenization, and the NLTK reference vocabulary.
stop_words = nltk.corpus.stopwords.words('english')
punctuations = [".", ". ", ",", ", ", "!", "! ", "?", "? ", ";", "; ", "‘", "‘ ", "’", "’ ", ":", ": ", "'", "' ", "—", "— ", "'s", "'s ", "(", "( ", ")", ") ", " ", " ", "\n", "\t", "-", "- "]
nonsense_letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
words = set(nltk.corpus.words.words())
# clean xml tags
# clean "poem" tags for separate export
def clean_html(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
for filename in os.listdir('source_text'):
    with open(os.path.join('source_text', filename)) as currentfile:
        content = currentfile.read().lower()
    no_html_content = clean_html(content)
    tokens = word_tokenize(no_html_content)

    real_tokens = []
    for i in tokens:
        # drop stop words, punctuation, xml remnants, and stray single letters
        if i in stop_words or "xml" in i or i in punctuations or i in nonsense_letters:
            continue
        # isalpha() below would reject anything containing "." or "—", so
        # split tokens fused around those characters first and keep both halves
        if "." in i:
            place = i.index(".")
            real_tokens.append(i[:place])
            real_tokens.append(i[place + 1:])
            continue
        if "—" in i and i.index("—") != len(i) - 1:
            place = i.index("—")
            real_tokens.append(i[:place])
            real_tokens.append(i[place + 1:])
            continue
        if i.isalpha():
            real_tokens.append(i)

    # strip any leftover punctuation and drop tokens that end up empty or
    # whitespace-only, building a new list rather than mutating while iterating
    stripped_tokens = []
    for i in real_tokens:
        for punctuation in punctuations:
            i = i.replace(punctuation, "")
        if i.strip():
            stripped_tokens.append(i)

    # keep only tokens that appear in the NLTK word list
    real_tokens = [i for i in stripped_tokens if i in words]

    new_file_name = filename + "_new"
    with open(new_file_name, "a+") as new_file:
        for token in real_tokens:
            new_file.write("%s \n" % token)
    print("Created and appended for new %s file" % filename)