-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscore_cleaning.py
63 lines (48 loc) · 1.42 KB
/
score_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
from nltk.stem import WordNetLemmatizer
import numpy as np
# for regular expressions
import re
#for configuration
import config
def toxicity_score(comment):
    '''
    Check a comment for toxicity.

    input: string
    output: float between 0 and 1 — fraction of words that appear in
            config.toxic_words after stripping non-letters and lowercasing.
    '''
    # split() (no argument) collapses runs of whitespace, so repeated or
    # leading/trailing spaces no longer create empty "words" that would
    # inflate the denominator and dilute the score.
    words = comment.split()
    # Guard: an empty or whitespace-only comment has no words to score.
    if not words:
        return 0.0
    num_toxic_words = 0
    for word in words:
        # keep only alphabet characters, then normalise case so the
        # lookup against config.toxic_words is case-insensitive
        word = re.sub('[^a-zA-Z]+', '', word).lower()
        if word in config.toxic_words:
            num_toxic_words += 1
    return num_toxic_words / len(words)
def stemSentence(comment):
    '''
    Stem every word of a comment with the Porter stemmer.

    input: string
    output: string of stemmed tokens, each followed by a single space
            (same format as the original implementation).
    '''
    # Bug fix: the original referenced a global `porter` that is never
    # defined anywhere in this file, raising NameError at runtime.
    stemmer = PorterStemmer()
    token_words = word_tokenize(comment)
    stem_sentence = []
    for word in token_words:
        # Porter stemming method is used
        stem_sentence.append(stemmer.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)
def lem(comment):
    '''
    Tokenise a comment, drop punctuation tokens, and lemmatise each word.

    input: string
    output: list of lemmatised word tokens (punctuation removed).
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    # punctuation tokens to drop from the output
    punctuations = "?:!.,;"
    # Bug fix: the original called nltk.word_tokenize but `nltk` itself is
    # never imported in this file (only from-imports are) -> NameError.
    # word_tokenize is already imported at the top of the file.
    sentence_words = word_tokenize(comment)
    # Bug fixes: the original (a) removed items from the list while
    # iterating it, which skips the token after every removal, and
    # (b) constructed the lemmatizer but never applied it. A comprehension
    # filters safely and actually lemmatises each remaining word.
    return [
        wordnet_lemmatizer.lemmatize(word)
        for word in sentence_words
        if word not in punctuations
    ]