main.py
import urllib.request
import urllib.error
#import pypandoc
import sys
import time
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

# NLTK stopword lists are keyed by lowercase language names.
stop_words = set(stopwords.words("english"))
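# Note: word_tokenize and the stopword list above rely on NLTK data packages.
# A one-time download (assumed setup, not part of this script) makes those
# calls work:
#   import nltk
#   nltk.download('punkt')
#   nltk.download('stopwords')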
def visible(element):
    # Keep only text nodes that belong to the visible page body.
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element.encode('utf-8'))):
        return False
    return True
def valid_link_text(url, query_flag=1):
    # Fetch the page; with query_flag=1 return its visible text,
    # otherwise just confirm the URL is reachable.
    try:
        cleaned = urllib.request.urlopen(url)
    except urllib.error.URLError:
        print("404 : Page Not Found")
        return False
    if query_flag == 1:
        soup = BeautifulSoup(cleaned, "lxml")
        data = soup.findAll(text=True)
        result = filter(visible, data)
        time.sleep(5)  # throttle requests
        return list(result)
    else:
        return True
def save_data(data, data_name=None, question_name=None, from_extension=None, to_extension=None):
    if from_extension is not None:
        print("unavailable now !")
    elif to_extension is not None:
        # Use the answer index as the file suffix; fall back to "blog" for a
        # single answer / blog grab that has no index.
        number = data_name
        if number is None:
            number = "blog"
        text_file = open("texts/" + "answer_words-" + number + to_extension, "w")
        text_file.write(question_name)
        text_file.write("\n")
        # Skip the top 10 lines of page chrome, then stop at the first
        # break-point marker (temporary heuristic).
        break_point = ["Sitemap:", "Related Questions"]
        for line in data[10:]:
            if line in break_point:
                break
            words = word_tokenize(line)
            for word in words:
                if word not in stop_words:
                    content = re.sub(r'\s+', ' ', word)  # condense all whitespace
                    content = re.sub(r'[^A-Za-z ]+', '', content)  # remove non-alpha chars
                    text_file.write(str(content).lower())
                    text_file.write("\n")
        text_file.close()
    else:
        print(" Invalid conversion ! ")
def get_answer_blog():
    start_url = input("Enter answer / blog url :")
    text_data = valid_link_text(start_url)
    if text_data is False:
        return
    #print(text_data)
    save_data(text_data, question_name=start_url, to_extension=".txt")
    print("grabbed answer / blog successfully")
def get_latest_questions(url):
    raw = urllib.request.urlopen(url)
    soup = BeautifulSoup(raw, "lxml")
    data = soup.findAll('span', {"class": "rendered_qtext"})
    result = filter(visible, data)
    time.sleep(2)  # throttle requests
    data = list(result)
    question_bag = []
    for it in data:
        txt = str(it.text)
        txt = txt.lower()  # str.lower() returns a new string; it does not modify in place
        if "?" in txt:
            txt = txt[:-1].replace(' ', '-')  # TODO: use re here to refine the slug
            question_bag.append("http://quora.com/" + txt)
    print("grabbed question_bag successfully")
    return question_bag
def get_latest_answers():
    start_url = input("Enter profile link :")
    text_data = valid_link_text(start_url, query_flag=0)
    if text_data is False:
        return
    questions = get_latest_questions(start_url)
    size = len(questions)
    for question in range(size):
        validity = valid_link_text(questions[question])
        print(questions[question])
        print("got - #" + str(question + 1))
        if validity is not False:
            save_data(validity, data_name=str(question + 1), to_extension=".txt", question_name=questions[question])
        else:
            print("Validate question_bag link index #" + str(question + 1))
    print("grabbed answers successfully")
if __name__ == "__main__":
    if len(sys.argv) == 2:
        if sys.argv[1] == "pick_answer":
            get_answer_blog()
        elif sys.argv[1] == "pick_profile":
            get_latest_answers()
    else:
        print("pick_profile - pick answers from profile")
        print("pick_answer - pick answer directly")