-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
143 lines (100 loc) · 4.01 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
# coding: utf-8
# ## Web Scraper
# Open the webpage with browser emulator and read the html file:
# In[1]:
import requests
from bs4 import BeautifulSoup
# Fetch the first results page. A browser-like User-agent header is sent —
# presumably the site rejects the default python-requests client; TODO confirm.
r = requests.get("http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/", headers={'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'})
c = r.content
soup = BeautifulSoup(c,"html.parser")
# Each listing lives in a <div class="propertyRow">.
# NOTE(review): `all` shadows the built-in all(); later cells read this
# module-level binding, so the name is left unchanged here.
all = soup.find_all("div", {"class":"propertyRow"})
# Notebook preview expression: first listing's price with newlines and spaces
# stripped. The value is discarded when this runs as a plain script.
all[0].find("h4",{"class":"propPrice"}).text.replace("\n","").replace(" ","")
# Iterate over the listings on the first page and extract the fields we want:
# In[12]:

def _bold_text(item, css_class):
    """Return the text of the <b> nested in *item*'s span with *css_class*.

    Returns None when the span or the <b> is missing, mirroring the original
    try/except fallback without a bare `except:` that would also swallow
    KeyboardInterrupt/SystemExit and unrelated bugs.
    """
    span = item.find("span", {"class": css_class})
    if span is None:
        return None
    bold = span.find("b")
    return bold.text if bold is not None else None

l = []
for item in all:
    d = {}
    # "Adress" is a typo, but it is kept: these keys become the CSV column
    # names written downstream, so renaming would change the output schema.
    d["Adress"] = item.find_all("span", {"class": "propAddressCollapse"})[0].text
    d["Locality"] = item.find_all("span", {"class": "propAddressCollapse"})[1].text
    # Price text arrives padded with newlines/spaces; strip both.
    d["Price"] = item.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", "")
    d["Beds"] = _bold_text(item, "infoBed")
    d["Area"] = _bold_text(item, "infoSqFt")
    d["Full Bath"] = _bold_text(item, "infoValueFullBath")
    d["Half Bath"] = _bold_text(item, "infoValueHalfBath")
    # "Lot Size" is a labelled feature pair (featureGroup -> featureName)
    # inside the columnGroup divs rather than a dedicated span.
    for column_group in item.find_all("div", {"class": "columnGroup"}):
        for feature_group, feature_name in zip(column_group.find_all("span", {"class": "featureGroup"}),
                                               column_group.find_all("span", {"class": "featureName"})):
            if "Lot Size" in feature_group.text:
                d["Lot Size"] = feature_name.text
    l.append(d)
# Import the data in a data frame and save them in a csv file:
# In[15]:
import pandas
# In[17]:
# Tabulate the first-page listing dicts; missing fields become NaN cells.
df = pandas.DataFrame(data=l)
# In[19]:
# Persist to disk (this file is overwritten by the multi-page cell below).
df.to_csv(path_or_buf="Output.csv")
# Extract the information from every results page:
# In[44]:
base_url = "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="

def _span_text_at(item, css_class, index):
    """Return .text of the index-th span with *css_class* in *item*, or None.

    Replaces the original bare `except:` fallback with an explicit length
    check, so only the expected missing-element case yields None.
    """
    spans = item.find_all("span", {"class": css_class})
    return spans[index].text if len(spans) > index else None

def _nested_bold_text(item, css_class):
    """Return the text of the <b> inside *item*'s span with *css_class*, or None."""
    span = item.find("span", {"class": css_class})
    if span is None:
        return None
    bold = span.find("b")
    return bold.text if bold is not None else None

def _price_of(item):
    """Return the listing price with newlines/spaces stripped, or None if absent."""
    tag = item.find("h4", {"class": "propPrice"})
    if tag is None:
        return None
    return tag.text.replace("\n", "").replace(" ", "")

l = []
# Page count comes from the last pagination link of the first page fetched above.
page_nr = soup.find_all("a", {"class": "Page"})[-1].text
for page in range(0, int(page_nr) * 10, 10):
    # Pages are addressed by a result offset in steps of 10 (t=0&s=<offset>.html).
    url = base_url + str(page) + ".html"
    print(url)
    r = requests.get(url, headers={'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'})
    # Parse into a local name instead of rebinding the module-level `soup`
    # (and no longer rebinding `all`, which shadows the builtin).
    page_soup = BeautifulSoup(r.content, "html.parser")
    for item in page_soup.find_all("div", {"class": "propertyRow"}):
        d = {
            # "Adress" typo kept on purpose: it is the CSV column name.
            "Adress": _span_text_at(item, "propAddressCollapse", 0),
            "Locality": _span_text_at(item, "propAddressCollapse", 1),
            "Price": _price_of(item),
            "Beds": _nested_bold_text(item, "infoBed"),
            "Area": _nested_bold_text(item, "infoSqFt"),
            "Full Bath": _nested_bold_text(item, "infoValueFullBath"),
            "Half Bath": _nested_bold_text(item, "infoValueHalfBath"),
        }
        # "Lot Size" lives in labelled featureGroup/featureName span pairs.
        for column_group in item.find_all("div", {"class": "columnGroup"}):
            for feature_group, feature_name in zip(column_group.find_all("span", {"class": "featureGroup"}),
                                                   column_group.find_all("span", {"class": "featureName"})):
                if "Lot Size" in feature_group.text:
                    d["Lot Size"] = feature_name.text
        l.append(d)
# In[45]:
import pandas
# Tabulate the listings accumulated from every page.
df = pandas.DataFrame(data=l)
# In[47]:
# Overwrites the earlier first-page-only Output.csv with the full result set.
df.to_csv(path_or_buf="Output.csv")