-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcardekho_dynamic_data_scraping.py
66 lines (65 loc) · 1.7 KB
/
cardekho_dynamic_data_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
'''
Import the necessary libraries
'''
# !pip install selenium
from selenium import webdriver
import time
import pandas as pd
from bs4 import BeautifulSoup as soup
'''
Define the browser/driver, open the desired webpage, auto-scroll to force
the lazy-loaded listings to render, and capture the final DOM as HTML.
'''
# NOTE(review): the positional executable-path argument was removed in
# Selenium 4 (it expects a Service object there) — confirm the installed
# selenium version before upgrading this call.
driver = webdriver.Chrome(
    'D:\\Softwares\\chromedriver_win32\\chromedriver.exe'
)
try:
    driver.get('https://www.cardekho.com/filter/new-cars')
    # Scroll to the bottom, then back up to ~73% of the page height, 20
    # times.  The bounce re-triggers the site's infinite-scroll loader;
    # the 1s sleeps give the new cards time to render.
    for _ in range(20):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, \
        (document.body.scrollHeight)*0.73)")
        time.sleep(1)
    # Snapshot the fully-rendered DOM (not the original page source).
    res = driver.execute_script("return document.documentElement.outerHTML")
finally:
    # Always close the browser, even if navigation or scrolling raised —
    # otherwise the chromedriver/Chrome processes are orphaned.
    driver.quit()
'''
Parse the captured HTML, pull out name / price / engine / mileage for each
car card, and save the result to carScrap.csv.
'''
psoup = soup(res, "lxml")
# Each result card is a div carrying exactly this compound class string.
# (The original passed a *set* as the attrs argument, relying on a bs4
# fallback; an explicit {"class": ...} dict is the documented form.)
containers = psoup.findAll(
    "div",
    {"class": "gsc_col-md-12 gsc_col-sm-12 gsc_col-xs-12 append_list"}
)
cars = []
prices = []
engines = []
mileages = []
for card in containers:
    # Car name comes from the thumbnail's alt text.  This append was
    # commented out in the original, which left `cars` empty while the
    # other three lists grew — pd.DataFrame then raised
    # "All arrays must be of the same length".  Guard against cards
    # without an image so the four lists always stay the same length.
    img = card.find("img")
    if img is not None and img.has_attr("alt"):
        cars.append(img["alt"])
    else:
        cars.append(" ")
    # Price text looks like "Rs. X.XX Lakh*onwards" — keep everything
    # before the first '*'.
    price_divs = card.findAll("div", {"class": "price"})
    prices.append(price_divs[0].text.split("*", 1)[0])
    # Mileage and engine displacement live in titled spans inside the
    # card's "dotlist" div; a blank placeholder keeps the lists aligned
    # when a spec is missing.
    dotlist = card.findAll("div", {"class": "dotlist"})
    mileage_spans = dotlist[0].findAll("span", {"title": "Mileage"})
    mileages.append(mileage_spans[0].text if mileage_spans else " ")
    engine_spans = dotlist[0].findAll(
        "span", {"title": "Engine Displacement"}
    )
    engines.append(engine_spans[0].text if engine_spans else " ")
df = pd.DataFrame(
    {
        'Car Name': cars,
        'Price': prices,
        'Engine': engines,
        'Mileage': mileages
    }
)
df.to_csv('carScrap.csv', index=False, encoding='utf-8')