-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsuperstore_main.py
253 lines (193 loc) · 10.9 KB
/
superstore_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from datetime import datetime
import time
import pandas as pd
import numpy as np
import csv
import os
# SET ENV TO RUN ON PI ~ Thanks Disavowed!
# python3 -m venv .env && source .env/bin/activate && pip install -r requirements.txt
"""
Captures a snapshot of prices from the local Superstore during the COVID-19 pandemic, for later analysis with data-analytics tools.
# HTML ELEMENTS:
#Product Class
<div class="product-tile__details"><div class="product-tile__details__info"><h3 class="product-tile__details__info__name"><a class="product-tile__details__info__name__link" href="/Food/Pantry/Baking-Ingredients/Extracts-%26-Colouring/Artificial-Vanilla-Extract/p/20430551_EA"><span class="product-name product-name--product-tile"><span class="product-name__item product-name__item--brand">No Name</span><span class="product-name__item product-name__item--name" title="Artificial Vanilla Extract">Artificial Vanilla Extract</span><span class="product-name__item product-name__item--package-size">250 mL</span></span></a></h3><div class="product-tile__details__info__text-badge"></div></div><div class="product-prices product-prices--product-tile product-prices--product-tile"><ul class="selling-price-list selling-price-list--product-tile,product-tile"><li class="selling-price-list__item"><span class="price selling-price-list__item__price selling-price-list__item__price--now-price"><span class="price__value selling-price-list__item__price selling-price-list__item__price--now-price__value">$3.48</span><span class="price__unit selling-price-list__item__price selling-price-list__item__price--now-price__unit">ea</span></span></li></ul><ul class="comparison-price-list comparison-price-list--product-tile comparison-price-list--product-tile"><li class="comparison-price-list__item"><span class="price comparison-price-list__item__price"><span class="price__value comparison-price-list__item__price__value">$1.39</span><span class="price__unit comparison-price-list__item__price__unit">/ 100mL</span></span></li></ul></div><div class="product-fulfillment-pickup-header product-fulfillment-pickup-header--product-tile"><span>Pickup only</span></div><div class="product-button-group product-button-group--product-tile product-button-group--add-to-list-button product-button-group--inactive"><button class="quantity-selector quantity-selector--update quantity-selector--horizontal 
quantity-selector--product-tile quantity-selector--add-to-cart quantity-selector--add-to-list-button" data-track="productAddToCartLocalize" data-track-product-quantity="1" data-track-link-name="add-to-cart-open" data-track-products-array="[{"productSKU":"20430551_EA","productName":"Artificial Vanilla Extract","productBrand":"No Name","productCatalog":"grocery","productVendor":null,"productPrice":"3.48","productQuantity":1,"dealBadge":null,"loyaltyBadge":"false","textBadge":null,"productPosition":null,"productOrderId":null,"productVariant":null}]"><span>Add</span></button><div class="add-to-list add-to-list--product-tile"><div class="add-to-list__flyout add-to-list__flyout--product-tile"><div class="add-to-list__flyout__scroll-bar-container"><ul class="shopping-list-list shopping-list-list--product-tile"><li class="shopping-list-list__item"><button class="shopping-list-list__create-list" tabindex="-1"><span>Create a new list</span></button></li></ul></div></div></div></div></div>
#Product Item
<span class="product-name__item product-name__item--package-size">500 g</span>
#Product name
<a class="product-tile__details__info__name__link"href="/Food/Meat-%26-Seafood/Pork/Bacon/Maple-Flavoured-Naturally-Smoked-Bacon/p/20117351_EA"><span class="product-name product-name--product-tile"><span class="product-name__item product-name__item--brand">President's Choice</span><span class="product-name__item product-name__item--name" title="Maple Flavoured Naturally Smoked Bacon">Maple Flavoured Naturally Smoked Bacon</span><span class="product-name__item product-name__item--package-size">500 g</span></span></a><span class="product-name__item product-name__item--name" title="Maple Flavoured Naturally Smoked Bacon">Maple Flavoured Naturally Smoked Bacon</span>
#Price
<span class="price__value selling-price-list__item__price selling-price-list__item__price--now-price__value">$3.48</span><ul class="selling-price-list selling-price-list--product-tile,product-tile"><li class="selling-price-list__item"><span class="price selling-price-list__item__price selling-price-list__item__price--now-price"><span class="price__value selling-price-list__item__price selling-price-list__item__price--now-price__value">$3.48</span><span class="price__unit selling-price-list__item__price selling-price-list__item__price--now-price__unit">ea</span></span></li></ul>
# Region Selector
//*[@id="site-layout"]/div[5]/div[2]/div/div/ul/li[1]/button
"""
# Category index page; the aisle links are scraped from it below.
url1 = "https://www.realcanadiansuperstore.ca/Shop-by-Category/c/017377000000"
# Site root; each scraped aisle href is appended to it to build an aisle URL.
url0 = "https://www.realcanadiansuperstore.ca/"
# Accumulators shared by every aisle: product names, raw price strings, and
# the cleaned price strings written to the CSV.
ITEM = []
PRICE = []
PRICE_CLEAN = []
RUN_ON_PI = False # Set variable if running on pi
# Environment switch: any working directory other than the hard-coded
# development checkout is treated as the headless (Raspberry Pi) setup.
if os.getcwd() != '/home/girard/Scripts/Python/WebScraping/superstore_scraper':
    from pyvirtualdisplay import Display
    RUN_ON_PI = True
    display = Display(visible=0, size=(1024, 768)) # For headless RPi
    # Started before Chrome is launched — presumably so the browser has a
    # display to attach to; confirm against pyvirtualdisplay docs.
    display.start()
    print("Running headless. Display started.")
    #DRIVER_PATH = '()' # edit driver paths; make explicit
    driver = webdriver.Chrome()
    # Seconds passed to time.sleep() wherever the script pauses for the page.
    WAIT_TIME = 20
    # WebDriverWait(driver, timeout, poll_frequency): 10 s timeout, 5 s poll.
    wait = WebDriverWait(driver,10,5)
else:
    #DRIVER_PATH = '/home/girard/Scripts/Python/WebScraping/WebDriver/chromedriver' # edit driver paths; make explicit
    # NOTE(review): passing the chromedriver path positionally looks like the
    # Selenium 3 calling style — confirm the pinned Selenium version.
    driver = webdriver.Chrome('/home/girard/Scripts/Python/WebScraping/WebDriver/chromedriver')
    # Seconds passed to time.sleep() wherever the script pauses for the page.
    WAIT_TIME = 10
    # WebDriverWait(driver, timeout, poll_frequency): 10 s timeout, 2 s poll.
    wait = WebDriverWait(driver,10,2)
#driver = webdriver.Chrome(DRIVER_PATH)
# Open the category index and scroll to the bottom of the page.
webpage = driver.get(url1)
scroll = driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
# Send ESC to the page (presumably to dismiss an overlay) before reading it.
webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()

# Give the page WAIT_TIME seconds to settle, then parse the rendered markup.
time.sleep(WAIT_TIME)
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

# Every "browse by aisle" anchor contributes its href to the aisle list.
AISLE = [
    anchor['href']
    for anchor in soup.find_all('a', class_="browse-by-aisle__list__item")
]

# Report the aisles that are about to be scraped, one per line.
print("Scraping the following aisles: ")
for aisle_href in AISLE:
    print(aisle_href)
def Click_Event():
    """Scroll to the page bottom and click the 'load-more-button' once.

    Waits (via the module-level ``wait``) for the element with class
    ``load-more-button`` to become clickable. If that first wait times out,
    sleeps ``WAIT_TIME`` seconds, scrolls again and retries once. The element
    returned by the wait is the one that gets clicked.

    Selenium errors (timeout on the retry, missing/stale/obscured element)
    are printed and swallowed so the caller's loading loop keeps going.
    Non-Selenium exceptions, notably ``KeyboardInterrupt``, propagate to the
    caller.
    """
    # Local import: the file's import block only pulls in the two specific
    # exception classes, and this handler needs their common base class.
    from selenium.common.exceptions import WebDriverException

    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight)"
    load_more = (By.CLASS_NAME, 'load-more-button')

    try:
        try:
            driver.execute_script(scroll_to_bottom)
            button = wait.until(EC.element_to_be_clickable(load_more))
        except TimeoutException:
            # The button may simply not have rendered yet: pause, retry once.
            print("Loading...")
            time.sleep(WAIT_TIME)
            driver.execute_script(scroll_to_bottom)
            button = wait.until(EC.element_to_be_clickable(load_more))
        # Scroll again so the button is in view at the moment of the click.
        driver.execute_script(scroll_to_bottom)
        button.click()
        print("...")
    except WebDriverException as err:
        # Best effort: a failed click must not abort the whole scrape.
        print(err)
def Max_Load():
    """Return the first all-digit token of the pagination text.

    Reads the module-level ``RESULTS`` element, splits its ``.text`` on
    whitespace and returns the first token consisting only of digits, as a
    string (Count_Load converts it with ``int``). Returns ``None`` when the
    text contains no such token.
    """
    for token in RESULTS.text.split():
        if token.isdigit():
            return token
    return None
def Load_Count():
    """Return the upper bound of the leading "A-B" range in the pagination text.

    Reads the module-level ``RESULTS`` element, takes the part of its
    ``.text`` before the first space, splits that on ``-`` and returns the
    second piece as a string (Count_Load converts it with ``int`` and uses it
    as the per-click step).

    Raises:
        IndexError: if the leading token contains no ``-``.
    """
    leading_token = RESULTS.text.split(" ")[0]
    return leading_token.split("-")[1]
def Count_Load():
    """Call Click_Event() repeatedly until the counted items reach MAX_LOAD.

    Reads the module-level ``MAX_LOAD`` (item ceiling) and ``LOAD_COUNT``
    (items credited per click), both convertible with ``int``. The counter
    advances by ``LOAD_COUNT`` after every Click_Event() call, whether or not
    that call actually loaded anything.

    A Ctrl-C during loading stops the loop early instead of ending the
    program, so the caller can continue with whatever has been loaded.
    """
    ceiling = int(MAX_LOAD)  # Sets option to iterate over ALL items.
    print("Loading items up to " + str(ceiling) + " items...")

    # Converted once: the step does not change while the loop runs.
    step = int(LOAD_COUNT)
    if step <= 0:
        # A non-positive step would never reach the ceiling (endless loop).
        print("Invalid load step " + str(step) + "; skipping load-more clicks.")
        return

    counted = 0
    try:
        while counted < ceiling:
            Click_Event()
            counted += step
    except KeyboardInterrupt:
        print("Interrupted; continuing with the items loaded so far.")
def Soup_Extraction(ITEM, PRICE):
    """Append the name and raw price of every product tile on the current page.

    Sleeps ``WAIT_TIME`` seconds, scrolls to the bottom, parses
    ``driver.page_source`` and walks every ``div.product-tile__details``.
    For each tile the text of the first child of the name link goes to
    ``ITEM`` and the text of the first child of the selling-price list, minus
    its last two characters, goes to ``PRICE``.

    A tile missing either piece is skipped entirely, so the two lists always
    grow together and stay index-aligned.

    Args:
        ITEM: list that receives product name strings (mutated in place).
        PRICE: list that receives raw price strings (mutated in place).
    """
    print("Soup extraction...")
    time.sleep(WAIT_TIME)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")  # Different soup flavour

    for tile in soup.find_all('div', class_='product-tile__details'):
        name_link = tile.find('a', class_='product-tile__details__info__name__link')
        price_list = tile.find('ul', class_='selling-price-list')
        # find()/findChild() return None when nothing matches; resolve both
        # sides before appending so a partial tile cannot misalign the lists.
        name_node = name_link.findChild() if name_link is not None else None
        price_node = price_list.findChild() if price_list is not None else None
        if name_node is None or price_node is None:
            continue
        ITEM.append(name_node.text)
        PRICE.append(price_node.text[:-2])
def Price_Clean(PRICE, PRICE_CLEAN):
    """Parse PRICE into PRICE_CLEAN by trimming trailing suffixes.

    For each raw string, in this order:
      * if it ends with ``)``, the last six characters are dropped;
      * else if a ``/`` sits 2, 3 or 4 characters from the end, everything
        from that ``/`` onward is dropped;
      * otherwise the string is kept unchanged.

    Strings too short for a given check (including the empty string) simply
    fail that check instead of raising IndexError.

    Args:
        PRICE: iterable of raw price strings.
        PRICE_CLEAN: list that receives one cleaned string per input
            (mutated in place).
    """
    print("Processing prices...")
    for raw in PRICE:
        if raw.endswith(')'):
            PRICE_CLEAN.append(raw[:-6])
            continue
        for width in (2, 3, 4):
            # The length guard keeps the negative index in range.
            if len(raw) >= width and raw[-width] == '/':
                PRICE_CLEAN.append(raw[:-width])
                break
        else:
            # No suffix pattern matched: keep the value as scraped.
            PRICE_CLEAN.append(raw)
    print("Processed " + str(len(PRICE_CLEAN)) + " items.")
# Progress banner printed once, at module run time, before the aisles are visited.
print("\nAISLE[] created. Processing...\n") # Iterate over aisles
def Aisle():
    """Visit every aisle in AISLE, load its products and extract them.

    For each aisle href the page is opened, ESC is sent to the page, and the
    ``pagination`` element is looked up. When it is found, ``RESULTS``,
    ``MAX_LOAD`` and ``LOAD_COUNT`` are published as module globals (the
    helper functions read them) and Count_Load() clicks "load more" until the
    aisle is fully loaded. Whether or not that succeeds, Soup_Extraction()
    runs on whatever the page currently shows, appending to ITEM and PRICE.
    """
    global RESULTS
    global MAX_LOAD
    global LOAD_COUNT
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    for aisle in AISLE:  # Use [:] for testing
        # urljoin copes with hrefs that start with "/" (no "//" in the
        # result) and with hrefs that are already absolute.
        aisle_url = urljoin(url0, aisle)
        driver.get(aisle_url)
        print("Loading ---> " + aisle_url + "\n")
        time.sleep(2)
        webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
        time.sleep(2)
        try:
            RESULTS = driver.find_element(By.CLASS_NAME, 'pagination')
            print("RESULTS found.")
            MAX_LOAD = Max_Load()
            LOAD_COUNT = Load_Count()
            Count_Load()
            print("Aisle loaded.")
        except NoSuchElementException as err:  # Catch Aisle with different data
            print(err)
            print("Continue with making soup anyway...")
        except (TypeError, ValueError, IndexError) as err:
            # Pagination text in an unexpected shape (no numeric total, no
            # "A-B" range): scrape what is already on the page.
            print(err)
            print("Continue with making soup anyway...")
        finally:
            Soup_Extraction(ITEM, PRICE)
            print("Downloaded " + aisle_url + "\n")
            print(str(len(ITEM)) + " items in ITEM with " + str(len(PRICE)) + " matching PRICES.")
# Run the scrape: visits each entry of AISLE and fills ITEM / PRICE.
Aisle()
def filename(ITEM, PRICE, PRICE_CLEAN):
    """Clean the prices and dump the results to the next free cartN.csv.

    Runs Price_Clean(PRICE, PRICE_CLEAN), pairs ITEM with PRICE_CLEAN into a
    two-column dataframe ('Item', 'Price'), stamps every row with today's
    date in a 'Date' column, and writes it without the index to the first
    ``cart<N>.csv`` (N = 0, 1, 2, ...) that does not exist yet in the current
    directory. The chosen name is published as the module global FILENAME.
    """
    global FILENAME

    today = datetime.now().strftime("%Y-%m-%d")

    Price_Clean(PRICE, PRICE_CLEAN)  # Parse PRICE into PRICE_CLEAN
    rows = list(zip(ITEM, PRICE_CLEAN))
    df = pd.DataFrame(rows, columns=['Item', 'Price'])
    df['Date'] = today
    print("Dataframe created...")

    # Probe cart0.csv, cart1.csv, ... until an unused name turns up.
    suffix = 0
    while os.path.exists("cart%s.csv" % suffix):
        suffix += 1

    print("New file created: cart%s.csv" % suffix)
    FILENAME = ("cart%s.csv" % suffix)
    df.to_csv(FILENAME, encoding='utf-8', index=False)
# Export the scraped data, then always release the browser (and the virtual
# display, when one was started) even if the export fails.
try:
    filename(ITEM, PRICE, PRICE_CLEAN)
finally:
    # Without quit() the Chrome/chromedriver processes outlive the script.
    driver.quit()
    if RUN_ON_PI:
        print("Stopping display.")
        display.stop()  # Stop virtual display
print("Done. Scrape successful.")