Lite-N-Excelsy.py
#
# This script was written with GitHub Copilot.
#
# It scrapes the nutrition information from the Lite-N-Easy website and saves it to an Excel file.
# It was made for the purpose of ranking the meals by Sodium content for low-sodium diets.
#
# DISCLAIMER:
# It is the sole responsibility of you, the user, to check the official Lite-N-Easy HTML pages
# for accurate and up-to-date information regarding allergens and nutrition values.
# The spreadsheet produced by this script has not been cross-referenced with the official
# HTML pages for accuracy and is intended as a guide for comparison purposes only.
#
# Any deaths resulting from the use of this script or the generated spreadsheet are not the
# responsibility of the author, and no correspondence will be entered into.
#
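# REQUIREMENTS:
# This is a plain Python 3 script with no command-line arguments; the workbook is written
# to Lite-N-Easy.xlsx in the current working directory.
# It depends on the third-party packages xlsxwriter, requests and lxml
# (typically installed with: pip install xlsxwriter requests lxml).
#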
import xlsxwriter
import requests
import os
from lxml import html
import re
#
# CONFIGURATION
#
# set output file name
output_file = 'Lite-N-Easy.xlsx'
# set menu index URL
indexURL = 'https://www.liteneasy.com.au/ingredients-nutrition'
# set header row (change at own risk)
headers = ['No.', 'Name', 'Serving size', 'Energy', 'Protein', 'Fat, Total',
           'Saturated Fat', 'Carbohydrate', 'Sugars', 'Fibre', 'Sodium', 'Ingredients']
#
# FUNCTIONS
#
# function to fetch the HTML content from a URL and return a parsed HTML tree
def fetch_html_tree(url):
    # a timeout is used so the request fails rather than hangs if the site is unresponsive
    response = requests.get(url, timeout=30)
    html_content = response.text
    tree = html.fromstring(html_content)
    return tree
# fetch the HTML tree of index page
indexTree = fetch_html_tree(indexURL)
# initialise an array for the menu pages
pages = []
# get all links inside a table
iMatches = indexTree.xpath('//table//a')
# for each link in the table
for a in iMatches:
    # get the text from the link, flattening child elements
    linkTitle = ' '.join(a.itertext()).strip()
    # first clean the href attribute by stripping any parameters
    linkURL = a.attrib['href'].split('?')[0]
    # if the linkURL ends with .html and the title is not blank
    if linkURL.endswith('.html') and linkTitle:
        # append a dictionary with the title and URL to the pages array
        pages.append({'Title': linkTitle, 'URL': linkURL})
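# Illustrative shape of the resulting pages list (the URL below is a made-up placeholder,
# not a real menu page address):
#   [{'Title': 'Dinners', 'URL': 'https://www.liteneasy.com.au/example-menu.html'}, ...]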
# create a new workbook
wb = xlsxwriter.Workbook(output_file)
# for each page in the array
for page in pages:
    # initialise a blank array to store the data
    data = []
    # create a new worksheet named after the Title of the page
    ws = wb.add_worksheet(page['Title'])
    # write the header row to the worksheet
    ws.write_row(0, 0, headers)
    # format the header row as bold
    bold = wb.add_format({'bold': True})
    ws.set_row(0, None, bold)
    # edit the 'Sodium' header to append ' (mg)' as the units are stripped to allow sorting
    ws.write(0, 10, 'Sodium (mg)', bold)
    # get HTML from current menu page
    currentURL = page['URL']
    currentTree = fetch_html_tree(currentURL)
    # Iterate over td elements with an h2 inside a span.IngredName
    # this corresponds to each meal on the page
    iMatches = currentTree.xpath('//td[span[@class="IngredName"]//h2]')
    for td in iMatches:
        # create a blank dictionary to store the data for this meal
        row = {}
        # find an h2 element inside the current table cell
        name = td.find('.//h2')
        # extract the text from the h2 element and store it in a temporary variable
        nameText = name.text if name is not None else ''
        # using regex, check if the name starts with a number
        nMatches = re.match(r'^(\d+)\s+', nameText)
        # if the name starts with a number
        if nMatches:
            # set the No. key to the number
            row['No.'] = nMatches.group(1)
            # set the Name key to the name without the number
            row['Name'] = nameText[nMatches.end():].strip()
        else:
            # otherwise set No. to blank
            row['No.'] = ''
            # set the Name key to the name
            row['Name'] = nameText
        # find a span.Ingred_Serving_Contents element inside the td
        # this is where the serving size lives
        serving_size = td.find('.//span[@class="Ingred_Serving_Contents"]')
        # extract the text from the span element and set Serving size to this value
        row['Serving size'] = serving_size.text if serving_size is not None else None
        # find all td elements inside a table in the current td element
        # this is where the nutritional information lives
        nutris = td.xpath('.//table//td')
        # iterate in steps of 3 (label, per-serving value, per-100g value), ignoring the per 100g column
        for i in range(0, len(nutris), 3):
            # set the key to the text of the first td element
            key = ' '.join(nutris[i].itertext()).strip()
            # set the value to the text from the second td element
            value = ' '.join(nutris[i + 1].itertext()).strip()
            # if the key is Sodium, strip 'mg' from the value then strip any whitespace
            if key == 'Sodium':
                value = value.replace('mg', '').strip()
            # set the key and value in the data row dictionary
            row[key] = value
            # if the key was Sodium, exit the loop (end of table)
            if key == 'Sodium':
                break
        # extract the text from a span.Ingred_Ingred_Contents
        # this is where the ingredients live
        ings = td.find('.//span[@class="Ingred_Ingred_Contents"]')
        # flatten all child elements and just get the text (guarding against the span being missing)
        ingsText = ' '.join(ings.itertext()).strip() if ings is not None else ''
        row['Ingredients'] = ingsText
        # append the row dictionary to the data list
        data.append(row)
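    # Illustrative shape of each entry in data (the values below are made up, not real menu data):
    #   {'No.': '123', 'Name': 'Beef Casserole', 'Serving size': '350g', 'Energy': '1500kJ',
    #    ..., 'Sodium': '450', 'Ingredients': 'Beef (40%), ...'}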
    # TODO: this could be improved to ensure that each value is written to the correct
    # column by comparing the keys to the header row. In its current form it relies on the
    # nutrition information being in the same order for every meal.
    # initialise current_row to 1 (row 0 is the header row)
    current_row = 1
    # iterate over the data array
    for row in data:
        # convert the dictionary to a list, ordered by the keys of the first row
        rowlist = [row.get(key, '') for key in data[0]]
        # append the list to the next row in the worksheet
        ws.write_row(current_row, 0, rowlist)
        # increment current_row
        current_row += 1
    # if the sheet name contains 'Lunches', hide the first column
    if 'Lunches' in page['Title']:
        ws.set_column(0, 0, None, None, {'hidden': True})
#
# SAVE
#
# move the Dinners sheet to the very first sheet position
# by popping it out of its current position in the worksheet list
# and inserting it at index 0
dinnerSheet = wb.get_worksheet_by_name('Dinners')
dinnerIndex = wb.worksheets().index(dinnerSheet)
wb.worksheets().insert(0, wb.worksheets().pop(dinnerIndex))
# Note: GitHub Copilot could not do this but the library is capable of it
# reference https://stackoverflow.com/questions/29463274/simulate-autofit-column-in-xslxwriter
for ws in wb.worksheets():
    ws.autofit()
    # iterate over the header columns
    for col in range(len(headers)):
        # if the header is 'Name' or 'Ingredients', set the column width to 30
        if headers[col] in ['Name', 'Ingredients']:
            ws.set_column(col, col, 30)
# If the output file already exists, delete it
# (xlsxwriter only writes the file when close() is called, so deleting here is safe)
if os.path.exists(output_file):
    os.remove(output_file)
# save the workbook to a file
wb.close()