-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtoolkit2_exercises.py
300 lines (250 loc) · 8.53 KB
/
toolkit2_exercises.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
import pandas as pd
feature_names = ['CountryName',
'CountryCode',
'IndicatorName',
'IndicatorCode',
'Year',
'Value']
row_vals = ['Arab World',
'ARB',
'Adolescent fertility rate (births per 1,000 women ages 15-19)',
'SP.ADO.TFRT',
'1960',
'133.56090740552298']
# A list of lists consisting of multiple values similar to row_vals
row_lists = [['Arab World',
'ARB',
'Adolescent fertility rate (births per 1,000 women ages 15-19)',
'SP.ADO.TFRT',
'1960',
'133.56090740552298'],
['Arab World',
'ARB',
'Age dependency ratio (% of working-age population)',
'SP.POP.DPND',
'1960',
'87.7976011532547'],
['Arab World',
'ARB',
'Age dependency ratio, old (% of working-age population)',
'SP.POP.DPND.OL',
'1960',
'6.634579191565161'],
['Arab World',
'ARB',
'Age dependency ratio, young (% of working-age population)',
'SP.POP.DPND.YG',
'1960',
'81.02332950839141'],
['Arab World',
'ARB',
'Arms exports (SIPRI trend indicator values)',
'MS.MIL.XPRT.KD',
'1960',
'3000000.0'],
['Arab World',
'ARB',
'Arms imports (SIPRI trend indicator values)',
'MS.MIL.MPRT.KD',
'1960',
'538000000.0'],
['Arab World',
'ARB',
'Birth rate, crude (per 1,000 people)',
'SP.DYN.CBRT.IN',
'1960',
'47.697888095096395'],
['Arab World',
'ARB',
'CO2 emissions (kt)',
'EN.ATM.CO2E.KT',
'1960',
'59563.9892169935'],
['Arab World',
'ARB',
'CO2 emissions (metric tons per capita)',
'EN.ATM.CO2E.PC',
'1960',
'0.6439635478877049'],
['Arab World',
'ARB',
'CO2 emissions from gaseous fuel consumption (% of total)',
'EN.ATM.CO2E.GF.ZS',
'1960',
'5.041291753975099'],
['Arab World',
'ARB',
'CO2 emissions from liquid fuel consumption (% of total)',
'EN.ATM.CO2E.LF.ZS',
'1960',
'84.8514729446567'],
['Arab World',
'ARB',
'CO2 emissions from liquid fuel consumption (kt)',
'EN.ATM.CO2E.LF.KT',
'1960',
'49541.707291032304'],
['Arab World',
'ARB',
'CO2 emissions from solid fuel consumption (% of total)',
'EN.ATM.CO2E.SF.ZS',
'1960',
'4.72698138789597'],
['Arab World',
'ARB',
'Death rate, crude (per 1,000 people)',
'SP.DYN.CDRT.IN',
'1960',
'19.7544519237187'],
['Arab World',
'ARB',
'Fertility rate, total (births per woman)',
'SP.DYN.TFRT.IN',
'1960',
'6.92402738655897'],
['Arab World',
'ARB',
'Fixed telephone subscriptions',
'IT.MLT.MAIN',
'1960',
'406833.0'],
['Arab World',
'ARB',
'Fixed telephone subscriptions (per 100 people)',
'IT.MLT.MAIN.P2',
'1960',
'0.6167005703199'],
['Arab World',
'ARB',
'Hospital beds (per 1,000 people)',
'SH.MED.BEDS.ZS',
'1960',
'1.9296220724398703'],
['Arab World',
'ARB',
'International migrant stock (% of population)',
'SM.POP.TOTL.ZS',
'1960',
'2.9906371279862403'],
['Arab World',
'ARB',
'International migrant stock, total',
'SM.POP.TOTL',
'1960',
'3324685.0']]
# This consumes the iterator, commenting out
# for feature, value in zipped_lists:
# print(feature, value)
# Define lists2dict(), function
def lists2dict(list1, list2):
"""Return a dictionary where list1 provides
the keys and list2 provides the values."""
# Zip lists: zipped_lists
zipped_lists = zip(list1, list2)
# Create a dictionary: rs_dict
dictionary = dict(zipped_lists)
return dictionary
# Define read_large_file()
def read_large_file(file_object, skip_colnames=False):
"""A generator function to read a large file lazily."""
if skip_colnames:
file_object.readline()
# Loop indefinitely until the end of the file
while True:
# Read a line from the file: data
data = file_object.readline()
# Break if this is the end of the file
if not data:
break
yield data
def process_file_differently(path):
"""
Process the file line by line using the file's returned iterator
"""
try:
with open(path) as file_handler:
# while True:
for i in range(3):
print(next(file_handler))
except (IOError, OSError):
print("Error opening / processing file")
except StopIteration:
pass
# Call lists2dict: rs_fxn
rs_fxn = lists2dict(feature_names, row_vals)
# Print rs_fxn
print("From function", rs_fxn)
# USING LIST COMPREHENSIONS
# Print the first two lists in row_lists
print(row_lists[0])
print(row_lists[1])
# Turn list of lists into list of dicts: list_of_dicts
# LIST COMPREHENSIONS: [output expression for variable in iterable]
list_of_dicts = [lists2dict(feature_names, row_list) for row_list in row_lists]
# Print the first two dictionaries in list_of_dicts
print(list_of_dicts[0])
print(list_of_dicts[1])
# Turn list of dicts into a DataFrame: df
df = pd.DataFrame(list_of_dicts)
# Print the head of the DataFrame
print(df.head())
# Processing data in chunks (1)
# The most widely used python_basics of context managers is the with statement
# Open a connection to the file
with open('datasets/world_ind_pop_data.csv', 'r') as file:
# Skip the column names
file.readline()
# Initialize an empty dictionary: counts_dict
counts_dict = {}
# Process only the first 1000 rows
for j in range(1000):
# Split the current line into a list: line
line = file.readline().split(',')
# Get the value for the first column: first_col
first_col = line[0]
# If the column value is in the dict, increment its value
if first_col in counts_dict.keys():
counts_dict[first_col] += 1
# Else, add to the dict and set value to 1
else:
counts_dict[first_col] = 1
# Print the resulting dictionary
print(counts_dict)
# Open a connection to the file
with open('datasets/world_ind_pop_data.csv', 'r') as file:
# Create a generator object for the file: gen_file
gen_file = read_large_file(file)
# Print the first three lines of the file
print(next(gen_file))
print(next(gen_file))
print(next(gen_file))
# The following is supposed to work similarly as files return lazy iterators to begin with
process_file_differently('datasets/world_ind_pop_data.csv')
# Initialize an empty dictionary: counts_dict
counts_dict_2 = {}
# Open a connection to the file
with open('datasets/world_ind_pop_data.csv', 'r') as file:
# Iterate over the generator from read_large_file()
for line in read_large_file(file):
row = line.split(',')
first_col = row[0]
if first_col in counts_dict_2.keys():
counts_dict_2[first_col] += 1
else:
counts_dict_2[first_col] = 1
# Print
print(counts_dict_2)
# Initialize an empty dictionary: counts_dict
counts_dict_2 = {}
# Open a connection to the file
with open('datasets/world_ind_pop_data.csv', 'r') as file:
# Iterate over the generator from read_large_file()
for line in read_large_file(file, skip_colnames=True):
row = line.split(',')
first_col = row[0]
if first_col in counts_dict_2.keys():
counts_dict_2[first_col] += 1
else:
counts_dict_2[first_col] = 1
# Print
print(counts_dict_2)