Commit fe64b47 (0 parents)
Showing 2 changed files with 233 additions and 0 deletions.

@@ -0,0 +1,233 @@
import webbrowser
import os
import pprint
import csv
import re
import pickle
import matplotlib.pyplot as plt
from matplotlib.dates import date2num
import datetime
import numpy as np
import pandas as pd

def pullHourlyDataLondon():
    # Open one bulk-data download URL per month (2010-2019) for Environment
    # Canada station 10999 (London, ON); each tab downloads an hourly CSV.
    for intYr in range(2010, 2019 + 1):
        for intMnt in range(1, 12 + 1):
            strQuery = 'http://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID=10999&Year=' + str(intYr) + '&Month=' + str(intMnt) + '&timeframe=1&submit=Download+Data'
            webbrowser.open_new_tab(strQuery)

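# Hedged alternative (not part of the original commit): fetch the same CSVs
# directly instead of opening browser tabs. Assumes the third-party `requests`
# package and a <month>-<year>.csv naming scheme, which is what
# sortCSVDataChronologically below appears to expect; both are assumptions.
def pullHourlyDataLondonRequests(out_dir='WeatherData'):
    import requests  # assumed available; not imported by the original script
    os.makedirs(out_dir, exist_ok=True)
    for intYr in range(2010, 2019 + 1):
        for intMnt in range(1, 12 + 1):
            url = ('http://climate.weather.gc.ca/climate_data/bulk_data_e.html'
                   '?format=csv&stationID=10999&Year=' + str(intYr) +
                   '&Month=' + str(intMnt) + '&timeframe=1&submit=Download+Data')
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            with open(os.path.join(out_dir, str(intMnt) + '-' + str(intYr) + '.csv'), 'wb') as f:
                f.write(resp.content)
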
def sortCSVDataChronologically():
    # Build {year: {month: path}} from files named <month>-<year>.csv in ./WeatherData.
    data = {}
    directory = os.path.join(os.getcwd(), 'WeatherData')
    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue
        month = filename.split('-')[0]
        year = filename.split('-')[1].split('.')[0]
        if year not in data:
            data[year] = {}
        data[year][month] = os.path.join(directory, filename)
    return data

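# Illustration (assumed filenames like "10-2017.csv"; not output from a real run):
# sortCSVDataChronologically() would return something like
# {'2017': {'10': '/path/to/WeatherData/10-2017.csv',
#           '11': '/path/to/WeatherData/11-2017.csv'},
#  '2018': {'1': '/path/to/WeatherData/1-2018.csv'}}
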
def pullCSVData(filename):
    # Read a CSV file and return all rows (including the header) as a list of lists.
    with open(filename, newline='', encoding='UTF-8') as csvfile:
        data = list(csv.reader(csvfile))
    return data

def writeToCSV(dictionary_to_output, file_name):
    # Write one "key,value" row per dictionary entry.
    csv_file = file_name + ".csv"
    with open(csv_file, "w", newline='') as f:
        w = csv.writer(f)
        for key, val in dictionary_to_output.items():  # was `dictionary`, an undefined name
            w.writerow([key, val])

def cleanTwitterData(twitterData):
    # Keep only tweets that mention one of the three sports; data[2] is the
    # timestamp column and data[6] is the tweet text column.
    sport_tweets = []
    for data in twitterData:
        text = data[6].lower()
        if 'basketball' in text or 'volleyball' in text or 'badminton' in text:
            sport_tweets.append([data[2], text])
    # For each tweet, take the up-to-three characters immediately after each
    # sport keyword, then strip everything that is not a digit.
    tweet_dictionary = {}
    for data in sport_tweets:
        if data[0] not in tweet_dictionary:
            tweet_dictionary[data[0]] = {}
        for sport in ['basketball', 'volleyball', 'badminton']:
            if sport in data[1]:
                tweet_dictionary[data[0]][sport] = data[1].split(sport)[1][:3]

    for tweet in tweet_dictionary.keys():
        for sport in ['basketball', 'volleyball', 'badminton']:
            if sport not in tweet_dictionary[tweet]:
                tweet_dictionary[tweet][sport] = '0'
            tweet_dictionary[tweet][sport] = re.sub('[^0-9]', '', tweet_dictionary[tweet][sport])

    # Cache the result so later runs can call getTwitterData() instead.
    with open('sport_data.pickle', 'wb') as handle:
        pickle.dump(tweet_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return tweet_dictionary

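# Illustration of the returned/pickled structure (values assumed, not from a real run):
# {'2017-10-05 18:05:00': {'basketball': '3', 'volleyball': '0', 'badminton': '1'}, ...}
# Every timestamp key ends up with all three sport keys; unmentioned sports default to '0'.
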
def getTwitterData():
    # Load the tweet dictionary cached by cleanTwitterData().
    with open('sport_data.pickle', 'rb') as infile:
        new_dict = pickle.load(infile)
    return new_dict

def plotGraph(tweet_dictionary):
    time = []
    basketball = []
    volleyball = []
    badminton = []
    for key in tweet_dictionary:
        # Empty strings (no digits were found in the tweet) are treated as zero counts.
        if tweet_dictionary[key]['basketball'] == '':
            tweet_dictionary[key]['basketball'] = 0
        if tweet_dictionary[key]['badminton'] == '':
            tweet_dictionary[key]['badminton'] = 0
        if tweet_dictionary[key]['volleyball'] == '':
            tweet_dictionary[key]['volleyball'] = 0
        time.append(datetime.datetime.strptime(key, '%Y-%m-%d %H:%M:%S'))
        basketball.append(int(tweet_dictionary[key]['basketball']))
        volleyball.append(int(tweet_dictionary[key]['volleyball']))
        badminton.append(int(tweet_dictionary[key]['badminton']))

    # Three overlaid bar series (not stacked) on a shared date axis.
    ax = plt.subplot(111)
    ax.bar(time, badminton, width=0.5, color='b', align='center')
    ax.bar(time, basketball, width=0.5, color='g', align='center')
    ax.bar(time, volleyball, width=0.5, color='r', align='center')
    ax.xaxis_date()

    plt.show()

def createDataSet(year):
    # `year` is one inner dict from sortCSVDataChronologically(), i.e. {month: csv_path}.
    # Returns a list with one entry per month: that month's CSV rows minus the header.
    dataset = []
    for i in year:
        csvFile = pullCSVData(year[i])
        dataset.append(csvFile[1:])
    return dataset

def safe_cast(val, to_type, default=None):
    # Cast `val` to `to_type`, falling back to `default` if the cast fails.
    try:
        return to_type(val)
    except (ValueError, TypeError):
        return default

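# Illustration: safe_cast('7', int, 0) == 7, safe_cast('', int, 0) == 0,
# safe_cast(None, float, 0.5) == 0.5.
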
def removeEmpty(dirty):
    # Flatten the per-month row lists, keeping only the columns listed in
    # weather_headers below; rows that are too short to index are skipped.
    data = []
    for d in dirty:
        for l in range(len(d) - 1):
            try:
                data.append(d[l][4:10] + [d[l][11], d[l][13], d[l][21]])
            except IndexError:
                continue
    return data

def tweetTimeToHour(tweets):
    # Re-key the tweet dictionary so timestamps are truncated to the hour,
    # which lets them be matched against the hourly weather rows.
    new_dictionary = {}
    for key, value in tweets.items():
        tmp = str(datetime.datetime.strptime(key, '%Y-%m-%d %H:%M:%S').replace(microsecond=0, second=0, minute=0))
        new_dictionary[tmp] = tweets[key]
    return new_dictionary

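# Illustration: a key of '2017-10-05 18:23:41' becomes '2017-10-05 18:00:00'.
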
if __name__ == "__main__":
    # twitterData = pullCSVData('twitter-recCentre.csv')
    # Load the hourly weather CSVs for 2017-2019 and reduce them to the columns of interest.
    tmp = sortCSVDataChronologically()
    weather_2017 = removeEmpty(createDataSet(tmp['2017']))
    weather_2018 = removeEmpty(createDataSet(tmp['2018']))
    weather_2019 = removeEmpty(createDataSet(tmp['2019']))

    # Column layout of each weather row after removeEmpty().
    weather_headers = ["Date/Time", "Year", "Month", "Day", "Time", "Temp (°C)", "Dew Point Temp (°C)", "Rel Hum (%)", "Stn Press (kPa)"]

    twitterData = getTwitterData()
    twitterData = tweetTimeToHour(twitterData)

    # Keep only October-December rows for 2017 and reverse each year's row order.
    weather_2017 = [w for w in weather_2017 if w[2] in ['10', '11', '12']]
    weather_2017.reverse()
    weather_2018 = list(reversed(weather_2018))
    weather_2019 = list(reversed(weather_2019))
    weather_by_year = {'2017': weather_2017, '2018': weather_2018, '2019': weather_2019}

    # Attach the matching hourly weather row to every tweet; a row of zeros
    # means no match was found.
    for key, value in twitterData.items():
        twitterData[key]['weather'] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        tweet_time = key.split(' ')[1].split(':')[0].strip()
        tweet_day = key.split('-')[2].split(' ')[0].strip()
        tweet_month = key.split('-')[1].split(' ')[0].strip()
        tweet_year = key[:4]
        for w in weather_by_year.get(tweet_year, []):
            if tweet_month == w[2] and tweet_day == w[3] and int(tweet_time) == int(w[4].split(':')[0]):
                twitterData[key]['weather'] = w

    # Column order of each training row (pressure is not included in the rows below).
    headers = "#badminton #basketball #volleyball #temp #dewpoint #humidity #year #month #day #hour"
    print(headers)
    # Build one training row per tweet that found a matching weather row.
    training_data = []
    for key, value in twitterData.items():
        if value['weather'] != [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]:
            training_data.append([
                safe_cast(value['badminton'], int, 0),
                safe_cast(value['basketball'], int, 0),
                safe_cast(value['volleyball'], int, 0),
                safe_cast(value['weather'][5], float, 0),    # Temp (°C)
                safe_cast(value['weather'][6], float, 0),    # Dew Point Temp (°C)
                safe_cast(value['weather'][7], float, 0.5),  # Rel Hum (%)
                safe_cast(value['weather'][1], int),         # Year
                safe_cast(value['weather'][2], int),         # Month
                safe_cast(value['weather'][3], int),         # Day
                safe_cast(str(value['weather'][4])[:2], int)])  # Hour (from "HH:MM")

    data = pd.DataFrame(training_data)
    data.columns = ["badminton", "basketball", "volleyball", "temp", "dewpoint", "humidity", "year", "month", "day", "hour"]
    # Drop rows from the overnight hours (0 through 5).
    data = data[~data.hour.isin([0, 1, 2, 3, 4, 5])]

    with open('training_data.pickle', 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(len(data))
    # Note: the CSV is written from the unfiltered training_data list, so it still
    # contains the overnight rows that were dropped from the pickled DataFrame.
    a = np.array(training_data)
    np.savetxt("training_data.csv", a, delimiter=",")

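# Hedged usage sketch (not in the original commit): the saved DataFrame can be
# read back for later modelling with pandas, e.g.
#     training = pd.read_pickle('training_data.pickle')
#     print(training.describe())
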
Binary file not shown.