diff --git a/pullData.py b/pullData.py new file mode 100644 index 0000000..913ee98 --- /dev/null +++ b/pullData.py @@ -0,0 +1,233 @@ +import webbrowser +import os +import pprint +import csv +import re +import pickle +import matplotlib.pyplot as plt +from matplotlib.dates import date2num +import datetime +import numpy as np +import pandas as pd + +def pullHourlyDataLondon(): + for intYr in range(2010,2019+1): + for intMnt in range(1,12+1): + strQuery = 'http://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID=10999&Year=' + str(intYr) + '&Month=' + str(intMnt) +'&timeframe=1&submit=Download+Data' + tmp = webbrowser.open_new_tab(strQuery) + +def sortCSVDataChronologically(): + data = {} + directory = os.fsencode(os.getcwd()+'/WeatherData') + for file in os.listdir(directory): + filename = os.fsdecode(file) + year = filename.split('-')[1].split('.')[0] + month = filename.split('-')[0] + if filename.endswith(".csv"): + if year in data: + data[year][month] = os.getcwd()+'\WeatherData\\'+filename + else: + data[year] = {} + data[year][month] = os.getcwd()+'\WeatherData\\'+filename + continue + else: + continue + return data + +def pullCSVData(filename): + with open(filename, newline='', encoding='UTF-8') as csvfile: + data = list(csv.reader(csvfile)) + return data + +def writeToCSV(dictionary_to_output, file_name): + csv_file = file_name + ".csv" + w = csv.writer(open(csv_file, "w")) + for key, val in dictionary.items(): + w.writerow([key, val]) + +def cleanTwitterData(twitterData): + cleanTwitterData = [] + for data in twitterData: + if 'basketball' in data[6].lower() or 'volleyball' in data[6].lower() or 'badminton' in data[6].lower(): + tmp = [data[2],data[6].lower()] + cleanTwitterData.append(tmp) + tweet_dictionary = {} + for data in cleanTwitterData: + if data[0] not in tweet_dictionary: + tweet_dictionary[data[0]] = {} + for sport in ['basketball','volleyball','badminton']: + if sport in data[1]: + tweet_dictionary[data[0]][sport] = data[1].split(sport)[1][:3] + + for tweet in tweet_dictionary.keys(): + for sport in ['basketball','volleyball','badminton']: + if sport not in tweet_dictionary[tweet]: + tweet_dictionary[tweet][sport] = '0' + tweet_dictionary[tweet][sport] = re.sub('[^0-9]','', tweet_dictionary[tweet][sport]) + + with open('sport_data.pickle', 'wb') as handle: + pickle.dump(tweet_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL) + + return tweet_dictionary + + +def getTwitterData(): + infile = open('sport_data.pickle','rb') + new_dict = pickle.load(infile) + infile.close() + return new_dict + +def plotGraph(tweet_dictionary): + time = [] + basketball = [] + volleyball = [] + badminton = [] + for key in tweet_dictionary: + if tweet_dictionary[key]['basketball'] == '': + tweet_dictionary[key]['basketball'] = 0 + if tweet_dictionary[key]['badminton'] == '': + tweet_dictionary[key]['badminton'] = 0 + if tweet_dictionary[key]['volleyball'] == '': + tweet_dictionary[key]['volleyball'] = 0 + time.append(datetime.datetime.strptime(key,'%Y-%m-%d %H:%M:%S')) + basketball.append(int(tweet_dictionary[key]['basketball'])) + volleyball.append(int(tweet_dictionary[key]['volleyball'])) + badminton.append(int(tweet_dictionary[key]['badminton'])) + + ax = plt.subplot(111) + ax.bar(time, badminton, width=0.5, color='b', align='center') + ax.bar(time, basketball, width=0.5, color='g', align='center') + ax.bar(time, volleyball, width=0.5, color='r', align='center') + ax.xaxis_date() + + plt.show() + +def createDataSet(year): + dataset = [] + for i in year: + csvFile = pullCSVData(year[i]) + dataset.append(csvFile[1:]) + return dataset + +def safe_cast(val, to_type, default=None): + try: + return to_type(val) + except (ValueError, TypeError): + return default + +def removeEmpty(dirty): + data = [] + e = IndexError() + for d in dirty: + for l in range(len(d)-1): + try: + data.append(d[l][4:10] + [d[l][11],d[l][13],d[l][21]]) + except IndexError: + continue + return data + +def tweetTimeToHour(tweets): + new_dictionary = {} + for key, value in tweets.items(): + tmp = str(datetime.datetime.strptime(key, '%Y-%m-%d %H:%M:%S').replace(microsecond=0,second=0,minute=0)) + new_dictionary[tmp] = tweets[key] + return new_dictionary + + + +if __name__ == "__main__": + # twitterData = pullCSVData('twitter-recCentre.csv') + tmp = sortCSVDataChronologically() + weather_2017 = createDataSet(tmp['2017']) + weather_2017 = removeEmpty(weather_2017) + weather_2018 = createDataSet(tmp['2018']) + weather_2018 = removeEmpty(weather_2018) + weather_2019 = createDataSet(tmp['2019']) + weather_2019 = removeEmpty(weather_2019) + + weather_headers = ["Date/Time","Year","Month","Day","Time","Temp (°C)","Dew Point Temp (°C)","Rel Hum (%)","Stn Press (kPa)"] + + + twitterData = getTwitterData() + twitterData = tweetTimeToHour(twitterData) + tmp_new_weather = [] + for w in weather_2017: + if w[2] in ['10','11','12']: + tmp_new_weather.append(w) + tmp_new_weather.reverse() + weather_2017 = tmp_new_weather + tmp_new_weather = [] + for w in weather_2018: + tmp_new_weather.append(w) + tmp_new_weather.reverse() + weather_2018 = tmp_new_weather + tmp_new_weather = [] + for w in weather_2019: + tmp_new_weather.append(w) + tmp_new_weather.reverse() + weather_2019 = tmp_new_weather + tmp_new_weather = [] + for key, value in twitterData.items(): + twitterData[key]['weather'] = [0,0,0,0,0,0,0,0,0,0] + tweet_time = key.split(' ')[1].split(':')[0].strip() + tweet_day = key.split('-')[2].split(' ')[0].strip() + tweet_month = key.split('-')[1].split(' ')[0].strip() + tweet_year = key[:4] + if tweet_year == '2017': + for w in weather_2017: + if (tweet_month == w[2]): + if(tweet_day == w[3]): + if(int(tweet_time) == int(w[4].split(':')[0])): + twitterData[key]['weather'] = (w) + if tweet_year == '2018': + for w in weather_2018: + if (tweet_month == w[2]): + if(tweet_day == w[3]): + if(int(tweet_time) == int(w[4].split(':')[0])): + twitterData[key]['weather'] = (w) + if tweet_year == '2019': + for w in weather_2019: + if (tweet_month == w[2]): + if(tweet_day == w[3]): + if(int(tweet_time) == int(w[4].split(':')[0])): + twitterData[key]['weather'] = (w) + + + headers = "#badminton #basketball #volleyball #temp #dewpoint #humidity #pressure #year #month #day #hour" + print(headers) + training_data = [] + for key, value in twitterData.items(): + if (value['weather']) != [0,0,0,0,0,0,0,0,0,0]: + training_data.append([ + safe_cast(value['badminton'],int,0), + safe_cast(value['basketball'],int,0), + safe_cast(value['volleyball'],int,0), + safe_cast(value['weather'][5],float,0), + safe_cast(value['weather'][6],float,0), + safe_cast(value['weather'][7],float,0.5), + safe_cast(value['weather'][1],int), + safe_cast(value['weather'][2],int), + safe_cast(value['weather'][3],int), + safe_cast(str(value['weather'][4])[:2],int)]) + + + data = pd.DataFrame(training_data) + data.columns = ["badminton","basketball", "volleyball", "temp", "dewpoint", "humidity","year", "month", "day", "hour"] + data = data[data.hour != 0] + data = data[data.hour != 1] + data = data[data.hour != 2] + data = data[data.hour != 3] + data = data[data.hour != 4] + data = data[data.hour != 5] + + with open('training_data.pickle', 'wb') as handle: + pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL) + print(len(data)) + a = np.array(training_data) + np.savetxt("training_data.csv", a, delimiter=",") + + + + + + diff --git a/training_data.pickle b/training_data.pickle new file mode 100644 index 0000000..6697609 Binary files /dev/null and b/training_data.pickle differ