-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_market_data.py
134 lines (107 loc) · 4.08 KB
/
get_market_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import pandas as pd
import pickle
import datetime
import os
import pandas as pd
import pandas_datareader.data as web
import pytz
import time
import save_tickers
# Date window used by get_data() for every download.
# define start time and end time(change of time: datetime.timedelta(days=1))
start = datetime.datetime(2016, 1, 1)  # naive datetime (no timezone)
# NOTE(review): `end` is tz-aware (America/New_York) while `start` is naive --
# fine as separate DataReader arguments, but comparing/subtracting the two
# directly would raise; confirm before adding such logic.
end = datetime.datetime.now(pytz.timezone('America/New_York'))
def get_data(reload_sp500=False):
    '''
    Download daily price history for each S&P 500 ticker into stock_dfs/.

    Parameters
    ----------
    reload_sp500 : bool, optional
        If True, re-scrape the ticker list via save_tickers;
        otherwise load the cached sp500tickers.pickle. Default False.

    Tickers whose CSV already exists are skipped, so an interrupted run
    can simply be restarted.
    '''
    # renew the database, delete all stock files
    # filelist = [f for f in os.listdir("./stock_dfs") if f.endswith(".csv")]
    # for f in filelist:
    #     os.remove("./stock_dfs/{}".format(f))
    if reload_sp500:
        tickers = save_tickers.save_sp500_tickers()
    else:
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
    # exist_ok avoids the check-then-create race of the original exists() test
    os.makedirs('stock_dfs', exist_ok=True)
    for ticker in tickers:
        # just in case your connection breaks, we'd like to save our progress!
        if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
            # because now the wikipedia is using '.' instead of '-',
            # sometimes just use . to replace -.
            try:
                df = web.DataReader(ticker, "google", start, end)
                df.to_csv('stock_dfs/{}.csv'.format(ticker))
            except Exception as e:
                # bug fix: `e` was caught but never reported, hiding the
                # actual failure reason (network error, bad symbol, ...)
                print('ticker error: {}: {}'.format(ticker, e))
        else:
            print('Already have {}'.format(ticker))
def Close_price_compile():
    '''
    Join the Close column of every downloaded per-ticker CSV into one wide
    sheet (one column per ticker, indexed by Date) and write it to
    sp500_joined_closes.csv.
    '''
    # start from a clean slate so a stale sheet is never left behind
    if os.path.exists('sp500_joined_closes.csv'):
        os.remove('sp500_joined_closes.csv')
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)
    # there might be tickers that the web can not reach
    tickers = [t for t in tickers if os.path.exists('stock_dfs/{}.csv'.format(t))]
    main_df = pd.DataFrame()
    for ticker in tickers:
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker), index_col='Date')
        df.rename(columns={'Close': ticker}, inplace=True)
        # bug fix: the positional `axis` argument to drop() was removed in
        # pandas 2.0 (`df.drop([...], 1)` now raises TypeError)
        df.drop(columns=['Open', 'High', 'Low', 'Volume'], inplace=True)
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how='outer')
    # NOTE(review): the original called main_df.dropna() and discarded the
    # result -- a no-op. Removed rather than made in-place, so rows with
    # missing tickers are still kept, matching the original output.
    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')
def Rate_return_compile():
    '''
    Compute a per-ticker rate series from each downloaded CSV's Close
    column, join them into one wide sheet, write sp500_joined_rates.csv
    and return the joined DataFrame.
    '''
    # start from a clean slate so a stale sheet is never left behind
    if os.path.exists('sp500_joined_rates.csv'):
        os.remove('sp500_joined_rates.csv')
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)
    # there might be tickers that the web can not reach
    tickers = [t for t in tickers if os.path.exists('stock_dfs/{}.csv'.format(t))]
    main_df = pd.DataFrame()
    for ticker in tickers:
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker), index_col='Date')
        df.rename(columns={'Close': ticker}, inplace=True)
        # bug fix: the positional `axis` argument to drop() was removed in
        # pandas 2.0 (`df.drop([...], 1)` now raises TypeError)
        df.drop(columns=['Open', 'High', 'Low', 'Volume'], inplace=True)
        # NOTE(review): shift(1)/df - 1 is P_{t-1}/P_t - 1, not the
        # conventional return df.pct_change() (= P_t/P_{t-1} - 1).
        # Preserved as-is because downstream sheets may rely on it; confirm
        # the intended definition before changing.
        rates = df.shift(1) / df - 1
        if main_df.empty:
            main_df = rates
        else:
            main_df = main_df.join(rates, how='outer')
    # NOTE(review): the original main_df.dropna() discarded its result (a
    # no-op); removed so output rows are unchanged.
    print(main_df.head(10))
    main_df.to_csv('sp500_joined_rates.csv')
    return main_df
def compile_newest_OHLCV():
    '''
    Collect the most recent open/high/low/close/volume row of every
    downloaded per-ticker CSV into one frame indexed by ticker and write
    it to sp500_new_OHLCV.csv.
    '''
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)
    # there might be tickers that the web can not reach
    tickers = [t for t in tickers if os.path.exists('stock_dfs/{}.csv'.format(t))]
    last_rows = []
    for ticker in tickers:
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        # .copy() so the column insert below does not mutate a view of the
        # full frame (SettingWithCopyWarning in the original)
        df = df.tail(1).copy()
        df['stock'] = ticker
        last_rows.append(df)
    # bug fix: DataFrame.append was removed in pandas 2.0 -> use pd.concat;
    # guard the empty case, since concat([]) raises ValueError.
    main_df = pd.concat(last_rows, ignore_index=True) if last_rows else pd.DataFrame()
    if not main_df.empty:
        main_df = main_df.set_index(['stock'])
    print(main_df.head())
    main_df.to_csv('sp500_new_OHLCV.csv')