wsjRe.py
import os
import shutil

import pandas as pd
import requests
from tqdm import tqdm
def getURLs(fileName, retry=False, symbol=''):
    """Build (download URL, local CSV path) pairs for every ticker."""
    urlpath = []
    fileName = fileName.rstrip()
    if not retry:
        # Derive the output folder from the list file name, e.g. 'lists/sp500.txt' -> 'sp500'.
        listFolder = (fileName.split("/")[1]).split(".")[0]
        if os.path.exists('data/' + listFolder):
            shutil.rmtree('data/' + listFolder)
        os.makedirs('data/' + listFolder)
        try:
            # Use a context manager so the ticker list file is always closed.
            with open(fileName) as file:
                for line in file.read().splitlines():
                    urlpath.append((
                        f'https://www.wsj.com/market-data/quotes/{line}/historical-prices/download?num_rows=100000.958333333333&range_days=100000.958333333333&startDate=01/10/1970&endDate=04/09/2020',
                        'data/' + listFolder + '/' + line.split('/')[-1].rstrip() + '.csv'
                    ))
        except FileNotFoundError:
            print('Unable to find list of tickers ' + fileName)
    else:
        # Retry mode: try the bare symbol under each WSJ asset-class prefix.
        tickers = [
            'mutualfund/' + symbol,
            'index/' + symbol,
            'etf/' + symbol,
            'fx/' + symbol,
            symbol,
        ]
        for tick in tickers:
            # Flatten 'prefix/SYMBOL' into a single file name, e.g. 'indexSPX'.
            if '/' in tick:
                temp1, temp2 = tick.split("/", 1)
                dashedfile = temp1 + temp2
            else:
                dashedfile = tick
            dashedfile = dashedfile.rstrip()
            urlpath.append((
                f'https://www.wsj.com/market-data/quotes/{tick}/historical-prices/download?num_rows=100000.958333333333&range_days=100000.958333333333&startDate=01/10/1970&endDate=04/09/2020',
                'data/' + fileName + '/' + dashedfile + '.csv'
            ))
    return urlpath
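
# Illustrative shape of the list getURLs returns (values are examples only):
#   [('https://www.wsj.com/market-data/quotes/index/SPX/historical-prices/download?...',
#     'data/sp500/SPX.csv'),
#    ...]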
def retrieve(url_and_path):
    """Alternative fetcher: let pandas read the CSV straight from the URL.

    Not called by fetchSymbols below; kept as a standalone helper.
    """
    url, path = url_and_path
    df = pd.read_csv(url)
    print(df.head())
    print(path)
    df.to_csv(path)
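
# A hedged sketch, not part of the original flow: the original file imported
# multiprocessing.Pool without using it, which suggests `retrieve` was meant
# to run in parallel. `fetchSymbolsParallel` is a hypothetical name, and WSJ
# may throttle concurrent downloads; treat this as an untested variant.
def fetchSymbolsParallel(file, retry=False, symbol='', workers=4):
    from multiprocessing import Pool
    if not os.path.exists('data'):
        os.makedirs('data')
    urls_and_paths = getURLs(fileName=file, retry=retry, symbol=symbol)
    with Pool(workers) as pool:
        pool.map(retrieve, urls_and_paths)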
def fetchSymbols(file, retry=False, symbol=''):
    if not os.path.exists('data'):
        os.makedirs('data')
    print("Connecting to WSJ... ")
    urls_and_paths = getURLs(fileName=file, retry=retry, symbol=symbol)
    total_count = len(urls_and_paths)
    for url, path in tqdm(urls_and_paths, total=total_count):
        # WSJ rejects requests that lack a browser-like User-Agent.
        headers = {'Host': 'www.wsj.com', 'User-Agent': 'Chrome', 'Accept': '*/*'}
        # NOTE: the status code is not checked, so an error page would be saved as-is.
        response = requests.get(url, stream=True, headers=headers)
        with open(path, 'wb') as f:  # open for block writes
            for chunk in response.iter_content(chunk_size=4096):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
            f.flush()  # finally, force buffered data into the output file (optional)
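
# Minimal usage sketch (hypothetical paths and symbols): assumes a ticker list
# file with one WSJ quote path per line, optionally prefixed by asset class,
# e.g. 'index/SPX'.
if __name__ == '__main__':
    fetchSymbols('lists/sp500.txt')
    # Retry one failed symbol under every asset-class prefix, writing into the
    # existing 'data/sp500' folder:
    # fetchSymbols('sp500', retry=True, symbol='SPX')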