-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplayer_games_scraper.py
94 lines (78 loc) · 3.57 KB
/
player_games_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import pickle
import requests as rq
import time
import pandas as pd
from IPython.display import display
from datetime import datetime
import chess.pgn
import io
import os
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Folder that receives one CSV per player.  A player whose CSV already exists
# is skipped, so an interrupted run can simply be started again.
OUTPUT_DIR = 'Player_Games'
# Only the first MAX_PLAYERS unique players of the input table are processed.
MAX_PLAYERS = 1400
# A game is kept only when its PGN carries exactly this many header tags
# (presumably so that every saved row has the same columns -- TODO confirm).
EXPECTED_HEADER_COUNT = 21
# Prefix stripped from the ECOUrl header to leave only the opening name.
ECO_URL_PREFIX = 'https://www.chess.com/openings/'
REQUEST_TIMEOUT = 30    # seconds before a stalled HTTP request is abandoned
REQUEST_PAUSE = 0.25    # seconds to wait after each successful archive request
PLAYER_PAUSE = 0.5      # seconds to wait after each scraped player
FAILURE_BACKOFF = 150   # seconds to wait after a failed request before retrying
MAX_ATTEMPTS = 2        # first try plus one retry after the back-off


def fetch_archive_games(archive_url, attempts=MAX_ATTEMPTS, backoff=FAILURE_BACKOFF):
    """Download one archive URL and return the value of its 'games' entry.

    A failed attempt (network error, non-JSON body, or a payload without a
    'games' key) is reported and, unless it was the last attempt, followed by
    a `backoff`-second wait and a retry.  Returns None when every attempt
    failed, so the caller can move on to the next archive.
    """
    for attempt in range(1, attempts + 1):
        try:
            response = rq.get(archive_url, timeout=REQUEST_TIMEOUT)
            return response.json()['games']
        except (rq.exceptions.RequestException, ValueError, KeyError, TypeError) as exc:
            print('Request failed for {} (attempt {}/{}): {!r}'.format(
                archive_url, attempt, attempts, exc))
            if attempt < attempts:
                time.sleep(backoff)
    return None


def build_game_record(player, player_name, game, headers):
    """Build the CSV row (a dict) for one game, or return None to skip it.

    Parameters:
        player      -- player identifier the row is collected for.
        player_name -- value stored in the 'player_name' column.
        game        -- the game's JSON object; 'url' and 'pgn' are required,
                       'accuracies' is optional.
        headers     -- mapping of the game's PGN header tags to their values.

    The game is skipped (None) when the header count differs from
    EXPECTED_HEADER_COUNT or when a required key/header is missing.
    """
    if len(headers) != EXPECTED_HEADER_COUNT:
        return None

    # Both accuracy columns fall back to '-' as soon as either side is missing.
    try:
        accuracies = game['accuracies']
        white_accuracy, black_accuracy = accuracies['white'], accuracies['black']
    except (KeyError, TypeError):
        white_accuracy = black_accuracy = '-'

    try:
        # Insertion order defines the column order of the resulting CSV.
        record = {
            'player': player,
            'player_name': player_name,
            'url': game['url'],
            'white_Accuracy': white_accuracy,
            'black_Accuracy': black_accuracy,
        }
        for tag, value in headers.items():
            record[tag] = value
        record['pgn'] = game['pgn']
        # Keep only the readable opening name, e.g. 'Sicilian Defense'.
        record['ECOUrl'] = record['ECOUrl'].replace(ECO_URL_PREFIX, '').replace('-', ' ')
        # Compare case-insensitively: the PGN header may spell the username
        # with different capitalisation than the stored player identifier,
        # which would otherwise always select the Black rating.
        if str(record['White']).casefold() == str(player).casefold():
            record['player_rating'] = record['WhiteElo']
        else:
            record['player_rating'] = record['BlackElo']
    except (KeyError, TypeError, AttributeError):
        return None
    return record


def parse_game(player, player_name, game):
    """Parse the PGN of one game JSON object and return its record, or None.

    None is returned when the game has no usable PGN text or when
    build_game_record() rejects it.
    """
    try:
        parsed = chess.pgn.read_game(io.StringIO(game['pgn']))
    except (KeyError, TypeError, ValueError):
        return None
    if parsed is None:  # read_game() yields None when no game could be read
        return None
    return build_game_record(player, player_name, game, parsed.headers)


def collect_player_games(player, player_name, archive_urls):
    """Fetch every archive of one player.

    Returns a tuple (records, request_count, games_seen): the rows to save,
    the number of archive requests that succeeded, and the number of games
    those archives contained (including games that were skipped).
    """
    records = []
    request_count = 0
    games_seen = 0
    for archive_url in archive_urls:
        games = fetch_archive_games(archive_url)
        if games is None:
            continue
        request_count += 1
        games_seen += len(games)
        for game in games:
            record = parse_game(player, player_name, game)
            if record is not None:
                records.append(record)
        time.sleep(REQUEST_PAUSE)  # stay polite between archive requests
    return records, request_count, games_seen


def main():
    """Scrape the games of every listed player into OUTPUT_DIR/<player>.csv."""
    time_start = time.time()
    print('Time started: {}'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time_start))))

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # NOTE: unpickling executes arbitrary code -- only load pickle files that
    # were produced by a trusted source.
    df = pd.read_pickle('Player_Games.pkl')
    player_profile_df = pd.read_pickle('Player_Profile.pkl')

    for player in df['Player'].unique()[:MAX_PLAYERS]:
        csv_path = os.path.join(OUTPUT_DIR, '{}.csv'.format(player))
        if os.path.isfile(csv_path):
            print('{} player data have already been collected.'.format(player))
            continue

        p_time_start = time.time()
        try:
            # Looked up once per player instead of once per game.
            player_name = player_profile_df.loc[player]['name']
        except KeyError:
            # Without a profile row no game row can be built, so skip the
            # player up front instead of issuing requests whose results
            # would all be discarded.
            print('No profile found for player: {}'.format(player))
            continue

        archive_urls = df.loc[df['Player'] == player]['Games']
        records, request_count, games_seen = collect_player_games(
            player, player_name, archive_urls)

        if records:
            # One frame per player, indexed by game URL.
            games_frame = pd.DataFrame(records, index=[record['url'] for record in records])
            games_frame.to_csv(csv_path)
        else:
            print('No games found for player: {}'.format(player))

        p_time_stop = time.time()
        print('{} player data collection completed in {} seconds, total requests: {}, total_games:{}'.format(
            player,
            round(p_time_stop - p_time_start, 2),
            request_count,
            games_seen))
        time.sleep(PLAYER_PAUSE)

    time_stop = time.time()
    print('Time Finished: {}'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time_stop))))


if __name__ == '__main__':
    main()