use jarchive as wayback fallback
stuartthomas25 committed May 5, 2024
1 parent b83a738 · commit 5c9c5b8
Showing 1 changed file with 15 additions and 11 deletions.
jparty/retrieve.py: 26 changes (15 additions & 11 deletions)
@@ -52,20 +52,24 @@ def get_Gsheet_game(file_id):

 def get_game(game_id):
     if len(str(game_id)) < 7:
-        return get_wayback_jarchive_game(game_id)
+        try:
+            return get_wayback_game(game_id)
+        except Exception as e:
+            logging.error(e)
+            return get_jarchive_game(game_id)
     else:
         return get_Gsheet_game(str(game_id))


 def findanswer(clue):
     return re.findall(r'correct_response">(.*?)</em', unescape(str(clue)))[0]

-def get_JArchive_Game(game_id, wayback_url=None):
-    logging.info(f"getting game {game_id}")
-    if wayback_url is not None:
-        r = requests.get(wayback_url)
-    else:
-        r = requests.get(f"http://www.j-archive.com/showgame.php?game_id={game_id}")
+def get_jarchive_game(game_id):
+    return get_generic_game(game_id, f"http://www.j-archive.com/showgame.php?game_id={game_id}")
+
+def get_generic_game(game_id, url):
+    logging.info(f"getting game {game_id} from url {url}")
+    r = requests.get(url)
     soup = BeautifulSoup(r.text, "html.parser")
     datesearch = re.search(
         r"- \w+, (.*?)$", soup.select("#game_title > h1")[0].contents[0]
@@ -121,7 +125,7 @@ def get_JArchive_Game(game_id, wayback_url=None):

     return GameData(boards, date, comments)

-def get_wayback_jarchive_game(game_id):
+def get_wayback_game(game_id):
     # kudos to Abhi Kumbar: https://medium.com/analytics-vidhya/the-wayback-machine-scraper-63238f6abb66
     # this query's the wayback cdx api for possible instances of the saved jarchive page with the specified game id & returns the latest one
     JArchive_url = f"j-archive.com/showgame.php?game_id={str(game_id)}" # use the url w/o the http:// or https:// to include both in query
@@ -130,9 +134,8 @@ def get_wayback_jarchive_game(game_id):
     parse_url = json.loads(urls) # parses the JSON from urls.
     if len(parse_url) == 0: # if no results, return None
         logging.info("no games found in wayback")
         # return None
-        # alternative: use fallback to get game from scraping j-archive directly
-        return get_JArchive_Game(game_id)
+        raise Exception("no games found in wayback")

     ## Extracts timestamp and original columns from urls and compiles a url list.
     url_list = []
@@ -143,7 +146,8 @@ def get_wayback_jarchive_game(game_id):
         final_url = f'http://web.archive.org/web/{waylink}'
         url_list.append(final_url)
     latest_url = url_list[-1]
-    return get_JArchive_Game(game_id, latest_url)
+    return get_generic_game(game_id, latest_url)
+

 def get_game_sum(soup):
     date = re.search(
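For context, the commit replaces the old in-function fallback with a two-step flow: get_game() now tries the Wayback Machine first and, if get_wayback_game() raises, logs the error and scrapes j-archive.com directly via get_jarchive_game(). The sketch below illustrates that flow end to end. It is only an approximation: the CDX request itself is collapsed out of this diff, so the endpoint parameters and the helper names (latest_wayback_snapshot, fetch_game) are illustrative assumptions, not the code in jparty/retrieve.py.

import json
import logging
import requests

def latest_wayback_snapshot(game_id):
    # Ask the Wayback Machine CDX API for captures of the j-archive page.
    # (Assumed query: the real parameters are in the collapsed part of the diff.)
    target = f"j-archive.com/showgame.php?game_id={game_id}"  # no scheme, so both http and https captures match
    r = requests.get(
        "http://web.archive.org/cdx/search/cdx",
        params={"url": target, "output": "json"},
    )
    rows = json.loads(r.text)
    if len(rows) <= 1:  # empty result, or header row only
        raise Exception("no games found in wayback")
    # Each data row has a timestamp and the original URL; taking the last row
    # mirrors the url_list[-1] pick in the diff above.
    header, *captures = rows
    timestamp = captures[-1][header.index("timestamp")]
    original = captures[-1][header.index("original")]
    return f"http://web.archive.org/web/{timestamp}/{original}"

def fetch_game(game_id):
    # Mirrors the new get_game() flow: prefer the archived copy, and fall back
    # to scraping j-archive directly if the wayback lookup fails for any reason.
    try:
        url = latest_wayback_snapshot(game_id)
    except Exception as e:
        logging.error(e)
        url = f"http://www.j-archive.com/showgame.php?game_id={game_id}"
    return requests.get(url).text  # retrieve.py goes on to parse this HTML with BeautifulSoup

if __name__ == "__main__":
    html = fetch_game(7000)  # any numeric j-archive game id
    print(len(html))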
