
Commit

ia saving
LNRelease committed Oct 1, 2024
1 parent 102f616 commit 236fbcb
Showing 4 changed files with 45 additions and 44 deletions.
34 changes: 21 additions & 13 deletions lnrelease/session.py
@@ -2,7 +2,7 @@
 import re
 import warnings
 from dataclasses import dataclass
-from datetime import datetime
+from datetime import datetime, timezone
 from threading import Lock
 from time import perf_counter_ns, sleep, time
 from typing import Self
@@ -23,7 +23,7 @@
    'apple.co',
    'bit.ly',
 }
-IA = re.compile(r'https://web\.archive\.org/web/\d{14}/(?P<url>.+)')
+IA = re.compile(r'https://web\.archive\.org/web/(?P<time>\d{14})/(?P<url>.+)')
 
 
 @dataclass
@@ -205,22 +205,30 @@ def bing_cache(self, url: str, **kwargs) -> requests.Response | None:
        return (self._bing_cache(end, url, **kwargs)
                or self._bing_cache(netloc + end, url, **kwargs))
 
-    def ia_cache(self, url: str, **kwargs) -> requests.Response | None:
-        now = datetime.today().strftime('%Y%m%d%H%M%S')
-        link = f'https://web.archive.org/web/{now}/{url}'
-        return self.try_get(link, retries=5, **kwargs)
+    def ia_cache(self, url: str, ia_save: int = -1, **kwargs) -> requests.Response | None:
+        now = datetime.now(timezone.utc)
+        link = f'https://web.archive.org/web/{now.strftime("%Y%m%d%H%M%S")}/{url}'
+        page = self.try_get(link, retries=2, **kwargs)
+        if ia_save == -1:
+            return page
+        elif page and page.status_code != 404:
+            match = IA.fullmatch(page.url)
+            time = datetime.strptime(match.group('time') + 'Z', '%Y%m%d%H%M%S%z')
+            if (now - time).days < ia_save:
+                return page
+        link = f'http://web.archive.org/save/{url}'
+        return self.try_get(link, retries=2, **kwargs) if random.random() > 0.5 else page
 
-    def get_cache(self, url: str, **kwargs) -> requests.Response | None:
-        kwargs['headers'] = CHROME
-        google = self.google_cache(url, **kwargs)
+    def get_cache(self, url: str, ia_save: int, **kwargs) -> requests.Response | None:
+        google = self.google_cache(url, headers=CHROME, **kwargs)
        if google and google.status_code == 200:
            return google
 
-        bing = self.bing_cache(url, **kwargs)
+        bing = self.bing_cache(url, headers=CHROME, **kwargs)
        if bing:
            return bing
 
-        return self.ia_cache(url, **kwargs)
+        return self.ia_cache(url, ia_save=ia_save, **kwargs)
 
     def try_get(self, url: str, retries: int, **kwargs) -> requests.Response | None:
        netloc = urlparse(url).netloc
@@ -231,7 +239,7 @@ def try_get(self, url: str, retries: int, **kwargs) -> requests.Response | None:
        return None
 
     def get(self, url: str, direct: bool = True, web_cache: bool = False,
-            **kwargs) -> requests.Response | None:
+            ia_save: int = -1, **kwargs) -> requests.Response | None:
        kwargs.setdefault('timeout', 100)
        if match := IA.fullmatch(url):
            url = match.group('url')
@@ -240,7 +248,7 @@ def get(self, url: str, direct: bool = True, web_cache: bool = False,
        if web_cache and (not page or page.status_code == 403):
            REQUEST_STATS[urlparse(url).netloc].cache += 1
            self.set_retry(total=2, status_forcelist={500, 502, 503, 504})
-            page = self.get_cache(url, **kwargs)
+            page = self.get_cache(url, ia_save=ia_save, **kwargs)
            self.set_retry()
 
        if page and page.status_code not in (200, 404):
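For reference, a minimal sketch of how the new ia_save parameter is meant to flow into Session.get from a scraper, based only on the signatures shown above (the URL and the 7-day threshold are illustrative; compare the seven_seas.py call sites below):

    # Given a Session instance `session` from lnrelease's session module (construction
    # omitted): with web_cache=True the Wayback Machine is tried after the search-engine
    # caches, and ia_save=7 lets ia_cache trigger a fresh save when the snapshot it
    # finds is more than 7 days old.
    page = session.get('https://example.com/series/foo', web_cache=True, ia_save=7)
    if page and page.status_code == 200:
        ...  # parse page.content as usual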
48 changes: 18 additions & 30 deletions lnrelease/source/seven_seas.py
@@ -26,14 +26,14 @@ def strpdate(s: str) -> datetime.date:
 
 def parse(session: Session, link: str, series: Series) -> set[Info]:
    info = set()
-    page = session.get(link, web_cache=True)
+    page = session.get(link, web_cache=True, ia_save=7)
    soup = BeautifulSoup(page.content, 'lxml')
    digital = soup.find(string='Early Digital:')  # assume all volumes are either digital or not
    audio = False
    index = 0
    for release in soup.find_all(class_='series-volume'):
        index += 1
-        header = release.parent.find_previous_sibling('h3').text
+        header = release.find_previous('h3').text
        if format := release.find('b', string='Format:'):
            format = format.next_sibling.strip()
            if format in NON_FORMATS:
@@ -81,7 +81,7 @@ def scrape_full(series: set[Series], info: set[Info]) -> tuple[set[Series], set[
        links: dict[str, str] = {}
        url = 'https://sevenseasentertainment.com/tag/light-novels/'
        while url:
-            page = session.get(url, web_cache=True)
+            page = session.get(url, web_cache=True, ia_save=14)
            soup = BeautifulSoup(page.content, 'lxml')
            lst = soup.find_all(class_='series')
            if not lst:
@@ -91,32 +91,20 @@ def scrape_full(series: set[Series], info: set[Info]) -> tuple[set[Series], set[
            url = url.get('href') if url else None
 
            for a in lst:
-                link = a.get('href')
-                title = a.text
-                links.setdefault(link, title)
+                try:
+                    link = a.get('href')
+                    title = a.text
+                    serie = Series(None, title)
+                    prev = {i for i in info if i.serieskey == serie.key}
+                    if random() > 0.5 and prev and (
+                            today - max(i.date for i in prev)).days > 365:
+                        continue
+
+                    if inf := parse(session, link, serie):
+                        series.add(serie)
+                        info -= prev
+                        info |= inf
+                except Exception as e:
+                    warnings.warn(f'{link}: {e}', RuntimeWarning)
 
-        page = session.get('https://sevenseasentertainment.com/light-novels/', web_cache=True)
-        soup = BeautifulSoup(page.content, 'lxml')
-        lst = soup.find_all(class_='series')
-        if not lst:
-            warnings.warn(f'No series found: {page.url}', RuntimeWarning)
-        for a in lst:
-            link = a.get('href')
-            title = a.text
-            links.setdefault(link, title)
-
-        for link, title in links.items():
-            try:
-                serie = Series(None, title)
-                prev = {i for i in info if i.serieskey == serie.key}
-                if random() > 0.5 and prev and (
-                        today - max(i.date for i in prev)).days > 365:
-                    continue
-
-                if inf := parse(session, link, serie):
-                    series.add(serie)
-                    info -= prev
-                    info |= inf
-            except Exception as e:
-                warnings.warn(f'{link}: {e}', RuntimeWarning)
    return series, info
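The skip condition kept in the loop above is fairly dense; restated over plain dates, under the assumption that it only exists to avoid re-scraping long-dormant series:

    from datetime import date, timedelta
    from random import random

    def should_skip(release_dates: list[date], today: date) -> bool:
        # Roughly half the time, skip a series whose newest known release is more
        # than a year old (mirrors `random() > 0.5 and prev and ...` in the diff).
        return bool(release_dates) and random() > 0.5 and (
            today - max(release_dates)).days > 365

    # A series last updated two years ago is skipped about half the time:
    should_skip([date.today() - timedelta(days=730)], date.today())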
1 change: 1 addition & 0 deletions lnrelease/store/amazon.py
@@ -103,6 +103,7 @@ def parse(session: Session, links: list[str], *,
          ) -> tuple[utils.Series, set[utils.Info]] | None:
    session.set_retry(total=2, status_forcelist={500, 502, 503, 504})
    stats = REQUEST_STATS['www.amazon.com']
+    page = None
    if session.skip_google <= 0:
        for link in {urlparse(link)
                     ._replace(params='', query='', fragment='')
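The single added line above looks like a guard against `page` being referenced while unbound when the Google-cache branch is skipped; a hedged sketch of that failure mode (the helper names and control flow are stand-ins, not the real amazon.py code):

    def lookup(skip_google: int) -> str | None:
        page = None                       # the line added in this commit
        if skip_google <= 0:
            page = 'google cache result'  # placeholder for the real cache loop
        if page is None:                  # later code can now test `page` safely
            page = 'direct fetch result'  # instead of risking UnboundLocalError
        return page

    print(lookup(1))  # -> 'direct fetch result'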
6 changes: 5 additions & 1 deletion lnrelease/utils.py
@@ -12,8 +12,10 @@
 import store
 
 TITLE = re.compile(r' [\(\[](?:(?:light )?novels?|audio(?:book)?|(?:\w+ )?e?book)[\)\]]', flags=re.IGNORECASE)
-SERIES = re.compile(r'(?:\b|\s|,|:)+(?:[\(\[](?:(?:light )?novels?|audio(?:book)?|e?book)[\)\]]|(?:(vol\.|volume|part) \d[\d\-\.]*)|omnibus|(?:special|collector\'s) edition)(?:(?=\W)|$)', flags=re.IGNORECASE)
+SERIES = re.compile(
+    r'(?:\b|\s|,|:)+(?:[\(\[](?:(?:light )?novels?|audio(?:book)?|e?book)[\)\]]|(?:(vol\.|volume|part) \d[\d\-\.]*)|omnibus|(?:special|collector\'s) edition)(?:(?=\W)|$)', flags=re.IGNORECASE)
 NONWORD = re.compile(r'\W')
+IA = re.compile(r'https://web\.archive\.org/web/\d{14}/(?P<url>.+)')
 
 PHYSICAL = ('Physical', 'Hardcover', 'Hardback', 'Paperback')
 DIGITAL = ('Digital', 'eBook')
@@ -149,6 +151,8 @@ class Info:
    alts: list[str] = field(default_factory=list)
 
    def __post_init__(self) -> None:
+        if match := IA.fullmatch(self.link):
+            self.link = match.group('url')
        self.title = TITLE.sub('', self.title).replace('’', "'").strip()
        self.date = self.date or EPOCH
 
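To illustrate the new normalization in Info.__post_init__: a Wayback Machine link is unwrapped back to the original URL using the IA pattern added at the top of utils.py (the example URL is made up):

    import re

    IA = re.compile(r'https://web\.archive\.org/web/\d{14}/(?P<url>.+)')

    url = 'https://web.archive.org/web/20240101000000/https://example.com/book'
    if match := IA.fullmatch(url):
        url = match.group('url')
    assert url == 'https://example.com/book'  # stored links drop the Wayback wrapper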
