
Commit

ia saving
LNRelease committed Oct 1, 2024
1 parent 102f616 commit 236fbcb
Showing 4 changed files with 45 additions and 44 deletions.
34 changes: 21 additions & 13 deletions lnrelease/session.py
@@ -2,7 +2,7 @@
 import re
 import warnings
 from dataclasses import dataclass
-from datetime import datetime
+from datetime import datetime, timezone
 from threading import Lock
 from time import perf_counter_ns, sleep, time
 from typing import Self
@@ -23,7 +23,7 @@
    'apple.co',
    'bit.ly',
 }
-IA = re.compile(r'https://web\.archive\.org/web/\d{14}/(?P<url>.+)')
+IA = re.compile(r'https://web\.archive\.org/web/(?P<time>\d{14})/(?P<url>.+)')
 
 
 @dataclass
@@ -205,22 +205,30 @@ def bing_cache(self, url: str, **kwargs) -> requests.Response | None:
        return (self._bing_cache(end, url, **kwargs)
                or self._bing_cache(netloc + end, url, **kwargs))
 
-    def ia_cache(self, url: str, **kwargs) -> requests.Response | None:
-        now = datetime.today().strftime('%Y%m%d%H%M%S')
-        link = f'https://web.archive.org/web/{now}/{url}'
-        return self.try_get(link, retries=5, **kwargs)
+    def ia_cache(self, url: str, ia_save: int = -1, **kwargs) -> requests.Response | None:
+        now = datetime.now(timezone.utc)
+        link = f'https://web.archive.org/web/{now.strftime("%Y%m%d%H%M%S")}/{url}'
+        page = self.try_get(link, retries=2, **kwargs)
+        if ia_save == -1:
+            return page
+        elif page and page.status_code != 404:
+            match = IA.fullmatch(page.url)
+            time = datetime.strptime(match.group('time') + 'Z', '%Y%m%d%H%M%S%z')
+            if (now - time).days < ia_save:
+                return page
+        link = f'http://web.archive.org/save/{url}'
+        return self.try_get(link, retries=2, **kwargs) if random.random() > 0.5 else page
 
-    def get_cache(self, url: str, **kwargs) -> requests.Response | None:
-        kwargs['headers'] = CHROME
-        google = self.google_cache(url, **kwargs)
+    def get_cache(self, url: str, ia_save: int, **kwargs) -> requests.Response | None:
+        google = self.google_cache(url, headers=CHROME, **kwargs)
        if google and google.status_code == 200:
            return google
 
-        bing = self.bing_cache(url, **kwargs)
+        bing = self.bing_cache(url, headers=CHROME, **kwargs)
        if bing:
            return bing
 
-        return self.ia_cache(url, **kwargs)
+        return self.ia_cache(url, ia_save=ia_save, **kwargs)
 
     def try_get(self, url: str, retries: int, **kwargs) -> requests.Response | None:
        netloc = urlparse(url).netloc
@@ -231,7 +239,7 @@ def try_get(self, url: str, retries: int, **kwargs) -> requests.Response | None:
        return None
 
     def get(self, url: str, direct: bool = True, web_cache: bool = False,
-            **kwargs) -> requests.Response | None:
+            ia_save: int = -1, **kwargs) -> requests.Response | None:
        kwargs.setdefault('timeout', 100)
        if match := IA.fullmatch(url):
            url = match.group('url')
@@ -240,7 +248,7 @@ def get(self, url: str, direct: bool = True, web_cache: bool = False,
        if web_cache and (not page or page.status_code == 403):
            REQUEST_STATS[urlparse(url).netloc].cache += 1
            self.set_retry(total=2, status_forcelist={500, 502, 503, 504})
-            page = self.get_cache(url, **kwargs)
+            page = self.get_cache(url, ia_save=ia_save, **kwargs)
            self.set_retry()
 
        if page and page.status_code not in (200, 404):
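For reference, a minimal sketch of how the new ia_save parameter is meant to flow into Session.get from a scraper, based only on the signatures shown above (the URL and the 7-day threshold are illustrative; compare the seven_seas.py call sites below):

    # Given a Session instance `session` from lnrelease's session module (construction
    # omitted): with web_cache=True the Wayback Machine is tried after the search-engine
    # caches, and ia_save=7 lets ia_cache trigger a fresh save when the snapshot it
    # finds is more than 7 days old.
    page = session.get('https://example.com/series/foo', web_cache=True, ia_save=7)
    if page and page.status_code == 200:
        ...  # parse page.content as usual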
48 changes: 18 additions & 30 deletions lnrelease/source/seven_seas.py
@@ -26,14 +26,14 @@ def strpdate(s: str) -> datetime.date:
 
 def parse(session: Session, link: str, series: Series) -> set[Info]:
    info = set()
-    page = session.get(link, web_cache=True)
+    page = session.get(link, web_cache=True, ia_save=7)
    soup = BeautifulSoup(page.content, 'lxml')
    digital = soup.find(string='Early Digital:')  # assume all volumes are either digital or not
    audio = False
    index = 0
    for release in soup.find_all(class_='series-volume'):
        index += 1
-        header = release.parent.find_previous_sibling('h3').text
+        header = release.find_previous('h3').text
        if format := release.find('b', string='Format:'):
            format = format.next_sibling.strip()
            if format in NON_FORMATS:
@@ -81,7 +81,7 @@ def scrape_full(series: set[Series], info: set[Info]) -> tuple[set[Series], set[
        links: dict[str, str] = {}
        url = 'https://sevenseasentertainment.com/tag/light-novels/'
        while url:
-            page = session.get(url, web_cache=True)
+            page = session.get(url, web_cache=True, ia_save=14)
            soup = BeautifulSoup(page.content, 'lxml')
            lst = soup.find_all(class_='series')
            if not lst:
@@ -91,32 +91,20 @@ def scrape_full(series: set[Series], info: set[Info]) -> tuple[set[Series], set[
            url = url.get('href') if url else None
 
            for a in lst:
-                link = a.get('href')
-                title = a.text
-                links.setdefault(link, title)
+                try:
+                    link = a.get('href')
+                    title = a.text
+                    serie = Series(None, title)
+                    prev = {i for i in info if i.serieskey == serie.key}
+                    if random() > 0.5 and prev and (
+                            today - max(i.date for i in prev)).days > 365:
+                        continue
+
+                    if inf := parse(session, link, serie):
+                        series.add(serie)
+                        info -= prev
+                        info |= inf
+                except Exception as e:
+                    warnings.warn(f'{link}: {e}', RuntimeWarning)
 
-        page = session.get('https://sevenseasentertainment.com/light-novels/', web_cache=True)
-        soup = BeautifulSoup(page.content, 'lxml')
-        lst = soup.find_all(class_='series')
-        if not lst:
-            warnings.warn(f'No series found: {page.url}', RuntimeWarning)
-        for a in lst:
-            link = a.get('href')
-            title = a.text
-            links.setdefault(link, title)
-
-        for link, title in links.items():
-            try:
-                serie = Series(None, title)
-                prev = {i for i in info if i.serieskey == serie.key}
-                if random() > 0.5 and prev and (
-                        today - max(i.date for i in prev)).days > 365:
-                    continue
-
-                if inf := parse(session, link, serie):
-                    series.add(serie)
-                    info -= prev
-                    info |= inf
-            except Exception as e:
-                warnings.warn(f'{link}: {e}', RuntimeWarning)
    return series, info
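The skip condition kept in the loop above is fairly dense; restated over plain dates, under the assumption that it only exists to avoid re-scraping long-dormant series:

    from datetime import date, timedelta
    from random import random

    def should_skip(release_dates: list[date], today: date) -> bool:
        # Roughly half the time, skip a series whose newest known release is more
        # than a year old (mirrors `random() > 0.5 and prev and ...` in the diff).
        return bool(release_dates) and random() > 0.5 and (
            today - max(release_dates)).days > 365

    # A series last updated two years ago is skipped about half the time:
    should_skip([date.today() - timedelta(days=730)], date.today())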
1 change: 1 addition & 0 deletions lnrelease/store/amazon.py
@@ -103,6 +103,7 @@ def parse(session: Session, links: list[str], *,
          ) -> tuple[utils.Series, set[utils.Info]] | None:
    session.set_retry(total=2, status_forcelist={500, 502, 503, 504})
    stats = REQUEST_STATS['www.amazon.com']
+    page = None
    if session.skip_google <= 0:
        for link in {urlparse(link)
                     ._replace(params='', query='', fragment='')
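The single added line above looks like a guard against `page` being referenced while unbound when the Google-cache branch is skipped; a hedged sketch of that failure mode (the helper names and control flow are stand-ins, not the real amazon.py code):

    def lookup(skip_google: int) -> str | None:
        page = None                       # the line added in this commit
        if skip_google <= 0:
            page = 'google cache result'  # placeholder for the real cache loop
        if page is None:                  # later code can now test `page` safely
            page = 'direct fetch result'  # instead of risking UnboundLocalError
        return page

    print(lookup(1))  # -> 'direct fetch result'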
6 changes: 5 additions & 1 deletion lnrelease/utils.py
@@ -12,8 +12,10 @@
 import store
 
 TITLE = re.compile(r' [\(\[](?:(?:light )?novels?|audio(?:book)?|(?:\w+ )?e?book)[\)\]]', flags=re.IGNORECASE)
-SERIES = re.compile(r'(?:\b|\s|,|:)+(?:[\(\[](?:(?:light )?novels?|audio(?:book)?|e?book)[\)\]]|(?:(vol\.|volume|part) \d[\d\-\.]*)|omnibus|(?:special|collector\'s) edition)(?:(?=\W)|$)', flags=re.IGNORECASE)
+SERIES = re.compile(
+    r'(?:\b|\s|,|:)+(?:[\(\[](?:(?:light )?novels?|audio(?:book)?|e?book)[\)\]]|(?:(vol\.|volume|part) \d[\d\-\.]*)|omnibus|(?:special|collector\'s) edition)(?:(?=\W)|$)', flags=re.IGNORECASE)
 NONWORD = re.compile(r'\W')
+IA = re.compile(r'https://web\.archive\.org/web/\d{14}/(?P<url>.+)')
 
 PHYSICAL = ('Physical', 'Hardcover', 'Hardback', 'Paperback')
 DIGITAL = ('Digital', 'eBook')
@@ -149,6 +151,8 @@ class Info:
    alts: list[str] = field(default_factory=list)
 
    def __post_init__(self) -> None:
+        if match := IA.fullmatch(self.link):
+            self.link = match.group('url')
        self.title = TITLE.sub('', self.title).replace('’', "'").strip()
        self.date = self.date or EPOCH
 
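To illustrate the new normalization in Info.__post_init__: a Wayback Machine link is unwrapped back to the original URL using the IA pattern added at the top of utils.py (the example URL is made up):

    import re

    IA = re.compile(r'https://web\.archive\.org/web/\d{14}/(?P<url>.+)')

    url = 'https://web.archive.org/web/20240101000000/https://example.com/book'
    if match := IA.fullmatch(url):
        url = match.group('url')
    assert url == 'https://example.com/book'  # stored links drop the Wayback wrapper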
