From 43888ac7c668044bd3685fc22b07efac8a9b2ec7 Mon Sep 17 00:00:00 2001 From: Claromes Date: Fri, 21 Jun 2024 11:42:07 -0300 Subject: [PATCH] v1.0a2 - update parser and viz, add field parsed_archived_timestamp, review poetry config --- docs/api.rst | 2 + docs/contribute.rst | 2 +- docs/field_options.rst | 12 ++-- poetry.lock | 35 +++++++--- pyproject.toml | 10 +-- waybacktweets/_cli.py | 5 +- waybacktweets/api/parse.py | 26 +++++--- waybacktweets/api/visualize.py | 95 +++++++++++++++++++++++---- waybacktweets/config/field_options.py | 5 +- waybacktweets/utils/__init__.py | 2 + waybacktweets/utils/utils.py | 86 ++++++++++++++++++++---- 11 files changed, 221 insertions(+), 59 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index d0eb615..b068e10 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -55,12 +55,14 @@ Utils .. autofunction:: check_double_status .. autofunction:: check_pattern_tweet +.. autofunction:: check_url_scheme .. autofunction:: clean_tweet_url .. autofunction:: clean_wayback_machine_url .. autofunction:: delete_tweet_pathnames .. autofunction:: get_response .. autofunction:: is_tweet_url .. autofunction:: semicolon_parser +.. autofunction:: timestamp_parser Exceptions ------------ diff --git a/docs/contribute.rst b/docs/contribute.rst index 6bfb7cc..87dffee 100644 --- a/docs/contribute.rst +++ b/docs/contribute.rst @@ -16,7 +16,7 @@ If you have Python skills, contribute to the `code =4.0.1", markers = "python_version < \"3.11\""} [package.extras] all = ["altair-tiles (>=0.3.0)", "anywidget (>=0.9.0)", "pyarrow (>=11)", "vega-datasets (>=0.9.0)", "vegafusion[embed] (>=1.6.6)", "vl-convert-python (>=1.3.0)"] @@ -105,6 +106,8 @@ mypy-extensions = ">=0.4.3" packaging = ">=22.0" pathspec = ">=0.9.0" platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} [package.extras] colorama = ["colorama (>=0.4.3)"] @@ -304,18 +307,18 @@ files = [ [[package]] name = "filelock" -version = "3.15.1" +version = "3.15.3" description = "A platform independent file lock." 
optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.15.1-py3-none-any.whl", hash = "sha256:71b3102950e91dfc1bb4209b64be4dc8854f40e5f534428d8684f953ac847fac"}, - {file = "filelock-3.15.1.tar.gz", hash = "sha256:58a2549afdf9e02e10720eaa4d4470f56386d7a6f72edd7d0596337af8ed7ad8"}, + {file = "filelock-3.15.3-py3-none-any.whl", hash = "sha256:0151273e5b5d6cf753a61ec83b3a9b7d8821c39ae9af9d7ecf2f9e2f17404103"}, + {file = "filelock-3.15.3.tar.gz", hash = "sha256:e1199bf5194a2277273dacd50269f0d87d0682088a3c561c15674ea9005d8635"}, ] [package.extras] docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"] typing = ["typing-extensions (>=4.8)"] [[package]] @@ -346,6 +349,7 @@ files = [ [package.dependencies] Flake8 = ">=5" +TOMLi = {version = "*", markers = "python_version < \"3.11\""} [package.extras] dev = ["pyTest", "pyTest-cov"] @@ -732,6 +736,7 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -1326,6 +1331,7 @@ sphinxcontrib-htmlhelp = ">=2.0.0" sphinxcontrib-jsmath = "*" sphinxcontrib-qthelp = "*" sphinxcontrib-serializinghtml = ">=1.1.9" +tomli = {version = ">=2", markers = "python_version < \"3.11\""} [package.extras] docs = ["sphinxcontrib-websupport"] @@ -1334,13 +1340,13 @@ test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=6.0)", "setuptools [[package]] name = "sphinx-autodoc-typehints" -version = "2.1.1" +version = "2.2.0" description = "Type hints (PEP 484) support for the Sphinx autodoc extension" optional = false python-versions = ">=3.9" files = [ - {file = "sphinx_autodoc_typehints-2.1.1-py3-none-any.whl", hash = "sha256:22427d74786274add2b6d4afccb8b3c8c1843f48a704550f15a35fd948f8a4de"}, - {file = "sphinx_autodoc_typehints-2.1.1.tar.gz", hash = "sha256:0072b65f5ab2818c229d6d6c2cc993770af55d36bb7bfb16001e2fce4d14880c"}, + {file = "sphinx_autodoc_typehints-2.2.0-py3-none-any.whl", hash = "sha256:143e22dbb096cc39f1559d3accbe423e5fbf04d02849d6564e6471b5616bbd97"}, + {file = "sphinx_autodoc_typehints-2.2.0.tar.gz", hash = "sha256:a21f0120d8657545ad5ec269d7276b0718c367c8ff2fa8ad8767ddf2c660b909"}, ] [package.dependencies] @@ -1570,6 +1576,17 @@ files = [ {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + [[package]] name = "toolz" version = "0.12.1" @@ -1706,5 +1723,5 @@ watchmedo = ["PyYAML (>=3.10)"] [metadata] lock-version = "2.0" -python-versions = "^3.11" -content-hash = "37fcbc9255674bf67e65a2db35dbd71355fc97751141e739f31bb50fe708aa04" +python-versions = "^3.10" 
+content-hash = "4b34e093fd7034c803ee2d6b2a5598666e343c5b2562c2a2244d1528214bcacd" diff --git a/pyproject.toml b/pyproject.toml index dfc3e15..2085a84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,10 @@ [tool.poetry] name = "waybacktweets" -version = "1.0a1" +version = "1.0a2" description = "Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data." authors = ["Claromes "] license = "GPLv3" readme = "README.md" -repository = "https://github.com/claromes/waybacktweets" -documentation = "https://claromes.github.io/waybacktweets/" keywords = [ "twitter", "tweet", @@ -22,6 +20,7 @@ classifiers = [ "Intended Audience :: Science/Research", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Natural Language :: English", + "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Topic :: Software Development", "Topic :: Utilities", @@ -29,13 +28,13 @@ classifiers = [ exclude = ["app/**", "assets/**", "docs/**", ".streamlit/**"] [tool.poetry.urls] +"Homepage" = "https://claromes.github.io/waybacktweets/" "Documentation" = "https://claromes.github.io/waybacktweets/" "Issue Tracker" = "https://github.com/claromes/waybacktweets/issues" [tool.poetry.dependencies] -python = "^3.11" +python = "^3.10" requests = "^2.30.0" -streamlit = "1.35.0" rich = "^13.6.0" click = "^8.1.7" @@ -48,6 +47,7 @@ sphinx-click = "^6.0.0" sphinx-autodoc-typehints = "^2.1.1" [tool.poetry.group.dev.dependencies] +streamlit = "1.35.0" black = "^24.4.2" flake8 = "^7.0.0" isort = "^5.13.2" diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py index d189c20..d115c09 100644 --- a/waybacktweets/_cli.py +++ b/waybacktweets/_cli.py @@ -128,10 +128,11 @@ def main( field_options = [ "archived_urlkey", "archived_timestamp", - "original_tweet_url", + "parsed_archived_timestamp", "archived_tweet_url", - "parsed_tweet_url", "parsed_archived_tweet_url", + "original_tweet_url", + "parsed_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py index 19228f0..c6d3510 100644 --- a/waybacktweets/api/parse.py +++ b/waybacktweets/api/parse.py @@ -21,11 +21,13 @@ from waybacktweets.utils.utils import ( check_double_status, check_pattern_tweet, + check_url_scheme, clean_tweet_url, delete_tweet_pathnames, get_response, is_tweet_url, semicolon_parser, + timestamp_parser, ) @@ -203,23 +205,26 @@ def _process_response(self, response: List[str]) -> None: original_tweet = delete_tweet_pathnames( clean_tweet_url(cleaned_tweet, self.username) ) - parsed_wayback_machine_url = ( - f"https://web.archive.org/web/{response[1]}/{original_tweet}" - ) double_status = check_double_status(wayback_machine_url, original_tweet) if double_status: original_tweet = delete_tweet_pathnames( - f"https://twitter.com/{original_tweet}" + f"https://twitter.com{original_tweet}" ) elif "://" not in original_tweet: original_tweet = delete_tweet_pathnames(f"https://{original_tweet}") - encoded_tweet = semicolon_parser(response[2]) - encoded_archived_tweet = semicolon_parser(wayback_machine_url) - encoded_parsed_tweet = semicolon_parser(original_tweet) - encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url) + parsed_wayback_machine_url = ( + f"https://web.archive.org/web/{response[1]}/{original_tweet}" + ) + + encoded_archived_tweet = check_url_scheme(semicolon_parser(wayback_machine_url)) + encoded_parsed_archived_tweet = check_url_scheme( + 
semicolon_parser(parsed_wayback_machine_url) + ) + encoded_tweet = check_url_scheme(semicolon_parser(response[2])) + encoded_parsed_tweet = check_url_scheme(semicolon_parser(original_tweet)) available_tweet_text = None available_tweet_is_RT = None @@ -242,10 +247,11 @@ def _process_response(self, response: List[str]) -> None: self._add_field("archived_urlkey", response[0]) self._add_field("archived_timestamp", response[1]) - self._add_field("original_tweet_url", encoded_tweet) + self._add_field("parsed_archived_timestamp", timestamp_parser(response[1])) self._add_field("archived_tweet_url", encoded_archived_tweet) - self._add_field("parsed_tweet_url", encoded_parsed_tweet) self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet) + self._add_field("original_tweet_url", encoded_tweet) + self._add_field("parsed_tweet_url", encoded_parsed_tweet) self._add_field("archived_mimetype", response[3]) self._add_field("archived_statuscode", response[4]) self._add_field("archived_digest", response[5]) diff --git a/waybacktweets/api/visualize.py b/waybacktweets/api/visualize.py index 70824e7..369e0f1 100644 --- a/waybacktweets/api/visualize.py +++ b/waybacktweets/api/visualize.py @@ -6,6 +6,8 @@ import json from typing import Any, Dict, List +from waybacktweets.utils import timestamp_parser + class HTMLTweetsVisualizer: """ @@ -44,35 +46,100 @@ def generate(self) -> str: The generated HTML string. """ - html = f"\n\n@{self.username} archived tweets\n" + html = f"\n\n" + html += f"\n\n@{self.username}'s archived tweets\n" html += "\n" html += "\n\n" - html += f"
@{self.username} archived tweets\n"
+        html += f"@{self.username}'s archived tweets\n"
+        html += '\n'
-        for tweet in self.json_file_path:
+        for index, tweet in enumerate(self.json_file_path):
             html += '\n'
             if (
                 tweet["archived_mimetype"] != "application/json"
                 and not tweet["available_tweet_text"]
             ):
-                html += f'\n'
+                iframe_src = {
+                    "Archived Tweet": tweet["archived_tweet_url"],
+                    "Parsed Archived Tweet": tweet["parsed_archived_tweet_url"],
+                    "Original Tweet": tweet["original_tweet_url"],
+                    "Parsed Tweet": tweet["parsed_tweet_url"],
+                }
+
+                for key, value in iframe_src.items():
+                    key_cleaned = key.replace(" ", "_")
+
+                    html += f'{key}↗\n'
+                    html += '\n'
+                    html += (
+                        f'\n'
+                    )
+                    html += f'\n'
+                    html += '\n'
+
+                    html += f'Loading...\n'
+                    html += f'\n'
+                    html += "\n"
+                    html += "\n"
+
+                    html += """
+
+                    """.format(
+                        index=index, url=value, key_cleaned=key_cleaned
+                    )
-            html += f'Original Tweet↗ · \n'
-            html += f'Parsed Tweet↗ · \n'
-            html += f'Archived Tweet↗ · \n'
-            html += f'Parsed Archived Tweet↗\n'
+            html += "\n"
+            html += f'{tweet["original_tweet_url"]}\n'
             if tweet["available_tweet_text"]:
                 html += "\n"
@@ -82,8 +149,8 @@ def generate(self) -> str:
                 html += "\n"
             html += f'Archived URL Key: {tweet["archived_urlkey"]}\n'
-            html += f'Archived Timestamp: {tweet["archived_timestamp"]}\n'
-            html += f'Archived mimetype: {tweet["archived_mimetype"]}\n'
+            html += f'Archived Timestamp: {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})\n'
+            html += f'Archived mimetype: {tweet["archived_mimetype"]}\n'
             html += f'Archived Statuscode: {tweet["archived_statuscode"]}\n'
             html += (
                 f'Archived Digest: {tweet["archived_digest"]}\n'
             )
@@ -94,7 +161,7 @@ def generate(self) -> str:
             html += "\n"
             html += "\n"
-        html += 'generated by Wayback Tweets↗\n'
+        html += 'generated by Wayback Tweets↗
\n' html += "\n" return html diff --git a/waybacktweets/config/field_options.py b/waybacktweets/config/field_options.py index 9c1fcb6..1d36f03 100644 --- a/waybacktweets/config/field_options.py +++ b/waybacktweets/config/field_options.py @@ -5,10 +5,11 @@ FIELD_OPTIONS = [ "archived_urlkey", "archived_timestamp", - "original_tweet_url", + "parsed_archived_timestamp", "archived_tweet_url", - "parsed_tweet_url", "parsed_archived_tweet_url", + "original_tweet_url", + "parsed_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", diff --git a/waybacktweets/utils/__init__.py b/waybacktweets/utils/__init__.py index 8a76855..a6f3f7a 100644 --- a/waybacktweets/utils/__init__.py +++ b/waybacktweets/utils/__init__.py @@ -3,10 +3,12 @@ from waybacktweets.utils.utils import ( check_double_status, check_pattern_tweet, + check_url_scheme, clean_tweet_url, clean_wayback_machine_url, delete_tweet_pathnames, get_response, is_tweet_url, semicolon_parser, + timestamp_parser, ) diff --git a/waybacktweets/utils/utils.py b/waybacktweets/utils/utils.py index 89be2b6..52f6bc4 100644 --- a/waybacktweets/utils/utils.py +++ b/waybacktweets/utils/utils.py @@ -2,7 +2,9 @@ Utility functions for handling HTTP requests and manipulating URLs. """ +import html import re +from datetime import datetime from typing import Optional, Tuple import requests @@ -126,18 +128,24 @@ def check_pattern_tweet(tweet_url: str) -> str: Returns: Only the extracted URL from a tweet. """ - patterns = [ - re.compile(r'/status/"([^"]+)"'), - re.compile(r'/status/"([^"]+)"'), - re.compile(r'/status/%3B([^"]+)%3B'), - ] - - for pattern in patterns: - match = pattern.search(tweet_url) - if match: - return match.group(1).lstrip("/") + pattern = r'/status/((?:"(.*?)"|"(.*?)(?=&|$)|"%3B(.*?)(?=&|$)))' + match = re.search(pattern, tweet_url) + + if match: + if match.group(2): + parsed_tweet_url = match.group(2) + elif match.group(3): + parsed_tweet_url = match.group(3) + elif match.group(4): + parsed_tweet_url = match.group(4) else: - return tweet_url + parsed_tweet_url = "" + + parsed_tweet_url = html.unescape(parsed_tweet_url) + + return parsed_tweet_url + + return tweet_url def delete_tweet_pathnames(tweet_url: str) -> str: @@ -213,3 +221,59 @@ def is_tweet_url(twitter_url: str) -> bool: return True return False + + +def timestamp_parser(timestamp): + """ + Parses a timestamp into a formatted string. + + Args: + timestamp (str): The timestamp string to parse. + + Returns: + The parsed timestamp in the format "%Y/%m/%d %H:%M:%S", or None if the + timestamp could not be parsed. + """ + formats = [ + "%Y", + "%Y%m", + "%Y%m%d", + "%Y%m%d%H", + "%Y%m%d%H%M", + "%Y%m%d%H%M%S", + ] + + for fmt in formats: + try: + parsed_time = datetime.strptime(timestamp, fmt) + + formatted_time = parsed_time.strftime("%Y/%m/%d %H:%M:%S") + return formatted_time + except ValueError: + continue + + return None + + +def check_url_scheme(url): + """ + Corrects the URL scheme if it contains more than two slashes following the scheme. + + This function uses a regular expression to find 'http:' or 'https:' followed by two or more slashes. + It then replaces this with the scheme followed by exactly two slashes. + + Args: + url (str): The URL to be corrected. + + Returns: + The corrected URL. + """ # noqa: E501 + pattern = r"(http:|https:)(/{2,})" + + def replace_function(match): + scheme = match.group(1) + return f"{scheme}//" + + parsed_url = re.sub(pattern, replace_function, url) + + return parsed_url
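Usage note: the two helpers introduced in waybacktweets/utils/utils.py above are pure functions, so the behavior behind the new parsed_archived_timestamp field and the URL-scheme cleanup applied in parse.py can be checked in isolation. A minimal sketch, assuming the package is importable as waybacktweets (the import path matches the waybacktweets/utils/__init__.py hunk above); the outputs in the comments follow from the implementations in this patch:

    from waybacktweets.utils import check_url_scheme, timestamp_parser

    # A 14-digit Wayback Machine CDX timestamp becomes the human-readable value
    # stored in the new "parsed_archived_timestamp" field.
    print(timestamp_parser("20150102030405"))   # 2015/01/02 03:04:05

    # Shorter timestamps fall back to coarser formats; unspecified fields
    # default to the start of the period (day 01, midnight).
    print(timestamp_parser("201501"))           # 2015/01/01 00:00:00

    # Input matching none of the accepted formats returns None.
    print(timestamp_parser("not-a-timestamp"))  # None

    # check_url_scheme collapses runs of slashes immediately after http:/https:,
    # e.g. when a parsed archived URL ends up with a tripled slash.
    print(check_url_scheme("https:///web.archive.org/web/20150102030405/https://twitter.com/jack"))
    # https://web.archive.org/web/20150102030405/https://twitter.com/jack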