v1.0a2 - update parser and viz, add field parsed_archived_timestamp, …

…review poetry config
claromes · Jun 21, 2024 · 43888ac · 43888ac
1 parent 08ecb27
commit 43888ac
Show file tree

Hide file tree

Showing 11 changed files with 221 additions and 59 deletions.
diff --git a/docs/api.rst b/docs/api.rst
@@ -55,12 +55,14 @@ Utils
 
 .. autofunction:: check_double_status
 .. autofunction:: check_pattern_tweet
+.. autofunction:: check_url_scheme
 .. autofunction:: clean_tweet_url
 .. autofunction:: clean_wayback_machine_url
 .. autofunction:: delete_tweet_pathnames
 .. autofunction:: get_response
 .. autofunction:: is_tweet_url
 .. autofunction:: semicolon_parser
+.. autofunction:: timestamp_parser
 
 Exceptions
 ------------

diff --git a/docs/contribute.rst b/docs/contribute.rst
@@ -16,7 +16,7 @@ If you have Python skills, contribute to the `code <https://github.com/claromes/
 
 These are the prerequisites:
 
-- Python 3.11+
+- Python 3.10+
 - Poetry
 
 Install from the source, following the :ref:`installation` instructions.

diff --git a/docs/field_options.rst b/docs/field_options.rst
@@ -7,15 +7,17 @@ The package performs several parses to facilitate the analysis of archived tweet
 
 - ``archived_urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing.
 
-- ``archived_timestamp``: (`datetime`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format.
+- ``archived_timestamp``: (`str`) A 14-digit date-time representation in the ``YYYYMMDDhhmmss`` format.
 
-- ``original_tweet_url``: (`str`) The original tweet URL.
+- ``parsed_archived_timestamp``: (`str`) The ``archived_timestamp`` in human-readable format.
 
-- ``archived_tweet_url``: (`str`) The original archived URL.
+- ``archived_tweet_url``: (`str`) The archived URL.
 
-- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs, when necessary.  Check the :ref:`utils`.
+- ``parsed_archived_tweet_url``: (`str`) The archived URL after parsing. This parsed URL is not guaranteed to be archived; it is provided only as a convenience, because the originally archived URL does not always exist, due to changes in the URLs and web services of the social network Twitter. Check the :ref:`utils`.
 
-- ``parsed_archived_tweet_url``: (`str`) The original archived URL after parsing. It is not guaranteed that this option will be archived, it is just a facilitator, as the originally archived URL does not always exist, due to changes in URLs and web services of the social network Twitter. Check the :ref:`utils`.
+- ``original_tweet_url``: (`str`) The original tweet URL.
+
+- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs when necessary. Check the :ref:`utils`.
 
 - ``available_tweet_text``: (`str`) The tweet text extracted from the URL that is still available on the Twitter account.
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,12 +1,10 @@
 [tool.poetry]
 name = "waybacktweets"
-version = "1.0a1"
+version = "1.0a2"
 description = "Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data."
 authors = ["Claromes <[email protected]>"]
 license = "GPLv3"
 readme = "README.md"
-repository = "https://github.com/claromes/waybacktweets"
-documentation = "https://claromes.github.io/waybacktweets/"
 keywords = [
     "twitter",
     "tweet",
@@ -22,20 +20,21 @@ classifiers = [
     "Intended Audience :: Science/Research",
     "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
     "Natural Language :: English",
+    "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Topic :: Software Development",
     "Topic :: Utilities",
 ]
 exclude = ["app/**", "assets/**", "docs/**", ".streamlit/**"]
 
 [tool.poetry.urls]
+"Homepage" = "https://claromes.github.io/waybacktweets/"
 "Documentation" = "https://claromes.github.io/waybacktweets/"
 "Issue Tracker" = "https://github.com/claromes/waybacktweets/issues"
 
 [tool.poetry.dependencies]
-python = "^3.11"
+python = "^3.10"
 requests = "^2.30.0"
-streamlit = "1.35.0"
 rich = "^13.6.0"
 click = "^8.1.7"
 
@@ -48,6 +47,7 @@ sphinx-click = "^6.0.0"
 sphinx-autodoc-typehints = "^2.1.1"
 
 [tool.poetry.group.dev.dependencies]
+streamlit = "1.35.0"
 black = "^24.4.2"
 flake8 = "^7.0.0"
 isort = "^5.13.2"

diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py
@@ -128,10 +128,11 @@ def main(
             field_options = [
                 "archived_urlkey",
                 "archived_timestamp",
-                "original_tweet_url",
+                "parsed_archived_timestamp",
                 "archived_tweet_url",
-                "parsed_tweet_url",
                 "parsed_archived_tweet_url",
+                "original_tweet_url",
+                "parsed_tweet_url",
                 "available_tweet_text",
                 "available_tweet_is_RT",
                 "available_tweet_info",

diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py
@@ -21,11 +21,13 @@
 from waybacktweets.utils.utils import (
     check_double_status,
     check_pattern_tweet,
+    check_url_scheme,
     clean_tweet_url,
     delete_tweet_pathnames,
     get_response,
     is_tweet_url,
     semicolon_parser,
+    timestamp_parser,
 )
 
 
@@ -203,23 +205,26 @@ def _process_response(self, response: List[str]) -> None:
         original_tweet = delete_tweet_pathnames(
             clean_tweet_url(cleaned_tweet, self.username)
         )
-        parsed_wayback_machine_url = (
-            f"https://web.archive.org/web/{response[1]}/{original_tweet}"
-        )
 
         double_status = check_double_status(wayback_machine_url, original_tweet)
 
         if double_status:
             original_tweet = delete_tweet_pathnames(
-                f"https://twitter.com/{original_tweet}"
+                f"https://twitter.com{original_tweet}"
             )
         elif "://" not in original_tweet:
             original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")
 
-        encoded_tweet = semicolon_parser(response[2])
-        encoded_archived_tweet = semicolon_parser(wayback_machine_url)
-        encoded_parsed_tweet = semicolon_parser(original_tweet)
-        encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)
+        parsed_wayback_machine_url = (
+            f"https://web.archive.org/web/{response[1]}/{original_tweet}"
+        )
+
+        encoded_archived_tweet = check_url_scheme(semicolon_parser(wayback_machine_url))
+        encoded_parsed_archived_tweet = check_url_scheme(
+            semicolon_parser(parsed_wayback_machine_url)
+        )
+        encoded_tweet = check_url_scheme(semicolon_parser(response[2]))
+        encoded_parsed_tweet = check_url_scheme(semicolon_parser(original_tweet))
 
         available_tweet_text = None
         available_tweet_is_RT = None
@@ -242,10 +247,11 @@ def _process_response(self, response: List[str]) -> None:
 
         self._add_field("archived_urlkey", response[0])
         self._add_field("archived_timestamp", response[1])
-        self._add_field("original_tweet_url", encoded_tweet)
+        self._add_field("parsed_archived_timestamp", timestamp_parser(response[1]))
         self._add_field("archived_tweet_url", encoded_archived_tweet)
-        self._add_field("parsed_tweet_url", encoded_parsed_tweet)
         self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
+        self._add_field("original_tweet_url", encoded_tweet)
+        self._add_field("parsed_tweet_url", encoded_parsed_tweet)
         self._add_field("archived_mimetype", response[3])
         self._add_field("archived_statuscode", response[4])
         self._add_field("archived_digest", response[5])