From 43888ac7c668044bd3685fc22b07efac8a9b2ec7 Mon Sep 17 00:00:00 2001 From: Claromes Date: Fri, 21 Jun 2024 11:42:07 -0300 Subject: [PATCH] v1.0a2 - update parser and viz, add field parsed_archived_timestamp, review poetry config --- docs/api.rst | 2 + docs/contribute.rst | 2 +- docs/field_options.rst | 12 ++-- poetry.lock | 35 +++++++--- pyproject.toml | 10 +-- waybacktweets/_cli.py | 5 +- waybacktweets/api/parse.py | 26 +++++--- waybacktweets/api/visualize.py | 95 +++++++++++++++++++++++---- waybacktweets/config/field_options.py | 5 +- waybacktweets/utils/__init__.py | 2 + waybacktweets/utils/utils.py | 86 ++++++++++++++++++++---- 11 files changed, 221 insertions(+), 59 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index d0eb615..b068e10 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -55,12 +55,14 @@ Utils .. autofunction:: check_double_status .. autofunction:: check_pattern_tweet +.. autofunction:: check_url_scheme .. autofunction:: clean_tweet_url .. autofunction:: clean_wayback_machine_url .. autofunction:: delete_tweet_pathnames .. autofunction:: get_response .. autofunction:: is_tweet_url .. autofunction:: semicolon_parser +.. autofunction:: timestamp_parser Exceptions ------------ diff --git a/docs/contribute.rst b/docs/contribute.rst index 6bfb7cc..87dffee 100644 --- a/docs/contribute.rst +++ b/docs/contribute.rst @@ -16,7 +16,7 @@ If you have Python skills, contribute to the `code =4.0.1", markers = "python_version < \"3.11\""} [package.extras] all = ["altair-tiles (>=0.3.0)", "anywidget (>=0.9.0)", "pyarrow (>=11)", "vega-datasets (>=0.9.0)", "vegafusion[embed] (>=1.6.6)", "vl-convert-python (>=1.3.0)"] @@ -105,6 +106,8 @@ mypy-extensions = ">=0.4.3" packaging = ">=22.0" pathspec = ">=0.9.0" platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} [package.extras] colorama = ["colorama (>=0.4.3)"] @@ -304,18 +307,18 @@ files = [ [[package]] name = "filelock" -version = "3.15.1" +version = "3.15.3" description = "A platform independent file lock." 
optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.15.1-py3-none-any.whl", hash = "sha256:71b3102950e91dfc1bb4209b64be4dc8854f40e5f534428d8684f953ac847fac"}, - {file = "filelock-3.15.1.tar.gz", hash = "sha256:58a2549afdf9e02e10720eaa4d4470f56386d7a6f72edd7d0596337af8ed7ad8"}, + {file = "filelock-3.15.3-py3-none-any.whl", hash = "sha256:0151273e5b5d6cf753a61ec83b3a9b7d8821c39ae9af9d7ecf2f9e2f17404103"}, + {file = "filelock-3.15.3.tar.gz", hash = "sha256:e1199bf5194a2277273dacd50269f0d87d0682088a3c561c15674ea9005d8635"}, ] [package.extras] docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"] typing = ["typing-extensions (>=4.8)"] [[package]] @@ -346,6 +349,7 @@ files = [ [package.dependencies] Flake8 = ">=5" +TOMLi = {version = "*", markers = "python_version < \"3.11\""} [package.extras] dev = ["pyTest", "pyTest-cov"] @@ -732,6 +736,7 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -1326,6 +1331,7 @@ sphinxcontrib-htmlhelp = ">=2.0.0" sphinxcontrib-jsmath = "*" sphinxcontrib-qthelp = "*" sphinxcontrib-serializinghtml = ">=1.1.9" +tomli = {version = ">=2", markers = "python_version < \"3.11\""} [package.extras] docs = ["sphinxcontrib-websupport"] @@ -1334,13 +1340,13 @@ test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=6.0)", "setuptools [[package]] name = "sphinx-autodoc-typehints" -version = "2.1.1" +version = "2.2.0" description = "Type hints (PEP 484) support for the Sphinx autodoc extension" optional = false python-versions = ">=3.9" files = [ - {file = "sphinx_autodoc_typehints-2.1.1-py3-none-any.whl", hash = "sha256:22427d74786274add2b6d4afccb8b3c8c1843f48a704550f15a35fd948f8a4de"}, - {file = "sphinx_autodoc_typehints-2.1.1.tar.gz", hash = "sha256:0072b65f5ab2818c229d6d6c2cc993770af55d36bb7bfb16001e2fce4d14880c"}, + {file = "sphinx_autodoc_typehints-2.2.0-py3-none-any.whl", hash = "sha256:143e22dbb096cc39f1559d3accbe423e5fbf04d02849d6564e6471b5616bbd97"}, + {file = "sphinx_autodoc_typehints-2.2.0.tar.gz", hash = "sha256:a21f0120d8657545ad5ec269d7276b0718c367c8ff2fa8ad8767ddf2c660b909"}, ] [package.dependencies] @@ -1570,6 +1576,17 @@ files = [ {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + [[package]] name = "toolz" version = "0.12.1" @@ -1706,5 +1723,5 @@ watchmedo = ["PyYAML (>=3.10)"] [metadata] lock-version = "2.0" -python-versions = "^3.11" -content-hash = "37fcbc9255674bf67e65a2db35dbd71355fc97751141e739f31bb50fe708aa04" +python-versions = "^3.10" 
+content-hash = "4b34e093fd7034c803ee2d6b2a5598666e343c5b2562c2a2244d1528214bcacd" diff --git a/pyproject.toml b/pyproject.toml index dfc3e15..2085a84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,10 @@ [tool.poetry] name = "waybacktweets" -version = "1.0a1" +version = "1.0a2" description = "Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data." authors = ["Claromes "] license = "GPLv3" readme = "README.md" -repository = "https://github.com/claromes/waybacktweets" -documentation = "https://claromes.github.io/waybacktweets/" keywords = [ "twitter", "tweet", @@ -22,6 +20,7 @@ classifiers = [ "Intended Audience :: Science/Research", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Natural Language :: English", + "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Topic :: Software Development", "Topic :: Utilities", @@ -29,13 +28,13 @@ classifiers = [ exclude = ["app/**", "assets/**", "docs/**", ".streamlit/**"] [tool.poetry.urls] +"Homepage" = "https://claromes.github.io/waybacktweets/" "Documentation" = "https://claromes.github.io/waybacktweets/" "Issue Tracker" = "https://github.com/claromes/waybacktweets/issues" [tool.poetry.dependencies] -python = "^3.11" +python = "^3.10" requests = "^2.30.0" -streamlit = "1.35.0" rich = "^13.6.0" click = "^8.1.7" @@ -48,6 +47,7 @@ sphinx-click = "^6.0.0" sphinx-autodoc-typehints = "^2.1.1" [tool.poetry.group.dev.dependencies] +streamlit = "1.35.0" black = "^24.4.2" flake8 = "^7.0.0" isort = "^5.13.2" diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py index d189c20..d115c09 100644 --- a/waybacktweets/_cli.py +++ b/waybacktweets/_cli.py @@ -128,10 +128,11 @@ def main( field_options = [ "archived_urlkey", "archived_timestamp", - "original_tweet_url", + "parsed_archived_timestamp", "archived_tweet_url", - "parsed_tweet_url", "parsed_archived_tweet_url", + "original_tweet_url", + "parsed_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py index 19228f0..c6d3510 100644 --- a/waybacktweets/api/parse.py +++ b/waybacktweets/api/parse.py @@ -21,11 +21,13 @@ from waybacktweets.utils.utils import ( check_double_status, check_pattern_tweet, + check_url_scheme, clean_tweet_url, delete_tweet_pathnames, get_response, is_tweet_url, semicolon_parser, + timestamp_parser, ) @@ -203,23 +205,26 @@ def _process_response(self, response: List[str]) -> None: original_tweet = delete_tweet_pathnames( clean_tweet_url(cleaned_tweet, self.username) ) - parsed_wayback_machine_url = ( - f"https://web.archive.org/web/{response[1]}/{original_tweet}" - ) double_status = check_double_status(wayback_machine_url, original_tweet) if double_status: original_tweet = delete_tweet_pathnames( - f"https://twitter.com/{original_tweet}" + f"https://twitter.com{original_tweet}" ) elif "://" not in original_tweet: original_tweet = delete_tweet_pathnames(f"https://{original_tweet}") - encoded_tweet = semicolon_parser(response[2]) - encoded_archived_tweet = semicolon_parser(wayback_machine_url) - encoded_parsed_tweet = semicolon_parser(original_tweet) - encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url) + parsed_wayback_machine_url = ( + f"https://web.archive.org/web/{response[1]}/{original_tweet}" + ) + + encoded_archived_tweet = check_url_scheme(semicolon_parser(wayback_machine_url)) + encoded_parsed_archived_tweet = check_url_scheme( + 
semicolon_parser(parsed_wayback_machine_url) + ) + encoded_tweet = check_url_scheme(semicolon_parser(response[2])) + encoded_parsed_tweet = check_url_scheme(semicolon_parser(original_tweet)) available_tweet_text = None available_tweet_is_RT = None @@ -242,10 +247,11 @@ def _process_response(self, response: List[str]) -> None: self._add_field("archived_urlkey", response[0]) self._add_field("archived_timestamp", response[1]) - self._add_field("original_tweet_url", encoded_tweet) + self._add_field("parsed_archived_timestamp", timestamp_parser(response[1])) self._add_field("archived_tweet_url", encoded_archived_tweet) - self._add_field("parsed_tweet_url", encoded_parsed_tweet) self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet) + self._add_field("original_tweet_url", encoded_tweet) + self._add_field("parsed_tweet_url", encoded_parsed_tweet) self._add_field("archived_mimetype", response[3]) self._add_field("archived_statuscode", response[4]) self._add_field("archived_digest", response[5]) diff --git a/waybacktweets/api/visualize.py b/waybacktweets/api/visualize.py index 70824e7..369e0f1 100644 --- a/waybacktweets/api/visualize.py +++ b/waybacktweets/api/visualize.py @@ -6,6 +6,8 @@ import json from typing import Any, Dict, List +from waybacktweets.utils import timestamp_parser + class HTMLTweetsVisualizer: """ @@ -44,35 +46,100 @@ def generate(self) -> str: The generated HTML string. """ - html = f"\n\n@{self.username} archived tweets\n" + html = f"\n\n" + html += f"\n\n@{self.username}'s archived tweets\n" html += "\n" html += "\n\n" - html += f"
@{self.username} archived tweets\n"
+        html += f"@{self.username}'s archived tweets\n"
+        html += '\n'
-        for tweet in self.json_file_path:
+        for index, tweet in enumerate(self.json_file_path):
             html += '\n'
             if (
                 tweet["archived_mimetype"] != "application/json"
                 and not tweet["available_tweet_text"]
             ):
-                html += f'\n'
+                iframe_src = {
+                    "Archived Tweet": tweet["archived_tweet_url"],
+                    "Parsed Archived Tweet": tweet["parsed_archived_tweet_url"],
+                    "Original Tweet": tweet["original_tweet_url"],
+                    "Parsed Tweet": tweet["parsed_tweet_url"],
+                }
+
+                for key, value in iframe_src.items():
+                    key_cleaned = key.replace(" ", "_")
+
+                    html += f'{key}↗\n'
+                    html += '\n'
+                    html += (
+                        f'\n'
+                    )
+                    html += f'\n'
+                    html += '\n'
+
+                    html += f'Loading...\n'
+                    html += f'\n'
+                    html += "\n"
+                    html += "\n"
+
+                    html += """
+
+                    """.format(
+                        index=index, url=value, key_cleaned=key_cleaned
+                    )
-            html += f'Original Tweet↗ · \n'
-            html += f'Parsed Tweet↗ · \n'
-            html += f'Archived Tweet↗ · \n'
-            html += f'Parsed Archived Tweet↗\n'
+            html += "\n"
+            html += f'{tweet["original_tweet_url"]}\n'
             if tweet["available_tweet_text"]:
                 html += "\n"
@@ -82,8 +149,8 @@ def generate(self) -> str:
                 html += "\n"
             html += f'Archived URL Key: {tweet["archived_urlkey"]}\n'
-            html += f'Archived Timestamp: {tweet["archived_timestamp"]}\n'
-            html += f'Archived mimetype: {tweet["archived_mimetype"]}\n'
+            html += f'Archived Timestamp: {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})\n'
+            html += f'Archived mimetype: {tweet["archived_mimetype"]}\n'
             html += f'Archived Statuscode: {tweet["archived_statuscode"]}\n'
             html += (
                 f'Archived Digest: {tweet["archived_digest"]}\n'
             )
@@ -94,7 +161,7 @@ def generate(self) -> str:
             html += "\n"
             html += "\n"
-        html += 'generated by Wayback Tweets↗\n'
+        html += 'generated by Wayback Tweets↗
\n' html += "\n" return html diff --git a/waybacktweets/config/field_options.py b/waybacktweets/config/field_options.py index 9c1fcb6..1d36f03 100644 --- a/waybacktweets/config/field_options.py +++ b/waybacktweets/config/field_options.py @@ -5,10 +5,11 @@ FIELD_OPTIONS = [ "archived_urlkey", "archived_timestamp", - "original_tweet_url", + "parsed_archived_timestamp", "archived_tweet_url", - "parsed_tweet_url", "parsed_archived_tweet_url", + "original_tweet_url", + "parsed_tweet_url", "available_tweet_text", "available_tweet_is_RT", "available_tweet_info", diff --git a/waybacktweets/utils/__init__.py b/waybacktweets/utils/__init__.py index 8a76855..a6f3f7a 100644 --- a/waybacktweets/utils/__init__.py +++ b/waybacktweets/utils/__init__.py @@ -3,10 +3,12 @@ from waybacktweets.utils.utils import ( check_double_status, check_pattern_tweet, + check_url_scheme, clean_tweet_url, clean_wayback_machine_url, delete_tweet_pathnames, get_response, is_tweet_url, semicolon_parser, + timestamp_parser, ) diff --git a/waybacktweets/utils/utils.py b/waybacktweets/utils/utils.py index 89be2b6..52f6bc4 100644 --- a/waybacktweets/utils/utils.py +++ b/waybacktweets/utils/utils.py @@ -2,7 +2,9 @@ Utility functions for handling HTTP requests and manipulating URLs. """ +import html import re +from datetime import datetime from typing import Optional, Tuple import requests @@ -126,18 +128,24 @@ def check_pattern_tweet(tweet_url: str) -> str: Returns: Only the extracted URL from a tweet. """ - patterns = [ - re.compile(r'/status/"([^"]+)"'), - re.compile(r'/status/"([^"]+)"'), - re.compile(r'/status/%3B([^"]+)%3B'), - ] - - for pattern in patterns: - match = pattern.search(tweet_url) - if match: - return match.group(1).lstrip("/") + pattern = r'/status/((?:"(.*?)"|"(.*?)(?=&|$)|"%3B(.*?)(?=&|$)))' + match = re.search(pattern, tweet_url) + + if match: + if match.group(2): + parsed_tweet_url = match.group(2) + elif match.group(3): + parsed_tweet_url = match.group(3) + elif match.group(4): + parsed_tweet_url = match.group(4) else: - return tweet_url + parsed_tweet_url = "" + + parsed_tweet_url = html.unescape(parsed_tweet_url) + + return parsed_tweet_url + + return tweet_url def delete_tweet_pathnames(tweet_url: str) -> str: @@ -213,3 +221,59 @@ def is_tweet_url(twitter_url: str) -> bool: return True return False + + +def timestamp_parser(timestamp): + """ + Parses a timestamp into a formatted string. + + Args: + timestamp (str): The timestamp string to parse. + + Returns: + The parsed timestamp in the format "%Y/%m/%d %H:%M:%S", or None if the + timestamp could not be parsed. + """ + formats = [ + "%Y", + "%Y%m", + "%Y%m%d", + "%Y%m%d%H", + "%Y%m%d%H%M", + "%Y%m%d%H%M%S", + ] + + for fmt in formats: + try: + parsed_time = datetime.strptime(timestamp, fmt) + + formatted_time = parsed_time.strftime("%Y/%m/%d %H:%M:%S") + return formatted_time + except ValueError: + continue + + return None + + +def check_url_scheme(url): + """ + Corrects the URL scheme if it contains more than two slashes following the scheme. + + This function uses a regular expression to find 'http:' or 'https:' followed by two or more slashes. + It then replaces this with the scheme followed by exactly two slashes. + + Args: + url (str): The URL to be corrected. + + Returns: + The corrected URL. + """ # noqa: E501 + pattern = r"(http:|https:)(/{2,})" + + def replace_function(match): + scheme = match.group(1) + return f"{scheme}//" + + parsed_url = re.sub(pattern, replace_function, url) + + return parsed_url
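Usage note: the two helpers introduced in waybacktweets/utils/utils.py above are pure functions, so the behavior behind the new parsed_archived_timestamp field and the URL-scheme cleanup applied in parse.py can be checked in isolation. A minimal sketch, assuming the package is importable as waybacktweets (the import path matches the waybacktweets/utils/__init__.py hunk above); the outputs in the comments follow from the implementations in this patch:

    from waybacktweets.utils import check_url_scheme, timestamp_parser

    # A 14-digit Wayback Machine CDX timestamp becomes the human-readable value
    # stored in the new "parsed_archived_timestamp" field.
    print(timestamp_parser("20150102030405"))   # 2015/01/02 03:04:05

    # Shorter timestamps fall back to coarser formats; unspecified fields
    # default to the start of the period (day 01, midnight).
    print(timestamp_parser("201501"))           # 2015/01/01 00:00:00

    # Input matching none of the accepted formats returns None.
    print(timestamp_parser("not-a-timestamp"))  # None

    # check_url_scheme collapses runs of slashes immediately after http:/https:,
    # e.g. when a parsed archived URL ends up with a tripled slash.
    print(check_url_scheme("https:///web.archive.org/web/20150102030405/https://twitter.com/jack"))
    # https://web.archive.org/web/20150102030405/https://twitter.com/jack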