From b8886b77aae7d3443de5ccb685a2b4e6a8876c95 Mon Sep 17 00:00:00 2001
From: Martin Thomson <mt@lowentropy.net>
Date: Mon, 8 Jan 2024 12:36:46 +1100
Subject: [PATCH] Update IETF URLs

I admit that I have not completely tested all of the combinations
here, but these changes were a great help to me when building the
last pull request.
---
 activities.json |  32 ++++++-------
 activities.py   | 118 +++++++++++++++++++++++++++++++++---------------
 2 files changed, 97 insertions(+), 53 deletions(-)

diff --git a/activities.json b/activities.json
index 153a36b6..dcff569f 100644
--- a/activities.json
+++ b/activities.json
@@ -57,7 +57,7 @@
     "mozPositionIssue": 134,
     "org": "IETF",
     "title": "An HTTP Status Code for Indicating Hints (103)",
-    "url": "https://tools.ietf.org/html/rfc8297"
+    "url": "https://datatracker.ietf.org/doc/html/rfc8297"
   },
   {
     "ciuName": null,
@@ -142,7 +142,7 @@
     "mozPositionIssue": 264,
     "org": "Proposal",
     "title": "Bundled HTTP Exchanges",
-    "url": "https://tools.ietf.org/html/draft-yasskin-wpack-bundled-exchanges"
+    "url": "https://datatracker.ietf.org/doc/html/draft-yasskin-wpack-bundled-exchanges"
   },
   {
     "ciuName": "streams",
@@ -328,7 +328,7 @@
     "mozPositionIssue": 131,
     "org": "IETF",
     "title": "Cache Digests for HTTP/2",
-    "url": "https://tools.ietf.org/html/draft-ietf-httpbis-cache-digest"
+    "url": "https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-cache-digest"
   },
   {
     "ciuName": null,
@@ -534,7 +534,7 @@
     "mozPositionIssue": 139,
     "org": "IETF",
     "title": "Encrypted Server Name Indication for TLS 1.3",
-    "url": "https://tools.ietf.org/html/draft-ietf-tls-esni"
+    "url": "https://datatracker.ietf.org/doc/html/draft-ietf-tls-esni"
   },
   {
     "ciuName": null,
@@ -760,7 +760,7 @@
     "mozPositionIssue": 144,
     "org": "Proposal",
     "title": "HTTP Cache-Control Extensions for Stale Content",
-    "url": "https://tools.ietf.org/html/rfc5861"
+    "url": "https://datatracker.ietf.org/doc/html/rfc5861"
   },
   {
     "ciuName": "client-hints-dpr-width-viewport",
@@ -773,7 +773,7 @@
     "mozPositionIssue": 79,
     "org": "IETF",
     "title": "HTTP Client Hints",
-    "url": "https://tools.ietf.org/html/draft-ietf-httpbis-client-hints"
+    "url": "https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-client-hints"
   },
   {
     "ciuName": null,
@@ -833,7 +833,7 @@
     "mozPositionIssue": 260,
     "org": "Proposal",
     "title": "Incrementally Better Cookies",
-    "url": "https://tools.ietf.org/html/draft-west-cookie-incrementalism"
+    "url": "https://datatracker.ietf.org/doc/html/draft-west-cookie-incrementalism"
   },
   {
     "ciuName": null,
@@ -920,7 +920,7 @@
     "mozPositionIssue": 121,
     "org": "IETF",
     "title": "Let 'localhost' be localhost.",
-    "url": "https://tools.ietf.org/html/draft-ietf-dnsop-let-localhost-be-localhost"
+    "url": "https://datatracker.ietf.org/doc/html/draft-ietf-dnsop-let-localhost-be-localhost"
   },
   {
     "ciuName": null,
@@ -1318,7 +1318,7 @@
     "mozPositionIssue": 175,
     "org": "IETF",
     "title": "Secondary Certificate Authentication in HTTP/2",
-    "url": "https://tools.ietf.org/html/draft-ietf-httpbis-http2-secondary-certs"
+    "url": "https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-http2-secondary-certs"
   },
   {
     "ciuName": "mdn-api_serial",
@@ -1342,7 +1342,7 @@
     "mozPositionIssue": 208,
     "org": "IETF",
     "title": "Service binding and parameter specification via the DNS (DNS SVCB and HTTPSSVC)",
-    "url": "https://tools.ietf.org/html/draft-ietf-dnsop-svcb-httpssvc"
+    "url": "https://datatracker.ietf.org/doc/html/draft-ietf-dnsop-svcb-httpssvc"
   },
   {
     "ciuName": null,
@@ -1392,7 +1392,7 @@
     "mozPositionIssue": 29,
     "org": "Proposal",
     "title": "Signed HTTP Exchanges",
-    "url": "https://tools.ietf.org/html/draft-yasskin-http-origin-signed-responses"
+    "url": "https://datatracker.ietf.org/doc/html/draft-yasskin-http-origin-signed-responses"
   },
   {
     "ciuName": "",
@@ -1442,7 +1442,7 @@
     "mozPositionIssue": 256,
     "org": "IETF",
     "title": "Structured Headers for HTTP",
-    "url": "https://tools.ietf.org/html/draft-ietf-httpbis-header-structure"
+    "url": "https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-header-structure"
   },
   {
     "ciuName": null,
@@ -1505,7 +1505,7 @@
     "mozPositionIssue": 261,
     "org": "Proposal",
     "title": "The Privacy Pass Protocol",
-    "url": "https://tools.ietf.org/html/draft-privacy-pass"
+    "url": "https://datatracker.ietf.org/doc/html/draft-privacy-pass"
   },
   {
     "ciuName": null,
@@ -1530,7 +1530,7 @@
     "mozPositionIssue": 167,
     "org": "Proposal",
     "title": "The WebTransport Protocol Framework",
-    "url": "https://tools.ietf.org/html/draft-ietf-webtrans-overview"
+    "url": "https://datatracker.ietf.org/doc/html/draft-ietf-webtrans-overview"
   },
   {
     "ciuName": "mdn-javascript_operators_await_top_level",
@@ -1554,7 +1554,7 @@
     "mozPositionIssue": 96,
     "org": "IETF",
     "title": "Transport Layer Security (TLS) Certificate Compression",
-    "url": "https://tools.ietf.org/html/draft-ietf-tls-certificate-compression"
+    "url": "https://datatracker.ietf.org/doc/html/draft-ietf-tls-certificate-compression"
   },
   {
     "ciuName": null,
@@ -1930,7 +1930,7 @@
     "mozPositionIssue": 105,
     "org": "IETF",
     "title": "Zstandard Compression and the application/zstd Media Type",
-    "url": "https://tools.ietf.org/html/rfc8478"
+    "url": "https://datatracker.ietf.org/doc/html/rfc8478"
   },
   {
     "ciuName": "dialog",
diff --git a/activities.py b/activities.py
index b45c323d..9f290cfe 100755
--- a/activities.py
+++ b/activities.py
@@ -27,7 +27,9 @@
     from requests.auth import HTTPBasicAuth
 except ImportError:
     sys.stderr.write("ERROR: Dependency not available. Try:\n")
-    sys.stderr.write("       > pip3 install --user beautifulsoup4 requests html5lib\n\n")
+    sys.stderr.write(
+        "       > pip3 install --user beautifulsoup4 requests html5lib\n\n"
+    )
     sys.exit(1)
 
 
@@ -44,16 +46,18 @@ class IdType(object):
     "indicates an ID attribute."
     pass
 
+
 class UrlType(object):
     "indicates a URL."
     pass
 
+
 class UrlArrayType(object):
     "indicates a URL or array of URLs."
     pass
 
 
-StringType = type(u"")
+StringType = type("")
 ArrayType = type([])
 
 
@@ -67,7 +71,11 @@ class ActivitiesJson(object):
         ("title", True, StringType),
         ("description", True, StringType),
         ("ciuName", False, StringType),
-        ("org", True, ["W3C", "IETF", "Ecma", "WHATWG", "Unicode", "Proposal", "Other"]),
+        (
+            "org",
+            True,
+            ["W3C", "IETF", "Ecma", "WHATWG", "Unicode", "Proposal", "Other"],
+        ),
         ("group", False, StringType),
         ("url", True, UrlType),
         ("mdnUrl", False, UrlArrayType),
@@ -134,7 +142,9 @@ def entry_unique(self, spec_entry):
                 ["%s already contains id %s" % (self.filename, entry["id"])]
             )
         if entry["url"] in [e["url"] for e in self.data]:
-            raise ValueError(["%s already contains url %s" % (self.filename, entry["url"])])
+            raise ValueError(
+                ["%s already contains url %s" % (self.filename, entry["url"])]
+            )
 
     def validate(self, check_sorting):
         """
@@ -161,7 +171,11 @@ def validate(self, check_sorting):
 
             # Check that the entries are sorted by title, as save writes them.
             if check_sorting and prevTitle is not None and prevTitle > title:
-                errors.append("{} is sorted incorrectly based on its title (it should not be after {})".format(title, prevTitle))
+                errors.append(
+                    "{} is sorted incorrectly based on its title (it should not be after {})".format(
+                        title, prevTitle
+                    )
+                )
             prevTitle = title
         return errors
 
@@ -174,7 +188,7 @@ def validate_entry(self, entry, title=None, is_adding=False):
         if not title:
             title = "Entry"
         errors = []
-        for (name, required, value_type) in self.expected_entry_items:
+        for name, required, value_type in self.expected_entry_items:
             entry_value = entry.get(name, None)
             if required and not is_adding and entry_value is None:
                 errors.append("%s doesn't have required member %s" % (title, name))
@@ -185,25 +199,33 @@ def validate_entry(self, entry, title=None, is_adding=False):
                     if isinstance(entry_value, StringType):
                         for char in entry_value:
                             if char in string.whitespace:
-                                errors.append("%s's %s contains whitespace" % (title, name))
+                                errors.append(
+                                    "%s's %s contains whitespace" % (title, name)
+                                )
                     else:
                         errors.append("%s's %s isn't a string." % (title, name))
                 elif value_type == UrlType:
                     if isinstance(entry_value, StringType):
-                        pass # FIXME: validate URL more?
+                        pass  # FIXME: validate URL more?
                     else:
                         errors.append("%s's %s isn't a URL string." % (title, name))
                 elif value_type == UrlArrayType:
                     if isinstance(entry_value, StringType):
-                        pass # FIXME: validate URL more?
+                        pass  # FIXME: validate URL more?
                     elif isinstance(entry_value, ArrayType):
                         for url in entry_value:
                             if isinstance(url, StringType):
-                                pass # FIXME: validate URL more?
+                                pass  # FIXME: validate URL more?
                             else:
-                                errors.append("%s's %s isn't a URL string or array of them." % (title, name))
+                                errors.append(
+                                    "%s's %s isn't a URL string or array of them."
+                                    % (title, name)
+                                )
                     else:
-                        errors.append("%s's %s isn't a URL string or array of them." % (title, name))
+                        errors.append(
+                            "%s's %s isn't a URL string or array of them."
+                            % (title, name)
+                        )
                 elif isinstance(value_type, type):
                     if not isinstance(entry_value, value_type):
                         errors.append("%s's %s isn't a %s" % (title, name, value_type))
@@ -237,7 +259,7 @@ class SpecEntry(object):
     def __init__(self, spec_url):
         self.orig_url = spec_url
         self.data = {
-            "id": u"",
+            "id": "",
             "title": "",
             "description": None,
             "ciuName": None,
@@ -246,7 +268,7 @@ def __init__(self, spec_url):
             "mdnUrl": None,
             "mozBugUrl": None,
             "mozPositionIssue": None,
-            "mozPosition": u"under consideration",
+            "mozPosition": "under consideration",
             "mozPositionDetail": None,
         }
         self.parser = None
@@ -269,7 +291,8 @@ def figure_out_org(self):
             self.parser = WHATWGParser
         else:
             sys.stderr.write(
-                "* ERROR: Can't figure out what organisation %s belongs to! Using Proposal.\n" % host
+                "* ERROR: Can't figure out what organisation %s belongs to! Using Proposal.\n"
+                % host
             )
 
     def fetch_spec_data(self, url):
@@ -288,7 +311,7 @@ def fetch_spec_data(self, url):
         try:
             spec_data = self.parser().parse(soup, url)
         except BetterUrl as why:
-            new_url = why[0]
+            new_url = str(why)
             sys.stderr.write("* Trying <%s>...\n" % new_url)
             spec_data = self.fetch_spec_data(new_url)
         except FetchError:
@@ -438,10 +461,12 @@ def parse(self, spec, url_string):
             sys.exit(1)
         return data
 
+
 class W3CCGParser(W3CParser):
     "Parser for W3C community group specs"
     org = "Proposal"
 
+
 class WHATWGParser(W3CParser):
     "Parser for WHATWG specs"
     org = "WHATWG"
@@ -456,16 +481,20 @@ def get_meta(self, spec, names):
 
         Takes a list of names that are tried in sequence; if none are present, None is returned.
         """
-        try:
-            name = names.pop(0)
-        except IndexError:
-            return None
-        try:
-            return spec.head.find("meta", attrs={"name": name})["content"].replace(
-                "\n", " "
-            )
-        except (TypeError, AttributeError):
-            return self.get_meta(spec, names)
+        for name in names:
+            try:
+                return spec.head.find("meta", attrs={"name": name})["content"].replace(
+                    "\n", " "
+                )
+            except (TypeError, AttributeError):
+                pass
+            try:
+                return spec.head.find("meta", attrs={"property": name})[
+                    "content"
+                ].replace("\n", " ")
+            except (TypeError, AttributeError):
+                pass
+        return None
 
     def parse(self, spec, url_string):
         url = urlsplit(url_string)
@@ -482,16 +511,15 @@ def parse(self, spec, url_string):
                             self.html_url("rfc%s" % identifier.rsplit(":", 1)[1])
                         )
                 draft_name, draft_number = self.parse_draft_name(path_components[-1])
-                if draft_number:
-                    raise BetterUrl(self.html_url(draft_name))
+                raise BetterUrl(self.html_url(draft_name))
             elif path_components[1] in ["id", "pdf"]:
                 raise BetterUrl(self.html_url(path_components[2]))
             else:
                 raise FetchError("I don't think that's a specification.")
         elif url.netloc.lower() == "www.ietf.org" and path_components[1] == "id":
-            if path_components[1] in ["id", "pdf"]:
+            if path_components[1] in ["archive", "id", "pdf"]:
                 try:
-                    draft_name = path_components[2].rsplit(".", 1)[0]
+                    draft_name = path_components[-1].rsplit(".", 1)[0]
                 except ValueError:
                     draft_name = path_components[2]
                 draft_name = self.parse_draft_name(draft_name)[0]
@@ -500,18 +528,34 @@ def parse(self, spec, url_string):
                 raise FetchError("I don't think that's a specification.")
         elif url.netloc.lower() == "datatracker.ietf.org":
             if path_components[1] == "doc":
-                raise BetterUrl(self.html_url(path_components[2]))
+                draft_name, draft_number = self.parse_draft_name(path_components[-1])
+                if draft_number or path_components[2] != "html":
+                    raise BetterUrl(self.html_url(draft_name))
+            elif path_components[1] in ["archive", "id", "pdf"]:
+                raise BetterUrl(self.html_url(path_components[-1]))
             else:
                 raise FetchError("I don't think that's a specification.")
         data = {}
-        data["title"] = self.get_meta(spec, ["DC.Title"]) or spec.head.title.string
+        data["title"] = self.get_meta(
+            spec, ["og:title", "DC.Title"]
+        ) or spec.head.title.string.replace("\n", " ")
         data["description"] = (
             self.get_meta(
-                spec, ["description", "dcterms.abstract", "DC.Description.Abstract"]
+                spec,
+                [
+                    "og:description",
+                    "description",
+                    "dcterms.abstract",
+                    "DC.Description.Abstract",
+                ],
             )
             or ""
         )
-        is_ietf = draft_name.startswith("rfc") or draft_name.startswith("draft-ietf-") or draft_name.startswith("draft-irtf-")
+        is_ietf = (
+            draft_name.startswith("rfc")
+            or draft_name.startswith("draft-ietf-")
+            or draft_name.startswith("draft-irtf-")
+        )
         data["org"] = self.org = "IETF" if is_ietf else "Proposal"
         data["url"] = self.clean_url(url_string)
         return data
@@ -530,8 +574,8 @@ def parse_draft_name(instr):
     @staticmethod
     def html_url(doc_name):
         "Return the canonical URL for a document name."
-        path = "/".join(["html", doc_name])
-        return urlunsplit(["https", "tools.ietf.org", path, "", ""])
+        path = "/".join(["doc", "html", doc_name])
+        return urlunsplit(["https", "datatracker.ietf.org", path, "", ""])
 
 
 # Map of URL hostnames to org-specific parsers.
@@ -591,7 +635,7 @@ def usage():
 
     if VERB in ["validate", "add", "sort"]:
         ACTIVITIES = ActivitiesJson("activities.json")
-        ERRORS = ACTIVITIES.validate(check_sorting = (VERB != "sort"))
+        ERRORS = ACTIVITIES.validate(check_sorting=(VERB != "sort"))
         if ERRORS:
             sys.stderr.write("\n".join(["* ERROR: %s" % E for E in ERRORS]) + "\n")
             sys.exit(1)