Skip to content

Commit

Permalink
Fix invalid URLs in the linkToStandard property
Browse files Browse the repository at this point in the history
- Use both module and macro attributes to generate references.json
  • Loading branch information
russellkan committed May 12, 2020
1 parent 03003f3 commit d4b4b33
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 22 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ Current standard workarounds (as of rev.2020b):
| [Table A.84.3.2-1](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_A.84.3.2.html#table_A.84.3.2-1) contains a macro that has an extra "Macro" in its name ("Frame VOI LUT With LUT Macro") | `process_ciod_func_group_macro_relationship.py` |
| [Table C.8.25.16-8](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.8.25.16.8.html) has an include statement with an extra hierarchy marker (two instead of one) | `hierarchy_utils.py` |
| [Table TID 1004](http://dicom.nema.org/medical/dicom/current/output/chtml/part16/chapter_A.html#sect_TID_1004) has a section URL pattern ("sect_TID_1004") that doesn't exist within the HTML version of the standard | `parse_lib.py` |
| Certain subsections are located within the base section rather than having their own section (C.7.16.2.5.1 should be within C.7.16.2.5, but `sect_C.7.16.2.5.html` is invalid) | `parse_lib.py` |
| Certain subsections are located within the base section rather than having their own section (C.7.16.2.5.1 should be within C.7.16.2.5, but `sect_C.7.16.2.5.html` is invalid) | `parse_lib.py`<br>`extract_modules_macros_with_attributes.py` |
| \*[The Enhanced MR Color Image IOD](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_A.36.4.4.html) references the Enhanced MR Image IOD's functional group macros table instead of having its own (they would be identical tables) | `extract_ciod_func_group_macro_tables.py` |
| \*The "Content Creator's Name" attribute appears twice in [Table C.36.8-1](http://dicom.nema.org/medical/dicom/2019c/output/chtml/part03/sect_C.36.8.html#table_C.36.8-1) with the same hierarchy without a conditional statement | `postprocess_merge_duplicate_nodes.py` |
| \*[Table F.3-3](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_F.3.2.2.html#table_F.3-3) contains a "Record Selection Keys" attribute with an invalid tag ("See F.5") | `preprocess_modules_with_attributes.py` |
Expand Down
2 changes: 1 addition & 1 deletion dicom_standard/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ dist/module_to_attributes.json: tmp/module_to_attributes_no_duplicates.json dist
dist/macro_to_attributes.json: tmp/macros_attributes_updated_references.json
$(PYTHONPATH_PREFIX) python3 postprocess_merge_duplicate_nodes.py $< > $@

dist/references.json: tmp/modules_attributes_partial_references.json tmp/raw_section_tables.json
dist/references.json: tmp/modules_attributes_partial_references.json tmp/macros_attributes_partial_references.json tmp/raw_section_tables.json
$(PYTHONPATH_PREFIX) python3 postprocess_save_references.py $^ > $@


Expand Down
15 changes: 14 additions & 1 deletion dicom_standard/extract_modules_macros_with_attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
MACRO_TABLE_SUFFIX = re.compile("(.*Macro Attributes$)|(.*Macro Attributes Description$)")
COLUMN_TITLES_WITH_TYPE = ['name', 'tag', 'type', 'description']
COLUMN_TITLES_NO_TYPE = ['name', 'tag', 'description']
# Matches URLs that contain a known nonstandard section ID. Three capture groups:
# (1) everything before the nonstandard ID, (2) the nonstandard ID itself,
# (3) ".html" plus any trailing fragment. The dot in "\.html" is escaped so the
# literal file extension is matched, not any character followed by "html"
# (URLs here also contain "chtml" path segments, so an unescaped dot is fragile).
VALID_URL_PATTERN = re.compile(r'(.*)(' + '|'.join(pl.NONSTANDARD_SECTION_IDS) + r').*(\.html.*)')


def get_module_macro_tables(standard: BeautifulSoup) -> Tuple[List[TableListType], List[Tag]]:
Expand All @@ -45,6 +46,18 @@ def module_table_to_dict(table: StringifiedTableListType) -> List[TableDictType]
return table_to_dict(table, column_titles)


def fix_nonstandard_section_links(link: str) -> str:
    '''
    Standard workaround: certain subsections of the DICOM standard live inside
    their base section's HTML page instead of having a page of their own, so a
    URL generated from the full subsection ID is invalid.
    Ex: C.7.16.2.5.1 is located within C.7.16.2.5, but "sect_C.7.16.2.5.1.html" is invalid.

    VALID_URL_PATTERN has three capturing groups: everything before a
    nonstandard section ID, the nonstandard ID itself, and ".html" plus any
    trailing fragment. The substitution keeps those three groups, dropping the
    extraneous subsection numbers between the ID and ".html" that produce
    invalid links. For example:
    "http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.7.6.16.2.3.html#table_C.7.6.16-4"
    becomes
    "http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.7.6.16.2.html#table_C.7.6.16-4"

    Links containing no nonstandard section ID are returned unchanged
    (re.sub with no match is a no-op).
    '''
    return VALID_URL_PATTERN.sub(r'\1\2\3', link)


def get_table_with_metadata(table_with_tdiv: Tuple[List[TableDictType], Tag]) -> MetadataTableType:
table, tdiv = table_with_tdiv
table_name = pr.table_name(tdiv)
Expand All @@ -56,7 +69,7 @@ def get_table_with_metadata(table_with_tdiv: Tuple[List[TableDictType], Tag]) ->
'attributes': table,
'id': pl.create_slug(clean_name),
'description': str(clean_table_description(table_description, is_macro)),
'linkToStandard': get_short_standard_link(tdiv),
'linkToStandard': fix_nonstandard_section_links(get_short_standard_link(tdiv)),
'isMacro': is_macro,
}

Expand Down
2 changes: 1 addition & 1 deletion dicom_standard/parse_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def get_standard_page(sect_id: str) -> str:
'''
try:
# TODO: Remove if block (and constant) once links for subsections exist (Issue #10 and related sections)
invalid_sect_id_match = re.match(ID_PATTERN, sect_id)
invalid_sect_id_match = ID_PATTERN.match(sect_id)
if invalid_sect_id_match:
# Standard workaround: For some reason, certain subsections are located within the base section, so return only the valid part
# Ex: C.7.16.2.5.1 should be within C.7.16.2.5, but "sect_C.7.16.2.5.html" is invalid
Expand Down
8 changes: 4 additions & 4 deletions dicom_standard/postprocess_mark_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def get_valid_reference_anchors(parsed_html):
return [a for a in anchor_tags if not re.match(IGNORED_REFS_RE, a['href'])]


def record_references_inside_pairs(module_attr_pairs):
updated_pairs = [record_reference_in_pair(pair) for pair in module_attr_pairs]
def record_references_inside_pairs(pairs):
    """Run record_reference_in_pair over every pair, returning the updated list."""
    return [record_reference_in_pair(pair) for pair in pairs]


Expand Down Expand Up @@ -51,6 +51,6 @@ def mark_as_recorded(anchor):


if __name__ == '__main__':
module_attr_pairs = pl.read_json_data(sys.argv[1])
updated_pairs = record_references_inside_pairs(module_attr_pairs)
pairs = pl.read_json_data(sys.argv[1])
updated_pairs = record_references_inside_pairs(pairs)
pl.write_pretty_json(updated_pairs)
9 changes: 6 additions & 3 deletions dicom_standard/postprocess_save_references.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
'''
Save reference HTML into a separate JSON file.
'''
from typing import cast, List
import sys
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from dicom_standard import parse_lib as pl
from dicom_standard.macro_utils import MetadataTableType


def find_reference_html_in_sections(pairs, section_listing):
Expand Down Expand Up @@ -60,7 +62,8 @@ def get_location_from_ref(ref):


if __name__ == '__main__':
module_attr_pairs = pl.read_json_data(sys.argv[1])
section_listing = pl.read_json_data(sys.argv[2])
references = find_reference_html_in_sections(module_attr_pairs, section_listing)
module_attr_pairs = cast(List[MetadataTableType], pl.read_json_data(sys.argv[1]))
macro_attr_pairs = cast(List[MetadataTableType], pl.read_json_data(sys.argv[2]))
section_listing = pl.read_json_data(sys.argv[3])
references = find_reference_html_in_sections(module_attr_pairs + macro_attr_pairs, section_listing)
pl.write_pretty_json(references)
19 changes: 8 additions & 11 deletions dicom_standard/postprocess_update_reference_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,17 @@
from dicom_standard import parse_lib as pl


def update_sourceurls(module_attr_pairs, references):
for pair in module_attr_pairs:
def update_sourceurls(pairs, references):
ref_fragments = {url.split('#')[-1]: url for url in references.keys()}
for pair in pairs:
for ref in pair['externalReferences']:
for source_url in references.keys():
reference_fragment = source_url.split('#')[-1]
pair_fragment = ref['sourceUrl'].split('#')[-1]
if pair_fragment == reference_fragment:
ref['sourceUrl'] = source_url
break
return module_attr_pairs
pair_fragment = ref['sourceUrl'].split('#')[-1]
ref['sourceUrl'] = ref_fragments[pair_fragment]
return pairs


if __name__ == '__main__':
module_attr_pairs = pl.read_json_data(sys.argv[1])
pairs = pl.read_json_data(sys.argv[1])
references = pl.read_json_data(sys.argv[2])
updated_pairs = update_sourceurls(module_attr_pairs, references)
updated_pairs = update_sourceurls(pairs, references)
pl.write_pretty_json(updated_pairs)

0 comments on commit d4b4b33

Please sign in to comment.