diff --git a/README.md b/README.md index 58bc0c2..f57a56e 100644 --- a/README.md +++ b/README.md @@ -241,7 +241,7 @@ Current standard workarounds (as of rev.2020b): | [Table A.84.3.2-1](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_A.84.3.2.html#table_A.84.3.2-1) contains a macro that has an extra "Macro" in its name ("Frame VOI LUT With LUT Macro") | `process_ciod_func_group_macro_relationship.py` | | [Table C.8.25.16-8](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.8.25.16.8.html) has an include statement with an extra hierarchy marker (two instead of one) | `hierarchy_utils.py` | | [Table TID 1004](http://dicom.nema.org/medical/dicom/current/output/chtml/part16/chapter_A.html#sect_TID_1004) has a section URL pattern ("sect_TID_1004") that doesn't exist within the HTML version of the standard | `parse_lib.py` | -| Certain subsections are located within the base section rather than having their own section (C.7.16.2.5.1 should be within C.7.16.2.5, but `sect_C.7.16.2.5.html` is invalid) | `parse_lib.py` | +| Certain subsections are located within the base section rather than having their own section (C.7.16.2.5.1 should be within C.7.16.2.5, but `sect_C.7.16.2.5.html` is invalid) | `parse_lib.py`
`extract_modules_macros_with_attributes.py` | | \*[The Enhanced MR Color Image IOD](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_A.36.4.4.html) references the Enhanced MR Image IOD's functional group macros table instead of having its own (they would be identical tables) | `extract_ciod_func_group_macro_tables.py` | | \*The "Content Creator's Name" attribute appears twice in [Table C.36.8-1](http://dicom.nema.org/medical/dicom/2019c/output/chtml/part03/sect_C.36.8.html#table_C.36.8-1) with the same hierarchy without a conditional statement | `postprocess_merge_duplicate_nodes.py` | | \*[Table F.3-3](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_F.3.2.2.html#table_F.3-3) contains a "Record Selection Keys" attribute with an invalid tag ("See F.5") | `preprocess_modules_with_attributes.py` | diff --git a/dicom_standard/Makefile b/dicom_standard/Makefile index 03fe780..b8be186 100644 --- a/dicom_standard/Makefile +++ b/dicom_standard/Makefile @@ -49,7 +49,7 @@ dist/module_to_attributes.json: tmp/module_to_attributes_no_duplicates.json dist dist/macro_to_attributes.json: tmp/macros_attributes_updated_references.json $(PYTHONPATH_PREFIX) python3 postprocess_merge_duplicate_nodes.py $< > $@ -dist/references.json: tmp/modules_attributes_partial_references.json tmp/raw_section_tables.json +dist/references.json: tmp/modules_attributes_partial_references.json tmp/macros_attributes_partial_references.json tmp/raw_section_tables.json $(PYTHONPATH_PREFIX) python3 postprocess_save_references.py $^ > $@ diff --git a/dicom_standard/extract_modules_macros_with_attributes.py b/dicom_standard/extract_modules_macros_with_attributes.py index c6d9fd3..6b807bf 100644 --- a/dicom_standard/extract_modules_macros_with_attributes.py +++ b/dicom_standard/extract_modules_macros_with_attributes.py @@ -25,6 +25,7 @@ MACRO_TABLE_SUFFIX = re.compile("(.*Macro Attributes$)|(.*Macro Attributes Description$)") COLUMN_TITLES_WITH_TYPE = ['name', 'tag', 'type', 'description'] COLUMN_TITLES_NO_TYPE = ['name', 'tag', 'description'] +VALID_URL_PATTERN = re.compile(r'(.*)(' + '|'.join(pl.NONSTANDARD_SECTION_IDS) + r').*(.html.*)') def get_module_macro_tables(standard: BeautifulSoup) -> Tuple[List[TableListType], List[Tag]]: @@ -45,6 +46,18 @@ def module_table_to_dict(table: StringifiedTableListType) -> List[TableDictType] return table_to_dict(table, column_titles) +def fix_nonstandard_section_links(link: str) -> str: + ''' + Standard workaround: For some reason, certain subsections are located within the base section, so return only the valid part + Ex: C.7.16.2.5.1 should be within C.7.16.2.5, but "sect_C.7.16.2.5.html" is invalid + The pattern has three capturing groups: anything before a nonstandard section ID, the nonstandard ID, and an instance of ".html" with anything after + The substitution removes the extraneous subsection numbers that produce invalid links. + "http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.7.6.16.2.3.html#table_C.7.6.16-4" is replaced with + "http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.7.6.16.2.html#table_C.7.6.16-4" + ''' + return VALID_URL_PATTERN.sub(r'\1\2\3', link) + + def get_table_with_metadata(table_with_tdiv: Tuple[List[TableDictType], Tag]) -> MetadataTableType: table, tdiv = table_with_tdiv table_name = pr.table_name(tdiv) @@ -56,7 +69,7 @@ def get_table_with_metadata(table_with_tdiv: Tuple[List[TableDictType], Tag]) -> 'attributes': table, 'id': pl.create_slug(clean_name), 'description': str(clean_table_description(table_description, is_macro)), - 'linkToStandard': get_short_standard_link(tdiv), + 'linkToStandard': fix_nonstandard_section_links(get_short_standard_link(tdiv)), 'isMacro': is_macro, } diff --git a/dicom_standard/parse_lib.py b/dicom_standard/parse_lib.py index 08eb525..8341b11 100644 --- a/dicom_standard/parse_lib.py +++ b/dicom_standard/parse_lib.py @@ -194,7 +194,7 @@ def get_standard_page(sect_id: str) -> str: ''' try: # TODO: Remove if block (and constant) once URL once links for subsections exist (Issue #10 and related sections) - invalid_sect_id_match = re.match(ID_PATTERN, sect_id) + invalid_sect_id_match = ID_PATTERN.match(sect_id) if invalid_sect_id_match: # Standard workaround: For some reason, certain subsections are located within the base section, so return only the valid part # Ex: C.7.16.2.5.1 should be within C.7.16.2.5, but "sect_C.7.16.2.5.html" is invalid diff --git a/dicom_standard/postprocess_mark_references.py b/dicom_standard/postprocess_mark_references.py index f6f92cb..dc6939b 100644 --- a/dicom_standard/postprocess_mark_references.py +++ b/dicom_standard/postprocess_mark_references.py @@ -17,8 +17,8 @@ def get_valid_reference_anchors(parsed_html): return [a for a in anchor_tags if not re.match(IGNORED_REFS_RE, a['href'])] -def record_references_inside_pairs(module_attr_pairs): - updated_pairs = [record_reference_in_pair(pair) for pair in module_attr_pairs] +def record_references_inside_pairs(pairs): + updated_pairs = [record_reference_in_pair(pair) for pair in pairs] return updated_pairs @@ -51,6 +51,6 @@ def mark_as_recorded(anchor): if __name__ == '__main__': - module_attr_pairs = pl.read_json_data(sys.argv[1]) - updated_pairs = record_references_inside_pairs(module_attr_pairs) + pairs = pl.read_json_data(sys.argv[1]) + updated_pairs = record_references_inside_pairs(pairs) pl.write_pretty_json(updated_pairs) diff --git a/dicom_standard/postprocess_save_references.py b/dicom_standard/postprocess_save_references.py index 1552b84..314ad47 100644 --- a/dicom_standard/postprocess_save_references.py +++ b/dicom_standard/postprocess_save_references.py @@ -1,6 +1,7 @@ ''' Save reference HTML into a separate JSON file. ''' +from typing import cast, List import sys import re from urllib.parse import urljoin @@ -8,6 +9,7 @@ from bs4 import BeautifulSoup from dicom_standard import parse_lib as pl +from dicom_standard.macro_utils import MetadataTableType def find_reference_html_in_sections(pairs, section_listing): @@ -60,7 +62,8 @@ def get_location_from_ref(ref): if __name__ == '__main__': - module_attr_pairs = pl.read_json_data(sys.argv[1]) - section_listing = pl.read_json_data(sys.argv[2]) - references = find_reference_html_in_sections(module_attr_pairs, section_listing) + module_attr_pairs = cast(List[MetadataTableType], pl.read_json_data(sys.argv[1])) + macro_attr_pairs = cast(List[MetadataTableType], pl.read_json_data(sys.argv[2])) + section_listing = pl.read_json_data(sys.argv[3]) + references = find_reference_html_in_sections(module_attr_pairs + macro_attr_pairs, section_listing) pl.write_pretty_json(references) diff --git a/dicom_standard/postprocess_update_reference_links.py b/dicom_standard/postprocess_update_reference_links.py index 3adda98..cf5bcf2 100644 --- a/dicom_standard/postprocess_update_reference_links.py +++ b/dicom_standard/postprocess_update_reference_links.py @@ -3,20 +3,17 @@ from dicom_standard import parse_lib as pl -def update_sourceurls(module_attr_pairs, references): - for pair in module_attr_pairs: +def update_sourceurls(pairs, references): + ref_fragments = {url.split('#')[-1]: url for url in references.keys()} + for pair in pairs: for ref in pair['externalReferences']: - for source_url in references.keys(): - reference_fragment = source_url.split('#')[-1] - pair_fragment = ref['sourceUrl'].split('#')[-1] - if pair_fragment == reference_fragment: - ref['sourceUrl'] = source_url - break - return module_attr_pairs + pair_fragment = ref['sourceUrl'].split('#')[-1] + ref['sourceUrl'] = ref_fragments[pair_fragment] + return pairs if __name__ == '__main__': - module_attr_pairs = pl.read_json_data(sys.argv[1]) + pairs = pl.read_json_data(sys.argv[1]) references = pl.read_json_data(sys.argv[2]) - updated_pairs = update_sourceurls(module_attr_pairs, references) + updated_pairs = update_sourceurls(pairs, references) pl.write_pretty_json(updated_pairs)