Skip to content

Commit

Permalink
Fix invalid URLs in the linkToStandard property
Browse files Browse the repository at this point in the history
- Use both module and macro attributes to generate references.json
  • Loading branch information
russellkan committed May 12, 2020
1 parent 03003f3 commit d4b4b33
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 22 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ Current standard workarounds (as of rev.2020b):
| [Table A.84.3.2-1](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_A.84.3.2.html#table_A.84.3.2-1) contains a macro that has an extra "Macro" in its name ("Frame VOI LUT With LUT Macro") | `process_ciod_func_group_macro_relationship.py` |
| [Table C.8.25.16-8](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.8.25.16.8.html) has an include statement with an extra hierarchy marker (two instead of one) | `hierarchy_utils.py` |
| [Table TID 1004](http://dicom.nema.org/medical/dicom/current/output/chtml/part16/chapter_A.html#sect_TID_1004) has a section URL pattern ("sect_TID_1004") that doesn't exist within the HTML version of the standard | `parse_lib.py` |
| Certain subsections are located within the base section rather than having their own section (C.7.16.2.5.1 should be within C.7.16.2.5, but `sect_C.7.16.2.5.html` is invalid) | `parse_lib.py` |
| Certain subsections are located within the base section rather than having their own section (C.7.16.2.5.1 should be within C.7.16.2.5, but `sect_C.7.16.2.5.html` is invalid) | `parse_lib.py`<br>`extract_modules_macros_with_attributes.py` |
| \*[The Enhanced MR Color Image IOD](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_A.36.4.4.html) references the Enhanced MR Image IOD's functional group macros table instead of having its own (they would be identical tables) | `extract_ciod_func_group_macro_tables.py` |
| \*The "Content Creator's Name" attribute appears twice in [Table C.36.8-1](http://dicom.nema.org/medical/dicom/2019c/output/chtml/part03/sect_C.36.8.html#table_C.36.8-1) with the same hierarchy without a conditional statement | `postprocess_merge_duplicate_nodes.py` |
| \*[Table F.3-3](http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_F.3.2.2.html#table_F.3-3) contains a "Record Selection Keys" attribute with an invalid tag ("See F.5") | `preprocess_modules_with_attributes.py` |
Expand Down
2 changes: 1 addition & 1 deletion dicom_standard/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ dist/module_to_attributes.json: tmp/module_to_attributes_no_duplicates.json dist
dist/macro_to_attributes.json: tmp/macros_attributes_updated_references.json
$(PYTHONPATH_PREFIX) python3 postprocess_merge_duplicate_nodes.py $< > $@

dist/references.json: tmp/modules_attributes_partial_references.json tmp/raw_section_tables.json
dist/references.json: tmp/modules_attributes_partial_references.json tmp/macros_attributes_partial_references.json tmp/raw_section_tables.json
$(PYTHONPATH_PREFIX) python3 postprocess_save_references.py $^ > $@


Expand Down
15 changes: 14 additions & 1 deletion dicom_standard/extract_modules_macros_with_attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
MACRO_TABLE_SUFFIX = re.compile("(.*Macro Attributes$)|(.*Macro Attributes Description$)")
COLUMN_TITLES_WITH_TYPE = ['name', 'tag', 'type', 'description']
COLUMN_TITLES_NO_TYPE = ['name', 'tag', 'description']
# Matches URLs that contain a known nonstandard section ID. Three capture groups:
# (1) everything before the nonstandard ID, (2) the nonstandard ID itself,
# (3) ".html" plus any trailing fragment. The dot in "\.html" is escaped so the
# literal file extension is matched, not any character followed by "html"
# (URLs here also contain "chtml" path segments, so an unescaped dot is fragile).
VALID_URL_PATTERN = re.compile(r'(.*)(' + '|'.join(pl.NONSTANDARD_SECTION_IDS) + r').*(\.html.*)')


def get_module_macro_tables(standard: BeautifulSoup) -> Tuple[List[TableListType], List[Tag]]:
Expand All @@ -45,6 +46,18 @@ def module_table_to_dict(table: StringifiedTableListType) -> List[TableDictType]
return table_to_dict(table, column_titles)


def fix_nonstandard_section_links(link: str) -> str:
    '''
    Standard workaround: certain subsections of the DICOM standard live inside
    their base section's HTML page instead of having a page of their own, so a
    URL generated from the full subsection ID is invalid.
    Ex: C.7.16.2.5.1 is located within C.7.16.2.5, but "sect_C.7.16.2.5.1.html" is invalid.

    VALID_URL_PATTERN has three capturing groups: everything before a
    nonstandard section ID, the nonstandard ID itself, and ".html" plus any
    trailing fragment. The substitution keeps those three groups, dropping the
    extraneous subsection numbers between the ID and ".html" that produce
    invalid links. For example:
    "http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.7.6.16.2.3.html#table_C.7.6.16-4"
    becomes
    "http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.7.6.16.2.html#table_C.7.6.16-4"

    Links containing no nonstandard section ID are returned unchanged
    (re.sub with no match is a no-op).
    '''
    return VALID_URL_PATTERN.sub(r'\1\2\3', link)


def get_table_with_metadata(table_with_tdiv: Tuple[List[TableDictType], Tag]) -> MetadataTableType:
table, tdiv = table_with_tdiv
table_name = pr.table_name(tdiv)
Expand All @@ -56,7 +69,7 @@ def get_table_with_metadata(table_with_tdiv: Tuple[List[TableDictType], Tag]) ->
'attributes': table,
'id': pl.create_slug(clean_name),
'description': str(clean_table_description(table_description, is_macro)),
'linkToStandard': get_short_standard_link(tdiv),
'linkToStandard': fix_nonstandard_section_links(get_short_standard_link(tdiv)),
'isMacro': is_macro,
}

Expand Down
2 changes: 1 addition & 1 deletion dicom_standard/parse_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def get_standard_page(sect_id: str) -> str:
'''
try:
# TODO: Remove if block (and constant) once links for subsections exist (Issue #10 and related sections)
invalid_sect_id_match = re.match(ID_PATTERN, sect_id)
invalid_sect_id_match = ID_PATTERN.match(sect_id)
if invalid_sect_id_match:
# Standard workaround: For some reason, certain subsections are located within the base section, so return only the valid part
# Ex: C.7.16.2.5.1 should be within C.7.16.2.5, but "sect_C.7.16.2.5.html" is invalid
Expand Down
8 changes: 4 additions & 4 deletions dicom_standard/postprocess_mark_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def get_valid_reference_anchors(parsed_html):
return [a for a in anchor_tags if not re.match(IGNORED_REFS_RE, a['href'])]


def record_references_inside_pairs(module_attr_pairs):
updated_pairs = [record_reference_in_pair(pair) for pair in module_attr_pairs]
def record_references_inside_pairs(pairs):
    """Run record_reference_in_pair over every pair, returning the updated list."""
    return [record_reference_in_pair(pair) for pair in pairs]


Expand Down Expand Up @@ -51,6 +51,6 @@ def mark_as_recorded(anchor):


if __name__ == '__main__':
module_attr_pairs = pl.read_json_data(sys.argv[1])
updated_pairs = record_references_inside_pairs(module_attr_pairs)
pairs = pl.read_json_data(sys.argv[1])
updated_pairs = record_references_inside_pairs(pairs)
pl.write_pretty_json(updated_pairs)
9 changes: 6 additions & 3 deletions dicom_standard/postprocess_save_references.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
'''
Save reference HTML into a separate JSON file.
'''
from typing import cast, List
import sys
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from dicom_standard import parse_lib as pl
from dicom_standard.macro_utils import MetadataTableType


def find_reference_html_in_sections(pairs, section_listing):
Expand Down Expand Up @@ -60,7 +62,8 @@ def get_location_from_ref(ref):


if __name__ == '__main__':
module_attr_pairs = pl.read_json_data(sys.argv[1])
section_listing = pl.read_json_data(sys.argv[2])
references = find_reference_html_in_sections(module_attr_pairs, section_listing)
module_attr_pairs = cast(List[MetadataTableType], pl.read_json_data(sys.argv[1]))
macro_attr_pairs = cast(List[MetadataTableType], pl.read_json_data(sys.argv[2]))
section_listing = pl.read_json_data(sys.argv[3])
references = find_reference_html_in_sections(module_attr_pairs + macro_attr_pairs, section_listing)
pl.write_pretty_json(references)
19 changes: 8 additions & 11 deletions dicom_standard/postprocess_update_reference_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,17 @@
from dicom_standard import parse_lib as pl


def update_sourceurls(module_attr_pairs, references):
for pair in module_attr_pairs:
def update_sourceurls(pairs, references):
ref_fragments = {url.split('#')[-1]: url for url in references.keys()}
for pair in pairs:
for ref in pair['externalReferences']:
for source_url in references.keys():
reference_fragment = source_url.split('#')[-1]
pair_fragment = ref['sourceUrl'].split('#')[-1]
if pair_fragment == reference_fragment:
ref['sourceUrl'] = source_url
break
return module_attr_pairs
pair_fragment = ref['sourceUrl'].split('#')[-1]
ref['sourceUrl'] = ref_fragments[pair_fragment]
return pairs


if __name__ == '__main__':
module_attr_pairs = pl.read_json_data(sys.argv[1])
pairs = pl.read_json_data(sys.argv[1])
references = pl.read_json_data(sys.argv[2])
updated_pairs = update_sourceurls(module_attr_pairs, references)
updated_pairs = update_sourceurls(pairs, references)
pl.write_pretty_json(updated_pairs)

0 comments on commit d4b4b33

Please sign in to comment.