-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathpostprocess_mark_references.py
56 lines (39 loc) · 1.63 KB
/
postprocess_mark_references.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
'''
Find and mark references to external sections in attribute descriptions.
Each reference is keyed by its source URL.
'''
import sys
import re
from bs4 import BeautifulSoup
from dicom_standard import parse_lib as pl
# Hrefs that must NOT be recorded as external references: any href containing
# "ftp", "http", "part05", "chapter", "PS3", "DCM" or "glossentry".
# Every alternative is wrapped in ".*" because the pattern is applied with
# re.match(), which only matches from the start of the string.
IGNORED_REFS_RE = re.compile(r'(.*ftp.*)|(.*http.*)|(.*part05.*)|(.*chapter.*)|(.*PS3.*)|(.*DCM.*)|(.*glossentry.*)')
def get_valid_reference_anchors(parsed_html):
    """Return the anchors of ``parsed_html`` that are not ignored references.

    Only ``<a>`` tags carrying an ``href`` attribute are considered. An anchor
    is dropped when its href matches ``IGNORED_REFS_RE``; the remaining
    anchors are returned as a list, in document order.
    """
    kept = []
    for anchor in parsed_html.find_all('a', href=True):
        if IGNORED_REFS_RE.match(anchor['href']):
            # Ignored target (e.g. an absolute URL): not a recordable reference.
            continue
        kept.append(anchor)
    return kept
def record_references_inside_pairs(pairs):
    """Run ``record_reference_in_pair`` on every pair and return the results as a list."""
    return list(map(record_reference_in_pair, pairs))
def record_reference_in_pair(pair):
    """Record the references found in one pair's description.

    The HTML in ``pair['description']`` is parsed, every valid reference
    anchor is summarised under ``pair['externalReferences']`` (an empty list
    when there are none), and those anchors are rewritten by
    ``mark_as_recorded``. The description is then re-serialised and passed
    through ``finalize_descriptions``.

    ``pair`` is modified in place and also returned.
    """
    soup = BeautifulSoup(pair['description'], 'html.parser')
    anchors = get_valid_reference_anchors(soup)

    # Capture href/title first: mark_as_recorded() blanks the href afterwards.
    recorded = [reference_structure_from_anchor(anchor) for anchor in anchors]
    for anchor in anchors:
        mark_as_recorded(anchor)

    pair['externalReferences'] = recorded
    pair['description'] = str(soup)
    finalize_descriptions(pair)
    return pair
def finalize_descriptions(pair):
    """Replace ``pair['description']`` with the output of ``pl.clean_html`` on it.

    The pair is updated in place; nothing is returned.
    """
    raw_description = pair['description']
    pair['description'] = pl.clean_html(raw_description)
def reference_structure_from_anchor(reference):
    """Summarise an anchor as a ``{"sourceUrl", "title"}`` dictionary.

    ``sourceUrl`` is the anchor's ``href`` (``None`` when ``reference.get``
    finds no such attribute) and ``title`` is the anchor's text content.
    """
    source_url = reference.get('href')
    title = reference.get_text()
    return {"sourceUrl": source_url, "title": title}
def mark_as_recorded(anchor):
    """Flag ``anchor`` as already recorded by turning it into an inert ``span``.

    The element is renamed to ``span`` and its ``href`` is set to the empty
    string. The anchor is modified in place; nothing is returned.
    """
    # The two assignments are independent of each other.
    anchor.name = 'span'
    anchor['href'] = ''
if __name__ == '__main__':
    # Script entry point: load the pairs from the JSON file named by the first
    # command-line argument, record their references, and hand the result to
    # pl.write_pretty_json.
    input_path = sys.argv[1]
    pl.write_pretty_json(record_references_inside_pairs(pl.read_json_data(input_path)))