-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathtable_utils.py
189 lines (141 loc) · 6.96 KB
/
table_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
'''
Functions for low-level manipulation of standard tables,
represented by a list-of-lists.
'''
from typing import Any, Callable, Dict, List, Optional, Tuple
from copy import copy
from bs4 import BeautifulSoup, Tag
from dicom_standard import parse_lib as pl
from dicom_standard import parse_relations as pr
TableListType = List[List[Tag]]
StringifiedTableListType = List[List[str]]
TableDictType = Dict[str, str]
TableToDictFunc = Callable[[StringifiedTableListType], List[TableDictType]]
GetTableFunc = Callable[[Tuple[List[TableDictType], Tag]], Dict[str, Any]]
def table_to_list(table_div: Tag) -> List[List[str]]:
return [[cell.text.strip() for cell in row.find_all('td')]
for row in pr.table_rows(table_div)]
def get_table_rows_from_ids(standard: BeautifulSoup, table_ids: List[str], col_titles: List[str]) -> List[TableDictType]:
table_dict_list: List[TableDictType] = []
all_tables = standard.find_all('div', class_='table')
for table_id in table_ids:
html_table = pl.find_tdiv_by_id(all_tables, table_id)
table_list = table_to_list(html_table)
table_dict = table_to_dict(table_list, col_titles)
table_dict_list.extend(table_dict)
return table_dict_list
def get_chapter_tables(standard: BeautifulSoup, chapter_id: str, validity_func: Callable[[Tag], bool]) -> Tuple[TableListType, List[Tag]]:
chapter_table_divs = pl.all_tdivs_in_chapter(standard, chapter_id)
filtered_table_divs = list(filter(validity_func, chapter_table_divs))
table_lists = list(map(tdiv_to_table_list, filtered_table_divs))
return (table_lists, filtered_table_divs)
def tables_to_json(
tables: List[TableListType],
tdivs: List[Tag],
table_to_dict_func: TableToDictFunc,
get_table_with_metadata: GetTableFunc
) -> List[Dict[str, Any]]:
expanded_tables = map(expand_spans, tables)
stringified_tables = map(stringify_table, expanded_tables)
table_dicts = map(table_to_dict_func, stringified_tables)
return list(map(get_table_with_metadata, zip(table_dicts, tdivs)))
def get_short_standard_link(tdiv: Tag) -> str:
return pl.SHORT_DICOM_URL_PREFIX + pl.table_parent_page(tdiv) + '.html#' + pr.table_id(tdiv)
def get_table_description(tdiv: Tag) -> Optional[Tag]:
section = tdiv.parent.parent
# Some descriptions are children of 'h3' tags while others are children of 'h5' tags
section_header = section.find(['h3', 'h5'], class_='title')
assert section_header is not None, 'Section header not found.'
return section_header.parent.parent.parent.parent.p
def table_to_dict(table: TableListType, row_names: List[str], omit_empty: bool = False) -> List[TableDictType]:
if omit_empty:
return [dict((k, v) for k, v in zip(row_names, row) if v) for row in table]
return [dict(zip(row_names, row)) for row in table]
def stringify_table(table: TableListType) -> List[List[str]]:
return [[str(cell) for cell in row] for row in table]
def tdiv_to_table_list(table_div: Tag) -> TableListType:
rows = pr.table_rows(table_div)
table_cells = [row.find_all('td') for row in rows if row.find_all('td')]
# We must also include `th` elements during table parsing to compensate
# for HTML errors in the DICOM standard.
table_headers = [row.find_all('th') for row in rows if row.find_all('th')]
return table_cells + table_headers
def expand_spans(table: TableListType) -> TableListType:
rowwise_expanded_table = expand_rows(table)
fully_expanded_table = list(map(expand_columns_in_row, rowwise_expanded_table))
return fully_expanded_table
def expand_rows(table: TableListType) -> TableListType:
'''
We can't perform a more graceful iteration through the table
(i.e. a map or comprehension) because information must be
communicated between each row (the rowspan information).
'''
extended_table = []
row_expansion: List[Tuple[Tag, int]] = []
for row in table:
expanded_row, row_expansion = expand_rowspans(row, row_expansion)
extended_table.append(expanded_row)
return extended_table
def expand_rowspans(row: List[Tag], row_expansion: List[Tuple[Tag, int]]) -> Tuple[List[Tag], List[Tuple[Tag, int]]]:
updated_row = apply_rowspans_from_prev_row(row, row_expansion)
row_expansion = update_row_expansion_counter(updated_row, row_expansion)
return updated_row, row_expansion
def apply_rowspans_from_prev_row(row: List[Tag], row_expansion: List[Tuple[Tag, int]]) -> List[Tag]:
updated_row = row
for cell, cell_idx in row_expansion:
cell = decrement_rowspan_counter(cell)
updated_row = add_row_cell(updated_row, cell, cell_idx)
return updated_row
def add_row_cell(row: List[Tag], cell: Tag, cell_idx: int) -> List[Tag]:
updated_row = slide_down(cell_idx, row)
updated_row[cell_idx] = cell
return updated_row
def slide_down(start_idx: int, row: List[Tag], num_slides: int = 1) -> List[Tag]:
'''
Move elements of array `row` down by `num_slides` number of spaces,
starting at index `start_idx`. Fills the spaces with None.
'''
try:
sliding_rows = row[start_idx:len(row)]
new_row = row[0:len(row) - len(sliding_rows)]
for i in range(num_slides):
new_row.append(None)
new_row.extend(sliding_rows)
return new_row
except IndexError:
raise ValueError('Cell spans beyond table!')
def decrement_rowspan_counter(cell: Tag) -> Tag:
if int(cell['rowspan']) >= 2:
cell['rowspan'] = int(cell['rowspan']) - 1
return cell
def clear_rowspan_counter(cell: Tag) -> None:
cell['rowspan'] = 1
def update_row_expansion_counter(row: List[Tag], row_expansion: List[Tuple[Tag, int]]) -> List[Tuple[Tag, int]]:
row_expansion = remove_completed_rowspans(row_expansion)
for idx, cell in enumerate(row):
if is_new_rowspan_cell(cell, idx, row_expansion):
row_expansion.append((copy(cell), idx))
clear_rowspan_counter(cell)
return row_expansion
def is_new_rowspan_cell(cell: Tag, idx: int, row_expansion: List[Tuple[Tag, int]]) -> bool:
is_not_recorded = (cell, idx) not in row_expansion
return has_rowspans_to_expand(cell) and is_not_recorded
def remove_completed_rowspans(row_expansion: List[Tuple[Tag, int]]) -> List[Tuple[Tag, int]]:
return [(cell, idx) for (cell, idx) in row_expansion
if has_rowspans_to_expand(cell)]
def has_rowspans_to_expand(cell: Tag) -> bool:
rowspan_attr = cell.get('rowspan')
return int(cell.get('rowspan')) > 1 if rowspan_attr is not None else False
def expand_columns_in_row(row: List[Tag]) -> List[Tag]:
expanded_cells = map(expand_cell_colspan, row)
return [cell for span_of_cells in expanded_cells
for cell in span_of_cells]
def expand_cell_colspan(cell: Tag) -> Tag:
colspan_count = cell.get('colspan')
expanded_cell = [cell]
if colspan_count is not None:
colspans = int(colspan_count)
cell['colspan'] = 1
for i in range(colspans - 1):
expanded_cell.append(None)
return expanded_cell