-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathwiki.py
343 lines (281 loc) · 12.1 KB
/
wiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
from genericpath import isfile
from os import path, environ
from glob import glob
from posixpath import basename
from yaml import dump
from typing import Dict, List
import logging
import docker
import pandas as pd
from scripts.utils import *
from scripts.tree import Node, load_spec
from scripts.docker_adapter import run_simple_command, __docker_client
from scripts.git_helper import GitHelper # has it been used?
from scripts.fs import MANIFEST_PATH
logger = get_logger()
def run_outputs(node: Node, all_info_cmds: Dict) -> List[Dict]:
    """Create a container & Run info commands of each image and return outputs.

    Wrapper around run_simple_command()

    Args:
        node (Node): image node; node.full_image_name is the image that is run,
            node.info_cmds lists the command keys to execute inside it.
        all_info_cmds (Dict): all available info commands generated by load_spec()['all_info_cmds']

    Returns:
        List[Dict]: each Dict has description (str), output (str) of one image.
            Empty list if the container could not be created.
    """
    # Create docker container
    logger.info(f"Creating container for image {node.full_image_name} ...")
    outputs = []
    # BUGFIX: must be pre-bound; the finally block references it even when
    # containers.run() raises and the except branch returns early.
    container = None
    try:
        container = __docker_client.containers.run(
            image=node.full_image_name, command='/bin/bash', tty=True, detach=True,
        )  # If detach is True, a Container object is returned instead.
    except Exception as e:
        logger.error(e)
        logger.error(
            f"*** docker container failed to run on image {node.full_image_name} ***")
        return []
    else:
        logger.info(f"Container {container.name} created")
        for key in node.info_cmds:
            if key not in all_info_cmds.keys():
                logger.error(f"command definition of {key} in {node.image_name} not found in spec.yml; skip")
                continue
            logging.debug(f"Running command {all_info_cmds[key]['command']}")
            cmd_output, cmd_success = run_simple_command(
                container,
                node,
                all_info_cmds[key]['command']
            )
            description = all_info_cmds[key]['description']
            outputs.append(dict(description=description, output=cmd_output))
    finally:
        # Always clean up the container (if one was created) and the client,
        # regardless of whether command execution succeeded.
        if container:
            logger.info(f"Removing container {container.name} ...")
            container.remove(force=True)
            logger.info(f"Container {container.name} removed")
        __docker_client.close()
    return outputs
def get_layers(image: docker.models.images.Image):
    """Helper function of get_layers_md_table

    Builds a per-layer DataFrame from `image.history()` with cleaned commands,
    human-readable sizes, cumulative sizes, and per-layer build durations.

    Args:
        image (docker.models.images.Image): the actual image object, not image name

    Returns:
        pandas.Dataframe: to be further processed.
    """
    df = pd.DataFrame(image.history()).convert_dtypes()
    # Clean up the raw CreatedBy command: strip Dockerfile plumbing and wrap
    # in backticks so it renders as inline code in markdown.
    df['CMD'] = df['CreatedBy']
    df['CMD'] = df['CMD'].str.replace('|', '', regex=False)
    df['CMD'] = df['CMD'].str.replace('/bin/bash -o pipefail -c', '', regex=False)
    df['CMD'] = df['CMD'].str.replace('#(nop)', '', regex=False)
    df['CMD'] = '`' + df['CMD'].str.strip() + '`'
    # history() reports Created as a unix timestamp (seconds).
    df['createdAt'] = pd.to_datetime(df['Created'], unit='s')
    # Human-readable layer size; sizes <= 100 bytes are shown as raw bytes.
    df['hSize'] = df['Size'].apply(
        lambda x: bytes_to_hstring(x) if x > 100 else f'{x} B'
    )
    # history() lists newest layer first, so reverse before cumsum to get the
    # cumulative size up to (and including) each layer, then reverse back.
    df['cumSize'] = df['Size'][::-1].cumsum()[::-1]
    df['hcumSize'] = df['cumSize'].apply(
        lambda x: bytes_to_hstring(x) if x > 100 else f'{x} B'
    )
    # Keep only rows at or below the newest tagged layer (drops untagged
    # parent-image layers above the last tag), then flip to oldest-first.
    df_ordered = (
        df.loc[df['Tags'].notna()[::-1].cumsum()[::-1] > 0, :]
        .iloc[::-1, :]
        .reset_index(drop=True)
    )
    # Per-layer build time = gap between consecutive createdAt values.
    df_ordered['elapsed'] = df_ordered['createdAt'].diff()
    # NOTE(review): diff() leaves NaT at index 0, but this zeroes index 1 —
    # looks like it may be an off-by-one; confirm intended row before changing.
    df_ordered.loc[1, 'elapsed'] = pd.Timedelta(0)
    df_ordered['elapsed'] = df_ordered['elapsed'].apply(strfdelta)
    return df_ordered
def get_layers_md_table(node: Node, image: docker.models.images.Image) -> str:
    """Given a node, generate a table in markdown format

    Args:
        node (Node): image node whose image_name/image_tag identify the image.
        image (docker.models.images.Image): the actual image object, not image name

    Returns:
        str: DataFrame in Markdown-friendly string format.
    """
    # Use the module logger instead of bare print() for consistency with the
    # rest of this module (output was previously going straight to stdout).
    logger.debug("Trying to docker get: " + node.image_name + ':' + node.image_tag)
    node.print_info()
    layers = get_layers(image)[
        ['createdAt', 'CMD', 'hSize', 'hcumSize', 'elapsed', 'Tags']
    ]  # panda Dataframe: select columns to keep
    # Drop layers missing any selected field so the markdown table stays clean.
    layers.dropna(inplace=True)
    return layers.to_markdown()
def write_report(
    node: Node,
    image: docker.models.images.Image,
    all_info_cmds:Dict,
    output_dir: str = MANIFEST_PATH
):
    """Call run_outputs(), then format and store the outputs to <image_fullname>.md

    Wrapper around run_outputs() and get_layers_md_table()

    Note:
        There will be 2 copies.
        One in wiki/, where it waits to be commited (if action in main) and made public.
        One in manifests/, so developers can look at it after downloading the build-artifacts.

    Args:
        node (Node): image node whose report is generated.
        image (docker.models.images.Image): the actual image object, not image name
        all_info_cmds (Dict): all available info commands generated by load_spec()['all_info_cmds']
        output_dir: (str, optional). Default to 'manifests'

    Returns:
        None
    """
    expandable_head = """<details>\n<summary>Details</summary>\n"""
    expandable_foot = """</details>\n"""
    # Run the info commands first, then lead the report with the layer table.
    command_outputs = run_outputs(node, all_info_cmds)
    sections = [get_layers_md_table(node, image)]
    for entry in command_outputs:
        # Long outputs (> 30 lines) are collapsed behind a <details> tag.
        is_long = entry['output'].count('\n') > 30
        opener = f"{expandable_head}\n" if is_long else ""
        closer = f"{expandable_foot}\n" if is_long else ""
        sections.append(
            "\n## " + entry['description'] + "\n"
            + opener
            + "```\n" + entry['output'] + "\n```\n"
            + closer
        )
    document = '\n'.join(sections).strip()
    manifest_fn = fulltag2fn(node.full_image_name)
    # Write both copies: wiki/ (to be published) and manifests/ (build artifact).
    targets = (
        path.join('wiki', f"{manifest_fn}.md"),
        path.join(output_dir, f"{manifest_fn}.md"),
    )
    for target in targets:
        with open(target, 'w') as f:
            f.write(document)
    logger.info(f"*** Individual wiki page {manifest_fn}.md successfully written.")
def update_Home() -> bool:
    """Update Home.md (the page on https://github.com/ucsd-ets/datahub-docker-stack/wiki)
    by adding a (Commit, Image, Manifest) cell to the table.

    It also creates new manifest pages for the stable images, which are copies of old manifests.
    It will only update the (local) Home.md in wiki/ in the workflow cache.
    A separate action (see .github/workflows/main.yml, Push Wiki to Github) will make it
    public.

    Reads image names from the IMAGES_TAGGED / IMAGES_ORIGINAL workflow variables
    (no arguments are taken).

    Returns:
        bool: success/failure
    """
    try:
        # 1st column: commit link [git_short_hash](LINK)
        repo_url = "https://github.com/ucsd-ets/datahub-docker-stack"
        git_short_hash = GitHelper.commit_hash_tag_shortened()
        cell_commit = url2mdlink(repo_url + '/commit/' + git_short_hash, f"`{git_short_hash}`")
        # 2nd col: Image
        # each cell_img is like ghcr.io/ucsd-ets/datahub-base-notebook:2023.1-c11a915
        stable_full_names = read_var('IMAGES_TAGGED')
        cell_images = list2cell([f"`{image}`" for image in stable_full_names])
        # also read orignal names to copy wiki pages later
        orig_full_names = read_var('IMAGES_ORIGINAL')
        # 3rd column: image wiki page link ["LINK"](LINK)
        # '.' in "ghcr.io" breaks the wiki page URL, so swap it for '-'
        manifests_links = [wiki_doc2link(fullname=image.replace("ghcr.io", "ghcr-io")) for image in stable_full_names]
        cell_manifests = list2cell(manifests_links)
    except Exception as e:
        logger.error(f"Error when loading information to update Home.md, {e}")
        return False

    # group 3 columns together
    latest_row = (cell_commit, cell_images, cell_manifests)

    # Create new wiki pages for stable images, but same content
    try:
        for stable_name, orig_name in zip(stable_full_names, orig_full_names):
            stable_fn = fulltag2fn(stable_name)
            orig_fn = fulltag2fn(orig_name)
            with open(path.join('wiki', f'{orig_fn}.md'), 'r') as f:
                doc_str = f.read()
            with open(path.join('wiki', f'{stable_fn}.md'), 'w') as f:
                f.write(doc_str)
    # BUGFIX: was `except AssertionError`, but this block contains no asserts;
    # file-I/O errors (FileNotFoundError etc.) escaped uncaught instead of
    # being logged and reported as failure.
    except Exception as e:
        logger.error(f"Error when copying wiki page of each image: {e}")
        return False

    # Read old content, Update, Write back
    try:
        doc_str = read_Home()
        # avoid duplicate entry: <year_quarter-stable> tag
        _, stable_tag = stable_full_names[0].split(':', 1)   # stable_tag = 2022.2-stable
        stablePrefix, _ = stable_tag.split('-', 1)           # 2022.2
        # we need the ability to overwrite stable tags if we want to
        # todo: figure out how to delete old MD if duplicate detected
        # 2nd arg of insert_row() takes in List[Tuple], each of which is a new 'latest_row'
        latest_doc = insert_row(doc_str, [latest_row])
        with open(path.join('wiki', 'Home.md'), 'w') as f:
            f.write(latest_doc)
        # such that we can look at new Home page even with dry_run
        with open(path.join('artifacts', 'Home.md'), 'w') as f:
            f.write(latest_doc)
    except Exception as e:
        logger.error(f"Error when updating Home.md, {e}")
        return False
    return True
def update_Stable() -> bool:
    """Read information from IMAGES_TAGGED and IMAGES_ORIGINAL, and update
    Stable_Tag.md accordingly.

    Note:
        IMAGES_GLOBAL_STABLE should store images like ghcr.io/ucsd-ets/datahub-base-notebook:stable;
        IMAGES_ORIGINAL_STABLE should store images like ghcr.io/ucsd-ets/datahub-base-notebook:2022.2-stable;

    Returns:
        bool: success/failure
    """
    # Load data
    try:
        # 1st col: Image
        # each cell_img is like ghcr.io/ucsd-ets/datascience-notebook:2023.1-stable
        stable_full_names = read_var('IMAGES_GLOBAL_STABLE')
        cell_stable = list2cell([f"`{image}`" for image in stable_full_names])
        # 2nd col: Based On
        # each orig_img is like ghcr.io/ucsd-ets/datascience-notebook:2023.1-stable
        orig_full_names = read_var('IMAGES_ORIGINAL_STABLE')
        cell_orig = list2cell([f"`{image}`" for image in orig_full_names])
        assert len(stable_full_names) == len(orig_full_names), \
            "IMAGES_GLOBAL_STABLE and IMAGES_ORIGINAL_STABLE mismatched."
        # 3rd col: Manifest
        # NOTE: the actual page will be created later, see the next try-except.
        manifests_links = [wiki_doc2link(fullname=image) for image in stable_full_names]
        cell_manifests = list2cell(manifests_links)
    # Broadened from AssertionError so failures in read_var()/helpers are also
    # logged and reported as failure instead of escaping uncaught.
    except Exception as e:
        logger.error(f"Error when loading data for Stable_Tag.md: {e}")
        return False

    # Create new wiki pages for stable images, but same content
    try:
        for stable_name, orig_name in zip(stable_full_names, orig_full_names):
            stable_fn = fulltag2fn(stable_name)
            orig_fn = fulltag2fn(orig_name)
            with open(path.join('wiki', f'{orig_fn}.md'), 'r') as f:
                doc_str = f.read()
            with open(path.join('wiki', f'{stable_fn}.md'), 'w') as f:
                f.write(doc_str)
    # BUGFIX: was `except AssertionError`, but this block contains no asserts;
    # file-I/O errors escaped uncaught instead of returning False.
    except Exception as e:
        logger.error(f"Error when copying wiki page of each image: {e}")
        return False

    # Reconstruct Stable_Tag.md
    header = ['| Global-Stable Image | Based On | Manifest |']
    divider = ['| :- | :- | :- |']
    content = ['|'.join([
        "",   # such that we have start and ending '|'
        cell_stable,
        cell_orig,
        cell_manifests,
        ""
    ])]
    doc = '\n'.join(header + divider + content)
    # Honor the bool contract: report write failures instead of raising.
    try:
        with open(path.join('wiki', 'Stable_Tag.md'), 'w') as f:
            f.write(doc)
        # such that we can look at new Stable page even with dry_run
        with open(path.join('artifacts', 'Stable_Tag.md'), 'w') as f:
            f.write(doc)
    except OSError as e:
        logger.error(f"Error when writing Stable_Tag.md: {e}")
        return False
    return True