-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathwiki.py
343 lines (281 loc) · 12.1 KB
/
wiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
from genericpath import isfile
from os import path, environ
from glob import glob
from posixpath import basename
from yaml import dump
from typing import Dict, List
import logging
import docker
import pandas as pd
from scripts.utils import *
from scripts.tree import Node, load_spec
from scripts.docker_adapter import run_simple_command, __docker_client
from scripts.git_helper import GitHelper # has it been used?
from scripts.fs import MANIFEST_PATH
logger = get_logger()
def run_outputs(node: Node, all_info_cmds: Dict) -> List[Dict]:
    """Create a container & Run info commands of each image and return outputs.

    Wrapper around run_simple_command()

    Args:
        node (Node): image node; node.full_image_name is the image that is run,
            node.info_cmds lists the command keys to execute inside it.
        all_info_cmds (Dict): all available info commands generated by load_spec()['all_info_cmds']

    Returns:
        List[Dict]: each Dict has description (str), output (str) of one image.
            Empty list if the container could not be created.
    """
    # Create docker container
    logger.info(f"Creating container for image {node.full_image_name} ...")
    outputs = []
    # BUGFIX: must be pre-bound; the finally block references it even when
    # containers.run() raises and the except branch returns early.
    container = None
    try:
        container = __docker_client.containers.run(
            image=node.full_image_name, command='/bin/bash', tty=True, detach=True,
        )  # If detach is True, a Container object is returned instead.
    except Exception as e:
        logger.error(e)
        logger.error(
            f"*** docker container failed to run on image {node.full_image_name} ***")
        return []
    else:
        logger.info(f"Container {container.name} created")
        for key in node.info_cmds:
            if key not in all_info_cmds.keys():
                logger.error(f"command definition of {key} in {node.image_name} not found in spec.yml; skip")
                continue
            logging.debug(f"Running command {all_info_cmds[key]['command']}")
            cmd_output, cmd_success = run_simple_command(
                container,
                node,
                all_info_cmds[key]['command']
            )
            description = all_info_cmds[key]['description']
            outputs.append(dict(description=description, output=cmd_output))
    finally:
        # Always clean up the container (if one was created) and the client,
        # regardless of whether command execution succeeded.
        if container:
            logger.info(f"Removing container {container.name} ...")
            container.remove(force=True)
            logger.info(f"Container {container.name} removed")
        __docker_client.close()
    return outputs
def get_layers(image: docker.models.images.Image):
    """Helper function of get_layers_md_table

    Builds a per-layer DataFrame from `image.history()` with cleaned commands,
    human-readable sizes, cumulative sizes, and per-layer build durations.

    Args:
        image (docker.models.images.Image): the actual image object, not image name

    Returns:
        pandas.Dataframe: to be further processed.
    """
    df = pd.DataFrame(image.history()).convert_dtypes()
    # Clean up the raw CreatedBy command: strip Dockerfile plumbing and wrap
    # in backticks so it renders as inline code in markdown.
    df['CMD'] = df['CreatedBy']
    df['CMD'] = df['CMD'].str.replace('|', '', regex=False)
    df['CMD'] = df['CMD'].str.replace('/bin/bash -o pipefail -c', '', regex=False)
    df['CMD'] = df['CMD'].str.replace('#(nop)', '', regex=False)
    df['CMD'] = '`' + df['CMD'].str.strip() + '`'
    # history() reports Created as a unix timestamp (seconds).
    df['createdAt'] = pd.to_datetime(df['Created'], unit='s')
    # Human-readable layer size; sizes <= 100 bytes are shown as raw bytes.
    df['hSize'] = df['Size'].apply(
        lambda x: bytes_to_hstring(x) if x > 100 else f'{x} B'
    )
    # history() lists newest layer first, so reverse before cumsum to get the
    # cumulative size up to (and including) each layer, then reverse back.
    df['cumSize'] = df['Size'][::-1].cumsum()[::-1]
    df['hcumSize'] = df['cumSize'].apply(
        lambda x: bytes_to_hstring(x) if x > 100 else f'{x} B'
    )
    # Keep only rows at or below the newest tagged layer (drops untagged
    # parent-image layers above the last tag), then flip to oldest-first.
    df_ordered = (
        df.loc[df['Tags'].notna()[::-1].cumsum()[::-1] > 0, :]
        .iloc[::-1, :]
        .reset_index(drop=True)
    )
    # Per-layer build time = gap between consecutive createdAt values.
    df_ordered['elapsed'] = df_ordered['createdAt'].diff()
    # NOTE(review): diff() leaves NaT at index 0, but this zeroes index 1 —
    # looks like it may be an off-by-one; confirm intended row before changing.
    df_ordered.loc[1, 'elapsed'] = pd.Timedelta(0)
    df_ordered['elapsed'] = df_ordered['elapsed'].apply(strfdelta)
    return df_ordered
def get_layers_md_table(node: Node, image: docker.models.images.Image) -> str:
    """Given a node, generate a table in markdown format

    Args:
        node (Node): image node whose image_name/image_tag identify the image.
        image (docker.models.images.Image): the actual image object, not image name

    Returns:
        str: DataFrame in Markdown-friendly string format.
    """
    # Use the module logger instead of bare print() for consistency with the
    # rest of this module (output was previously going straight to stdout).
    logger.debug("Trying to docker get: " + node.image_name + ':' + node.image_tag)
    node.print_info()
    layers = get_layers(image)[
        ['createdAt', 'CMD', 'hSize', 'hcumSize', 'elapsed', 'Tags']
    ]  # panda Dataframe: select columns to keep
    # Drop layers missing any selected field so the markdown table stays clean.
    layers.dropna(inplace=True)
    return layers.to_markdown()
def write_report(
    node: Node,
    image: docker.models.images.Image,
    all_info_cmds:Dict,
    output_dir: str = MANIFEST_PATH
):
    """Call run_outputs(), then format and store the outputs to <image_fullname>.md

    Wrapper around run_outputs() and get_layers_md_table()

    Note:
        There will be 2 copies.
        One in wiki/, where it waits to be commited (if action in main) and made public.
        One in manifests/, so developers can look at it after downloading the build-artifacts.

    Args:
        node (Node): image node whose report is generated.
        image (docker.models.images.Image): the actual image object, not image name
        all_info_cmds (Dict): all available info commands generated by load_spec()['all_info_cmds']
        output_dir: (str, optional). Default to 'manifests'

    Returns:
        None
    """
    expandable_head = """<details>\n<summary>Details</summary>\n"""
    expandable_foot = """</details>\n"""
    # Run the info commands first, then lead the report with the layer table.
    command_outputs = run_outputs(node, all_info_cmds)
    sections = [get_layers_md_table(node, image)]
    for entry in command_outputs:
        # Long outputs (> 30 lines) are collapsed behind a <details> tag.
        is_long = entry['output'].count('\n') > 30
        opener = f"{expandable_head}\n" if is_long else ""
        closer = f"{expandable_foot}\n" if is_long else ""
        sections.append(
            "\n## " + entry['description'] + "\n"
            + opener
            + "```\n" + entry['output'] + "\n```\n"
            + closer
        )
    document = '\n'.join(sections).strip()
    manifest_fn = fulltag2fn(node.full_image_name)
    # Write both copies: wiki/ (to be published) and manifests/ (build artifact).
    targets = (
        path.join('wiki', f"{manifest_fn}.md"),
        path.join(output_dir, f"{manifest_fn}.md"),
    )
    for target in targets:
        with open(target, 'w') as f:
            f.write(document)
    logger.info(f"*** Individual wiki page {manifest_fn}.md successfully written.")
def update_Home() -> bool:
    """Update Home.md (the page on https://github.com/ucsd-ets/datahub-docker-stack/wiki)
    by adding a (Commit, Image, Manifest) cell to the table.

    It also creates new manifest pages for the stable images, which are copies of old manifests.
    It will only update the (local) Home.md in wiki/ in the workflow cache.
    A separate action (see .github/workflows/main.yml, Push Wiki to Github) will make it
    public.

    Reads image names from the IMAGES_TAGGED / IMAGES_ORIGINAL workflow variables
    (no arguments are taken).

    Returns:
        bool: success/failure
    """
    try:
        # 1st column: commit link [git_short_hash](LINK)
        repo_url = "https://github.com/ucsd-ets/datahub-docker-stack"
        git_short_hash = GitHelper.commit_hash_tag_shortened()
        cell_commit = url2mdlink(repo_url + '/commit/' + git_short_hash, f"`{git_short_hash}`")
        # 2nd col: Image
        # each cell_img is like ghcr.io/ucsd-ets/datahub-base-notebook:2023.1-c11a915
        stable_full_names = read_var('IMAGES_TAGGED')
        cell_images = list2cell([f"`{image}`" for image in stable_full_names])
        # also read orignal names to copy wiki pages later
        orig_full_names = read_var('IMAGES_ORIGINAL')
        # 3rd column: image wiki page link ["LINK"](LINK)
        # '.' in "ghcr.io" breaks the wiki page URL, so swap it for '-'
        manifests_links = [wiki_doc2link(fullname=image.replace("ghcr.io", "ghcr-io")) for image in stable_full_names]
        cell_manifests = list2cell(manifests_links)
    except Exception as e:
        logger.error(f"Error when loading information to update Home.md, {e}")
        return False

    # group 3 columns together
    latest_row = (cell_commit, cell_images, cell_manifests)

    # Create new wiki pages for stable images, but same content
    try:
        for stable_name, orig_name in zip(stable_full_names, orig_full_names):
            stable_fn = fulltag2fn(stable_name)
            orig_fn = fulltag2fn(orig_name)
            with open(path.join('wiki', f'{orig_fn}.md'), 'r') as f:
                doc_str = f.read()
            with open(path.join('wiki', f'{stable_fn}.md'), 'w') as f:
                f.write(doc_str)
    # BUGFIX: was `except AssertionError`, but this block contains no asserts;
    # file-I/O errors (FileNotFoundError etc.) escaped uncaught instead of
    # being logged and reported as failure.
    except Exception as e:
        logger.error(f"Error when copying wiki page of each image: {e}")
        return False

    # Read old content, Update, Write back
    try:
        doc_str = read_Home()
        # avoid duplicate entry: <year_quarter-stable> tag
        _, stable_tag = stable_full_names[0].split(':', 1)   # stable_tag = 2022.2-stable
        stablePrefix, _ = stable_tag.split('-', 1)           # 2022.2
        # we need the ability to overwrite stable tags if we want to
        # todo: figure out how to delete old MD if duplicate detected
        # 2nd arg of insert_row() takes in List[Tuple], each of which is a new 'latest_row'
        latest_doc = insert_row(doc_str, [latest_row])
        with open(path.join('wiki', 'Home.md'), 'w') as f:
            f.write(latest_doc)
        # such that we can look at new Home page even with dry_run
        with open(path.join('artifacts', 'Home.md'), 'w') as f:
            f.write(latest_doc)
    except Exception as e:
        logger.error(f"Error when updating Home.md, {e}")
        return False
    return True
def update_Stable() -> bool:
    """Read information from IMAGES_TAGGED and IMAGES_ORIGINAL, and update
    Stable_Tag.md accordingly.

    Note:
        IMAGES_GLOBAL_STABLE should store images like ghcr.io/ucsd-ets/datahub-base-notebook:stable;
        IMAGES_ORIGINAL_STABLE should store images like ghcr.io/ucsd-ets/datahub-base-notebook:2022.2-stable;

    Returns:
        bool: success/failure
    """
    # Load data
    try:
        # 1st col: Image
        # each cell_img is like ghcr.io/ucsd-ets/datascience-notebook:2023.1-stable
        stable_full_names = read_var('IMAGES_GLOBAL_STABLE')
        cell_stable = list2cell([f"`{image}`" for image in stable_full_names])
        # 2nd col: Based On
        # each orig_img is like ghcr.io/ucsd-ets/datascience-notebook:2023.1-stable
        orig_full_names = read_var('IMAGES_ORIGINAL_STABLE')
        cell_orig = list2cell([f"`{image}`" for image in orig_full_names])
        assert len(stable_full_names) == len(orig_full_names), \
            "IMAGES_GLOBAL_STABLE and IMAGES_ORIGINAL_STABLE mismatched."
        # 3rd col: Manifest
        # NOTE: the actual page will be created later, see the next try-except.
        manifests_links = [wiki_doc2link(fullname=image) for image in stable_full_names]
        cell_manifests = list2cell(manifests_links)
    # Broadened from AssertionError so failures in read_var()/helpers are also
    # logged and reported as failure instead of escaping uncaught.
    except Exception as e:
        logger.error(f"Error when loading data for Stable_Tag.md: {e}")
        return False

    # Create new wiki pages for stable images, but same content
    try:
        for stable_name, orig_name in zip(stable_full_names, orig_full_names):
            stable_fn = fulltag2fn(stable_name)
            orig_fn = fulltag2fn(orig_name)
            with open(path.join('wiki', f'{orig_fn}.md'), 'r') as f:
                doc_str = f.read()
            with open(path.join('wiki', f'{stable_fn}.md'), 'w') as f:
                f.write(doc_str)
    # BUGFIX: was `except AssertionError`, but this block contains no asserts;
    # file-I/O errors escaped uncaught instead of returning False.
    except Exception as e:
        logger.error(f"Error when copying wiki page of each image: {e}")
        return False

    # Reconstruct Stable_Tag.md
    header = ['| Global-Stable Image | Based On | Manifest |']
    divider = ['| :- | :- | :- |']
    content = ['|'.join([
        "",   # such that we have start and ending '|'
        cell_stable,
        cell_orig,
        cell_manifests,
        ""
    ])]
    doc = '\n'.join(header + divider + content)
    # Honor the bool contract: report write failures instead of raising.
    try:
        with open(path.join('wiki', 'Stable_Tag.md'), 'w') as f:
            f.write(doc)
        # such that we can look at new Stable page even with dry_run
        with open(path.join('artifacts', 'Stable_Tag.md'), 'w') as f:
            f.write(doc)
    except OSError as e:
        logger.error(f"Error when writing Stable_Tag.md: {e}")
        return False
    return True