From 5a4989c4d86d46486199da1b5ec7f19d47fe7606 Mon Sep 17 00:00:00 2001 From: Ilija Vukotic Date: Wed, 30 Oct 2024 14:11:21 -0500 Subject: [PATCH] small fixes --- portal/jupyterlab.py | 306 ++++++++++++++++++++++++++----------------- portal/views.py | 7 +- 2 files changed, 192 insertions(+), 121 deletions(-) diff --git a/portal/jupyterlab.py b/portal/jupyterlab.py index 41ee063..43e61fa 100644 --- a/portal/jupyterlab.py +++ b/portal/jupyterlab.py @@ -1,8 +1,8 @@ -''' +""" This module supports the JupyterLab service Functionality: -=============== +=============== 1. The start_notebook_maintenance method starts a thread for removing expired notebooks 2. The deploy_notebook method lets a user deploy a notebook onto our Kubernetes cluster @@ -13,18 +13,18 @@ 7. The get_gpu_availability function lets a user know which GPU products are available for use Dependencies: -=============== +=============== 1. A portal.conf configuration file (af-portal/portal/secrets/portal.conf) 2. A kubeconfig file (either a file specified in portal.conf, or a file at the default location, ~/.kube/config) Example usage: -=============== +=============== Example #1: cd //af-portal -python +python >>> from portal import jupyterlab >>> from pprint import pprint >>> notebook = jupyterlab.get_notebook('example-notebook-1') @@ -33,7 +33,7 @@ Example #2: cd //af-portal -python +python >>> from portal import jupyterlab >>> from pprint import pprint >>> notebooks = jupyterlab.get_notebooks('my-username') @@ -42,7 +42,7 @@ Example #3: cd //af-portal -python +python >>> from portal import jupyterlab >>> from pprint import pprint >>> settings = { @@ -68,7 +68,7 @@ Example #4: cd //af-portal -python +python >>> from portal import jupyterlab >>> help(jupyterlab) >>> help(jupyterlab.deploy_notebook) @@ -94,7 +94,8 @@ python >>> from portal import jupyterlab >>> jupyterlab.list_notebooks() -''' +""" + import math import yaml import time @@ -109,34 +110,37 @@ from kubernetes.utils.quantity import parse_quantity from portal import app, logger -namespace = app.config.get('NAMESPACE') -kubeconfig = app.config.get('KUBECONFIG') +namespace = app.config.get("NAMESPACE") +kubeconfig = app.config.get("KUBECONFIG") if kubeconfig: config.load_kube_config(config_file=kubeconfig) - logger.info('Loaded kubeconfig from file %s' % kubeconfig) + logger.info("Loaded kubeconfig from file %s" % kubeconfig) else: config.load_kube_config() - logger.info('Loaded default kubeconfig file') + logger.info("Loaded default kubeconfig file") def start_notebook_maintenance(): def inner(): while True: api = client.CoreV1Api() - pods = api.list_namespaced_pod(namespace, label_selector='k8s-app=jupyterlab').items + pods = api.list_namespaced_pod( + namespace, label_selector="k8s-app=jupyterlab" + ).items for pod in pods: exp_date = get_expiration_date(pod) if exp_date and exp_date < datetime.datetime.now(datetime.timezone.utc): - logger.info('Notebook %s has expired' % pod.metadata.name) + logger.info("Notebook %s has expired" % pod.metadata.name) remove_notebook(pod.metadata.name) time.sleep(1800) + threading.Thread(target=inner).start() - logger.info('Started notebook maintenance') + logger.info("Started notebook maintenance") def deploy_notebook(**settings): - ''' + """ Deploys a Jupyter notebook on our Kubernetes cluster. 
Function parameters: @@ -145,8 +149,8 @@ def deploy_notebook(**settings): notebook_name: (string) The display name of the notebook notebook_id: (string) A unique name for the notebook image: (string) The Docker image that will be run in the pod's container - owner: (string) The username of the notebook's owner - owner_uid: (integer) The numeric ID of the notebook's owner + owner: (string) The username of the notebook's owner + owner_uid: (integer) The numeric ID of the notebook's owner globus_id: (string) The Globus ID of the notebook's owner cpu_request: (integer) The number of CPU cores to request from the k8s cluster cpu_limit: (integer) The max number of CPU cores that can be allocated to this pod @@ -156,37 +160,37 @@ def deploy_notebook(**settings): gpu_limit: (integer) The max number of GPU instances that can be allocated to this pod gpu_product: (string) Selects a GPU product based on name hours_remaining: (integer) The duration of the notebook in hours - ''' - settings['namespace'] = namespace - settings['domain_name'] = app.config['DOMAIN_NAME'] - settings['token'] = b64encode(os.urandom(32)).decode() - settings['start_script'] = '/ML_platform_tests/SetupPrivateJupyterLab.sh' - if settings['image'].count('oct_upgrade'): - settings['start_script'] = '/ML_platform_tests/SetupJupyterLab.sh' - templates = Environment(loader=FileSystemLoader('portal/templates/jupyterlab')) + """ + settings["namespace"] = namespace + settings["domain_name"] = app.config["DOMAIN_NAME"] + settings["token"] = b64encode(os.urandom(32)).decode() + settings["start_script"] = "/ML_platform_tests/SetupPrivateJupyterLab.sh" + if settings["image"].count("oct_upgrade"): + settings["start_script"] = "/ML_platform_tests/SetupJupyterLab.sh" + templates = Environment(loader=FileSystemLoader("portal/templates/jupyterlab")) api = client.CoreV1Api() # Create a pod for the notebook (the notebook runs as a container inside the pod) - template = templates.get_template('pod.yaml') + template = templates.get_template("pod.yaml") pod = yaml.safe_load(template.render(**settings)) api.create_namespaced_pod(namespace=namespace, body=pod) # Create a service for the pod - template = templates.get_template('service.yaml') + template = templates.get_template("service.yaml") service = yaml.safe_load(template.render(**settings)) api.create_namespaced_service(namespace=namespace, body=service) # Store the JupyterLab token in a secret - template = templates.get_template('secret.yaml') + template = templates.get_template("secret.yaml") secret = yaml.safe_load(template.render(**settings)) api.create_namespaced_secret(namespace=namespace, body=secret) # Create an ingress for the service (gives the notebook its own domain name and public key certificate) api = client.NetworkingV1Api() - template = templates.get_template('ingress.yaml') + template = templates.get_template("ingress.yaml") ingress = yaml.safe_load(template.render(**settings)) api.create_namespaced_ingress(namespace=namespace, body=ingress) - logger.info('Deployed notebook %s' % settings['notebook_name']) + logger.info("Deployed notebook %s" % settings["notebook_name"]) def get_notebook(name=None, pod=None, **options): - ''' + """ Looks up a notebook by name or by pod. Returns a dict. 
Function parameters: @@ -196,60 +200,89 @@ def get_notebook(name=None, pod=None, **options): pod: (object) The pod object returned by the kubernetes client log: (boolean) When log is True, the pod log is included in the dict that gets returned url: (boolean) When url is True, the notebook URL is included in the dict that gets returned - ''' + """ api = client.CoreV1Api() if pod is None: pod = api.read_namespaced_pod(name=name.lower(), namespace=namespace) notebook = dict() - notebook['id'] = pod.metadata.name - notebook['name'] = pod.metadata.labels.get('notebook-name') - notebook['namespace'] = namespace - notebook['owner'] = pod.metadata.labels.get('owner') - notebook['image'] = pod.spec.containers[0].image - notebook['node'] = pod.spec.node_name - notebook['node_selector'] = pod.spec.node_selector - notebook['pod_status'] = pod.status.phase - notebook['creation_date'] = pod.metadata.creation_timestamp.isoformat() + notebook["id"] = pod.metadata.name + notebook["name"] = pod.metadata.labels.get("notebook-name") + notebook["namespace"] = namespace + notebook["owner"] = pod.metadata.labels.get("owner") + notebook["image"] = pod.spec.containers[0].image + notebook["node"] = pod.spec.node_name + notebook["node_selector"] = pod.spec.node_selector + notebook["pod_status"] = pod.status.phase + notebook["creation_date"] = pod.metadata.creation_timestamp.isoformat() expiration_date = get_expiration_date(pod) time_remaining = expiration_date - datetime.datetime.now(datetime.timezone.utc) - notebook['expiration_date'] = expiration_date.isoformat() - notebook['hours_remaining'] = int(time_remaining.total_seconds() / 3600) - notebook['requests'] = pod.spec.containers[0].resources.requests - notebook['limits'] = pod.spec.containers[0].resources.limits - notebook['conditions'] = [{'type': c.type, 'status': c.status, - 'timestamp': c.last_transition_time.isoformat()} for c in pod.status.conditions] - notebook['conditions'].sort(key=lambda cond: dict( - PodScheduled=0, Initialized=1, ContainersReady=2, Ready=3).get(cond['type'])) + notebook["expiration_date"] = expiration_date.isoformat() + notebook["hours_remaining"] = int(time_remaining.total_seconds() / 3600) + notebook["requests"] = pod.spec.containers[0].resources.requests + notebook["limits"] = pod.spec.containers[0].resources.limits + notebook["conditions"] = [ + { + "type": c.type, + "status": c.status, + "timestamp": c.last_transition_time.isoformat(), + } + for c in pod.status.conditions + ] + notebook["conditions"].sort( + key=lambda cond: dict( + PodScheduled=0, Initialized=1, ContainersReady=2, Ready=3 + ).get(cond["type"]) + ) events = api.list_namespaced_event( - namespace=namespace, field_selector='involvedObject.uid=%s' % pod.metadata.uid).items - notebook['events'] = [{'message': e.message, 'timestamp': e.last_timestamp.isoformat( - ) if e.last_timestamp else None} for e in events] + namespace=namespace, field_selector="involvedObject.uid=%s" % pod.metadata.uid + ).items + notebook["events"] = [ + { + "message": e.message, + "timestamp": e.last_timestamp.isoformat() if e.last_timestamp else None, + } + for e in events + ] if pod.spec.node_name: node = api.read_node(pod.spec.node_name) - if node.metadata.labels.get('gpu') == 'true': - notebook['gpu'] = {'product': node.metadata.labels['nvidia.com/gpu.product'], - 'memory': node.metadata.labels['nvidia.com/gpu.memory'] + 'Mi'} + if node.metadata.labels.get("gpu") == "true": + notebook["gpu"] = { + "product": node.metadata.labels["nvidia.com/gpu.product"], + "memory": 
node.metadata.labels["nvidia.com/gpu.memory"] + "Mi", + } if pod.metadata.deletion_timestamp is None: - if next(filter(lambda c: c.type == 'Ready' and c.status == 'True', pod.status.conditions), None): + if next( + filter( + lambda c: c.type == "Ready" and c.status == "True", + pod.status.conditions, + ), + None, + ): log = api.read_namespaced_pod_log(pod.metadata.name, namespace=namespace) - notebook['status'] = 'Ready' if re.search( - 'Jupyter.*is running at', log) else 'Starting notebook...' + notebook["status"] = ( + "Ready" + if re.search("Jupyter.*is running at", log) + else "Starting notebook..." + ) else: - notebook['status'] = 'Pending' + notebook["status"] = "Pending" else: - notebook['status'] = 'Removing notebook...' + notebook["status"] = "Removing notebook..." # Optional fields - if options.get('log') is True and 'log' in locals(): - notebook['log'] = log - if options.get('url') is True and pod.metadata.deletion_timestamp is None: - token = api.read_namespaced_secret(pod.metadata.name, namespace).data['token'] - notebook['url'] = 'https://%s.%s?%s' % (pod.metadata.name, - app.config['DOMAIN_NAME'], urllib.parse.urlencode({'token': token})) + if options.get("log") is True and "log" in locals(): + notebook["log"] = log + if options.get("url") is True and pod.metadata.deletion_timestamp is None: + token = api.read_namespaced_secret(pod.metadata.name, namespace).data["token"] + notebook["url"] = "https://%s.%s?%s" % ( + pod.metadata.name, + app.config["DOMAIN_NAME"], + urllib.parse.urlencode({"token": token}), + ) return notebook def get_notebooks(owner=None, **options): - ''' + """ Retrieves a user's notebooks, or the notebooks for all users. Returns an array of dicts. Function parameters: @@ -258,29 +291,39 @@ def get_notebooks(owner=None, **options): owner: (string) The username of the owner. When owner is None, the function returns all notebooks for all users. log: (boolean) When log is True, the pod log is included in the dict that gets returned url: (boolean) When url is True, the notebook URL is included in the dict that gets returned - ''' + """ notebooks = [] api = client.CoreV1Api() pods = api.list_namespaced_pod( - namespace, label_selector='k8s-app=jupyterlab' if owner is None else 'k8s-app=jupyterlab,owner=%s' % owner).items + namespace, + label_selector=( + "k8s-app=jupyterlab" + if owner is None + else "k8s-app=jupyterlab,owner=%s" % owner + ), + ).items for pod in pods: try: notebook = get_notebook(pod=pod, **options) notebooks.append(notebook) except Exception as err: - logger.error('Error adding notebook %s to array.\n%s' % (pod.metadata.name, str(err))) + logger.error( + "Error adding notebook %s to array.\n%s" % (pod.metadata.name, str(err)) + ) return notebooks def list_notebooks(): - ''' Returns a list of the names of all notebooks in the namespace. ''' + """Returns a list of the names of all notebooks in the namespace.""" api = client.CoreV1Api() - pods = api.list_namespaced_pod(namespace=namespace, label_selector='k8s-app=jupyterlab').items + pods = api.list_namespaced_pod( + namespace=namespace, label_selector="k8s-app=jupyterlab" + ).items return [pod.metadata.name for pod in pods] def remove_notebook(name): - ''' Removes a notebook from the namespace, and all Kubernetes objects associated with the notebook. 
''' + """Removes a notebook from the namespace, and all Kubernetes objects associated with the notebook.""" try: id = name.lower() api = client.CoreV1Api() @@ -289,7 +332,7 @@ def remove_notebook(name): api.delete_namespaced_secret(id, namespace) api = client.NetworkingV1Api() api.delete_namespaced_ingress(id, namespace) - logger.info('Removed notebook %s from namespace %s' % (id, namespace)) + logger.info("Removed notebook %s from namespace %s" % (id, namespace)) return True except Exception as err: logger.error(str(err)) @@ -297,36 +340,39 @@ def remove_notebook(name): def notebook_name_available(name): - ''' Returns a boolean indicating whether a notebook name is available for use. ''' + """Returns a boolean indicating whether a notebook name is available for use.""" api = client.CoreV1Api() pods = api.list_namespaced_pod( - namespace, field_selector='metadata.name={0}'.format(name.lower())) + namespace, field_selector="metadata.name={0}".format(name.lower()) + ) return len(pods.items) == 0 def generate_notebook_name(owner): - ''' Returns a default notebook name that is available for use, e.g. testuser-notebook-3. ''' + """Returns a default notebook name that is available for use, e.g. testuser-notebook-3.""" for i in range(1, 20): - notebook_name = f'{owner}-notebook-{i}' + notebook_name = f"{owner}-notebook-{i}" if notebook_name_available(notebook_name): return notebook_name return None def supported_images(): - ''' Returns a tuple of Docker images that are supported by the JupyterLab service. ''' - return ('hub.opensciencegrid.org/usatlas/ml-platform:latest', - 'hub.opensciencegrid.org/usatlas/ml-platform:conda', - 'hub.opensciencegrid.org/usatlas/ml-platform:julia', - 'hub.opensciencegrid.org/usatlas/ml-platform:lava', - 'hub.opensciencegrid.org/usatlas/ml-platform:centos-v2-docker', - 'hub.opensciencegrid.org/usatlas/analysis-dask-uc:main', - 'hub.opensciencegrid.org/usatlas/analysis-dask-uc:dev', - 'hub.opensciencegrid.org/usatlas/ml-platform:oct_upgrade') + """Returns a tuple of Docker images that are supported by the JupyterLab service.""" + return ( + "hub.opensciencegrid.org/usatlas/ml-platform:latest", + "hub.opensciencegrid.org/usatlas/ml-platform:conda", + "hub.opensciencegrid.org/usatlas/ml-platform:julia", + "hub.opensciencegrid.org/usatlas/ml-platform:lava", + "hub.opensciencegrid.org/usatlas/ml-platform:centos-v2-docker", + "hub.opensciencegrid.org/usatlas/analysis-dask-uc:main", + "hub.opensciencegrid.org/usatlas/analysis-dask-uc:dev", + "hub.opensciencegrid.org/usatlas/ml-platform:oct_upgrade", + ) def get_gpu_availability(product=None, memory=None): - ''' + """ Looks up a GPU product by its product name or memory cache size, and gets its availability. When this function is called without arguments, it gets the availability of every GPU product. Returns a list of dicts. @@ -354,28 +400,40 @@ def get_gpu_availability(product=None, memory=None): d. To calculate availability, subtract the total number of requests for this GPU from the total number of instances = - 4. Get the hash map values as a list. Sort the list. Each entry in the list gives the availability of a unique GPU product. - Return the sorted list of dicts. - ''' + Return the sorted list of dicts. 
+ """ gpus = dict() api = client.CoreV1Api() if product: - nodes = api.list_node(label_selector='gpu=true,nvidia.com/gpu.product=%s' % product) + nodes = api.list_node( + label_selector="gpu=true,nvidia.com/gpu.product=%s" % product + ) elif memory: - nodes = api.list_node(label_selector='gpu=true,nvidia.com/gpu.memory=%s' % memory) + nodes = api.list_node( + label_selector="gpu=true,nvidia.com/gpu.memory=%s" % memory + ) else: - nodes = api.list_node(label_selector='nvidia.com/gpu.product') + nodes = api.list_node(label_selector="nvidia.com/gpu.product") for node in nodes.items: - product = node.metadata.labels['nvidia.com/gpu.product'] - memory = int(node.metadata.labels['nvidia.com/gpu.memory']) - count = int(node.metadata.labels['nvidia.com/gpu.count']) + product = node.metadata.labels["nvidia.com/gpu.product"] + memory = int(node.metadata.labels["nvidia.com/gpu.memory"]) + count = int(node.metadata.labels["nvidia.com/gpu.count"]) if product not in gpus: - gpus[product] = dict(mem_request_max=0, cpu_request_max=0, - product=product, memory=memory, count=count, total_requests=0) + gpus[product] = dict( + mem_request_max=0, + cpu_request_max=0, + product=product, + memory=memory, + count=count, + total_requests=0, + ) else: - gpus[product]['count'] += count + gpus[product]["count"] += count gpu = gpus[product] pods = api.list_pod_for_all_namespaces( - field_selector='spec.nodeName=%s,status.phase!=%s,status.phase!=%s' % (node.metadata.name, 'Succeeded', 'Failed')).items + field_selector="spec.nodeName=%s,status.phase!=%s,status.phase!=%s" + % (node.metadata.name, "Succeeded", "Failed") + ).items mem_request = 0 cpu_request = 0 gpu_request = 0 @@ -383,33 +441,45 @@ def get_gpu_availability(product=None, memory=None): for container in pod.spec.containers: requests = container.resources.requests if requests: - gpu['total_requests'] += int(requests.get('nvidia.com/gpu', 0)) - gpu_request += int(requests.get('nvidia.com/gpu', 0)) - mem_request += parse_quantity(requests.get('memory', 0)) - cpu_request += parse_quantity(requests.get('cpu', 0)) + gpu["total_requests"] += int(requests.get("nvidia.com/gpu", 0)) + gpu_request += int(requests.get("nvidia.com/gpu", 0)) + mem_request += parse_quantity(requests.get("memory", 0)) + cpu_request += parse_quantity(requests.get("cpu", 0)) # count in max only when there are at least 1 gpu available. the limitation is this guard is only safe if the requested # gpu is not more than 1. 
- if int(node.status.capacity['nvidia.com/gpu']) > gpu_request: + if int(node.status.capacity["nvidia.com/gpu"]) > gpu_request: mem_request_max = math.floor( - (parse_quantity(node.status.capacity['memory']) - mem_request)/(1024*1024*1024)) - cpu_request_max = math.floor(parse_quantity(node.status.capacity['cpu']) - cpu_request) - gpu['mem_request_max'] = mem_request_max if mem_request_max > gpu['mem_request_max'] else gpu['mem_request_max'] - gpu['cpu_request_max'] = cpu_request_max if cpu_request_max > gpu['cpu_request_max'] else gpu['cpu_request_max'] - gpu['available'] = max(gpu['count'] - gpu['total_requests'], 0) - return sorted(gpus.values(), key=lambda gpu: gpu['memory']) + (parse_quantity(node.status.capacity["memory"]) - mem_request) + / (1024 * 1024 * 1024) + ) + cpu_request_max = math.floor( + parse_quantity(node.status.capacity["cpu"]) - cpu_request + ) + gpu["mem_request_max"] = ( + mem_request_max + if mem_request_max > gpu["mem_request_max"] + else gpu["mem_request_max"] + ) + gpu["cpu_request_max"] = ( + cpu_request_max + if cpu_request_max > gpu["cpu_request_max"] + else gpu["cpu_request_max"] + ) + gpu["available"] = max(gpu["count"] - gpu["total_requests"], 0) + return sorted(gpus.values(), key=lambda gpu: gpu["memory"]) def get_expiration_date(pod): - ''' Returns the expiration date of the pod. ''' - pattern = re.compile(r'ttl-\d+') - if pattern.match(pod.metadata.labels['time2delete']): - hours = int(pod.metadata.labels['time2delete'].split('-')[1]) + """Returns the expiration date of the pod.""" + pattern = re.compile(r"ttl-\d+") + if pattern.match(pod.metadata.labels["time2delete"]): + hours = int(pod.metadata.labels["time2delete"].split("-")[1]) return pod.metadata.creation_timestamp + datetime.timedelta(hours=hours) return None def get_pod(name): - ''' Looks up a Kubernetes pod by its name and returns a pod object. ''' + """Looks up a Kubernetes pod by its name and returns a pod object.""" try: api = client.CoreV1Api() return api.read_namespaced_pod(name=name, namespace=namespace) diff --git a/portal/views.py b/portal/views.py index acf1c1b..28121fa 100644 --- a/portal/views.py +++ b/portal/views.py @@ -1,13 +1,14 @@ -""" +""" A module for all of the view functions. -The phrase "view function" gets its name from the Model-View-Template (MVT) design pattern. +The phrase "view function" gets its name from the Model-View-Template (MVT) design pattern. A view function is a request handler that returns a view. The view functions in this module are often preceded by an @app.route decorator. The @app.route decorator maps a URL to a view function (request handler). -The @app.route decorator also gives the view function access to the request and response objects in the decorator's namespace. +The @app.route decorator also gives the view function access to the request and response objects +in the decorator's namespace. For more documentation on decorators and the @app.route decorator, see decorators.py """
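For reference, a minimal standalone sketch of the ttl-label convention that get_expiration_date() and the notebook-maintenance loop in portal/jupyterlab.py rely on: a pod labeled time2delete=ttl-<hours> expires that many hours after its creation timestamp, and expired notebooks are handed to remove_notebook(). The timestamps below are hypothetical.

import datetime
import re

def expiration_from_label(creation_timestamp, ttl_label):
    # Mirrors get_expiration_date(): accept labels of the form ttl-<hours>.
    if re.match(r"ttl-\d+", ttl_label):
        hours = int(ttl_label.split("-")[1])
        return creation_timestamp + datetime.timedelta(hours=hours)
    return None

created = datetime.datetime(2024, 10, 30, 14, 0, tzinfo=datetime.timezone.utc)
expires = expiration_from_label(created, "ttl-168")  # hypothetical one-week notebook
print(expires)  # 2024-11-06 14:00:00+00:00
print(expires < datetime.datetime.now(datetime.timezone.utc))  # True once expired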
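The availability numbers returned by get_gpu_availability() come from the bookkeeping described in its docstring: per GPU product, instances are summed over matching nodes, nvidia.com/gpu requests from non-terminated pods are summed, and availability is the difference floored at zero; mem_request_max is the largest remaining memory (in Gi) on a node that still has a free GPU. A standalone sketch with hypothetical numbers:

import math

count = 8                              # GPU instances of this product across matching nodes
total_requests = 5                     # nvidia.com/gpu requests from non-terminated pods
node_memory_capacity = 512 * 1024**3   # bytes of memory on one such node
node_memory_requested = 300 * 1024**3  # bytes already requested on that node

available = max(count - total_requests, 0)
mem_request_max = math.floor((node_memory_capacity - node_memory_requested) / (1024**3))
print(available, mem_request_max)  # 3 212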
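The start-script selection in deploy_notebook() is kept as a str.count() truthiness test in this patch; it behaves the same as a membership check, which may read more clearly. A sketch with a hypothetical image choice:

image = "hub.opensciencegrid.org/usatlas/ml-platform:oct_upgrade"  # hypothetical selection

start_script = "/ML_platform_tests/SetupPrivateJupyterLab.sh"
if "oct_upgrade" in image:  # equivalent to image.count("oct_upgrade") being non-zero
    start_script = "/ML_platform_tests/SetupJupyterLab.sh"
print(start_script)  # /ML_platform_tests/SetupJupyterLab.sh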