From f32a1eceb6f89e72749eec5c2e619aa9d6bbe975 Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Fri, 16 Dec 2022 15:44:01 +0000 Subject: [PATCH 1/6] Redirect rapids.ai/cloud to Cloud Deployment docs --- cloud.md | 956 +------------------------------------------------------ 1 file changed, 4 insertions(+), 952 deletions(-) diff --git a/cloud.md b/cloud.md index 49d8e668..a9eabc53 100644 --- a/cloud.md +++ b/cloud.md @@ -1,954 +1,6 @@ --- -title: "RAPIDS + Cloud" -description: "Deploying RAPIDS in the Cloud" -tagline: "Deploying RAPIDS in the Cloud" -button_text: "Deploy Now" -button_link: "#deploy" -layout: default +title: RAPIDS + Cloud +permalink: /cloud +redirect_to: https://docs.rapids.ai/deployment/stable/cloud/ +[comment]: <> (This page is a redirect to the RAPIDS Cloud Deployment docs) --- - -![cloud]({{ site.baseurl }}{% link /assets/images/RAPIDS-cloud.png %}){: .projects-logo} - -{% capture intro_content %} - -RAPIDS GPU accelerated data science tools can be deployed on all of the major clouds, allowing anyone to take advantage of the speed increases and TCO reductions that RAPIDS enables. -{: .subtitle} - -RAPIDS can be deployed in a number of ways, from hosted Jupyter notebooks, to the major HPO services, all the way up to large-scale clusters via Dask or Kubernetes. Deploying on the cloud will require you to make use of supported GPU instances. Each major cloud provider has GPU instances that are supported by RAPIDS with varying capabilities and price points. The below charts identifies the major instance types of each cloud. - -{% endcapture %} - -{% include section-single.html - background="background-white" - padding-top="0em" padding-bottom="10em" - content-single=intro_content -%} - -{% capture csp_sel %} -# Cloud Providers - -For the various deployment options on each cloud, as well as instructions and links to more details, please select the cloud provider you wish to deploy on. 
-{: .subtitle} - -{% endcapture %} -{% capture csp_left %} -[![aws]({{ site.baseurl }}{% link /assets/images/AWS-logo.png %})](#aws)
-**[ Amazon Web Services ](#aws)** - -**[ Amazon SageMaker](#AWS-Sagemaker){: .block}** -**[ Single EC2 instance](#AWS-EC2){: .block}** -**[ Cluster using Dask](#AWS-Dask){: .block}** -**[ Cluster using Kubernetes](#AWS-Kubernetes){: .block}** - -{% endcapture %} -{% capture csp_mid %} - -[![azure]({{ site.baseurl }}{% link /assets/images/MS-azure-logo.png %})](#azure)
-**[ Microsoft Azure ](#azure)** - -**[ Azure Machine Learning](#AZ-ML){: .block}** -**[ Single instance](#AZ-single){: .block}** -**[ Cluster via Dask](#AZ-Dask){: .block}** -**[ Cluster via Kubernetes](#AZ-Kubernetes){: .block}** - -{% endcapture %} -{% capture csp_right %} - -[![gcp]({{ site.baseurl }}{% link /assets/images/GCP-logo.png %})](#googlecloud)
-**[ Google Cloud ](#googlecloud)** - -**[ Google AI Platform](#GC-AI){: .block}** -**[ Single instance](#GC-single){: .block}** -**[ Cluster using Dask (via Dataproc)](#GC-Dask){: .block}** -**[ Cluster using Kubernetes](#GC-Kubernetes){: .block}** - -{% endcapture %} - -
-{% include slopecap.html - background="background-purple" - position="top" - slope="down" -%} -{% include section-single.html - background="background-purple" - padding-top="5em" padding-bottom="3em" - content-single=csp_sel -%} -{% include section-thirds.html - background="background-purple" - padding-top="0em" padding-bottom="5em" - content-left-third=csp_left - content-middle-third=csp_mid - content-right-third=csp_right -%} -{% include slopecap.html - background="background-purple" - position="bottom" - slope="up" -%} - - -
-{% capture aws_intro %} - -![aws]({{ site.baseurl }}{% link /assets/images/AWS-logo.png %}) -## Amazon Web Services - -RAPIDS can be deployed on Amazon Web Services (AWS) in several ways: - -**[ Amazon SageMaker](#AWS-Sagemaker)**{: .block} -**[ Single EC2 instance](#AWS-EC2)**{: .block} -**[ Cluster using Dask](#AWS-Dask)**{: .block} -**[ Cluster using Kubernetes on EKS](#AWS-Kubernetes)**{: .block} - -| Cloud
Provider | Inst.
Type | Inst.
Name | vCPUs | GPU
Count | GPU
Type | xGPU
RAM | xGPU
RAM Total | -| :------------------ | --------------- | ---------------- | ----- | -------------- | ------------- | ------------- | ------------------: | -| AWS | G4dn | g4dn\.xlarge | 4 | 1 | T4 | 16 (GB) | 16 (GB) | -| AWS | G4dn | g4dn\.12xlarge | 48 | 4 | T4 | 16 (GB) | 64 (GB) | -| AWS | G4dn | g4dn\.metal | 96 | 8 | T4 | 16 (GB) | 128 (GB) | -| AWS | P3 | p3\.2xlarge | 8 | 1 | V100 | 16 (GB) | 16 (GB) | -| AWS | P3 | p3\.8xlarge | 32 | 4 | V100 | 16 (GB) | 64 (GB) | -| AWS | P3 | p3\.16xlarge | 64 | 8 | V100 | 16 (GB) | 128 (GB) | -| AWS | P3 | p3dn\.24xlarge | 96 | 8 | V100 | 32 (GB) | 256 (GB) | -| AWS | P4 | p4d\.24xlarge | 96 | 8 | A100 | 40 (GB) | 320 (GB) | -| AWS | G5 | g5\.xlarge | 4 | 1 | A10G | 24 (GB) | 24 (GB) | -| AWS | G5 | g5\.2xlarge | 8 | 1 | A10G | 24 (GB) | 24 (GB) | -| AWS | G5 | g5\.4xlarge | 16 | 1 | A10G | 24 (GB) | 24 (GB) | -| AWS | G5 | g5\.8xlarge | 32 | 1 | A10G | 24 (GB) | 24 (GB) | -| AWS | G5 | g5\.16xlarge | 64 | 1 | A10G | 24 (GB) | 24 (GB) | -| AWS | G5 | g5\.12xlarge | 48 | 4 | A10G | 24 (GB) | 96 (GB) | -| AWS | G5 | g5\.24xlarge | 96 | 4 | A10G | 24 (GB) | 96 (GB) | -| AWS | G5 | g5\.48xlarge | 192 | 8 | A10G | 24 (GB) | 192 (GB) | -{: .cloud-table} - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% include section-single.html - background="background-gray" - padding-top="10em" padding-bottom="3em" - content-single=aws_intro -%} -{% include slopecap.html - background="background-gray" - position="bottom" - slope="down" -%} - -{% capture aws_ec2 %} -## AWS Single Instance (EC2) - -There are multiple ways you can deploy RAPIDS on a single instance, but the easiest is to use the RAPIDS docker image: - -**1. Initiate.** Initiate an instance supported by RAPIDS. See the introduction -section for a list of supported instance types. 
It is recommended to use an AMI -that already includes the required NVIDIA drivers, such as the **[AWS Deep Learning AMI](https://aws.amazon.com/marketplace/pp/prodview-7ikjtg3um26wq?sr=0-9&ref_=beagle&applicationId=AWSMPContessa)** -**([docs](https://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html))**. Other options include the **[Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver](https://aws.amazon.com/marketplace/pp/Amazon-Web-Services-Amazon-Linux-2-AMI-with-NVIDIA/B07S5G9S1Z)**. - -**2. Credentials.** Using the credentials supplied by AWS, log into the instance via SSH. For a short guide on launching your instance and accessing it, read the Getting Started with Amazon EC2 documentation. - -**3. Install.** Install **[Docker and the NVIDIA Docker runtime](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)** in the AWS instance. This step is not required if you are using AWS Deep Learning AMI. - -**4. Install.** Install RAPIDS docker image. The docker container can be customized by using the options provided in the **[Getting Started](https://rapids.ai/start.html)** page of RAPIDS. Example of an image that can be used is provided below: -```shell ->>> docker pull rapidsai/rapidsai:cuda11.2-runtime-ubuntu18.04 ->>> docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 \ - rapidsai/rapidsai:cuda11.2-runtime-ubuntu18.04-py3.7 -``` -{: .margin-bottom-3em} - -**5. Test RAPIDS.** Test it! The RAPIDS docker image will start a Jupyter notebook instance automatically. You can log into it by going to the IP address provided by AWS on port 8888. - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% capture aws_dask %} -## AWS Cluster via Dask - -RAPIDS can be deployed on a multi-node ECS cluster using Dask’s dask-cloud provider management tools. 
For more details, see our **[blog post on deploying on ECS.](https://medium.com/rapids-ai/getting-started-with-rapids-on-aws-ecs-using-dask-cloud-provider-b1adfdbc9c6e)** - -**0. Run from within AWS.** The following steps assume you are running them from within the same AWS VPC. One way to ensure this is to run through the **[AWS Single Instance (EC2)](#AWS-EC2)** instructions and then run these steps from there. - -**1. Setup AWS credentials.** First, you will need AWS credentials to allow us to interact with the AWS CLI. If someone else manages your AWS account, you will need to get these keys from them. You can provide these credentials to dask-cloudprovider in a number of ways, but the easiest is to setup your local environment using the AWS command line tools: -```shell ->>> pip install awscli ->>> aws configure -``` -{: .margin-bottom-3em} - -**2. Install dask-cloudprovider.** To install, you will need to run the following: -```shell ->>> pip install dask-cloudprovider[aws] -``` -{: .margin-bottom-3em} - -**3. Create an EC2 cluster:** In the AWS console, visit the ECS dashboard. From the “Clusters” section on the left hand side, click “Create Cluster” then: -- Make sure to select an EC2 Linux + Networking cluster so that we can specify our networking options. -- Give the cluster a name EX. `rapids-cluster`. -- Change the instance type to one that supports RAPIDS-supported GPUs (see introduction section for list of supported instance types). For this example, we will use `p3.2xlarge`, each of which comes with one NVIDIA V100 GPU. -- In the networking section, select the default VPC and all the subnets available in that VPC. - -All other options can be left at defaults. You can now click “create” and wait for the cluster creation to complete. - -**4. Create a Dask cluster:** - -Get the Amazon Resource Name (ARN) for the cluster you just created. 
- -Set `AWS_DEFAULT_REGION` environment variable to your **[default region](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html#concepts-regions)**: -```shell -export AWS_DEFAULT_REGION=[REGION] -``` -[REGION] = code fo the region being used. -{: .margin-bottom-3em} - -Create the ECSCluster object in your Python session: -```shell ->>> from dask_cloudprovider.aws import ECSCluster ->>> cluster = ECSCluster( - cluster_arn=[CLUSTER_ARN], - n_workers=[NUM_WORKERS], - worker_gpu=[NUM_GPUS] - ) -``` -[CLUSTER_ARN] = The ARN of an existing ECS cluster to use for launching tasks
-[NUM_WORKERS] = Number of workers to start on cluster creation.
-[NUM_GPUS] = The number of GPUs to expose to the worker, this must be less than or equal to the number of GPUs in the instance type you selected for the ECS cluster (e.g `1` for `p3.2xlarge`). -{: .margin-bottom-3em} - -**5. Test RAPIDS.** Create a distributed client for our cluster: -```shell ->>> from dask.distributed import Client ->>> client = Client(cluster) -``` -{: .margin-bottom-3em} - -Load sample data and test the cluster! -```shell ->>> import dask, cudf, dask_cudf ->>> ddf = dask.datasets.timeseries() ->>> gdf = ddf.map_partitions(cudf.from_pandas) ->>> gdf.groupby(‘name’).id.count().compute().head() -Out[34]: -Xavier 99495 -Oliver 100251 -Charlie 99354 -Zelda 99709 -Alice 100106 -Name: id, dtype: int64 -``` -{: .margin-bottom-3em} - -**6. Cleanup.** Your cluster will continue to run (and incur charges!) until you shut it down. You can either scale the number of nodes down to zero instances, or shut it down altogether. If you are planning to use the cluster again soon, it is probably preferable to reduce the nodes to zero. - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% capture aws_kub %} -## AWS Cluster via Kubernetes - -RAPIDS can be deployed on AWS via AWS’s managed Kubernetes service (EKS) using Helm. More details can be found at our **[helm docs.](https://helm.rapids.ai/docs/csp.html)** - -**1. Install.** Install and configure dependencies in your local environment: -kubectl, helm, awscli, and eksctl. - -**2. Public Key.** Create a public key if you don't have one. - -**3. Create your cluster:** -```shell ->>> eksctl create cluster \ - --name [CLUSTER_NAME] \ - --version 1.14 \ - --region [REGION] \ - --nodegroup-name gpu-workers \ - --node-type [NODE_INSTANCE] \ - --nodes [NUM_NODES] \ - --nodes-min 1 \ - --nodes-max [MAX_NODES] \ - --node-volume-size [NODE_SIZE] \ - --ssh-access \ - --ssh-public-key ~/path/to/id_rsa.pub \ - --managed -``` -[CLUSTER_NAME] = Name of the EKS cluster. This will be auto generated if not specified.
-[NODE_INSTANCE] = Node instance type to be used. Select one of the instance types supported by RAPIDS Refer to the introduction section for a list of supported instance types.
-[NUM_NODES] = Number of nodes to be used.
-[MAX_NODES] = Maximum size of the nodes.
-[NODE_SIZE] = Size of the nodes.
- -Update the path to the ssh-public-key to point to the folder and file where your public key is saved. -{: .margin-bottom-3em} - -**4. Install GPU addon:** -```shell ->>> kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta4/nvidia-device-plugin.yml -``` -{: .margin-bottom-3em} - -**5. Install RAPIDS helm repo:** -```shell ->>> helm repo add rapidsai https://helm.rapids.ai ->>> helm repo update -``` -{: .margin-bottom-3em} - -**6. Install helm chart:** -```shell ->>> helm install --set dask.scheduler.serviceType="LoadBalancer" --set dask.jupyter.serviceType="LoadBalancer" rapidstest rapidsai/rapidsai -``` -{: .margin-bottom-3em} - -**7. Accessing your cluster:** -```shell ->>> kubectl get svc -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -kubernetes ClusterIP 10.100.0.1 443/TCP 12m -rapidsai-jupyter LoadBalancer 10.100.251.155 a454a9741455544cfa37fc4ac71caa53-868718558.us-east-1.elb.amazonaws.com 80:30633/TCP 85s -rapidsai-scheduler LoadBalancer 10.100.11.182 a9c703f1c002f478ea60d9acaf165bab-1146605388.us-east-1.elb.amazonaws.com 8786:30346/TCP,8787:32444/TCP 85s -``` -{: .margin-bottom-3em} - -**7. ELB IP address:** **[Convert the DNS address provided above as the EXTERNAL-IP address to an IPV4 address](https://aws.amazon.com/premiumsupport/knowledge-center/elb-find-load-balancer-IP/)**. -Then use the obtained IPV4 address to visit the rapidsai-jupyter service in your browser! -{: .margin-bottom-3em} - -**8. Delete the cluster:** List and delete services running in the cluster to release resources -```shell ->>> kubectl get svc --all-namespaces ->>> kubectl delete svc [SERVICE_NAME] -``` -[SERVICE_NAME] = Name of the services which have an EXTERNAL-IP value and are required to be removed to release resources. - -Delete the cluster and its associated nodes -```shell ->>> eksctl delete cluster --region=[REGION] --name=[CLUSTER_NAME] -``` -{: .margin-bottom-3em} - -**9. 
Uninstall the helm chart:** -```shell ->>> helm uninstall rapidstest -``` -{: .margin-bottom-3em} - - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% capture aws_sage %} -## AWS Sagemaker - -RAPIDS also works with AWS SageMaker. We’ve written a **[detailed guide](https://medium.com/rapids-ai/running-rapids-experiments-at-scale-using-amazon-sagemaker-d516420f165b)** with **[examples](https://github.com/rapidsai/cloud-ml-examples/tree/main/aws)** for how to use Sagemaker with RAPIDS, but the simplest version is: - -**1. Start.** Start a Sagemaker hosted Jupyter notebook instance on AWS. - -**2. Clone.** **[Clone the example repository](https://github.com/shashankprasanna/sagemaker-rapids.git)** which includes all required setup and some example data and code. - -**3. Run.** Start running the sagemaker-rapids.ipynb jupyter notebook. - -For more details, including on running large-scale HPO jobs on Sagemaker with RAPIDS, check out the **[detailed guide](https://medium.com/rapids-ai/running-rapids-experiments-at-scale-using-amazon-sagemaker-d516420f165b)** and **[examples.](https://github.com/rapidsai/cloud-ml-examples/tree/main/aws)** - -**[Jump to Top ](#deploy)** - -{% endcapture %} -
-{% include section-single.html - background="background-white" - padding-top="6em" padding-bottom="0em" - content-single=aws_sage -%} -
-{% include section-single.html - background="background-white" - padding-top="3em" padding-bottom="0em" - content-single=aws_ec2 -%} -
-{% include section-single.html - background="background-white" - padding-top="3em" padding-bottom="0em" - content-single=aws_dask -%} -
-{% include section-single.html - background="background-white" - padding-top="3em" padding-bottom="10em" - content-single=aws_kub -%} - - -
-{% capture azure_intro %} -![azure]({{ site.baseurl }}{% link /assets/images/MS-azure-logo.png %}) -## Microsoft Azure - -RAPIDS can be deployed on Microsoft Azure via several methods: -**[ Azure Machine Learning](#AZ-ML)**{: .block} -**[ Single instance](#AZ-single)**{: .block} -**[ Cluster via Dask](#AZ-Dask)**{: .block} -**[ Cluster via Kubernetes on AKS](#AZ-Kubernetes)**{: .block} - -| Cloud
Provider | Inst.
Type | Inst.
Name | GPU
Count | GPU
Type | xGPU
RAM | xGPU
RAM Total | -|----------------|---------------|---------------|---------|----------|----------------------|---------------|-----------------| -| Azure | NDs | ND6s | 1 | P40 | 24 (GB) | 24 (GB) | -| Azure | NDs | ND12s | 2 | P40 | 24 (GB) | 48 (GB) | -| Azure | NDs | ND24s | 4 | P40 | 24 (GB) | 96 (GB) | -| Azure | NCs v2 | NC6s v2 | 1 | P100 | 16 (GB) | 16 (GB) | -| Azure | NCs v2 | NC12s v2 | 2 | P100 | 16 (GB) | 32 (GB) | -| Azure | NCs v2 | NC24s v2 | 4 | P100 | 16 (GB) | 64 (GB) | -| Azure | NCas T4 v3 | NC4as T4 v3 | 1 | T4 | 16 (GB) | 16 (GB) | -| Azure | NCas T4 v3 | NC64as T4 v3 | 4 | T4 | 16 (GB) | 64 (GB) | -| Azure | NCs v3 | NC6s v3 | 1 | V100 | 16 (GB) | 16 (GB) | -| Azure | NCs v3 | NC12s v3 | 2 | V100 | 16 (GB) | 32 (GB) | -| Azure | NCs v3 | NC24s v3 | 4 | V100 | 16 (GB) | 64 (GB) | -| Azure | NDs v2 | ND40rs | 8 | V100 | 32 (GB) | 256 (GB) | -{: .cloud-table} - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% include slopecap.html - background="background-gray" - position="top" - slope="down" -%} -{% include section-single.html - background="background-gray" - padding-top="3em" padding-bottom="3em" - content-single=azure_intro -%} -{% include slopecap.html - background="background-gray" - position="bottom" - slope="up" -%} - -{% capture az_single %} -## Azure Single Instance (VM) - -There are multiple ways you can deploy RAPIDS on a single VM instance, but the easiest is to use the RAPIDS docker image: - -**1. Initiate VM.** **[Initiate a VM instance](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/quick-create-portal)** using a VM supported by RAPIDS. See the introduction section for a list of supported instance types. It is recommended to use an image that already includes the required NVIDIA drivers, such as **[this one.](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/nvidia.ngc_azure_17_11?tab=Overview)** - -**2. Credentials.** Using the credentials supplied by Azure, log into the instance via SSH. - -**3. 
Docker Permissions.** **[Setup docker user permissions.](https://docs.docker.com/engine/install/linux-postinstall/)** - -**4. Install.** **[Install RAPIDS docker image](https://rapids.ai/start.html)**. -The docker container can be customized by using the options provided in the **[Getting Started](https://rapids.ai/start.html)** page of RAPIDS. Example of an image that can be used is provided below: - -```shell ->>> docker pull -rapidsai/rapidsai:cuda11.2-runtime-ubuntu18.04 >>> docker run --gpus all --rm --it -p 8888:8888 -p 8787:8787 -p 8786:8786 \ -rapidsai/rapidsai:cuda11.2-runtime-ubuntu18.04-py3.7 -``` -{: .margin-bottom-3em} - -**5. Test RAPIDS.** Test it! The RAPIDS docker image will start a Jupyter notebook instance automatically. You can log into it by going to the IP address provided by Azure on port 8888. - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% capture az_dask %} -## Azure Cluster via Dask - -RAPIDS can be deployed on a Dask cluster on Azure ML Compute using dask-cloud provider. - -**1. Install.** Install Azure tools (azure-cli). - -**2. Install dask-cloudprovider:** -```shell ->>> pip install dask-cloudprovider -``` -{: .margin-bottom-3em} - -**3. Config.** Create your workspace config file -see **[Azure docs](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-environment#workspace)** for details. - -**4. Environment.** Setup your Azure ML core environment using the RAPIDS docker container: -```shell -from azureml.core import Environment ->>> # create the environment ->>> rapids_env = Environment('rapids_env') ->>> # create the environment inside a Docker container ->>> rapids_env.docker.enabled = True ->>> # specify docker steps as a string. 
Alternatively, load the string from a file ->>> dockerfile = """ ->>> FROM [CONTAINER]:[TAG] ->>> RUN source activate rapids && \ ->>> pip install azureml-sdk && \ ->>> [ADDITIONAL_LIBRARIES] ->>> """ ->>> # set base image to None since the image is defined by dockerfile ->>> rapids_env.docker.base_image = None ->>> rapids_env.docker.base_dockerfile = dockerfile ->>> # use rapids environment in the container ->>> rapids_env.python.user_managed_dependencies = True -``` -[CONTAINER] = RAPIDS container to be used, for example, `rapidsai/rapidsai`
-[TAG] = Docker container tag.
-[ADDITIONAL_LIBRARIES] = Additional libraries required by the user can be installed by either using `conda` or `pip` install.
-{: .margin-bottom-3em} - -**5. Setup.** Setup your Azure ML Workspace using the config file created in the previous step: -```shell ->>> from azureml.core import Workspace ->>> ws = Workspace.from_config() -``` -{: .margin-bottom-3em} - -**6. Create the AzureMLCluster:** -```shell ->>> from dask_cloudprovider import AzureMLCluster ->>> cluster = AzureMLCluster(ws, - datastores=ws.datastores.values(), - environment_definition=rapids_env, - initial_node_count=[NUM_NODES]) -``` -[NUM_NODES] = Number of nodes to be used.
- -{: .margin-bottom-3em} - -**7. Run Notebook.** In a Jupyter notebook, the cluster object will return a widget allowing you to scale up and containing links to the Jupyter Lab session running on the headnode and Dask dashboard, which are forwarded to local ports for you -unless running on a remote Compute Instance. - - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% capture az_kub %} -## Azure Cluster via Kubernetes - -RAPIDS can be deployed on a Kubernetes cluster on Azure using Helm. More details can be found at our **[helm docs.](https://helm.rapids.ai/docs/csp.html)** - -**1. Install.** Install and configure dependencies on your local environment: kubectl, helm, and az (azure-cli). - -**2. Configure.** Configure az and create a resource group if you don't already have one. -```shell ->>> az login ->>> az group create --name [RESOURCE_GROUP] --location [REGION] -``` -[RESOURCE_GROUP] = resource group to be created.
-[REGION] = the location where the resource group should be created. -{: .margin-bottom-3em} - -**3. Create your cluster:** -```shell ->>> az aks create \ - --resource-group [RESOURCE_GROUP] \ - --name [CLUSTER_NAME] \ - --node-vm-size [VM_SIZE] \ - --node-count [NUM_NODES] -``` -[CLUSTER_NAME] = Name of the managed cluster.
-[NUM_NODES] = Number of nodes in the Kubernetes node pool.
-[VM_SIZE] = the size of the VM you would like to target. This must include a RAPIDS-compatible GPU. Ex. `Standard_NC12` - -Please refer to the **[Microsoft Azure CLI documentation](https://docs.microsoft.com/en-us/cli/azure/aks?view=azure-cli-latest#az-aks-create)** for more information. -{: .margin-bottom-3em} - -**4. Update your local kubectl config file:** -```shell ->>> az aks get-credentials --resource-group myResourceGroup --name rapids -``` -{: .margin-bottom-3em} - -**5. Install.** **[Install Kubernetes NVIDIA Device Plugin:](https://github.com/NVIDIA/k8s-device-plugin)** -```shell ->>> helm repo add nvdp https://nvidia.github.io/k8s-device-plugin ->>> helm repo update ->>> helm install \ - --version=0.6.0 \ - --generate-name \ - nvdp/nvidia-device-plugin -``` -{: .margin-bottom-3em} - -**6. Install RAPIDS helm repo:** -```shell ->>> helm repo add rapidsai https://helm.rapids.ai ->>> helm repo update -``` -{: .margin-bottom-3em} - -**7. Install helm chart:** -```shell ->>> helm install --set dask.scheduler.serviceType="LoadBalancer" --set dask.jupyter.serviceType="LoadBalancer" rapidstest rapidsai/rapidsai -``` -{: .margin-bottom-3em} - -**8. Accessing your cluster:** -```shell ->>> kubectl get svc -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -kubernetes ClusterIP 10.100.0.1 443/TCP 14m -rapidsai-jupyter LoadBalancer 10.100.208.179 1.2.3.4 80:32332/TCP 3m30s -rapidsai-scheduler LoadBalancer 10.100.19.121 5.6.7.8 8786:31779/TCP,80:32011/TCP 3m30s -``` -{: .margin-bottom-3em} - -You can now visit the external IP of the rapidsai-jupyter service in your browser! - -**9. Uninstall the helm chart:** -```shell ->>> helm uninstall rapidstest -``` -{: .margin-bottom-3em} - - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% capture az_ml %} -## Azure Machine Learning (Azure ML) - -RAPIDS can be deployed at scale using Azure Machine Learning Service and easily scales up to any size needed. 
We have written a **[detailed guide](https://medium.com/rapids-ai/rapids-on-microsoft-azure-machine-learning-b51d5d5fde2b)** with **[helper scripts](https://github.com/rapidsai/cloud-ml-examples/tree/main/azure)** to get everything deployed, but the high level procedure is: - -**1. Create.** Create your Azure Resource Group. - -**2. Workspace.** Within the Resource Group, create an Azure Machine Learning service Workspace. - -**3. Config.** Within the Workspace, download the config.json file and verify that subscription_id, resource_group, and workspace_name are set correctly for your environment. - -**4. Quota.** Within your Workspace, check your Usage + Quota to ensure you have enough quota to launch your desired cluster size. - -**5. Clone.** From your local machine, clone the RAPIDS demonstration code and helper scripts. - -**6. Run Utility.** Run the RAPIDS helper utility script to initialize the Azure Machine Learning service Workspace: - -```shell ->>> ./start_azureml.py \ - --config=[CONFIG_PATH] \ - --vm_size=[VM_SIZE] \ - --node_count=[NUM_NODES] -``` -[CONFIG_PATH] = the path to the config file you downloaded in step three. -{: .margin-bottom-3em} - -**7. Start.** Open your browser to http://localhost:8888 and get started! - -See **[the guide](https://medium.com/rapids-ai/rapids-on-microsoft-azure-machine-learning-b51d5d5fde2b#fee3)** or **[GitHub](https://github.com/rapidsai/cloud-ml-examples/tree/main/azure)** for more details. - -**[Jump to Top ](#deploy)** - -{% endcapture %} -
-{% include section-single.html - background="background-white" - padding-top="6em" padding-bottom="0em" - content-single=az_single -%} -
-{% include section-single.html - background="background-white" - padding-top="5em" padding-bottom="0em" - content-single=az_dask -%} -
-{% include section-single.html - background="background-white" - padding-top="3em" padding-bottom="0em" - content-single=az_kub -%} -
-{% include section-single.html - background="background-white" - padding-top="3em" padding-bottom="10em" - content-single=az_ml -%} - - -
-{% capture gcp_intro %} -![gcp]({{ site.baseurl }}{% link /assets/images/GCP-logo.png %}) -## Google Cloud - -RAPIDS can be used in Google Cloud in several different ways: -**[ Google AI Platform](#GC-AI)**{: .block} -**[ Single instance](#GC-single)**{: .block} -**[ Cluster using Dask (via Dataproc)](#GC-Dask)**{: .block} -**[ Cluster using Kubernetes on GKE](#GC-Kubernetes)**{: .block} - - -| Cloud
Provider | Inst.
Type | Inst.
Name | GPU
Count | GPU
Type | xGPU
RAM | xGPU
RAM Total | -|----------------|----------------------------|------------------|------------|----------|--------------------|----------------| -| Google Cloud | GPU Compute Workload Addon | Any Machine Type | 1, 2, 4 | P4 | 8 (GB) | 8, 16, 32 (GB) | -| Google Cloud | GPU Compute Workload Addon | Any Machine Type | 1, 2, 4 | P100 | 16 (GB) | 16, 32, 64 (GB) | -| Google Cloud | GPU Compute Workload Addon | Any Machine Type | 1, 2, 4 | T4 | 16 (GB) | 16, 32, 64 (GB) | -| Google Cloud | GPU Compute Workload Addon | Any Machine Type | 1, 2, 4, 8 | V100 | 16 (GB) | 16, 32, 64, 128 (GB) | -| Google Cloud | A2 | a2-highgpu-1g | 1 | A100 | 40 (GB) | 40 (GB) | -| Google Cloud | A2 | a2-highgpu-2g | 2 | A100 | 40 (GB) | 80 (GB) | -| Google Cloud | A2 | a2-highgpu-4g | 4 | A100 | 40 (GB) | 160 (GB) | -| Google Cloud | A2 | a2-highgpu-8g | 8 | A100 | 40 (GB) | 320 (GB) | -| Google Cloud | A2 | a2-highgpu-16g | 16 | A100 | 40 (GB) | 640 (GB) | -{: .cloud-table} - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% include slopecap.html - background="background-gray" - position="top" - slope="down" -%} -{% include section-single.html - background="background-gray" - padding-top="3em" padding-bottom="3em" - content-single=gcp_intro -%} -{% include slopecap.html - background="background-gray" - position="bottom" - slope="up" -%} - -{% capture gc_single %} - -## Google Single Instance -RAPIDS can be deployed on Google Cloud as a single instance: - -**1. Create.** Create a Project in your Google Cloud account. - -**2. Create VM.** See the introduction section for a list of supported GPUs. We recommend using an image that already includes prerequisites such as drivers and docker, such as the **[NVIDIA GPU-Optimized Image for Deep Learning, ML & HPC VM](https://console.cloud.google.com/marketplace/details/nvidia-ngc-public/nvidia_gpu_cloud_image?supportedpurview=project)** image. - -**3. Drivers.** Enter Y (Yes) when asked if you would like to download the latest NVIDIA drivers. 
- -**4. Permissions.** **[Setup Docker user permission.](https://docs.docker.com/engine/install/linux-postinstall/)** - -**5. Install.** **[Install RAPIDS docker image](https://rapids.ai/start.html)**. The docker container can be customized by using the options provided in the **[Getting Started](https://rapids.ai/start.html)** page of RAPIDS. Example of an image that can be used is provided below: -```shell ->>> docker pull rapidsai/rapidsai:cuda11.2-runtime-ubuntu18.04-py3.7 ->>> docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 \ - rapidsai/rapidsai:cuda11.2-runtime-ubuntu18.04-py3.7 -``` -{: .margin-bottom-3em} - -**6. Test RAPIDS.** The above command should start your docker container. To test the container, start a python instance and then import any one of the RAPIDS libraries in it. - -**[Jump to Top ](#deploy)** - - -{% endcapture %} - -{% capture gc_dask %} -## Google Cluster via Dask (Dataproc) - -RAPIDS can be deployed on Google Cloud Dataproc using Dask. We have **[helper scripts and detailed instructions](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/rapids)** to help. - -**1. Create Dataproc cluster with Dask RAPIDS.** Use the gcloud command to create a new cluster with the below initialization action. Because of an Anaconda version conflict, script deployment on older images is slow, we recommend using Dask with Dataproc 2.0+. 
-```shell ->>> export GCS_BUCKET=[BUCKET_NAME] ->>> export CLUSTER_NAME=[CLUSTER_NAME] ->>> export REGION=[REGION] ->>> export DASK_RUNTIME=[DASK_RUNTIME] ->>> gcloud dataproc clusters create $CLUSTER_NAME \ - --region $REGION \ - --image-version preview-ubuntu18 \ - --master-machine-type [MACHINE_TYPE] \ - --master-accelerator type=[GPU_TYPE],count=[NUM_GPU] \ - --worker-machine-type [MACHINE_TYPE] \ - --worker-accelerator type=[GPU_TYPE],count=[NUM_GPU] \ - --optional-components=ANACONDA \ - --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh,gs://goog-dataproc-initialization-actions-${REGION}/dask/dask.sh,gs://goog-dataproc-initialization-actions-${REGION}/rapids/rapids.sh \ - --initialization-action-timeout=60m \ - --metadata gpu-driver-provider=NVIDIA,dask-runtime=${DASK_RUNTIME},rapids-runtime=DASK \ - --enable-component-gateway -``` -[BUCKET_NAME] = name of the bucket to use.
-[CLUSTER_NAME] = name of the cluster.
-[REGION] = name of region where cluster is to be created.
-[DASK_RUNTIME] = Dask runtime could be set to either yarn or standalone. -{: .margin-bottom-3em} - -**2. Run Dask RAPIDS Workload.** Once the cluster has been created, the Dask scheduler listens for workers on port 8786, and its status dashboard is on port 8787 on the Dataproc master node. To connect to the Dask web interface, you will need to create an SSH tunnel as described in the **[Dataproc web interfaces documentation.](https://cloud.google.com/dataproc/docs/concepts/accessing/cluster-web-interfaces)** You can also connect using the Dask Client Python API from a Jupyter notebook, or from a Python script or interpreter session. - -For more, see our **[detailed instructions and helper scripts.](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/rapids)** - - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% capture gc_kub %} -## Google Cluster via Kubernetes - -RAPIDS can be deployed in a Kubernetes cluster on GCP. For more information, see the **[detailed instructions and helm charts.](https://helm.rapids.ai/docs/csp.html)** - -**1. Install.** Install and configure dependencies in your local environment: **[kubectl, helm, gcloud.](https://helm.rapids.ai/docs/csp.html)** - -**2. Configure cloud:** -```shell ->>> gcloud init -``` -{: .margin-bottom-3em} - -**3. Set your default computer zone:** -```shell ->>> gcloud config set compute/zone [REGION] -``` -{: .margin-bottom-3em} - -**4. [Create the cluster](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create):** -```shell ->>> gcloud container clusters create \ - rapids \ - --machine-type n1-standard-4 \ - --accelerator type=[GPU_TYPE],count=[NUM_GPU] \ - --region [REGION] \ - --node-locations [NODE_REGION] \ - --num-nodes [NUM_NODES] \ - --min-nodes 0 \ - --max-nodes [MAX_NODES] \ - --enable-autoscaling -``` -[NODE_REGION] = The node locations to be used in the default regions. Ex. `us-west1-b`
-[NUM_NODES] = number of nodes to be created in each of the cluster's zones.
-[MAX_NODES] = Maximum number of nodes to which the node pool specified by `--node-pool` (or default node pool if unspecified) can scale. - - -Example: -```shell ->>> gcloud container clusters create \ - rapids \ - --machine-type n1-standard-4 \ - --accelerator type=nvidia-tesla-v100,count=2 \ - --region us-west1 \ - --node-locations us-west1-a,us-west1-b \ - --num-nodes 1 \ - --min-nodes 0 \ - --max-nodes 4 \ - --enable-autoscaling -``` -{: .margin-bottom-3em} -**5. Update local kubectl:** -```shell ->>> gcloud container clusters get-credentials rapids -``` -{: .margin-bottom-3em} - -**6. Install kubectl GPU add on:** -```shell ->>> kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml -``` -{: .margin-bottom-3em} - -**7. Install RAPIDS helm repo:** -```shell ->>> helm repo add rapidsai https://helm.rapids.ai ->>> helm repo update -``` -{: .margin-bottom-3em} - -**8. Install the helm chart:** -```shell ->>> helm install --set dask.scheduler.serviceType="LoadBalancer" --set dask.jupyter.serviceType="LoadBalancer" rapidstest rapidsai/rapidsai -``` -{: .margin-bottom-3em} - -**9. Access your cluster:** -```shell ->>> kubectl get svc -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -kubernetes ClusterIP 10.100.0.1 443/TCP 14m -rapidsai-jupyter LoadBalancer 10.100.208.179 1.2.3.4 80:32332/TCP 3m30s -rapidsai-scheduler LoadBalancer 10.100.19.121 5.6.7.8 8786:31779/TCP,80:32011/TCP 3m30s -``` -{: .margin-bottom-3em} - -To run notebooks on jupyter in your browser, visit the external IP of rapidsai-jupyter. - -**10. Uninstall the helm chart:** -```shell ->>> helm uninstall rapidstest -``` -{: .margin-bottom-3em} - - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -{% capture gc_ai %} -## Google Cloud AI Platform - -RAPIDS can be deployed on Google’s Cloud AI platform. This deployment can range from a simple pre-made notebook (instructions below!) 
all the way up to a custom training container and HPO job. For more, see our **[detailed instructions and helper scripts.](https://github.com/rapidsai/cloud-ml-examples/tree/main/gcp)** - -**1. Login.** Log into your GCP console. - -**2. Select.** Select AI Platform, then Notebooks. - -**3. Create and Run.** Select a "New Instance" and select the "RAPIDS 0.18 [EXPERIMENTAL]" environment (comes with Conda installed): -- Select 'Install NVIDIA GPU driver automatically for me' -- Create and launch your notebook service - - -To create an instance with A100s: -- Select 'New Instance' -> 'Customize instance' -- Select 'us-central1' region -- Select 'RAPIDS 0.18 [EXPERIMENTAL]' environment -- Choose 'A2 highgpu' (for 1, 2, 4 or 8 A100s) or 'A2 megagpu' (for 16 A100s) as machine type -- Select 'Install NVIDIA GPU driver automatically for me' -- Create and launch your notebook service - -For more details, or for other ways to deploy on Google Cloud AI Platform, see the **[detailed instructions and helper scripts.](https://github.com/rapidsai/cloud-ml-examples/tree/main/gcp)** - - -**[Jump to Top ](#deploy)** - -{% endcapture %} - -<br>
-{% include section-single.html - background="background-white" - padding-top="6em" padding-bottom="0em" - content-single=gc_ai -%} -
-{% include section-single.html - background="background-white" - padding-top="3em" padding-bottom="0em" - content-single=gc_single -%} -
-{% include section-single.html - background="background-white" - padding-top="3em" padding-bottom="0em" - content-single=gc_dask -%} -
-{% include section-single.html - background="background-white" - padding-top="3em" padding-bottom="10em" - content-single=gc_kub -%} - - -{% capture end_bottom %} -# TRY RAPIDS in the Cloud -{: .section-title-full .text-white} - -{% endcapture %} -{% include slopecap.html - background="background-darkpurple" - position="top" - slope="down" -%} -{% include section-single.html - background="background-darkpurple" - padding-top="0em" padding-bottom="0em" - content-single=end_bottom -%} -{% include cta-footer.html - name="Experience Data Science on GPUs with RAPIDS" - tagline="" - button="GET STARTED" - link="start.html" -%} From 298586f363eeb2a13e6a9e9283d10fd889bb52c7 Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Fri, 13 Jan 2023 15:44:07 +0000 Subject: [PATCH 2/6] Switch existing redirects to use the Netlify config --- _config.yml | 1 + _redirects | 6 ++++++ _redirects/slack-invite.html | 11 ----------- 3 files changed, 7 insertions(+), 11 deletions(-) create mode 100644 _redirects delete mode 100644 _redirects/slack-invite.html diff --git a/_config.yml b/_config.yml index e3e9f890..d22a7517 100644 --- a/_config.yml +++ b/_config.yml @@ -3,6 +3,7 @@ title: RAPIDS postsurl: "https://rapidsai.github.io/site-data/posts.json" slack_invite: "https://join.slack.com/t/rapids-goai/shared_invite/zt-trnsul8g-Sblci8dk6dIoEeGpoFcFOQ" exclude: ['_drafts', 'README.md', '.gitignore', 'CNAME'] +include: ['_redirects'] plugins: - jekyll-redirect-from diff --git a/_redirects b/_redirects new file mode 100644 index 00000000..852e8f2a --- /dev/null +++ b/_redirects @@ -0,0 +1,6 @@ +--- +layout: none +--- + +# Redirects from what the browser requests to what we serve +/slack-invite {{ site.slack_invite }} diff --git a/_redirects/slack-invite.html b/_redirects/slack-invite.html deleted file mode 100644 index 24b4fc60..00000000 --- a/_redirects/slack-invite.html +++ /dev/null @@ -1,11 +0,0 @@ ---- -layout: none ---- - - - - - - - - From 1e18bf0a914c789c5aa41831da846f73135dd4aa Mon 
Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Fri, 13 Jan 2023 15:45:13 +0000 Subject: [PATCH 3/6] Update smsl redirect --- _redirects | 3 ++- smsl.md | 6 ------ 2 files changed, 2 insertions(+), 7 deletions(-) delete mode 100644 smsl.md diff --git a/_redirects b/_redirects index 852e8f2a..96af67a5 100644 --- a/_redirects +++ b/_redirects @@ -3,4 +3,5 @@ layout: none --- # Redirects from what the browser requests to what we serve -/slack-invite {{ site.slack_invite }} +/slack-invite {{ site.slack_invite }} +/smsl https://studiolab.sagemaker.aws/import/github/rapidsai-community/rapids-smsl/blob/main/rapids-smsl.ipynb diff --git a/smsl.md b/smsl.md deleted file mode 100644 index 5854f8ed..00000000 --- a/smsl.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: RAPIDS+Sagemaker -permalink: /smsl -redirect_to: https://studiolab.sagemaker.aws/import/github/rapidsai-community/rapids-smsl/blob/main/rapids-smsl.ipynb -[comment]: <> (This page is a redirect to SageMaker Studio Lab, which copies our quick start notebook. 
It helps us collect analytics on user clicks) ---- \ No newline at end of file From b5d1167c24a736866c799dfc83894fd251b66896 Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Fri, 13 Jan 2023 15:45:50 +0000 Subject: [PATCH 4/6] Update cloud redirect --- _redirects | 3 ++- cloud.md | 6 ------ 2 files changed, 2 insertions(+), 7 deletions(-) delete mode 100644 cloud.md diff --git a/_redirects b/_redirects index 96af67a5..1c8f2975 100644 --- a/_redirects +++ b/_redirects @@ -3,5 +3,6 @@ layout: none --- # Redirects from what the browser requests to what we serve -/slack-invite {{ site.slack_invite }} +/cloud https://docs.rapids.ai/deployment/stable/cloud/ /smsl https://studiolab.sagemaker.aws/import/github/rapidsai-community/rapids-smsl/blob/main/rapids-smsl.ipynb +/slack-invite {{ site.slack_invite }} diff --git a/cloud.md b/cloud.md deleted file mode 100644 index a9eabc53..00000000 --- a/cloud.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: RAPIDS + Cloud -permalink: /cloud -redirect_to: https://docs.rapids.ai/deployment/stable/cloud/ -[comment]: <> (This page is a redirect to the RAPIDS Cloud Deployment docs) ---- From f62a821ac6f2d0d59c0bc5edc0d1986b6c571fcd Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Fri, 13 Jan 2023 15:47:36 +0000 Subject: [PATCH 5/6] Remove old redirect collection --- _config.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/_config.yml b/_config.yml index d22a7517..cbaffaa5 100644 --- a/_config.yml +++ b/_config.yml @@ -12,9 +12,6 @@ collections: output: false posts: output: false - redirects: - output: true - permalink: /:path/ defaults: - scope: From 0cc1e493c411942427b0ad7aee68aa27c1a72af5 Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Fri, 13 Jan 2023 15:50:31 +0000 Subject: [PATCH 6/6] Fix footer link --- _includes/footer.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_includes/footer.html b/_includes/footer.html index 3070a0e7..d772d880 100644 --- a/_includes/footer.html +++ 
b/_includes/footer.html @@ -12,7 +12,7 @@
COMMUNITY
- CLOUD | + CLOUD | DASK | HPC | HPO |