Add distillation finetune pipeline (Azure#2227)

* Add distillation finetune pipeline * Revert rag changes --------- Co-authored-by: Dawei Li <[email protected]>
anushree1808 · Feb 5, 2024 · 9f074d1 · 9f074d1
1 parent 5787426
commit 9f074d1
Show file tree

Hide file tree

Showing 21 changed files with 1,068 additions and 0 deletions.
diff --git a/assets/training/distillation/components/data_distillation/README.md b/assets/training/distillation/components/data_distillation/README.md
@@ -0,0 +1,19 @@
+## Data Distillation
+
+### Name 
+
+data_distillation
+
+### Version 
+
+0.0.1
+
+### Type 
+
+command
+
+### Description 
+
+## Inputs 
+
+## Outputs 
diff --git a/assets/training/distillation/components/data_distillation/asset.yaml b/assets/training/distillation/components/data_distillation/asset.yaml
@@ -0,0 +1,3 @@
+type: component
+spec: spec.yaml
+categories: ["OpenAI Models", "Finetune"]
diff --git a/assets/training/distillation/components/data_distillation/spec.yaml b/assets/training/distillation/components/data_distillation/spec.yaml
@@ -0,0 +1,64 @@
+$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
+type: command
+
+name: data_distillation
+display_name: Data Distillation
+version: 0.0.1
+
+environment: azureml:distillation@latest
+
+code: ../../src/data_distillation/
+
+inputs:
+
+  # Distillation parameters
+  distillation_technique:
+    type: string
+    optional: false
+    enum:
+      - "Zero-Shot"
+      - "Chain-of-Density"
+    default: "Zero-Shot"
+    description: distillation technique
+
+  cod_steps:
+    type: integer
+    optional: false
+    default: 4
+    description: cod steps
+
+  # Task parameters
+  text_key:
+    type: string
+    optional: false
+    description: text key
+
+  # Dataset path parameters
+  train_file_path:
+    type: uri_file
+    optional: false
+    description: Path to the registered training data asset. The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`. Special characters like \ and ' are invalid in the parameter value.
+    mode: rw_mount
+
+  validation_file_path:
+    type: uri_file
+    optional: true
+    description: Path to the registered validation data asset. The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`. Special characters like \ and ' are invalid in the parameter value.
+    mode: rw_mount
+
+outputs:
+  distilled_train_dataset:
+    type: uri_folder
+    description: Output folder containing distilled train.jsonl file.
+    mode: rw_mount
+  distilled_validation_dataset:
+    type: uri_folder
+    description: Output folder containing distilled validation.jsonl file.
+    mode: rw_mount
+  distilled_test_dataset:
+    type: uri_folder
+    description: Output folder containing distilled test.jsonl file.
+    mode: rw_mount
+
+command: >-
+  python data_distillation.py --text_key '${{inputs.text_key}}' --distillation_technique '${{inputs.distillation_technique}}' --cod_steps '${{inputs.cod_steps}}' --train_file_path '${{inputs.train_file_path}}' $[[--validation_file_path '${{inputs.validation_file_path}}']] --distilled_train_dataset '${{outputs.distilled_train_dataset}}' --distilled_validation_dataset '${{outputs.distilled_validation_dataset}}'
diff --git a/...tillation/components/pipeline_components/openai_distillation_finetune_pipeline/asset.yaml b/...tillation/components/pipeline_components/openai_distillation_finetune_pipeline/asset.yaml
@@ -0,0 +1,3 @@
+type: component
+spec: spec.yaml
+categories: ["OpenAI Models", "Finetune"]
diff --git a/...stillation/components/pipeline_components/openai_distillation_finetune_pipeline/spec.yaml b/...stillation/components/pipeline_components/openai_distillation_finetune_pipeline/spec.yaml
@@ -0,0 +1,136 @@
+$schema: https://azuremlschemas.azureedge.net/latest/pipelineComponent.schema.json
+name: openai_distillation_finetune_pipeline
+version: 0.0.1
+type: pipeline
+display_name: OpenAI Finetune Pipeline with Distillation
+description: OpenAI Finetune Pipeline with Distillation
+
+inputs:
+  # Distillation parameters
+  distillation_technique:
+    type: string
+    optional: false
+    enum:
+      - "Zero-Shot"
+      - "Chain-of-Density"
+    default: "Zero-Shot"
+    description: distillation technique
+
+  cod_steps:
+    type: integer
+    optional: false
+    default: 4
+    description: cod steps
+
+  # Task parameters
+  text_key:
+    type: string
+    optional: false
+    description: text key
+
+  # Dataset path parameters
+  train_file_path:
+    type: uri_file
+    optional: false
+    description: Path to the registered training data asset. The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`. Special characters like \ and ' are invalid in the parameter value.
+    mode: rw_mount
+
+  validation_file_path:
+    type: uri_file
+    optional: true
+    description: Path to the registered validation data asset. The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`. Special characters like \ and ' are invalid in the parameter value.
+    mode: rw_mount
+
+  # Teacher model settings
+  openai_api_type:
+    type: string
+    optional: false
+    description: OPENAI_API_TYPE
+
+  openai_api_base:
+    type: string
+    optional: false
+    description: OPENAI_API_BASE
+
+  openai_api_version:
+    type: string
+    optional: false
+    description: OPENAI_API_VERSION
+
+  openai_api_key:
+    type: string
+    optional: false
+    description: OPENAI_API_KEY
+
+  # OpenAI Finetune parameters
+  model:
+    type: string
+    optional: False
+    default: gpt-35-turbo
+    description: GPT model engine
+    enum:
+      - babbage-002
+      - davinci-002
+      - gpt-35-turbo
+      - gpt-4
+  registered_model_name:
+    type: string
+    optional: False
+    description: User-defined registered model name
+  n_epochs:
+    type: integer
+    optional: False
+    default: -1
+    description: Number of training epochs. If set to -1, number of epochs will be determined dynamically based on the input data.
+  learning_rate_multiplier:
+    type: number
+    optional: False
+    default: 1.0
+    description: The learning rate multiplier to use for training.
+  batch_size:
+    type: integer
+    optional: False
+    default: -1
+    description: Global batch size. If set to -1, batch size will be determined dynamically based on the input data.
+
+outputs:
+  output_model:
+    type: uri_folder
+    description: Dataset with the output model weights (LoRA weights)
+    mode: mount
+
+jobs:
+  data_distillation:
+    type: command
+    component: ../../data_distillation/spec.yaml
+    environment_variables:
+      OPENAI_API_TYPE: '${{parent.inputs.openai_api_type}}'
+      OPENAI_API_BASE: '${{parent.inputs.openai_api_base}}'
+      OPENAI_API_VERSION: '${{parent.inputs.openai_api_version}}'
+      OPENAI_API_KEY: '${{parent.inputs.openai_api_key}}'
+    inputs:
+      text_key: '${{parent.inputs.text_key}}'
+      distillation_technique: '${{parent.inputs.distillation_technique}}'
+      cod_steps: '${{parent.inputs.cod_steps}}'
+      train_file_path: '${{parent.inputs.train_file_path}}'
+      validation_file_path: '${{parent.inputs.validation_file_path}}'
+  openai_data_import:
+    type: command
+    component: azureml://registries/azure-openai-v2/components/openai_data_import/versions/0.3.5
+    inputs:
+      train_dataset: '${{parent.jobs.data_distillation.outputs.distilled_train_dataset}}'
+      validation_dataset: '${{parent.jobs.data_distillation.outputs.distilled_validation_dataset}}'
+      model: ${{parent.inputs.model}}
+  openai_completions_finetune:
+    type: command
+    component: azureml://registries/azure-openai-v2/components/openai_completions_finetune/versions/0.4.5
+    inputs:
+      input_dataset: ${{parent.jobs.openai_data_import.outputs.out_dataset}}
+      model: ${{parent.inputs.model}}
+      task_type: chat
+      registered_model_name: ${{parent.inputs.registered_model_name}}
+      n_epochs: ${{parent.inputs.n_epochs}}
+      learning_rate_multiplier: ${{parent.inputs.learning_rate_multiplier}}
+      batch_size: ${{parent.inputs.batch_size}}
+    outputs:
+      output_model: ${{parent.outputs.output_model}}
diff --git a/assets/training/distillation/environments/asset.yaml b/assets/training/distillation/environments/asset.yaml
@@ -0,0 +1,6 @@
+name: distillation
+version: auto
+type: environment
+spec: spec.yaml
+extra_config: environment.yaml
+categories: ["PyTorch", "Training"]
diff --git a/assets/training/distillation/environments/context/Dockerfile b/assets/training/distillation/environments/context/Dockerfile
@@ -0,0 +1,8 @@
+FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
+
+RUN apt-get update && apt-get -y upgrade
+RUN pip install --upgrade pip
+
+COPY requirements.txt .
+
+RUN pip install -r requirements.txt --no-cache-dir
diff --git a/assets/training/distillation/environments/context/requirements.txt b/assets/training/distillation/environments/context/requirements.txt
@@ -0,0 +1,3 @@
+azureml-mlflow==1.54.0
+openai==0.27.9
+datasets==2.14.6
diff --git a/assets/training/distillation/environments/environment.yaml b/assets/training/distillation/environments/environment.yaml
@@ -0,0 +1,12 @@
+image:
+  name: azureml/curated/distillation
+  os: linux
+  context:
+    dir: context
+    dockerfile: Dockerfile
+    template_files:
+    - Dockerfile
+    - requirements.txt
+  publish:
+    location: mcr
+    visibility: public
diff --git a/assets/training/distillation/environments/spec.yaml b/assets/training/distillation/environments/spec.yaml
@@ -0,0 +1,16 @@
+$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
+
+description: >-
+  Environment used by OpenAI Distillation components
+
+name: "{{asset.name}}"
+version: "{{asset.version}}"
+
+build:
+  path: "{{image.context.path}}"
+  dockerfile_path: "{{image.dockerfile.path}}"
+
+os_type: linux
+
+tags:
+  Preview: ""
diff --git a/assets/training/distillation/environments/spec_tmp.yaml b/assets/training/distillation/environments/spec_tmp.yaml
@@ -0,0 +1,9 @@
+$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
+build:
+  path: context
+description: Environment used by OpenAI Distillation components
+name: distillation
+os_type: linux
+tags:
+  Preview: ''
+version: 0.0.1.distillation