Skip to content

Commit

Permalink
Add distillation finetune pipeline (Azure#2227)
Browse files Browse the repository at this point in the history
* Add distillation finetune pipeline

* Revert rag changes

---------

Co-authored-by: Dawei Li <[email protected]>
  • Loading branch information
chnldw and Dawei Li authored Feb 5, 2024
1 parent 5787426 commit 9f074d1
Show file tree
Hide file tree
Showing 21 changed files with 1,068 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
## Data Distillation

### Name

data_distillation

### Version

0.0.1

### Type

command

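### Description

Distills the training data (and the optional validation data) with the selected distillation technique, `Zero-Shot` or `Chain-of-Density`, and writes the distilled data as JSONL files to the output folders.

## Inputs

- `distillation_technique` (string, required, default `Zero-Shot`): `Zero-Shot` or `Chain-of-Density`.
- `cod_steps` (integer, required, default `4`): cod steps.
- `text_key` (string, required): text key.
- `train_file_path` (uri_file, required): path to the registered training data asset (`jsonl`, `json`, `csv`, `tsv` or `parquet`).
- `validation_file_path` (uri_file, optional): path to the registered validation data asset (same formats).

## Outputs

- `distilled_train_dataset` (uri_folder): folder containing the distilled `train.jsonl` file.
- `distilled_validation_dataset` (uri_folder): folder containing the distilled `validation.jsonl` file.
- `distilled_test_dataset` (uri_folder): folder containing the distilled `test.jsonl` file.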
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Asset manifest for a component; the component definition lives in spec.yaml.
type: component
spec: spec.yaml
categories:
  - 'OpenAI Models'
  - 'Finetune'
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Command component definition for the data distillation step.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

name: data_distillation
display_name: Data Distillation
version: 0.0.1
type: command

# Runs in the latest registered version of the `distillation` environment.
environment: azureml:distillation@latest

# Source directory uploaded with the component; the command below runs
# data_distillation.py from it.
code: ../../src/data_distillation/

# Component inputs. Each one is forwarded to data_distillation.py as a
# command-line flag of the same name (see `command`).
inputs:
  # --- Distillation parameters ---
  distillation_technique:
    type: string
    optional: false
    enum:
      - 'Zero-Shot'
      - 'Chain-of-Density'
    default: 'Zero-Shot'
    description: distillation technique

  # NOTE(review): presumably the number of Chain-of-Density steps; confirm
  # against data_distillation.py and expand the description accordingly.
  cod_steps:
    type: integer
    optional: false
    default: 4
    description: cod steps

  # --- Task parameters ---
  # NOTE(review): presumably the name of the dataset field that holds the
  # text to distill; confirm against data_distillation.py.
  text_key:
    type: string
    optional: false
    description: text key

  # --- Dataset path parameters ---
  train_file_path:
    type: uri_file
    optional: false
    mode: rw_mount
    description: >-
      Path to the registered training data asset.
      The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`.
      Special characters like \ and ' are invalid in the parameter value.

  validation_file_path:
    type: uri_file
    optional: true
    mode: rw_mount
    description: >-
      Path to the registered validation data asset.
      The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`.
      Special characters like \ and ' are invalid in the parameter value.

# Output folders written by the component.
outputs:
  distilled_train_dataset:
    type: uri_folder
    mode: rw_mount
    description: Output folder containing distilled train.jsonl file.
  distilled_validation_dataset:
    type: uri_folder
    mode: rw_mount
    description: Output folder containing distilled validation.jsonl file.
  # NOTE(review): this output is declared but never passed to the script in
  # `command` below; confirm whether data_distillation.py should receive
  # --distilled_test_dataset or whether this output can be dropped.
  distilled_test_dataset:
    type: uri_folder
    mode: rw_mount
    description: Output folder containing distilled test.jsonl file.

# Folded scalar: the lines below are joined with single spaces into one
# command line. The $[[...]] segment is only included when the optional
# validation_file_path input is provided.
command: >-
  python data_distillation.py
  --text_key '${{inputs.text_key}}'
  --distillation_technique '${{inputs.distillation_technique}}'
  --cod_steps '${{inputs.cod_steps}}'
  --train_file_path '${{inputs.train_file_path}}'
  $[[--validation_file_path '${{inputs.validation_file_path}}']]
  --distilled_train_dataset '${{outputs.distilled_train_dataset}}'
  --distilled_validation_dataset '${{outputs.distilled_validation_dataset}}'
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Asset manifest for a component; the component definition lives in spec.yaml.
type: component
spec: spec.yaml
categories:
  - 'OpenAI Models'
  - 'Finetune'
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# Pipeline component: data distillation followed by OpenAI data import and
# finetuning (see `jobs`).
$schema: https://azuremlschemas.azureedge.net/latest/pipelineComponent.schema.json

type: pipeline
name: openai_distillation_finetune_pipeline
version: 0.0.1
display_name: OpenAI Finetune Pipeline with Distillation
description: OpenAI Finetune Pipeline with Distillation

# Pipeline-level inputs. They are bound to the child jobs in `jobs` through
# ${{parent.inputs.<name>}} expressions.
inputs:
  # --- Distillation parameters (forwarded to the data_distillation job) ---
  distillation_technique:
    type: string
    optional: false
    enum:
      - "Zero-Shot"
      - "Chain-of-Density"
    default: "Zero-Shot"
    description: distillation technique

  cod_steps:
    type: integer
    optional: false
    default: 4
    description: cod steps

  # --- Task parameters ---
  text_key:
    type: string
    optional: false
    description: text key

  # --- Dataset path parameters ---
  train_file_path:
    type: uri_file
    optional: false
    description: Path to the registered training data asset. The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`. Special characters like \ and ' are invalid in the parameter value.
    mode: rw_mount

  validation_file_path:
    type: uri_file
    optional: true
    description: Path to the registered validation data asset. The supported data formats are `jsonl`, `json`, `csv`, `tsv` and `parquet`. Special characters like \ and ' are invalid in the parameter value.
    mode: rw_mount

  # --- Teacher model settings ---
  # These four values are exported to the data_distillation job as the
  # environment variables named in their descriptions.
  openai_api_type:
    type: string
    optional: false
    description: OPENAI_API_TYPE

  openai_api_base:
    type: string
    optional: false
    description: OPENAI_API_BASE

  openai_api_version:
    type: string
    optional: false
    description: OPENAI_API_VERSION

  # NOTE(review): the API key is a plain string input, so it is presumably
  # stored and displayed like any other job parameter; consider sourcing it
  # from a secret store or workspace connection instead.
  openai_api_key:
    type: string
    optional: false
    description: OPENAI_API_KEY

  # --- OpenAI Finetune parameters ---
  # `optional` uses the canonical lowercase boolean here, matching the
  # inputs above (these entries were previously spelled `False`).
  model:
    type: string
    optional: false
    default: gpt-35-turbo
    description: GPT model engine
    enum:
      - babbage-002
      - davinci-002
      - gpt-35-turbo
      - gpt-4

  registered_model_name:
    type: string
    optional: false
    description: User-defined registered model name

  n_epochs:
    type: integer
    optional: false
    default: -1
    description: Number of training epochs. If set to -1, number of epochs will be determined dynamically based on the input data.

  learning_rate_multiplier:
    type: number
    optional: false
    default: 1.0
    description: The learning rate multiplier to use for training.

  batch_size:
    type: integer
    optional: false
    default: -1
    description: Global batch size. If set to -1, batch size will be determined dynamically based on the input data.

# Pipeline output; produced by the openai_completions_finetune job.
outputs:
  output_model:
    type: uri_folder
    mode: mount
    description: Dataset with the output model weights (LoRA weights)

# Three chained steps: distill the data, import it for OpenAI finetuning,
# then run the finetune job that produces the pipeline output.
jobs:
  # Step 1: local data_distillation component. The teacher-model settings
  # are passed to it as environment variables.
  data_distillation:
    type: command
    component: ../../data_distillation/spec.yaml
    environment_variables:
      OPENAI_API_TYPE: '${{parent.inputs.openai_api_type}}'
      OPENAI_API_BASE: '${{parent.inputs.openai_api_base}}'
      OPENAI_API_VERSION: '${{parent.inputs.openai_api_version}}'
      OPENAI_API_KEY: '${{parent.inputs.openai_api_key}}'
    inputs:
      text_key: '${{parent.inputs.text_key}}'
      distillation_technique: '${{parent.inputs.distillation_technique}}'
      cod_steps: '${{parent.inputs.cod_steps}}'
      train_file_path: '${{parent.inputs.train_file_path}}'
      validation_file_path: '${{parent.inputs.validation_file_path}}'

  # Step 2: registry component, pinned to version 0.3.5, fed with the
  # distilled train and validation outputs of step 1.
  openai_data_import:
    type: command
    component: azureml://registries/azure-openai-v2/components/openai_data_import/versions/0.3.5
    inputs:
      train_dataset: '${{parent.jobs.data_distillation.outputs.distilled_train_dataset}}'
      validation_dataset: '${{parent.jobs.data_distillation.outputs.distilled_validation_dataset}}'
      model: '${{parent.inputs.model}}'

  # Step 3: registry component, pinned to version 0.4.5; consumes the
  # imported dataset and writes the pipeline's output_model.
  openai_completions_finetune:
    type: command
    component: azureml://registries/azure-openai-v2/components/openai_completions_finetune/versions/0.4.5
    inputs:
      input_dataset: '${{parent.jobs.openai_data_import.outputs.out_dataset}}'
      model: '${{parent.inputs.model}}'
      task_type: chat
      registered_model_name: '${{parent.inputs.registered_model_name}}'
      n_epochs: '${{parent.inputs.n_epochs}}'
      learning_rate_multiplier: '${{parent.inputs.learning_rate_multiplier}}'
      batch_size: '${{parent.inputs.batch_size}}'
    outputs:
      output_model: '${{parent.outputs.output_model}}'
6 changes: 6 additions & 0 deletions assets/training/distillation/environments/asset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Asset manifest for the `distillation` environment. The environment
# definition is in spec.yaml and the image build settings in environment.yaml.
type: environment
name: distillation
version: auto
spec: spec.yaml
extra_config: environment.yaml
categories:
  - 'PyTorch'
  - 'Training'
8 changes: 8 additions & 0 deletions assets/training/distillation/environments/context/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Image for the distillation environment: the AzureML OpenMPI/Ubuntu 20.04
# base image plus the Python packages pinned in requirements.txt.
# NOTE(review): the base image reference carries no tag, so the build uses
# whatever the registry serves by default; consider pinning a tag or digest.
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04

# Refresh and upgrade OS packages in a single layer, then remove the apt
# cache and package lists so they are not baked into the image.
RUN apt-get update \
    && apt-get -y upgrade \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip without keeping its download cache in the layer.
RUN pip install --upgrade --no-cache-dir pip

COPY requirements.txt .

RUN pip install -r requirements.txt --no-cache-dir
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Python dependencies installed into the distillation image (exact pins).
azureml-mlflow==1.54.0
datasets==2.14.6
openai==0.27.9
12 changes: 12 additions & 0 deletions assets/training/distillation/environments/environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Image build and publish settings for the distillation environment.
image:
  name: azureml/curated/distillation
  os: linux

  # Docker build context directory and the Dockerfile inside it.
  context:
    dir: context
    dockerfile: Dockerfile
    # Files in the context that are treated as templates.
    template_files:
      - Dockerfile
      - requirements.txt

  publish:
    location: mcr
    visibility: public
16 changes: 16 additions & 0 deletions assets/training/distillation/environments/spec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Environment definition template. The {{...}} placeholders are filled in
# from the asset and image configuration.
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json

name: '{{asset.name}}'
version: '{{asset.version}}'
description: Environment used by OpenAI Distillation components

os_type: linux

build:
  path: '{{image.context.path}}'
  dockerfile_path: '{{image.dockerfile.path}}'

tags:
  Preview: ''
9 changes: 9 additions & 0 deletions assets/training/distillation/environments/spec_tmp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# NOTE(review): this looks like a rendered copy of spec.yaml with the
# placeholders filled in (hard-coded name, version and build path); confirm
# that it is meant to be committed rather than a leftover temporary file.
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json

name: distillation
version: 0.0.1.distillation
description: Environment used by OpenAI Distillation components

os_type: linux

build:
  path: context

tags:
  Preview: ''
Loading

0 comments on commit 9f074d1

Please sign in to comment.