diff --git a/example/LLM/musicgen/README.md b/example/LLM/musicgen/README.md
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/example/LLM/musicgen/runtime.yaml b/example/LLM/musicgen/runtime.yaml
deleted file mode 100644
index cc31fd72dd..0000000000
--- a/example/LLM/musicgen/runtime.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# from https://github.com/facebookresearch/audiocraft/tree/main#installation
-name: musicgen
-mode: venv
-environment:
-  arch: noarch
-  os: ubuntu:20.04
-  cuda: 11.7
-  python: 3.9
-configs:
-  pip:
-    index_url: https://mirrors.aliyun.com/pypi/simple
-dependencies:
-  - pip:
-      # use the fixed commit of audiocraft
-      - audiocraft @ git+https://github.com/facebookresearch/audiocraft.git@c9179f87d9f560c96cf6ec4de65a8fcf374c91c5
-      - torch >= 2.0
diff --git a/example/text-to-music/README.md b/example/text-to-music/README.md
new file mode 100644
index 0000000000..64bfe16ba7
--- /dev/null
+++ b/example/text-to-music/README.md
@@ -0,0 +1,21 @@
+Text To Music
+======
+
+Examples for text-to-music offline batch evaluation and an online demo.
+
+Models
+------
+
+- [MusicGen](https://ai.honu.io/papers/musicgen/): a single Language Model (LM) that redefines the boundaries of conditional music generation, creating high-quality music from text descriptions or melodies.
+
+Datasets
+------
+
+Text description samples from the MusicGen demo website.
+
+What do we learn from these examples?
+------
+
+- Build a Starwhale Model for text-to-music models.
+- Log audio artifacts in the evaluation phase.
+- Write an evaluation results summary with Starwhale Report.
diff --git a/example/text-to-music/datasets/README.md b/example/text-to-music/datasets/README.md
new file mode 100644
index 0000000000..c1193d305b
--- /dev/null
+++ b/example/text-to-music/datasets/README.md
@@ -0,0 +1,18 @@
+Datasets for text-to-music
+======
+
+MusicGen mini
+------
+
+Run the following command to build the dataset:
+
+```bash
+python musicgen-mini.py
+```
+
+Run the following swcli command to show the built dataset:
+
+```bash
+swcli dataset info musicgen-mini
+```
+
diff --git a/example/LLM/musicgen/dataset.py b/example/text-to-music/datasets/musicgen-mini.py
similarity index 91%
rename from example/LLM/musicgen/dataset.py
rename to example/text-to-music/datasets/musicgen-mini.py
index ee97a5bebc..5fe59b3777 100644
--- a/example/LLM/musicgen/dataset.py
+++ b/example/text-to-music/datasets/musicgen-mini.py
@@ -1,7 +1,7 @@
 from starwhale import dataset
 from starwhale.utils.debug import init_logger
 
-init_logger(4)
+init_logger(3)
 
 # data from https://ai.honu.io/papers/musicgen/
 desc_samples = [
@@ -29,15 +29,15 @@
     "An energetic hip-hop music piece, with synth sounds and strong bass. There is a rhythmic hi-hat patten in the drums."
     "90s rock song with electric guitar and heavy drums",
     "An 80s driving pop song with heavy drums and synth pads in the background",
-    " An energetic hip-hop music piece, with synth sounds and strong bass. There is a rhythmic hi-hat patten in the drums.",
+    "An energetic hip-hop music piece, with synth sounds and strong bass. There is a rhythmic hi-hat patten in the drums.",
 ]
 
 
 def build_dataset() -> None:
     print("Building musicgen dataset...")
     with dataset("musicgen-mini") as ds:
-        for desc in desc_samples:
-            ds.append({"desc": desc})
+        for idx, desc in enumerate(desc_samples):
+            ds[idx] = {"desc": desc}
         ds.commit()
diff --git a/example/LLM/musicgen/.gitignore b/example/text-to-music/musicgen/.gitignore
similarity index 100%
rename from example/LLM/musicgen/.gitignore
rename to example/text-to-music/musicgen/.gitignore
diff --git a/example/text-to-music/musicgen/README.md b/example/text-to-music/musicgen/README.md
new file mode 100644
index 0000000000..f8376f9087
--- /dev/null
+++ b/example/text-to-music/musicgen/README.md
@@ -0,0 +1,47 @@
+MusicGen Example Guides
+======
+
+[MusicGen](https://ai.honu.io/papers/musicgen/) is a single Language Model (LM) that operates over several streams of compressed discrete music representation.
+
+- 🏔️ Homepage: https://ai.honu.io/papers/musicgen/
+- 🌋 Github: https://github.com/facebookresearch/audiocraft
+- 🏕️ Size: small (300M), melody (1.5B), medium (1.5B), large (3.3B)
+
+Log in to Starwhale Cloud
+------
+
+```bash
+swcli instance login --token "${TOKEN}" --alias cloud-cn https://cloud.starwhale.cn/
+```
+
+Build Starwhale Runtime
+------
+
+```bash
+swcli -vvv runtime build
+swcli runtime cp musicgen https://cloud.starwhale.cn/project/starwhale:llm_text_to_audio
+```
+
+Build Starwhale Model
+------
+
+Model name choices: `melody`, `medium`, `small` and `large`.
+
+```bash
+python3 build.py ${model_name}
+
+swcli runtime activate musicgen
+python3 build.py small
+swcli model cp musicgen-small https://cloud.starwhale.cn/project/starwhale:llm_text_to_audio
+```
+
+Run Starwhale Model
+------
+
+```bash
+# use model src dir
+swcli model run --workdir . --runtime musicgen --dataset musicgen-mini -m evaluation
+
+# use model package
+swcli model run --uri musicgen-small --runtime musicgen --dataset musicgen-mini
+```
diff --git a/example/LLM/musicgen/build.py b/example/text-to-music/musicgen/build.py
similarity index 79%
rename from example/LLM/musicgen/build.py
rename to example/text-to-music/musicgen/build.py
index ee3c69acc0..1624d7502d 100644
--- a/example/LLM/musicgen/build.py
+++ b/example/text-to-music/musicgen/build.py
@@ -5,7 +5,7 @@
 from huggingface_hub import snapshot_download
 
 from starwhale import model as starwhale_model
-from starwhale.utils import debug
+from starwhale import init_logger
 
 try:
     from .utils import (
@@ -22,7 +22,13 @@
     )
     from evaluation import music_predict
 
-debug.init_logger(4)
+# init_logger configures the logging level of starwhale:
+# -> 0: ERROR (default)
+# -> 1: WARNING
+# -> 2: INFO
+# -> 3: DEBUG
+# -> 4: TRACE
+init_logger(3)
 
 
 def build_starwhale_model(model_name: str) -> None:
@@ -39,6 +45,8 @@ def build_starwhale_model(model_name: str) -> None:
     )
 
     prepare_build_model_package(model_name)
+
+    # Use the Starwhale SDK to build the Starwhale model package; `swcli model build` is also available.
     starwhale_model.build(name=f"musicgen-{model_name}", modules=[music_predict])
diff --git a/example/LLM/musicgen/evaluation.py b/example/text-to-music/musicgen/evaluation.py
similarity index 54%
rename from example/LLM/musicgen/evaluation.py
rename to example/text-to-music/musicgen/evaluation.py
index dac503bc14..641cf2f958 100644
--- a/example/LLM/musicgen/evaluation.py
+++ b/example/text-to-music/musicgen/evaluation.py
@@ -4,9 +4,8 @@
 import typing as t
 from tempfile import NamedTemporaryFile
 
-from audiocraft.models import MusicGen
+from transformers import AutoProcessor, MusicgenForConditionalGeneration
 from audiocraft.data.audio import audio_write
-from audiocraft.models.loaders import load_lm_model, load_compression_model
 
 from starwhale import Audio, MIMEType, evaluation
 
@@ -15,36 +14,31 @@
 except ImportError:
     from utils import get_model_name, PRETRAINED_MODELS_DIR
 
-duration = int(os.environ.get("DURATION", 10))
 top_k = int(os.environ.get("TOP_K", 250))
 top_p = int(os.environ.get("TOP_P", 0))
 temperature = float(os.environ.get("TEMPERATURE", 1.0))
-cfg_coef = float(os.environ.get("CFG_COEF", 3.0))
 max_input_length = int(os.environ.get("MAX_INPUT_LENGTH", 512))
+max_new_tokens = int(os.environ.get("MAX_NEW_TOKENS", 500))  # 500 tokens = 10 seconds
+guidance_scale = float(os.environ.get("GUIDANCE_SCALE", 3.0))
 
 _g_model = None
+_g_processor = None
 _g_model_name = None
 
 
-def _load_model() -> t.Tuple[MusicGen, str]:
-    global _g_model, _g_model_name
+def _load_model() -> t.Tuple[t.Any, t.Any, str]:
+    global _g_model, _g_model_name, _g_processor
 
-    if _g_model is None or _g_model_name is None:
-        model_name = get_model_name()
-        device = "cuda"
-        c_model = load_compression_model(
-            PRETRAINED_MODELS_DIR / model_name / "compression_state_dict.bin",
-            device=device,
+    if _g_model is None or _g_model_name is None or _g_processor is None:
+        _g_model_name = get_model_name()
+        _g_model = MusicgenForConditionalGeneration.from_pretrained(
+            PRETRAINED_MODELS_DIR / _g_model_name
         )
-        l_model = load_lm_model(
-            PRETRAINED_MODELS_DIR / model_name / "state_dict.bin", device=device
+        _g_model.to("cuda")
+        _g_processor = AutoProcessor.from_pretrained(
+            PRETRAINED_MODELS_DIR / _g_model_name
         )
-        if model_name == "melody":
-            l_model.condition_provider.conditioners["self_wav"].match_len_on_eval = True
-        _g_model = MusicGen(model_name, c_model, l_model)
-        _g_model_name = model_name
-
-    return _g_model, _g_model_name
+    return _g_model, _g_processor, _g_model_name
 
 
 @evaluation.predict(
@@ -55,30 +49,36 @@ def _load_model() -> t.Tuple[MusicGen, str]:
 )
 def music_predict(data: dict) -> Audio:
     # TODO: support batch prediction
-    model, _ = _load_model()
-    model.set_generation_params(
-        duration=duration,
+    model, processor, _ = _load_model()
+    inputs = processor(
+        text=[data["desc"][:max_input_length]],
+        padding=True,
+        return_tensors="pt",
+    )
+    # TODO: support melody
+    outputs = model.generate(
+        **inputs.to("cuda"),
+        do_sample=True,
+        guidance_scale=guidance_scale,
+        max_new_tokens=max_new_tokens,
         top_k=top_k,
         top_p=top_p,
         temperature=temperature,
-        cfg_coef=cfg_coef,
     )
-    # TODO: support melody
-    outputs = model.generate([data.desc[:max_input_length]])
     output = outputs[0].detach().cpu().float()
 
     with NamedTemporaryFile("wb", suffix=".wav", delete=True) as file:
         fpath = audio_write(
             stem_name=file.name,
             wav=output,
-            sample_rate=model.sample_rate,
+            sample_rate=model.config.audio_encoder.sampling_rate,
             strategy="loudness",
             loudness_headroom_db=16,
             loudness_compressor=True,
             add_suffix=False,
         )
         audio = Audio(
-            fp=fpath,
+            fp=fpath.read_bytes(),
             mime_type=MIMEType.WAV,
         )
         return audio
diff --git a/example/text-to-music/musicgen/requirements.txt b/example/text-to-music/musicgen/requirements.txt
new file mode 100644
index 0000000000..fc9b239762
--- /dev/null
+++ b/example/text-to-music/musicgen/requirements.txt
@@ -0,0 +1,4 @@
+torch==2.0.1
+transformers==4.31.0
+huggingface-hub  # download hf models
+audiocraft==1.0.0
diff --git a/example/text-to-music/musicgen/runtime.yaml b/example/text-to-music/musicgen/runtime.yaml
new file mode 100644
index 0000000000..f1275a0476
--- /dev/null
+++ b/example/text-to-music/musicgen/runtime.yaml
@@ -0,0 +1,10 @@
+name: musicgen
+mode: venv
+environment:
+  arch: noarch
+  os: ubuntu:20.04
+  cuda: 11.7
+  python: 3.9
+  starwhale_version: 0.6.1  # Starwhale >= 0.6.0 supports logging artifacts in the evaluation phase.
+dependencies:
+  - requirements.txt
diff --git a/example/LLM/musicgen/utils.py b/example/text-to-music/musicgen/utils.py
similarity index 100%
rename from example/LLM/musicgen/utils.py
rename to example/text-to-music/musicgen/utils.py
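
The ported `evaluation.py` above generates audio through the Hugging Face `transformers` MusicGen API instead of audiocraft's native loaders. For reference, a minimal standalone sketch of that generation path is shown below. It assumes the public `facebook/musicgen-small` checkpoint and an illustrative prompt and output file name; the Starwhale example instead loads weights from its local `PRETRAINED_MODELS_DIR` and logs the result as an `Audio` artifact.

```python
# Minimal sketch of the transformers-based MusicGen generation path.
# Assumptions: the public "facebook/musicgen-small" checkpoint and the
# output file name "musicgen_sample.wav" are illustrative only.
import scipy.io.wavfile
import torch
from transformers import AutoProcessor, MusicgenForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Tokenize the text description, mirroring the processor() call in evaluation.py.
inputs = processor(
    text=["90s rock song with electric guitar and heavy drums"],
    padding=True,
    return_tensors="pt",
)

# Sample audio tokens with classifier-free guidance; 500 new tokens is roughly
# 10 seconds of audio at MusicGen's 50 Hz frame rate.
audio_values = model.generate(
    **inputs.to(device),
    do_sample=True,
    guidance_scale=3.0,
    max_new_tokens=500,
)

# The decoder output has shape (batch, channels, samples); write the first clip to disk.
sampling_rate = model.config.audio_encoder.sampling_rate
scipy.io.wavfile.write(
    "musicgen_sample.wav",
    rate=sampling_rate,
    data=audio_values[0, 0].cpu().numpy(),
)
```

The `max_new_tokens=500` default here matches the `MAX_NEW_TOKENS` environment variable used in `evaluation.py`; the evaluation code additionally writes the waveform through audiocraft's `audio_write` to apply loudness normalization before logging it.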