diff --git a/README.md b/README.md
index 4217831..18ffa9e 100644
--- a/README.md
+++ b/README.md
@@ -5,12 +5,13 @@ Generate image from anything with [ImageBind](https://github.com/facebookresearc
 - No training is need.
 - Integration with 🤗 [Diffusers](https://github.com/huggingface/diffusers).
 - `imagebind` is directly copy from [official repo](https://github.com/facebookresearch/ImageBind) with modification.
+- Online gradio demo with [Huggingface Space](https://huggingface.co/spaces/aaronb/Anything2Image).
 
 ## Audio to Image
 
-| `assets/wav/bird_audio.wav` | `assets/wav/dog_audio.wav` | `assets/wav/cattle.wav`
-| --- | --- | --- |
-| ![](assets/generated/bird_audio.png) | ![](assets/generated/dog_audio.png) |![](assets/generated/cattle.png) |
+| `assets/wav/bird_audio.wav` | `assets/wav/dog_audio.wav` | `assets/wav/cattle.wav` | `assets/wav/cat.wav` |
+| --- | --- | --- | --- |
+| ![](assets/generated/bird_audio.png) | ![](assets/generated/dog_audio.png) |![](assets/generated/cattle.png) |![](assets/generated/cat.png) |
 
 ```python
 import imagebind
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..6f176f6
--- /dev/null
+++ b/app.py
@@ -0,0 +1,60 @@
+"""Gradio demo that generates an image from a text prompt and an audio clip."""
+import os
+import tempfile
+
+import gradio as gr
+import imagebind
+import torch
+from diffusers import StableUnCLIPImg2ImgPipeline
+import soundfile as sf
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# Many CPU kernels lack float16 support, so only use half precision on GPU.
+dtype = torch.float16 if device != "cpu" else torch.float32
+pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1-unclip",
+    torch_dtype=dtype,
+    # The diffusers keyword for selecting the fp16 weight files is `variant`.
+    variant="fp16",
+)
+pipe = pipe.to(device)
+
+model = imagebind.imagebind_huge(pretrained=True)
+model.eval()
+model.to(device)
+
+
+@torch.no_grad()
+def anything2img(prompt, audio):
+    """Generate one image conditioned on a text prompt and an audio clip.
+
+    Args:
+        prompt: Text prompt forwarded to the unCLIP pipeline.
+        audio: ``(sample_rate, waveform)`` pair as delivered by the Gradio
+            "audio" input, or ``None`` when no clip was provided.
+
+    Returns:
+        The first image produced by the pipeline.
+
+    Raises:
+        gr.Error: If no audio clip was provided.
+    """
+    if audio is None:
+        raise gr.Error("Please provide an audio clip.")
+    sr, waveform = audio
+    # Write to a per-request temporary directory: a fixed 'tmp.wav' would be
+    # overwritten when two requests run at the same time.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        audio_path = os.path.join(tmp_dir, "audio.wav")
+        sf.write(audio_path, waveform, sr)
+        embeddings = model({
+            imagebind.ModalityType.AUDIO: imagebind.load_and_transform_audio_data([audio_path], device),
+        })
+    embeddings = embeddings[imagebind.ModalityType.AUDIO]
+    # Match the pipeline dtype (float16 on GPU, float32 on CPU).
+    images = pipe(prompt=prompt, image_embeds=embeddings.to(dtype)).images
+    return images[0]
+
+
+demo = gr.Interface(fn=anything2img, inputs=["text", "audio"], outputs="image")
+demo.launch(server_name='0.0.0.0', server_port=10047, share=True)
diff --git a/assets/generated/cat.png b/assets/generated/cat.png
new file mode 100644
index 0000000..630778f
Binary files /dev/null and b/assets/generated/cat.png differ
diff --git a/assets/wav/cat.wav b/assets/wav/cat.wav
new file mode 100644
index 0000000..e93d3c8
Binary files /dev/null and b/assets/wav/cat.wav differ
diff --git a/requirements.txt b/requirements.txt
index 94587a1..d2c1895 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,6 @@ ftfy
 regex
 einops
 fvcore
-decord==0.6.0
\ No newline at end of file
+decord==0.6.0
+soundfile
+transformers
\ No newline at end of file