update

Zeqiang-Lai · May 16, 2023 · f5d2c11 · f5d2c11
1 parent 57b169f
commit f5d2c11
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -5,12 +5,13 @@ Generate image from anything with [ImageBind](https://github.com/facebookresearc
 - No training is needed.
 - Integration with 🤗  [Diffusers](https://github.com/huggingface/diffusers).
 - `imagebind` is copied directly from the [official repo](https://github.com/facebookresearch/ImageBind) with modifications.
+- Online Gradio demo on [Hugging Face Spaces](https://huggingface.co/spaces/aaronb/Anything2Image).
 
 ## Audio to Image
 
-| `assets/wav/bird_audio.wav` | `assets/wav/dog_audio.wav` |  `assets/wav/cattle.wav`
-| --- | --- | --- | 
-| ![](assets/generated/bird_audio.png) | ![](assets/generated/dog_audio.png) |![](assets/generated/cattle.png) |
+| `assets/wav/bird_audio.wav` | `assets/wav/dog_audio.wav` |  `assets/wav/cattle.wav` | `assets/wav/cat.wav` | 
+| --- | --- | --- | --- | 
+| ![](assets/generated/bird_audio.png) | ![](assets/generated/dog_audio.png) |![](assets/generated/cattle.png) |![](assets/generated/cat.png) |
 
 ```python
 import imagebind

diff --git a/app.py b/app.py
@@ -0,0 +1,33 @@
+import gradio as gr
+import imagebind
+import torch
+from diffusers import StableUnCLIPImg2ImgPipeline
+import soundfile as sf
+
+# Run on the first CUDA device when one is available, otherwise on the CPU.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# Stable unCLIP image-variation pipeline, loaded in half precision.
+# The keyword is `variant` (not `variation`): it selects the fp16 weight files
+# of the checkpoint instead of loading the full-precision ones and casting.
+pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1-unclip",
+    torch_dtype=torch.float16,
+    variant="fp16",
+)
+pipe = pipe.to(device)
+
+# ImageBind encoder used to embed the audio input; inference only.
+model = imagebind.imagebind_huge(pretrained=True)
+model.eval()
+model.to(device)
+
+@torch.no_grad()
+def anything2img(prompt, audio):
+    """Generate an image from a text prompt and an audio clip.
+
+    Args:
+        prompt: Text prompt forwarded to the diffusion pipeline.
+        audio: ``(sample_rate, waveform)`` pair as delivered by the Gradio
+            audio input.
+
+    Returns:
+        The first image produced by the pipeline.
+
+    Raises:
+        gr.Error: If no audio clip was provided.
+    """
+    import os
+    import tempfile
+
+    if audio is None:
+        # Surface a readable message in the UI instead of an unpacking TypeError.
+        raise gr.Error("Please provide an audio clip.")
+    sr, waveform = audio
+
+    # The audio loader takes file paths, so the clip is written to disk first.
+    # A per-call temporary directory keeps concurrent requests from overwriting
+    # each other's file and removes it once the embedding has been computed.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        audio_path = os.path.join(tmp_dir, "audio.wav")
+        sf.write(audio_path, waveform, sr)
+        embeddings = model.forward({
+            imagebind.ModalityType.AUDIO: imagebind.load_and_transform_audio_data([audio_path], device),
+        })
+    embeddings = embeddings[imagebind.ModalityType.AUDIO]
+
+    # The pipeline is loaded in float16, so the embedding is cast to match.
+    images = pipe(prompt=prompt, image_embeds=embeddings.half()).images
+    return images[0]
+
+
+# Gradio UI: a text prompt and an audio clip in, the generated image out.
+demo = gr.Interface(
+    fn=anything2img,
+    inputs=["text", "audio"],
+    outputs="image",
+)
+# Listen on all interfaces and request a public share link.
+# (Port 10051 was the previously used alternative.)
+demo.launch(
+    server_name='0.0.0.0',
+    server_port=10047,
+    share=True,
+)
diff --git a/assets/generated/cat.png b/assets/generated/cat.png
diff --git a/assets/wav/cat.wav b/assets/wav/cat.wav
diff --git a/requirements.txt b/requirements.txt
@@ -8,4 +8,6 @@ ftfy
 regex
 einops
 fvcore
-decord==0.6.0
+decord==0.6.0
+soundfile
+transformers