huggingface · julien-c · Feb 4, 2025 · Jan 29, 2025 · Jan 29, 2025 · Feb 3, 2025
@@ -21,7 +21,7 @@ export const REPLICATE_SUPPORTED_MODEL_IDS: ProviderMapping<ReplicateId> = {
 			"stability-ai/sdxl:7762fd07cf82c948538e41f63f77d685e02b063e37e496e96eefd46c929f9bdc",
 	},
 	"text-to-speech": {
-		"OuteAI/OuteTTS-0.3-500M": "jbilcke/oute-tts:39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26",
+		"hexgrad/Kokoro-82M": "jaaari/kokoro-82m:dfdf537ba482b029e0a761699e6f55e9162cfd159270bfe0e44857caa5f275a6",
 	},
 	"text-to-video": {
 		"genmo/mochi-1-preview": "genmoai/mochi-1:1944af04d098ef69bed7f9d335d102e652203f268ec4aaa2d836f6217217e460",

@@ -1,11 +1,11 @@
-import { expect, it, describe, assert } from "vitest";
+import { assert, describe, expect, it } from "vitest";
 
 import type { ChatCompletionStreamOutput } from "@huggingface/tasks";
 
 import { chatCompletion, FAL_AI_SUPPORTED_MODEL_IDS, HfInference } from "../src";
-import "./vcr";
-import { readTestFile } from "./test-files";
 import { textToVideo } from "../src/tasks/cv/textToVideo";
+import { readTestFile } from "./test-files";
+import "./vcr";
 
 const TIMEOUT = 60000 * 3;
 const env = import.meta.env;
@@ -939,11 +939,21 @@ describe.concurrent("HfInference", () => {
 				expect(res).toBeInstanceOf(Blob);
 			});
 
-			it("textToSpeech OuteTTS", async () => {
+			it.skip("textToSpeech OuteTTS - Needs to update the param name in jbilcke/oute-tts Replicate model to text instead of inputs", async () => {
 				const res = await client.textToSpeech({
 					model: "OuteAI/OuteTTS-0.3-500M",
 					provider: "replicate",
-					inputs: "OuteTTS is a frontier TTS model for its size of 1 Billion parameters",
+					text: "OuteTTS is a frontier TTS model for its size of 1 Billion parameters",
+				});
+
+				expect(res).toBeInstanceOf(Blob);
+			});
+
+			it("textToSpeech Kokoro", async () => {
+				const res = await client.textToSpeech({
+					model: "hexgrad/Kokoro-82M",
+					provider: "replicate",
+					text: "Kokoro is a frontier TTS model for its size of 1 Billion parameters",
 				});
 
 				expect(res).toBeInstanceOf(Blob);

diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -21,14 +21,14 @@ export interface TextToSpeechOutput {
  * Inputs for Text To Speech inference
  */
 export interface TextToSpeechInput {
-	/**
-	 * The input text data
-	 */
-	inputs: string;
 	/**
 	 * Additional inference parameters for Text To Speech
 	 */
 	parameters?: TextToSpeechParameters;
+	/**
+	 * The input text data
+	 */
+	text: string;
 	[property: string]: unknown;
 }
 /**

diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -5,7 +5,7 @@
 	"title": "TextToSpeechInput",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"text": {
 			"description": "The input text data",
 			"type": "string"
 		},
@@ -26,5 +26,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["text"]
 }