0xPlaygrounds · yavens · Mar 5, 2025 · Mar 5, 2025 · Mar 6, 2025 · Mar 6, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/rig-core/Cargo.toml b/rig-core/Cargo.toml
@@ -32,6 +32,9 @@ rayon = { version = "1.10.0", optional = true }
 worker = { version = "0.5", optional = true }
 bytes = "1.9.0"
 async-stream = "0.3.6"
+mime_guess = { version = "2.0.5", optional = true }
+base64 = { version = "0.22.1", optional = true}
+
 
 
 [dev-dependencies]
@@ -41,7 +44,6 @@ tokio = { version = "1.34.0", features = ["full"] }
 tracing-subscriber = "0.3.18"
 tokio-test = "0.4.4"
 serde_path_to_error = "0.1.16"
-base64 = "0.22.1"
 
 [features]
 all = ["derive", "pdf", "rayon"]
@@ -50,6 +52,7 @@ pdf = ["dep:lopdf"]
 epub = ["dep:epub", "dep:quick-xml"]
 rayon = ["dep:rayon"]
 worker = ["dep:worker"]
+blob = ["dep:base64", "dep:mime_guess"]
 
 [[test]]
 name = "embed_macro"
@@ -94,3 +97,8 @@ required-features = ["derive"]
 [[example]]
 name = "together_embeddings"
 required-features = ["derive"]
+
+[[example]]
+name = "transcription"
+required-features = ["blob"] # the example uses the Gemini transcription model, which is gated behind `blob`
+
diff --git a/rig-core/examples/transcription.rs b/rig-core/examples/transcription.rs
@@ -1,6 +1,9 @@
 use std::env::args;
 
-use rig::{providers::openai, transcription::TranscriptionModel};
+use rig::{
+    providers::{azure, gemini, groq, openai},
+    transcription::TranscriptionModel,
+};
 
 #[tokio::main]
 async fn main() {
@@ -14,6 +17,13 @@ async fn main() {
 
     let file_path = args[1].clone();
 
+    whisper(&file_path).await;
+    gemini(&file_path).await;
+    azure(&file_path).await;
+    groq(&file_path).await;
+}
+
+async fn whisper(file_path: &str) {
     // Create an OAI client
     let openai = openai::Client::from_env();
 
@@ -22,7 +32,7 @@ async fn main() {
 
     let response = whisper
         .transcription_request()
-        .load_file(&file_path)
+        .load_file(file_path)
         .send()
         .await
         .expect("Failed to transcribe file");
@@ -31,3 +41,57 @@ async fn main() {
 
     println!("Whisper-1: {text}")
 }
+
+/// Transcribes the file at `file_path` with Gemini and prints the resulting text.
+///
+/// Panics with "Failed to transcribe file" if the request does not succeed.
+async fn gemini(file_path: &str) {
+    // Create a Gemini client
+    let client = gemini::Client::from_env();
+
+    // Create the Gemini transcription model
+    let model = client.transcription_model(gemini::transcription::GEMINI_1_5_FLASH);
+
+    let text = model
+        .transcription_request()
+        .load_file(file_path)
+        .send()
+        .await
+        .expect("Failed to transcribe file")
+        .text;
+
+    println!("Gemini: {text}")
+}
+
+/// Transcribes the file at `file_path` through Azure OpenAI and prints the resulting text.
+///
+/// Panics with "Failed to transcribe file" if the request does not succeed.
+async fn azure(file_path: &str) {
+    // Create an Azure OpenAI client
+    let client = azure::Client::from_env();
+
+    // Create the transcription model; "whisper" is the name handed to the client
+    let model = client.transcription_model("whisper");
+
+    let text = model
+        .transcription_request()
+        .load_file(file_path)
+        .send()
+        .await
+        .expect("Failed to transcribe file")
+        .text;
+
+    println!("Azure Whisper-1: {text}")
+}
+
+/// Transcribes the file at `file_path` with Groq and prints the resulting text.
+///
+/// Panics with "Failed to transcribe file" if the request does not succeed.
+async fn groq(file_path: &str) {
+    // Create a Groq client
+    let client = groq::Client::from_env();
+
+    // Create the Whisper Large V3 transcription model
+    let model = client.transcription_model(groq::WHISPER_LARGE_V3);
+
+    let text = model
+        .transcription_request()
+        .load_file(file_path)
+        .send()
+        .await
+        .expect("Failed to transcribe file")
+        .text;
+
+    println!("Groq Whisper-Large-V3: {text}")
+}
diff --git a/rig-core/src/providers/azure.rs b/rig-core/src/providers/azure.rs
@@ -15,12 +15,16 @@ use crate::{
     extractor::ExtractorBuilder,
     json_utils,
     providers::openai,
+    transcription::{self, TranscriptionError},
     Embed,
 };
+use reqwest::multipart::Part;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use serde_json::json;
 
+use super::openai::TranscriptionResponse;
+
 // ================================================================
 // Main Azure OpenAI Client
 // ================================================================
@@ -82,6 +86,15 @@ impl Client {
         self.http_client.post(url)
     }
 
+    /// Builds a POST request for the audio transcription endpoint of `deployment_id`.
+    ///
+    /// The URL is assembled from the client's Azure endpoint, the deployment id
+    /// and the configured API version.
+    fn post_transcription(&self, deployment_id: &str) -> reqwest::RequestBuilder {
+        // `audio/transcriptions` returns text in the spoken language, whereas
+        // `audio/translations` translates the audio into English.
+        //
+        // NOTE(review): `.replace("//", "/")` also rewrites the `//` that follows the
+        // scheme (`https://` becomes `https:/`); left untouched here, confirm the
+        // HTTP client's URL parser tolerates it.
+        let url = format!(
+            "{}/openai/deployments/{}/audio/transcriptions?api-version={}",
+            self.azure_endpoint, deployment_id, self.api_version
+        )
+        .replace("//", "/");
+        self.http_client.post(url)
+    }
+
     /// Create an embedding model with the given name.
     /// Note: default embedding dimension of 0 will be used if model is not known.
     /// If this is the case, it's better to use function `embedding_model_with_ndims`
@@ -154,6 +167,21 @@ impl Client {
         CompletionModel::new(self.clone(), model)
     }
 
+    /// Build a [`TranscriptionModel`] bound to a clone of this client.
+    ///
+    /// `model` is stored as given on the returned value; it is not checked
+    /// against any list of known models here.
+    ///
+    /// # Example
+    /// ```
+    /// use rig::providers::azure::{Client, self};
+    ///
+    /// // Initialize the Azure OpenAI client
+    /// let azure = Client::new("YOUR_API_KEY", "YOUR_API_VERSION", "YOUR_ENDPOINT");
+    ///
+    /// let whisper = azure.transcription_model("model-unknown-to-rig");
+    /// ```
+    pub fn transcription_model(&self, model: &str) -> TranscriptionModel {
+        let client = self.clone();
+        TranscriptionModel::new(client, model)
+    }
+
     /// Create an agent builder with the given completion model.
     ///
     /// # Example
@@ -447,6 +475,84 @@ impl completion::CompletionModel for CompletionModel {
     }
 }
 
+// ================================================================
+// Azure OpenAI Transcription API
+// ================================================================
+
+/// Transcription model handle for the Azure OpenAI provider.
+///
+/// Holds the [`Client`] used to send requests together with the model name.
+#[derive(Clone)]
+pub struct TranscriptionModel {
+    client: Client,
+    /// Name of the model; it is handed to the client as the deployment id
+    /// when a transcription request is sent.
+    pub model: String,
+}
+
+impl TranscriptionModel {
+    /// Creates a transcription model from `client` and an owned copy of `model`.
+    pub fn new(client: Client, model: &str) -> Self {
+        let model = model.to_owned();
+        Self { client, model }
+    }
+}
+
+impl transcription::TranscriptionModel for TranscriptionModel {
+    type Response = TranscriptionResponse;
+
+    /// Sends `request` to the Azure OpenAI audio endpoint as a multipart form.
+    ///
+    /// The audio bytes are attached as the `file` part; `prompt`, `temperature`
+    /// and every entry of `additional_params` are added as text fields when present.
+    ///
+    /// # Errors
+    /// Returns [`TranscriptionError::ProviderError`] when `additional_params` is
+    /// not a JSON object, when the service answers with a non-success status, or
+    /// when a successful response carries an API error payload. Failures while
+    /// sending the request or decoding the response are propagated with `?`.
+    #[cfg_attr(feature = "worker", worker::send)]
+    async fn transcription(
+        &self,
+        request: transcription::TranscriptionRequest,
+    ) -> Result<
+        transcription::TranscriptionResponse<Self::Response>,
+        transcription::TranscriptionError,
+    > {
+        // The request is consumed, so its fields can be moved into the form
+        // instead of being cloned.
+        let mut body = reqwest::multipart::Form::new().part(
+            "file",
+            Part::bytes(request.data).file_name(request.filename),
+        );
+
+        if let Some(prompt) = request.prompt {
+            body = body.text("prompt", prompt);
+        }
+
+        if let Some(ref temperature) = request.temperature {
+            body = body.text("temperature", temperature.to_string());
+        }
+
+        if let Some(ref additional_params) = request.additional_params {
+            // Report a malformed parameter set to the caller rather than panicking.
+            let params = additional_params.as_object().ok_or_else(|| {
+                TranscriptionError::ProviderError(
+                    "Additional Parameters to OpenAI Transcription should be a map".to_string(),
+                )
+            })?;
+
+            for (key, value) in params {
+                // `to_string` on a JSON value serializes it, which would wrap string
+                // values in quotes (`"en"` instead of `en`); send strings verbatim.
+                let field = match value.as_str() {
+                    Some(text) => text.to_owned(),
+                    None => value.to_string(),
+                };
+                body = body.text(key.to_owned(), field);
+            }
+        }
+
+        let response = self
+            .client
+            .post_transcription(&self.model)
+            .multipart(body)
+            .send()
+            .await?;
+
+        if response.status().is_success() {
+            match response
+                .json::<ApiResponse<TranscriptionResponse>>()
+                .await?
+            {
+                ApiResponse::Ok(response) => response.try_into(),
+                ApiResponse::Err(api_error_response) => Err(TranscriptionError::ProviderError(
+                    api_error_response.message,
+                )),
+            }
+        } else {
+            // Non-success status: surface the raw response body as the error message.
+            Err(TranscriptionError::ProviderError(response.text().await?))
+        }
+    }
+}
+
 #[cfg(test)]
 mod azure_tests {
     use super::*;

diff --git a/rig-core/src/providers/gemini/client.rs b/rig-core/src/providers/gemini/client.rs
@@ -9,6 +9,9 @@ use serde::{Deserialize, Serialize};
 
 use super::{completion::CompletionModel, embedding::EmbeddingModel};
 
+#[cfg(feature = "blob")]
+use super::transcription::TranscriptionModel;
+
 // ================================================================
 // Google Gemini Client
 // ================================================================
@@ -119,6 +122,14 @@ impl Client {
         CompletionModel::new(self.clone(), model)
     }
 
+    /// Build a [`TranscriptionModel`] from a clone of this client and the given model name.
+    /// Only compiled when the `blob` feature is enabled.
+    /// Gemini-specific parameters can be set using the [GenerationConfig](crate::providers::gemini::completion::gemini_api_types::GenerationConfig) struct.
+    /// [Gemini API Reference](https://ai.google.dev/api/generate-content#generationconfig)
+    #[cfg(feature = "blob")]
+    pub fn transcription_model(&self, model: &str) -> TranscriptionModel {
+        let client = self.clone();
+        TranscriptionModel::new(client, model)
+    }
+
     /// Create an agent builder with the given completion model.
     /// Gemini-specific parameters can be set using the [GenerationConfig](crate::providers::gemini::completion::gemini_api_types::GenerationConfig) struct.
     /// [Gemini API Reference](https://ai.google.dev/api/generate-content#generationconfig)

diff --git a/rig-core/src/providers/gemini/mod.rs b/rig-core/src/providers/gemini/mod.rs
@@ -12,6 +12,8 @@
 pub mod client;
 pub mod completion;
 pub mod embedding;
+#[cfg(feature = "blob")]
+pub mod transcription;
 pub use client::Client;
 
 pub mod gemini_api_types {