From 566692b34c223bf11f23bdf52dea7be30573beac Mon Sep 17 00:00:00 2001 From: Moritz Althaus Date: Thu, 8 Aug 2024 14:48:48 +0200 Subject: [PATCH 1/5] feat: max tokens is optional --- src/completion.rs | 20 +++++++++++++++++--- tests/integration.rs | 25 ++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/completion.rs b/src/completion.rs index 3b68ac2..0b4c946 100644 --- a/src/completion.rs +++ b/src/completion.rs @@ -72,7 +72,7 @@ pub struct Stopping<'a> { /// number of tokens is reached.Increase this value to allow for longer outputs. A text is split /// into tokens. Usually there are more tokens than words. The total number of tokens of prompt /// and maximum_tokens depends on the model. - pub maximum_tokens: u32, + pub maximum_tokens: Option, /// List of strings which will stop generation if they are generated. Stop sequences are /// helpful in structured texts. E.g.: In a question answering scenario a text may consist of /// lines starting with either "Question: " or "Answer: " (alternating). After producing an @@ -83,15 +83,28 @@ pub struct Stopping<'a> { } impl<'a> Stopping<'a> { + /// Only stop once the model generates end of text, or the sum of input tokens and generated + /// tokens has reached the model's context window size. + pub const CONTEXT_WINDOW: Self = Stopping { + maximum_tokens: None, + stop_sequences: &[], + }; + /// Only stop once the model generates end of text, or maximum tokens are reached. pub fn from_maximum_tokens(maximum_tokens: u32) -> Self { Self { - maximum_tokens, + maximum_tokens: Some(maximum_tokens), stop_sequences: &[], } } } +impl Default for Stopping<'_> { + fn default() -> Self { + Self::CONTEXT_WINDOW + } +} + /// Body send to the Aleph Alpha API on the POST `/completion` Route #[derive(Serialize, Debug)] struct BodyCompletion<'a> { @@ -100,7 +113,8 @@ struct BodyCompletion<'a> { /// Prompt to complete. The modalities supported depend on `model`. pub prompt: Prompt<'a>, /// Limits the number of tokens, which are generated for the completion. - pub maximum_tokens: u32, + #[serde(skip_serializing_if = "Option::is_none")] + pub maximum_tokens: Option, /// List of strings which will stop generation if they are generated. Stop sequences are /// helpful in structured texts. E.g.: In a question answering scenario a text may consist of /// lines starting with either "Question: " or "Answer: " (alternating). After producing an diff --git a/tests/integration.rs b/tests/integration.rs index 23e5713..e15a53b 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -172,7 +172,7 @@ async fn complete_structured_prompt() { let task = TaskCompletion { prompt: Prompt::from_text(prompt), stopping: Stopping { - maximum_tokens: 64, + maximum_tokens: Some(64), stop_sequences: &stop_sequences[..], }, sampling: Sampling::MOST_LIKELY, @@ -190,6 +190,29 @@ async fn complete_structured_prompt() { assert!(!response.completion.contains("User:")); } +#[tokio::test] +async fn context_window_stopping() { + // Given + let prompt = "Bot: Hello user!\nUser: Hello Bot, how are you doing?\nBot:"; + let stopping = Stopping::default(); + + // When + let task = TaskCompletion { + prompt: Prompt::from_text(prompt), + stopping, + sampling: Sampling::MOST_LIKELY, + }; + let model = "luminous-base"; + let client = Client::with_authentication(api_token()).unwrap(); + let response = client + .output_of(&task.with_model(model), &How::default()) + .await + .unwrap(); + + // Then + assert!(!response.completion.is_empty()); +} + #[tokio::test] async fn explain_request() { // Given From 698f125629aa320d05707c9ea2c474342583dc94 Mon Sep 17 00:00:00 2001 From: Moritz Althaus Date: Thu, 8 Aug 2024 15:06:18 +0200 Subject: [PATCH 2/5] feat!: remove maximum tokens argument from prompt::from_text --- README.md | 2 +- src/completion.rs | 14 +++++++++----- src/http.rs | 2 +- src/lib.rs | 10 +++++----- tests/integration.rs | 8 ++++---- tests/unit.rs | 14 ++++++++------ 6 files changed, 28 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index ccd4531..54f35a5 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ fn main() { let model = "luminous-base"; // The task we want to perform. Here we want to continue the sentence: "An apple a day ..." - let task = TaskCompletion::from_text("An apple a day", 10); + let task = TaskCompletion::from_text("An apple a day"); // Retrieve the answer from the API let response = client.completion(&task, model, &How::default()).await.unwrap(); diff --git a/src/completion.rs b/src/completion.rs index 0b4c946..1158ac9 100644 --- a/src/completion.rs +++ b/src/completion.rs @@ -14,15 +14,19 @@ pub struct TaskCompletion<'a> { } impl<'a> TaskCompletion<'a> { - /// Convenience constructor leaving most setting to default, just completing a given text and - /// taking the maximum anticipated length of the completion. - pub fn from_text(text: &'a str, maximum_tokens: u32) -> Self { + /// Convenience constructor leaving most setting to default, just completing a given text + pub fn from_text(text: &'a str) -> Self { TaskCompletion { prompt: Prompt::from_text(text), - stopping: Stopping::from_maximum_tokens(maximum_tokens), + stopping: Stopping::CONTEXT_WINDOW, sampling: Sampling::MOST_LIKELY, } } + + pub fn with_maximum_tokens(mut self, maximum_tokens: u32) -> Self { + self.stopping.maximum_tokens = Some(maximum_tokens); + self + } } /// Sampling controls how the tokens ("words") are selected for the completion. @@ -69,7 +73,7 @@ impl Default for Sampling<'_> { /// Controls the conditions under which the language models stops generating text. pub struct Stopping<'a> { /// The maximum number of tokens to be generated. Completion will terminate after the maximum - /// number of tokens is reached.Increase this value to allow for longer outputs. A text is split + /// number of tokens is reached. Increase this value to allow for longer outputs. A text is split /// into tokens. Usually there are more tokens than words. The total number of tokens of prompt /// and maximum_tokens depends on the model. pub maximum_tokens: Option, diff --git a/src/http.rs b/src/http.rs index 0bba6f9..84e37b2 100644 --- a/src/http.rs +++ b/src/http.rs @@ -114,7 +114,7 @@ impl HttpClient { /// /// // The task we want to perform. Here we want to continue the sentence: "An apple a day /// // ..." - /// let task = TaskCompletion::from_text("An apple a day", 10); + /// let task = TaskCompletion::from_text("An apple a day"); /// /// // Retrieve answer from API /// let response = client.output_of(&task.with_model(model), &How::default()).await?; diff --git a/src/lib.rs b/src/lib.rs index 251105e..9bee967 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,7 @@ //! let model = "luminous-base"; //! //! // The task we want to perform. Here we want to continue the sentence: "An apple a day ..." -//! let task = TaskCompletion::from_text("An apple a day", 10); +//! let task = TaskCompletion::from_text("An apple a day"); //! //! // Retrieve the answer from the API //! let response = client.completion(&task, model, &How::default()).await.unwrap(); @@ -65,7 +65,7 @@ impl Client { /// A new instance of an Aleph Alpha client helping you interact with the Aleph Alpha API. /// For "normal" client applications you may likely rather use [`Self::with_authentication`] or /// [`Self::with_base_url`]. - /// + /// /// You may want to only use request based authentication and skip default authentication. This /// is useful if writing an application which invokes the client on behalf of many different /// users. Having neither request, nor default authentication is considered a bug and will cause @@ -81,7 +81,7 @@ impl Client { } /// Use your on-premise inference with your API token for all requests. - /// + /// /// In production you typically would want set this to . Yet /// you may want to use a different instances for testing. pub fn with_base_url(host: String, api_token: impl Into) -> Result { @@ -103,7 +103,7 @@ impl Client { /// /// // The task we want to perform. Here we want to continue the sentence: "An apple a day /// // ..." - /// let task = TaskCompletion::from_text("An apple a day", 10); + /// let task = TaskCompletion::from_text("An apple a day"); /// /// // Retrieve answer from API /// let response = client.execute(model, &task, &How::default()).await?; @@ -167,7 +167,7 @@ impl Client { /// /// // The task we want to perform. Here we want to continue the sentence: "An apple a day /// // ..." - /// let task = TaskCompletion::from_text("An apple a day", 10); + /// let task = TaskCompletion::from_text("An apple a day"); /// /// // Retrieve answer from API /// let response = client.completion(&task, model, &How::default()).await?; diff --git a/tests/integration.rs b/tests/integration.rs index e15a53b..c97574b 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -21,7 +21,7 @@ fn api_token() -> &'static str { #[tokio::test] async fn completion_with_luminous_base() { // When - let task = TaskCompletion::from_text("Hello", 1); + let task = TaskCompletion::from_text("Hello").with_maximum_tokens(1); let model = "luminous-base"; let client = Client::with_authentication(api_token()).unwrap(); @@ -39,7 +39,7 @@ async fn completion_with_luminous_base() { #[tokio::test] async fn request_authentication_has_priority() { let bad_aa_api_token = "DUMMY"; - let task = TaskCompletion::from_text("Hello", 1); + let task = TaskCompletion::from_text("Hello").with_maximum_tokens(1); let model = "luminous-base"; let client = Client::with_authentication(bad_aa_api_token).unwrap(); @@ -64,7 +64,7 @@ async fn request_authentication_has_priority() { async fn authentication_only_per_request() { // Given let model = "luminous-base"; - let task = TaskCompletion::from_text("Hello", 1); + let task = TaskCompletion::from_text("Hello").with_maximum_tokens(1); // When let client = Client::new("https://api.aleph-alpha.com".to_owned(), None).unwrap(); @@ -88,7 +88,7 @@ async fn authentication_only_per_request() { async fn must_panic_if_authentication_is_missing() { // Given let model = "luminous-base"; - let task = TaskCompletion::from_text("Hello", 1); + let task = TaskCompletion::from_text("Hello").with_maximum_tokens(1); // When let client = Client::new("https://api.aleph-alpha.com".to_owned(), None).unwrap(); diff --git a/tests/unit.rs b/tests/unit.rs index e6b2726..e29c6a7 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -32,7 +32,7 @@ async fn completion_with_luminous_base() { .await; // When - let task = TaskCompletion::from_text("Hello,", 1); + let task = TaskCompletion::from_text("Hello,").with_maximum_tokens(1); let model = "luminous-base"; let client = Client::with_base_url(mock_server.uri(), "dummy-token").unwrap(); let response = client @@ -72,7 +72,7 @@ async fn detect_rate_limiting() { .await; // When - let task = TaskCompletion::from_text("Hello,", 1); + let task = TaskCompletion::from_text("Hello,").with_maximum_tokens(1); let model = "luminous-base"; let client = Client::with_base_url(mock_server.uri(), "dummy-token").unwrap(); let error = client @@ -116,7 +116,7 @@ async fn detect_queue_full() { .await; // When - let task = TaskCompletion::from_text("Hello,", 1); + let task = TaskCompletion::from_text("Hello,").with_maximum_tokens(1); let model = "luminous-base"; let client = Client::with_base_url(mock_server.uri(), "dummy-token").unwrap(); let error = client @@ -153,7 +153,7 @@ async fn detect_service_unavailable() { .await; // When - let task = TaskCompletion::from_text("Hello,", 1); + let task = TaskCompletion::from_text("Hello,").with_maximum_tokens(1); let model = "luminous-base"; let client = Client::with_base_url(mock_server.uri(), "dummy-token").unwrap(); let error = client @@ -175,7 +175,7 @@ async fn be_nice() { let mock_server = MockServer::start().await; // When - let task = TaskCompletion::from_text("Hello,", 1); + let task = TaskCompletion::from_text("Hello,").with_maximum_tokens(1); let model = "luminous-base"; let client = Client::with_base_url(mock_server.uri(), "dummy-token").unwrap(); // Drop result, answer is meaningless anyway @@ -211,7 +211,9 @@ async fn client_timeout() { // When let result = client .output_of( - &TaskCompletion::from_text("Hello,", 1).with_model("any"), + &TaskCompletion::from_text("Hello,") + .with_maximum_tokens(1) + .with_model("any"), &How { client_timeout: response_time / 2, ..Default::default() From fbf1a75f91c4d61761fd7db94da30b26bdd5bc82 Mon Sep 17 00:00:00 2001 From: Moritz Althaus Date: Thu, 8 Aug 2024 15:07:32 +0200 Subject: [PATCH 3/5] chore: add changelog and bump version --- Cargo.toml | 2 +- Changelog.md | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c7a21b5..e29ecd5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aleph-alpha-client" -version = "0.10.1" +version = "0.11.0" edition = "2021" description = "Interact with large language models provided by the Aleph Alpha API in Rust code" license = "MIT" diff --git a/Changelog.md b/Changelog.md index 741fa93..2a011ee 100644 --- a/Changelog.md +++ b/Changelog.md @@ -2,6 +2,12 @@ ## Unreleased +## 0.11.0 + +* Add `with_maximum_tokens` method to `Prompt` +* Remove maximum tokens argument from `Prompt::from_text` +* Make maximum tokens optional + ## 0.10.1 * Fix: Version number in Cargo.toml From 21dc28d7059363cc9aa830bca21fb9c64d067038 Mon Sep 17 00:00:00 2001 From: Moritz Althaus Date: Fri, 9 Aug 2024 09:44:44 +0200 Subject: [PATCH 4/5] test: make no token limit more explicit --- tests/integration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration.rs b/tests/integration.rs index c97574b..95aec72 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -194,7 +194,7 @@ async fn complete_structured_prompt() { async fn context_window_stopping() { // Given let prompt = "Bot: Hello user!\nUser: Hello Bot, how are you doing?\nBot:"; - let stopping = Stopping::default(); + let stopping = Stopping::NO_TOKEN_LIMIT; // When let task = TaskCompletion { From 5172fbd26904b6d9471e8df41a71da064835b63e Mon Sep 17 00:00:00 2001 From: Moritz Althaus Date: Fri, 9 Aug 2024 09:45:04 +0200 Subject: [PATCH 5/5] feat: better naming for max_tokens None --- src/completion.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/completion.rs b/src/completion.rs index 1158ac9..d59c5b7 100644 --- a/src/completion.rs +++ b/src/completion.rs @@ -18,7 +18,7 @@ impl<'a> TaskCompletion<'a> { pub fn from_text(text: &'a str) -> Self { TaskCompletion { prompt: Prompt::from_text(text), - stopping: Stopping::CONTEXT_WINDOW, + stopping: Stopping::NO_TOKEN_LIMIT, sampling: Sampling::MOST_LIKELY, } } @@ -76,6 +76,9 @@ pub struct Stopping<'a> { /// number of tokens is reached. Increase this value to allow for longer outputs. A text is split /// into tokens. Usually there are more tokens than words. The total number of tokens of prompt /// and maximum_tokens depends on the model. + /// If maximum tokens is set to None, no outside limit is opposed on the number of maximum tokens. + /// The model will generate tokens until it either emits a stop token or it reaches its technical + /// limit, which usually is its context window. pub maximum_tokens: Option, /// List of strings which will stop generation if they are generated. Stop sequences are /// helpful in structured texts. E.g.: In a question answering scenario a text may consist of @@ -87,9 +90,9 @@ pub struct Stopping<'a> { } impl<'a> Stopping<'a> { - /// Only stop once the model generates end of text, or the sum of input tokens and generated - /// tokens has reached the model's context window size. - pub const CONTEXT_WINDOW: Self = Stopping { + /// Only stop once the model generates end of text, or it reaches its technical limit, usually the + /// context window. + pub const NO_TOKEN_LIMIT: Self = Stopping { maximum_tokens: None, stop_sequences: &[], }; @@ -105,7 +108,7 @@ impl<'a> Stopping<'a> { impl Default for Stopping<'_> { fn default() -> Self { - Self::CONTEXT_WINDOW + Self::NO_TOKEN_LIMIT } }