From 9b1fcde428847c7028e91a1ee1db26156a930d70 Mon Sep 17 00:00:00 2001
From: Sam Saffron
Date: Thu, 27 Feb 2025 11:50:08 +1100
Subject: [PATCH 01/18] SambaNova: recommend Llama 3.3 over 3.1

---
 config/locales/client.en.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/locales/client.en.yml b/config/locales/client.en.yml
index e204360ba..55d239509 100644
--- a/config/locales/client.en.yml
+++ b/config/locales/client.en.yml
@@ -403,7 +403,7 @@ en:
           open_ai-o1: "Open AI's most capable reasoning model"
           open_ai-o3-mini: "Advanced Cost-efficient reasoning model"
           samba_nova-Meta-Llama-3-1-8B-Instruct: "Efficient lightweight multilingual model"
-          samba_nova-Meta-Llama-3-1-70B-Instruct": "Powerful multipurpose model"
+          samba_nova-Meta-Llama-3-3-70B-Instruct": "Powerful multipurpose model"
           mistral-mistral-large-latest: "Mistral's most powerful model"
           mistral-pixtral-large-latest: "Mistral's most powerful vision capable model"

From 2b25e49462d2f67a97e87cf25d25b9468156829e Mon Sep 17 00:00:00 2001
From: Sam Saffron
Date: Thu, 27 Feb 2025 11:50:21 +1100
Subject: [PATCH 02/18] 1024 tokens is the minimum

---
 lib/completions/endpoints/anthropic.rb   | 2 +-
 lib/completions/endpoints/aws_bedrock.rb | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/completions/endpoints/anthropic.rb b/lib/completions/endpoints/anthropic.rb
index ed950d31f..673149eb9 100644
--- a/lib/completions/endpoints/anthropic.rb
+++ b/lib/completions/endpoints/anthropic.rb
@@ -40,7 +40,7 @@ def default_options(dialect)
 
       if llm_model.lookup_custom_param("enable_reasoning")
         reasoning_tokens =
-          llm_model.lookup_custom_param("reasoning_tokens").to_i.clamp(100, 65_536)
+          llm_model.lookup_custom_param("reasoning_tokens").to_i.clamp(1024, 65_536)
 
         # this allows for lots of tokens beyond reasoning
         options[:max_tokens] = reasoning_tokens + 30_000
diff --git a/lib/completions/endpoints/aws_bedrock.rb b/lib/completions/endpoints/aws_bedrock.rb
index 75ed12cfe..b30338d66 100644
--- a/lib/completions/endpoints/aws_bedrock.rb
+++ b/lib/completions/endpoints/aws_bedrock.rb
@@ -29,7 +29,7 @@ def default_options(dialect)
       result = { anthropic_version: "bedrock-2023-05-31" }
       if llm_model.lookup_custom_param("enable_reasoning")
         reasoning_tokens =
-          llm_model.lookup_custom_param("reasoning_tokens").to_i.clamp(100, 65_536)
+          llm_model.lookup_custom_param("reasoning_tokens").to_i.clamp(1024, 65_536)
 
         # this allows for ample tokens beyond reasoning
         max_tokens = reasoning_tokens + 30_000

From b67568be18bdeebd067d48f5fb8663d0976679b6 Mon Sep 17 00:00:00 2001
From: Sam Saffron
Date: Thu, 27 Feb 2025 15:17:38 +1100
Subject: [PATCH 03/18] Implement streaming and generation of thinking tokens

This is required for Claude and introduces some new concepts into the
prompt.
---
 .../anthropic_message_processor.rb     | 74 ++++++++++++++++---
 lib/completions/dialects/claude.rb     | 25 ++++++-
 lib/completions/endpoints/anthropic.rb |  1 +
 lib/completions/endpoints/base.rb      |  6 +-
 lib/completions/llm.rb                 |  3 +
 lib/completions/prompt.rb              | 27 ++++++-
 lib/completions/thinking.rb            | 34 +++++++++
 7 files changed, 156 insertions(+), 14 deletions(-)
 create mode 100644 lib/completions/thinking.rb

diff --git a/lib/completions/anthropic_message_processor.rb b/lib/completions/anthropic_message_processor.rb
index 44242b2d1..e67be109b 100644
--- a/lib/completions/anthropic_message_processor.rb
+++ b/lib/completions/anthropic_message_processor.rb
@@ -44,13 +44,15 @@ def to_tool_call
         end
       end
 
-      attr_reader :tool_calls, :input_tokens, :output_tokens
+      attr_reader 
:tool_calls, :input_tokens, :output_tokens, :output_thinking - def initialize(streaming_mode:, partial_tool_calls: false) + def initialize(streaming_mode:, partial_tool_calls: false, output_thinking: false) @streaming_mode = streaming_mode @tool_calls = [] @current_tool_call = nil @partial_tool_calls = partial_tool_calls + @output_thinking = output_thinking + @thinking = nil end def to_tool_calls @@ -69,13 +71,48 @@ def process_streamed_message(parsed) tool_id, partial_tool_calls: @partial_tool_calls, ) if tool_name + elsif parsed[:type] == "content_block_start" && parsed.dig(:content_block, :type) == "thinking" + if @output_thinking + @thinking = + DiscourseAi::Completions::Thinking.new( + message: +parsed.dig(:content_block, :thinking).to_s, + signature: +"", + partial: true, + ) + result = @thinking.dup + end + elsif parsed[:type] == "content_block_delta" && parsed.dig(:delta, :type) == "thinking_delta" + if @output_thinking + delta = parsed.dig(:delta, :thinking) + @thinking.message << delta if @thinking + result = DiscourseAi::Completions::Thinking.new(message: delta, partial: true) + end + elsif parsed[:type] == "content_block_delta" && parsed.dig(:delta, :type) == "signature_delta" + if @output_thinking + @thinking.signature << parsed.dig(:delta, :signature) if @thinking + end + elsif parsed[:type] == "content_block_stop" && @thinking + @thinking.partial = false + result = @thinking + @thinking = nil elsif parsed[:type] == "content_block_start" || parsed[:type] == "content_block_delta" if @current_tool_call tool_delta = parsed.dig(:delta, :partial_json).to_s @current_tool_call.append(tool_delta) result = @current_tool_call.partial_tool_call if @current_tool_call.has_partial? + elsif parsed.dig(:content_block, :type) == "redacted_thinking" + if @output_thinking + result = + DiscourseAi::Completions::Thinking.new( + message: nil, + signature: parsed.dig(:content_block, :data), + redacted: true, + ) + end else result = parsed.dig(:delta, :text).to_s + # no need to return empty strings for streaming, no value + result = nil if result == "" end elsif parsed[:type] == "content_block_stop" if @current_tool_call @@ -105,15 +142,32 @@ def process_message(payload) content = parsed.dig(:content) if content.is_a?(Array) result = - content.map do |data| - if data[:type] == "tool_use" - call = AnthropicToolCall.new(data[:name], data[:id]) - call.append(data[:input].to_json) - call.to_tool_call - else - data[:text] + content + .map do |data| + if data[:type] == "tool_use" + call = AnthropicToolCall.new(data[:name], data[:id]) + call.append(data[:input].to_json) + call.to_tool_call + elsif data[:type] == "thinking" + if @output_thinking + DiscourseAi::Completions::Thinking.new( + message: data[:thinking], + signature: data[:signature], + ) + end + elsif data[:type] == "redacted_thinking" + if @output_thinking + DiscourseAi::Completions::Thinking.new( + message: nil, + signature: data[:data], + redacted: true, + ) + end + else + data[:text] + end end - end + .compact end @input_tokens = parsed.dig(:usage, :input_tokens) diff --git a/lib/completions/dialects/claude.rb b/lib/completions/dialects/claude.rb index a9c0aba75..06fbe1023 100644 --- a/lib/completions/dialects/claude.rb +++ b/lib/completions/dialects/claude.rb @@ -87,7 +87,30 @@ def tool_msg(msg) end def model_msg(msg) - { role: "assistant", content: msg[:content] } + if msg[:thinking] || msg[:redacted_thinking_signature] + content_array = [] + + if msg[:thinking] + content_array << { + type: "thinking", + thinking: msg[:thinking], + 
signature: msg[:thinking_signature], + } + end + + if msg[:redacted_thinking_signature] + content_array << { + type: "redacted_thinking", + data: msg[:redacted_thinking_signature], + } + end + + content_array << { type: "text", text: msg[:content] } + + { role: "assistant", content: content_array } + else + { role: "assistant", content: msg[:content] } + end end def system_msg(msg) diff --git a/lib/completions/endpoints/anthropic.rb b/lib/completions/endpoints/anthropic.rb index 673149eb9..ec7656d9a 100644 --- a/lib/completions/endpoints/anthropic.rb +++ b/lib/completions/endpoints/anthropic.rb @@ -123,6 +123,7 @@ def processor DiscourseAi::Completions::AnthropicMessageProcessor.new( streaming_mode: @streaming_mode, partial_tool_calls: partial_tool_calls, + output_thinking: output_thinking, ) end diff --git a/lib/completions/endpoints/base.rb b/lib/completions/endpoints/base.rb index 74933f10f..800381e46 100644 --- a/lib/completions/endpoints/base.rb +++ b/lib/completions/endpoints/base.rb @@ -4,7 +4,7 @@ module DiscourseAi module Completions module Endpoints class Base - attr_reader :partial_tool_calls + attr_reader :partial_tool_calls, :output_thinking CompletionFailed = Class.new(StandardError) # 6 minutes @@ -67,12 +67,15 @@ def perform_completion!( feature_name: nil, feature_context: nil, partial_tool_calls: false, + output_thinking: false, &blk ) LlmQuota.check_quotas!(@llm_model, user) start_time = Time.now @partial_tool_calls = partial_tool_calls + @output_thinking = output_thinking + model_params = normalize_model_params(model_params) orig_blk = blk @@ -85,6 +88,7 @@ def perform_completion!( feature_name: feature_name, feature_context: feature_context, partial_tool_calls: partial_tool_calls, + output_thinking: output_thinking, ) wrapped = result diff --git a/lib/completions/llm.rb b/lib/completions/llm.rb index 513859206..cb75f6e7c 100644 --- a/lib/completions/llm.rb +++ b/lib/completions/llm.rb @@ -234,6 +234,7 @@ def initialize(dialect_klass, gateway_klass, llm_model, gateway: nil) # @param feature_name { String - Optional } - The feature name to use for the completion. # @param feature_context { Hash - Optional } - The feature context to use for the completion. # @param partial_tool_calls { Boolean - Optional } - If true, the completion will return partial tool calls. + # @param output_thinking { Boolean - Optional } - If true, the completion will return the thinking output for thinking models. # # @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function. 
# @@ -250,6 +251,7 @@ def generate( feature_name: nil, feature_context: nil, partial_tool_calls: false, + output_thinking: false, &partial_read_blk ) self.class.record_prompt(prompt) @@ -285,6 +287,7 @@ def generate( feature_name: feature_name, feature_context: feature_context, partial_tool_calls: partial_tool_calls, + output_thinking: output_thinking, &partial_read_blk ) end diff --git a/lib/completions/prompt.rb b/lib/completions/prompt.rb index 9a6d4d617..6afbc52be 100644 --- a/lib/completions/prompt.rb +++ b/lib/completions/prompt.rb @@ -41,12 +41,26 @@ def initialize( @tool_choice = tool_choice end - def push(type:, content:, id: nil, name: nil, upload_ids: nil) + def push( + type:, + content:, + id: nil, + name: nil, + upload_ids: nil, + thinking: nil, + thinking_signature: nil, + redacted_thinking_signature: nil + ) return if type == :system new_message = { type: type, content: content } new_message[:name] = name.to_s if name new_message[:id] = id.to_s if id new_message[:upload_ids] = upload_ids if upload_ids + new_message[:thinking] = thinking if thinking + new_message[:thinking_signature] = thinking_signature if thinking_signature + new_message[ + :redacted_thinking_signature + ] = redacted_thinking_signature if redacted_thinking_signature validate_message(new_message) validate_turn(messages.last, new_message) @@ -73,7 +87,16 @@ def validate_message(message) raise ArgumentError, "message type must be one of #{valid_types}" end - valid_keys = %i[type content id name upload_ids] + valid_keys = %i[ + type + content + id + name + upload_ids + thinking + thinking_signature + redacted_thinking_signature + ] if (invalid_keys = message.keys - valid_keys).any? raise ArgumentError, "message contains invalid keys: #{invalid_keys}" end diff --git a/lib/completions/thinking.rb b/lib/completions/thinking.rb new file mode 100644 index 000000000..e075835f9 --- /dev/null +++ b/lib/completions/thinking.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +module DiscourseAi + module Completions + class Thinking + attr_accessor :message, :signature, :redacted, :partial + + def initialize(message:, signature: nil, redacted: false, partial: false) + @message = message + @signature = signature + @redacted = redacted + @partial = partial + end + + def ==(other) + message == other.message && signature == other.signature && redacted == other.redacted && + partial == other.partial + end + + def dup + Thinking.new( + message: message.dup, + signature: signature.dup, + redacted: redacted, + partial: partial, + ) + end + + def to_s + "#{message} - #{signature} - #{redacted} - #{partial}" + end + end + end +end From ae3117d1b45736d2c488e1cb8d908562d23207e5 Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Thu, 27 Feb 2025 15:17:49 +1100 Subject: [PATCH 04/18] tests --- .../completions/endpoints/anthropic_spec.rb | 215 ++++++++++++++++++ 1 file changed, 215 insertions(+) diff --git a/spec/lib/completions/endpoints/anthropic_spec.rb b/spec/lib/completions/endpoints/anthropic_spec.rb index b43c76254..579af6953 100644 --- a/spec/lib/completions/endpoints/anthropic_spec.rb +++ b/spec/lib/completions/endpoints/anthropic_spec.rb @@ -449,4 +449,219 @@ expect(log.request_tokens).to eq(10) expect(log.response_tokens).to eq(25) end + + it "can send through thinking tokens via a completion prompt" do + body = { + id: "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY", + type: "message", + role: "assistant", + content: [{ type: "text", text: "world" }], + model: "claude-3-7-sonnet-20250219", + stop_reason: "end_turn", + usage: { + 
input_tokens: 25,
+        output_tokens: 40,
+      },
+    }.to_json
+
+    parsed_body = nil
+    stub_request(:post, url).with(
+      body: ->(req_body) { parsed_body = JSON.parse(req_body) },
+      headers: {
+        "Content-Type" => "application/json",
+        "X-Api-Key" => "123",
+        "Anthropic-Version" => "2023-06-01",
+      },
+    ).to_return(status: 200, body: body)
+
+    prompt = DiscourseAi::Completions::Prompt.new("system prompt")
+    prompt.push(type: :user, content: "hello")
+    prompt.push(
+      type: :model,
+      id: "user1",
+      content: "hello",
+      thinking: "I am thinking",
+      thinking_signature: "signature",
+      redacted_thinking_signature: "redacted_signature",
+    )
+
+    result = llm.generate(prompt, user: Discourse.system_user)
+    expect(result).to eq("world")
+
+    expected_body = {
+      "model" => "claude-3-opus-20240229",
+      "max_tokens" => 4096,
+      "messages" => [
+        { "role" => "user", "content" => "hello" },
+        {
+          "role" => "assistant",
+          "content" => [
+            { "type" => "thinking", "thinking" => "I am thinking", "signature" => "signature" },
+            { "type" => "redacted_thinking", "data" => "redacted_signature" },
+            { "type" => "text", "text" => "hello" },
+          ],
+        },
+      ],
+      "system" => "system prompt",
+    }
+
+    expect(parsed_body).to eq(expected_body)
+  end
+
+  it "can handle a response with thinking blocks in non-streaming mode" do
+    body = {
+      id: "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
+      type: "message",
+      role: "assistant",
+      content: [
+        {
+          type: "thinking",
+          thinking: "This is my thinking process about prime numbers...",
+          signature: "abc123signature",
+        },
+        { type: "redacted_thinking", data: "abd456signature" },
+        { type: "text", text: "Yes, there are infinitely many prime numbers where n mod 4 = 3." },
+      ],
+      model: "claude-3-7-sonnet-20250219",
+      stop_reason: "end_turn",
+      usage: {
+        input_tokens: 25,
+        output_tokens: 40,
+      },
+    }.to_json
+
+    stub_request(:post, url).with(
+      headers: {
+        "Content-Type" => "application/json",
+        "X-Api-Key" => "123",
+        "Anthropic-Version" => "2023-06-01",
+      },
+    ).to_return(status: 200, body: body)
+
+    result =
+      llm.generate(
+        "hello",
+        user: Discourse.system_user,
+        feature_name: "testing",
+        output_thinking: true,
+      )
+
+    # Result should be an array with both thinking and text content
+    expect(result).to be_an(Array)
+    expect(result.length).to eq(3)
+
+    # First item should be a Thinking object
+    expect(result[0]).to be_a(DiscourseAi::Completions::Thinking)
+    expect(result[0].message).to eq("This is my thinking process about prime numbers...")
+    expect(result[0].signature).to eq("abc123signature")
+
+    expect(result[1]).to be_a(DiscourseAi::Completions::Thinking)
+    expect(result[1].signature).to eq("abd456signature")
+    expect(result[1].redacted).to eq(true)
+
+    # Third item should be the text response
+    expect(result[2]).to eq("Yes, there are infinitely many prime numbers where n mod 4 = 3.")
+
+    # Verify audit log
+    log = AiApiAuditLog.order(:id).last
+    expect(log.provider_id).to eq(AiApiAuditLog::Provider::Anthropic)
+    expect(log.feature_name).to eq("testing")
+    expect(log.response_tokens).to eq(40)
+  end
+
+  it "can stream a response with thinking blocks" do
+    body = (<<~STRING).strip
+      event: message_start
+      data: {"type": "message_start", "message": {"id": "msg_01...", "type": "message", "role": "assistant", "content": [], "model": "claude-3-opus-20240229", "stop_reason": null, "stop_sequence": null, "usage": {"input_tokens": 25}}}
+
+      event: content_block_start
+      data: {"type": "content_block_start", "index": 0, "content_block": {"type": "thinking", "thinking": ""}}
+
+      event: 
content_block_delta + data: {"type": "content_block_delta", "index": 0, "delta": {"type": "thinking_delta", "thinking": "Let me solve this step by step:\\n\\n1. First break down 27 * 453"}} + + event: content_block_delta + data: {"type": "content_block_delta", "index": 0, "delta": {"type": "thinking_delta", "thinking": "\\n2. 453 = 400 + 50 + 3"}} + + event: content_block_delta + data: {"type": "content_block_delta", "index": 0, "delta": {"type": "signature_delta", "signature": "EqQBCgIYAhIM1gbcDa9GJwZA2b3hGgxBdjrkzLoky3dl1pkiMOYds..."}} + + event: content_block_stop + data: {"type": "content_block_stop", "index": 0} + + event: content_block_start +data: {"type":"content_block_start","index":0,"content_block":{"type":"redacted_thinking","data":"AAA=="} } + + event: ping + data: {"type": "ping"} + + event: content_block_stop + data: {"type":"content_block_stop","index":0 } + + event: content_block_start + data: {"type": "content_block_start", "index": 1, "content_block": {"type": "text", "text": ""}} + + event: content_block_delta + data: {"type": "content_block_delta", "index": 1, "delta": {"type": "text_delta", "text": "27 * 453 = 12,231"}} + + event: content_block_stop + data: {"type": "content_block_stop", "index": 1} + + event: message_delta + data: {"type": "message_delta", "delta": {"stop_reason": "end_turn", "stop_sequence": null, "usage": {"output_tokens": 30}}} + + event: message_stop + data: {"type": "message_stop"} + STRING + + parsed_body = nil + + stub_request(:post, url).with( + headers: { + "Content-Type" => "application/json", + "X-Api-Key" => "123", + "Anthropic-Version" => "2023-06-01", + }, + ).to_return(status: 200, body: body) + + thinking_chunks = [] + text_chunks = [] + + llm.generate( + "hello there", + user: Discourse.system_user, + feature_name: "testing", + output_thinking: true, + ) do |partial, cancel| + if partial.is_a?(DiscourseAi::Completions::Thinking) + thinking_chunks << partial + else + text_chunks << partial + end + end + + expected_thinking = [ + DiscourseAi::Completions::Thinking.new(message: "", signature: "", partial: true), + DiscourseAi::Completions::Thinking.new( + message: "Let me solve this step by step:\n\n1. First break down 27 * 453", + partial: true, + ), + DiscourseAi::Completions::Thinking.new(message: "\n2. 453 = 400 + 50 + 3", partial: true), + DiscourseAi::Completions::Thinking.new( + message: + "Let me solve this step by step:\n\n1. First break down 27 * 453\n2. 
453 = 400 + 50 + 3", + signature: "EqQBCgIYAhIM1gbcDa9GJwZA2b3hGgxBdjrkzLoky3dl1pkiMOYds...", + partial: false, + ), + DiscourseAi::Completions::Thinking.new(message: nil, signature: "AAA==", redacted: true), + ] + + expect(thinking_chunks).to eq(expected_thinking) + expect(text_chunks).to eq(["27 * 453 = 12,231"]) + + log = AiApiAuditLog.order(:id).last + expect(log.provider_id).to eq(AiApiAuditLog::Provider::Anthropic) + expect(log.feature_name).to eq("testing") + expect(log.response_tokens).to eq(30) + end end From a3160509a0b2ed8008ec4c42d788715ad080461a Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Fri, 28 Feb 2025 16:06:47 +1100 Subject: [PATCH 05/18] preserve thinking context across turns --- .../composer-fields/persona-llm-selector.gjs | 14 ++--- config/locales/server.en.yml | 1 + lib/ai_bot/bot.rb | 52 ++++++++++++++++-- lib/ai_bot/playground.rb | 21 +++++++- lib/completions/endpoints/canned_response.rb | 9 +++- lib/completions/prompt_messages_builder.rb | 11 +++- lib/completions/thinking.rb | 4 ++ spec/lib/modules/ai_bot/playground_spec.rb | 53 +++++++++++++++++++ 8 files changed, 151 insertions(+), 14 deletions(-) diff --git a/assets/javascripts/discourse/connectors/composer-fields/persona-llm-selector.gjs b/assets/javascripts/discourse/connectors/composer-fields/persona-llm-selector.gjs index 951d754ae..e3e82b4d6 100644 --- a/assets/javascripts/discourse/connectors/composer-fields/persona-llm-selector.gjs +++ b/assets/javascripts/discourse/connectors/composer-fields/persona-llm-selector.gjs @@ -168,12 +168,14 @@ export default class BotSelector extends Component { .filter((bot) => !bot.is_persona) .filter(Boolean); - return availableBots.map((bot) => { - return { - id: bot.id, - name: bot.display_name, - }; - }); + return availableBots + .map((bot) => { + return { + id: bot.id, + name: bot.display_name, + }; + }) + .sort((a, b) => a.name.localeCompare(b.name)); }
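
A note on how the pieces above fit together, as a minimal sketch (the prompt
text and variable names are illustrative, and `llm` is assumed to be a
DiscourseAi::Completions::Llm instance obtained the same way as in the specs;
only APIs introduced in these patches are used):

  prompt = DiscourseAi::Completions::Prompt.new("You are a math tutor")
  prompt.push(type: :user, content: "What is 27 * 453?")

  text = +""
  thinking_blocks = []

  llm.generate(prompt, user: Discourse.system_user, output_thinking: true) do |partial, cancel|
    if partial.is_a?(DiscourseAi::Completions::Thinking)
      # deltas stream in with partial == true; the completed block arrives
      # with partial == false, carrying the full message plus its signature
      # (redacted blocks carry only a signature)
      thinking_blocks << partial unless partial.partial
    else
      text << partial
    end
  end

  # preserve reasoning context across turns (patch 05) by pushing the signed
  # thinking back into the prompt alongside the assistant reply
  thinking = thinking_blocks.find { |t| !t.redacted }
  redacted = thinking_blocks.find(&:redacted)

  prompt.push(
    type: :model,
    content: text,
    thinking: thinking&.message,
    thinking_signature: thinking&.signature,
    redacted_thinking_signature: redacted&.signature,
  )

The signatures are opaque values the API expects back verbatim: on the next
request the Claude dialect maps thinking/thinking_signature to a "thinking"
content block and redacted_thinking_signature to a "redacted_thinking" block.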