diff --git a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceVisualQuestionAnsweringRemoteInferencePromptSchema.ts b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceVisualQuestionAnsweringRemoteInferencePromptSchema.ts
new file mode 100644
index 000000000..30fe8cfe7
--- /dev/null
+++ b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceVisualQuestionAnsweringRemoteInferencePromptSchema.ts
@@ -0,0 +1,42 @@
+import { PromptSchema } from "../../utils/promptUtils";
+
+export const HuggingFaceVisualQuestionAnsweringRemoteInferencePromptSchema: PromptSchema =
+  {
+    // See https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/inference/_client.py#L1780 for supported params.
+    // The settings below are supported settings specified in the HuggingFaceVisualQuestionAnsweringRemoteInference refine_completion_params implementation.
+    input: {
+      type: "object",
+      required: ["attachments", "data"],
+      properties: {
+        attachments: {
+          type: "array",
+          items: {
+            type: "attachment",
+            required: ["data"],
+            mime_types: ["image/*"],
+            properties: {
+              data: {
+                type: "string",
+              },
+            },
+          },
+          max_items: 1,
+        },
+        data: {
+          // The question to ask about the image
+          type: "string",
+        },
+      },
+    },
+    model_settings: {
+      type: "object",
+      properties: {
+        model: {
+          type: "string",
+          description: `Hugging Face model to use. Can be a model ID hosted on the Hugging Face Hub or a URL
+            to a deployed Inference Endpoint`,
+          default: "dandelin/vilt-b32-finetuned-vqa",
+        },
+      },
+    },
+  };
diff --git a/python/src/aiconfig/editor/client/src/utils/promptUtils.ts b/python/src/aiconfig/editor/client/src/utils/promptUtils.ts
index 3c0fdd520..034824d9e 100644
--- a/python/src/aiconfig/editor/client/src/utils/promptUtils.ts
+++ b/python/src/aiconfig/editor/client/src/utils/promptUtils.ts
@@ -22,10 +22,10 @@ import { HuggingFaceTextGenerationRemoteInferencePromptSchema } from "../shared/
 import { HuggingFaceTextSummarizationRemoteInferencePromptSchema } from "../shared/prompt_schemas/HuggingFaceTextSummarizationRemoteInferencePromptSchema";
 import { HuggingFaceTextTranslationRemoteInferencePromptSchema } from "../shared/prompt_schemas/HuggingFaceTextTranslationRemoteInferencePromptSchema";
 import { HuggingFaceImage2TextRemoteInferencePromptSchema } from "../shared/prompt_schemas/HuggingFaceImage2TextRemoteInferencePromptSchema";
+import { HuggingFaceVisualQuestionAnsweringRemoteInferencePromptSchema } from "../shared/prompt_schemas/HuggingFaceVisualQuestionAnsweringRemoteInferencePromptSchema";
 import { ClaudeBedrockPromptSchema } from "../shared/prompt_schemas/ClaudeBedrockPromptSchema";
 import { HuggingFaceConversationalRemoteInferencePromptSchema } from "../shared/prompt_schemas/HuggingFaceConversationalRemoteInferencePromptSchema";
-
 /**
  * Get the name of the model for the specified prompt. The name will either be specified in the prompt's
  * model metadata, or as the default_model in the aiconfig metadata
@@ -117,6 +117,9 @@ export const PROMPT_SCHEMAS: Record<string, PromptSchema> = {
   HuggingFaceTextTranslationRemoteInference:
     HuggingFaceTextTranslationRemoteInferencePromptSchema,
 
+  HuggingFaceVisualQuestionAnsweringRemoteInference:
+    HuggingFaceVisualQuestionAnsweringRemoteInferencePromptSchema,
+
   // PaLMTextParser
   "models/text-bison-001": PaLMTextParserPromptSchema,
 
@@ -162,6 +165,8 @@ export const PROMPT_SCHEMAS: Record<string, PromptSchema> = {
   Summarization: HuggingFaceTextSummarizationRemoteInferencePromptSchema,
   Translation: HuggingFaceTextTranslationRemoteInferencePromptSchema,
   Conversational: HuggingFaceConversationalRemoteInferencePromptSchema,
+  "Visual Question Answering":
+    HuggingFaceVisualQuestionAnsweringRemoteInferencePromptSchema,
   "Automatic Speech Recognition (Local)":
     HuggingFaceAutomaticSpeechRecognitionPromptSchema,
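For reviewers, here is a minimal sketch of an input object that would satisfy the new schema's `input` section. The `VqaAttachment`/`VqaInput` interfaces and the `examplePrompt` value are hypothetical illustrations, not part of this PR; the field names (`attachments`, `data`) and the single-image constraint (`max_items: 1`) come from the schema above.

```typescript
// Hypothetical types mirroring the schema's `input` section (not in this PR).
interface VqaAttachment {
  data: string; // image payload, e.g. a URL or base64 string; must match mime type "image/*"
}

interface VqaInput {
  attachments: VqaAttachment[]; // the schema allows at most one attachment (max_items: 1)
  data: string; // the question to ask about the image
}

// Example input: one image plus a question about it.
const examplePrompt: VqaInput = {
  attachments: [{ data: "https://example.com/cat.png" }],
  data: "What animal is in this picture?",
};

console.log(JSON.stringify(examplePrompt, null, 2));
```

Per the comment in the new schema file, the Python parser's `refine_completion_params` implementation is expected to forward the `model` setting to the Hugging Face Inference API, falling back to `dandelin/vilt-b32-finetuned-vqa` when no model is specified.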