From a04fcd185eefeb13955f0468c86c927d52436590 Mon Sep 17 00:00:00 2001 From: Vishal Dharmadhikari <61256217+vishal-dharm@users.noreply.github.com> Date: Sat, 16 Nov 2024 06:05:35 -0800 Subject: [PATCH] Fix 'argument list too long' error and add couple vision examples (#634) --- samples/rest/text_generation.sh | 144 ++++++++++++++++++++++++++------ 1 file changed, 119 insertions(+), 25 deletions(-) mode change 100644 => 100755 samples/rest/text_generation.sh diff --git a/samples/rest/text_generation.sh b/samples/rest/text_generation.sh old mode 100644 new mode 100755 index 617f7d136..8cfadd688 --- a/samples/rest/text_generation.sh +++ b/samples/rest/text_generation.sh @@ -4,6 +4,7 @@ SCRIPT_DIR=$(dirname "$0") MEDIA_DIR=$(realpath ${SCRIPT_DIR}/../../third_party) IMG_PATH=${MEDIA_DIR}/organ.jpg +IMG_PATH2=${MEDIA_DIR}/Cajun_instruments.jpg AUDIO_PATH=${MEDIA_DIR}/sample.mp3 VIDEO_PATH=${MEDIA_DIR}/Big_Buck_Bunny.mp4 PDF_PATH=${MEDIA_DIR}/test.pdf @@ -38,43 +39,136 @@ curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:s echo "[START text_gen_multimodal_one_image_prompt]" # [START text_gen_multimodal_one_image_prompt] +# Use a temporary file to hold the base64 encoded image data +TEMP_B64=$(mktemp) +trap 'rm -f "$TEMP_B64"' EXIT +base64 $B64FLAGS $IMG_PATH > "$TEMP_B64" + +# Use a temporary file to hold the JSON payload +TEMP_JSON=$(mktemp) +trap 'rm -f "$TEMP_JSON"' EXIT + +cat > "$TEMP_JSON" << EOF +{ + "contents": [{ + "parts":[ + {"text": "Tell me about this instrument"}, + { + "inline_data": { + "mime_type":"image/jpeg", + "data": "$(cat "$TEMP_B64")" + } + } + ] + }] +} +EOF + curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key=$GOOGLE_API_KEY" \ -H 'Content-Type: application/json' \ -X POST \ - -d '{ - "contents": [{ - "parts":[ - {"text": "Tell me about this instrument"}, - { - "inline_data": { - "mime_type":"image/jpeg", - "data": "'$(base64 $B64FLAGS $IMG_PATH)'" - } - } - ] - }] - }' 2> /dev/null + -d "@$TEMP_JSON" 2> /dev/null # [END text_gen_multimodal_one_image_prompt] echo "[START text_gen_multimodal_one_image_prompt_streaming]" # [START text_gen_multimodal_one_image_prompt_streaming] +cat > "$TEMP_JSON" << EOF +{ + "contents": [{ + "parts":[ + {"text": "Tell me about this instrument"}, + { + "inline_data": { + "mime_type":"image/jpeg", + "data": "$(cat "$TEMP_B64")" + } + } + ] + }] +} +EOF + curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:streamGenerateContent?alt=sse&key=$GOOGLE_API_KEY" \ -H 'Content-Type: application/json' \ -X POST \ - -d '{ - "contents": [{ + -d "@$TEMP_JSON" 2> /dev/null +# [END text_gen_multimodal_one_image_prompt_streaming] + +echo "[START text_gen_multimodal_two_image_prompt]" +# [START text_gen_multimodal_two_image_prompt] +# Base64 encode both images into temporary files +TEMP_B64_1=$(mktemp) +TEMP_B64_2=$(mktemp) +trap 'rm -f "$TEMP_B64_1" "$TEMP_B64_2"' EXIT +base64 $B64FLAGS "$IMG_PATH" > "$TEMP_B64_1" +base64 $B64FLAGS "$IMG_PATH2" > "$TEMP_B64_2" + +# Create the JSON payload using the base64 data from both images +cat > "$TEMP_JSON" << EOF +{ + "contents": [{ "parts":[ - {"text": "Tell me about this instrument"}, { - "inline_data": { - "mime_type":"image/jpeg", - "data": "'$(base64 $B64FLAGS $IMG_PATH)'" - } + "inline_data": { + "mime_type": "image/jpeg", + "data": "$(cat "$TEMP_B64_1")" + } + }, + { + "inline_data": { + "mime_type": "image/jpeg", + "data": "$(cat "$TEMP_B64_2")" + } + }, + { + "text": "Generate a list of all the objects contained in both images." } ] - }] - }' 2> /dev/null -# [END text_gen_multimodal_one_image_prompt_streaming] + }] +} +EOF + +# Make the API request using the JSON file +curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key=$GOOGLE_API_KEY" \ + -H 'Content-Type: application/json' \ + -X POST \ + -d "@$TEMP_JSON" 2> /dev/null > response.json + +# Display the response +cat response.json +# [END text_gen_multimodal_two_image_prompt] + +echo "[START text_gen_multimodal_one_image_bounding_box_prompt]" +# [START text_gen_multimodal_one_image_bounding_box_prompt] +# Re-use TEMP_B64_2 (from the previous two-image prompt) and TEMP_JSON + +# Create the JSON payload for bounding box detection +cat > "$TEMP_JSON" << EOF +{ + "contents": [{ + "parts":[ + { + "inline_data": { + "mime_type": "image/jpeg", + "data": "$(cat "$TEMP_B64_2")" + } + }, + { + "text": "Generate bounding boxes for each of the objects in this image in [y_min, x_min, y_max, x_max] format." + } + ] + }] +} +EOF + +# Make the API request using the JSON file +curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?key=$GOOGLE_API_KEY" \ + -H 'Content-Type: application/json' \ + -X POST \ + -d "@$TEMP_JSON" 2> /dev/null > response.json + +cat response.json +# [END text_gen_multimodal_one_image_bounding_box_prompt] echo "[START text_gen_multimodal_audio]" # [START text_gen_multimodal_audio] @@ -184,7 +278,7 @@ DISPLAY_NAME=VIDEO # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "${BASE_URL}/upload/v1beta/files?key=${GOOGLE_API_KEY}" \ - -D upload-header.tmp \ + -D "${tmp_header_file}" \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \ @@ -226,7 +320,7 @@ curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:g -d '{ "contents": [{ "parts":[ - {"text": "Please describe this file."}, + {"text": "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions."}, {"file_data":{"mime_type": "video/mp4", "file_uri": '$file_uri'}}] }] }' 2> /dev/null > response.json