diff --git a/quickstarts/Video.ipynb b/quickstarts/Video.ipynb index 55d1f3b72..71d90388b 100644 --- a/quickstarts/Video.ipynb +++ b/quickstarts/Video.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "cellView": "form", "id": "tuOe1ymfHZPu" @@ -64,28 +64,18 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { - "id": "qLuL9m7KhvxR" + "id": "TwpXMTnpsoHC" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m146.8/146.8 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.5/664.5 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h" - ] - } - ], + "outputs": [], "source": [ - "!pip install -U -q google-generativeai" + "!pip install -U google-generative-ai" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "ATIbQM0NHhkj" }, @@ -118,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "id": "d6lYXRcjthKV" }, @@ -135,7 +125,7 @@ "id": "MNvhBdoDFnTC" }, "source": [ - "## Extract frames\n", + "## Upload the file\n", "\n", "The Gemini API currently does not support video files directly. Instead, you can provide a series of timestamps and image files.\n", "\n", @@ -146,216 +136,83 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 34, "metadata": { "id": "_HzrDdp2Q1Cu" }, - "outputs": [], - "source": [ - "video_file_name = \"https://storage.googleapis.com/generativeai-downloads/data/SherlockJr._10min.mp4\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4_2KHEawhDD7" - }, - "source": [ - "Use OpenCV to extract image frames from the video at 1 frame per second." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "jW9ilYCdL99M" - }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Extracting https://storage.googleapis.com/generativeai-downloads/data/SherlockJr._10min.mp4 at 1 frame per second. This might take a bit...\n", - "Completed video frame extraction!\n", - "\n", - "Extracted: 600 frames\n" + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "100 594M 100 594M 0 0 88.1M 0 0:00:06 0:00:06 --:--:-- 79.5M\n" ] } ], "source": [ - "import cv2\n", - "import os\n", - "import shutil\n", - "\n", - "# Create or cleanup existing extracted image frames directory.\n", - "FRAME_EXTRACTION_DIRECTORY = \"/content/frames\"\n", - "FRAME_PREFIX = \"_frame\"\n", - "def create_frame_output_dir(output_dir):\n", - " if not os.path.exists(output_dir):\n", - " os.makedirs(output_dir)\n", - " else:\n", - " shutil.rmtree(output_dir)\n", - " os.makedirs(output_dir)\n", - "\n", - "def extract_frame_from_video(video_file_path):\n", - " print(f\"Extracting {video_file_path} at 1 frame per second. This might take a bit...\")\n", - " create_frame_output_dir(FRAME_EXTRACTION_DIRECTORY)\n", - " vidcap = cv2.VideoCapture(video_file_path)\n", - " fps = vidcap.get(cv2.CAP_PROP_FPS)\n", - " frame_duration = 1 / fps # Time interval between frames (in seconds)\n", - " output_file_prefix = os.path.basename(video_file_path).replace('.', '_')\n", - " frame_count = 0\n", - " count = 0\n", - " while vidcap.isOpened():\n", - " success, frame = vidcap.read()\n", - " if not success: # End of video\n", - " break\n", - " if int(count / fps) == frame_count: # Extract a frame every second\n", - " min = frame_count // 60\n", - " sec = frame_count % 60\n", - " time_string = f\"{min:02d}:{sec:02d}\"\n", - " image_name = f\"{output_file_prefix}{FRAME_PREFIX}{time_string}.jpg\"\n", - " output_filename = os.path.join(FRAME_EXTRACTION_DIRECTORY, image_name)\n", - " cv2.imwrite(output_filename, frame)\n", - " frame_count += 1\n", - " count += 1\n", - " vidcap.release() # Release the capture object\\n\",\n", - " print(f\"Completed video frame extraction!\\n\\nExtracted: {frame_count} frames\")\n", - "\n", - "extract_frame_from_video(video_file_name)" + "!curl -o video.mp4 \"https://storage.googleapis.com/generativeai-downloads/data/SherlockJr._10min.mp4\"" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 35, "metadata": { - "id": "c-z4zsCUlaru" + "id": "kDUIqno-FTqk" }, + "outputs": [], "source": [ - "## Upload frames using the File API\n", - "\n", - "Once we have the frames extracted, we are ready to upload the frames to the API.\n", - "\n", - "The File API accepts files under 2GB in size and can store up to 20GB of files per project. Files last for 2 days and cannot be downloaded from the API.\n", - "\n", - "We will just upload 10 frames so this example runs quickly. You can modify the code below to upload the entire video." + "f = genai.upload_file(\"video.mp4\")" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 36, "metadata": { - "id": "JSd4s0YygV9r" + "id": "chSGe3Aict17" }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Uploading 10 files. This might take a bit...\n", - "Uploading: /content/frames/SherlockJr__10min_mp4_frame00:40.jpg...\n", - "Uploading: /content/frames/SherlockJr__10min_mp4_frame00:41.jpg...\n", - "Uploading: /content/frames/SherlockJr__10min_mp4_frame00:42.jpg...\n", - "Uploading: /content/frames/SherlockJr__10min_mp4_frame00:43.jpg...\n", - "Uploading: /content/frames/SherlockJr__10min_mp4_frame00:44.jpg...\n", - "Uploading: /content/frames/SherlockJr__10min_mp4_frame00:45.jpg...\n", - "Uploading: /content/frames/SherlockJr__10min_mp4_frame00:46.jpg...\n", - "Uploading: /content/frames/SherlockJr__10min_mp4_frame00:47.jpg...\n", - "Uploading: /content/frames/SherlockJr__10min_mp4_frame00:48.jpg...\n", - "Uploading: /content/frames/SherlockJr__10min_mp4_frame00:49.jpg...\n", - "Completed file uploads!\n", - "\n", - "Uploaded: 10 files\n" - ] + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'files/hgxvpmisxbn9'" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "import os\n", - "\n", - "class File:\n", - " def __init__(self, file_path: str, display_name: str = None):\n", - " self.file_path = file_path\n", - " if display_name:\n", - " self.display_name = display_name\n", - " self.timestamp = get_timestamp(file_path)\n", - "\n", - " def set_file_response(self, response):\n", - " self.response = response\n", - "\n", - "def get_timestamp(filename):\n", - " \"\"\"Extracts the frame count (as an integer) from a filename with the format\n", - " 'output_file_prefix_frame00:00.jpg'.\n", - " \"\"\"\n", - " parts = filename.split(FRAME_PREFIX)\n", - " if len(parts) != 2:\n", - " return None # Indicates the filename might be incorrectly formatted\n", - " return parts[1].split('.')[0]\n", - "\n", - "# Process each frame in the output directory\n", - "files = os.listdir(FRAME_EXTRACTION_DIRECTORY)\n", - "files = sorted(files)\n", - "files_to_upload = []\n", - "for file in files:\n", - " files_to_upload.append(\n", - " File(file_path=os.path.join(FRAME_EXTRACTION_DIRECTORY, file)))\n", - "\n", - "# Upload the files to the API\n", - "# Only upload a 10 second slice of files to reduce upload time.\n", - "# Change full_video to True to upload the whole video.\n", - "full_video = False\n", - "\n", - "uploaded_files = []\n", - "print(f'Uploading {len(files_to_upload) if full_video else 10} files. This might take a bit...')\n", - "\n", - "for file in files_to_upload if full_video else files_to_upload[40:50]:\n", - " print(f'Uploading: {file.file_path}...')\n", - " response = genai.upload_file(path=file.file_path)\n", - " file.set_file_response(response)\n", - " uploaded_files.append(file)\n", - "\n", - "print(f\"Completed file uploads!\\n\\nUploaded: {len(uploaded_files)} files\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oOZmTUb4FWOa" - }, - "source": [ - "## List Files\n", - "\n", - "After uploading the file, you can verify the API has successfully received the files by calling `files.list`.\n", - "\n", - "`files.list` lets you see all files that have been uploaded to the File API that are associated with the Cloud project your API key belongs to. Only the `name` (and by extension, the `uri`) are unique." + "f.name" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 37, "metadata": { - "id": "SHMVCWHkFhJW" + "id": "dA9mtftdFcfq" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "https://generativelanguage.googleapis.com/v1beta/files/5ea7iyxfz49g\n", - "https://generativelanguage.googleapis.com/v1beta/files/g4bug5rmphkz\n", - "https://generativelanguage.googleapis.com/v1beta/files/xt2qoh3x0in5\n", - "https://generativelanguage.googleapis.com/v1beta/files/pibycmgydtwl\n", - "https://generativelanguage.googleapis.com/v1beta/files/q477xj4wnisv\n", - "https://generativelanguage.googleapis.com/v1beta/files/hldcyzswc7yh\n", - "https://generativelanguage.googleapis.com/v1beta/files/j0fqkxto51af\n", - "https://generativelanguage.googleapis.com/v1beta/files/cpq3lorfi4jm\n", - "https://generativelanguage.googleapis.com/v1beta/files/ube15rpb295f\n", - "https://generativelanguage.googleapis.com/v1beta/files/60x22ejdo34p\n" + "...." ] } ], "source": [ - "# List files uploaded in the API\n", - "for n, f in zip(range(len(uploaded_files)), genai.list_files()):\n", - " print(f.uri)" + "import time\n", + "\n", + "while f.state.name == \"PROCESSING\":\n", + " print('.', end='')\n", + " time.sleep(10)\n", + " f = genai.get_file(f.name)" ] }, { @@ -368,29 +225,31 @@ "\n", "After the file has been uploaded, you can make `GenerateContent` requests that reference the File API URI.\n", "\n", - "To understand videos with Gemini 1.5 Pro, provide 2 consecutive `Part`s for each frame: a `text` part with the **timestamp** and `fileData` part with the frame's **image URI**:\n", + "`GenerateContent` expects a list of inputs. It can operate in two modes either:\n", + "\n", + "1. If the input appears to be a list of `Part` object, it will pack them into a single `Content` object (representing one conversation turn).\n", + "2. If the input appears to be a list of `Content` objects each represents one turn of the conversation.\n", "\n", - "```\n", - "part { text = \"00:00\" }\n", - "part { fileData = fileData {\n", - " fileUri = \"https://generativelanguage.googleapis.com/v1/files/frame-0\"\n", - " mimeType = \"image/jpeg\"\n", - "}}\n", - "```" + "For now, video file references can't be mixed into a list of parts. They need to be contained in their own `Content` object, see below. `generate_content([file, 'describe this file'])` would be valid for any other media type." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 38, "metadata": { - "id": "ZYVFqmLkl5nE" + "id": "rKkc5bH5ct18" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Two men in suits and bowler hats are walking down a dirt road. They are walking in the same direction as the cars that are parked on the side of the road. There are houses and other buildings in the background. The men cross a set of railroad tracks. One of the men opens the door of a boxcar and climbs in, while the other man waves goodbye. In the background, there are oil derricks.\n" + "The video is a black and white silent movie featuring a detective who is also a film projector operator.\n", + "\n", + "The movie begins with a detective reading a note, after which he chases a suspect. He pursues him onto a moving train, then ultimately finds himself stranded on a water tower as the train pulls away. He falls off the water tower and lands directly beneath the water spout. He then gives up on the case and returns to his job as a film projector operator. \n", + "\n", + "As the projectionist, he falls asleep at the controls, interrupting the movie on-screen. This creates an interesting effect that makes it look like the characters in the film are interacting with the projectionist. He even enters the world of the film on screen and ends up in a brawl with one of the actors. The fight is broken up when the lead actor of the movie calls for a detective, \"Sherlock Jr.\" The video ends with the projectionist being called into work. \n", + "\n" ] } ], @@ -401,18 +260,9 @@ "# Set the model to Gemini 1.5 Pro.\n", "model = genai.GenerativeModel(model_name=\"models/gemini-1.5-pro-latest\")\n", "\n", - "# Make GenerateContent request with the structure described above.\n", - "def make_request(prompt, files):\n", - " request = [prompt]\n", - " for file in files:\n", - " request.append(file.timestamp)\n", - " request.append(file.response)\n", - " return request\n", - "\n", - "# Make the LLM request.\n", - "request = make_request(prompt, uploaded_files)\n", - "response = model.generate_content(request,\n", - " request_options={\"timeout\": 600})\n", + "response = model.generate_content(\n", + " [f, \"Describe this video.\"],\n", + " request_options={\"timeout\": 600})\n", "print(response.text)" ] }, @@ -429,38 +279,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { - "id": "d4eO8ZXoKdZf" + "id": "ggoi6wibct18" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Deleting 10 images. This might take a bit...\n", - "Deleted /content/frames/SherlockJr__10min_mp4_frame00:40.jpg at URI https://generativelanguage.googleapis.com/v1beta/files/60x22ejdo34p\n", - "Deleted /content/frames/SherlockJr__10min_mp4_frame00:41.jpg at URI https://generativelanguage.googleapis.com/v1beta/files/ube15rpb295f\n", - "Deleted /content/frames/SherlockJr__10min_mp4_frame00:42.jpg at URI https://generativelanguage.googleapis.com/v1beta/files/cpq3lorfi4jm\n", - "Deleted /content/frames/SherlockJr__10min_mp4_frame00:43.jpg at URI https://generativelanguage.googleapis.com/v1beta/files/j0fqkxto51af\n", - "Deleted /content/frames/SherlockJr__10min_mp4_frame00:44.jpg at URI https://generativelanguage.googleapis.com/v1beta/files/hldcyzswc7yh\n", - "Deleted /content/frames/SherlockJr__10min_mp4_frame00:45.jpg at URI https://generativelanguage.googleapis.com/v1beta/files/q477xj4wnisv\n", - "Deleted /content/frames/SherlockJr__10min_mp4_frame00:46.jpg at URI https://generativelanguage.googleapis.com/v1beta/files/pibycmgydtwl\n", - "Deleted /content/frames/SherlockJr__10min_mp4_frame00:47.jpg at URI https://generativelanguage.googleapis.com/v1beta/files/xt2qoh3x0in5\n", - "Deleted /content/frames/SherlockJr__10min_mp4_frame00:48.jpg at URI https://generativelanguage.googleapis.com/v1beta/files/g4bug5rmphkz\n", - "Deleted /content/frames/SherlockJr__10min_mp4_frame00:49.jpg at URI https://generativelanguage.googleapis.com/v1beta/files/5ea7iyxfz49g\n", - "Completed deleting files!\n", - "\n", - "Deleted: 10 files\n" - ] - } - ], + "outputs": [], "source": [ - "print(f'Deleting {len(uploaded_files)} images. This might take a bit...')\n", - "for file in uploaded_files:\n", - " genai.delete_file(file.response.name)\n", - " print(f'Deleted {file.file_path} at URI {file.response.uri}')\n", - "print(f\"Completed deleting files!\\n\\nDeleted: {len(uploaded_files)} files\")" + "f.delete()" ] }, {