diff --git a/gemini-2/live_api_starter.py b/gemini-2/live_api_starter.py index 48e560d69..df8ff0964 100755 --- a/gemini-2/live_api_starter.py +++ b/gemini-2/live_api_starter.py @@ -18,6 +18,9 @@ # And to run this script, ensure the GOOGLE_API_KEY environment # variable is set to the key you obtained from Google AI Studio. +# Add the "--mode screen" if you want to share your screen to the model +# instead of your camera stream + import asyncio import base64 import io @@ -33,13 +36,20 @@ import argparse parser = argparse.ArgumentParser() -parser.add_argument('--mode', type=str, default='camera', help='pixels to stream from', choices=['camera', 'screen']) +parser.add_argument( + "--mode", + type=str, + default="camera", + help="pixels to stream from", + choices=["camera", "screen"], +) args = parser.parse_args() from google import genai if sys.version_info < (3, 11, 0): import taskgroup, exceptiongroup + asyncio.TaskGroup = taskgroup.TaskGroup asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup diff --git a/gemini-2/websockets/live_api_starter.py b/gemini-2/websockets/live_api_starter.py index 2227d2c15..42b4042aa 100755 --- a/gemini-2/websockets/live_api_starter.py +++ b/gemini-2/websockets/live_api_starter.py @@ -18,6 +18,9 @@ # And to run this script, ensure the GOOGLE_API_KEY environment # variable is set to the key you obtained from Google AI Studio. +# Add the "--mode screen" if you want to share your screen to the model +# instead of your camera stream + import asyncio import base64 import json @@ -29,11 +32,24 @@ import cv2 import pyaudio import PIL.Image +import mss +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument( + "--mode", + type=str, + default="camera", + help="pixels to stream from", + choices=["camera", "screen"], +) +args = parser.parse_args() from websockets.asyncio.client import connect if sys.version_info < (3, 11, 0): import taskgroup, exceptiongroup + asyncio.TaskGroup = taskgroup.TaskGroup asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup @@ -46,6 +62,8 @@ host = "generativelanguage.googleapis.com" model = "gemini-2.0-flash-exp" +MODE = args.mode + api_key = os.environ["GOOGLE_API_KEY"] uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}" @@ -119,6 +137,32 @@ async def get_frames(self): # Release the VideoCapture object cap.release() + def _get_screen(self): + sct = mss.mss() + monitor = sct.monitors[0] + + i = sct.grab(monitor) + mime_type = "image/jpeg" + image_bytes = mss.tools.to_png(i.rgb, i.size) + img = PIL.Image.open(io.BytesIO(image_bytes)) + + image_io = io.BytesIO() + img.save(image_io, format="jpeg") + image_io.seek(0) + + image_bytes = image_io.read() + return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()} + + async def get_screen(self): + while True: + frame = await asyncio.to_thread(self._get_screen) + if frame is None: + break + + await asyncio.sleep(1.0) + + await self.out_queue.put(frame) + async def send_realtime(self): while True: msg = await self.out_queue.get() @@ -210,7 +254,10 @@ async def run(self): tg.create_task(self.send_realtime()) tg.create_task(self.listen_audio()) - tg.create_task(self.get_frames()) + if MODE == "camera": + tg.create_task(self.get_frames()) + elif MODE == "screen": + tg.create_task(self.get_screen()) tg.create_task(self.receive_audio()) tg.create_task(self.play_audio())