Commit

Screen sharing in websocket example (#363)
Copying @clementfarabet's addition to the SDK example from #359

* Formatting and note about selecting the mode
* Screen sharing mode
Giom-V authored Dec 18, 2024
1 parent 979da7c commit df002f0
Showing 2 changed files with 59 additions and 2 deletions.
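For readers skimming the diff below, the change boils down to a new "--mode" flag (camera by default, screen optional) that decides which capture coroutine the starter launches, e.g. python live_api_starter.py --mode screen. Here is a minimal sketch of that selection logic; it is not part of the commit, and the stub coroutines stand in for the starters' real camera and screen loops:

import argparse
import asyncio


# Hypothetical stand-ins for the starters' real capture coroutines.
async def get_frames():
    print("streaming camera frames to the model...")


async def get_screen():
    print("streaming screen captures to the model...")


parser = argparse.ArgumentParser()
parser.add_argument(
    "--mode",
    type=str,
    default="camera",
    help="pixels to stream from",
    choices=["camera", "screen"],
)
args = parser.parse_args()

# Same dispatch the updated run() performs inside its TaskGroup.
if args.mode == "camera":
    asyncio.run(get_frames())
elif args.mode == "screen":
    asyncio.run(get_screen())

Running with no flag keeps the previous camera behaviour, so existing invocations are unaffected.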
gemini-2/live_api_starter.py — 12 changes: 11 additions & 1 deletion
@@ -18,6 +18,9 @@
# And to run this script, ensure the GOOGLE_API_KEY environment
# variable is set to the key you obtained from Google AI Studio.

# Add "--mode screen" if you want to share your screen with the model
# instead of your camera stream.

import asyncio
import base64
import io
@@ -33,13 +36,20 @@
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mode', type=str, default='camera', help='pixels to stream from', choices=['camera', 'screen'])
parser.add_argument(
    "--mode",
    type=str,
    default="camera",
    help="pixels to stream from",
    choices=["camera", "screen"],
)
args = parser.parse_args()

from google import genai

if sys.version_info < (3, 11, 0):
    import taskgroup, exceptiongroup

    asyncio.TaskGroup = taskgroup.TaskGroup
    asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup

gemini-2/websockets/live_api_starter.py — 49 changes: 48 additions & 1 deletion
@@ -18,6 +18,9 @@
# And to run this script, ensure the GOOGLE_API_KEY environment
# variable is set to the key you obtained from Google AI Studio.

# Add "--mode screen" if you want to share your screen with the model
# instead of your camera stream.

import asyncio
import base64
import json
@@ -29,11 +32,24 @@
import cv2
import pyaudio
import PIL.Image
import mss
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--mode",
    type=str,
    default="camera",
    help="pixels to stream from",
    choices=["camera", "screen"],
)
args = parser.parse_args()

from websockets.asyncio.client import connect

if sys.version_info < (3, 11, 0):
    import taskgroup, exceptiongroup

    asyncio.TaskGroup = taskgroup.TaskGroup
    asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup

@@ -46,6 +62,8 @@
host = "generativelanguage.googleapis.com"
model = "gemini-2.0-flash-exp"

MODE = args.mode

api_key = os.environ["GOOGLE_API_KEY"]
uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"

@@ -119,6 +137,32 @@ async def get_frames(self):
        # Release the VideoCapture object
        cap.release()

    def _get_screen(self):
        # Grab the whole virtual screen with mss and return it as a
        # base64-encoded JPEG payload for the outgoing queue.
        sct = mss.mss()
        monitor = sct.monitors[0]

        i = sct.grab(monitor)
        mime_type = "image/jpeg"
        image_bytes = mss.tools.to_png(i.rgb, i.size)
        img = PIL.Image.open(io.BytesIO(image_bytes))

        image_io = io.BytesIO()
        img.save(image_io, format="jpeg")
        image_io.seek(0)

        image_bytes = image_io.read()
        return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}

    async def get_screen(self):
        # Capture a screen frame roughly once per second and queue it for sending.
        while True:
            frame = await asyncio.to_thread(self._get_screen)
            if frame is None:
                break

            await asyncio.sleep(1.0)

            await self.out_queue.put(frame)

    async def send_realtime(self):
        while True:
            msg = await self.out_queue.get()
@@ -210,7 +254,10 @@ async def run(self):

            tg.create_task(self.send_realtime())
            tg.create_task(self.listen_audio())
            tg.create_task(self.get_frames())
            if MODE == "camera":
                tg.create_task(self.get_frames())
            elif MODE == "screen":
                tg.create_task(self.get_screen())
            tg.create_task(self.receive_audio())
            tg.create_task(self.play_audio())

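For anyone who wants to sanity-check the screen-capture path outside the starter, below is a minimal standalone sketch of the same grab-and-encode flow the new _get_screen helper uses (mss screenshot of the virtual screen, PNG bytes opened with PIL, re-encoded as JPEG, then base64). It is illustrative only, not part of the commit, and the variable names are the editor's own:

import base64
import io

import mss
import mss.tools
import PIL.Image

with mss.mss() as sct:
    # monitors[0] is the combined virtual screen spanning every display.
    shot = sct.grab(sct.monitors[0])
    # mss returns raw RGB pixels; to_png() wraps them in PNG bytes PIL can open.
    png_bytes = mss.tools.to_png(shot.rgb, shot.size)

img = PIL.Image.open(io.BytesIO(png_bytes))

# Re-encode as JPEG and base64-encode it, matching the payload shape the starter queues.
buf = io.BytesIO()
img.save(buf, format="jpeg")
payload = {
    "mime_type": "image/jpeg",
    "data": base64.b64encode(buf.getvalue()).decode(),
}
print(payload["mime_type"], len(payload["data"]))

As in the helper, the intermediate PNG step is simply a convenient way to hand mss's raw RGB pixels to PIL before re-encoding them as the JPEG the payload carries.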