Commit

Screen sharing in websocket example (#363)
Copying @clementfarabet's addition to the SDK example from #359

* Formatting and note about selecting the mode
* Screen sharing mode
Giom-V authored Dec 18, 2024
1 parent 979da7c commit df002f0
Showing 2 changed files with 59 additions and 2 deletions.
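For readers skimming the diff below, the change boils down to a new "--mode" flag (camera by default, screen optional) that decides which capture coroutine the starter launches, e.g. python live_api_starter.py --mode screen. Here is a minimal sketch of that selection logic; it is not part of the commit, and the stub coroutines stand in for the starters' real camera and screen loops:

import argparse
import asyncio


# Hypothetical stand-ins for the starters' real capture coroutines.
async def get_frames():
    print("streaming camera frames to the model...")


async def get_screen():
    print("streaming screen captures to the model...")


parser = argparse.ArgumentParser()
parser.add_argument(
    "--mode",
    type=str,
    default="camera",
    help="pixels to stream from",
    choices=["camera", "screen"],
)
args = parser.parse_args()

# Same dispatch the updated run() performs inside its TaskGroup.
if args.mode == "camera":
    asyncio.run(get_frames())
elif args.mode == "screen":
    asyncio.run(get_screen())

Running with no flag keeps the previous camera behaviour, so existing invocations are unaffected.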
gemini-2/live_api_starter.py — 12 changes: 11 additions & 1 deletion
@@ -18,6 +18,9 @@
# And to run this script, ensure the GOOGLE_API_KEY environment
# variable is set to the key you obtained from Google AI Studio.

# Add "--mode screen" if you want to share your screen with the model
# instead of your camera stream.

import asyncio
import base64
import io
@@ -33,13 +36,20 @@
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mode', type=str, default='camera', help='pixels to stream from', choices=['camera', 'screen'])
parser.add_argument(
    "--mode",
    type=str,
    default="camera",
    help="pixels to stream from",
    choices=["camera", "screen"],
)
args = parser.parse_args()

from google import genai

if sys.version_info < (3, 11, 0):
    import taskgroup, exceptiongroup

    asyncio.TaskGroup = taskgroup.TaskGroup
    asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup

gemini-2/websockets/live_api_starter.py — 49 changes: 48 additions & 1 deletion
@@ -18,6 +18,9 @@
# And to run this script, ensure the GOOGLE_API_KEY environment
# variable is set to the key you obtained from Google AI Studio.

# Add "--mode screen" if you want to share your screen with the model
# instead of your camera stream.

import asyncio
import base64
import json
@@ -29,11 +32,24 @@
import cv2
import pyaudio
import PIL.Image
import mss
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--mode",
    type=str,
    default="camera",
    help="pixels to stream from",
    choices=["camera", "screen"],
)
args = parser.parse_args()

from websockets.asyncio.client import connect

if sys.version_info < (3, 11, 0):
    import taskgroup, exceptiongroup

    asyncio.TaskGroup = taskgroup.TaskGroup
    asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup

@@ -46,6 +62,8 @@
host = "generativelanguage.googleapis.com"
model = "gemini-2.0-flash-exp"

MODE = args.mode

api_key = os.environ["GOOGLE_API_KEY"]
uri = f"wss://{host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"

@@ -119,6 +137,32 @@ async def get_frames(self):
        # Release the VideoCapture object
        cap.release()

    def _get_screen(self):
        # Grab the whole virtual screen with mss and return it as a
        # base64-encoded JPEG payload for the outgoing queue.
        sct = mss.mss()
        monitor = sct.monitors[0]

        i = sct.grab(monitor)
        mime_type = "image/jpeg"
        image_bytes = mss.tools.to_png(i.rgb, i.size)
        img = PIL.Image.open(io.BytesIO(image_bytes))

        image_io = io.BytesIO()
        img.save(image_io, format="jpeg")
        image_io.seek(0)

        image_bytes = image_io.read()
        return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}

    async def get_screen(self):
        # Capture a screen frame roughly once per second and queue it for sending.
        while True:
            frame = await asyncio.to_thread(self._get_screen)
            if frame is None:
                break

            await asyncio.sleep(1.0)

            await self.out_queue.put(frame)

    async def send_realtime(self):
        while True:
            msg = await self.out_queue.get()
@@ -210,7 +254,10 @@ async def run(self):

            tg.create_task(self.send_realtime())
            tg.create_task(self.listen_audio())
            tg.create_task(self.get_frames())
            if MODE == "camera":
                tg.create_task(self.get_frames())
            elif MODE == "screen":
                tg.create_task(self.get_screen())
            tg.create_task(self.receive_audio())
            tg.create_task(self.play_audio())

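For anyone who wants to sanity-check the screen-capture path outside the starter, below is a minimal standalone sketch of the same grab-and-encode flow the new _get_screen helper uses (mss screenshot of the virtual screen, PNG bytes opened with PIL, re-encoded as JPEG, then base64). It is illustrative only, not part of the commit, and the variable names are the editor's own:

import base64
import io

import mss
import mss.tools
import PIL.Image

with mss.mss() as sct:
    # monitors[0] is the combined virtual screen spanning every display.
    shot = sct.grab(sct.monitors[0])
    # mss returns raw RGB pixels; to_png() wraps them in PNG bytes PIL can open.
    png_bytes = mss.tools.to_png(shot.rgb, shot.size)

img = PIL.Image.open(io.BytesIO(png_bytes))

# Re-encode as JPEG and base64-encode it, matching the payload shape the starter queues.
buf = io.BytesIO()
img.save(buf, format="jpeg")
payload = {
    "mime_type": "image/jpeg",
    "data": base64.b64encode(buf.getvalue()).decode(),
}
print(payload["mime_type"], len(payload["data"]))

As in the helper, the intermediate PNG step is simply a convenient way to hand mss's raw RGB pixels to PIL before re-encoding them as the JPEG the payload carries.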