Skip to content

Commit

Permalink
add o1-with-ocr
Browse files Browse the repository at this point in the history
  • Loading branch information
joshbickett committed Dec 18, 2024
1 parent f370577 commit ac71d7a
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 10 deletions.
3 changes: 2 additions & 1 deletion operate/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def validation(self, model, voice_mode):
model == "gpt-4"
or voice_mode
or model == "gpt-4-with-som"
or model == "gpt-4-with-ocr",
or model == "gpt-4-with-ocr"
or model == "o1-with-ocr",
)
self.require_api_key(
"GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision"
Expand Down
120 changes: 119 additions & 1 deletion operate/models/apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ async def get_next_action(model, messages, objective, session_id):
if model == "gpt-4-with-ocr":
operation = await call_gpt_4o_with_ocr(messages, objective, model)
return operation, None
if model == "o1-with-ocr":
operation = await call_o1_with_ocr(messages, objective, model)
return operation, None
if model == "agent-1":
return "coming soon"
if model == "gemini-pro-vision":
Expand Down Expand Up @@ -231,7 +234,7 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
messages.append(vision_message)

response = client.chat.completions.create(
model="gpt-4o",
model="o1",
messages=messages,
temperature=0.7,
max_tokens=3000,
Expand Down Expand Up @@ -307,6 +310,121 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
return gpt_4_fallback(messages, objective, model)


async def call_o1_with_ocr(messages, objective, model):
    """Ask the o1 model for the next operation, resolving click targets via OCR.

    Captures a screenshot, appends it to the chat history as a vision message,
    asks the o1 chat-completions endpoint for a JSON list of operations, and
    for each "click" operation uses EasyOCR to translate the requested on-screen
    text into (x, y) coordinates.

    Args:
        messages: Mutable chat history; the vision message and the assistant
            reply are appended to it in place.
        objective: The user's overall objective, used to (re)build the system prompt.
        model: Model identifier string, used for prompt selection and error output.

    Returns:
        A list of operation dicts (click operations gain "x"/"y" keys), or the
        result of ``gpt_4_fallback`` if anything raises.
    """
    if config.verbose:
        print("[call_o1_with_ocr]")

    try:
        time.sleep(1)
        client = config.initialize_openai()

        confirm_system_prompt(messages, objective, model)
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Capture the screen including the cursor.
        capture_screen_with_cursor(screenshot_filename)

        with open(screenshot_filename, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        # First turn gets the objective-bearing prompt; later turns the short one.
        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        vision_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    # FIX: the screenshot is a PNG, so advertise the correct
                    # MIME type in the data URL (was image/jpeg).
                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                },
            ],
        }
        messages.append(vision_message)

        response = client.chat.completions.create(
            # FIX: this function previously sent model="gpt-4o" (the model
            # strings were swapped with call_gpt_4o_with_ocr). It must target
            # "o1". Per the OpenAI reasoning-model API, o1 does not accept
            # `temperature` and requires `max_completion_tokens` in place of
            # `max_tokens`, so those parameters are adjusted accordingly.
            model="o1",
            messages=messages,
            max_completion_tokens=3000,
        )

        content = response.choices[0].message.content

        content = clean_json(content)

        # Keep the raw JSON string for the assistant message appended below.
        content_str = content

        content = json.loads(content)

        processed_content = []

        # OCR is expensive: build the reader and scan the screenshot at most
        # once, even when the response contains several click operations —
        # they all refer to the same screenshot.
        ocr_result = None

        for operation in content:
            if operation.get("operation") == "click":
                text_to_click = operation.get("text")
                if config.verbose:
                    print(
                        "[call_o1_with_ocr][click] text_to_click",
                        text_to_click,
                    )
                if ocr_result is None:
                    # Initialize EasyOCR Reader and read the screenshot once.
                    reader = easyocr.Reader(["en"])
                    ocr_result = reader.readtext(screenshot_filename)

                text_element_index = get_text_element(
                    ocr_result, text_to_click, screenshot_filename
                )
                coordinates = get_text_coordinates(
                    ocr_result, text_element_index, screenshot_filename
                )

                # Attach the resolved screen coordinates to the operation.
                operation["x"] = coordinates["x"]
                operation["y"] = coordinates["y"]

                if config.verbose:
                    print(
                        "[call_o1_with_ocr][click] text_element_index",
                        text_element_index,
                    )
                    print(
                        "[call_o1_with_ocr][click] coordinates",
                        coordinates,
                    )
                    print(
                        "[call_o1_with_ocr][click] final operation",
                        operation,
                    )
                processed_content.append(operation)

            else:
                processed_content.append(operation)

        # Wait to append the assistant message so that if the
        # `processed_content` step fails we don't append a message and mess up
        # the message history.
        assistant_message = {"role": "assistant", "content": content_str}
        messages.append(assistant_message)

        return processed_content

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
        )
        if config.verbose:
            print("[Self-Operating Computer][Operate] error", e)
            traceback.print_exc()
        return gpt_4_fallback(messages, objective, model)


async def call_gpt_4o_labeled(messages, objective, model):
time.sleep(1)

Expand Down
11 changes: 3 additions & 8 deletions operate/models/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,20 +232,15 @@ def get_system_prompt(model, objective):
os_search_str=os_search_str,
operating_system=operating_system,
)
elif model == "gpt-4-with-ocr":
prompt = SYSTEM_PROMPT_OCR.format(
objective=objective,
cmd_string=cmd_string,
os_search_str=os_search_str,
operating_system=operating_system,
)
elif model == "claude-3":
elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3":
print("adding SYSTEM_PROMPT_OCR")
prompt = SYSTEM_PROMPT_OCR.format(
objective=objective,
cmd_string=cmd_string,
os_search_str=os_search_str,
operating_system=operating_system,
)

else:
prompt = SYSTEM_PROMPT_STANDARD.format(
objective=objective,
Expand Down

0 comments on commit ac71d7a

Please sign in to comment.