SIGN IN SIGN UP

A framework to enable multimodal models to operate a computer.

0 0 0 Python
2023-12-08 19:23:02 -05:00
import sys
import os
import subprocess
import platform
import base64
2023-12-09 12:12:32 -05:00
import json
2023-12-08 19:23:02 -05:00
import openai
2024-02-15 08:53:12 -05:00
import argparse
2023-12-08 19:23:02 -05:00
from dotenv import load_dotenv
2023-12-09 13:39:38 -05:00
# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
2023-12-08 20:59:30 -05:00
# Maps each objective handed to `operate` (key) to the guideline the
# evaluator model uses to judge the resulting screenshot (value).
TEST_CASES = {
    "Go to Github.com": "A Github page is visible.",
    "Go to Youtube.com and play a video": "The YouTube video player is visible.",
}
# Prompt template sent to the evaluator model together with the screenshot.
# It is rendered with str.format(guideline=...), so the literal JSON braces
# are doubled ("{{" / "}}") to survive formatting.
EVALUATION_PROMPT = """
Your job is to look at the given screenshot and determine if the following guideline is met in the image.
You must respond in the following format ONLY. Do not add anything else:
{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
reason must be a string containing a justification for your decision.
Guideline: {guideline}
"""
2024-06-11 16:10:02 -07:00
# Relative path (resolved against the current working directory) of the
# screenshot that evaluate_final_screenshot() opens for grading.
SCREENSHOT_PATH = os.path.join("screenshots", "screenshot.png")
2023-12-08 20:59:30 -05:00
2023-12-08 19:23:02 -05:00
# Decide whether coloured terminal output can be used.
def supports_ansi():
    """
    Report whether stdout can be expected to render ANSI escape codes.

    The platform is accepted when it is not Windows, or when the ANSICON
    environment variable is present. In addition, sys.stdout must expose
    an isatty() method that reports a terminal.
    """
    on_windows = platform.system() == "Windows"
    if on_windows and "ANSICON" not in os.environ:
        return False

    stream = sys.stdout
    return hasattr(stream, "isatty") and stream.isatty()
2024-06-11 16:10:02 -07:00
2023-12-08 19:23:02 -05:00
# Colour escape sequences used by the status output. The capability check is
# performed once at import time; when ANSI is unavailable every constant is an
# empty string, so the f-strings that embed them print plain text.
_ANSI_ENABLED = supports_ansi()

ANSI_GREEN = "\033[32m" if _ANSI_ENABLED else ""  # standard green
ANSI_BRIGHT_GREEN = "\033[92m" if _ANSI_ENABLED else ""  # bright green
ANSI_RESET = "\033[0m" if _ANSI_ENABLED else ""  # back to the default colour
ANSI_BLUE = "\033[94m" if _ANSI_ENABLED else ""  # bright blue
ANSI_YELLOW = "\033[33m" if _ANSI_ENABLED else ""  # standard yellow
ANSI_RED = "\033[31m" if _ANSI_ENABLED else ""  # standard red
ANSI_BRIGHT_MAGENTA = "\033[95m" if _ANSI_ENABLED else ""  # bright magenta
2024-06-11 16:10:02 -07:00
def format_evaluation_prompt(guideline):
    """Return EVALUATION_PROMPT with its {guideline} placeholder filled in."""
    return EVALUATION_PROMPT.format(guideline=guideline)
def parse_eval_content(content):
    """Parse the evaluator's JSON reply and return its boolean verdict.

    The reply is expected to be a JSON object with a string "reason" and a
    boolean "guideline_met". On success the reason is printed and the
    verdict is returned.

    If the reply is not valid JSON, lacks either key, or carries a
    non-boolean "guideline_met", an error message is printed and the
    process exits with status 1 (SystemExit).
    """
    try:
        res = json.loads(content)
        reason = res["reason"]
        guideline_met = res["guideline_met"]
        # A string such as "false" is truthy and would be counted as a pass,
        # so anything other than a real JSON boolean is treated as malformed.
        if not isinstance(guideline_met, bool):
            raise TypeError("guideline_met must be a JSON boolean")
    except (ValueError, KeyError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError covers a None or
        # non-object reply. Deliberately not a bare except, so that
        # KeyboardInterrupt/SystemExit are never swallowed here.
        print(
            "The model gave a bad evaluation response and it couldn't be parsed. Exiting..."
        )
        sys.exit(1)

    print(reason)
    return guideline_met
2023-12-08 19:23:02 -05:00
2024-01-16 09:56:03 -05:00
def evaluate_final_screenshot(guideline):
    """Load the final screenshot and return True or False if it meets the given guideline.

    The image at SCREENSHOT_PATH is base64-encoded and sent, together with
    the formatted evaluation prompt, to the "gpt-4o" chat completion
    endpoint. The model's reply is handed to parse_eval_content().

    Raises:
        OSError: if the screenshot file cannot be opened or read.
    """
    with open(SCREENSHOT_PATH, "rb") as img_file:
        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

    # SCREENSHOT_PATH points at a .png file, so the data URL must declare
    # image/png rather than image/jpeg.
    image_url = f"data:image/png;base64,{img_base64}"

    eval_message = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": format_evaluation_prompt(guideline)},
                {
                    "type": "image_url",
                    "image_url": {"url": image_url},
                },
            ],
        }
    ]

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=eval_message,
        presence_penalty=1,
        frequency_penalty=1,
        temperature=0.7,
    )

    eval_content = response.choices[0].message.content
    return parse_eval_content(eval_content)
2024-02-15 09:51:41 -05:00
def run_test_case(objective, guideline, model):
    """Returns True if the result of the test with the given prompt meets the given guideline for the given model.

    Runs the `operate` command with the given model and objective (its
    stdout is discarded), then evaluates the screenshot at SCREENSHOT_PATH
    against the guideline.

    Returns False, after printing an error, when the screenshot cannot be
    opened (OSError).
    """
    # Run `operate` with the model to evaluate and the test case prompt.
    # The command is an argument list with no shell involved, so each element
    # reaches the program verbatim: wrapping the objective in extra double
    # quotes would make those quotes part of the prompt itself.
    subprocess.run(
        ["operate", "-m", model, "--prompt", objective],
        stdout=subprocess.DEVNULL,
    )

    try:
        return evaluate_final_screenshot(guideline)
    except OSError:
        print("[Error] Couldn't open the screenshot for evaluation")
        return False
2023-12-08 19:23:02 -05:00
2024-02-15 08:53:12 -05:00
def get_test_model():
    """Return the model name selected on the command line.

    Parses the process arguments for an optional -m/--model flag and falls
    back to "gpt-4-with-ocr" when the flag is absent.
    """
    cli = argparse.ArgumentParser(
        description="Run the self-operating-computer with a specified model."
    )
    cli.add_argument(
        "-m",
        "--model",
        default="gpt-4-with-ocr",
        required=False,
        help="Specify the model to evaluate.",
    )

    options = cli.parse_args()
    return options.model
2023-12-08 19:23:02 -05:00
def main():
    """Run every entry in TEST_CASES against the chosen model and print a summary.

    Calls load_dotenv(), takes the OpenAI API key from the OPENAI_API_KEY
    environment variable, reads the model name from the command line, then
    runs each test case in turn, printing a PASSED/FAILED line per case and
    a final tally.
    """
    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    model = get_test_model()

    print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}")
    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")

    # One boolean per executed test case, in execution order.
    outcomes = []
    for objective, guideline in TEST_CASES.items():
        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")

        succeeded = bool(run_test_case(objective, guideline, model))
        outcomes.append(succeeded)

        if succeeded:
            status = f"{ANSI_GREEN}[PASSED]{ANSI_RESET}"
        else:
            status = f"{ANSI_RED}[FAILED]{ANSI_RESET}"
        print(f"{status} '{objective}'")

    passed = sum(outcomes)
    failed = len(outcomes) - passed
    passed_noun = "test" if passed == 1 else "tests"
    failed_noun = "test" if failed == 1 else "tests"

    print(
        f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} {passed_noun} passed, {failed} {failed_noun} failed"
    )
2023-12-08 19:23:02 -05:00
2024-06-11 16:10:02 -07:00
2023-12-08 19:23:02 -05:00
# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()