SIGN IN SIGN UP

A framework to enable multimodal models to operate a computer.

0 0 0 Python
2023-12-08 19:23:02 -05:00
import sys
import os
import subprocess
import platform
import base64
2023-12-09 12:12:32 -05:00
import json
2023-12-08 19:23:02 -05:00
import openai
from dotenv import load_dotenv
2023-12-09 13:39:38 -05:00
# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
2023-12-08 20:59:30 -05:00
TEST_CASES = {
    # Objective handed to `operate` -> guideline the final screenshot must meet.
    "Go to Github.com":
        "The Github home page is visible.",
    "Go to Youtube.com and play a video":
        "The YouTube video player is visible.",
}
# Prompt template sent alongside the final screenshot. `{guideline}` is filled
# in with str.format(), which is why the literal JSON braces are doubled.
# NOTE: a stray timestamp line that had ended up inside this string (and would
# have been sent to the model as part of the prompt) has been removed.
EVALUATION_PROMPT = """
Your job is to look at the given screenshot and determine if the following guideline is met in the image.
You must respond in the following format ONLY. Do not add anything else:
{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
reason must be a string containing a justification for your decision.
Guideline: {guideline}
"""

# Relative path of the screenshot that is loaded for evaluation.
SCREENSHOT_PATH = os.path.join('screenshots', 'screenshot.png')
2023-12-08 20:59:30 -05:00
2023-12-08 19:23:02 -05:00
# Decide whether coloured output can be used on the current terminal.
def supports_ansi():
    """
    Report whether ANSI escape codes can be written to stdout.

    The platform qualifies unless it is Windows without the ``ANSICON``
    environment variable set, and stdout must additionally be a TTY.
    """
    on_windows = platform.system() == "Windows"
    # De Morgan form of: not Windows, or ANSICON present.
    platform_ok = not (on_windows and "ANSICON" not in os.environ)
    stdout_is_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
    return platform_ok and stdout_is_tty
# Colour escape sequences. Each constant is the empty string when the terminal
# cannot render ANSI codes, so they can be embedded in output unconditionally.
_ansi_enabled = supports_ansi()

# Standard green text
ANSI_GREEN = "\033[32m" if _ansi_enabled else ""
# Bright/bold green text
ANSI_BRIGHT_GREEN = "\033[92m" if _ansi_enabled else ""
# Reset to default text color
ANSI_RESET = "\033[0m" if _ansi_enabled else ""
# Bright blue text
ANSI_BLUE = "\033[94m" if _ansi_enabled else ""
# Standard yellow text
ANSI_YELLOW = "\033[33m" if _ansi_enabled else ""
# Standard red text
ANSI_RED = "\033[31m" if _ansi_enabled else ""
# Bright magenta text
ANSI_BRIGHT_MAGENTA = "\033[95m" if _ansi_enabled else ""
def format_evaluation_prompt(guideline):
    """Return EVALUATION_PROMPT with `guideline` substituted into the template."""
    return EVALUATION_PROMPT.format(guideline=guideline)
def parse_eval_content(content):
    """Parse the model's evaluation reply and return its verdict.

    The reply is expected to be a JSON object with a ``reason`` string and a
    ``guideline_met`` value. The reason is printed and the value stored under
    ``guideline_met`` is returned unchanged.

    Args:
        content: Raw text of the model's reply.

    Returns:
        The value of ``guideline_met`` from the parsed reply.

    Raises:
        SystemExit: With exit code 1 when the reply is not valid JSON or is
            missing one of the expected keys.
    """
    try:
        payload = content
        if isinstance(payload, str):
            payload = payload.strip()
            # The model may wrap its JSON in a Markdown code fence
            # (```json ... ```) despite the prompt; unwrap it before parsing.
            if payload.startswith("```"):
                payload = payload.strip("`").strip()
                if payload[:4].lower() == "json":
                    payload = payload[4:]
        res = json.loads(payload)
        reason = res["reason"]
        guideline_met = res["guideline_met"]
    # Only parsing/shape errors are treated as a bad reply; a bare `except`
    # would also swallow KeyboardInterrupt and unrelated programming errors.
    except (TypeError, KeyError, ValueError):
        print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
        # sys.exit is always available, unlike the site-provided exit() helper.
        sys.exit(1)

    print(reason)
    return guideline_met
2023-12-08 19:23:02 -05:00
2024-01-16 09:56:03 -05:00
def evaluate_final_screenshot(guideline):
    """Ask the vision model whether the final screenshot meets `guideline`.

    Reads the image at SCREENSHOT_PATH, sends it base64-encoded as a data URL
    together with the formatted evaluation prompt, and returns the verdict
    parsed from the model's reply by parse_eval_content.

    Args:
        guideline: Description of what must be visible in the screenshot.

    Returns:
        The parsed ``guideline_met`` value from the model's reply.

    Raises:
        OSError: If the screenshot cannot be opened or read.
    """
    with open(SCREENSHOT_PATH, "rb") as img_file:
        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

    eval_message = [{
        "role": "user",
        "content": [
            {"type": "text", "text": format_evaluation_prompt(guideline)},
            {
                "type": "image_url",
                # The screenshot is a .png file, so declare it as PNG rather
                # than JPEG in the data URL's media type.
                "image_url": {"url": f"data:image/png;base64,{img_base64}"},
            },
        ],
    }]

    response = openai.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=eval_message,
        presence_penalty=1,
        frequency_penalty=1,
        temperature=0.7,
        max_tokens=300,
    )

    eval_content = response.choices[0].message.content
    return parse_eval_content(eval_content)
def run_test_case(objective, guideline):
    """Run `operate` on one objective and judge the resulting screenshot.

    Args:
        objective: Prompt passed to the `operate` command.
        guideline: Criterion the final screenshot is evaluated against.

    Returns:
        The evaluation verdict, or False when the screenshot could not be
        opened.
    """
    # Run `operate` with the test case prompt. The argument list is passed
    # without a shell, so the objective must not be wrapped in quote
    # characters: they would be delivered to `operate` as part of the prompt.
    subprocess.run(['operate', '--prompt', objective], stdout=subprocess.DEVNULL)

    # NOTE(review): the exit status of `operate` is not checked, so a
    # screenshot left over from an earlier run could be evaluated — confirm
    # whether that is acceptable.
    try:
        result = evaluate_final_screenshot(guideline)
    except OSError:
        print("[Error] Couldn't open the screenshot for evaluation")
        return False

    return result
2023-12-08 19:23:02 -05:00
def main():
    """Run every entry of TEST_CASES and print a coloured pass/fail summary."""
    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")

    passed = 0
    failed = 0

    for objective, guideline in TEST_CASES.items():
        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")

        # Handle the failing case first so the success path stays unnested.
        if not run_test_case(objective, guideline):
            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
            failed += 1
            continue

        print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
        passed += 1

    print(
        f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed"
    )


# Only run the evaluation when executed as a script, not when imported.
if __name__ == "__main__":
    main()