2023-12-08 19:23:02 -05:00
|
|
|
import sys
|
|
|
|
|
import os
|
|
|
|
|
import subprocess
|
|
|
|
|
import platform
|
2023-12-08 20:46:10 -05:00
|
|
|
import base64
|
2023-12-09 12:12:32 -05:00
|
|
|
import json
|
2023-12-08 19:23:02 -05:00
|
|
|
import openai
|
2024-02-15 08:53:12 -05:00
|
|
|
import argparse
|
2023-12-08 19:23:02 -05:00
|
|
|
|
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
|
|
2023-12-09 13:39:38 -05:00
|
|
|
# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
|
2023-12-08 20:59:30 -05:00
|
|
|
TEST_CASES = {
|
2024-02-15 08:53:12 -05:00
|
|
|
"Go to Github.com": "A Github page is visible.",
|
2023-12-09 13:10:22 -05:00
|
|
|
"Go to Youtube.com and play a video": "The YouTube video player is visible.",
|
2023-12-08 20:59:30 -05:00
|
|
|
}
|
2023-12-08 20:46:10 -05:00
|
|
|
|
|
|
|
|
EVALUATION_PROMPT = """
|
|
|
|
|
Your job is to look at the given screenshot and determine if the following guideline is met in the image.
|
2023-12-09 12:12:32 -05:00
|
|
|
You must respond in the following format ONLY. Do not add anything else:
|
|
|
|
|
{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
|
|
|
|
|
guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
|
|
|
|
|
reason must be a string containing a justification for your decision.
|
2023-12-08 20:46:10 -05:00
|
|
|
|
|
|
|
|
Guideline: {guideline}
|
|
|
|
|
"""
|
|
|
|
|
|
2024-06-11 16:10:02 -07:00
|
|
|
# Location where `operate` saves its final screenshot; this is the image
# that gets evaluated against each test case's guideline.
SCREENSHOT_PATH = os.path.join("screenshots", "screenshot.png")
|
|
|
|
|
|
2023-12-08 20:59:30 -05:00
|
|
|
|
2023-12-08 19:23:02 -05:00
|
|
|
# Check if on a windows terminal that supports ANSI escape codes
def supports_ansi():
    """Return True when the attached terminal understands ANSI escape sequences."""
    # Non-Windows platforms support ANSI natively; Windows needs ANSICON.
    platform_ok = "ANSICON" in os.environ or platform.system() != "Windows"
    # Escape codes only make sense when stdout is an interactive terminal.
    stdout_is_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
    return platform_ok and stdout_is_tty
|
|
|
|
|
|
2024-06-11 16:10:02 -07:00
|
|
|
|
2023-12-08 19:23:02 -05:00
|
|
|
# Define the ANSI color codes used by the progress output; when the terminal
# cannot render them, fall back to empty strings so output stays clean.
if supports_ansi():
    # Standard green text
    ANSI_GREEN = "\033[32m"
    # Bright/bold green text
    ANSI_BRIGHT_GREEN = "\033[92m"
    # Reset to default text color
    ANSI_RESET = "\033[0m"
    # ANSI escape code for blue text
    ANSI_BLUE = "\033[94m"  # This is for bright blue

    # Standard yellow text
    ANSI_YELLOW = "\033[33m"

    # Standard red text
    ANSI_RED = "\033[31m"

    # Bright magenta text
    ANSI_BRIGHT_MAGENTA = "\033[95m"
else:
    # No ANSI support: emit no escape sequences at all.
    ANSI_GREEN = ""
    ANSI_BRIGHT_GREEN = ""
    ANSI_RESET = ""
    ANSI_BLUE = ""
    ANSI_YELLOW = ""
    ANSI_RED = ""
    ANSI_BRIGHT_MAGENTA = ""
|
2024-06-11 16:10:02 -07:00
|
|
|
|
|
|
|
|
|
2023-12-08 20:46:10 -05:00
|
|
|
def format_evaluation_prompt(guideline):
    """Return the evaluation prompt with *guideline* substituted into the template."""
    return EVALUATION_PROMPT.format(guideline=guideline)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_eval_content(content):
    """Parse the model's JSON evaluation response and return its boolean verdict.

    Prints the model's justification (the "reason" field) as a side effect.
    Exits the process with status 1 when the response is not valid JSON or
    lacks the expected keys.
    """
    try:
        res = json.loads(content)
        print(res["reason"])
        return res["guideline_met"]
    # Catch only the failures a bad model response can cause, instead of a
    # bare `except:` that would also swallow KeyboardInterrupt/SystemExit.
    except (json.JSONDecodeError, KeyError, TypeError):
        print(
            "The model gave a bad evaluation response and it couldn't be parsed. Exiting..."
        )
        # sys.exit instead of the site-builtin exit(), which may be absent
        # when the script runs without the `site` module.
        sys.exit(1)
|
2023-12-08 19:23:02 -05:00
|
|
|
|
|
|
|
|
|
2024-01-16 09:56:03 -05:00
|
|
|
def evaluate_final_screenshot(guideline):
    """Load the final screenshot and return True or False if it meets the given guideline."""
    with open(SCREENSHOT_PATH, "rb") as img_file:
        encoded_image = base64.b64encode(img_file.read()).decode("utf-8")

    # One user message containing the evaluation prompt plus the screenshot.
    text_part = {"type": "text", "text": format_evaluation_prompt(guideline)}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
    }
    eval_message = [{"role": "user", "content": [text_part, image_part]}]

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=eval_message,
        presence_penalty=1,
        frequency_penalty=1,
        temperature=0.7,
    )

    return parse_eval_content(response.choices[0].message.content)
|
|
|
|
|
|
|
|
|
|
|
2024-02-15 09:51:41 -05:00
|
|
|
def run_test_case(objective, guideline, model):
    """Returns True if the result of the test with the given prompt meets the given guideline for the given model."""
    # Run `operate` with the model to evaluate and the test case prompt.
    # Pass the objective directly: with a list argv (shell=False) there is no
    # shell to strip quotes, so wrapping it in literal '"' characters would
    # hand the quote marks to `operate` as part of the prompt text.
    subprocess.run(
        ["operate", "-m", model, "--prompt", objective],
        stdout=subprocess.DEVNULL,
    )

    try:
        result = evaluate_final_screenshot(guideline)
    except OSError:
        # `operate` did not produce the expected screenshot file.
        print("[Error] Couldn't open the screenshot for evaluation")
        return False

    return result
|
2023-12-08 19:23:02 -05:00
|
|
|
|
|
|
|
|
|
2024-02-15 08:53:12 -05:00
|
|
|
def get_test_model():
    """Parse the command line and return the name of the model to evaluate."""
    arg_parser = argparse.ArgumentParser(
        description="Run the self-operating-computer with a specified model."
    )
    arg_parser.add_argument(
        "-m",
        "--model",
        help="Specify the model to evaluate.",
        required=False,
        default="gpt-4-with-ocr",
    )
    parsed = arg_parser.parse_args()
    return parsed.model
|
|
|
|
|
|
|
|
|
|
|
2023-12-08 19:23:02 -05:00
|
|
|
def main():
    """Run every test case against the selected model and print a pass/fail summary."""
    # Pull OPENAI_API_KEY from the environment (optionally via a .env file).
    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    model = get_test_model()

    print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}")
    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")

    passed, failed = 0, 0
    for objective, guideline in TEST_CASES.items():
        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")

        if run_test_case(objective, guideline, model):
            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
            passed += 1
        else:
            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
            failed += 1

    print(
        f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed"
    )
|
2023-12-08 19:23:02 -05:00
|
|
|
|
2024-06-11 16:10:02 -07:00
|
|
|
|
2023-12-08 19:23:02 -05:00
|
|
|
if __name__ == "__main__":
|
|
|
|
|
main()
|