Blame: evaluate.py - OthersideAI/self-operating-computer

OthersideAI / self-operating-computer UNCLAIMED

A framework to enable multimodal models to operate a computer.

0 0 0 Python

Add evaluator.py 2023-12-08 19:23:02 -05:00			`import sys`
			`import os`
			`import subprocess`
			`import platform`
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00			`import base64`
Add evaluation justification 2023-12-09 12:12:32 -05:00			`import json`
Add evaluator.py 2023-12-08 19:23:02 -05:00			`import openai`

			`from dotenv import load_dotenv`

# Test cases, in execution order.
#   key   -> objective handed to `operate` as its prompt
#   value -> guideline GPT-4v uses to decide whether the test case passed
TEST_CASES = {
    "Go to Github.com": "The Github home page is visible.",
    "Go to Youtube.com and play a video": "The YouTube video player is visible.",
}
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00
			`EVALUATION_PROMPT = """`
			`Your job is to look at the given screenshot and determine if the following guideline is met in the image.`
Add evaluation justification 2023-12-09 12:12:32 -05:00			`You must respond in the following format ONLY. Do not add anything else:`
			`{{ "guideline_met": (true\|false), "reason": "Explanation for why guideline was or wasn't met" }}`
			`guideline_met must be set to a JSON boolean. True if the image meets the given guideline.`
			`reason must be a string containing a justification for your decision.`
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00
			`Guideline: {guideline}`
			`"""`

# Relative path of the screenshot that is opened and sent for evaluation.
SCREENSHOT_PATH = os.path.join("screenshots", "screenshot.png")
Change test cases 2023-12-08 20:59:30 -05:00
Add evaluator.py 2023-12-08 19:23:02 -05:00			`# Check if on a windows terminal that supports ANSI escape codes`
			`def supports_ansi():`
			`"""`
			`Check if the terminal supports ANSI escape codes`
			`"""`
			`plat = platform.system()`
			`supported_platform = plat != "Windows" or "ANSICON" in os.environ`
			`is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()`
			`return supported_platform and is_a_tty`

			`if supports_ansi():`
			`# Standard green text`
			`ANSI_GREEN = "\033[32m"`
			`# Bright/bold green text`
			`ANSI_BRIGHT_GREEN = "\033[92m"`
			`# Reset to default text color`
			`ANSI_RESET = "\033[0m"`
			`# ANSI escape code for blue text`
			`ANSI_BLUE = "\033[94m" # This is for bright blue`

			`# Standard yellow text`
			`ANSI_YELLOW = "\033[33m"`

			`ANSI_RED = "\033[31m"`

			`# Bright magenta text`
			`ANSI_BRIGHT_MAGENTA = "\033[95m"`
			`else:`
			`ANSI_GREEN = ""`
			`ANSI_BRIGHT_GREEN = ""`
			`ANSI_RESET = ""`
			`ANSI_BLUE = ""`
			`ANSI_YELLOW = ""`
			`ANSI_RED = ""`
			`ANSI_BRIGHT_MAGENTA = ""`
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00

			`def format_evaluation_prompt(guideline):`
			`prompt = EVALUATION_PROMPT.format(guideline=guideline)`
			`return prompt`


			`def parse_eval_content(content):`
Add evaluation justification 2023-12-09 12:12:32 -05:00			`try:`
			`res = json.loads(content)`

			`print(res["reason"])`

			`return res["guideline_met"]`
			`except:`
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00			`print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")`
			`exit(1)`
Add evaluator.py 2023-12-08 19:23:02 -05:00

def evaluate_final_screenshot(guideline):
    """Load the final screenshot and return whether it meets the given guideline.

    The image at SCREENSHOT_PATH is base64-encoded and sent, together with
    the formatted evaluation prompt, to the vision model. The model's reply
    is handed to parse_eval_content, whose result is returned.

    Raises:
        OSError: if the screenshot cannot be opened or read.
    """
    import mimetypes  # local import: only needed to label the data URL

    with open(SCREENSHOT_PATH, "rb") as img_file:
        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

    # Label the payload with the file's real type (PNG for the default path)
    # rather than a hard-coded JPEG type that does not match the bytes.
    mime_type = mimetypes.guess_type(SCREENSHOT_PATH)[0] or "image/png"

    eval_message = [{
        "role": "user",
        "content": [
            {"type": "text", "text": format_evaluation_prompt(guideline)},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{img_base64}"},
            },
        ],
    }]

    response = openai.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=eval_message,
        presence_penalty=1,
        frequency_penalty=1,
        temperature=0.7,
        max_tokens=300,
    )

    eval_content = response.choices[0].message.content

    return parse_eval_content(eval_content)


			`def run_test_case(objective, guideline):`
Add evaluator.py 2023-12-08 19:23:02 -05:00			`'''Returns True if the result of the test with the given prompt meets the given guideline.'''`
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00			# Run `operate` with the test case prompt
			`subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)`

			`try:`
Update error message 2024-01-16 09:56:03 -05:00			`result = evaluate_final_screenshot(guideline)`
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00			`except(OSError):`
Update error message 2024-01-16 09:56:03 -05:00			`print("[Error] Couldn't open the screenshot for evaluation")`
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00			`return False`
Add evaluator.py 2023-12-08 19:23:02 -05:00
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00			`return result`
Add evaluator.py 2023-12-08 19:23:02 -05:00

			`def main():`
			`load_dotenv()`
			`openai.api_key = os.getenv("OPENAI_API_KEY")`
Silence operator stdout 2023-12-08 19:45:56 -05:00
Add summary message 2023-12-09 00:11:48 -05:00			`print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")`
Add evaluator.py 2023-12-08 19:23:02 -05:00
Add summary message 2023-12-09 00:11:48 -05:00			`passed = 0; failed = 0`
Change test cases 2023-12-08 20:59:30 -05:00			`for objective, guideline in TEST_CASES.items():`
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00			`print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")`
Add evaluator.py 2023-12-08 19:23:02 -05:00
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00			`result = run_test_case(objective, guideline)`
Add evaluator.py 2023-12-08 19:23:02 -05:00			`if result:`
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00			`print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")`
Add evaluation justification 2023-12-09 12:12:32 -05:00			`passed += 1`
Add evaluator.py 2023-12-08 19:23:02 -05:00			`else:`
Use gpt-4v to evalue summary screenshot 2023-12-08 20:46:10 -05:00			`print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")`
Add evaluation justification 2023-12-09 12:12:32 -05:00			`failed += 1`
Add evaluator.py 2023-12-08 19:23:02 -05:00
Add summary message 2023-12-09 00:11:48 -05:00			`print(`
Update test result message format 2024-01-16 10:18:22 -05:00			`f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed"`
Add summary message 2023-12-09 00:11:48 -05:00			`)`
Add evaluator.py 2023-12-08 19:23:02 -05:00
			`if __name__ == "__main__":`
			`main()`