import json
import os
import shutil
from typing import Any, Dict

from benchflow import BaseBench
from benchflow.schemas import BenchArgs, BenchmarkResult


class WebCanvasBench(BaseBench):
    def __init__(self):
        super().__init__()

    def get_args(self, task_id: str) -> BenchArgs:
        """
        Validate the input arguments for a single WebCanvas run and return them as a BenchArgs instance.
        """
        # The benchmark itself has to handle END_IDX so that only one task runs per invocation;
        # task_id is the start index of that task.
        arguments = {
            "required": ["BROWSERBASE_API_KEY", "GRAPHQL_USERNAME", "GRAPHQL_PASSWORD", "OPENAI_API_KEY"],
            "optional": [
                {"TEST_START_IDX": task_id},
                {"RESULTS_DIR": "/app/batch_tasks_results/example"}
            ]
        }
        return BenchArgs(arguments)
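
    # Note on the argument schema (an assumption about how benchflow consumes BenchArgs,
    # not something this file defines): the "required" list appears to name environment
    # variables the caller must supply, while each single-key dict under "optional"
    # provides a default value (here the task's TEST_START_IDX and the RESULTS_DIR
    # that get_result below relies on).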

    def get_image_name(self) -> str:
        """
        Return the Docker image name for running the WebCanvas benchmark.
        """
        return "kirk2000/benchflow:webcanvas-v1"

    def get_results_dir_in_container(self) -> str:
        """
        In the container, the result files are written to /app/batch_tasks_results
        (the environment variable RESULTS_DIR can further specify a subdirectory, such as "example").
        """
        return "/app/batch_tasks_results"

    def get_log_files_dir_in_container(self) -> str:
        """
        In the container, the log files are written to /app/LOGS.
        """
        return "/app/LOGS"

    def get_result(self, task_id: str) -> BenchmarkResult:
        """
        Read the result file (expected at {results_dir}/example/result/result.json) and parse it
        into a result dictionary, from which the is_resolved, score, and message fields are derived.
        """
        # Full paths to the result and log files (they depend on the RESULTS_DIR configured inside the container)
        result_file = os.path.join(self.results_dir, "example", "result", "result.json")
        log_file = os.path.join(self.results_dir, "example", "result", "out.json")
        if not os.path.exists(result_file):
            return BenchmarkResult(task_id=task_id, is_resolved=False, metrics={"score": 0}, log={"error": "No results found"}, other={})
        try:
            with open(result_file, 'r') as f:
                data = f.read().strip()
            try:
                results = json.loads(data)
            except Exception:
                # If direct parsing fails (e.g. the file holds a Python-style dict with unquoted keys),
                # quote the keys with simple text replacements and parse again
                data_fixed = data.replace('{', '{"').replace(': ', '": ').replace(', ', ', "')
                results = json.loads(data_fixed)
            # Echo the out.json log for debugging, if it was produced
            if os.path.exists(log_file):
                with open(log_file, 'r') as f:
                    log = f.read().strip()
                print(log)
        except Exception as e:
            return BenchmarkResult(task_id=task_id, is_resolved=False, metrics={"score": 0}, log={"error": str(e)}, other={})
        # The benchmark passes only if every task succeeded; the score is the average step score rate
        is_resolved = results.get("task_success_rate", 0) > 0.99
        score = results.get("average_step_score_rate", 0)
        # Join the result details into a single key-value string
        message = {"details": ', '.join(f"{k}: {v}" for k, v in results.items())}
        return BenchmarkResult(task_id=task_id, is_resolved=is_resolved, metrics={"score": score}, log=message, other={})
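
    # For reference, result.json is read as a flat dictionary; the only keys this class
    # relies on are "task_success_rate" and "average_step_score_rate". An illustrative
    # (not real) payload would be:
    #   {"task_success_rate": 1.0, "average_step_score_rate": 0.87}
    # which yields is_resolved=True and score=0.87.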

    def get_all_tasks(self, split: str) -> Dict[str, Any]:
        """
        Return all task_ids: 20 task_ids if split is "train", otherwise 103 task_ids.
        """
        if split == "train":
            task_ids = [str(i) for i in range(20)]
        else:
            task_ids = [str(i) for i in range(103)]
        return {"task_ids": task_ids, "error_message": None}

    def cleanup(self):
        """
        Clean up the temporary result and log directories by removing
        self.results_dir and self.log_files_dir (and everything in them).
        """
        for directory in (self.results_dir, self.log_files_dir):
            if directory and os.path.exists(directory):
                shutil.rmtree(directory, ignore_errors=True)
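

# A minimal, hypothetical smoke test for this adapter. It assumes BaseBench() can be
# constructed without extra arguments and only exercises methods defined in this file;
# it does not launch the Docker image or talk to benchflow's runner.
if __name__ == "__main__":
    bench = WebCanvasBench()
    tasks = bench.get_all_tasks(split="train")
    first_task = tasks["task_ids"][0]
    print("image:", bench.get_image_name())
    print("first task:", first_task)
    print("args:", bench.get_args(first_task))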