PokemonGym/BenchFlow

mirrored 8 minutes ago
Benchmark Card Files and versions Leaderboard
kk Update benchflow_interface.py d409696
from benchflow import BaseBench
from benchflow.BaseBench import BenchmarkResult
from benchflow.schemas import BenchArgs
from typing import Dict, Any
import os
import json

class PokemonBench(BaseBench):
    """BenchFlow adapter describing how to run and score the PokemonGym benchmark.

    Each method supplies one piece of static configuration (arguments, Docker
    image, in-container paths, task list) or converts the files written by a
    finished run into a ``BenchmarkResult``.
    """

    def get_args(self, task_id: str) -> BenchArgs:
        """Return the argument specification for a task.

        Args:
            task_id: Unused; the same specification is returned for every task.

        Returns:
            A ``BenchArgs`` with no required arguments and one optional
            argument, ``MAX_DURATION``, defaulting to ``30 * 60``.
        """
        arguments = {
            "required": [],
            # 30 * 60 == 1800; presumably seconds (30 minutes) -- TODO confirm
            # the unit against the benchmark container's entry point.
            "optional": {"MAX_DURATION": 30 * 60},
        }
        return BenchArgs(arguments)

    def get_image_name(self) -> str:
        """Return the name (with tag) of the Docker image for this benchmark."""
        return "kirk2000/benchflow:pokemongym-v1"

    def get_results_dir_in_container(self) -> str:
        """Return the in-container directory holding the evaluation results."""
        return "/app/evaluation_sessions/latest_evaluation"

    def get_log_files_dir_in_container(self) -> str:
        """Return the in-container directory holding the log files.

        This is the same directory as the results directory.
        """
        return "/app/evaluation_sessions/latest_evaluation"

    def get_result(self, task_id: str) -> BenchmarkResult:
        """Build the result of a run from ``summary.json`` and ``results.csv``.

        Both files are read from ``self.results_dir`` (not set in this class;
        presumably provided by ``BaseBench`` -- verify).

        Args:
            task_id: Identifier copied into the returned result.

        Returns:
            On success, a resolved ``BenchmarkResult`` whose ``metrics`` is a
            flat selection of fields from ``summary.json`` and whose ``log``
            carries the raw text of ``results.csv`` under ``"details"``.
            On any failure (missing file, invalid JSON, missing key, ...), an
            unresolved ``BenchmarkResult`` with empty metrics and the error
            description under ``"error"`` in both ``log`` and ``other``.
            This method does not raise for such failures.
        """
        try:
            # JSON is UTF-8 by specification; decode explicitly rather than
            # relying on the host locale's default encoding.
            summary_path = os.path.join(self.results_dir, "summary.json")
            with open(summary_path, "r", encoding="utf-8") as f:
                summary = json.load(f)

            results_path = os.path.join(self.results_dir, "results.csv")
            with open(results_path, "r", encoding="utf-8") as f:
                results = f.read()

            # Flatten the nested summary into a single-level metrics mapping.
            metrics = {
                'duration_minutes': summary['duration_minutes'],
                'total_steps': summary['total_steps'],
                'final_score': summary['final_score'],
                'total_execution_time': summary['timing']['total_execution_time'],
                'average_time_per_step': summary['timing']['average_time_per_step'],
                'pokemon_discovered': summary['stats']['pokemon_discovered'],
                'badges_earned': summary['stats']['badges_earned'],
                'locations_visited': summary['stats']['locations_visited'],
            }
            return BenchmarkResult(
                task_id=task_id,
                is_resolved=True,
                metrics=metrics,
                log={"details": results},
                other={},
            )
        except Exception as e:
            # Deliberate catch-all: any failure to load or parse the output
            # files is reported as an unresolved result instead of raising.
            # The exception type is included because str() alone can be
            # uninformative (a KeyError renders as just the quoted key).
            error = f"{type(e).__name__}: {e}"
            return BenchmarkResult(
                task_id=task_id,
                is_resolved=False,
                metrics={},
                log={"error": error},
                other={"error": error},
            )

    def get_all_tasks(self, split: str) -> Dict[str, Any]:
        """Return the task ids available for this benchmark.

        Args:
            split: Unused; the same single task is returned for every split.

        Returns:
            A dict with ``"task_ids"`` set to ``["0"]`` and
            ``"error_message"`` set to ``None``.
        """
        return {
            "task_ids": ["0"],
            "error_message": None,
        }