/
webarena137fc11
import argparse
import base64
import glob
import json
import os
from collections import defaultdict
from typing import Any
from bs4 import BeautifulSoup
def main(result_folder: str, config_json: str) -> None:
all_data = {}
template_to_id: dict[str, Any] = defaultdict(lambda: len(template_to_id))
with open(config_json, "r") as f:
data_configs = json.load(f)
data_configs = {int(item["task_id"]): item for item in data_configs}
for k, v in data_configs.items():
v.pop("require_login")
v.pop("storage_state")
v.pop("start_url")
v.pop("geolocation")
v.pop("require_reset")
v.pop("intent_template_id")
v["intent_template_id"] = template_to_id[v["intent_template"]]
v["eval_types"] = v["eval"].pop("eval_types")
if v["eval"]["reference_answers"]:
v["reference_answers"] = v["eval"].pop("reference_answers")
if v["eval"]["reference_url"]:
v["reference_url"] = v["eval"].pop("reference_url")
v.pop("eval")
if v.get("reference_answers", {}).get("exact_match", "") == "N/A":
v["achievable"] = False
else:
v["achievable"] = True
with open(f"{result_folder}/merged_log.txt", "r") as f:
results = {}
for line in f:
if "[Result]" in line:
id = line.strip().split(".")[-2].split("/")[-1]
results[int(id)] = True if "(PASS)" in line else False
files = list(glob.glob(f"{result_folder}/render_*.html"))
files = [x for x in files if os.path.exists(x)]
print(f"Total number of files: {len(files)}")
for render_file in files:
task_id = int(render_file.split("_")[-1].split(".")[0])
with open(render_file, "r") as f:
try:
content = f.read()
soup = BeautifulSoup(content, "html.parser")
observations = [
obv.find("pre").text
for obv in soup.find_all("div", {"class": "state_obv"})
]
base64_images = [
img["src"].split(",")[1] for img in soup.find_all("img")
]
image_observations = []
# save image to file and change the value to be path
image_folder = f"images/{os.path.basename(result_folder)}"
os.makedirs(image_folder, exist_ok=True)
for i, image in enumerate(base64_images):
image_data = base64.b64decode(image)
filename = f"{image_folder}/image_{task_id}_{i}.png"
with open(filename, "wb") as f: # type: ignore[assignment]
f.write(image_data) # type: ignore[arg-type]
image_observations.append(filename)
urls = [
url.get_text()
for url in soup.find_all("h3", {"class": "url"})
]
actions = [
action.get_text()
for action in soup.find_all(
"div", {"class": "raw_parsed_prediction"}
)
]
parsed_actions = [
action.get_text()
for action in soup.find_all(
"div", {"class": "parsed_action"}
)
]
# fill action with parsed action if action is empty
for i in range(len(actions)):
if actions[i] == "":
actions[i] = parsed_actions[i]
messages = []
for o, u, a, image in zip(
observations, urls, actions, image_observations
):
messages.append(
{
"user": f"{u}\n\nobservation:\n{o}",
"image": image,
}
)
messages.append({"assistant": a})
all_data[f"example_{task_id}"] = {
**data_configs[task_id],
"messages": messages,
"success": results.get(task_id, False),
}
except Exception as e:
print(e)
print(f"Error in {render_file}")
with open(f"{result_folder}/json_dump.json", "w+") as f:
json.dump(all_data, f, indent=4)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--result_folder", type=str)
parser.add_argument(
"--config_json", type=str, default="config_files/test.raw.json"
)
args = parser.parse_args()
main(args.result_folder, args.config_json)