# /
# rare1abd729
import json
from datasets import load_dataset
class RareDataset():
    """Container for a rare-disease patient dataset.

    The raw records are stored on ``self.data``. When ``dataset_type`` is
    ``"PHENOTYPE"`` the records are additionally converted into
    ``(phenotype, disease)`` string pairs and stored on ``self.patient``;
    for any other ``dataset_type`` the ``patient`` attribute is not set.
    """

    # Configuration names of the 'chenxz/RareBench' dataset that are accepted
    # when no local file path is supplied.
    HUB_DATASET_NAMES = ("RAMEDIS", "MME", "HMS", "LIRICAL", "PUMCH_ADM")

    # Lookup tables applied to the raw phenotype / disease entries.
    # Both paths are resolved relative to the current working directory.
    PHENOTYPE_MAPPING_PATH = "mapping/phenotype_mapping.json"
    DISEASE_MAPPING_PATH = "mapping/disease_mapping.json"

    def __init__(self, dataset_name, dataset_path, dataset_type) -> None:
        """Load the raw records and, for phenotype datasets, the patient pairs.

        Args:
            dataset_name: Name of the dataset. Only consulted when
                ``dataset_path`` is ``None``; it must then be one of
                ``HUB_DATASET_NAMES``.
            dataset_path: Path to a local JSON file holding the records, or
                ``None`` to load the ``test`` split of ``dataset_name`` from
                ``'chenxz/RareBench'`` via ``load_dataset``.
            dataset_type: Kind of dataset. ``"PHENOTYPE"`` triggers the
                construction of ``self.patient``.

        Raises:
            ValueError: If ``dataset_path`` is ``None`` and ``dataset_name``
                is not one of ``HUB_DATASET_NAMES``.
        """
        self.dataset_name = dataset_name
        self.dataset_path = dataset_path
        self.dataset_type = dataset_type

        if dataset_path is None:
            # Validate the name before touching the hub so an unknown name
            # fails fast with the same error as before.
            if dataset_name not in self.HUB_DATASET_NAMES:
                raise ValueError("Dataset not found")
            self.data = load_dataset(
                'chenxz/RareBench',
                dataset_name,
                split='test',
                trust_remote_code=True,
            )
        else:
            self.data = self._read_json(dataset_path)

        if self.dataset_type == "PHENOTYPE":
            self.patient = self.load_ehr_phenotype_data()

    @staticmethod
    def _read_json(path):
        """Parse the JSON file at ``path`` and close the file afterwards."""
        # "utf-8-sig" transparently skips a leading byte-order mark if present.
        with open(path, "r", encoding="utf-8-sig") as f:
            return json.load(f)

    def load_ehr_phenotype_data(self):
        """Build the list of ``(phenotype, disease)`` string pairs.

        Each record of ``self.data`` supplies a phenotype list and a disease
        list: records loaded from the hub are read through the ``'Phenotype'``
        and ``'RareDisease'`` keys, records loaded from a local file through
        positions ``0`` and ``1``. Entries are translated through the mapping
        files, entries missing from a mapping are dropped, and the remaining
        values are joined with ``","``.

        Returns:
            A list with one ``(phenotype, disease)`` tuple of strings per
            record, in the order of ``self.data``.
        """
        # Read through a context manager so both file handles are closed
        # deterministically instead of being left to the garbage collector.
        phenotype_mapping = self._read_json(self.PHENOTYPE_MAPPING_PATH)
        disease_mapping = self._read_json(self.DISEASE_MAPPING_PATH)

        # Neither flag changes while iterating, so evaluate them once.
        from_hub = self.dataset_path is None
        map_phenotypes = self.dataset_type == "PHENOTYPE"

        patient = []
        for p in self.data:
            if from_hub:
                phenotype_list = p['Phenotype']
                disease_list = p['RareDisease']
            else:
                phenotype_list = p[0]
                disease_list = p[1]

            if map_phenotypes:
                phenotype_list = [
                    phenotype_mapping[phenotype]
                    for phenotype in phenotype_list
                    if phenotype in phenotype_mapping
                ]
            # NOTE(review): the disease mapping is assumed to apply for every
            # dataset_type (only the phenotype mapping is guarded) - confirm.
            disease_list = [
                disease_mapping[disease]
                for disease in disease_list
                if disease in disease_mapping
            ]

            phenotype = ",".join(phenotype_list)
            disease = ",".join(disease_list)
            patient.append((phenotype, disease))
        return patient