import os 
import json 
import pandas as pd 

from concurrent.futures import ThreadPoolExecutor



def parse_dict_v1(xpath: str) -> dict:
    """Parse the run metadata encoded in a result file name.

    The base name of *xpath*, with ``.jsonl`` removed, is read as a
    comma-separated list of ``key=value`` segments. It must contain a
    ``task`` segment, from which ``split``, ``lang`` and ``name`` are derived:

    * ``split`` is the text after the last ``_`` of the task,
    * ``lang`` is the last ``_``-separated piece of the remaining task
      (``"unk"`` when nothing is left to split),
    * ``name`` is the first ``_``-separated piece of the remaining task.

    Args:
        xpath: Path or file name of the result file. Surrounding whitespace
            (e.g. the newline left by ``readlines()``) is ignored.

    Returns:
        A dict of string keys to string values. Every key and value is
        lowercased and ``-`` is replaced with ``_`` in values. Besides the
        parsed segments it contains ``path``, ``split``, ``lang``, ``name``
        and ``mt`` (``"baseline"`` when absent), and ``r``/``role`` mirror
        each other when only one of them is present.

    Raises:
        ValueError: If a segment of the file name has no ``=``.
        KeyError: If the file name has no ``task`` segment.
    """
    # Lines taken from an index file still carry their trailing newline; left
    # in place it would end up inside the value of the last segment.
    xpath = xpath.strip()
    dict_info = {"path": xpath}

    file_name = os.path.basename(xpath).replace(".jsonl", "")
    for dic_str in file_name.split(","):
        parts = dic_str.split("=")
        if len(parts) < 2:
            raise ValueError(
                f"expected 'key=value' but got {dic_str!r} in {xpath!r}"
            )
        # Only the first two pieces are used; anything after a second "=" is
        # dropped, as before.
        k, v = parts[:2]
        dict_info[k] = v

    task = dict_info["task"]
    # Cut off only the trailing "_<split>" suffix. A plain str.replace would
    # also delete earlier occurrences (e.g. "x_py_py" would become "x").
    head, sep, split = task.rpartition("_")
    dict_info["split"] = split
    if sep:
        task = head
    task_parts = task.split("_")
    dict_info["lang"] = task_parts[-1] if "_" in task else "unk"
    dict_info["name"] = task_parts[0]

    # "baseline2" is treated as an alias of the default "baseline".
    if dict_info.get("mt", "baseline2") == "baseline2":
        dict_info["mt"] = "baseline"

    if dict_info["name"] == "apps":
        dict_info["lang"] = "python"

    # This task name does not follow the "<name>_<lang>_<split>" layout, so
    # the derived fields are set by hand.
    if dict_info["task"] == "archive_stackexchange":
        dict_info["split"] = "test"
        dict_info["lang"] = "unk"
        dict_info["name"] = dict_info["task"]

    # Keep the short and long spelling of the role key in sync.
    if "r" not in dict_info and "role" in dict_info:
        dict_info["r"] = dict_info["role"]
    if "role" not in dict_info and "r" in dict_info:
        dict_info["role"] = dict_info["r"]

    # NOTE(review): "path" is normalised like every other value, so it may no
    # longer match the file on disk -- confirm whether callers rely on this.
    return {
        k.lower(): v.lower().replace("-", "_")
        for k, v in dict_info.items()
    }






if __name__ == "__main__":
    # Index file read as one answer-file path per line.
    index_p = "/home/wj2_cuda12/wj_code/dl_chatgpt/llm_save_data/all_answer.txt"
    with open(index_p) as f:
        # Strip the newline kept by file iteration (it would otherwise leak
        # into the last parsed field), skip blank lines, and drop entries
        # whose name contains "_train,m=".
        data = [
            line.strip()
            for line in f
            if line.strip() and "_train,m=" not in line
        ]

    final_list = [parse_dict_v1(xpath=os.path.basename(x)) for x in data]
    if not final_list:
        # An empty frame has none of the columns used below.
        raise SystemExit(f"no usable entries found in {index_p}")

    def x_unq(row):
        """Join the identifying fields of *row* into one "@"-separated key."""
        selected_keys = ['q', 'temp', 'topp', 'task', 'm', 'formated', 'split', 'lang', 'name',    'mt']
        return "@".join(str(row[key]) for key in selected_keys)

    df = pd.DataFrame(final_list)
    df["idx"] = df.apply(x_unq, axis=1)

    # keep=False marks every member of a duplicate group, so only rows whose
    # key occurs exactly once are kept.
    df_va = df[~df.duplicated(subset=["idx"], keep=False)]
    print(df_va["temp"].value_counts())

    # Optional exports, enable when needed:
    # df_va.to_csv("/tmp/meta.csv", index=False)
    # with open("/tmp/c.list", "w") as f:
    #     f.write("\n".join(df_va["path"].tolist()))

    
    
    
    
    