import os 
import json 
from glob2 import glob 
# Names of the per-model result sub-directories searched under the dataset
# root.  Every model directory carries the "_extract_sta" suffix; the final
# bare "extract_sta" entry has no model prefix.
m_list = [
    model_prefix + "_extract_sta"
    for model_prefix in (
        "CodeLlama-34b-Instruct-hf",
        "WizardCoder-15B-V1.0",
        "WizardCoder-Python-34B-V1.0",
        "codellama2",
    )
]
m_list.append("extract_sta")

def _build_task_group():
    """Build the task-name -> task-group lookup table.

    The per-language families (``code2doc``, ``doc2code``, ``gdoc2code``) are
    expanded over every language and the ``test``/``valid`` splits; note that
    ``gdoc2code_*`` tasks are grouped under ``doc2code``.  Insertion order
    matches the hand-written table this replaces.
    """
    languages = ("go", "java", "javascript", "php", "python", "ruby")
    splits = ("test", "valid")

    table = {
        "apps_test": "apps_test",
        "archive_stackexchange_test": "archive_stackexchange_test",
        "archive_stackexchange": "archive_stackexchange_test",
    }

    def add_family(prefix, group):
        # One entry per (language, split), language-major order.
        for language in languages:
            for split in splits:
                table["_".join((prefix, language, split))] = group

    add_family("code2doc", "code2doc")
    table["concde_dev"] = "concde_dev"
    table["concde_test"] = "concde_dev"
    add_family("doc2code", "doc2code")
    add_family("gdoc2code", "doc2code")
    return table


# Maps a task name (as found in result file names) to its aggregation group.
TASK_GROUP = _build_task_group()

def parse_dict_v1(xpath):
    """Parse the ``key=value`` metadata encoded in a result file name.

    The base name of *xpath*, with any trailing ``.jsonl`` / ``.csv`` /
    ``.txt`` extension removed, is split on ``,``; each piece is split on its
    first ``=`` into a key and a (string) value.

    Args:
        xpath: File path or bare file name such as
            ``"task=code2doc_go_test,temp=0.8,r=user.txt"``.

    Returns:
        dict with every ``key=value`` pair from the name, plus:

        * ``"path"``  - *xpath* exactly as passed in,
        * ``"split"`` - last ``_``-separated component of ``task``,
        * ``"lang"``  - last component of ``task`` once the split is removed,
          or ``None`` when nothing but the name is left,
        * ``"name"``  - first ``_``-separated component of ``task``,
        * ``"mt"``    - defaults to ``"baseline"`` when absent,
        * ``"r"`` / ``"role"`` - mirrored when only one of them is present.

    Raises:
        ValueError: a ``,``-separated piece of the name contains no ``=``.
        KeyError: the name carries no ``task`` field.
    """
    dict_info = {"path": xpath}

    stem = os.path.basename(xpath)
    # Strip only *trailing* known extensions.  A blanket str.replace would
    # also eat ".txt"/".csv"/".jsonl" appearing inside a value.
    known_exts = (".jsonl", ".csv", ".txt")
    while stem.endswith(known_exts):
        # Each known extension holds a single leading dot, so the last dot
        # in the stem is where that extension starts.
        stem = stem[: stem.rfind(".")]

    for segment in stem.split(","):
        # Split on the first "=" only, so values that themselves contain "="
        # are kept whole instead of being silently truncated.
        key, sep, value = segment.partition("=")
        if not sep:
            raise ValueError(
                "file-name segment {!r} in {!r} is not of the form "
                "key=value".format(segment, xpath)
            )
        dict_info[key] = value

    task = dict_info["task"]
    split = task.split("_")[-1]
    dict_info["split"] = split
    task = task.replace("_" + split, "")
    dict_info["lang"] = task.split("_")[-1] if "_" in task else None
    dict_info["name"] = task.split("_")[0]

    if "mt" not in dict_info:
        dict_info["mt"] = "baseline"
    if dict_info["name"] == "apps":
        dict_info["lang"] = "python"

    # This task name has no split suffix, so the generic derivation above
    # mis-parses it; pin the fields explicitly.
    # NOTE(review): "archive_stackexchange_test" is not special-cased and
    # yields lang="stackexchange", name="archive" - confirm that is intended.
    if dict_info["task"] == "archive_stackexchange":
        dict_info["split"] = "test"
        dict_info["lang"] = None
        dict_info["name"] = dict_info["task"]

    # Keep the short ("r") and long ("role") spellings in sync.
    if "r" not in dict_info and "role" in dict_info:
        dict_info["r"] = dict_info["role"]
    if "r" in dict_info and "role" not in dict_info:
        dict_info["role"] = dict_info["r"]

    return dict_info


# Temperature values (as they appear in result file names, "temp=<value>,")
# for which extract rates are aggregated.
temp_list = ["0.8", "0.2", "0.01"]
if __name__ == "__main__":
    import pandas as pd
    from itertools import product

    root_dir = "/home/wj2_cuda12/wj_code/dl_chatgpt/tosem_data/fse2023_dataset/NL-CCD"
    out_csv = "/tmp/tosem_extract_rate.csv"

    def process_one(xt_path):
        """Build one metadata record for a single statistics file.

        Combines the fields parsed from the file's base name with the first
        row of the CSV content, and adds the task's group as ``"task_grp"``.

        Raises:
            ValueError: the file contains no data rows.
            KeyError: the task parsed from the name is not in TASK_GROUP.
        """
        meta_info = parse_dict_v1(xpath=os.path.basename(xt_path))
        records = pd.read_csv(xt_path).to_dict(orient="records")
        # Explicit check instead of `assert`, which is stripped under -O.
        if not records:
            raise ValueError("no data rows in {}".format(xt_path))

        # Resolve the group before merging the CSV row, so a CSV column named
        # "task" cannot influence the lookup.
        task_grp = TASK_GROUP[meta_info["task"]]

        # Only the first CSV row is used.
        meta_info.update(records[0])
        meta_info["task_grp"] = task_grp
        return meta_info

    df_list = []
    for model_name, temp in product(m_list, temp_list):
        pattern = os.path.join(root_dir, model_name, "*temp={},*.txt".format(temp))
        find_list = glob(pattern)
        if not find_list:
            continue

        final_list = [process_one(xt_path=one_path) for one_path in find_list]

        df_extract = pd.DataFrame(final_list)
        # .copy() so the column added below lands on an independent frame
        # rather than on a slice of the original (SettingWithCopyWarning).
        df_extract = df_extract[["m", "chatgpt_not_none", "raw_size", "task_grp"]].copy()
        df_extract["extract_rate"] = df_extract["chatgpt_not_none"] / df_extract["raw_size"]

        # Mean extract rate per task group, reshaped into one wide row.
        # NOTE(review): the "_Count" suffix labels columns that hold mean
        # rates, not counts; it is kept because renaming would change the
        # CSV header.
        grp = df_extract.groupby("task_grp")["extract_rate"].mean()
        grp = grp.add_suffix('_Count').reset_index("task_grp")
        grp = grp.pivot_table("extract_rate", columns=["task_grp"])
        grp["temp"] = temp
        grp["model_name"] = model_name

        df_list.append(grp)

    # pd.concat raises an unhelpful "No objects to concatenate" on an empty
    # list; fail with a message that points at the actual cause instead.
    if not df_list:
        raise SystemExit(
            "No result files matched under {}; nothing to write.".format(root_dir)
        )

    df_final = pd.concat(df_list)
    df_final.to_csv(out_csv, index=False)
    
    
    
