import json 

def build_zeroshot(lang="", ideal="", user_content="", task_id=""):
    """Build the output record for a single task.

    Args:
        lang: Accepted for call-site compatibility; not used here.
        ideal: Value stored under the ``"human_answer"`` key.
        user_content: Accepted for call-site compatibility; not used here.
        task_id: Task identifier; stored unchanged and also used to derive
            the ``"sql_id"`` value.

    Returns:
        A dict with the keys ``"task_id"``, ``"sql_id"`` and
        ``"human_answer"``, in that order.
    """
    # Derive the sql_id step by step: ":" becomes a path separator, the
    # "doc2code_" marker is swapped for the storage prefix, every ".jsonl"
    # occurrence is dropped, and ".input" is appended at the end.
    storage_prefix = "summary_in_text_xiaofei_code_bb/summary_in_text_train/"
    derived = task_id.replace(":", "/")
    derived = derived.replace("doc2code_", storage_prefix)
    derived = derived.replace(".jsonl", "")

    return {
        "task_id": task_id,
        "sql_id": derived + ".input",
        "human_answer": ideal,
    }
    
    
from glob2 import glob 
import os 
from tqdm import tqdm 
from itertools import  product 
 
# Languages and dataset splits whose code2doc result files get converted.
languages = ["go", "python", "php", "java", "javascript", "ruby"]
splits = ["train", "valid", "test"]

root_dir = "/data3/icse_dataset/llm_save_data/"

model_name = "gpt-3.5-turbo"

# Result files are looked up in a per-model sub-directory of the root.
root_dir = os.path.join(root_dir, model_name)

# Collected (lang, role, path) triples, one per matching result file.
dt_test_all = []

for lang, role in tqdm(list(product(languages, splits))):

    pattern_path = os.path.join(
        root_dir,
        "*task=code2doc_{}_{},m={}.jsonl".format(lang, role, model_name),
    )
    dt_test = glob(pattern_path)
    # Every (language, split) pair must have at least one result file.
    # Raise explicitly instead of using `assert` so the check still runs
    # under `python -O` and the error names the pattern that matched nothing.
    if not dt_test:
        raise FileNotFoundError(
            "no result file matches pattern {!r}".format(pattern_path)
        )
    dt_test_all.extend([(lang, role, y) for y in dt_test])
    

# print (dt_test_all, "dt_test_all ")
# features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
# Convert each discovered code2doc result file into a doc2code JSONL file.
# NOTE(review): the output path depends only on (model_name, lang, role), and
# the file is opened in "w" mode, so if several input files match the same
# (lang, role) pair each one overwrites the output of the previous one.
for lang, role, item_path in tqdm(dt_test_all):

    task_id = os.path.basename(item_path)
    # Explicit raise (rather than `assert`) so the check survives `python -O`.
    if "code2doc_" not in task_id:
        raise ValueError("unexpected input file name: {!r}".format(task_id))

    print ("lang",lang, "role", role, "read from ", task_id )

    # One JSON object per line. Decode as UTF-8 explicitly instead of relying
    # on the platform default, and skip blank lines (json.loads rejects them).
    with open(item_path, encoding="utf-8") as f:
        item_list = [json.loads(xx) for xx in f if xx.strip()]
    print ("item_list", len(item_list) )

    app_list = []

    # Wrap the list itself so tqdm can read its length and show a total;
    # wrapping enumerate(...) hides the length from the progress bar.
    for item in tqdm(item_list, leave=False):

        # The content of the first "user" message in the recorded prompt is
        # passed on as `ideal`.
        user_messages = [
            xx["content"] for xx in item["data"]["prompt"] if xx["role"] == "user"
        ]
        if not user_messages:
            raise ValueError(
                "no user message in prompt of task {!r} ({})".format(
                    item["task_id"], item_path
                )
            )
        ideal = user_messages[0]

        item_str = build_zeroshot(
            lang=lang,
            ideal=ideal,
            user_content=item["data"]["sampled"],
            # Re-label the record's task id from code2doc to doc2code.
            task_id=item["task_id"].replace("code2doc_", "doc2code_"),
        )
        app_list.append(json.dumps(item_str))

    save_path = "/data3/icse_dataset/wj_build_prompt_data/{}/doc2code_{}_{}.jsonl".format(model_name, lang, role)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    # Records are newline-separated with no trailing newline after the last.
    with open(save_path, "w", encoding="utf-8") as f:
        f.write("\n".join(app_list))
    
