import json 



def template_system(language):
    """Build the system chat message for a doc-to-code generation prompt.

    Args:
        language: Name of the programming language; it is substituted for
            every ``{language}`` placeholder in the instruction text.

    Returns:
        A dict with the keys ``"role"``, ``"content"`` and ``"name"`` (in
        that order), where role and name are both ``"system"``.
    """
    # Adjacent literals are joined into one string before ``format`` runs,
    # so every placeholder is filled in.
    content = (
        "You will be provided with a detailed description of a {language} function, "
        "and your task is to generate a {language} function that implements the program's behavior based on that description."
        " You should write the function code as accurately as possible based on the description, without providing any additional explanations or assumptions. "
        "Your implementation should conform to standard {language} syntax and coding conventions."
    ).format(language=language)
    return {
        "role": "system",
        "content": content,
        "name": "system",
    }



def build_template_user(content):
    """Wrap *content* in a chat message attributed to the user.

    Args:
        content: Value stored unchanged under the ``"content"`` key.

    Returns:
        A dict with the keys ``"role"``, ``"content"`` and ``"name"`` (in
        that order), where role and name are both ``"user"``.
    """
    return dict(role="user", content=content, name="user")


def build_zeroshot(lang="", ideal="", user_content="", task_id=""):
    """Assemble one zero-shot sample record.

    Args:
        lang: Language name passed to ``template_system``.
        ideal: Reference answer stored under ``"ideal"``.
        user_content: Text passed to ``build_template_user``.
        task_id: Identifier stored under both ``"task_id"`` and ``"idx"``.

    Returns:
        A dict with the keys ``"input"`` (system message followed by user
        message), ``"task_id"``, ``"idx"`` and ``"ideal"``.
    """
    system_message = template_system(lang)
    user_message = build_template_user(user_content)
    return {
        "input": [system_message, user_message],
        "task_id": task_id,
        "idx": task_id,
        "ideal": ideal,
    }
    
    
from glob2 import glob 
import os 
from tqdm import tqdm 
from itertools import  product 
 
# Languages and dataset splits whose code2doc generation files are collected.
languages = ["go", "python", "php", "java", "javascript", "ruby"]
splits = ["train", "valid", "test"]

root_dir = "/data3/icse_dataset/llm_save_data/"

# The model name is both a sub-directory of the save root and part of every
# input file name.
model_name = "CodeLlama-34b-hf"

root_dir = os.path.join(root_dir, model_name)

# (lang, split, path) triples, one per generation file found on disk.
dt_test_all = []

for lang, role in tqdm(list(product(languages, splits))):
    # ``role`` holds the split name ("train" / "valid" / "test").
    pattern_path = os.path.join(
        root_dir,
        "*task=code2doc_{}_{},m={}.jsonl".format(lang, role, model_name),
    )
    # Sorted so the processing order does not depend on directory listing order.
    dt_test = sorted(glob(pattern_path))
    # Explicit check instead of ``assert``: it still runs under ``python -O``
    # and reports which pattern had no match.
    if not dt_test:
        raise FileNotFoundError(
            "no input file matches {!r}".format(pattern_path)
        )
    dt_test_all.extend((lang, role, path) for path in dt_test)
    

# print (dt_test_all, "dt_test_all ")
# features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
for lang, role, item_path in tqdm(dt_test_all):
    # Convert one code2doc generation file into a doc2code prompt file: the
    # sampled output becomes the user message and the original user prompt
    # becomes the expected ("ideal") answer.

    file_name = os.path.basename(item_path)
    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    if "code2doc_" not in file_name:
        raise ValueError("unexpected input file name: {!r}".format(file_name))

    print("lang", lang, "role", role, "read from ", file_name)

    # Explicit encoding so decoding does not depend on the locale; blank lines
    # are skipped because ``json.loads`` rejects them.
    with open(item_path, encoding="utf-8") as f:
        item_list = [json.loads(line) for line in f if line.strip()]
    print("item_list", len(item_list))

    app_list = []
    seen_task_ids = set()

    # Passing the list itself (not ``enumerate``) lets tqdm show the total.
    for item in tqdm(item_list, leave=False):
        sampled = item["data"]["sampled"]
        # Drop records without a usable generation.
        if sampled is None or not str(sampled).strip():
            continue

        # Keep only the first record seen for each task id.
        new_task_id = item["task_id"].replace("code2doc_", "doc2code_")
        if new_task_id in seen_task_ids:
            continue

        user_prompts = [
            message["content"]
            for message in item["data"]["prompt"]
            if message["role"] == "user"
        ]
        if not user_prompts:
            raise ValueError(
                "record {!r} in {!r} has no user prompt".format(
                    item["task_id"], file_name
                )
            )

        record = build_zeroshot(
            lang=lang,
            ideal=user_prompts[0],
            user_content=sampled,
            task_id=new_task_id,
        )
        app_list.append(json.dumps(record))
        seen_task_ids.add(new_task_id)

    # NOTE(review): the output path depends only on (model, lang, split), so
    # if several input files match the same pair the later one overwrites the
    # earlier one - confirm this is intended.
    save_path = "/data3/icse_dataset/raw_data/{}/doc2code_{}_{}.jsonl".format(
        model_name, lang, role
    )
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, "w", encoding="utf-8") as f:
        # Records are newline-separated with no trailing newline.
        f.write("\n".join(app_list))
    
