

import json 


def template_system(lang):
    """Build the system-role chat message for the summarisation prompt.

    Args:
        lang: Name of the programming language; it is interpolated into the
            prompt text through a single ``%s`` placeholder.

    Returns:
        A dict with the keys ``"role"``, ``"content"`` and ``"name"``, where
        ``"role"`` and ``"name"`` are both ``"system"`` and ``"content"`` is
        the formatted instruction text.
    """
    # NOTE(review): the prompt text ends with "clear and concise " and nothing
    # after it -- it looks truncated. It is kept verbatim here because changing
    # the wording would change the prompt sent downstream; confirm the intent.
    prompt = (
        'You will be given a %s function code and your '
        'task is to generate a detailed summary of its behavior '
        'and functionality. Your summary should clearly explain what the function does, how it works, and what '
        'input parameters and output values it expects. You '
        'should write your explanation in clear and concise '
    )
    return {
        "role": "system",
        # Wrap in a 1-tuple so that a tuple-valued ``lang`` is formatted as a
        # single value instead of being unpacked as several format arguments.
        "content": prompt % (lang,),
        "name": "system",
    }



def build_template_user(content):
    """Wrap *content* in a user-role chat message.

    Args:
        content: Value stored unchanged under the ``"content"`` key.

    Returns:
        A dict with ``"role"`` and ``"name"`` both set to ``"user"`` and
        ``"content"`` set to the given value.
    """
    return dict(role="user", content=content, name="user")


def build_sql_id(lang, role, index_id):
    """Return the path-like identifier string for one record.

    The result has the form
    ``summary_in_text_xiaofei/summary_in_text_train/<lang>/<role>/<index_id>.input``.
    The ``summary_in_text_train`` segment is fixed and does not vary with
    *role*.

    Args:
        lang: Language segment of the identifier.
        role: Role/split segment of the identifier.
        index_id: Record index, used as the final path stem.

    Returns:
        The assembled identifier string.
    """
    prefix = "summary_in_text_xiaofei/summary_in_text_train"
    return "{}/{}/{}/{}.input".format(prefix, lang, role, index_id)

def build_zeroshot(lang="", ideal="", user_content="", task_id="", role="train", index_id=0, item_misc=None):
    """Assemble the output record for one dataset item.

    Args:
        lang: Language name, forwarded to ``build_sql_id``.
        ideal: Reference answer, stored under ``"human_answer"``.
        user_content: Accepted but not used by the current body.
        task_id: Stored unchanged under ``"task_id"``.
        role: Role/split name, forwarded to ``build_sql_id``.
        index_id: Record index, forwarded to ``build_sql_id``.
        item_misc: Accepted but not used by the current body.

    Returns:
        A dict with the keys ``"task_id"``, ``"human_answer"`` and
        ``"sql_id"``, in that order.
    """
    # The chat messages (system + user) are deliberately not embedded in the
    # record; only the identifiers and the reference answer are emitted.
    record = {"task_id": task_id, "human_answer": ideal}
    record["sql_id"] = build_sql_id(lang=lang, role=role, index_id=index_id)
    return record
    
    
from glob2 import glob 
import os 
from tqdm import tqdm 
from itertools import product 
languages = ["go", "python", "php", "java", "javascript", "ruby"]

root_dir = "/home/wj_cuda113/wj_code/dl_chatgpt/test_download/dataset/"

# Collected (lang, role, path) triples, one per input file found on disk.
dt_test_all = []

# From here on `languages` holds every (language, split) pair rather than
# plain language names.
languages = list(product(languages, ["valid", "train", "test"]))
print("languages", len(languages))

for lang, role in tqdm(languages):
    pattern_path = os.path.join(root_dir, f"{lang}/{role}.jsonl")
    print(pattern_path, "pattern_path")
    # The pattern contains no wildcard characters, so this presumably yields
    # the path itself when the file exists and nothing otherwise.
    for matched_path in glob(pattern_path):
        dt_test_all.append((lang, role, matched_path))

print(dt_test_all, "dt_test_all ")
# Each input file is read as JSON lines; only the "code" and "docstring"
# fields of every record are used below.
output_dir = "/data3/icse_dataset/wj_build_prompt_data/"

# Create the output directory once, and only when there is something to write.
if dt_test_all:
    os.makedirs(output_dir, exist_ok=True)

for lang, role, item_path in tqdm(dt_test_all):

    sub_task = os.path.relpath(item_path, root_dir)
    print(sub_task, "sub_task", lang)

    # Explicit UTF-8 so decoding does not depend on the platform's default
    # locale; iterate the file lazily and skip blank lines, which would
    # otherwise make json.loads raise.
    with open(item_path, encoding="utf-8") as f:
        item_list = [json.loads(line) for line in f if line.strip()]
    print("item_list", len(item_list))

    app_list = []
    # tqdm wraps the list (not the enumerate object) so it knows the total.
    for iii, item in enumerate(tqdm(item_list, leave=False)):
        item_str = build_zeroshot(
            lang=lang,
            ideal=item["docstring"],
            user_content=item["code"],
            task_id="code2doc_{}:{}".format(sub_task, iii),
            role=role,
            index_id=iii,
            item_misc=item,
        )
        app_list.append(json.dumps(item_str))

    # One output file per (lang, role); records are newline-separated with no
    # trailing newline, as before.
    out_path = os.path.join(output_dir, "code2doc_{}_{}.jsonl".format(lang, role))
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(app_list))

