import os 
from glob2 import glob 
from tqdm import tqdm 
import json

def load_jsonl_keys(x_path):
    """Collect the unique record keys found in a JSONL file.

    Each non-blank line is parsed as a JSON object that must contain the
    fields ``"mapping_key"`` and ``"id"``. The key of a record is
    ``os.path.join(record["mapping_key"], str(record["id"]))``.

    Args:
        x_path: Path of the JSONL file to read.

    Returns:
        A ``(split, keys)`` tuple where ``split`` is the base name of
        ``x_path`` and ``keys`` is the set of record keys in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"mapping_key"`` or ``"id"``.
    """
    split = os.path.basename(x_path)

    keys = set()
    with open(x_path, encoding="utf-8") as f:
        for line in f:
            # Blank lines (typically a trailing newline at end of file)
            # carry no record and would make json.loads raise.
            if not line.strip():
                continue
            record = json.loads(line)
            keys.add(os.path.join(record["mapping_key"], str(record["id"])))

    return (split, keys)

def load_jsonl(x_path):
    """Read a JSONL file into a list of parsed records.

    Args:
        x_path: Path of the JSONL file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(x_path, encoding="utf-8") as f:
        # Blank lines (typically a trailing newline at end of file) carry no
        # record and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]



if __name__ == "__main__":
    # Directory holding the source JSONL files that records are selected from.
    root_dir = "/data3/icse_dataset/NL-CCD_dirs/raw2"

    # Directory holding the existing split files whose record keys decide
    # which source records are kept.
    load_root_dir = "/data3/icse_dataset/NL-CCD_dirs/raw/retrain_processed"

    # Directory the filtered split files are written to.
    save_dir = "/data3/icse_dataset/NL-CCD_dirs/NL-CCD-new"

    # split file name -> (mapping_key used to build record keys,
    #                     source file name under root_dir)
    test_train_map = {
        "archive_stackexchange_test.jsonl": ("archive_stackexchange", "archive_stackexchange.jsonl"),
        "archive_stackexchange_train.jsonl": ("archive_stackexchange", "archive_stackexchange.jsonl"),
        "code_complete-CodeXGLUE_test.jsonl": ("text-code", "code_complete-CodeXGLUE.jsonl"),
        "code_complete-CodeXGLUE_train.jsonl": ("text-code", "code_complete-CodeXGLUE.jsonl"),
        "kilt_wiki_random_test.jsonl": ("kilt_wiki_random", "kilt_wiki_random.jsonl"),
        "kilt_wiki_random_train.jsonl": ("kilt_wiki_random", "kilt_wiki_random.jsonl"),

        "summary_in_text_test.jsonl": ("summary_in_text_xiaofei", "summary_in_text_xiaofei_merged.jsonl"),
        "summary_in_text_train.jsonl": ("summary_in_text_xiaofei", "summary_in_text_xiaofei_merged.jsonl"),

        "summary_text-to-code_test.jsonl": ("summary_in_text_xiaofei_code_bb", "summary_in_text_xiaofei_code_bb_merged.jsonl"),
        "summary_text-to-code_train.jsonl": ("summary_in_text_xiaofei_code_bb", "summary_in_text_xiaofei_code_bb_merged.jsonl"),

        "text-code_APPS_test.jsonl": ("text-code", "text-code.jsonl"),
        "text-code_APPS_train.jsonl": ("text-code", "text-code.jsonl"),
    }

    # Create the output directory up front so the first write cannot fail
    # just because the directory is missing.
    os.makedirs(save_dir, exist_ok=True)

    # The test/train entries of one dataset are adjacent in the map and share
    # a source file; remember the last parsed source so it is read only once.
    cached_source_path = None
    cached_source_list = []

    for load_x_path, (mapping_key, src_x_path) in tqdm(test_train_map.items()):
        load_path = os.path.join(load_root_dir, load_x_path)
        key, item_set = load_jsonl_keys(x_path=load_path)
        print ("now ,there are keys for ", len(item_set) ,key )

        source_path = os.path.join(root_dir, src_x_path)
        if source_path != cached_source_path:
            cached_source_list = load_jsonl(x_path=source_path)
            cached_source_path = source_path
        source_list = cached_source_list

        print ("will filter from ", len(source_list ) )

        # Single pass over the source: count every matching record and
        # de-duplicate by id. For a repeated id the last record wins while
        # the position of its first occurrence is kept.
        matched_count = 0
        source_list_filtered_dict = {}
        for record in source_list:
            record_id = str(record["id"])
            if os.path.join(mapping_key, record_id) in item_set:
                matched_count += 1
                source_list_filtered_dict[record_id] = record

        source_list_filtered_set = list(source_list_filtered_dict.values())

        save_path = os.path.join(save_dir, load_x_path)
        print ("will save to ", save_path, "No.", matched_count,  "No set.", len(source_list_filtered_set)   )

        # Terminate every record with a newline so the output stays valid
        # JSONL when files are concatenated or appended to.
        with open(save_path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(x) + "\n" for x in source_list_filtered_set)
        
        
        
        
        
    
    
    
    
    
    


