import os 
from glob2 import glob 
from funcs import utils 

import ast 

import builtins
import keyword

# Names excluded from the collected identifier lists: every attribute exposed
# by the ``builtins`` module followed by the language's reserved keywords.
KEYWORDS = [*dir(builtins), *keyword.kwlist]

def _filter_names(names, reserved):
    """Return the distinct entries of *names* that are neither dunder-prefixed nor in *reserved*."""
    return [
        name
        for name in set(names)
        if not name.startswith("__") and name not in reserved
    ]


def process_file(task_id, xpath):
    """Collect the identifier names found in one Python source file.

    Args:
        task_id: Index of the file within its batch. It is accepted so callers
            can tag their submissions; the extraction itself does not use it.
        xpath: Path of the source file to read and parse.

    Returns:
        ``None`` when the file cannot be read or parsed. Otherwise a tuple
        ``(var_list, func_list, cls_list)`` holding the names reported by
        ``utils.get_all_varnames``, ``utils.get_all_funcnames`` and
        ``utils.get_all_classnames`` respectively, de-duplicated and with
        names starting with ``"__"`` or listed in ``KEYWORDS`` removed.
        The order of names inside each list is unspecified.
    """
    try:
        # NOTE(review): the file is decoded with the platform default
        # encoding; files in another encoding fall into the skip path below.
        with open(xpath) as f:
            content = f.read()
        tree_node = ast.parse(content)
    except Exception:
        # Best effort: unreadable, undecodable or unparsable files are simply
        # skipped. ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate so a run can be aborted.
        return None

    all_var_name_scope, all_var_name = utils.get_all_varnames(
        python_tree_node=tree_node
    )
    # Function and class names are derived from the scoped variable list, not
    # from the tree, hence ``python_tree_node=None``.
    all_func_name = utils.get_all_funcnames(
        python_tree_node=None, var_list=all_var_name_scope
    )
    all_class_name = utils.get_all_classnames(
        python_tree_node=None, var_list=all_var_name_scope
    )

    # Build the exclusion set once so each membership test is O(1).
    reserved = frozenset(KEYWORDS)

    var_list = _filter_names(all_var_name, reserved)
    func_list = _filter_names(all_func_name, reserved)
    cls_list = _filter_names(all_class_name, reserved)

    return var_list, func_list, cls_list



# Module-level accumulators; the script entry point extends them with the
# per-file name lists (or rebinds them from the cached pickle).
var_list, func_list, cls_list = [], [], []

    
    
    
import concurrent.futures
import time
from tqdm import tqdm
from collections import Counter
import pickle 


if __name__ == "__main__":
    # Cache of the concatenated per-file name lists, so a re-run can skip the
    # corpus scan entirely.
    cache_path = "/tmp/var_func_cls_list.pkl"

    if not os.path.isfile(cache_path):
        root_dir = "/data3/icse_dataset/eth_py150_open/data"
        scan_dir = os.path.join(root_dir, "**", "*", "*.py")

        fl = glob(scan_dir)
        print("total find", len(fl))

        chunk_size = 10000
        # os.cpu_count() may return None, and "count - 1" is 0 on a
        # single-core host; ThreadPoolExecutor rejects max_workers <= 0.
        worker_cap = max(1, (os.cpu_count() or 2) - 1)

        # Stepping by chunk_size never yields an empty chunk, so the pool is
        # never asked for zero workers (which happened when len(fl) was 0 or
        # an exact multiple of chunk_size).
        for start in range(0, len(fl), chunk_size):
            f_list = fl[start:start + chunk_size]
            max_workers = min(worker_cap, len(f_list))

            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [
                    executor.submit(process_file, task_id, fname)
                    for task_id, fname in enumerate(f_list)
                ]
                with tqdm(total=len(futures), desc="Progress") as pbar:
                    # Consume results as they finish and advance the bar once
                    # per file, whether or not the file could be processed.
                    for future in concurrent.futures.as_completed(futures):
                        result = future.result()
                        pbar.update(1)
                        if result is None:
                            continue
                        var_list.extend(result[0])
                        func_list.extend(result[1])
                        cls_list.extend(result[2])

        with open(cache_path, "wb") as f:
            pickle.dump(
                obj=dict(var_list=var_list, func_list=func_list, cls_list=cls_list),
                file=f,
            )
    else:
        # NOTE: unpickling can execute arbitrary code and /tmp is commonly
        # writable by other users; only run this where the cache is trusted.
        with open(cache_path, "rb") as f:
            data = pickle.load(f)
        var_list = data["var_list"]
        func_list = data["func_list"]
        cls_list = data["cls_list"]

    # Report the 100 most frequent names of each kind with the list sizes.
    counter1 = Counter(var_list)
    print("var_list", counter1.most_common(100), len(var_list))

    counter2 = Counter(func_list)
    print("func_list", counter2.most_common(100), len(func_list))

    counter3 = Counter(cls_list)
    print("cls_list", counter3.most_common(100), len(cls_list))

    # Only the variable and function counters are persisted.
    with open("/tmp/vnames.pkl", "wb") as f:
        pickle.dump(obj=counter1, file=f)

    with open("/tmp/fnames.pkl", "wb") as f:
        pickle.dump(obj=counter2, file=f)
        
