# from parser import (
#                    tree_to_token_index,
#                    index_to_code_token,
#                    )
import json 
from tree_sitter import Language, Parser



# for lang in ['python','ruby','java','go','javascript','php','c','cpp','c_sharp']:
# Grammar name looked up inside the compiled grammar bundle below; the
# commented-out loop above lists the other candidate languages.
lang=  "java"
# NOTE(review): absolute path to a prebuilt tree-sitter grammar library on one
# specific machine -- presumably needs adjusting when run elsewhere; confirm.
LANGUAGE = Language('/home/wj_cuda113/wj_code/dl_chatgpt/crawler_projects/mutation/mt_treesitter/ast_parser/languages/my-languages.so', lang)
# Module-level parser, created at import time and used by run_token().
parser = Parser()
parser.set_language(LANGUAGE)



import re
from io import StringIO
import  tokenize


def tree_to_token_index(root_node):
    """Collect the source spans of every token under ``root_node``.

    A node is treated as a single token when it has no children, when its
    type is ``'string'``, or when its type contains ``'comment'``; such a
    node is not descended into even if it has children.

    The tree is walked with an explicit stack rather than recursion so that
    very deeply nested trees cannot raise ``RecursionError``.

    Args:
        root_node: Node exposing ``children``, ``type``, ``start_point`` and
            ``end_point`` attributes.

    Returns:
        List of ``(start_point, end_point)`` tuples in pre-order, i.e. in the
        order the tokens are reached when visiting children left to right.
    """
    spans = []
    pending = [root_node]
    while pending:
        node = pending.pop()
        kids = node.children
        if not kids or node.type == 'string' or 'comment' in node.type:
            spans.append((node.start_point, node.end_point))
        else:
            # Push right-to-left so the leftmost child is popped first.
            pending.extend(reversed(kids))
    return spans
    
def tree_to_variable_index(root_node, index_to_code):
    """Collect spans of token nodes whose type differs from their text.

    Token nodes are chosen by the same rule as ``tree_to_token_index``: no
    children, type ``'string'``, or a type containing ``'comment'``. For each
    one its text is looked up in ``index_to_code`` and the span is kept only
    when the node type is not equal to that text.

    The tree is walked with an explicit stack rather than recursion so that
    very deeply nested trees cannot raise ``RecursionError``.

    Args:
        root_node: Node exposing ``children``, ``type``, ``start_point`` and
            ``end_point`` attributes.
        index_to_code: Mapping from ``(start_point, end_point)`` to a 2-item
            sequence whose second item is the token text (the first item is
            ignored here).

    Returns:
        List of ``(start_point, end_point)`` tuples in pre-order.

    Raises:
        KeyError: If a token span is missing from ``index_to_code``.
    """
    variable_spans = []
    pending = [root_node]
    while pending:
        node = pending.pop()
        kids = node.children
        if not kids or node.type == 'string' or 'comment' in node.type:
            span = (node.start_point, node.end_point)
            _, text = index_to_code[span]
            # Skip nodes whose type is literally their own text.
            if node.type != text:
                variable_spans.append(span)
        else:
            # Push right-to-left so the leftmost child is popped first.
            pending.extend(reversed(kids))
    return variable_spans

def _slice_utf8_columns(line, start, stop):
    """Slice ``line`` using column offsets counted in UTF-8 bytes."""
    if line.isascii():
        # One byte per character: byte and character offsets coincide.
        return line[start:stop]
    # errors='replace' keeps this from raising if an offset happens to fall
    # inside a multi-byte character.
    return line.encode('utf8')[start:stop].decode('utf8', errors='replace')


def index_to_code_token(index, code):
    """Return the source text covered by a ``(start_point, end_point)`` span.

    Each point is ``(row, column)``. Columns are interpreted as offsets into
    the UTF-8 encoding of the line, which is how tree-sitter reports them
    when the parser is fed UTF-8 bytes; slicing the ``str`` directly would
    pick the wrong characters on any line containing non-ASCII text.

    Args:
        index: Pair ``(start_point, end_point)``.
        code: List of source lines (``str``), indexed by row.

    Returns:
        The token text. For a span covering several rows, the pieces from
        each row are joined with a single space in place of the line break.
    """
    start_point = index[0]
    end_point = index[1]
    start_row, start_col = start_point[0], start_point[1]
    end_row, end_col = end_point[0], end_point[1]

    if start_row == end_row:
        return _slice_utf8_columns(code[start_row], start_col, end_col)

    # Tail of the first row, every full row in between, head of the last row.
    pieces = [_slice_utf8_columns(code[start_row], start_col, None)]
    pieces.extend(code[start_row + 1:end_row])
    pieces.append(_slice_utf8_columns(code[end_row], None, end_col))
    return " ".join(pieces)


def tokenize_code(parser, context):
    """Parse ``context`` with ``parser`` and return its tokens as one string.

    The source is encoded as UTF-8 for parsing, the token spans of the
    resulting tree are collected with ``tree_to_token_index``, and each span
    is turned back into text with ``index_to_code_token``.

    Args:
        parser: Object whose ``parse(bytes)`` returns a tree with a
            ``root_node`` attribute.
        context: Source code as a string.

    Returns:
        The token texts joined by single spaces.
    """
    tree = parser.parse(bytes(context, 'utf8'))
    spans = tree_to_token_index(tree.root_node)
    source_lines = context.split('\n')
    return " ".join(index_to_code_token(span, source_lines) for span in spans)



def run_token(context):
    """Tokenize ``context`` using the module-level ``parser``.

    Delegates to ``tokenize_code`` so the tokenization logic lives in one
    place instead of being duplicated here.

    Args:
        context: Source code as a string.

    Returns:
        The token texts joined by single spaces.
    """
    return tokenize_code(parser, context)




def _retokenize_jsonl(src_path, dst_path):
    """Re-tokenize the ``chatgpt_answer`` field of every record in a JSONL file.

    For each record the original answer is kept under ``chatgpt_answer_raw``
    and ``chatgpt_answer`` is replaced by ``run_token`` applied to it. All
    records are processed before anything is written, so a failure part-way
    through leaves no partial output file.

    Args:
        src_path: Path of the JSONL file to read.
        dst_path: Path of the JSONL file to write (overwritten if present).
    """
    with open(src_path, encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # json.loads would otherwise reject.
        records = [json.loads(line) for line in f if line.strip()]

    for record in records:
        raw_answer = record["chatgpt_answer"]
        record["chatgpt_answer_raw"] = raw_answer
        record["chatgpt_answer"] = run_token(context=raw_answer)

    with open(dst_path, "w", encoding="utf-8") as f:
        # Records are newline-separated with no trailing newline.
        f.write("\n".join(json.dumps(record) for record in records))


if __name__ == "__main__":
    # import sys
    # inx = sys.argv[-1]
    # (input, output) pairs. NOTE(review): the inputs say "concde" while the
    # outputs say "concode" -- kept exactly as found; confirm this is intended.
    jobs = [
        (
            '/data3/fse2023_dataset/NL-CCD/extract_rm_languagemodel/q=gpt-3.5-turbo,temp=0.01,topp=1.0,task=concde_dev,m=gpt-3.5-turbo,formated=true.jsonl',
            '/data3/fse2023_dataset/NL-CCD/extract_rm_languagemodel/q=gpt-3.5-turbo,temp=0.01,topp=1.0,task=concode_dev,m=gpt-3.5-turbo,formated=true.jsonl',
        ),
        (
            '/data3/fse2023_dataset/NL-CCD/extract_rm_languagemodel/q=gpt-3.5-turbo,temp=0.01,topp=1.0,task=concde_train,m=gpt-3.5-turbo,formated=true.jsonl',
            '/data3/fse2023_dataset/NL-CCD/extract_rm_languagemodel/q=gpt-3.5-turbo,temp=0.01,topp=1.0,task=concode_train,m=gpt-3.5-turbo,formated=true.jsonl',
        ),
    ]
    for inx, outx in jobs:
        _retokenize_jsonl(inx, outx)


    