#
import json 
import re 

def template_system(lang):
    """Build the system message that primes the model to answer like a Stack Overflow post.

    Args:
        lang: Currently unused; kept so existing ``template_system(...)`` calls
            keep working.

    Returns:
        A new chat-message dict with the keys ``"role"``, ``"content"`` and
        ``"name"`` (in that order). A fresh dict is created on every call.
    """
    return {
        "role": "system",
        "content": (
            "I want you to act as a stackoverflow post."
            " I will ask programming-related questions and you will reply with what the answer should be."
            " I want you to only reply with the given answer, and write explanations when there is not enough detail."
        ),
        # Author's note kept from the original: "do not write explanations."
        # (presumably an alternative ending for the prompt above).
        "name": "system",
    }

# '''
# Generated by ChatGPT with the prompt: "There is a website stackexchange.com; can you list some subdomains of this website that relate to programming languages?
# List at least 30."
# '''
# content_domians="""
# stackoverflow.com - General programming questions.
# codereview.stackexchange.com - Code review for improving existing code.
# codegolf.stackexchange.com - Competitive programming challenges.
# softwareengineering.stackexchange.com - Software engineering and design questions.
# devops.stackexchange.com - DevOps practices and tools questions.
# security.stackexchange.com - Information security questions.
# datascience.stackexchange.com - Data science and machine learning questions.
# computergraphics.stackexchange.com - Computer graphics and rendering questions.
# math.stackexchange.com - Mathematics questions (often related to programming).
# electronics.stackexchange.com - Electronics and electrical engineering questions.
# crypto.stackexchange.com - Cryptography-related questions.
# reverseengineering.stackexchange.com - Reverse engineering and exploitation questions.
# webmasters.stackexchange.com - Web development and management questions.
# gamedev.stackexchange.com - Game development questions.
# blender.stackexchange.com - Questions about the Blender 3D creation suite.
# emacs.stackexchange.com - Questions about the Emacs text editor.
# tex.stackexchange.com - LaTeX typesetting system questions.
# ethereum.stackexchange.com - Questions about the Ethereum blockchain platform.
# android.stackexchange.com - Android operating system-related questions.
# apple.stackexchange.com - Apple hardware and software questions.
# iot.stackexchange.com - Internet of Things (IoT) questions.
# mathematica.stackexchange.com - Questions about the Mathematica computational software.
# arduino.stackexchange.com - Questions about the Arduino microcontroller platform.
# webapps.stackexchange.com - Web applications and their usage.
# sharepoint.stackexchange.com - Questions about the SharePoint collaboration platform.
# salesforce.stackexchange.com - Questions about the Salesforce customer relationship management platform.
# wordpress.stackexchange.com - Questions about the WordPress content management system.
# magento.stackexchange.com - Questions about the Magento e-commerce platform.
# craftcms.stackexchange.com - Questions about the Craft CMS (content management system).
# """
# content_domians = [(x.split("-")[0]).strip() for x in content_domians.split("\n") if len(x.strip())>0 ]



# Site prefixes whose posts are filtered out by is_in_nonenglish_domain().
# Most are language sites; a few (e.g. "politics", "puzzling") are topical
# sites that the author also chose to exclude.
_language_domain = [
    "spanish",
    "russian",
    "portuguese",
    "ukrainian",
    "latin",
    "korean",
    "japanese",
    "ja",
    "rus",
    "ru",
    "italian",
    "german",
    "french",
    "chinese",
    "hinduism",
    "politics",
    "hermeneutics",
    "puzzling",
    "linguistics",
    "judaism",
    "es",
]

# Full host names: every prefix under stackexchange.com first, followed by
# every prefix under stackoverflow.com.
language_domain = [
    prefix + suffix
    for suffix in (".stackexchange.com", ".stackoverflow.com")
    for prefix in _language_domain
]

def is_in_nonenglish_domain(task_id):
    """Tell whether *task_id* belongs to one of the hosts in ``language_domain``.

    The host name is rebuilt from the text that precedes the first
    ``stackexchange_`` / ``stackoverflow_`` marker in the id (presumably ids
    look like ``ru.stackoverflow_123`` -- verify against the dataset).

    Args:
        task_id: Record identifier string.

    Returns:
        ``True`` when the rebuilt host is listed in ``language_domain``,
        ``False`` otherwise. Ids containing ``superuser_``, ``askubuntu_`` or
        ``mathoverflow_`` are always reported as ``False``.
    """
    # These sites are never filtered, whatever else the id contains.
    exempt_markers = ("superuser_", "askubuntu_", "mathoverflow_")
    if any(marker in task_id for marker in exempt_markers):
        return False

    # Anything that does not mention "stackexchange" is treated as a
    # stackoverflow id.
    network = "stackexchange" if "stackexchange" in task_id else "stackoverflow"

    # Keep what comes before "<network>_" (the whole id if the marker is
    # absent) and append the real host suffix.
    site_prefix = task_id.partition(network + "_")[0]
    host = site_prefix + network + ".com"
    return host in language_domain


def build_template_user(content):
    """Wrap *content* in a chat-message dict attributed to the user.

    Args:
        content: Value stored under the ``"content"`` key, unchanged.

    Returns:
        A new dict with the keys ``"role"``, ``"content"`` and ``"name"``
        (in that order); role and name are both ``"user"``.
    """
    return {"role": "user", "content": content, "name": "user"}


def build_zeroshot(raw_item):
    """Extract the human-written answer from one raw dataset record.

    Args:
        raw_item: Mapping with a ``"raw"`` string. Presumably this is the full
            post text in which the answer is introduced by an ``A:`` marker --
            TODO confirm against the dump.

    Returns:
        ``{"human_answer": text}`` where ``text`` is whatever follows the last
        ``A:`` marker in ``raw_item["raw"]``, or the whole string when no
        marker is present.

    Raises:
        KeyError: If ``raw_item`` has no ``"raw"`` entry.
    """
    # NOTE: an earlier version also read raw_item["data"] and stripped its
    # "Q:" markers to feed a chat-style "input" field (system + user message).
    # That field is disabled, so the unused computation was removed; records
    # without a "data" key no longer fail with KeyError.
    raw_text = raw_item["raw"]

    # re.split returns [raw_text] when the marker is absent, so taking the
    # last piece covers both the "found" and "not found" cases.
    # NOTE(review): the pattern is not anchored to a line start, so an "A:"
    # inside the answer itself (e.g. "DATA:") also splits -- confirm this is
    # acceptable for the dataset.
    human_answer = re.split(r'\n*A:\n*', raw_text)[-1]

    return {"human_answer": human_answer}

import random 

# Input dump: one JSON object per line.
p="/home/wj_cuda113/wj_code/dl_chatgpt/test_download/archive_stackexchange.jsonl"

# Read through a context manager so the handle is closed deterministically,
# decode explicitly as UTF-8, and skip blank lines (json.loads would raise on
# them).
with open(p, encoding="utf-8") as f_in:
    dt_test = [json.loads(line) for line in f_in if line.strip()]
print ("total.pre ", len(dt_test) )

# Only records that carry a "raw" field can yield a human answer.
dt_test = [item for item in dt_test if "raw" in item]
print ("total.filter ", len(dt_test) )

# Unseeded shuffle: the output order differs from run to run.
random.shuffle(dt_test)


# features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
# NOTE(review): the field list above does not match the keys written below; it
# looks like a leftover from another dataset script -- confirm before relying
# on it.
app_list = []
for item in dt_test:
    task_id = item["id"]
    # Drop records from the hosts listed in language_domain.
    if is_in_nonenglish_domain(task_id):
        continue

    item_str = build_zeroshot(raw_item=item)
    # The record id is stored twice, as "task_id" and as "sql_id".
    item_ret = {"task_id": task_id, **item_str, "sql_id": task_id}

    app_list.append(json.dumps(item_ret))

# Debug alternative kept from the original: write to "/tmp/xxx.jsonl" instead.

# One JSON document per line, with no trailing newline after the last record.
with open ("/data3/icse_dataset/wj_build_prompt_data/archive_stackexchange.jsonl" ,"w", encoding="utf-8") as f :
    f.write("\n".join(app_list))
        




    
    




