# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""

from __future__ import absolute_import, division, print_function

# from accelerate import Accelerator


import argparse
# import glob
import logging
import os
# import pickle
import random
# import re
# import shutil
# import sys 
# import numpy as np
import torch
# from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
# from torch.utils.data.distributed import DistributedSampler
import json
#
from sklearn.model_selection import train_test_split 
# from tqdm import tqdm, trange
# import multiprocessing

# sys.path.append(".")
# from detector_model import Model
# from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
#                           RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import numpy as np
from datasets import Dataset, concatenate_datasets
import evaluate
import pandas as pd
import torch
from transformers import (
    HfArgumentParser, 
    RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig,
    DataCollatorWithPadding,
    Trainer, TrainingArguments
)


# Module-wide logger; records at INFO level and above are let through.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Presumably consumed by the transformers Weights & Biases reporting
# integration to keep it switched off -- confirm against the installed version.
os.environ["WANDB_DISABLED"] = "true"


def list_field(default=None, metadata=None):
    """Build a dataclass field whose default value is a mutable object such as a list.

    Dataclasses reject mutable defaults passed via ``default=``, so the value is
    supplied through ``default_factory``. Every instance receives its own shallow
    copy of ``default``; handing out the very same object would let a mutation on
    one instance silently change the default seen by all later instances.

    Args:
        default: The default value (typically a list). ``None`` is passed through.
        metadata: Optional metadata mapping forwarded to ``dataclasses.field``.

    Returns:
        The ``dataclasses.Field`` object to assign in the class body.
    """
    import copy  # function-scope import keeps this helper self-contained

    return field(default_factory=lambda: copy.copy(default), metadata=metadata)



def set_seed(seed=42):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and asks cuDNN to use deterministic algorithms.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable name was previously misspelled, so it was never picked up.
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences processes launched after this call (e.g. worker subprocesses).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
# set_seed()



def main1(args: "DataTrainingArguments", training_args: "TrainingArguments"):
    """Fine-tune a RoBERTa binary classifier on human vs. ChatGPT answers.

    Each input file is read as JSON lines. Every record contributes two
    examples: its ``human_answer`` with label 0 and its ``chatgpt_answer`` with
    label 1. The examples are tokenized, and a ``Trainer`` is run with accuracy
    as the evaluation metric.

    Args:
        args: Data/model options. The attributes read here are
            ``train_data_file``, ``eval_data_file``, ``train_size``,
            ``valid_size``, ``model_name_or_path``, ``max_length`` and ``pair``.
        training_args: Hugging Face training arguments handed to ``Trainer``.

    Returns:
        The value returned by ``Trainer.train()``.
    """

    def first_answer(value):
        """Return ``value`` when it is a string, otherwise its first element."""
        return value if isinstance(value, str) else value[0]

    def load_split(file_paths, split, sample_size):
        """Read JSON-lines files and build the labelled dataset for one split.

        ``sample_size`` (when not ``None`` and smaller than the number of
        records) caps the number of records via a seeded random sample.
        """
        if isinstance(file_paths, str):
            file_paths = [file_paths]

        lines = []
        for path in file_paths:
            # JSON text is UTF-8; do not depend on the platform's locale default.
            with open(path, encoding="utf-8") as f:
                # Drop blank lines (e.g. a trailing newline) that json.loads rejects.
                lines.extend(text for text in map(str.strip, f) if text)

        logger.info("the raw size of %s ", len(lines))
        if sample_size is not None:
            if sample_size < len(lines):
                # The "test" part of the split is used as the random sample.
                _, lines = train_test_split(lines, test_size=sample_size, random_state=42)
            logger.info("[%s] split size of %s ", split, len(lines))

        print("finish initial", len(lines))
        logger.info("the final examples size %s ", len(lines))

        human_rows, chatgpt_rows = [], []
        for item in map(json.loads, lines):
            # Pair mode tokenizes example['question'], so the column has to exist.
            # NOTE(review): assumes each record carries a "question" entry shaped
            # like the answers (string or list of strings) -- confirm with the data.
            shared = {"question": first_answer(item["question"])} if args.pair else {}
            human_rows.append({**shared, "answer": first_answer(item["human_answer"]), "labels": 0})
            chatgpt_rows.append({**shared, "answer": first_answer(item["chatgpt_answer"]), "labels": 1})

        human_dataset = Dataset.from_pandas(pd.DataFrame(human_rows))
        chatgpt_dataset = Dataset.from_pandas(pd.DataFrame(chatgpt_rows))
        return concatenate_datasets([human_dataset, chatgpt_dataset])

    train_dataset = load_split(args.train_data_file, "train", args.train_size)
    test_dataset = load_split(args.eval_data_file, "valid", args.valid_size)

    tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    tokenizer_kwargs = dict(max_length=args.max_length, truncation=True)
    if args.pair:
        def tokenize_fn(example):
            return tokenizer(example['question'], example['answer'], **tokenizer_kwargs)
    else:
        def tokenize_fn(example):
            return tokenizer(example['answer'], **tokenizer_kwargs)

    print('Tokenizing and mapping...')
    # Raw text columns are no longer needed once the token ids exist.
    text_columns = ['question', 'answer'] if args.pair else ['answer']
    tokenized_train_dataset = train_dataset.map(tokenize_fn).remove_columns(text_columns)
    tokenized_test_dataset = test_dataset.map(tokenize_fn).remove_columns(text_columns)
    print(tokenized_train_dataset)

    accuracy = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        """Turn (logits, labels) into an accuracy score via arg-max prediction."""
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)
        return accuracy.compute(predictions=predictions, references=labels)

    model = RobertaForSequenceClassification.from_pretrained(args.model_name_or_path, num_labels=2)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Hand the training result back so the caller's return value is not None.
    return trainer.train()



@dataclass
class DataTrainingArguments:
    """Data and model options for the human-vs-ChatGPT answer classifier.

    The fields are exposed as command-line options by ``HfArgumentParser``.
    The random seed is not defined here; it is taken from ``TrainingArguments``.
    """

    # Joined in front of every train/eval file name; "" leaves the names as given.
    root_dir: str = field(
        default="",
        metadata={"help": "Directory that every training and evaluation file name is resolved against."},
    )
    # No usable default exists, so None marks "not supplied".
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "Name or path of the pretrained RoBERTa checkpoint used for the tokenizer and the model."},
    )
    # A fresh empty list per instance (never a shared mutable default).
    train_data_file: List[str] = field(
        default_factory=list,
        metadata={"help": "One or more JSON-lines training files, relative to root_dir."},
    )
    eval_data_file: List[str] = field(
        default_factory=list,
        metadata={"help": "One or more JSON-lines evaluation files, relative to root_dir."},
    )
    # Each sampled line yields two examples (one human answer, one ChatGPT answer).
    train_size: Optional[int] = field(
        default=100000,
        metadata={"help": "Maximum number of training lines to sample; larger inputs are randomly subsampled."},
    )
    valid_size: Optional[int] = field(
        default=10000,
        metadata={"help": "Maximum number of evaluation lines to sample; larger inputs are randomly subsampled."},
    )
    max_length: Optional[int] = field(
        default=512,
        metadata={"help": "Maximum tokenized sequence length; longer inputs are truncated."},
    )

    pair: bool = field(
        default=False,
        metadata={"help": "Tokenize (question, answer) pairs instead of the answer alone."},
    )

           
def main():
    """Parse the command line, validate the inputs and launch fine-tuning.

    Returns:
        Whatever ``main1`` returns.

    Raises:
        ValueError: If no model name/path or no data file was supplied.
        FileNotFoundError: If a training or evaluation file does not exist.
    """
    parser = HfArgumentParser((DataTrainingArguments, TrainingArguments))
    args, training_args = parser.parse_args_into_dataclasses()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Attached to the parsed options object; within this function they are only logged.
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger.warning("device: %s, n_gpu: %s", device, args.n_gpu)

    if not args.model_name_or_path:
        raise ValueError("--model_name_or_path must be provided")

    # A falsy root (unset option) means "use the file names as given".
    root_dir = args.root_dir or ""

    def resolve(file_names, option):
        """Join ``file_names`` onto the root directory and check they exist."""
        if not file_names:
            raise ValueError("--{} needs at least one file".format(option))
        paths = [os.path.join(root_dir, name) for name in file_names]
        missing = [path for path in paths if not os.path.isfile(path)]
        # Explicit exception instead of ``assert``: asserts vanish under ``python -O``.
        if missing:
            raise FileNotFoundError("--{}: file(s) not found: {}".format(option, missing))
        return paths

    args.train_data_file = resolve(args.train_data_file, "train_data_file")
    args.eval_data_file = resolve(args.eval_data_file, "eval_data_file")

    # Set seed
    set_seed(training_args.seed)

    logger.info("Training/evaluation parameters %s", args)
    logger.info("Training/evaluation parameters train%s", training_args)

    return main1(args, training_args)


if __name__ == "__main__":
    # accelerator = Accelerator()

    main()


