import os 
import json 
import pandas as pd 

# Input file: parsed below as one JSON document per line.
p = "/tmp/final_list5.txt"

# Stream the file line by line instead of materialising it with readlines().
# Whitespace-only lines (e.g. a trailing empty line at EOF) are skipped,
# because json.loads raises JSONDecodeError on an empty string.
# The encoding is pinned so parsing does not depend on the platform default.
with open(p, encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# One DataFrame row per parsed record.
df = pd.DataFrame(data)

def xapply(row):
    """Return the ratio of ``row["filter_c"]`` to ``row["raw"]``.

    Intended for ``DataFrame.apply(..., axis=1)``, but works with any
    mapping-like object that supports ``row[key]``.

    Args:
        row: Record providing the ``"raw"`` and ``"filter_c"`` values.

    Returns:
        float: ``filter_c / raw``. ``0.0`` when ``filter_c`` is zero
        (regardless of ``raw``). NaN when ``raw`` is zero but ``filter_c``
        is not, because the ratio is undefined in that case.
    """
    raw = row["raw"]
    filter_c = row["filter_c"]

    # Checked first so a row where both values are zero keeps a rate of 0.
    if filter_c == 0:
        return 0.0

    # Undefined ratio: plain ints would raise ZeroDivisionError here and
    # numpy scalars would yield inf with a RuntimeWarning.
    if raw == 0:
        return float("nan")

    return float(filter_c / raw)
def yapply(row):
    """Return the ratio of ``row["filter_c_parse"]`` to ``row["raw"]``.

    Intended for ``DataFrame.apply(..., axis=1)``, but works with any
    mapping-like object that supports ``row[key]``.

    Args:
        row: Record providing the ``"raw"`` and ``"filter_c_parse"`` values.

    Returns:
        float: ``filter_c_parse / raw``. ``0.0`` when ``filter_c_parse`` is
        zero (regardless of ``raw``). NaN when ``raw`` is zero but
        ``filter_c_parse`` is not, because the ratio is undefined in that
        case.
    """
    raw = row["raw"]
    filter_c = row["filter_c_parse"]

    # Checked first so a row where both values are zero keeps a rate of 0.
    if filter_c == 0:
        return 0.0

    # Undefined ratio: plain ints would raise ZeroDivisionError here and
    # numpy scalars would yield inf with a RuntimeWarning.
    if raw == 0:
        return float("nan")

    return float(filter_c / raw)
    
# Keep only rows with mt == "baseline" and q == "writer_com_out".
# The explicit .copy() makes df an independent frame, so the column
# assignments below do not write into a slice of the original frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[(df["mt"] == "baseline") & (df["q"] == "writer_com_out")].copy()

# print(df.columns)
# Per-row ratios computed by the helpers defined above.
df["filter_rate"] = df.apply(xapply, axis=1)
df["parse_rate"] = df.apply(yapply, axis=1)

# print(df["role"].value_counts())

# Summary of the selected rows: shape plus descriptive statistics of both rates.
print(
    df.shape,
    df["filter_rate"].describe(),
    df["parse_rate"].describe(),
)

# Rows whose parse rate is below 0.9, broken down by several columns.
df_f = df[df["parse_rate"] < 0.9]
print("===============")
print(df_f["q"].value_counts())
print(df_f["m"].value_counts())
print(df_f["role"].value_counts())

print(df_f["task"].value_counts())


# # exit()
# df_f = df_f.sort_values(by=["filter_rate"])
# df_f_v1 = df_f [ (df_f["q"]=="sapling_ai_out")  & (df_f["m"]=="CodeLlama-34b-Instruct-hf") ] 
#
# print ( df_f_v1["q"].value_counts() )
#
# df_f_v1 = df_f_v1[ ["path","name","raw","filter_c","filter_rate"] ]
#
#
# for k,y in df_f_v1.to_dict(orient="index") .items():
#     # print (k)
#     print (y)
#     print ("======>\n\n") 
#
# # print ( df_f_v1["name"].value_counts() )
#
# exit()
# df_f.to_csv("/tmp/low-quality.csv",index=False )
# # for x,y in df_f.to_dict(orient="index") .items() :
# #     print ( y )
# #
# m_list=list(set(  df_f ["m"].tolist())) 
#
# print (m_list)
#
# for one_m in m_list :
#     df_f_m= df_f[ df_f["m"]==one_m ]
#
#     print ( df_f_m["q"].value_counts() )
#
#     print ( df_f_m["m"].value_counts() )
