Skip to content
Snippets Groups Projects
Commit b67484b5 authored by dha21's avatar dha21
Browse files

acuracias melhoradas

parent b172c675
Branches
No related tags found
No related merge requests found
%% Cell type:code id:2c81bc78-04e0-4bad-83ef-380cf3be1610 tags:
``` python
import numpy as np
import pandas as pd
```
%% Cell type:code id:af419e44-d6ef-41f7-970c-78c316aeb712 tags:
``` python
class DistCalc:
def __init__(self):
self.stat_f = []
self.stat_ks = []
self.stat_t = []
self.stat_cohend = []
self.stat_f_matches = []
self.stat_ks_matches = []
self.stat_t_matches = []
self.stat_cohend_matches = []
self.stat_f_new = []
self.stat_ks_new = []
self.stat_t_new = []
self.stat_cohend_new = []
self.stat_f_empty = []
self.stat_ks_empty = []
self.stat_t_empty = []
self.stat_cohend_empty = []
self.stat_f_top3 = []
self.stat_ks_top3 = []
self.stat_t_top3 = []
self.stat_cohend_top3 = []
self.stat_f_matches_top3 = []
self.stat_ks_matches_top3 = []
self.stat_t_matches_top3 = []
self.stat_cohend_matches_top3 = []
self.stat_f_new_top3 = []
self.stat_ks_new_top3 = []
self.stat_t_new_top3 = []
self.stat_cohend_new_top3 = []
self.stat_f_empty_top3 = []
self.stat_ks_empty_top3 = []
self.stat_t_empty_top3 = []
self.stat_cohend_empty_top3 = []
self.years = []
@property
def get_stat_f(self):
return self.stat_f
@property
def get_stat_ks(self):
return self.stat_ks
@property
def get_stat_t(self):
return self.stat_t
@property
def get_stat_cohend(self):
return self.stat_cohend
@property
def get_stat_f_top3(self):
return self.stat_f
@property
def get_stat_ks_top3(self):
return self.stat_ks
@property
def get_stat_t_top3(self):
return self.stat_t
@property
def get_stat_cohend_top3(self):
return self.stat_cohend
@property
def get_years(self):
return self.years
def calc(self, df, stat_column, threshold):
anos = df.ano_coluna1.unique()
self.years = np.union1d(self.years, anos)
# Itera sobre todos os anos
for ano in anos:
# Constroi dataframe do ano
ano_df = df[df.ano_coluna1 == ano]
# Estruturas
base_columns = ano_df.coluna1.unique() # Colunas que ja existiam na base
new_columns = ano_df.coluna2.unique() # Colunas do próximo ano
true_new_columns = np.setdiff1d(new_columns, base_columns) # Colunas que nao existiam na base
base_empty_columns = np.setdiff1d(base_columns, new_columns) # Colunas da base que nao receberam dados
all_columns = np.union1d(base_columns, new_columns) # Todas as colunas possiveis
# Alterar para um dicionario
prev_col = [] # Colunas da base para match
next_col = [] # Colunas do proximo ano para match
# Itera sobre o dataframe
for index, row in ano_df.iterrows():
# Ignora colunas ja selecionadas
if row['coluna1'] in prev_col or row['coluna2'] in next_col:
continue
# Testa treshold
if row[stat_column] > threshold:
break
# Adiciona nas listas
prev_col.append(row['coluna1'])
next_col.append(row['coluna2'])
all_match_columns = np.union1d(prev_col, next_col)
not_match_columns = np.setdiff1d(all_columns, all_match_columns)
found_new_columns = np.setdiff1d(new_columns, next_col) # Colunas novas encontradas pelo algoritmo
no_data_columns = np.setdiff1d(base_columns, prev_col) # Colunas que não receram dados encontradas pelo algoritmo
# Calcula resultados ========================
# ========== CALCULA ACURACIAS ==========
acertos_p = 0
acertos = 0
for i in range(len(prev_col)):
if prev_col[i] == next_col[i]:
acertos_p += 1
acuracia_matches = acertos_p / len(prev_col)
acertos += acertos_p
acertos_p = 0
unionNewColumns = np.union1d(found_new_columns, true_new_columns)
for col in unionNewColumns:
if col in true_new_columns:
if col in true_new_columns and col in found_new_columns:
acertos_p += 1
if(len(unionNewColumns) > 0):
acuracia_new_columns = acertos_p / len(unionNewColumns)
else:
acuracia_new_columns = 1.0
acertos += acertos_p
acertos_p = 0
unionEmptyColumns = np.union1d(no_data_columns, base_empty_columns)
for col in unionEmptyColumns:
if col in base_empty_columns:
if col in base_empty_columns and col in no_data_columns:
acertos_p += 1
if(len(unionEmptyColumns) > 0):
acuracia_empty_columns = acertos_p / len(unionEmptyColumns)
else:
acuracia_empty_columns = 1.0
acertos += acertos_p
soma_acuracia = acuracia_matches * len(prev_col) + acuracia_new_columns * len(unionNewColumns) + acuracia_empty_columns * len(unionEmptyColumns)
acuracia_total = soma_acuracia / (len(prev_col) + len(unionNewColumns) + len(unionEmptyColumns))
# acuracia_total = soma_acuracia / (len(prev_col) + len(unionNewColumns) + len(unionEmptyColumns))
acuracia_total = acertos / len(all_columns)
# Adiciona acuracia
# ========== ADICIONA ACURACIAS ==========
if(stat_column == 'estatistica_f'):
self.stat_f.append([ano, acuracia_total])
self.stat_f_matches.append([ano, acuracia_matches])
self.stat_f_new.append([ano, acuracia_new_columns])
self.stat_f_empty.append([ano, acuracia_empty_columns])
elif(stat_column == 'estatistica_t'):
self.stat_t.append([ano, acuracia_total])
self.stat_t_matches.append([ano, acuracia_matches])
self.stat_t_new.append([ano, acuracia_new_columns])
self.stat_t_empty.append([ano, acuracia_empty_columns])
elif(stat_column == 'estatistica_ks'):
self.stat_ks.append([ano, acuracia_total])
self.stat_ks_matches.append([ano, acuracia_matches])
self.stat_ks_new.append([ano, acuracia_new_columns])
self.stat_ks_empty.append([ano, acuracia_empty_columns])
elif(stat_column == 'estatistica_cohend'):
self.stat_cohend.append([ano, acuracia_total])
self.stat_cohend_matches.append([ano, acuracia_matches])
self.stat_cohend_new.append([ano, acuracia_new_columns])
self.stat_cohend_empty.append([ano, acuracia_empty_columns])
def calcTop3(self, df, stat_column, threshold):
anos = df.ano_coluna1.unique()
# Itera sobre todos os anos
for ano in anos:
# Constroi dataframe do ano
ano_df = df[df.ano_coluna1 == ano]
# Estruturas
base_columns = ano_df.coluna1.unique() # Colunas que ja existiam na base
new_columns = ano_df.coluna2.unique() # Colunas do próximo ano
intersection_columns = np.intersect1d(base_columns, new_columns) # Colunas que possuem match
true_new_columns = np.setdiff1d(new_columns, base_columns) # Colunas que nao existiam na base
true_empty_columns = np.setdiff1d(base_columns, new_columns) # Colunas da base que nao receberam dados
all_columns = np.union1d(base_columns, new_columns) # Todas as colunas possiveis
resultados = [] # Resultados dos matches
prev_col = [] # Colunas da base que tiveram match
next_col = [] # Colunas do proximo ano que tiveram match
# Encontra as top3 novas colunas que mais se encaixam com as colunas base
for col in base_columns:
top3 = ano_df[(ano_df.coluna1 == col) & (ano_df[stat_column] < threshold)].iloc[:3,:]
resultados.append(top3.values)
# Preenche prev_col e next_col
for res in resultados:
for i in res:
prev_col = np.union1d(prev_col, i[0])
next_col = np.union1d(next_col, i[2])
# Determina alguns c
all_match_columns = np.union1d(next_col, prev_col) # Colunas que tiveram algum match
not_match_columns = np.setdiff1d(all_columns, all_match_columns) # Colunas que não tiveram nenhum match
found_new_columns = np.setdiff1d(new_columns, next_col) # Colunas novas encontradas pelo algoritmo
no_data_columns = np.setdiff1d(base_columns, prev_col) # Colunas que não receram dados encontradas pelo algoritmo
# Calcula acurácia
acuracia_matches = 0
acuracia_novas_colunas = 0
acuracia_colunas_vazias = 0
# ========== CALCULA ACURACIA TOTAL ==========
# Acurácia matches
acertos = 0
for res in resultados:
if(len(res) == 0):
continue
for i in res:
if i[0] == i[2] and i[0] not in no_data_columns and i[0] not in found_new_columns and i[2] not in no_data_columns and i[2] not in found_new_columns:
if i[0] == i[2]:
acertos += 1
break
# Acurácia novas colunas
for new in found_new_columns:
if new in true_new_columns and new not in no_data_columns and new not in all_match_columns:
if new in true_new_columns:
acertos += 1
# Acurácia colunas vazias
for no_data in no_data_columns:
if no_data in true_empty_columns and no_data not in found_new_columns and no_data not in all_match_columns:
if no_data in true_empty_columns:
acertos += 1
# Acurácia total
acuracia_total = acertos / len(all_columns)
# =========================
# ========== CALCULA ACURACIA PARCIAL ==========
acertos_p = 0
unionNewColumns = np.union1d(found_new_columns, true_new_columns)
if len(unionNewColumns) > 0:
for col in unionNewColumns:
if col in found_new_columns and col in true_new_columns:
acertos_p += 1
acuracia_new_columns = acertos_p / len(unionNewColumns)
else:
acuracia_new_columns = 1.0
acertos_p = 0
unionEmptyColumns = np.union1d(no_data_columns, true_empty_columns)
if len(unionEmptyColumns) > 0:
for col in unionEmptyColumns:
if col in no_data_columns and col in true_empty_columns:
acertos_p += 1
acuracia_empty_columns = acertos_p / len(unionEmptyColumns)
else:
acuracia_empty_columns = 1.0
acertos_p = 0
results_len = 0
for res in resultados:
if(len(res) == 0):
continue
results_len += 1
for i in res:
if i[0] == i[2]:
acertos_p += 1
break
acuracia_matches = acertos_p / len(prev_col)
soma_acuracia = acuracia_matches * results_len + acuracia_new_columns * len(unionNewColumns) + acuracia_empty_columns * len(unionEmptyColumns)
acuracia_total = soma_acuracia / (results_len + len(unionNewColumns) + len(unionEmptyColumns))
# soma_acuracia = acuracia_matches * results_len + acuracia_new_columns * len(unionNewColumns) + acuracia_empty_columns * len(unionEmptyColumns)
# acuracia_total = soma_acuracia / (results_len + len(unionNewColumns) + len(unionEmptyColumns))
# print(ano)
# print(f'{acuracia_matches} matches')
# print(f'{acuracia_new_columns} new')
# print(f'{acuracia_empty_columns} empty')
# print(f'{acuracia_total} total')
# =========================
# Adiciona acuracia
if(stat_column == 'estatistica_f'):
self.stat_f_top3.append([ano, acuracia_total])
self.stat_f_matches_top3.append([ano, acuracia_matches])
self.stat_f_new_top3.append([ano, acuracia_new_columns])
self.stat_f_empty_top3.append([ano, acuracia_empty_columns])
elif(stat_column == 'estatistica_t'):
self.stat_t_top3.append([ano, acuracia_total])
self.stat_t_matches_top3.append([ano, acuracia_matches])
self.stat_t_new_top3.append([ano, acuracia_new_columns])
self.stat_t_empty_top3.append([ano, acuracia_empty_columns])
elif(stat_column == 'estatistica_ks'):
self.stat_ks_top3.append([ano, acuracia_total])
self.stat_ks_matches_top3.append([ano, acuracia_matches])
self.stat_ks_new_top3.append([ano, acuracia_new_columns])
self.stat_ks_empty_top3.append([ano, acuracia_empty_columns])
elif(stat_column == 'estatistica_cohend'):
self.stat_cohend_top3.append([ano, acuracia_total])
self.stat_cohend_matches_top3.append([ano, acuracia_matches])
self.stat_cohend_new_top3.append([ano, acuracia_new_columns])
self.stat_cohend_empty_top3.append([ano, acuracia_empty_columns])
```
%% Cell type:markdown id:9eaff904-7ee7-45a0-9768-0f21989c65bd tags:
## Import the results for each statistical method
%% Cell type:code id:26287a6f-5537-4509-a09d-52dd59b3a76d tags:
``` python
def _load_results(csv_path, column, offset=None):
    """Read one result CSV and prepare it for matching.

    The statistic in ``column`` is replaced by its absolute value (after
    subtracting ``offset`` when one is given), rows are sorted by year and
    statistic, and any pair involving an ``ANO_CENSO`` column is dropped.
    """
    frame = pd.read_csv(csv_path, sep=',')
    values = frame[column]
    if offset is not None:
        values = values - offset
    frame[column] = values.abs()
    frame = frame.sort_values(by=['ano_coluna1', column])
    keep = (~frame['coluna1'].str.contains('ANO_CENSO')
            & ~frame['coluna2'].str.contains('ANO_CENSO'))
    return frame[keep]


# F results: the statistic is measured as its distance from 1
stat_column = 'estatistica_f'
df_f = _load_results('Testes_R/Result_F/F_subsequente.csv', stat_column, offset=1)

# T results
stat_column = 'estatistica_t'
df_t = _load_results('Testes_R/Result_T/T_subsequente.csv', stat_column)

# Cohen's d results
stat_column = 'estatistica_cohend'
df_c = _load_results('Testes_R/Result_COHEND/COHEND_subsequente.csv', stat_column)

# KS results
stat_column = 'estatistica_ks'
df_ks = _load_results('Testes_R/Result_KS/KS_subsequente.csv', stat_column)
```
%% Cell type:markdown id:e25f4f2d-3fb9-4cfc-8a92-c2e8b887262c tags:
## Calculate the column matches
%% Cell type:code id:f9541a11-c1bf-4318-847a-100917e13204 tags:
``` python
dist = DistCalc()

# (frame, statistic column, threshold) for every statistical method
runs = [
    (df_f, 'estatistica_f', 0.7),
    (df_t, 'estatistica_t', 40),
    (df_c, 'estatistica_cohend', 0.15),
    (df_ks, 'estatistica_ks', 0.10),
]

# Greedy one-to-one matching first, then the top-3 variant
for frame, column, limit in runs:
    dist.calc(frame, column, limit)
for frame, column, limit in runs:
    dist.calcTop3(frame, column, limit)
```
%% Cell type:markdown id:47bcb19b-6aba-4d4a-9de0-4633bfa0eb20 tags:
## Create the result dataframes
%% Cell type:code id:527ff27d-f321-4749-a94d-dd7d824ef682 tags:
``` python
RESULT_COLUMNS = ['ano_base', 'match', 'new', 'empty', 'total']


def _build_result(years, matches, new, empty, total):
    """Assemble one result table from parallel ``[year, accuracy]`` lists.

    The table has one row per entry of ``years`` (the i-th year is paired
    with the i-th entry of each list), followed by a row with the column
    means and a row with the column standard deviations. Both summary rows
    are computed from the per-year rows only; previously the standard
    deviation was taken after the mean row had been appended, so the mean
    row itself was counted as a data point. Values are rounded to 3 places.
    """
    rows = [
        [ano, matches[i][1], new[i][1], empty[i][1], total[i][1]]
        for i, ano in enumerate(years)
    ]
    frame = pd.DataFrame(rows, columns=RESULT_COLUMNS)
    mean_row = frame.mean()
    std_row = frame.std()
    frame.loc[len(frame)] = mean_row
    frame.loc[len(frame)] = std_row
    return frame.round(3)


# ================= KS =================
result_ks = _build_result(
    dist.get_years, dist.stat_ks_matches, dist.stat_ks_new,
    dist.stat_ks_empty, dist.stat_ks)
resultTop3_ks = _build_result(
    dist.get_years, dist.stat_ks_matches_top3, dist.stat_ks_new_top3,
    dist.stat_ks_empty_top3, dist.stat_ks_top3)

# ================= F =================
result_f = _build_result(
    dist.get_years, dist.stat_f_matches, dist.stat_f_new,
    dist.stat_f_empty, dist.stat_f)
resultTop3_f = _build_result(
    dist.get_years, dist.stat_f_matches_top3, dist.stat_f_new_top3,
    dist.stat_f_empty_top3, dist.stat_f_top3)

# ================= COHEN =================
result_cohend = _build_result(
    dist.get_years, dist.stat_cohend_matches, dist.stat_cohend_new,
    dist.stat_cohend_empty, dist.stat_cohend)
resultTop3_cohend = _build_result(
    dist.get_years, dist.stat_cohend_matches_top3, dist.stat_cohend_new_top3,
    dist.stat_cohend_empty_top3, dist.stat_cohend_top3)

# ================= T =================
result_t = _build_result(
    dist.get_years, dist.stat_t_matches, dist.stat_t_new,
    dist.stat_t_empty, dist.stat_t)
resultTop3_t = _build_result(
    dist.get_years, dist.stat_t_matches_top3, dist.stat_t_new_top3,
    dist.stat_t_empty_top3, dist.stat_t_top3)
```
%% Cell type:code id:4cb4afc8-6149-40a7-8f77-af06183d4d23 tags:
``` python
# Write every result table next to the notebook, without the index column
outputs = [
    (result_ks, './result_ks.csv'),
    (resultTop3_ks, './resultTop3_ks.csv'),
    (result_f, './result_f.csv'),
    (resultTop3_f, './resultTop3_f.csv'),
    (result_t, './result_t.csv'),
    (resultTop3_t, './resultTop3_t.csv'),
    (result_cohend, './result_cohend.csv'),
    (resultTop3_cohend, './resultTop3_cohend.csv'),
]
for table, path in outputs:
    table.to_csv(path, index=False)
```
%% Cell type:code id:d0d2606e-2ddb-4752-a101-823af86fec45 tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment