import numpy as np
import pandas as pd


class DistCalc:
    """Score how well a distance statistic pairs columns between datasets.

    Each input row compares ``coluna1`` (a column of the base dataset for
    year ``ano_coluna1``) with ``coluna2`` (a column of the following
    dataset) through one statistic.  A pairing counts as correct when both
    sides carry the same column name.  Results are stored per statistic as
    lists of ``[year, accuracy]`` pairs.
    """

    # Supported statistic columns and the attribute that stores their results.
    _RESULT_ATTRS = {
        'estatistica_f': 'stat_f',
        'estatistica_t': 'stat_t',
        'estatistica_ks': 'stat_ks',
        'estatistica_cohend': 'stat_cohend',
    }

    def __init__(self):
        self.stat_f = []
        self.stat_ks = []
        self.stat_t = []
        self.stat_cohend = []
        self.stat_f_top3 = []
        self.stat_ks_top3 = []
        self.stat_t_top3 = []
        self.stat_cohend_top3 = []
        self.years = []

    @property
    def get_stat_f(self):
        """``[year, accuracy]`` pairs produced by ``calc`` for the F statistic."""
        return self.stat_f

    @property
    def get_stat_ks(self):
        """``[year, accuracy]`` pairs produced by ``calc`` for the KS statistic."""
        return self.stat_ks

    @property
    def get_stat_t(self):
        """``[year, accuracy]`` pairs produced by ``calc`` for the T statistic."""
        return self.stat_t

    @property
    def get_stat_cohend(self):
        """``[year, accuracy]`` pairs produced by ``calc`` for Cohen's d."""
        return self.stat_cohend

    @property
    def get_stat_f_top3(self):
        """``[year, accuracy]`` pairs produced by ``calcTop3`` for the F statistic."""
        return self.stat_f_top3

    @property
    def get_stat_ks_top3(self):
        """``[year, accuracy]`` pairs produced by ``calcTop3`` for the KS statistic."""
        return self.stat_ks_top3

    @property
    def get_stat_t_top3(self):
        """``[year, accuracy]`` pairs produced by ``calcTop3`` for the T statistic."""
        return self.stat_t_top3

    @property
    def get_stat_cohend_top3(self):
        """``[year, accuracy]`` pairs produced by ``calcTop3`` for Cohen's d."""
        return self.stat_cohend_top3

    @property
    def get_years(self):
        """Sorted union of the years seen by every ``calc`` call."""
        return self.years

    def _results_for(self, stat_column, suffix=''):
        """Return the result list for ``stat_column`` or raise ``ValueError``."""
        try:
            attr = self._RESULT_ATTRS[stat_column]
        except KeyError:
            raise ValueError(
                f"unsupported stat_column {stat_column!r}; "
                f"expected one of {sorted(self._RESULT_ATTRS)}"
            ) from None
        return getattr(self, attr + suffix)

    @staticmethod
    def _ratio(hits, total, when_empty):
        """Return ``hits / total``, or ``when_empty`` when ``total`` is zero."""
        return hits / total if total else when_empty

    def calc(self, df, stat_column, threshold):
        """Greedy one-to-one matching accuracy, appended per year.

        Rows are visited in the order they appear in ``df`` (callers are
        expected to pre-sort by the statistic).  A row is accepted when
        neither of its columns is already matched; the scan stops at the
        first candidate row whose statistic exceeds ``threshold``.

        A year's accuracy is ``hits / number of distinct columns``, where a
        hit is an accepted pair with equal names, or an unmatched column
        that truly exists on only one side.

        Args:
            df: frame with ``coluna1``, ``coluna2``, ``ano_coluna1`` and
                ``stat_column`` columns.
            stat_column: one of the keys of ``_RESULT_ATTRS``.
            threshold: largest statistic value still accepted as a match.

        Raises:
            ValueError: if ``stat_column`` is not a supported statistic
                (previously the result was computed and silently dropped).
        """
        results = self._results_for(stat_column)
        anos = df['ano_coluna1'].unique()
        self.years = np.union1d(self.years, anos)

        for ano in anos:
            ano_df = df[df['ano_coluna1'] == ano]

            base_columns = set(ano_df['coluna1'].unique())  # already in the base
            new_columns = set(ano_df['coluna2'].unique())   # present next year
            all_columns = base_columns | new_columns
            # Columns that truly exist on one side only (new or left empty).
            one_sided = base_columns ^ new_columns

            matched_prev = set()
            matched_next = set()
            pairs = []
            rows = zip(ano_df['coluna1'], ano_df['coluna2'], ano_df[stat_column])
            for prev, nxt, stat in rows:
                # Skip columns that already have a partner.
                if prev in matched_prev or nxt in matched_next:
                    continue
                # Stop at the first candidate above the threshold.
                if stat > threshold:
                    break
                matched_prev.add(prev)
                matched_next.add(nxt)
                pairs.append((prev, nxt))

            unmatched = all_columns - (matched_prev | matched_next)
            acertos = sum(1 for prev, nxt in pairs if prev == nxt)
            acertos += len(unmatched & one_sided)

            acuracia = self._ratio(acertos, len(all_columns), 0)
            results.append([ano, acuracia])

    def calcTop3(self, df, stat_column, threshold):
        """Top-3 candidate matching accuracy, appended per year.

        For every base column the first three rows (in ``df`` order) whose
        statistic is strictly below ``threshold`` are its candidates.  The
        year's score is the mean of three accuracies:

        * matches: base columns whose candidates include their own name,
          divided by the number of columns present on both sides;
        * new columns: next-year columns that are nobody's candidate and
          are truly absent from the base, divided by the number predicted;
        * empty columns: base columns without candidates that are truly
          absent next year, divided by the number predicted.

        An accuracy with nothing to find and nothing predicted is 1.0; one
        with nothing predicted while true cases exist is 0.0.  Both cases
        used to raise ``ZeroDivisionError``.  Columns are read by name
        instead of by position in the frame.

        Args:
            df: frame with ``coluna1``, ``coluna2``, ``ano_coluna1`` and
                ``stat_column`` columns.
            stat_column: one of the keys of ``_RESULT_ATTRS``.
            threshold: exclusive upper bound for a candidate's statistic.

        Raises:
            ValueError: if ``stat_column`` is not a supported statistic.
        """
        results = self._results_for(stat_column, '_top3')

        for ano in df['ano_coluna1'].unique():
            ano_df = df[df['ano_coluna1'] == ano]
            below = ano_df[ano_df[stat_column] < threshold]

            base_order = ano_df['coluna1'].unique()
            base_columns = set(base_order)
            new_columns = set(ano_df['coluna2'].unique())
            intersection_columns = base_columns & new_columns
            true_new_columns = new_columns - base_columns
            true_empty_columns = base_columns - new_columns

            # Up to three best next-year candidates for each base column.
            candidates = {
                col: below.loc[below['coluna1'] == col, 'coluna2'].iloc[:3].tolist()
                for col in base_order
            }

            prev_col = {col for col, found in candidates.items() if found}
            next_col = {nxt for found in candidates.values() for nxt in found}
            found_new_columns = new_columns - next_col   # predicted as new
            no_data_columns = base_columns - prev_col    # predicted as empty

            # A hit requires the column on both sides, so an empty
            # intersection means zero hits out of zero: score it as 1.0.
            acertos = sum(1 for col, found in candidates.items() if col in found)
            acuracia_matches = self._ratio(
                acertos, len(intersection_columns), 1.0)

            acuracia_novas_colunas = self._ratio(
                len(found_new_columns & true_new_columns),
                len(found_new_columns),
                0.0 if true_new_columns else 1.0,
            )

            acuracia_colunas_vazias = self._ratio(
                len(no_data_columns & true_empty_columns),
                len(no_data_columns),
                0.0 if true_empty_columns else 1.0,
            )

            acuracia_total = (
                acuracia_matches
                + acuracia_colunas_vazias
                + acuracia_novas_colunas
            ) / 3
            results.append([ano, acuracia_total])
# ## Import the results for each statistical method

def _load_stat_results(path, stat_column, center=0):
    """Load one statistic's CSV and prepare it for ``DistCalc``.

    The statistic is replaced by its absolute distance from ``center``,
    rows are sorted by year and then by that distance, and every pair that
    involves an ``ANO_CENSO`` column is dropped.
    """
    df = pd.read_csv(path, sep=',')
    values = df[stat_column] - center if center else df[stat_column]
    df[stat_column] = values.abs()
    df = df.sort_values(by=['ano_coluna1', stat_column])
    return df[~df['coluna1'].str.contains('ANO_CENSO')
              & ~df['coluna2'].str.contains('ANO_CENSO')]


# F results: the statistic is measured as a distance from 1.
df_f = _load_stat_results(
    'Testes_R/Result_F/F_subsequente.csv', 'estatistica_f', center=1)
# T results
df_t = _load_stat_results(
    'Testes_R/Result_T/T_subsequente.csv', 'estatistica_t')
# Cohen's d results
df_c = _load_stat_results(
    'Testes_R/Result_COHEND/COHEND_subsequente.csv', 'estatistica_cohend')
# KS results
df_ks = _load_stat_results(
    'Testes_R/Result_KS/KS_subsequente.csv', 'estatistica_ks')
# Kept so the notebook namespace matches the previous version of this cell.
stat_column = 'estatistica_ks'


# ## Calculate the column matches

dist = DistCalc()
dist.calc(df_f, 'estatistica_f', 0.7)
dist.calc(df_t, 'estatistica_t', 50)
dist.calc(df_c, 'estatistica_cohend', 0.15)
dist.calc(df_ks, 'estatistica_ks', 0.10)

# NOTE(review): the T threshold is 50 above but 40 here — confirm intended.
dist.calcTop3(df_f, 'estatistica_f', 0.7)
dist.calcTop3(df_t, 'estatistica_t', 40)
dist.calcTop3(df_c, 'estatistica_cohend', 0.15)
dist.calcTop3(df_ks, 'estatistica_ks', 0.10)


# ## Create the result dataframes

def _summary_table(years, stat_results):
    """Build the per-year accuracy table followed by a mean and a std row.

    ``stat_results`` maps an output column name to a list of
    ``[year, accuracy]`` pairs.  Values are joined by year rather than by
    list position, so a statistic lacking a year yields NaN instead of
    shifting the following years.  Mean and std are both computed from the
    per-year rows only; previously the std also included the mean row.
    """
    by_year = {name: dict(pairs) for name, pairs in stat_results.items()}
    rows = [
        [ano] + [by_year[name].get(ano, float('nan')) for name in stat_results]
        for ano in years
    ]
    table = pd.DataFrame(rows, columns=['ano_base', *stat_results])
    mean_row = table.mean()
    std_row = table.std()
    table.loc[len(table)] = mean_row
    table.loc[len(table)] = std_row
    return table


result = _summary_table(dist.get_years, {
    'estatistica_ks': dist.stat_ks,
    'estatistica_f': dist.stat_f,
    'estatistica_t': dist.stat_t,
    'estatistica_cohend': dist.stat_cohend,
})
resultTop3 = _summary_table(dist.get_years, {
    'estatistica_ks': dist.stat_ks_top3,
    'estatistica_f': dist.stat_f_top3,
    'estatistica_t': dist.stat_t_top3,
    'estatistica_cohend': dist.stat_cohend_top3,
})

result.to_csv('./result.csv', index=False)
resultTop3.to_csv('./resultTop3.csv', index=False)
"metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/result.csv b/result.csv new file mode 100644 index 0000000000000000000000000000000000000000..64fc5e77fa06c7170a9fe2a655c5aaaf35046b04 --- /dev/null +++ b/result.csv @@ -0,0 +1,17 @@ +ano_base,estatistica_ks,estatistica_f,estatistica_t,estatistica_cohend +2007.0,1.0,1.0,1.0,1.0 +2008.0,0.6666666666666666,1.0,0.6666666666666666,0.5 +2009.0,0.6666666666666666,1.0,0.3333333333333333,0.3333333333333333 +2010.0,0.6666666666666666,1.0,0.3333333333333333,0.3333333333333333 +2011.0,0.6666666666666666,1.0,0.6666666666666666,0.6666666666666666 +2012.0,0.9411764705882353,0.8235294117647058,0.9411764705882353,0.9411764705882353 +2013.0,0.5294117647058824,0.7647058823529411,0.29411764705882354,0.23529411764705882 +2014.0,0.17647058823529413,0.29411764705882354,0.058823529411764705,0.058823529411764705 +2015.0,1.0,0.8823529411764706,0.7647058823529411,0.7647058823529411 +2016.0,0.8823529411764706,0.7647058823529411,0.47058823529411764,0.47058823529411764 +2017.0,1.0,1.0,0.8823529411764706,0.8823529411764706 +2018.0,0.8571428571428571,0.8571428571428571,0.8571428571428571,0.8571428571428571 +2019.0,0.5769230769230769,0.6538461538461539,0.6153846153846154,0.6153846153846154 +2020.0,0.5,0.34615384615384615,0.3076923076923077,0.2692307692307692 +2013.5,0.7235817403884631,0.813325330132053,0.5851417490072953,0.5662880536830116 +4.031128874149275,0.23007808432257312,0.22895112188004565,0.2785497227870999,0.2856578253359031 diff --git a/resultTop3.csv b/resultTop3.csv new file mode 100644 index 
0000000000000000000000000000000000000000..a43901ac3fa05386cefae5c22a8a13abb523b379 --- /dev/null +++ b/resultTop3.csv @@ -0,0 +1,17 @@ +ano_base,estatistica_ks,estatistica_f,estatistica_t,estatistica_cohend +2007.0,1.0,1.0,1.0,1.0 +2008.0,1.0,1.0,1.0,0.2777777777777778 +2009.0,1.0,1.0,1.0,1.0 +2010.0,1.0,1.0,1.0,1.0 +2011.0,1.0,1.0,1.0,1.0 +2012.0,0.5833333333333334,0.9444444444444445,0.9166666666666666,0.9166666666666666 +2013.0,0.6470588235294118,0.2549019607843137,0.23529411764705885,0.23529411764705885 +2014.0,0.11764705882352942,0.23529411764705885,0.05882352941176471,0.05882352941176471 +2015.0,1.0,0.2745098039215686,0.2549019607843137,0.2549019607843137 +2016.0,1.0,0.2549019607843137,0.23529411764705885,0.23529411764705885 +2017.0,1.0,1.0,1.0,1.0 +2018.0,1.0,1.0,1.0,1.0 +2019.0,0.6947368421052631,0.3317384370015949,0.36437246963562747,0.34941520467836257 +2020.0,0.8974358974358975,0.1923076923076923,0.15384615384615385,0.15384615384615385 +2013.5,0.8528722825162454,0.6777213154922134,0.6585142154027602,0.6058585377470825 +4.031128874149275,0.250339537305587,0.36541467448480874,0.38775010065919086,0.387616877128408