diff --git a/distCalc.ipynb b/distCalc.ipynb index 67cac9e6667f08eefb024c93b7d041f1dd2d4bdc..6066c1061263539ee650e82f2979e08d25ff1f27 100644 --- a/distCalc.ipynb +++ b/distCalc.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 14, "id": "af419e44-d6ef-41f7-970c-78c316aeb712", "metadata": { "tags": [] @@ -133,38 +133,42 @@ " found_new_columns = np.setdiff1d(new_columns, next_col) # Colunas novas encontradas pelo algoritmo\n", " no_data_columns = np.setdiff1d(base_columns, prev_col) # Colunas que não receram dados encontradas pelo algoritmo\n", "\n", - " # Calcula resultados ========================\n", + " # ========== CALCULA ACURACIAS ========== \n", " acertos_p = 0\n", + " acertos = 0\n", " for i in range(len(prev_col)):\n", " if prev_col[i] == next_col[i]: \n", " acertos_p += 1\n", " acuracia_matches = acertos_p / len(prev_col)\n", + " acertos += acertos_p\n", " \n", " acertos_p = 0\n", " unionNewColumns = np.union1d(found_new_columns, true_new_columns)\n", " for col in unionNewColumns:\n", - " if col in true_new_columns:\n", + " if col in true_new_columns and col in found_new_columns:\n", " acertos_p += 1\n", " if(len(unionNewColumns) > 0):\n", " acuracia_new_columns = acertos_p / len(unionNewColumns)\n", " else:\n", " acuracia_new_columns = 1.0\n", - " \n", + " acertos += acertos_p \n", " \n", " acertos_p = 0\n", " unionEmptyColumns = np.union1d(no_data_columns, base_empty_columns)\n", " for col in unionEmptyColumns:\n", - " if col in base_empty_columns:\n", + " if col in base_empty_columns and col in no_data_columns:\n", " acertos_p += 1\n", " if(len(unionEmptyColumns) > 0):\n", " acuracia_empty_columns = acertos_p / len(unionEmptyColumns)\n", " else:\n", " acuracia_empty_columns = 1.0\n", + " acertos += acertos_p\n", " \n", " soma_acuracia = acuracia_matches * len(prev_col) + acuracia_new_columns * len(unionNewColumns) + acuracia_empty_columns * len(unionEmptyColumns)\n", - " acuracia_total = soma_acuracia / (len(prev_col) + len(unionNewColumns) + len(unionEmptyColumns))\n", + " # acuracia_total = soma_acuracia / (len(prev_col) + len(unionNewColumns) + len(unionEmptyColumns))\n", + " acuracia_total = acertos / len(all_columns)\n", " \n", - " # Adiciona acuracia\n", + " # ========== ADICIONA ACURACIAS ==========\n", " if(stat_column == 'estatistica_f'):\n", " self.stat_f.append([ano, acuracia_total])\n", " self.stat_f_matches.append([ano, acuracia_matches])\n", @@ -228,32 +232,32 @@ " acuracia_novas_colunas = 0\n", " acuracia_colunas_vazias = 0\n", " \n", - "\n", + " # ========== CALCULA ACURACIA TOTAL ==========\n", " # Acurácia matches\n", " acertos = 0\n", " for res in resultados:\n", " if(len(res) == 0):\n", " continue\n", " for i in res:\n", - " if i[0] == i[2] and i[0] not in no_data_columns and i[0] not in found_new_columns and i[2] not in no_data_columns and i[2] not in found_new_columns:\n", + " if i[0] == i[2]:\n", " acertos += 1\n", " break\n", " \n", " # Acurácia novas colunas\n", " for new in found_new_columns:\n", - " if new in true_new_columns and new not in no_data_columns and new not in all_match_columns:\n", + " if new in true_new_columns:\n", " acertos += 1\n", "\n", " # Acurácia colunas vazias\n", " for no_data in no_data_columns:\n", - " if no_data in true_empty_columns and no_data not in found_new_columns and no_data not in all_match_columns:\n", + " if no_data in true_empty_columns:\n", " acertos += 1\n", "\n", " # Acurácia total\n", " acuracia_total = acertos / len(all_columns)\n", " \n", " \n", - " # =========================\n", + " # ========== CALCULA ACURACIA PARCIAL ==========\n", " acertos_p = 0\n", " unionNewColumns = np.union1d(found_new_columns, true_new_columns)\n", " if len(unionNewColumns) > 0:\n", @@ -286,8 +290,8 @@ " break\n", " \n", " acuracia_matches = acertos_p / len(prev_col)\n", - " soma_acuracia = acuracia_matches * results_len + acuracia_new_columns * len(unionNewColumns) + acuracia_empty_columns * len(unionEmptyColumns)\n", - " acuracia_total = soma_acuracia / (results_len + len(unionNewColumns) + len(unionEmptyColumns))\n", + " # soma_acuracia = acuracia_matches * results_len + acuracia_new_columns * len(unionNewColumns) + acuracia_empty_columns * len(unionEmptyColumns)\n", + " # acuracia_total = soma_acuracia / (results_len + len(unionNewColumns) + len(unionEmptyColumns))\n", " \n", " # print(ano)\n", " # print(f'{acuracia_matches} matches')\n", @@ -337,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 15, "id": "26287a6f-5537-4509-a09d-52dd59b3a76d", "metadata": { "tags": [] @@ -383,7 +387,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "id": "f9541a11-c1bf-4318-847a-100917e13204", "metadata": { "tags": [] @@ -412,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 17, "id": "527ff27d-f321-4749-a94d-dd7d824ef682", "metadata": { "tags": [] @@ -486,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 18, "id": "4cb4afc8-6149-40a7-8f77-af06183d4d23", "metadata": { "tags": [] @@ -531,7 +535,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.11.9" } }, "nbformat": 4,