diff --git a/analise_resultados_muriki.ipynb b/analise_resultados_muriki.ipynb index 2ad62a19ea61027fb525d7fad2bda3433df5397a..da43327c1733044ed1686db7b0b886a7455ab21c 100644 --- a/analise_resultados_muriki.ipynb +++ b/analise_resultados_muriki.ipynb @@ -2,9 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "e13d5b4a-95a3-404c-bf01-7b4b48b4d121", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import numpy as np\n", @@ -14,10 +16,11 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "id": "3ee220b5-c632-415e-8a0e-76e85f31b5f1", "metadata": { - "scrolled": true + "scrolled": true, + "tags": [] }, "outputs": [ { @@ -121,7 +124,7 @@ "16 74158 0.066157 1.129997e-22 " ] }, - "execution_count": 4, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -150,9 +153,16 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "id": "ae5ebf5b-a2b3-4d87-af13-62fc187b3ecc", - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true, + "source_hidden": true + }, + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -581,6 +591,286 @@ " print(f'NOVAS COLUNAS: {name}')\n", " print('\\n')" ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "81aa6d2b-582d-4982-ae36-35eb7ea3a004", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2007\n", + "NUM_SALAS:\n", + " NUM_SALAS\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NUM_COMPUTADPR_ALUNO:\n", + " NUM_COMPUTADPR_ALUNO\n", + "NUM_FUNCIONARIOS:\n", + " NUM_FUNCIONARIOS\n", + "NUM_COMPUTADOR_ADM:\n", + " NUM_COMPUTADOR_ADM\n", + "NUM_COMPUTADOR:\n", + " NUM_COMPUTADOR\n", + "NOVAS COLUNAS: []\n", + "2008\n", + "NUM_SALAS:\n", + " NUM_SALAS\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_FUNCIONARIOS:\n", + " 
NUM_FUNCIONARIOS\n", + "NUM_COMPUTADPR_ALUNO:\n", + " NUM_COMPUTADPR_ALUNO\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NUM_COMPUTADOR:\n", + " NUM_COMPUTADOR\n", + "NOVAS COLUNAS: []\n", + "2009\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NUM_SALAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NUM_COMPUTADOR_ADM:\n", + " NUM_COMPUTADOR_ADM\n", + "NUM_FUNCIONARIOS:\n", + " NUM_FUNCIONARIOS\n", + "NUM_COMPUTADOR:\n", + " NUM_COMPUTADOR\n", + "NUM_COMPUTADPR_ALUNO:\n", + " NUM_COMPUTADPR_ALUNO\n", + "NOVAS COLUNAS: []\n", + "2010\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NUM_SALAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NUM_FUNCIONARIOS:\n", + " NUM_FUNCIONARIOS\n", + "NUM_COMPUTADOR_ADM:\n", + " NUM_COMPUTADOR_ADM\n", + "NUM_COMPUTADPR_ALUNO:\n", + " NUM_COMPUTADPR_ALUNO\n", + "NUM_COMPUTADOR:\n", + " NUM_COMPUTADOR\n", + "NOVAS COLUNAS: []\n", + "2011\n", + "NUM_SALAS:\n", + " NUM_SALAS\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_COMPUTADOR:\n", + " NUM_COMPUTADOR\n", + "NUM_COMPUTADPR_ALUNO:\n", + " NUM_COMPUTADPR_ALUNO\n", + "NUM_COMPUTADOR_ADM:\n", + " NUM_COMPUTADOR_ADM\n", + "NUM_FUNCIONARIOS:\n", + " NUM_FUNCIONARIOS\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NOVAS COLUNAS: []\n", + "2012\n", + "NUM_COMPUTADOR_ADM:\n", + " NUM_COMPUTADOR_ADM\n", + "NUM_COMPUTADPR_ALUNO:\n", + " NUM_COMPUTADPR_ALUNO\n", + "NUM_FUNCIONARIOS:\n", + " NUM_FUNCIONARIOS\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_SALAS:\n", + " NUM_SALAS\n", + " NUM_SALAS_UTILIZADAS\n", + "NOVAS COLUNAS: []\n", + "2013\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NUM_SALAS:\n", + " NUM_SALAS\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_COMPUTADOR:\n", + " NUM_COMPUTADOR\n", + "NUM_SOM:\n", + " NUM_TV\n", + " NUM_SOM\n", + "NUM_FUNCIONARIOS:\n", + " 
NUM_FUNCIONARIOS\n", + "NUM_COMPUTADPR_ALUNO:\n", + " NUM_COMPUTADPR_ALUNO\n", + "NUM_DVD:\n", + " NUM_DVD\n", + "NUM_TV:\n", + " NUM_TV\n", + " NUM_SOM\n", + "NOVAS COLUNAS: ['NUM_DVD' 'NUM_SOM' 'NUM_TV']\n", + "2014\n", + "NUM_SALAS:\n", + " NUM_SALAS\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NUM_FUNCIONARIOS:\n", + " NUM_FUNCIONARIOS\n", + "NUM_COMPUTADOR:\n", + " NUM_COMPUTADPR_ALUNO\n", + "NUM_SOM:\n", + " NUM_TV\n", + "NOVAS COLUNAS: ['NUM_TV']\n", + "2015\n", + "NUM_SOM:\n", + " NUM_SOM\n", + "NUM_SALAS:\n", + " NUM_SALAS\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_DVD:\n", + " NUM_DVD\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_FUNCIONARIOS:\n", + " NUM_FUNCIONARIOS\n", + "NUM_TV:\n", + " NUM_TV\n", + "NUM_COMPUTADPR_ALUNO:\n", + " NUM_COMPUTADPR_ALUNO\n", + "NUM_COMPUTADOR:\n", + " NUM_COMPUTADOR\n", + "NOVAS COLUNAS: ['NUM_DVD' 'NUM_SOM' 'NUM_TV']\n", + "2016\n", + "NUM_TV:\n", + " NUM_TV\n", + "NUM_SOM:\n", + " NUM_SOM\n", + " NUM_COMPUTADOR_ADM\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NUM_FUNCIONARIOS:\n", + " NUM_FUNCIONARIOS\n", + "NUM_SALAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NUM_COMPUTADOR:\n", + " NUM_COMPUTADOR\n", + "NUM_COMPUTADPR_ALUNO:\n", + " NUM_COMPUTADPR_ALUNO\n", + "NOVAS COLUNAS: ['NUM_SOM' 'NUM_TV']\n", + "2017\n", + "NUM_FUNCIONARIOS:\n", + " NUM_FUNCIONARIOS\n", + "NUM_SALAS:\n", + " NUM_SALAS\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + " NUM_SALAS\n", + "NOVAS COLUNAS: []\n", + "2018\n", + "NUM_SALAS:\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + "NOVAS COLUNAS: []\n", + "2019\n", + "NUM_SALAS_UTILIZADAS:\n", + " NUM_SALAS_UTILIZADAS\n", + "NUM_SOM:\n", + " NUM_SOM\n", + "NUM_TV:\n", + " NUM_TV\n", + "NUM_DVD:\n", + " NUM_DVD\n", + "NOVAS COLUNAS: ['NUM_DVD' 'NUM_SOM' 
# Per-year summary of column matches between a base year and the next year.
#
# Relies on names defined in earlier notebook cells: `np` (numpy) and `df`, a
# DataFrame exposing at least `ano_coluna1`, `coluna1`, `coluna2` and
# `estatistica_f`.
# NOTE(review): the positional accesses below (i[0], i[2]) assume the base
# column name is the first field of each row and the next-year column name the
# third — confirm against the column order of `df`.
# NOTE(review): the "top 3" is simply the first three rows per base column, so
# it is only the three best matches if `df` is already sorted accordingly.
anos = df.ano_coluna1.unique()
threshold = 0.2

# Iterate over every year present in the data
for ano in anos:
    # Rows of this year whose statistic is below the threshold
    ano_df = df[(df.ano_coluna1 == ano) & (df.estatistica_f < threshold)]

    # Column sets
    base_columns = ano_df.coluna1.unique()  # Columns that already existed in the base
    new_columns = ano_df.coluna2.unique()  # Columns of the next year
    true_new_columns = np.setdiff1d(new_columns, base_columns)  # Columns that did not exist in the base
    base_empty_columns = np.setdiff1d(base_columns, new_columns)  # Base columns that received no data
    all_columns = np.union1d(base_columns, new_columns)  # Every possible column
    resultados = []  # Match results, one array of up to three rows per base column

    # TODO: replace these two accumulators with a dictionary
    prev_col = []  # Base columns that took part in a match
    next_col = []  # Next-year columns that took part in a match

    # Collect the first three candidate rows for each base column
    for col in base_columns:
        top3 = ano_df[ano_df.coluna1 == col]
        resultados.append(top3.iloc[:3, :].values)

    # Accumulate the columns that appear in any kept match
    all_match_columns = []
    not_match_columns = []

    for res in resultados:
        for i in res:
            next_col = np.union1d(next_col, i[2])
            prev_col = np.union1d(prev_col, i[0])

    all_match_columns = np.union1d(next_col, prev_col)
    not_match_columns = np.setdiff1d(all_columns, all_match_columns)
    # Next-year columns that were not picked by any kept match. The original
    # referenced `next_columns`, which is not defined in this cell and only
    # resolved through leftover kernel state; `next_col` is the accumulator
    # built above.
    # NOTE(review): confirm `next_col` is the intended operand.
    found_new_columns = np.setdiff1d(new_columns, next_col)

    # Accuracy: share of base columns whose own name is among their kept matches
    acertos = 0
    for res in resultados:
        for i in res:
            if i[0] == i[2]:
                acertos += 1
                break  # count each base column at most once
    # A year with no rows under the threshold has no base columns; avoid
    # dividing by zero in that case.
    acuracia = acertos / len(base_columns) if len(base_columns) else 0.0

    # Print the results
    print(ano)
    for res in resultados:
        print(f'{res[0][0]}:')
        for i in res:
            print(f'     {i[2]}')
    print(f'NOVAS COLUNAS: {found_new_columns}')
    # print(f'{acuracia}\n')