Commit 5c0a23c6 authored by Henrique Varella Ehrenfried

Initial commit

Showing with 1104 additions and 0 deletions
.gitignore 0 → 100644
env/
__pycache__
*.pyc
pairing/
.gitmodules 0 → 100644
[submodule "mapping_protocols"]
path = mapping_protocols
url = git@gitlab.c3sl.ufpr.br:simcaq/mapping_protocols.git
.pylintrc 0 → 100644
[MASTER]
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint.
jobs=1
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# Specify a configuration file.
#rcfile=
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once).You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W"
disable=print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,long-suffix,old-ne-operator,old-octal-literal,import-star-module-level,raw-checker-failed,bad-inline-option,locally-disabled,locally-enabled,file-ignored,suppressed-message,useless-suppression,deprecated-pragma,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,eq-without-hash,div-method,idiv-method,rdiv-method,exception-message-attribute,invalid-str-codec,sys-max-int,bad-python3-import,deprecated-string-function,deprecated-str-translate-call
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=
[REPORTS]
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables error, warning, refactor and
# convention, which contain the number of messages in each category, as well as
# statement, which contains the total number of statements analyzed. This is
# used by the global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
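# As an illustration (not part of the stock config): with 2 errors, 3 warnings
# and no refactor/convention messages across 100 statements, the score is
# 10.0 - ((5*2 + 3) / 100) * 10 = 8.7.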
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages
reports=no
# Activate the evaluation score.
score=yes
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored. Defaults to names
# with a leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,future.builtins
[BASIC]
# Naming hint for argument names
argument-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct argument names
argument-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Naming hint for attribute names
attr-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct attribute names
attr-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Naming hint for class attribute names
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Regular expression matching correct class attribute names
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Naming hint for class names
class-name-hint=[A-Z_][a-zA-Z0-9]+$
# Regular expression matching correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Naming hint for constant names
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Regular expression matching correct constant names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming hint for function names
function-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct function names
function-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# Naming hint for inline iteration names
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
# Regular expression matching correct inline iteration names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Naming hint for method names
method-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct method names
method-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Naming hint for module names
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression matching correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty
# Naming hint for variable names
variable-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct variable names
variable-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string='    '
# Maximum number of characters on a single line.
max-line-length=100
# Maximum number of lines in a module
max-module-lines=1000
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,dict-separator
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[SIMILARITIES]
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
# Minimum lines number of a similarity.
min-similarity-lines=4
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=Table.*
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis).
# It supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[DESIGN]
# Maximum number of arguments for function / method
max-args=5
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Maximum number of boolean expressions in a if statement
max-bool-expr=5
# Maximum number of branch for function / method body
max-branches=12
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of statements in function / method body
max-statements=50
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
[IMPORTS]
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=optparse,tkinter.tix
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
README.md 0 → 100644
# SimCAQ/SMPPIR database administrator #

This repository implements the DatabaseTable class and functions to check the pairing of
the different years inserted into the database. The tool is developed in Python 3 and is
driven by mapping files in CSV format.

For command-line usage, the manage.py CLI can be used without manually invoking the
functions from the Python interpreter.

## Requirements ##

The utility was developed in Python 3 using the SQLAlchemy library, targeting the MonetDB
database. Future versions may be modified for compatibility with other databases, building
on the capabilities of the underlying library.

To install the requirements as used during development, the requirements.txt file can be
used as a reference (using a virtual environment is recommended).

```bash
(env) $ pip install -r requirements.txt
```

The CLI depends on the manage.py module. The remaining dependencies are listed below.

### Requirements for the database interface ###

* pymonetdb
* SQLAlchemy
* sqlalchemy-monetdb

### Requirements for pairing report generation ###

* numpy
* pandas
* xlrd
* XlsxWriter

## Command-line interface ##

The CLI follows the manage.py package convention:

```bash
$ python manage.py [command] [positional arguments] [optional valued arguments]
```

The commands implemented so far are:

* create: creates a table as defined in the mapping protocol.
```bash
$ python manage.py create <table name>
```
The only argument is the table name. The script will look for a mapping protocol with
the same name to obtain the column schema.

* insert: inserts a data file in CSV or similar format into an existing table.
```bash
$ python manage.py insert <path to file> <table name> <year> [--sep separator] [--null null_value]
```
The path to the file must be absolute. The target table must exist and be synchronized
with the corresponding mapping protocol. The default separator is the semicolon (';');
if the source file uses a different separator, specify it with --sep (for example,
--sep \\| for pipe). The default null value is the empty string; if another value is
used, specify it with --null.

* drop: drops a table from the database.
```bash
$ python manage.py drop <table name>
```
The command does not work around foreign keys that point to the table, and the database
may return an error if any exist.

* remap: synchronizes a table with its mapping protocol.
```bash
$ python manage.py remap <table name>
```
This command should be run whenever a mapping protocol is updated. Remapping supports
creating new columns, dropping existing columns, renaming columns and changing column
types. Depending on the table size, primary memory usage can be intense.

* generate_pairing_report: generates pairing reports for year-to-year data comparison.
```bash
$ python manage.py generate_pairing_report [--output xlsx|csv]
```
The reports are created in the pairing folder. If no format is specified, csv is used
(one file is created per table). If xlsx is used, a single file is created with each
table in a separate sheet.

* generate_backup: creates/updates the file monitored for backups.
```bash
$ python manage.py generate_backup
```
The file is created or updated on the machine hosting the production database; the
infrastructure team's backup procedure monitors it to perform the backup.
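For illustration, a typical session combining these commands might look like the
following (file and table names are hypothetical; a `matricula` mapping protocol is
assumed to exist):

```bash
(env) $ python manage.py create matricula
(env) $ python manage.py insert /data/matricula_2015.csv matricula 2015 --sep \|
(env) $ python manage.py generate_pairing_report --output xlsx
```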
database/actions.py 0 → 100644
'''Database manipulation actions - these can be used as models for other modules.'''
import logging

from sqlalchemy import create_engine, MetaData

from database.database_table import gen_data_table, gen_temporary, copy_to_temporary
from mapping_functions import generate_pairing_xlsx, generate_pairing_csv
import settings

ENGINE = create_engine(settings.DATABASE_URI, echo=settings.ECHO)
META = MetaData(bind=ENGINE)

logging.basicConfig(format=settings.LOGGING_FORMAT)
database_table_logger = logging.getLogger('database.database_table')
database_table_logger.setLevel(settings.LOGGING_LEVEL)
sqlalchemy_logger = logging.getLogger('sqlalchemy.engine')
sqlalchemy_logger.setLevel(settings.LOGGING_LEVEL)

def temporary_data(connection, file_name, table, year, offset=2, sep=';', null=''):
    '''Creates a temporary table matching the csv header and bulk-copies the file into it'''
    header = open(file_name, encoding="ISO-8859-9").readline()
    header = header.split(sep)
    columns = table.mount_original_columns(header, year)
    ttable = gen_temporary('t_' + table.name, META, *columns)
    table.set_temporary_primary_keys(ttable, year)
    ttable.create(bind=connection)
    copy_to_temporary(connection, file_name, ttable, offset, sep, null)
    return ttable

def insert(file_name, table, year, offset=2, sep=';', null=''):
    '''Inserts contents of csv in file_name in table using year as index for mapping'''
    table = gen_data_table(table, META)
    with ENGINE.connect() as connection:
        trans = connection.begin()
        ttable = temporary_data(connection, file_name, table, year, offset, sep, null)
        table.insert_from_temporary(connection, ttable, year)
        trans.commit()

def create(table):
    '''Creates table from mapping_protocol metadata'''
    table = gen_data_table(table, META)
    table.create()

def drop(table):
    '''Drops table'''
    table = gen_data_table(table, META)
    table.drop()

def remap(table):
    '''Applies changes made in mapping protocols to the database'''
    table = gen_data_table(table, META)
    table.remap()

def generate_pairing_report(output='csv'):
    '''Generates the pairing reports for year-to-year comparison'''
    if output == 'csv':
        generate_pairing_csv(ENGINE)
    elif output == 'xlsx':
        generate_pairing_xlsx(ENGINE)
    else:
        print('Unsupported output type "{}"'.format(output))

def update_from_file(csv_file, table, year, columns=None, target_list=None,
                     offset=2, sep=';', null=''):
    '''Updates table columns from an input csv file'''
    table = gen_data_table(table, META)
    if columns is None:
        columns = []
    columns = columns + table.columns_from_targets(target_list)
    with ENGINE.connect() as connection:
        trans = connection.begin()
        ttable = temporary_data(connection, csv_file, table, year, offset, sep, null)
        table.update_from_temporary(connection, ttable, year, columns)
        trans.commit()
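As the module docstring suggests, the same actions can be driven directly from Python
instead of the CLI. A minimal sketch (the file path and table name are hypothetical, and
a mapping protocol with the table's name is assumed to exist):

```python
from database import actions

# Create the table from its mapping protocol, then load one census year.
actions.create('matricula')
actions.insert('/data/matricula_2015.csv', 'matricula', '2015', sep='|')

# Rebuild the year-to-year pairing reports in xlsx format.
actions.generate_pairing_report(output='xlsx')
```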
database/protocol.py 0 → 100644
''' Routines related to column dictionary generation.
Names commonly used:
- original columns: columns as they are named in the original database;
- target columns: columns as named internally in the project;
- dbcolumns: columns as named in the database.'''
import pandas as pd

standard_columns = {
    'description': 'Novo Rótulo',
    'target_name': 'Var.Lab',
    'standard_name': 'Rot.Padrão',
    'database_name': 'Nome Banco',
    'data_type': 'Tipo de Dado'
}

class Protocol():
    ''' Protocol for table translation'''
    def __init__(self, in_file=None, columns=None):
        self._dataframe = None
        self._remaped = None
        self.columns = standard_columns.copy()
        if in_file:
            self.load_csv(in_file, columns)

    def load_csv(self, in_file, columns=None):
        ''' Loads a mapping protocol csv into the internal dataframe '''
        self._dataframe = pd.read_csv(in_file)
        self._dataframe = self._dataframe.fillna('')
        if isinstance(columns, dict):
            for column in columns:
                self.columns[column] = columns[column]
        # Use self.columns so partial overrides and the default mapping both work.
        self._remaped = self._dataframe[self.columns['target_name']]

    def get_targets(self):
        '''Returns the list of targets from the protocol file'''
        return list(self._remaped)

    def target_from_original(self, name, year):
        '''Gets a target column from an original name and a year
        Input example: **{'name': 'TP_COR_RACA', 'year': '2015'}
        output could look like 'CEBMA015N0' '''
        if self._dataframe is None:
            return None
        indexes = self._dataframe[self._dataframe[year] == name].index.tolist()
        if not indexes:
            return None
        if len(indexes) > 1:
            return None
        return self._remaped[indexes[0]]

    def original_from_target(self, name, year):
        '''Gets original column from target column and a year
        Input example: **{'name': 'CEBMA015N0', 'year': '2015'}
        output could look like 'TP_COR_RACA' '''
        if self._dataframe is None:
            return None
        indexes = self._dataframe[self._remaped == name].index.tolist()
        if not indexes:
            return None
        if len(indexes) > 1:
            return None
        return self._dataframe[year][indexes[0]]

    def target_from_dbcolumn(self, name):
        '''Returns the target corresponding to a given dbcolumn'''
        if self._dataframe is None:
            return None
        indexes = self._dataframe[self._dataframe[standard_columns['database_name']]\
                                  == name].index.tolist()
        if not indexes:
            return None
        if len(indexes) > 1:
            return None
        return self._remaped[indexes[0]]

    def dbcolumn_from_target(self, name):
        '''Gets database column from a target column name. Output is a list
        with the column name, type, comment and standard name.
        Input example: **{'name': 'CEBMA015N0'}
        output could look like ['cor_raca_id', 'TINYINT', 'Cor/raça', 'TP_COR_RACA'] '''
        indexes = self._dataframe[self._remaped == name].index.tolist()
        if not indexes or len(indexes) > 1:
            return [None, None, None, None]
        comment = self._dataframe[standard_columns['description']][indexes[0]].strip()
        standard = self._dataframe[standard_columns['standard_name']][indexes[0]].strip()
        column_name = self._dataframe[standard_columns['database_name']][indexes[0]].strip()
        column_type = self._dataframe[standard_columns['data_type']][indexes[0]].strip()
        return [column_name, column_type, comment, standard]

    def remap_from_protocol(self, new_protocol, column_list, reference_year='2015'):
        '''Method to update a mapping protocol from another protocol file'''
        cur_targets = self.get_targets()
        for target in cur_targets:
            original = self.original_from_target(target, reference_year)
            new_target = new_protocol.target_from_original(original, reference_year)
            if new_target and target != new_target:
                print('[' + target + ']', '[' + new_target + ']')
                # Assign through .loc - chained indexing would write to a copy.
                self._dataframe.loc[self._dataframe[self.columns['target_name']] == target,
                                    self.columns['target_name']] = new_target
                self._remaped[self._remaped == target] = new_target
        new_targets = new_protocol.get_targets()
        # Exclude unused targets
        to_exclude = [t for t in cur_targets if t not in new_targets]
        for target in to_exclude:
            indexes = self._dataframe[self._remaped == target].index.tolist()
            self._dataframe = self._dataframe.drop(indexes)
            self._dataframe = self._dataframe.reset_index(drop=True)
            self._remaped = self._remaped.drop(indexes)
            self._remaped = self._remaped.reset_index(drop=True)
        self._dataframe.index = self._remaped
        new_protocol._dataframe.index = new_protocol._remaped
        new_targets = [c for c in list(new_protocol._remaped) if c not in cur_targets]
        new_rows = new_protocol._dataframe.loc[new_targets]
        self._dataframe = pd.concat([self._dataframe, new_rows])
        for column in column_list:
            self._dataframe[column] = new_protocol._dataframe[column]
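As an aside, a short sketch of how a protocol might be queried (the csv path is
hypothetical; the column names are taken from the docstring examples above):

```python
from database.protocol import Protocol

protocol = Protocol(in_file='mapping_protocols/matricula.csv')

# Original census name and a year -> internal target name, and back again.
target = protocol.target_from_original('TP_COR_RACA', '2015')  # e.g. 'CEBMA015N0'
original = protocol.original_from_target(target, '2015')       # 'TP_COR_RACA'

# Target -> [database column, type, comment, standard name].
column_name, column_type, comment, standard = protocol.dbcolumn_from_target(target)
```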
'''Helpers to translate type strings from mapping protocols into SQLAlchemy/MonetDB types.'''
import re

from sqlalchemy_monetdb.types import MONETDB_TYPE_MAP, TINYINT, DOUBLE_PRECISION
from sqlalchemy.ext.compiler import compiles

TYPE_RE = re.compile('[a-z]+')
ARGS_RE = re.compile('\\( *[0-9,.]+ *\\)')

MONETDB_TYPE_MAP['integer'] = MONETDB_TYPE_MAP['int']

@compiles(TINYINT)
def compile_tinyint(element, compiler, **kwargs):
    '''Translation for tinyint - not sure if implemented in sqlalchemy_monetdb
    by default'''
    return 'TINYINT'

@compiles(DOUBLE_PRECISION)
def compile_double(element, compiler, **kwargs):
    '''Translation for double - not sure if implemented in sqlalchemy_monetdb
    by default'''
    return 'DOUBLE'

def get_type(in_string):
    '''Returns a remapped type object for a given type string'''
    in_string = in_string.lower()
    in_string = re.sub(' +', ' ', in_string)
    field_type = re.search(TYPE_RE, in_string).group()
    field_type = MONETDB_TYPE_MAP[field_type]
    targs = re.search(ARGS_RE, in_string)
    if targs:
        # Parse the parenthesized arguments, e.g. precision and scale.
        targs = targs.group()
        targs = targs.strip('()')
        targs = targs.split(',')
        targs = [a.strip() for a in targs]
        args = []
        for arg in targs:
            try:
                arg = int(arg)
            except ValueError:
                pass
            finally:
                args.append(arg)
        field_type = field_type(*args)
    else:
        field_type = field_type()
    return field_type
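A quick sketch of get_type in use, assuming 'decimal' and 'int' are keys in
sqlalchemy_monetdb's MONETDB_TYPE_MAP:

```python
# Parenthesized arguments become constructor arguments for the type object.
decimal_type = get_type('DECIMAL(10, 2)')  # instantiated as DECIMAL(10, 2)

# Types without arguments are instantiated with defaults; 'integer' works
# because it was aliased to 'int' above.
integer_type = get_type('INTEGER')
```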
manage.py 0 → 100644
#!/usr/bin/env python3
'''CLI for database module'''
from manager import Manager

import database.actions

manager = Manager()

@manager.command
def insert(csv_file, table, year, sep=';', null=''):
    '''Inserts file in table using a year as index'''
    database.actions.insert(csv_file, table, year, sep=sep, null=null)

@manager.command
def create(table):
    '''Creates table using mapping protocols'''
    database.actions.create(table)

@manager.command
def drop(table):
    '''Drops a table'''
    database.actions.drop(table)

@manager.command
def remap(table):
    '''Synchronizes a table with its mapping protocol'''
    database.actions.remap(table)

@manager.command
def generate_pairing_report(output='csv'):
    '''Generates pairing reports for year-to-year comparison'''
    database.actions.generate_pairing_report(output)

@manager.command
def update_from_file(csv_file, table, year, columns=None, target_list=None, offset=2, sep=';',
                     null=''):
    '''Updates columns of a table from a csv file'''
    if columns:
        columns = columns.split(',')
    if target_list:
        target_list = target_list.split(',')
    database.actions.update_from_file(csv_file, table, year, columns=columns,
                                      target_list=target_list, offset=offset, sep=sep, null=null)

if __name__ == "__main__":
    manager.main()
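The update_from_file command is not yet covered in the README. Following the CLI
convention shown there, and given that --columns and --target_list take comma-separated
values as parsed above, an invocation might look like this (path and column name are
illustrative):

```bash
$ python manage.py update_from_file /data/matricula_2015.csv matricula 2015 --columns cor_raca_id --sep \|
```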
''' This module contains functions to generate pairing reports '''
import os

import pandas as pd

from mapping_functions.table_manipulation import walk_tables
import settings

def generate_pairing_csv(engine):
    '''Generates pairing reports in csv format. Generates a file for each table'''
    for table_name, pairing, _ in walk_tables(engine):
        output_file = table_name + '.csv'
        output_file = os.path.join(settings.PAIRING_OUTPUT_FOLDER, output_file)
        pairing.to_csv(output_file, index=False)

def generate_pairing_xlsx(engine):
    '''Generates pairing reports in xlsx format in a single file, where each sheet corresponds
    to one of the tables'''
    xls_output_name = os.path.join(settings.PAIRING_OUTPUT_FOLDER,
                                   settings.XLS_OUTPUT_FILE_NAME)
    xls_writer = pd.ExcelWriter(xls_output_name, engine='xlsxwriter')
    workbook = xls_writer.book
    merge_format = workbook.add_format({
        'align': 'center',
        'valign': 'vcenter'})
    for table_name, pairing, variable_sizes in walk_tables(engine):
        pairing.to_excel(xls_writer, sheet_name=table_name, index=False)
        worksheet = xls_writer.sheets[table_name]
        current_line = 1
        for size in variable_sizes:
            # Rows spanned by this variable in the sheet (1-based, after the header row).
            line_range = [current_line + 1, current_line + size]
            # A1-style ranges ('A{}:A{}' through 'D{}:D{}') for the first four columns,
            # which hold the variable-level fields to be merged vertically.
            merge_ranges = [chr(65+i) + '{}:' + chr(65+i) + '{}' for i in range(4)]
            merge_ranges = [m.format(*line_range) for m in merge_ranges]
            for i, merge_range in enumerate(merge_ranges):
                worksheet.merge_range(merge_range, pairing.iloc[current_line-1, i], merge_format)
            current_line += size
    xls_writer.save()
mapping_functions/table_manipulation.py 0 → 100644
'''Defines functions to build the pairing reports, without concern for specific output type'''
import os

import numpy as np
import pandas as pd

from database.protocol import Protocol
import settings

VALUES = 'valores'
VALUES_DESCRIPTION = 'descrição'
VARIABLE_DESCRIPTION = 'Descrição'

def get_from_dict_file(file, variable):
    '''Gets dictionary information about a variable'''
    dict_file = pd.read_excel(file)
    dict_file = dict_file.fillna('')
    # The real column headers sit in rows 8/9 of the spreadsheet preamble.
    columns = [(dict_file[c][9] or dict_file[c][8]) for c in dict_file]
    dict_file.columns = columns
    dict_file = dict_file.drop(range(10))
    dict_file = dict_file.reset_index(drop=True)
    variables = [c for c in columns if 'variavel' in c.lower() or 'variável' in c.lower()]
    lde_name = [c for c in variables if 'lde' in c.lower()][0]
    found = False
    for line, row in dict_file.iterrows():
        if row[lde_name].strip() == variable:
            start_line = line
            found = True
            break
    if not found:
        return None
    # Count the value rows that belong to this variable.
    line_count = 1
    while (start_line + line_count) < len(dict_file) and\
          (not dict_file.iloc[start_line + line_count][lde_name]) and\
          dict_file.iloc[start_line + line_count][VALUES]:
        line_count += 1
    line_range = [l+start_line for l in range(line_count)]
    return dict_file.iloc[line_range]

def get_from_attch_file(table_name, year_list, variable, protocol, attachments):
    '''Gets attachment information about a variable, according to the attachment year
    and year_list'''
    attach_dict = {
        'turma': 'Tabela de Turma',
        'matricula': 'Tabela de Matrícula',
        'escola': 'Tabela de Escola',
        'docente': 'Tabela de Docente',
    }
    attachs = pd.DataFrame()
    for attachment in attachments:
        attach_file_location = os.path.join(settings.DICTIONARY_FOLDER, attachment)
        year = attachment[6:10]
        skip = 0
        if int(year) in year_list:
            original_cod = protocol.original_from_target(variable, year)
            attach_file = pd.read_excel(attach_file_location, None)
            # Skip the preamble rows until the header row starting with 'N'.
            for a in attach_file[attach_dict[table_name]].iterrows():
                skip = skip + 1
                if a[0] == 'N':
                    break
            attach_table = pd.read_excel(attach_file_location, skiprows=skip,
                                         sheetname=attach_dict[table_name])
            found = False
            for line, row in attach_table[attach_table['Nome da Variável'] == original_cod].iterrows():
                start_line = line
                found = True
                break
            if not found:
                attach = ''
                continue
            attach = attach_table['N']
            line_count = 1
            while attach.iloc[start_line + line_count] is np.nan:
                line_count += 1
            line_range = [l+start_line for l in range(line_count)]
            attach = attach_table.iloc[line_range]['Categoria'].reset_index(drop=True)
            if pd.isnull(attach).any() or attach.empty:
                attach = ''
            elif attach[0].find('\n') != -1:
                # Categories packed into a single cell, one per line.
                attach = attach[0].split('\n')
                attach = pd.DataFrame(attach)
                attach.index = np.arange(1, len(attach)+1)
                attach.columns = ['Descricao_' + year]
                attachs = pd.concat([attachs, attach], axis=1)
            else:
                attach = pd.DataFrame(attach)
                attach.index = np.arange(1, len(attach)+1)
                attach.columns = ['Descricao_' + year]
                attachs = pd.concat([attachs, attach], axis=1)
    return attachs

def get_year_list(table_name, engine):
    '''Builds the year list from a table using the given engine'''
    response = engine.execute('select distinct ano_censo from {} order by ano_censo'.\
                              format(table_name))
    return [r[0] for r in response.fetchall()]

def get_variable_values(table_name, variable_name, year, engine):
    '''Builds a list with all possible values for a variable from a table using a given engine,
    using only results for the given year. Values are ordered in the database query'''
    response = engine.execute('select distinct {0} from {1} where ano_censo={2} order by {0}'.\
                              format(variable_name, table_name, year))
    return [r[0] for r in response.fetchall() if not (r[0] == '' or r[0] is None)]

def handle_table_field(table_name, field, engine, year_list):
    '''Collects the distinct values of a field for each year, one column per year'''
    variable_content = pd.DataFrame([])
    for year in year_list:
        values = get_variable_values(table_name, field, year, engine)
        content = pd.DataFrame(values, columns=[year], index=values)
        variable_content = pd.concat([variable_content, content], axis=1)
    return variable_content

def assemble_variable_content(table_name, protocol, variable, engine, year_list, attachments):
    '''Builds the variable contents to populate the pairing report for a given variable from
    a given table using the given engine'''
    field_name, data_type, comment, _ = protocol.dbcolumn_from_target(variable)
    if not field_name:
        return None
    field_name = field_name.strip()
    if not field_name:
        return None
    variable_content = handle_table_field(table_name, field_name, engine, year_list)
    dict_file_location = os.path.join(settings.DICTIONARY_FOLDER, table_name + '.xls')
    variable_dict = get_from_dict_file(dict_file_location, variable)
    if variable_dict is not None:
        variable_attachments = get_from_attch_file(table_name, year_list, variable, protocol,
                                                   attachments)
        variable_dict.index = variable_dict[VALUES]
        variable_content = pd.concat([variable_dict[[VALUES, VALUES_DESCRIPTION]],
                                      variable_content], axis=1)
    else:
        variable_attachments = pd.DataFrame([])
    variable_content = variable_content.reset_index(drop=True)
    # The first line of each block lists the original column name for every year.
    original_names_line = ['', '']
    for year in iter(variable_content.columns[2:]):
        original_names_line.append(protocol.original_from_target(variable, str(year)))
    original_names_line = pd.DataFrame(original_names_line, index=variable_content.columns)
    original_names_line = original_names_line.transpose()
    variable_content = pd.concat([original_names_line, variable_content])
    variable_content = variable_content.reset_index(drop=True)
    contents = [variable, field_name, comment, data_type]
    contents = pd.DataFrame(contents, index=['Variável', 'Nome no Banco', 'Comentário', 'Tipo'])
    contents = contents.transpose()
    contents = pd.concat([contents, variable_content, variable_attachments], axis=1)
    return contents

def output_per_variable(table_name, variables, engine, attachments):
    '''Yields the contents for a given variable in a pandas DataFrame. Can be used to iterate a
    variable list (variables) and get formatted output for the report'''
    print(table_name)
    protocol_file = os.path.join(settings.MAPPING_PROTOCOLS_FOLDER, table_name + '.csv')
    protocol = Protocol()
    protocol.load_csv(protocol_file)
    year_list = get_year_list(table_name, engine)
    for variable in variables:
        contents = assemble_variable_content(table_name, protocol, variable, engine, year_list,
                                             attachments)
        if contents is not None:
            yield contents

def walk_tables(engine):
    '''Uses given engine to search for tables that have both a mapping protocol and a dictionary
    in the folders listed in settings.py, then iterates over them to output master DataFrames
    for each of those tables'''
    variables = open(settings.PAIRING_VARIABLE_FILE).read()
    variables = [v for v in variables.split('\n') if v]
    protocols = [f for f in os.listdir(settings.MAPPING_PROTOCOLS_FOLDER) if
                 f.lower().endswith('.csv')]
    dictionaries = [f for f in os.listdir(settings.DICTIONARY_FOLDER) if f.lower().endswith('.xls')]
    attachments = [f for f in os.listdir(settings.DICTIONARY_FOLDER) if f.lower().endswith('.xlsx')]
    for protocol in protocols:
        table_name = os.path.splitext(protocol)[0]
        dictionary = table_name + '.xls'
        if dictionary in dictionaries:
            output_table = pd.DataFrame()
            variable_sizes = []
            for variable_table in output_per_variable(table_name, variables, engine, attachments):
                output_table = pd.concat([output_table, variable_table])
                variable_sizes.append(len(variable_table))
            yield [table_name, output_table, variable_sizes]
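Since walk_tables is a generator, other report formats can be layered on top of it. A
minimal sketch, assuming the same settings.DATABASE_URI used in database/actions.py:

```python
from sqlalchemy import create_engine

from mapping_functions.table_manipulation import walk_tables
import settings

engine = create_engine(settings.DATABASE_URI)
# Each iteration yields one report-ready DataFrame plus the per-variable block sizes.
for table_name, pairing, variable_sizes in walk_tables(engine):
    print(table_name, '-', len(variable_sizes), 'variables,', len(pairing), 'report rows')
```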
requirements.txt 0 → 100644
astroid==1.5.3
decorator==4.0.11
ipython==6.1.0
ipython-genutils==0.2.0
isort==4.2.15
jedi==0.10.2
lazy-object-proxy==1.3.1
manage.py==0.2.10
mccabe==0.6.1
numpy==1.13.0
pandas==0.20.2
pexpect==4.2.1
pickleshare==0.7.4
prompt-toolkit==1.0.14
ptyprocess==0.5.1
Pygments==2.2.0
pylint==1.7.1
pymonetdb==1.0.6
python-dateutil==2.6.0
pytz==2017.2
simplegeneric==0.8.1
six==1.10.0
SQLAlchemy==1.1.13
sqlalchemy-monetdb==0.9.3
traitlets==4.3.2
wcwidth==0.1.7
wrapt==1.10.10
xlrd==1.0.0
XlsxWriter==0.9.8