Commit 5c0a23c6 authored by Henrique Varella Ehrenfried

Initial commit

Showing with 1104 additions and 0 deletions
.gitignore 0 → 100644
env/
__pycache__
*.pyc
pairing/
.gitmodules 0 → 100644
[submodule "mapping_protocols"]
path = mapping_protocols
url = git@gitlab.c3sl.ufpr.br:simcaq/mapping_protocols.git
.pylintrc 0 → 100644
[MASTER]
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint.
jobs=1
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# Specify a configuration file.
#rcfile=
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once).You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W"
disable=print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,long-suffix,old-ne-operator,old-octal-literal,import-star-module-level,raw-checker-failed,bad-inline-option,locally-disabled,locally-enabled,file-ignored,suppressed-message,useless-suppression,deprecated-pragma,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,eq-without-hash,div-method,idiv-method,rdiv-method,exception-message-attribute,invalid-str-codec,sys-max-int,bad-python3-import,deprecated-string-function,deprecated-str-translate-call
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=
[REPORTS]
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables error, warning, refactor and
# convention, which contain the number of messages in each category, as well as
# statement, which contains the total number of statements analyzed. This is
# used by the global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
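# As an illustration (not part of the stock config): with 2 errors, 3 warnings
# and no refactor/convention messages across 100 statements, the score is
# 10.0 - ((5*2 + 3) / 100) * 10 = 8.7.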
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages
reports=no
# Activate the evaluation score.
score=yes
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored. Defaults to names
# with a leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,future.builtins
[BASIC]
# Naming hint for argument names
argument-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct argument names
argument-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Naming hint for attribute names
attr-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct attribute names
attr-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Naming hint for class attribute names
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Regular expression matching correct class attribute names
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Naming hint for class names
class-name-hint=[A-Z_][a-zA-Z0-9]+$
# Regular expression matching correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Naming hint for constant names
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Regular expression matching correct constant names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming hint for function names
function-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct function names
function-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# Naming hint for inline iteration names
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
# Regular expression matching correct inline iteration names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Naming hint for method names
method-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct method names
method-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Naming hint for module names
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression matching correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty
# Naming hint for variable names
variable-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
# Regular expression matching correct variable names
variable-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string='    '
# Maximum number of characters on a single line.
max-line-length=100
# Maximum number of lines in a module
max-module-lines=1000
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,dict-separator
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[SIMILARITIES]
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
# Minimum lines number of a similarity.
min-similarity-lines=4
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=Table.*
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis).
# It supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[DESIGN]
# Maximum number of arguments for function / method
max-args=5
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Maximum number of boolean expressions in a if statement
max-bool-expr=5
# Maximum number of branch for function / method body
max-branches=12
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of statements in function / method body
max-statements=50
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
[IMPORTS]
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=optparse,tkinter.tix
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
README.md 0 → 100644
# SimCAQ/SMPPIR database administrator #

This repository implements the DatabaseTable class and functions to check the pairing of
the different years inserted into the database. The tool is developed in Python 3 and is
driven by mapping files in CSV format.

For command-line usage, the manage.py CLI can be used without manually invoking the
functions from the Python interpreter.

## Requirements ##

The utility was developed in Python 3 using the SQLAlchemy library, targeting the MonetDB
database. Future versions may be modified for compatibility with other databases, building
on the capabilities of the underlying library.

To install the requirements as used during development, the requirements.txt file can be
used as a reference (using a virtual environment is recommended).

```bash
(env) $ pip install -r requirements.txt
```

The CLI depends on the manage.py module. The remaining dependencies are listed below.

### Requirements for the database interface ###

* pymonetdb
* SQLAlchemy
* sqlalchemy-monetdb

### Requirements for pairing report generation ###

* numpy
* pandas
* xlrd
* XlsxWriter

## Command-line interface ##

The CLI follows the manage.py package convention:

```bash
$ python manage.py [command] [positional arguments] [optional valued arguments]
```

The commands implemented so far are:

* create: creates a table as defined in the mapping protocol.
```bash
$ python manage.py create <table name>
```
The only argument is the table name. The script will look for a mapping protocol with
the same name to obtain the column schema.

* insert: inserts a data file in CSV or similar format into an existing table.
```bash
$ python manage.py insert <path to file> <table name> <year> [--sep separator] [--null null_value]
```
The path to the file must be absolute. The target table must exist and be synchronized
with the corresponding mapping protocol. The default separator is the semicolon (';');
if the source file uses a different separator, specify it with --sep (for example,
--sep \\| for pipe). The default null value is the empty string; if another value is
used, specify it with --null.

* drop: drops a table from the database.
```bash
$ python manage.py drop <table name>
```
The command does not work around foreign keys that point to the table, and the database
may return an error if any exist.

* remap: synchronizes a table with its mapping protocol.
```bash
$ python manage.py remap <table name>
```
This command should be run whenever a mapping protocol is updated. Remapping supports
creating new columns, dropping existing columns, renaming columns and changing column
types. Depending on the table size, primary memory usage can be intense.

* generate_pairing_report: generates pairing reports for year-to-year data comparison.
```bash
$ python manage.py generate_pairing_report [--output xlsx|csv]
```
The reports are created in the pairing folder. If no format is specified, csv is used
(one file is created per table). If xlsx is used, a single file is created with each
table in a separate sheet.

* generate_backup: creates/updates the file monitored for backups.
```bash
$ python manage.py generate_backup
```
The file is created or updated on the machine hosting the production database; the
infrastructure team's backup procedure monitors it to perform the backup.
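For illustration, a typical session combining these commands might look like the
following (file and table names are hypothetical; a `matricula` mapping protocol is
assumed to exist):

```bash
(env) $ python manage.py create matricula
(env) $ python manage.py insert /data/matricula_2015.csv matricula 2015 --sep \|
(env) $ python manage.py generate_pairing_report --output xlsx
```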
database/actions.py 0 → 100644
'''Database manipulation actions - these can be used as models for other modules.'''
import logging

from sqlalchemy import create_engine, MetaData

from database.database_table import gen_data_table, gen_temporary, copy_to_temporary
from mapping_functions import generate_pairing_xlsx, generate_pairing_csv
import settings

ENGINE = create_engine(settings.DATABASE_URI, echo=settings.ECHO)
META = MetaData(bind=ENGINE)

logging.basicConfig(format=settings.LOGGING_FORMAT)
database_table_logger = logging.getLogger('database.database_table')
database_table_logger.setLevel(settings.LOGGING_LEVEL)
sqlalchemy_logger = logging.getLogger('sqlalchemy.engine')
sqlalchemy_logger.setLevel(settings.LOGGING_LEVEL)

def temporary_data(connection, file_name, table, year, offset=2, sep=';', null=''):
    '''Creates a temporary table matching the csv header and bulk-copies the file into it'''
    header = open(file_name, encoding="ISO-8859-9").readline()
    header = header.split(sep)
    columns = table.mount_original_columns(header, year)
    ttable = gen_temporary('t_' + table.name, META, *columns)
    table.set_temporary_primary_keys(ttable, year)
    ttable.create(bind=connection)
    copy_to_temporary(connection, file_name, ttable, offset, sep, null)
    return ttable

def insert(file_name, table, year, offset=2, sep=';', null=''):
    '''Inserts contents of csv in file_name in table using year as index for mapping'''
    table = gen_data_table(table, META)
    with ENGINE.connect() as connection:
        trans = connection.begin()
        ttable = temporary_data(connection, file_name, table, year, offset, sep, null)
        table.insert_from_temporary(connection, ttable, year)
        trans.commit()

def create(table):
    '''Creates table from mapping_protocol metadata'''
    table = gen_data_table(table, META)
    table.create()

def drop(table):
    '''Drops table'''
    table = gen_data_table(table, META)
    table.drop()

def remap(table):
    '''Applies changes made in mapping protocols to the database'''
    table = gen_data_table(table, META)
    table.remap()

def generate_pairing_report(output='csv'):
    '''Generates the pairing reports for year-to-year comparison'''
    if output == 'csv':
        generate_pairing_csv(ENGINE)
    elif output == 'xlsx':
        generate_pairing_xlsx(ENGINE)
    else:
        print('Unsupported output type "{}"'.format(output))

def update_from_file(csv_file, table, year, columns=None, target_list=None,
                     offset=2, sep=';', null=''):
    '''Updates table columns from an input csv file'''
    table = gen_data_table(table, META)
    if columns is None:
        columns = []
    columns = columns + table.columns_from_targets(target_list)
    with ENGINE.connect() as connection:
        trans = connection.begin()
        ttable = temporary_data(connection, csv_file, table, year, offset, sep, null)
        table.update_from_temporary(connection, ttable, year, columns)
        trans.commit()
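As the module docstring suggests, the same actions can be driven directly from Python
instead of the CLI. A minimal sketch (the file path and table name are hypothetical, and
a mapping protocol with the table's name is assumed to exist):

```python
from database import actions

# Create the table from its mapping protocol, then load one census year.
actions.create('matricula')
actions.insert('/data/matricula_2015.csv', 'matricula', '2015', sep='|')

# Rebuild the year-to-year pairing reports in xlsx format.
actions.generate_pairing_report(output='xlsx')
```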
database/protocol.py 0 → 100644
''' Routines related to column dictionary generation.
Names commonly used:
- original columns: columns as they are named in the original database;
- target columns: columns as named internally in the project;
- dbcolumns: columns as named in the database.'''
import pandas as pd

standard_columns = {
    'description': 'Novo Rótulo',
    'target_name': 'Var.Lab',
    'standard_name': 'Rot.Padrão',
    'database_name': 'Nome Banco',
    'data_type': 'Tipo de Dado'
}

class Protocol():
    ''' Protocol for table translation'''
    def __init__(self, in_file=None, columns=None):
        self._dataframe = None
        self._remaped = None
        self.columns = standard_columns.copy()
        if in_file:
            self.load_csv(in_file, columns)

    def load_csv(self, in_file, columns=None):
        ''' Loads a mapping protocol csv into the internal dataframe '''
        self._dataframe = pd.read_csv(in_file)
        self._dataframe = self._dataframe.fillna('')
        if isinstance(columns, dict):
            for column in columns:
                self.columns[column] = columns[column]
        # Use self.columns so partial overrides and the default mapping both work.
        self._remaped = self._dataframe[self.columns['target_name']]

    def get_targets(self):
        '''Returns the list of targets from the protocol file'''
        return list(self._remaped)

    def target_from_original(self, name, year):
        '''Gets a target column from an original name and a year
        Input example: **{'name': 'TP_COR_RACA', 'year': '2015'}
        output could look like 'CEBMA015N0' '''
        if self._dataframe is None:
            return None
        indexes = self._dataframe[self._dataframe[year] == name].index.tolist()
        if not indexes:
            return None
        if len(indexes) > 1:
            return None
        return self._remaped[indexes[0]]

    def original_from_target(self, name, year):
        '''Gets original column from target column and a year
        Input example: **{'name': 'CEBMA015N0', 'year': '2015'}
        output could look like 'TP_COR_RACA' '''
        if self._dataframe is None:
            return None
        indexes = self._dataframe[self._remaped == name].index.tolist()
        if not indexes:
            return None
        if len(indexes) > 1:
            return None
        return self._dataframe[year][indexes[0]]

    def target_from_dbcolumn(self, name):
        '''Returns the target corresponding to a given dbcolumn'''
        if self._dataframe is None:
            return None
        indexes = self._dataframe[self._dataframe[standard_columns['database_name']]\
                                  == name].index.tolist()
        if not indexes:
            return None
        if len(indexes) > 1:
            return None
        return self._remaped[indexes[0]]

    def dbcolumn_from_target(self, name):
        '''Gets database column from a target column name. Output is a list
        with the column name, type, comment and standard name.
        Input example: **{'name': 'CEBMA015N0'}
        output could look like ['cor_raca_id', 'TINYINT', 'Cor/raça', 'TP_COR_RACA'] '''
        indexes = self._dataframe[self._remaped == name].index.tolist()
        if not indexes or len(indexes) > 1:
            return [None, None, None, None]
        comment = self._dataframe[standard_columns['description']][indexes[0]].strip()
        standard = self._dataframe[standard_columns['standard_name']][indexes[0]].strip()
        column_name = self._dataframe[standard_columns['database_name']][indexes[0]].strip()
        column_type = self._dataframe[standard_columns['data_type']][indexes[0]].strip()
        return [column_name, column_type, comment, standard]

    def remap_from_protocol(self, new_protocol, column_list, reference_year='2015'):
        '''Method to update a mapping protocol from another protocol file'''
        cur_targets = self.get_targets()
        for target in cur_targets:
            original = self.original_from_target(target, reference_year)
            new_target = new_protocol.target_from_original(original, reference_year)
            if new_target and target != new_target:
                print('[' + target + ']', '[' + new_target + ']')
                # Assign through .loc - chained indexing would write to a copy.
                self._dataframe.loc[self._dataframe[self.columns['target_name']] == target,
                                    self.columns['target_name']] = new_target
                self._remaped[self._remaped == target] = new_target
        new_targets = new_protocol.get_targets()
        # Exclude unused targets
        to_exclude = [t for t in cur_targets if t not in new_targets]
        for target in to_exclude:
            indexes = self._dataframe[self._remaped == target].index.tolist()
            self._dataframe = self._dataframe.drop(indexes)
            self._dataframe = self._dataframe.reset_index(drop=True)
            self._remaped = self._remaped.drop(indexes)
            self._remaped = self._remaped.reset_index(drop=True)
        self._dataframe.index = self._remaped
        new_protocol._dataframe.index = new_protocol._remaped
        new_targets = [c for c in list(new_protocol._remaped) if c not in cur_targets]
        new_rows = new_protocol._dataframe.loc[new_targets]
        self._dataframe = pd.concat([self._dataframe, new_rows])
        for column in column_list:
            self._dataframe[column] = new_protocol._dataframe[column]
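As an aside, a short sketch of how a protocol might be queried (the csv path is
hypothetical; the column names are taken from the docstring examples above):

```python
from database.protocol import Protocol

protocol = Protocol(in_file='mapping_protocols/matricula.csv')

# Original census name and a year -> internal target name, and back again.
target = protocol.target_from_original('TP_COR_RACA', '2015')  # e.g. 'CEBMA015N0'
original = protocol.original_from_target(target, '2015')       # 'TP_COR_RACA'

# Target -> [database column, type, comment, standard name].
column_name, column_type, comment, standard = protocol.dbcolumn_from_target(target)
```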
'''Helpers to translate type strings from mapping protocols into SQLAlchemy/MonetDB types.'''
import re

from sqlalchemy_monetdb.types import MONETDB_TYPE_MAP, TINYINT, DOUBLE_PRECISION
from sqlalchemy.ext.compiler import compiles

TYPE_RE = re.compile('[a-z]+')
ARGS_RE = re.compile('\\( *[0-9,.]+ *\\)')

MONETDB_TYPE_MAP['integer'] = MONETDB_TYPE_MAP['int']

@compiles(TINYINT)
def compile_tinyint(element, compiler, **kwargs):
    '''Translation for tinyint - not sure if implemented in sqlalchemy_monetdb
    by default'''
    return 'TINYINT'

@compiles(DOUBLE_PRECISION)
def compile_double(element, compiler, **kwargs):
    '''Translation for double - not sure if implemented in sqlalchemy_monetdb
    by default'''
    return 'DOUBLE'

def get_type(in_string):
    '''Returns a remapped type object for a given type string'''
    in_string = in_string.lower()
    in_string = re.sub(' +', ' ', in_string)
    field_type = re.search(TYPE_RE, in_string).group()
    field_type = MONETDB_TYPE_MAP[field_type]
    targs = re.search(ARGS_RE, in_string)
    if targs:
        # Parse the parenthesized arguments, e.g. precision and scale.
        targs = targs.group()
        targs = targs.strip('()')
        targs = targs.split(',')
        targs = [a.strip() for a in targs]
        args = []
        for arg in targs:
            try:
                arg = int(arg)
            except ValueError:
                pass
            finally:
                args.append(arg)
        field_type = field_type(*args)
    else:
        field_type = field_type()
    return field_type
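A quick sketch of get_type in use, assuming 'decimal' and 'int' are keys in
sqlalchemy_monetdb's MONETDB_TYPE_MAP:

```python
# Parenthesized arguments become constructor arguments for the type object.
decimal_type = get_type('DECIMAL(10, 2)')  # instantiated as DECIMAL(10, 2)

# Types without arguments are instantiated with defaults; 'integer' works
# because it was aliased to 'int' above.
integer_type = get_type('INTEGER')
```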
manage.py 0 → 100644
#!/usr/bin/env python3
'''CLI for database module'''
from manager import Manager

import database.actions

manager = Manager()

@manager.command
def insert(csv_file, table, year, sep=';', null=''):
    '''Inserts file in table using a year as index'''
    database.actions.insert(csv_file, table, year, sep=sep, null=null)

@manager.command
def create(table):
    '''Creates table using mapping protocols'''
    database.actions.create(table)

@manager.command
def drop(table):
    '''Drops a table'''
    database.actions.drop(table)

@manager.command
def remap(table):
    '''Synchronizes a table with its mapping protocol'''
    database.actions.remap(table)

@manager.command
def generate_pairing_report(output='csv'):
    '''Generates pairing reports for year-to-year comparison'''
    database.actions.generate_pairing_report(output)

@manager.command
def update_from_file(csv_file, table, year, columns=None, target_list=None, offset=2, sep=';',
                     null=''):
    '''Updates columns of a table from a csv file'''
    if columns:
        columns = columns.split(',')
    if target_list:
        target_list = target_list.split(',')
    database.actions.update_from_file(csv_file, table, year, columns=columns,
                                      target_list=target_list, offset=offset, sep=sep, null=null)

if __name__ == "__main__":
    manager.main()
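The update_from_file command is not yet covered in the README. Following the CLI
convention shown there, and given that --columns and --target_list take comma-separated
values as parsed above, an invocation might look like this (path and column name are
illustrative):

```bash
$ python manage.py update_from_file /data/matricula_2015.csv matricula 2015 --columns cor_raca_id --sep \|
```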
''' This module contains functions to generate pairing reports '''
import os

import pandas as pd

from mapping_functions.table_manipulation import walk_tables
import settings

def generate_pairing_csv(engine):
    '''Generates pairing reports in csv format. Generates a file for each table'''
    for table_name, pairing, _ in walk_tables(engine):
        output_file = table_name + '.csv'
        output_file = os.path.join(settings.PAIRING_OUTPUT_FOLDER, output_file)
        pairing.to_csv(output_file, index=False)

def generate_pairing_xlsx(engine):
    '''Generates pairing reports in xlsx format in a single file, where each sheet corresponds
    to one of the tables'''
    xls_output_name = os.path.join(settings.PAIRING_OUTPUT_FOLDER,
                                   settings.XLS_OUTPUT_FILE_NAME)
    xls_writer = pd.ExcelWriter(xls_output_name, engine='xlsxwriter')
    workbook = xls_writer.book
    merge_format = workbook.add_format({
        'align': 'center',
        'valign': 'vcenter'})
    for table_name, pairing, variable_sizes in walk_tables(engine):
        pairing.to_excel(xls_writer, sheet_name=table_name, index=False)
        worksheet = xls_writer.sheets[table_name]
        current_line = 1
        for size in variable_sizes:
            # Rows spanned by this variable in the sheet (1-based, after the header row).
            line_range = [current_line + 1, current_line + size]
            # A1-style ranges ('A{}:A{}' through 'D{}:D{}') for the first four columns,
            # which hold the variable-level fields to be merged vertically.
            merge_ranges = [chr(65+i) + '{}:' + chr(65+i) + '{}' for i in range(4)]
            merge_ranges = [m.format(*line_range) for m in merge_ranges]
            for i, merge_range in enumerate(merge_ranges):
                worksheet.merge_range(merge_range, pairing.iloc[current_line-1, i], merge_format)
            current_line += size
    xls_writer.save()
mapping_functions/table_manipulation.py 0 → 100644
'''Defines functions to build the pairing reports, without concern for specific output type'''
import os

import numpy as np
import pandas as pd

from database.protocol import Protocol
import settings

VALUES = 'valores'
VALUES_DESCRIPTION = 'descrição'
VARIABLE_DESCRIPTION = 'Descrição'

def get_from_dict_file(file, variable):
    '''Gets dictionary information about a variable'''
    dict_file = pd.read_excel(file)
    dict_file = dict_file.fillna('')
    # The real column headers sit in rows 8/9 of the spreadsheet preamble.
    columns = [(dict_file[c][9] or dict_file[c][8]) for c in dict_file]
    dict_file.columns = columns
    dict_file = dict_file.drop(range(10))
    dict_file = dict_file.reset_index(drop=True)
    variables = [c for c in columns if 'variavel' in c.lower() or 'variável' in c.lower()]
    lde_name = [c for c in variables if 'lde' in c.lower()][0]
    found = False
    for line, row in dict_file.iterrows():
        if row[lde_name].strip() == variable:
            start_line = line
            found = True
            break
    if not found:
        return None
    # Count the value rows that belong to this variable.
    line_count = 1
    while (start_line + line_count) < len(dict_file) and\
          (not dict_file.iloc[start_line + line_count][lde_name]) and\
          dict_file.iloc[start_line + line_count][VALUES]:
        line_count += 1
    line_range = [l+start_line for l in range(line_count)]
    return dict_file.iloc[line_range]

def get_from_attch_file(table_name, year_list, variable, protocol, attachments):
    '''Gets attachment information about a variable, according to the attachment year
    and year_list'''
    attach_dict = {
        'turma': 'Tabela de Turma',
        'matricula': 'Tabela de Matrícula',
        'escola': 'Tabela de Escola',
        'docente': 'Tabela de Docente',
    }
    attachs = pd.DataFrame()
    for attachment in attachments:
        attach_file_location = os.path.join(settings.DICTIONARY_FOLDER, attachment)
        year = attachment[6:10]
        skip = 0
        if int(year) in year_list:
            original_cod = protocol.original_from_target(variable, year)
            attach_file = pd.read_excel(attach_file_location, None)
            # Skip the preamble rows until the header row starting with 'N'.
            for a in attach_file[attach_dict[table_name]].iterrows():
                skip = skip + 1
                if a[0] == 'N':
                    break
            attach_table = pd.read_excel(attach_file_location, skiprows=skip,
                                         sheetname=attach_dict[table_name])
            found = False
            for line, row in attach_table[attach_table['Nome da Variável'] == original_cod].iterrows():
                start_line = line
                found = True
                break
            if not found:
                attach = ''
                continue
            attach = attach_table['N']
            line_count = 1
            while attach.iloc[start_line + line_count] is np.nan:
                line_count += 1
            line_range = [l+start_line for l in range(line_count)]
            attach = attach_table.iloc[line_range]['Categoria'].reset_index(drop=True)
            if pd.isnull(attach).any() or attach.empty:
                attach = ''
            elif attach[0].find('\n') != -1:
                # Categories packed into a single cell, one per line.
                attach = attach[0].split('\n')
                attach = pd.DataFrame(attach)
                attach.index = np.arange(1, len(attach)+1)
                attach.columns = ['Descricao_' + year]
                attachs = pd.concat([attachs, attach], axis=1)
            else:
                attach = pd.DataFrame(attach)
                attach.index = np.arange(1, len(attach)+1)
                attach.columns = ['Descricao_' + year]
                attachs = pd.concat([attachs, attach], axis=1)
    return attachs

def get_year_list(table_name, engine):
    '''Builds the year list from a table using the given engine'''
    response = engine.execute('select distinct ano_censo from {} order by ano_censo'.\
                              format(table_name))
    return [r[0] for r in response.fetchall()]

def get_variable_values(table_name, variable_name, year, engine):
    '''Builds a list with all possible values for a variable from a table using a given engine,
    using only results for the given year. Values are ordered in the database query'''
    response = engine.execute('select distinct {0} from {1} where ano_censo={2} order by {0}'.\
                              format(variable_name, table_name, year))
    return [r[0] for r in response.fetchall() if not (r[0] == '' or r[0] is None)]

def handle_table_field(table_name, field, engine, year_list):
    '''Collects the distinct values of a field for each year, one column per year'''
    variable_content = pd.DataFrame([])
    for year in year_list:
        values = get_variable_values(table_name, field, year, engine)
        content = pd.DataFrame(values, columns=[year], index=values)
        variable_content = pd.concat([variable_content, content], axis=1)
    return variable_content

def assemble_variable_content(table_name, protocol, variable, engine, year_list, attachments):
    '''Builds the variable contents to populate the pairing report for a given variable from
    a given table using the given engine'''
    field_name, data_type, comment, _ = protocol.dbcolumn_from_target(variable)
    if not field_name:
        return None
    field_name = field_name.strip()
    if not field_name:
        return None
    variable_content = handle_table_field(table_name, field_name, engine, year_list)
    dict_file_location = os.path.join(settings.DICTIONARY_FOLDER, table_name + '.xls')
    variable_dict = get_from_dict_file(dict_file_location, variable)
    if variable_dict is not None:
        variable_attachments = get_from_attch_file(table_name, year_list, variable, protocol,
                                                   attachments)
        variable_dict.index = variable_dict[VALUES]
        variable_content = pd.concat([variable_dict[[VALUES, VALUES_DESCRIPTION]],
                                      variable_content], axis=1)
    else:
        variable_attachments = pd.DataFrame([])
    variable_content = variable_content.reset_index(drop=True)
    # The first line of each block lists the original column name for every year.
    original_names_line = ['', '']
    for year in iter(variable_content.columns[2:]):
        original_names_line.append(protocol.original_from_target(variable, str(year)))
    original_names_line = pd.DataFrame(original_names_line, index=variable_content.columns)
    original_names_line = original_names_line.transpose()
    variable_content = pd.concat([original_names_line, variable_content])
    variable_content = variable_content.reset_index(drop=True)
    contents = [variable, field_name, comment, data_type]
    contents = pd.DataFrame(contents, index=['Variável', 'Nome no Banco', 'Comentário', 'Tipo'])
    contents = contents.transpose()
    contents = pd.concat([contents, variable_content, variable_attachments], axis=1)
    return contents

def output_per_variable(table_name, variables, engine, attachments):
    '''Yields the contents for a given variable in a pandas DataFrame. Can be used to iterate a
    variable list (variables) and get formatted output for the report'''
    print(table_name)
    protocol_file = os.path.join(settings.MAPPING_PROTOCOLS_FOLDER, table_name + '.csv')
    protocol = Protocol()
    protocol.load_csv(protocol_file)
    year_list = get_year_list(table_name, engine)
    for variable in variables:
        contents = assemble_variable_content(table_name, protocol, variable, engine, year_list,
                                             attachments)
        if contents is not None:
            yield contents

def walk_tables(engine):
    '''Uses given engine to search for tables that have both a mapping protocol and a dictionary
    in the folders listed in settings.py, then iterates over them to output master DataFrames
    for each of those tables'''
    variables = open(settings.PAIRING_VARIABLE_FILE).read()
    variables = [v for v in variables.split('\n') if v]
    protocols = [f for f in os.listdir(settings.MAPPING_PROTOCOLS_FOLDER) if
                 f.lower().endswith('.csv')]
    dictionaries = [f for f in os.listdir(settings.DICTIONARY_FOLDER) if f.lower().endswith('.xls')]
    attachments = [f for f in os.listdir(settings.DICTIONARY_FOLDER) if f.lower().endswith('.xlsx')]
    for protocol in protocols:
        table_name = os.path.splitext(protocol)[0]
        dictionary = table_name + '.xls'
        if dictionary in dictionaries:
            output_table = pd.DataFrame()
            variable_sizes = []
            for variable_table in output_per_variable(table_name, variables, engine, attachments):
                output_table = pd.concat([output_table, variable_table])
                variable_sizes.append(len(variable_table))
            yield [table_name, output_table, variable_sizes]
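Since walk_tables is a generator, other report formats can be layered on top of it. A
minimal sketch, assuming the same settings.DATABASE_URI used in database/actions.py:

```python
from sqlalchemy import create_engine

from mapping_functions.table_manipulation import walk_tables
import settings

engine = create_engine(settings.DATABASE_URI)
# Each iteration yields one report-ready DataFrame plus the per-variable block sizes.
for table_name, pairing, variable_sizes in walk_tables(engine):
    print(table_name, '-', len(variable_sizes), 'variables,', len(pairing), 'report rows')
```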
requirements.txt 0 → 100644
astroid==1.5.3
decorator==4.0.11
ipython==6.1.0
ipython-genutils==0.2.0
isort==4.2.15
jedi==0.10.2
lazy-object-proxy==1.3.1
manage.py==0.2.10
mccabe==0.6.1
numpy==1.13.0
pandas==0.20.2
pexpect==4.2.1
pickleshare==0.7.4
prompt-toolkit==1.0.14
ptyprocess==0.5.1
Pygments==2.2.0
pylint==1.7.1
pymonetdb==1.0.6
python-dateutil==2.6.0
pytz==2017.2
simplegeneric==0.8.1
six==1.10.0
SQLAlchemy==1.1.13
sqlalchemy-monetdb==0.9.3
traitlets==4.3.2
wcwidth==0.1.7
wrapt==1.10.10
xlrd==1.0.0
XlsxWriter==0.9.8