From 7a18db035ebe0e763032f7dfe7136376925147b9 Mon Sep 17 00:00:00 2001 From: edvs19 <edvs19@inf.ufpr.br> Date: Wed, 12 Mar 2025 01:11:00 -0300 Subject: [PATCH] adding pessoa and saude --- gen_bronze_10000.py | 145 ------------------ gen_bronze_100000.py | 145 ------------------ gen_bronze_20000.py | 145 ------------------ gen_bronze_200000.py | 145 ------------------ gen_bronze_5000.py | 145 ------------------ gen_bronze_50000.py | 145 ------------------ proj_teste/models/newmodel/new.sql | 37 ----- proj_teste/models/silver/dim_pessoa.sql | 32 ++++ proj_teste/models/silver/dim_saude.sql | 30 ++++ .../fichas_cadastro_individual_silver.sql | 1 + 10 files changed, 63 insertions(+), 907 deletions(-) delete mode 100644 gen_bronze_10000.py delete mode 100644 gen_bronze_100000.py delete mode 100644 gen_bronze_20000.py delete mode 100644 gen_bronze_200000.py delete mode 100644 gen_bronze_5000.py delete mode 100644 gen_bronze_50000.py delete mode 100644 proj_teste/models/newmodel/new.sql create mode 100644 proj_teste/models/silver/dim_pessoa.sql create mode 100644 proj_teste/models/silver/dim_saude.sql diff --git a/gen_bronze_10000.py b/gen_bronze_10000.py deleted file mode 100644 index 7f6d0b7..0000000 --- a/gen_bronze_10000.py +++ /dev/null @@ -1,145 +0,0 @@ -from pyspark.sql import SparkSession -from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, DateType, ArrayType - -# Configuração do Spark para acessar o MinIO -print("Iniciando a configuração do Spark...") -spark = SparkSession.builder \ - .appName("Landig to Bronze") \ - .config("spark.hadoop.fs.s3a.endpoint", "http://minio.minio-cluster.svc.cluster.local:9000") \ - .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.path.style.access", "true") \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ - .config("spark.hadoop.fs.s3a.aws.credentials.provider", 
"org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \ - .config("spark.sql.warehouse.dir", "s3a://landing/warehouse") \ - .getOrCreate() - -# Definição completa do esquema -print("Definindo o esquema dos dados...") -schema = StructType([ - StructField("profissionalCNS", LongType(), True), - StructField("cboCodigo_2002", LongType(), True), - StructField("cnes", LongType(), True), - StructField("ine", LongType(), True), - StructField("dataAtendimento", DateType(), True), - StructField("condicoesDeSaude", StructType([ - StructField("descricaoCausaInternacaoEm12Meses", StringType(), True), - StructField("descricaoOutraCondicao1", StringType(), True), - StructField("descricaoOutraCondicao2", StringType(), True), - StructField("descricaoOutraCondicao3", StringType(), True), - StructField("descricaoPlantasMedicinaisUsadas", ArrayType(StringType()), True), - StructField("doencaRespiratoria", ArrayType(StringType()), True), - StructField("doencaRins", ArrayType(StringType()), True), - StructField("maternidadeDeReferencia", StringType(), True), - StructField("situacaoPeso", StringType(), True), - StructField("statusEhDependenteAlcool", BooleanType(), True), - StructField("statusEhDependenteOutrasDrogas", BooleanType(), True), - StructField("statusEhFumante", BooleanType(), True), - StructField("statusEhGestante", BooleanType(), True), - StructField("statusEstaAcamado", BooleanType(), True), - StructField("statusEstaDomiciliado", BooleanType(), True), - StructField("statusTemDiabetes", BooleanType(), True), - StructField("statusTemDoencaRespiratoria", BooleanType(), True), - StructField("statusTemHanseniase", BooleanType(), True), - StructField("statusTemHipertensaoArterial", BooleanType(), True), - StructField("statusTemTeveCancer", BooleanType(), True), - StructField("statusTemTeveDoencasRins", BooleanType(), True), - StructField("statusTemTuberculose", BooleanType(), True), - StructField("statusTeveAvcDerrame", BooleanType(), True), - 
StructField("statusTeveDoencaCardiaca", BooleanType(), True), - StructField("statusTeveInfarto", BooleanType(), True), - StructField("statusTeveInternadoem12Meses", BooleanType(), True), - StructField("statusUsaOutrasPraticasIntegrativasOuComplementares", BooleanType(), True), - StructField("statusUsaPlantasMedicinais", BooleanType(), True), - StructField("statusDiagnosticoMental", StringType(), True) - ]), True), - StructField("emSituacaoDeRua", StructType([ - StructField("grauParentescoFamiliarFrequentado", StringType(), True), - StructField("higienePessoalSituacaoRua", ArrayType(StringType()), True), - StructField("origemAlimentoSituacaoRua", ArrayType(StringType()), True), - StructField("outraInstituicaoQueAcompanha", StringType(), True), - StructField("quantidadeAlimentacoesAoDiaSituacaoRua", StringType(), True), - StructField("statusAcompanhadoPorOutraInstituicao", BooleanType(), True), - StructField("statusPossuiReferenciaFamiliar", BooleanType(), True), - StructField("statusRecebeBeneficio", BooleanType(), True), - StructField("statusSituacaoRua", BooleanType(), True), - StructField("statusTemAcessoHigienePessoalSituacaoRua", BooleanType(), True), - StructField("statusVisitaFamiliarFrequentemente", BooleanType(), True), - StructField("tempoSituacaoRua", StringType(), True) - ]), True), - StructField("identificacaoUsuarioCidadao", StructType([ - StructField("nomeSocial", StringType(), True), - StructField("município", StringType(), True), - StructField("dataNascimentoCidadao", DateType(), True), - StructField("emailCidadao", StringType(), True), - StructField("nacionalidadeCidadao", StringType(), True), - StructField("nomeCidadao", StringType(), True), - StructField("nomeMaeCidadao", StringType(), True), - StructField("cnsCidadao", LongType(), True), - StructField("cnsResponsavelFamiliar", LongType(), True), - StructField("telefoneCelular", StringType(), True), - StructField("numeroNisPisPasep", LongType(), True), - StructField("paisNascimento", 
StringType(), True), - StructField("racaCorCidadao", StringType(), True), - StructField("sexoCidadao", StringType(), True), - StructField("statusEhResponsavel", BooleanType(), True), - StructField("etnia", StringType(), True), - StructField("nomePaiCidadao", StringType(), True), - StructField("desconheceNomePai", BooleanType(), True), - StructField("dtNaturalizacao", DateType(), True), - StructField("portariaNaturalizacao", StringType(), True), - StructField("dtEntradaBrasil", DateType(), True), - StructField("microarea", LongType(), True), - StructField("stForaArea", BooleanType(), True), - StructField("cpfCidadao", StringType(), True), - StructField("cpfResponsavelFamiliar", StringType(), True) - ]), True), - StructField("InformacoesSocioDemograficas", StructType([ - StructField("deficienciasCidadao", ArrayType(StringType()), True), - StructField("grauInstrucaoCidadao", StringType(), True), - StructField("ocupacao", StringType(), True), - StructField("orientacaoSexualCidadao", StringType(), True), - StructField("relacaoParentescoCidadao", StringType(), True), - StructField("situacaoMercadoTrabalhoCidadao", StringType(), True), - StructField("statusDesejaInformarOrientacaoSexual", BooleanType(), True), - StructField("statusFrequentaBenzedeira", BooleanType(), True), - StructField("statusFrequentaEscola", BooleanType(), True), - StructField("statusMembroPovoComunidadeTradicional", BooleanType(), True), - StructField("statusParticipaGrupoComunitario", BooleanType(), True), - StructField("statusPossuiPlanoSaudePrivado", BooleanType(), True), - StructField("statusTemAlgumaDeficiencia", BooleanType(), True), - StructField("identidadeGeneroCidadao", StringType(), True), - StructField("statusDesejaInformarIdentidadeGenero", BooleanType(), True), - StructField("responsavelPorCrianca", StringType(), True), - StructField("coPovoComunidadeTradicional", StringType(), True) - ]), True), - StructField("statusTermoRecusaCadastroIndividualAtencaoBasica", BooleanType(), True), - 
StructField("saidaCidadaoCadastro", StructType([ - StructField("motivoSaidaCidadao", StringType(), True), - StructField("dataObito", DateType(), True), - StructField("numeroDO", StringType(), True) - ]), True) -]) - - -print("Esquema definido com sucesso.") -# Carregar JSON com o esquema -input_path = "s3a://landing/warehouse/fichas_cadastro_individual_10000.jsonl" -output_path = "s3a://bronze/warehouse/fichas_cadastro_individual_parquet" -try: - print("Carregando dados do JSON...") - df = spark.read.schema(schema).json(input_path) - print("Dados carregados com sucesso.") -except Exception as e: - print(f"Erro ao carregar JSON: {e}") - spark.stop() - exit(1) - -try: - print("Gravando dados em formato Parquet...") - df.write.mode("overwrite").parquet(output_path) - print("Dados gravados com sucesso.") -except Exception as e: - print(f"Erro ao gravar Parquet: {e}") -finally: - spark.stop() diff --git a/gen_bronze_100000.py b/gen_bronze_100000.py deleted file mode 100644 index 750ecfa..0000000 --- a/gen_bronze_100000.py +++ /dev/null @@ -1,145 +0,0 @@ -from pyspark.sql import SparkSession -from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, DateType, ArrayType - -# Configuração do Spark para acessar o MinIO -print("Iniciando a configuração do Spark...") -spark = SparkSession.builder \ - .appName("Landig to Bronze") \ - .config("spark.hadoop.fs.s3a.endpoint", "http://minio.minio-cluster.svc.cluster.local:9000") \ - .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.path.style.access", "true") \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ - .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \ - .config("spark.sql.warehouse.dir", "s3a://landing/warehouse") \ - .getOrCreate() - -# Definição completa do esquema -print("Definindo o esquema 
dos dados...") -schema = StructType([ - StructField("profissionalCNS", LongType(), True), - StructField("cboCodigo_2002", LongType(), True), - StructField("cnes", LongType(), True), - StructField("ine", LongType(), True), - StructField("dataAtendimento", DateType(), True), - StructField("condicoesDeSaude", StructType([ - StructField("descricaoCausaInternacaoEm12Meses", StringType(), True), - StructField("descricaoOutraCondicao1", StringType(), True), - StructField("descricaoOutraCondicao2", StringType(), True), - StructField("descricaoOutraCondicao3", StringType(), True), - StructField("descricaoPlantasMedicinaisUsadas", ArrayType(StringType()), True), - StructField("doencaRespiratoria", ArrayType(StringType()), True), - StructField("doencaRins", ArrayType(StringType()), True), - StructField("maternidadeDeReferencia", StringType(), True), - StructField("situacaoPeso", StringType(), True), - StructField("statusEhDependenteAlcool", BooleanType(), True), - StructField("statusEhDependenteOutrasDrogas", BooleanType(), True), - StructField("statusEhFumante", BooleanType(), True), - StructField("statusEhGestante", BooleanType(), True), - StructField("statusEstaAcamado", BooleanType(), True), - StructField("statusEstaDomiciliado", BooleanType(), True), - StructField("statusTemDiabetes", BooleanType(), True), - StructField("statusTemDoencaRespiratoria", BooleanType(), True), - StructField("statusTemHanseniase", BooleanType(), True), - StructField("statusTemHipertensaoArterial", BooleanType(), True), - StructField("statusTemTeveCancer", BooleanType(), True), - StructField("statusTemTeveDoencasRins", BooleanType(), True), - StructField("statusTemTuberculose", BooleanType(), True), - StructField("statusTeveAvcDerrame", BooleanType(), True), - StructField("statusTeveDoencaCardiaca", BooleanType(), True), - StructField("statusTeveInfarto", BooleanType(), True), - StructField("statusTeveInternadoem12Meses", BooleanType(), True), - 
StructField("statusUsaOutrasPraticasIntegrativasOuComplementares", BooleanType(), True), - StructField("statusUsaPlantasMedicinais", BooleanType(), True), - StructField("statusDiagnosticoMental", StringType(), True) - ]), True), - StructField("emSituacaoDeRua", StructType([ - StructField("grauParentescoFamiliarFrequentado", StringType(), True), - StructField("higienePessoalSituacaoRua", ArrayType(StringType()), True), - StructField("origemAlimentoSituacaoRua", ArrayType(StringType()), True), - StructField("outraInstituicaoQueAcompanha", StringType(), True), - StructField("quantidadeAlimentacoesAoDiaSituacaoRua", StringType(), True), - StructField("statusAcompanhadoPorOutraInstituicao", BooleanType(), True), - StructField("statusPossuiReferenciaFamiliar", BooleanType(), True), - StructField("statusRecebeBeneficio", BooleanType(), True), - StructField("statusSituacaoRua", BooleanType(), True), - StructField("statusTemAcessoHigienePessoalSituacaoRua", BooleanType(), True), - StructField("statusVisitaFamiliarFrequentemente", BooleanType(), True), - StructField("tempoSituacaoRua", StringType(), True) - ]), True), - StructField("identificacaoUsuarioCidadao", StructType([ - StructField("nomeSocial", StringType(), True), - StructField("município", StringType(), True), - StructField("dataNascimentoCidadao", DateType(), True), - StructField("emailCidadao", StringType(), True), - StructField("nacionalidadeCidadao", StringType(), True), - StructField("nomeCidadao", StringType(), True), - StructField("nomeMaeCidadao", StringType(), True), - StructField("cnsCidadao", LongType(), True), - StructField("cnsResponsavelFamiliar", LongType(), True), - StructField("telefoneCelular", StringType(), True), - StructField("numeroNisPisPasep", LongType(), True), - StructField("paisNascimento", StringType(), True), - StructField("racaCorCidadao", StringType(), True), - StructField("sexoCidadao", StringType(), True), - StructField("statusEhResponsavel", BooleanType(), True), - 
StructField("etnia", StringType(), True), - StructField("nomePaiCidadao", StringType(), True), - StructField("desconheceNomePai", BooleanType(), True), - StructField("dtNaturalizacao", DateType(), True), - StructField("portariaNaturalizacao", StringType(), True), - StructField("dtEntradaBrasil", DateType(), True), - StructField("microarea", LongType(), True), - StructField("stForaArea", BooleanType(), True), - StructField("cpfCidadao", StringType(), True), - StructField("cpfResponsavelFamiliar", StringType(), True) - ]), True), - StructField("InformacoesSocioDemograficas", StructType([ - StructField("deficienciasCidadao", ArrayType(StringType()), True), - StructField("grauInstrucaoCidadao", StringType(), True), - StructField("ocupacao", StringType(), True), - StructField("orientacaoSexualCidadao", StringType(), True), - StructField("relacaoParentescoCidadao", StringType(), True), - StructField("situacaoMercadoTrabalhoCidadao", StringType(), True), - StructField("statusDesejaInformarOrientacaoSexual", BooleanType(), True), - StructField("statusFrequentaBenzedeira", BooleanType(), True), - StructField("statusFrequentaEscola", BooleanType(), True), - StructField("statusMembroPovoComunidadeTradicional", BooleanType(), True), - StructField("statusParticipaGrupoComunitario", BooleanType(), True), - StructField("statusPossuiPlanoSaudePrivado", BooleanType(), True), - StructField("statusTemAlgumaDeficiencia", BooleanType(), True), - StructField("identidadeGeneroCidadao", StringType(), True), - StructField("statusDesejaInformarIdentidadeGenero", BooleanType(), True), - StructField("responsavelPorCrianca", StringType(), True), - StructField("coPovoComunidadeTradicional", StringType(), True) - ]), True), - StructField("statusTermoRecusaCadastroIndividualAtencaoBasica", BooleanType(), True), - StructField("saidaCidadaoCadastro", StructType([ - StructField("motivoSaidaCidadao", StringType(), True), - StructField("dataObito", DateType(), True), - StructField("numeroDO", 
StringType(), True) - ]), True) -]) - - -print("Esquema definido com sucesso.") -# Carregar JSON com o esquema -input_path = "s3a://landing/warehouse/fichas_cadastro_individual_100000.jsonl" -output_path = "s3a://bronze/warehouse/fichas_cadastro_individual_parquet" -try: - print("Carregando dados do JSON...") - df = spark.read.schema(schema).json(input_path) - print("Dados carregados com sucesso.") -except Exception as e: - print(f"Erro ao carregar JSON: {e}") - spark.stop() - exit(1) - -try: - print("Gravando dados em formato Parquet...") - df.write.mode("overwrite").parquet(output_path) - print("Dados gravados com sucesso.") -except Exception as e: - print(f"Erro ao gravar Parquet: {e}") -finally: - spark.stop() diff --git a/gen_bronze_20000.py b/gen_bronze_20000.py deleted file mode 100644 index 49789c4..0000000 --- a/gen_bronze_20000.py +++ /dev/null @@ -1,145 +0,0 @@ -from pyspark.sql import SparkSession -from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, DateType, ArrayType - -# Configuração do Spark para acessar o MinIO -print("Iniciando a configuração do Spark...") -spark = SparkSession.builder \ - .appName("Landig to Bronze") \ - .config("spark.hadoop.fs.s3a.endpoint", "http://minio.minio-cluster.svc.cluster.local:9000") \ - .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.path.style.access", "true") \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ - .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \ - .config("spark.sql.warehouse.dir", "s3a://landing/warehouse") \ - .getOrCreate() - -# Definição completa do esquema -print("Definindo o esquema dos dados...") -schema = StructType([ - StructField("profissionalCNS", LongType(), True), - StructField("cboCodigo_2002", LongType(), True), - StructField("cnes", LongType(), True), 
- StructField("ine", LongType(), True), - StructField("dataAtendimento", DateType(), True), - StructField("condicoesDeSaude", StructType([ - StructField("descricaoCausaInternacaoEm12Meses", StringType(), True), - StructField("descricaoOutraCondicao1", StringType(), True), - StructField("descricaoOutraCondicao2", StringType(), True), - StructField("descricaoOutraCondicao3", StringType(), True), - StructField("descricaoPlantasMedicinaisUsadas", ArrayType(StringType()), True), - StructField("doencaRespiratoria", ArrayType(StringType()), True), - StructField("doencaRins", ArrayType(StringType()), True), - StructField("maternidadeDeReferencia", StringType(), True), - StructField("situacaoPeso", StringType(), True), - StructField("statusEhDependenteAlcool", BooleanType(), True), - StructField("statusEhDependenteOutrasDrogas", BooleanType(), True), - StructField("statusEhFumante", BooleanType(), True), - StructField("statusEhGestante", BooleanType(), True), - StructField("statusEstaAcamado", BooleanType(), True), - StructField("statusEstaDomiciliado", BooleanType(), True), - StructField("statusTemDiabetes", BooleanType(), True), - StructField("statusTemDoencaRespiratoria", BooleanType(), True), - StructField("statusTemHanseniase", BooleanType(), True), - StructField("statusTemHipertensaoArterial", BooleanType(), True), - StructField("statusTemTeveCancer", BooleanType(), True), - StructField("statusTemTeveDoencasRins", BooleanType(), True), - StructField("statusTemTuberculose", BooleanType(), True), - StructField("statusTeveAvcDerrame", BooleanType(), True), - StructField("statusTeveDoencaCardiaca", BooleanType(), True), - StructField("statusTeveInfarto", BooleanType(), True), - StructField("statusTeveInternadoem12Meses", BooleanType(), True), - StructField("statusUsaOutrasPraticasIntegrativasOuComplementares", BooleanType(), True), - StructField("statusUsaPlantasMedicinais", BooleanType(), True), - StructField("statusDiagnosticoMental", StringType(), True) - ]), True), - 
StructField("emSituacaoDeRua", StructType([ - StructField("grauParentescoFamiliarFrequentado", StringType(), True), - StructField("higienePessoalSituacaoRua", ArrayType(StringType()), True), - StructField("origemAlimentoSituacaoRua", ArrayType(StringType()), True), - StructField("outraInstituicaoQueAcompanha", StringType(), True), - StructField("quantidadeAlimentacoesAoDiaSituacaoRua", StringType(), True), - StructField("statusAcompanhadoPorOutraInstituicao", BooleanType(), True), - StructField("statusPossuiReferenciaFamiliar", BooleanType(), True), - StructField("statusRecebeBeneficio", BooleanType(), True), - StructField("statusSituacaoRua", BooleanType(), True), - StructField("statusTemAcessoHigienePessoalSituacaoRua", BooleanType(), True), - StructField("statusVisitaFamiliarFrequentemente", BooleanType(), True), - StructField("tempoSituacaoRua", StringType(), True) - ]), True), - StructField("identificacaoUsuarioCidadao", StructType([ - StructField("nomeSocial", StringType(), True), - StructField("município", StringType(), True), - StructField("dataNascimentoCidadao", DateType(), True), - StructField("emailCidadao", StringType(), True), - StructField("nacionalidadeCidadao", StringType(), True), - StructField("nomeCidadao", StringType(), True), - StructField("nomeMaeCidadao", StringType(), True), - StructField("cnsCidadao", LongType(), True), - StructField("cnsResponsavelFamiliar", LongType(), True), - StructField("telefoneCelular", StringType(), True), - StructField("numeroNisPisPasep", LongType(), True), - StructField("paisNascimento", StringType(), True), - StructField("racaCorCidadao", StringType(), True), - StructField("sexoCidadao", StringType(), True), - StructField("statusEhResponsavel", BooleanType(), True), - StructField("etnia", StringType(), True), - StructField("nomePaiCidadao", StringType(), True), - StructField("desconheceNomePai", BooleanType(), True), - StructField("dtNaturalizacao", DateType(), True), - StructField("portariaNaturalizacao", 
StringType(), True), - StructField("dtEntradaBrasil", DateType(), True), - StructField("microarea", LongType(), True), - StructField("stForaArea", BooleanType(), True), - StructField("cpfCidadao", StringType(), True), - StructField("cpfResponsavelFamiliar", StringType(), True) - ]), True), - StructField("InformacoesSocioDemograficas", StructType([ - StructField("deficienciasCidadao", ArrayType(StringType()), True), - StructField("grauInstrucaoCidadao", StringType(), True), - StructField("ocupacao", StringType(), True), - StructField("orientacaoSexualCidadao", StringType(), True), - StructField("relacaoParentescoCidadao", StringType(), True), - StructField("situacaoMercadoTrabalhoCidadao", StringType(), True), - StructField("statusDesejaInformarOrientacaoSexual", BooleanType(), True), - StructField("statusFrequentaBenzedeira", BooleanType(), True), - StructField("statusFrequentaEscola", BooleanType(), True), - StructField("statusMembroPovoComunidadeTradicional", BooleanType(), True), - StructField("statusParticipaGrupoComunitario", BooleanType(), True), - StructField("statusPossuiPlanoSaudePrivado", BooleanType(), True), - StructField("statusTemAlgumaDeficiencia", BooleanType(), True), - StructField("identidadeGeneroCidadao", StringType(), True), - StructField("statusDesejaInformarIdentidadeGenero", BooleanType(), True), - StructField("responsavelPorCrianca", StringType(), True), - StructField("coPovoComunidadeTradicional", StringType(), True) - ]), True), - StructField("statusTermoRecusaCadastroIndividualAtencaoBasica", BooleanType(), True), - StructField("saidaCidadaoCadastro", StructType([ - StructField("motivoSaidaCidadao", StringType(), True), - StructField("dataObito", DateType(), True), - StructField("numeroDO", StringType(), True) - ]), True) -]) - - -print("Esquema definido com sucesso.") -# Carregar JSON com o esquema -input_path = "s3a://landing/warehouse/fichas_cadastro_individual_20000.jsonl" -output_path = 
"s3a://bronze/warehouse/fichas_cadastro_individual_parquet" -try: - print("Carregando dados do JSON...") - df = spark.read.schema(schema).json(input_path) - print("Dados carregados com sucesso.") -except Exception as e: - print(f"Erro ao carregar JSON: {e}") - spark.stop() - exit(1) - -try: - print("Gravando dados em formato Parquet...") - df.write.mode("overwrite").parquet(output_path) - print("Dados gravados com sucesso.") -except Exception as e: - print(f"Erro ao gravar Parquet: {e}") -finally: - spark.stop() diff --git a/gen_bronze_200000.py b/gen_bronze_200000.py deleted file mode 100644 index 77dadea..0000000 --- a/gen_bronze_200000.py +++ /dev/null @@ -1,145 +0,0 @@ -from pyspark.sql import SparkSession -from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, DateType, ArrayType - -# Configuração do Spark para acessar o MinIO -print("Iniciando a configuração do Spark...") -spark = SparkSession.builder \ - .appName("Landig to Bronze") \ - .config("spark.hadoop.fs.s3a.endpoint", "http://minio.minio-cluster.svc.cluster.local:9000") \ - .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.path.style.access", "true") \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ - .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \ - .config("spark.sql.warehouse.dir", "s3a://landing/warehouse") \ - .getOrCreate() - -# Definição completa do esquema -print("Definindo o esquema dos dados...") -schema = StructType([ - StructField("profissionalCNS", LongType(), True), - StructField("cboCodigo_2002", LongType(), True), - StructField("cnes", LongType(), True), - StructField("ine", LongType(), True), - StructField("dataAtendimento", DateType(), True), - StructField("condicoesDeSaude", StructType([ - StructField("descricaoCausaInternacaoEm12Meses", StringType(), 
True), - StructField("descricaoOutraCondicao1", StringType(), True), - StructField("descricaoOutraCondicao2", StringType(), True), - StructField("descricaoOutraCondicao3", StringType(), True), - StructField("descricaoPlantasMedicinaisUsadas", ArrayType(StringType()), True), - StructField("doencaRespiratoria", ArrayType(StringType()), True), - StructField("doencaRins", ArrayType(StringType()), True), - StructField("maternidadeDeReferencia", StringType(), True), - StructField("situacaoPeso", StringType(), True), - StructField("statusEhDependenteAlcool", BooleanType(), True), - StructField("statusEhDependenteOutrasDrogas", BooleanType(), True), - StructField("statusEhFumante", BooleanType(), True), - StructField("statusEhGestante", BooleanType(), True), - StructField("statusEstaAcamado", BooleanType(), True), - StructField("statusEstaDomiciliado", BooleanType(), True), - StructField("statusTemDiabetes", BooleanType(), True), - StructField("statusTemDoencaRespiratoria", BooleanType(), True), - StructField("statusTemHanseniase", BooleanType(), True), - StructField("statusTemHipertensaoArterial", BooleanType(), True), - StructField("statusTemTeveCancer", BooleanType(), True), - StructField("statusTemTeveDoencasRins", BooleanType(), True), - StructField("statusTemTuberculose", BooleanType(), True), - StructField("statusTeveAvcDerrame", BooleanType(), True), - StructField("statusTeveDoencaCardiaca", BooleanType(), True), - StructField("statusTeveInfarto", BooleanType(), True), - StructField("statusTeveInternadoem12Meses", BooleanType(), True), - StructField("statusUsaOutrasPraticasIntegrativasOuComplementares", BooleanType(), True), - StructField("statusUsaPlantasMedicinais", BooleanType(), True), - StructField("statusDiagnosticoMental", StringType(), True) - ]), True), - StructField("emSituacaoDeRua", StructType([ - StructField("grauParentescoFamiliarFrequentado", StringType(), True), - StructField("higienePessoalSituacaoRua", ArrayType(StringType()), True), - 
StructField("origemAlimentoSituacaoRua", ArrayType(StringType()), True), - StructField("outraInstituicaoQueAcompanha", StringType(), True), - StructField("quantidadeAlimentacoesAoDiaSituacaoRua", StringType(), True), - StructField("statusAcompanhadoPorOutraInstituicao", BooleanType(), True), - StructField("statusPossuiReferenciaFamiliar", BooleanType(), True), - StructField("statusRecebeBeneficio", BooleanType(), True), - StructField("statusSituacaoRua", BooleanType(), True), - StructField("statusTemAcessoHigienePessoalSituacaoRua", BooleanType(), True), - StructField("statusVisitaFamiliarFrequentemente", BooleanType(), True), - StructField("tempoSituacaoRua", StringType(), True) - ]), True), - StructField("identificacaoUsuarioCidadao", StructType([ - StructField("nomeSocial", StringType(), True), - StructField("município", StringType(), True), - StructField("dataNascimentoCidadao", DateType(), True), - StructField("emailCidadao", StringType(), True), - StructField("nacionalidadeCidadao", StringType(), True), - StructField("nomeCidadao", StringType(), True), - StructField("nomeMaeCidadao", StringType(), True), - StructField("cnsCidadao", LongType(), True), - StructField("cnsResponsavelFamiliar", LongType(), True), - StructField("telefoneCelular", StringType(), True), - StructField("numeroNisPisPasep", LongType(), True), - StructField("paisNascimento", StringType(), True), - StructField("racaCorCidadao", StringType(), True), - StructField("sexoCidadao", StringType(), True), - StructField("statusEhResponsavel", BooleanType(), True), - StructField("etnia", StringType(), True), - StructField("nomePaiCidadao", StringType(), True), - StructField("desconheceNomePai", BooleanType(), True), - StructField("dtNaturalizacao", DateType(), True), - StructField("portariaNaturalizacao", StringType(), True), - StructField("dtEntradaBrasil", DateType(), True), - StructField("microarea", LongType(), True), - StructField("stForaArea", BooleanType(), True), - StructField("cpfCidadao", 
StringType(), True), - StructField("cpfResponsavelFamiliar", StringType(), True) - ]), True), - StructField("InformacoesSocioDemograficas", StructType([ - StructField("deficienciasCidadao", ArrayType(StringType()), True), - StructField("grauInstrucaoCidadao", StringType(), True), - StructField("ocupacao", StringType(), True), - StructField("orientacaoSexualCidadao", StringType(), True), - StructField("relacaoParentescoCidadao", StringType(), True), - StructField("situacaoMercadoTrabalhoCidadao", StringType(), True), - StructField("statusDesejaInformarOrientacaoSexual", BooleanType(), True), - StructField("statusFrequentaBenzedeira", BooleanType(), True), - StructField("statusFrequentaEscola", BooleanType(), True), - StructField("statusMembroPovoComunidadeTradicional", BooleanType(), True), - StructField("statusParticipaGrupoComunitario", BooleanType(), True), - StructField("statusPossuiPlanoSaudePrivado", BooleanType(), True), - StructField("statusTemAlgumaDeficiencia", BooleanType(), True), - StructField("identidadeGeneroCidadao", StringType(), True), - StructField("statusDesejaInformarIdentidadeGenero", BooleanType(), True), - StructField("responsavelPorCrianca", StringType(), True), - StructField("coPovoComunidadeTradicional", StringType(), True) - ]), True), - StructField("statusTermoRecusaCadastroIndividualAtencaoBasica", BooleanType(), True), - StructField("saidaCidadaoCadastro", StructType([ - StructField("motivoSaidaCidadao", StringType(), True), - StructField("dataObito", DateType(), True), - StructField("numeroDO", StringType(), True) - ]), True) -]) - - -print("Esquema definido com sucesso.") -# Carregar JSON com o esquema -input_path = "s3a://landing/warehouse/fichas_cadastro_individual_200000.jsonl" -output_path = "s3a://bronze/warehouse/fichas_cadastro_individual_parquet" -try: - print("Carregando dados do JSON...") - df = spark.read.schema(schema).json(input_path) - print("Dados carregados com sucesso.") -except Exception as e: - print(f"Erro ao 
carregar JSON: {e}") - spark.stop() - exit(1) - -try: - print("Gravando dados em formato Parquet...") - df.write.mode("overwrite").parquet(output_path) - print("Dados gravados com sucesso.") -except Exception as e: - print(f"Erro ao gravar Parquet: {e}") -finally: - spark.stop() diff --git a/gen_bronze_5000.py b/gen_bronze_5000.py deleted file mode 100644 index f6ba772..0000000 --- a/gen_bronze_5000.py +++ /dev/null @@ -1,145 +0,0 @@ -from pyspark.sql import SparkSession -from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, DateType, ArrayType - -# Configuração do Spark para acessar o MinIO -print("Iniciando a configuração do Spark...") -spark = SparkSession.builder \ - .appName("Landig to Bronze") \ - .config("spark.hadoop.fs.s3a.endpoint", "http://minio.minio-cluster.svc.cluster.local:9000") \ - .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.path.style.access", "true") \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ - .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \ - .config("spark.sql.warehouse.dir", "s3a://landing/warehouse") \ - .getOrCreate() - -# Definição completa do esquema -print("Definindo o esquema dos dados...") -schema = StructType([ - StructField("profissionalCNS", LongType(), True), - StructField("cboCodigo_2002", LongType(), True), - StructField("cnes", LongType(), True), - StructField("ine", LongType(), True), - StructField("dataAtendimento", DateType(), True), - StructField("condicoesDeSaude", StructType([ - StructField("descricaoCausaInternacaoEm12Meses", StringType(), True), - StructField("descricaoOutraCondicao1", StringType(), True), - StructField("descricaoOutraCondicao2", StringType(), True), - StructField("descricaoOutraCondicao3", StringType(), True), - StructField("descricaoPlantasMedicinaisUsadas", 
ArrayType(StringType()), True), - StructField("doencaRespiratoria", ArrayType(StringType()), True), - StructField("doencaRins", ArrayType(StringType()), True), - StructField("maternidadeDeReferencia", StringType(), True), - StructField("situacaoPeso", StringType(), True), - StructField("statusEhDependenteAlcool", BooleanType(), True), - StructField("statusEhDependenteOutrasDrogas", BooleanType(), True), - StructField("statusEhFumante", BooleanType(), True), - StructField("statusEhGestante", BooleanType(), True), - StructField("statusEstaAcamado", BooleanType(), True), - StructField("statusEstaDomiciliado", BooleanType(), True), - StructField("statusTemDiabetes", BooleanType(), True), - StructField("statusTemDoencaRespiratoria", BooleanType(), True), - StructField("statusTemHanseniase", BooleanType(), True), - StructField("statusTemHipertensaoArterial", BooleanType(), True), - StructField("statusTemTeveCancer", BooleanType(), True), - StructField("statusTemTeveDoencasRins", BooleanType(), True), - StructField("statusTemTuberculose", BooleanType(), True), - StructField("statusTeveAvcDerrame", BooleanType(), True), - StructField("statusTeveDoencaCardiaca", BooleanType(), True), - StructField("statusTeveInfarto", BooleanType(), True), - StructField("statusTeveInternadoem12Meses", BooleanType(), True), - StructField("statusUsaOutrasPraticasIntegrativasOuComplementares", BooleanType(), True), - StructField("statusUsaPlantasMedicinais", BooleanType(), True), - StructField("statusDiagnosticoMental", StringType(), True) - ]), True), - StructField("emSituacaoDeRua", StructType([ - StructField("grauParentescoFamiliarFrequentado", StringType(), True), - StructField("higienePessoalSituacaoRua", ArrayType(StringType()), True), - StructField("origemAlimentoSituacaoRua", ArrayType(StringType()), True), - StructField("outraInstituicaoQueAcompanha", StringType(), True), - StructField("quantidadeAlimentacoesAoDiaSituacaoRua", StringType(), True), - 
StructField("statusAcompanhadoPorOutraInstituicao", BooleanType(), True), - StructField("statusPossuiReferenciaFamiliar", BooleanType(), True), - StructField("statusRecebeBeneficio", BooleanType(), True), - StructField("statusSituacaoRua", BooleanType(), True), - StructField("statusTemAcessoHigienePessoalSituacaoRua", BooleanType(), True), - StructField("statusVisitaFamiliarFrequentemente", BooleanType(), True), - StructField("tempoSituacaoRua", StringType(), True) - ]), True), - StructField("identificacaoUsuarioCidadao", StructType([ - StructField("nomeSocial", StringType(), True), - StructField("município", StringType(), True), - StructField("dataNascimentoCidadao", DateType(), True), - StructField("emailCidadao", StringType(), True), - StructField("nacionalidadeCidadao", StringType(), True), - StructField("nomeCidadao", StringType(), True), - StructField("nomeMaeCidadao", StringType(), True), - StructField("cnsCidadao", LongType(), True), - StructField("cnsResponsavelFamiliar", LongType(), True), - StructField("telefoneCelular", StringType(), True), - StructField("numeroNisPisPasep", LongType(), True), - StructField("paisNascimento", StringType(), True), - StructField("racaCorCidadao", StringType(), True), - StructField("sexoCidadao", StringType(), True), - StructField("statusEhResponsavel", BooleanType(), True), - StructField("etnia", StringType(), True), - StructField("nomePaiCidadao", StringType(), True), - StructField("desconheceNomePai", BooleanType(), True), - StructField("dtNaturalizacao", DateType(), True), - StructField("portariaNaturalizacao", StringType(), True), - StructField("dtEntradaBrasil", DateType(), True), - StructField("microarea", LongType(), True), - StructField("stForaArea", BooleanType(), True), - StructField("cpfCidadao", StringType(), True), - StructField("cpfResponsavelFamiliar", StringType(), True) - ]), True), - StructField("InformacoesSocioDemograficas", StructType([ - StructField("deficienciasCidadao", ArrayType(StringType()), 
True), - StructField("grauInstrucaoCidadao", StringType(), True), - StructField("ocupacao", StringType(), True), - StructField("orientacaoSexualCidadao", StringType(), True), - StructField("relacaoParentescoCidadao", StringType(), True), - StructField("situacaoMercadoTrabalhoCidadao", StringType(), True), - StructField("statusDesejaInformarOrientacaoSexual", BooleanType(), True), - StructField("statusFrequentaBenzedeira", BooleanType(), True), - StructField("statusFrequentaEscola", BooleanType(), True), - StructField("statusMembroPovoComunidadeTradicional", BooleanType(), True), - StructField("statusParticipaGrupoComunitario", BooleanType(), True), - StructField("statusPossuiPlanoSaudePrivado", BooleanType(), True), - StructField("statusTemAlgumaDeficiencia", BooleanType(), True), - StructField("identidadeGeneroCidadao", StringType(), True), - StructField("statusDesejaInformarIdentidadeGenero", BooleanType(), True), - StructField("responsavelPorCrianca", StringType(), True), - StructField("coPovoComunidadeTradicional", StringType(), True) - ]), True), - StructField("statusTermoRecusaCadastroIndividualAtencaoBasica", BooleanType(), True), - StructField("saidaCidadaoCadastro", StructType([ - StructField("motivoSaidaCidadao", StringType(), True), - StructField("dataObito", DateType(), True), - StructField("numeroDO", StringType(), True) - ]), True) -]) - - -print("Esquema definido com sucesso.") -# Carregar JSON com o esquema -input_path = "s3a://landing/warehouse/fichas_cadastro_individual_5000.jsonl" -output_path = "s3a://bronze/warehouse/fichas_cadastro_individual_parquet" -try: - print("Carregando dados do JSON...") - df = spark.read.schema(schema).json(input_path) - print("Dados carregados com sucesso.") -except Exception as e: - print(f"Erro ao carregar JSON: {e}") - spark.stop() - exit(1) - -try: - print("Gravando dados em formato Parquet...") - df.write.mode("overwrite").parquet(output_path) - print("Dados gravados com sucesso.") -except Exception as e: - 
print(f"Erro ao gravar Parquet: {e}") -finally: - spark.stop() diff --git a/gen_bronze_50000.py b/gen_bronze_50000.py deleted file mode 100644 index 248b866..0000000 --- a/gen_bronze_50000.py +++ /dev/null @@ -1,145 +0,0 @@ -from pyspark.sql import SparkSession -from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, DateType, ArrayType - -# Configuração do Spark para acessar o MinIO -print("Iniciando a configuração do Spark...") -spark = SparkSession.builder \ - .appName("Landig to Bronze") \ - .config("spark.hadoop.fs.s3a.endpoint", "http://minio.minio-cluster.svc.cluster.local:9000") \ - .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \ - .config("spark.hadoop.fs.s3a.path.style.access", "true") \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ - .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \ - .config("spark.sql.warehouse.dir", "s3a://landing/warehouse") \ - .getOrCreate() - -# Definição completa do esquema -print("Definindo o esquema dos dados...") -schema = StructType([ - StructField("profissionalCNS", LongType(), True), - StructField("cboCodigo_2002", LongType(), True), - StructField("cnes", LongType(), True), - StructField("ine", LongType(), True), - StructField("dataAtendimento", DateType(), True), - StructField("condicoesDeSaude", StructType([ - StructField("descricaoCausaInternacaoEm12Meses", StringType(), True), - StructField("descricaoOutraCondicao1", StringType(), True), - StructField("descricaoOutraCondicao2", StringType(), True), - StructField("descricaoOutraCondicao3", StringType(), True), - StructField("descricaoPlantasMedicinaisUsadas", ArrayType(StringType()), True), - StructField("doencaRespiratoria", ArrayType(StringType()), True), - StructField("doencaRins", ArrayType(StringType()), True), - StructField("maternidadeDeReferencia", StringType(), 
True), - StructField("situacaoPeso", StringType(), True), - StructField("statusEhDependenteAlcool", BooleanType(), True), - StructField("statusEhDependenteOutrasDrogas", BooleanType(), True), - StructField("statusEhFumante", BooleanType(), True), - StructField("statusEhGestante", BooleanType(), True), - StructField("statusEstaAcamado", BooleanType(), True), - StructField("statusEstaDomiciliado", BooleanType(), True), - StructField("statusTemDiabetes", BooleanType(), True), - StructField("statusTemDoencaRespiratoria", BooleanType(), True), - StructField("statusTemHanseniase", BooleanType(), True), - StructField("statusTemHipertensaoArterial", BooleanType(), True), - StructField("statusTemTeveCancer", BooleanType(), True), - StructField("statusTemTeveDoencasRins", BooleanType(), True), - StructField("statusTemTuberculose", BooleanType(), True), - StructField("statusTeveAvcDerrame", BooleanType(), True), - StructField("statusTeveDoencaCardiaca", BooleanType(), True), - StructField("statusTeveInfarto", BooleanType(), True), - StructField("statusTeveInternadoem12Meses", BooleanType(), True), - StructField("statusUsaOutrasPraticasIntegrativasOuComplementares", BooleanType(), True), - StructField("statusUsaPlantasMedicinais", BooleanType(), True), - StructField("statusDiagnosticoMental", StringType(), True) - ]), True), - StructField("emSituacaoDeRua", StructType([ - StructField("grauParentescoFamiliarFrequentado", StringType(), True), - StructField("higienePessoalSituacaoRua", ArrayType(StringType()), True), - StructField("origemAlimentoSituacaoRua", ArrayType(StringType()), True), - StructField("outraInstituicaoQueAcompanha", StringType(), True), - StructField("quantidadeAlimentacoesAoDiaSituacaoRua", StringType(), True), - StructField("statusAcompanhadoPorOutraInstituicao", BooleanType(), True), - StructField("statusPossuiReferenciaFamiliar", BooleanType(), True), - StructField("statusRecebeBeneficio", BooleanType(), True), - StructField("statusSituacaoRua", 
BooleanType(), True), - StructField("statusTemAcessoHigienePessoalSituacaoRua", BooleanType(), True), - StructField("statusVisitaFamiliarFrequentemente", BooleanType(), True), - StructField("tempoSituacaoRua", StringType(), True) - ]), True), - StructField("identificacaoUsuarioCidadao", StructType([ - StructField("nomeSocial", StringType(), True), - StructField("município", StringType(), True), - StructField("dataNascimentoCidadao", DateType(), True), - StructField("emailCidadao", StringType(), True), - StructField("nacionalidadeCidadao", StringType(), True), - StructField("nomeCidadao", StringType(), True), - StructField("nomeMaeCidadao", StringType(), True), - StructField("cnsCidadao", LongType(), True), - StructField("cnsResponsavelFamiliar", LongType(), True), - StructField("telefoneCelular", StringType(), True), - StructField("numeroNisPisPasep", LongType(), True), - StructField("paisNascimento", StringType(), True), - StructField("racaCorCidadao", StringType(), True), - StructField("sexoCidadao", StringType(), True), - StructField("statusEhResponsavel", BooleanType(), True), - StructField("etnia", StringType(), True), - StructField("nomePaiCidadao", StringType(), True), - StructField("desconheceNomePai", BooleanType(), True), - StructField("dtNaturalizacao", DateType(), True), - StructField("portariaNaturalizacao", StringType(), True), - StructField("dtEntradaBrasil", DateType(), True), - StructField("microarea", LongType(), True), - StructField("stForaArea", BooleanType(), True), - StructField("cpfCidadao", StringType(), True), - StructField("cpfResponsavelFamiliar", StringType(), True) - ]), True), - StructField("InformacoesSocioDemograficas", StructType([ - StructField("deficienciasCidadao", ArrayType(StringType()), True), - StructField("grauInstrucaoCidadao", StringType(), True), - StructField("ocupacao", StringType(), True), - StructField("orientacaoSexualCidadao", StringType(), True), - StructField("relacaoParentescoCidadao", StringType(), True), - 
StructField("situacaoMercadoTrabalhoCidadao", StringType(), True), - StructField("statusDesejaInformarOrientacaoSexual", BooleanType(), True), - StructField("statusFrequentaBenzedeira", BooleanType(), True), - StructField("statusFrequentaEscola", BooleanType(), True), - StructField("statusMembroPovoComunidadeTradicional", BooleanType(), True), - StructField("statusParticipaGrupoComunitario", BooleanType(), True), - StructField("statusPossuiPlanoSaudePrivado", BooleanType(), True), - StructField("statusTemAlgumaDeficiencia", BooleanType(), True), - StructField("identidadeGeneroCidadao", StringType(), True), - StructField("statusDesejaInformarIdentidadeGenero", BooleanType(), True), - StructField("responsavelPorCrianca", StringType(), True), - StructField("coPovoComunidadeTradicional", StringType(), True) - ]), True), - StructField("statusTermoRecusaCadastroIndividualAtencaoBasica", BooleanType(), True), - StructField("saidaCidadaoCadastro", StructType([ - StructField("motivoSaidaCidadao", StringType(), True), - StructField("dataObito", DateType(), True), - StructField("numeroDO", StringType(), True) - ]), True) -]) - - -print("Esquema definido com sucesso.") -# Carregar JSON com o esquema -input_path = "s3a://landing/warehouse/fichas_cadastro_individual_50000.jsonl" -output_path = "s3a://bronze/warehouse/fichas_cadastro_individual_parquet" -try: - print("Carregando dados do JSON...") - df = spark.read.schema(schema).json(input_path) - print("Dados carregados com sucesso.") -except Exception as e: - print(f"Erro ao carregar JSON: {e}") - spark.stop() - exit(1) - -try: - print("Gravando dados em formato Parquet...") - df.write.mode("overwrite").parquet(output_path) - print("Dados gravados com sucesso.") -except Exception as e: - print(f"Erro ao gravar Parquet: {e}") -finally: - spark.stop() diff --git a/proj_teste/models/newmodel/new.sql b/proj_teste/models/newmodel/new.sql deleted file mode 100644 index 9954901..0000000 --- a/proj_teste/models/newmodel/new.sql +++ 
/dev/null @@ -1,37 +0,0 @@ -pessoa -nome_completo identificacaoUsuarioCidadao.nomeCidadao -data_nascimento identificacaoUsuarioCidadao.dataNascimentoCidadao -sexo identificacaoUsuarioCidadao.sexoCidadao -raca_cor identificacaoUsuarioCidadao.racaCorCidadao -etnia identificacaoUsuarioCidadao.etnia -cns identificacaoUsuarioCidadao.cnsCidadao -telefone identificacaoUsuarioCidadao.telefoneCelular -email identificacaoUsuarioCidadao.emailCidadao -nome_social identificacaoUsuarioCidadao.nomeSocial -pis_pasep identificacaoUsuarioCidadao.numeroNisPisPasep -nome_mae identificacaoUsuarioCidadao.nomeMaeCidadao -nome_pai identificacaoUsuarioCidadao.nomePaiCidadao -nacionalidade identificacaoUsuarioCidadao.paisNascimento -orientacao_sexual informacoesSocioDemograficas.orientacaoSexualCidadao -genero informacoesSocioDemograficas.identidadeGeneroCidadao -obito saidaCidadaoCadastro.motivoSaidaCidadao == 'Óbito' -dt_obito saidaCidadaoCadastro.dataObito -num_do_obito saidaCidadaoCadastro.numeroDO - -saude -deficiencia_auditiva 'Auditiva' in informacoesSocioDemograficas.deficienciasCidadao -deficiencia_visual 'Visual' in informacoesSocioDemograficas.deficienciasCidadao -deficiencia_intelectual 'Intelectual / Cognitiva' in informacoesSocioDemograficas.deficienciasCidadao -deficiencia_fisica 'Física' in informacoesSocioDemograficas.deficienciasCidadao -peso_status condicoesDeSaude.situacaoPeso -doenca_coracao condicoesDeSaude.statusTeveDoencaCardiaca -doenca_rim condicoesDeSaude.doencaRins != null -situacao_rua emSituacaoDeRua != null -acompanhado_outra_insituicao emSituacaoDeRua.statusAcompanhadoPorOutraInstituicao -nome_insituicao emSituacaoDeRua.outraInstituicaoQueAcompanha -acesso_higiene_banho 'Banho' in emSituacaoDeRua.higienePessoalSituacaoRua -acesso_higiene_sanitario 'Acesso a sanitário' in emSituacaoDeRua.higienePessoalSituacaoRua -acesso_higiene_bucal 'Higiene bucal' in emSituacaoDeRua.higienePessoalSituacaoRua -outras_higiene 'Outros' in 
emSituacaoDeRua.higienePessoalSituacaoRua
-mudanca_territorio saidaCidadaoCadastro.motivoSaidaCidadao == 'Mudança de território'
-situacao_gestacional condicoesDeSaude.statusEhGestante
diff --git a/proj_teste/models/silver/dim_pessoa.sql b/proj_teste/models/silver/dim_pessoa.sql
new file mode 100644
index 0000000..7523062
--- /dev/null
+++ b/proj_teste/models/silver/dim_pessoa.sql
@@ -0,0 +1,32 @@
+{{config(
+alias='dim_pessoa',
+table_type='iceberg',
+format='parquet',
+materialized='table'
+)}}
+-- Person dimension: one row per bronze record, flattening the citizen identification, socio-demographic and registry-exit structs.
+WITH cleaned_data AS (
+    SELECT
+        identificacaoUsuarioCidadao.nomeCidadao AS nome_completo,
+        identificacaoUsuarioCidadao.dataNascimentoCidadao AS data_nascimento,
+        identificacaoUsuarioCidadao.sexoCidadao AS sexo,
+        identificacaoUsuarioCidadao.racaCorCidadao AS raca_cor,
+        identificacaoUsuarioCidadao.etnia AS etnia,
+        identificacaoUsuarioCidadao.cnsCidadao AS cns,
+        identificacaoUsuarioCidadao.telefoneCelular AS telefone,
+        identificacaoUsuarioCidadao.emailCidadao AS email,
+        identificacaoUsuarioCidadao.nomeSocial AS nome_social,
+        identificacaoUsuarioCidadao.numeroNisPisPasep AS pis_pasep,
+        identificacaoUsuarioCidadao.nomeMaeCidadao AS nome_mae,
+        identificacaoUsuarioCidadao.nomePaiCidadao AS nome_pai,
+        identificacaoUsuarioCidadao.paisNascimento AS nacionalidade,
+        informacoesSocioDemograficas.orientacaoSexualCidadao AS orientacao_sexual,
+        informacoesSocioDemograficas.identidadeGeneroCidadao AS genero,
+        (saidaCidadaoCadastro.motivoSaidaCidadao = 'Óbito') AS obito, -- NULL (not FALSE) when motivoSaidaCidadao is NULL
+        saidaCidadaoCadastro.dataObito AS dt_obito,
+        saidaCidadaoCadastro.numeroDO AS num_do_obito
+
+    FROM {{ source('bronze', 'fichas_cadastro_individual') }}
+)
+
+SELECT * FROM cleaned_data
diff --git a/proj_teste/models/silver/dim_saude.sql b/proj_teste/models/silver/dim_saude.sql
new file mode 100644
index 0000000..b67a720
--- /dev/null
+++ b/proj_teste/models/silver/dim_saude.sql
@@ -0,0 +1,30 @@
+{{config(
+alias='dim_saude',
+table_type='iceberg',
+format='parquet',
+materialized='table'
+)}}
+-- Health dimension: one row per bronze record with disability, health-condition, homelessness and registry-exit indicators.
+WITH cleaned_data AS (
+    SELECT
+        contains(informacoesSocioDemograficas.deficienciasCidadao, 'Auditiva') AS deficiencia_auditiva,
+        contains(informacoesSocioDemograficas.deficienciasCidadao, 'Visual') AS deficiencia_visual,
+        contains(informacoesSocioDemograficas.deficienciasCidadao, 'Intelectual / Cognitiva') AS deficiencia_intelectual,
+        contains(informacoesSocioDemograficas.deficienciasCidadao, 'Física') AS deficiencia_fisica,
+        condicoesDeSaude.situacaoPeso AS peso_status,
+        condicoesDeSaude.statusTeveDoencaCardiaca AS doenca_coracao,
+        (condicoesDeSaude.doencaRins IS NOT NULL) AS doenca_rim, -- IS NOT NULL on purpose: "!= NULL" always evaluates to NULL
+        (emSituacaoDeRua IS NOT NULL) AS situacao_rua, -- IS NOT NULL on purpose: "!= NULL" always evaluates to NULL
+        emSituacaoDeRua.statusAcompanhadoPorOutraInstituicao AS acompanhado_outra_insituicao, -- NOTE(review): "insituicao" spelling kept so the column name stays stable
+        emSituacaoDeRua.outraInstituicaoQueAcompanha AS nome_insituicao,
+        contains(emSituacaoDeRua.higienePessoalSituacaoRua, 'Banho') AS acesso_higiene_banho,
+        contains(emSituacaoDeRua.higienePessoalSituacaoRua, 'Acesso a sanitário') AS acesso_higiene_sanitario,
+        contains(emSituacaoDeRua.higienePessoalSituacaoRua, 'Higiene bucal') AS acesso_higiene_bucal,
+        contains(emSituacaoDeRua.higienePessoalSituacaoRua, 'Outros') AS outras_higiene,
+        (saidaCidadaoCadastro.motivoSaidaCidadao = 'Mudança de território') AS mudanca_territorio, -- NULL (not FALSE) when motivoSaidaCidadao is NULL
+        condicoesDeSaude.statusEhGestante AS situacao_gestacional
+
+    FROM {{ source('bronze', 'fichas_cadastro_individual') }}
+)
+
+SELECT * FROM cleaned_data
diff --git a/proj_teste/models/silver/fichas_cadastro_individual_silver.sql b/proj_teste/models/silver/fichas_cadastro_individual_silver.sql
index 21192fd..fa51bf5 100644
--- a/proj_teste/models/silver/fichas_cadastro_individual_silver.sql
+++ b/proj_teste/models/silver/fichas_cadastro_individual_silver.sql
@@ -19,3 +19,4 @@ WITH cleaned_data AS (
 )
 
 SELECT * FROM cleaned_data
+
-- 
GitLab