"lib_lista_complementar.c" did not exist on "9e96698c67f13dfb580254cb0855d81442f4cacf"
Select Git revision
Forked from
Diego Giovane Pasqualin / gitlab-ci-by-example
Source project has a limited visibility.
-
Diego Giovane Pasqualin authoredDiego Giovane Pasqualin authored
gen_bronze.py NaN GiB
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, DateType, ArrayType
# Configuração do Spark para acessar o MinIO
print("Iniciando a configuração do Spark...")
spark = SparkSession.builder \
.appName("Landig to Bronze") \
.config("spark.hadoop.fs.s3a.endpoint", "http://minio.minio-cluster.svc.cluster.local:9000") \
.config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
.config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
.config("spark.hadoop.fs.s3a.path.style.access", "true") \
.config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
.config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
.config("spark.sql.warehouse.dir", "s3a://landing/warehouse") \
.getOrCreate()
# Definição completa do esquema
print("Definindo o esquema dos dados...")
schema = StructType([
StructField("profissionalCNS", LongType(), True),
StructField("cboCodigo_2002", LongType(), True),
StructField("cnes", LongType(), True),
StructField("ine", LongType(), True),
StructField("dataAtendimento", DateType(), True),
StructField("condicoesDeSaude", StructType([
StructField("descricaoCausaInternacaoEm12Meses", StringType(), True),
StructField("descricaoOutraCondicao1", StringType(), True),
StructField("descricaoOutraCondicao2", StringType(), True),
StructField("descricaoOutraCondicao3", StringType(), True),
StructField("descricaoPlantasMedicinaisUsadas", ArrayType(StringType()), True),
StructField("doencaRespiratoria", ArrayType(StringType()), True),
StructField("doencaRins", ArrayType(StringType()), True),
StructField("maternidadeDeReferencia", StringType(), True),
StructField("situacaoPeso", StringType(), True),
StructField("statusEhDependenteAlcool", BooleanType(), True),
StructField("statusEhDependenteOutrasDrogas", BooleanType(), True),
StructField("statusEhFumante", BooleanType(), True),
StructField("statusEhGestante", BooleanType(), True),
StructField("statusEstaAcamado", BooleanType(), True),
StructField("statusEstaDomiciliado", BooleanType(), True),
StructField("statusTemDiabetes", BooleanType(), True),
StructField("statusTemDoencaRespiratoria", BooleanType(), True),
StructField("statusTemHanseniase", BooleanType(), True),
StructField("statusTemHipertensaoArterial", BooleanType(), True),
StructField("statusTemTeveCancer", BooleanType(), True),
StructField("statusTemTeveDoencasRins", BooleanType(), True),
StructField("statusTemTuberculose", BooleanType(), True),
StructField("statusTeveAvcDerrame", BooleanType(), True),
StructField("statusTeveDoencaCardiaca", BooleanType(), True),
StructField("statusTeveInfarto", BooleanType(), True),
StructField("statusTeveInternadoem12Meses", BooleanType(), True),
StructField("statusUsaOutrasPraticasIntegrativasOuComplementares", BooleanType(), True),
StructField("statusUsaPlantasMedicinais", BooleanType(), True),
StructField("statusDiagnosticoMental", StringType(), True)
]), True),
StructField("emSituacaoDeRua", StructType([
StructField("grauParentescoFamiliarFrequentado", StringType(), True),
StructField("higienePessoalSituacaoRua", ArrayType(StringType()), True),
StructField("origemAlimentoSituacaoRua", ArrayType(StringType()), True),
StructField("outraInstituicaoQueAcompanha", StringType(), True),
StructField("quantidadeAlimentacoesAoDiaSituacaoRua", StringType(), True),
StructField("statusAcompanhadoPorOutraInstituicao", BooleanType(), True),
StructField("statusPossuiReferenciaFamiliar", BooleanType(), True),
StructField("statusRecebeBeneficio", BooleanType(), True),
StructField("statusSituacaoRua", BooleanType(), True),
StructField("statusTemAcessoHigienePessoalSituacaoRua", BooleanType(), True),
StructField("statusVisitaFamiliarFrequentemente", BooleanType(), True),
StructField("tempoSituacaoRua", StringType(), True)
]), True),
StructField("identificacaoUsuarioCidadao", StructType([
StructField("nomeSocial", StringType(), True),
StructField("município", StringType(), True),
StructField("dataNascimentoCidadao", DateType(), True),
StructField("emailCidadao", StringType(), True),
StructField("nacionalidadeCidadao", StringType(), True),
StructField("nomeCidadao", StringType(), True),
StructField("nomeMaeCidadao", StringType(), True),
StructField("cnsCidadao", LongType(), True),
StructField("cnsResponsavelFamiliar", LongType(), True),
StructField("telefoneCelular", StringType(), True),
StructField("numeroNisPisPasep", LongType(), True),
StructField("paisNascimento", StringType(), True),
StructField("racaCorCidadao", StringType(), True),
StructField("sexoCidadao", StringType(), True),
StructField("statusEhResponsavel", BooleanType(), True),
StructField("etnia", StringType(), True),
StructField("nomePaiCidadao", StringType(), True),
StructField("desconheceNomePai", BooleanType(), True),
StructField("dtNaturalizacao", DateType(), True),
StructField("portariaNaturalizacao", StringType(), True),
StructField("dtEntradaBrasil", DateType(), True),
StructField("microarea", LongType(), True),
StructField("stForaArea", BooleanType(), True),
StructField("cpfCidadao", StringType(), True),
StructField("cpfResponsavelFamiliar", StringType(), True)
]), True),
StructField("InformacoesSocioDemograficas", StructType([
StructField("deficienciasCidadao", ArrayType(StringType()), True),
StructField("grauInstrucaoCidadao", StringType(), True),
StructField("ocupacao", StringType(), True),
StructField("orientacaoSexualCidadao", StringType(), True),
StructField("relacaoParentescoCidadao", StringType(), True),
StructField("situacaoMercadoTrabalhoCidadao", StringType(), True),
StructField("statusDesejaInformarOrientacaoSexual", BooleanType(), True),
StructField("statusFrequentaBenzedeira", BooleanType(), True),
StructField("statusFrequentaEscola", BooleanType(), True),
StructField("statusMembroPovoComunidadeTradicional", BooleanType(), True),
StructField("statusParticipaGrupoComunitario", BooleanType(), True),
StructField("statusPossuiPlanoSaudePrivado", BooleanType(), True),
StructField("statusTemAlgumaDeficiencia", BooleanType(), True),
StructField("identidadeGeneroCidadao", StringType(), True),
StructField("statusDesejaInformarIdentidadeGenero", BooleanType(), True),
StructField("responsavelPorCrianca", StringType(), True),
StructField("coPovoComunidadeTradicional", StringType(), True)
]), True),
StructField("statusTermoRecusaCadastroIndividualAtencaoBasica", BooleanType(), True),
StructField("saidaCidadaoCadastro", StructType([
StructField("motivoSaidaCidadao", StringType(), True),
StructField("dataObito", DateType(), True),
StructField("numeroDO", StringType(), True)
]), True)
])
print("Esquema definido com sucesso.")
# Carregar JSON com o esquema
input_path = "s3a://landing/warehouse/fichas_cadastro_individual_1000.json"
output_path = "s3a://bronze/warehouse/fichas_cadastro_individual_parquet"
try:
print("Carregando dados do JSON...")
df = spark.read.schema(schema).json(input_path)
print("Dados carregados com sucesso.")
except Exception as e:
print(f"Erro ao carregar JSON: {e}")
spark.stop()
exit(1)
try:
print("Gravando dados em formato Parquet...")
df.write.mode("overwrite").parquet(output_path)
print("Dados gravados com sucesso.")
except Exception as e:
print(f"Erro ao gravar Parquet: {e}")
finally:
spark.stop()