diff --git a/database/create.sh b/database/create.sh index 5a65f1c9e27b1aa0f0e05f7aeb193f97bb5f50e9..81236920eaf2a4b04ddec465d0b4ec1a02ae11af 100755 --- a/database/create.sh +++ b/database/create.sh @@ -1,6 +1,7 @@ #!/bin/bash DB_NAME=$1 +DB_LINK_LOCATION=/usr/share/postgresql/8.4/contrib/dblink.sql # checking parameters if [[ -z $DB_NAME ]]; then @@ -23,6 +24,10 @@ for file in $(ls -B data/); do psql $DB_NAME -f data/$file done +# importing pre-defined queries for file in $(ls -B query/); do psql $DB_NAME -f query/$file done + +# importing db_link functions +psql $DB_NAMR -f $DB_LINK_LOCATION diff --git a/database/create/001-proinfo-oltp.sql b/database/create/001-proinfo-oltp.sql index 80cdd68577a4f3bbaa557e0fec5e2ac7459c9cf7..ca1affa2b190bf2c0849e43190102c68e56558da 100644 --- a/database/create/001-proinfo-oltp.sql +++ b/database/create/001-proinfo-oltp.sql @@ -18,6 +18,17 @@ CREATE TABLE proinfo_inventory ( extra_hds SMALLINT DEFAULT 0 NOT NULL ); +CREATE TABLE proinfo_net_usage ( + contact_date date default CURRENT_DATE not null, + inep CHARACTER VARYING(18) not null, + macaddr CHARACTER VARYING(18) not null, + collect_time TIME WITHOUT TIME ZONE, + down_kbits BIGINT, + down_packages INTEGER, + up_kbits BIGINT, + up_packages INTEGER +); + CREATE TABLE rejected_inventory ( id INTEGER, sch_id INTEGER, @@ -40,11 +51,13 @@ CREATE TABLE rejected_inventory ( extra_hds SMALLINT ); -CREATE TABLE proinfo_net_usage ( +CREATE TABLE rejected_net_usage ( + sch_id INTEGER, + cit_id INTEGER, contact_date date default CURRENT_DATE not null, inep CHARACTER VARYING(18) not null, macaddr CHARACTER VARYING(18) not null, - collect_time TIMESTAMP, + collect_time TIME WITHOUT TIME ZONE, down_kbits BIGINT, down_packages INTEGER, up_kbits BIGINT, diff --git a/database/create/003-facts.sql b/database/create/003-facts.sql index 563fcb0b92bb988faf911e367364bc7ede9ac5f6..b3f4b72eec5e46b225da24bd7ba40cf955a2a671 100644 --- a/database/create/003-facts.sql +++ b/database/create/003-facts.sql @@ -15,7 +15,7 @@ CREATE TABLE fact_net_usage ( dat_id DATE NOT NULL REFERENCES dim_date, macaddr MACADDR NOT NULL, - collect_time TIMESTAMP NOT NULL, + collect_time TIME WITHOUT TIME ZONE NOT NULL, down_kbits BIGINT NOT NULL DEFAULT 0, down_packages INTEGER NOT NULL DEFAULT 0, up_kbits BIGINT NOT NULL DEFAULT 0, diff --git a/database/load/000-sa.sql b/database/load/000-sa.sql index c18f36cfda0eda76133f2734298c21fb7cccc8fc..412937e92a8e38e9ebf7f16f01a8d55e0418e23e 100644 --- a/database/load/000-sa.sql +++ b/database/load/000-sa.sql @@ -48,7 +48,8 @@ BEGIN i.disk2_model, i.disk2_size, i.disk2_used, - i.extra_hds + i.extra_hds, + '1'::boolean as valid FROM proinfo_inventory i LEFT JOIN @@ -65,7 +66,7 @@ BEGIN VALUES ('create sa inventory', start_ts, CLOCK_TIMESTAMP(), total_rows); -- truncating OLTP table - TRUNCATE TABLE proinfo_inventory; + --TRUNCATE TABLE proinfo_inventory; END; $$ language plpgsql; @@ -94,12 +95,14 @@ BEGIN s.id as sch_id, s.cit_id, n.contact_date, + s.inep, CAST(n.macaddr as MACADDR), n.collect_time, n.down_kbits, n.down_packages, n.up_kbits, - n.up_packages + n.up_packages, + '1'::boolean as valid FROM proinfo_net_usage n LEFT JOIN @@ -113,6 +116,9 @@ BEGIN -- how many rows were inserted? GET DIAGNOSTICS total_rows = ROW_COUNT; + -- creating the primary key + --ALTER TABLE sa_net_usage ADD PRIMARY KEY ("id"); + -- logging INSERT INTO control (name, start_time, end_time, total) VALUES ('create sa net usage', start_ts, CLOCK_TIMESTAMP(), total_rows); @@ -125,6 +131,113 @@ END; $$ language plpgsql; +-- sanitize, removing invalid rows +CREATE OR REPLACE FUNCTION sa_sanitize() returns void as $$ +BEGIN + + PERFORM sa_sanitize_inventory(); + PERFORM sa_sanitize_net_usage(); + +END; +$$ language plpgsql; + + +-- sanitizes sa_inventory table data +CREATE OR REPLACE FUNCTION sa_sanitize_inventory() returns void as $$ +DECLARE + start_ts TIMESTAMP; + total_rows INTEGER; +BEGIN + -- getting initial timestamp + start_ts = CLOCK_TIMESTAMP(); + + -- updating the 'valid' column on invalid records + UPDATE sa_inventory + SET valid = '0' + WHERE + sch_id is NULL OR + cit_id is NULL OR + contact_date is NULL OR + os_type is NULL OR + os_distro is NULL OR + os_kernel is NULL OR + processor is NULL OR + memory is NULL OR + disk1_model is NULL OR + disk1_size is NULL OR + disk1_used is NULL OR + NOT valid_macaddress(macaddr) OR + project < 0 OR project > 3 OR + inep is NULL; + + -- log the invalid information into rejected table + INSERT INTO rejected_inventory + (sch_id, cit_id, contact_date, project, inep, macaddr, + os_type, os_distro, os_kernel, processor, memory, disk1_model, + disk1_size, disk1_used, disk2_model, disk2_size, disk2_used) + (SELECT + sch_id, cit_id, contact_date, project, inep, macaddr, + os_type, os_distro, os_kernel, processor, memory, disk1_model, + disk1_size, disk1_used, disk2_model, disk2_size, disk2_used + FROM + sa_inventory + WHERE + valid = '0'); + + -- how many rows we inserted? + GET DIAGNOSTICS total_rows = ROW_COUNT; + + -- logging + INSERT INTO control (name, start_time, end_time, total) + VALUES ('sanitize sa inventory', start_ts, CLOCK_TIMESTAMP(), total_rows); + +END; +$$ language plpgsql; + + +-- sanitize the data in the net_usage staging area. Invalid data +-- will be marked using the column 'valid' and inserted into the +-- rejected_net_usage table. +CREATE OR REPLACE FUNCTION sa_sanitize_net_usage() returns void as $$ +DECLARE + start_ts TIMESTAMP; + total_rows INTEGER; +BEGIN + -- getting initial timestamp + start_ts = CLOCK_TIMESTAMP(); + + -- check required information updating the valid column + UPDATE sa_net_usage + SET valid = '0' + WHERE + sch_id is NULL OR + cit_id is NULL OR + contact_date is NULL OR + collect_time is NULL OR + NOT valid_macaddress(macaddr::text); + + -- log the invalid information into rejected table + INSERT INTO rejected_net_usage + (sch_id, cit_id, contact_date, inep, macaddr, collect_time, + down_kbits, down_packages, up_kbits, up_packages) + (SELECT + sch_id, cit_id, contact_date, inep, macaddr, collect_time, + down_kbits, down_packages, up_kbits, up_packages + FROM + sa_net_usage + WHERE + valid = '0'); + + -- how many rows we inserted? + GET DIAGNOSTICS total_rows = ROW_COUNT; + + -- logging + INSERT INTO control (name, start_time, end_time, total) + VALUES ('sanitize sa net_usage', start_ts, CLOCK_TIMESTAMP(), total_rows); +END; +$$ language plpgsql; + + -- Function that drops staging area tables. -- This function should NOT be called directly. CREATE OR REPLACE FUNCTION sa_drop() returns void as $$ @@ -157,25 +270,3 @@ BEGIN VALUES ('drop sa usage', start_ts, CLOCK_TIMESTAMP(), total_rows); END; $$ language plpgsql; - - --- sanitize, removing invalid rows -CREATE OR REPLACE FUNCTION sa_sanitize() returns void as $$ -BEGIN - INSERT INTO rejected_inventory - (SELECT * FROM sa_inventory WHERE - os_type is NULL OR - os_distro is NULL OR - os_kernel is NULL OR - processor is NULL OR - memory is NULL OR - disk1_model is NULL OR - disk1_size is NULL OR - disk1_used is NULL OR - NOT valid_macaddress(macaddr) OR - project < 0 OR project > 3 OR - inep NOT IN (SELECT inep FROM dim_school)); - DELETE FROM sa_inventory WHERE id IN (SELECT id FROM rejected_inventory); - -END; -$$ language plpgsql; diff --git a/database/load/998-copy_from_seed.sql b/database/load/998-copy_from_seed.sql new file mode 100644 index 0000000000000000000000000000000000000000..bfce814a764cb69cd0c63465e6a478079555ab08 --- /dev/null +++ b/database/load/998-copy_from_seed.sql @@ -0,0 +1,98 @@ +-- copy inventory data directly from the production database +CREATE OR REPLACE FUNCTION copy_inventory_from_seed() returns void as $$ +BEGIN + + -- inserting rows from production staging area into sa_inventory + INSERT INTO proinfo_inventory + (contact_date, project, inep, macaddr, os_type, os_distro, + os_kernel, processor, memory, disk1_model, disk1_size, + disk1_used, disk2_model, disk2_size, disk2_used) + (SELECT + * + FROM + dblink('dbname=db + host=host + user=user + password=passwd', + 'SELECT + sa_data, + sa_projeto, + sa_inep, + sa_mac, + sa_so_nome, + sa_so_distribuicao, + sa_so_kernel, + sa_processador, + sa_memoria, + sa_disco1_modelo, + sa_disco1_capacidade, + sa_disco1_usado, + sa_disco2_modelo, + sa_disco2_capacidade, + sa_disco2_usado + FROM mectb00_staging_area' + ) + AS ( + contact_date date, + project integer, + inep varchar(18), + macaddr varchar(18), + os_type text, + os_distro text, + os_kernel text, + processor text, + memory integer, + disk1_model text, + disk1_size integer, + disk1_used integer, + disk2_model text, + disk2_size integer, + disk2_used integer + )); + +END; +$$ language plpgsql; + +-- copy net usage data directly from the production database +CREATE OR REPLACE FUNCTION copy_net_from_seed(date_to_load DATE) returns void as $$ +BEGIN + + INSERT INTO proinfo_net_usage + (contact_date, inep, macaddr, collect_time, down_kbits, down_packages, up_kbits, up_packages) + (SELECT + * + FROM + dblink('dbname=db + host=host + user=user + password=passwd', + 'SELECT + net_data, + net_inep, + net_mac, + net_hora, + net_bytes_in::integer, + net_pacotes_in, + net_bytes_out::integer, + net_pacotes_out + FROM mectb00_net_staging_area + WHERE + net_data = ''' || date_to_load ||''' + AND net_bytes_in <> 0 + AND net_pacotes_in <> 0 + AND net_bytes_out <> 0 + AND net_pacotes_in <> 0' + ) + AS ( + contact_date DATE, + inep VARCHAR(18), + macaddr VARCHAR(18), + collect_time TIME WITHOUT TIME ZONE, + down_kbits BIGINT, + down_packages INTEGER, + up_kbits BIGINT, + up_packages INTEGER + )); + +END; +$$ language plpgsql;