From b3775210b573759b78349ab6b95e480f294c8a94 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Fri, 20 Oct 2017 10:09:55 -0400 Subject: [PATCH 01/20] merge pt branch updates --- do_map.pgsql | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/do_map.pgsql b/do_map.pgsql index 5767e93..7c9fff1 100644 --- a/do_map.pgsql +++ b/do_map.pgsql @@ -8,6 +8,7 @@ SELECT m.srce, m.target, t.id, + t.rec, jsonb_build_object( e.v ->> 'key', (t.rec #> ((e.v ->> 'key')::text[])) @@ -30,12 +31,14 @@ FROM LEFT JOIN LATERAL jsonb_array_elements(m.regex->'defn') WITH ORDINALITY e(v, rn) ON true LEFT JOIN LATERAL regexp_matches(t.rec #>> ((e.v ->> 'key')::text[]), e.v ->> 'regex'::text) WITH ORDINALITY mt(mt, rn) ON true WHERE - t.srce = 'PNCC' + t.map IS NULL + AND t.srce = 'PNCC' ORDER BY m.srce, m.seq, m.target, t.id, + t.rec, e.rn ), @@ -47,6 +50,7 @@ agg_rx AS ( rx.srce, rx.target, rx.id, + rx.rec, tps.jsonb_concat_obj(rx.rkey) rkey, tps.jsonb_concat_obj(rx.retval) AS retval, tps.jsonb_concat_obj(CASE rx.retain WHEN 'y' THEN rx.retval ELSE '{}'::jsonb END) retain, @@ -59,27 +63,60 @@ agg_rx AS ( rx.srce, rx.target, rx.id, + rx.rec, rx.seq ) -------------aggregate all targets back to row level (id)------------------------------------------------------------------------------------------------ +,agg_orig AS ( SELECT u.srce, u.id, + u.rec, string_agg(u.target,',') target, - jsonb_pretty(tps.jsonb_concat_obj(coalesce(v.map,'{}'::jsonb) ORDER BY seq )) map, - jsonb_pretty(tps.jsonb_concat_obj(u.retval||coalesce(v.map,'{}'::jsonb) ORDER BY seq)) comb, - jsonb_pretty(tps.jsonb_concat_obj(u.retain||coalesce(v.map,'{}'::jsonb) ORDER BY seq)) retain + tps.jsonb_concat_obj(coalesce(v.map,'{}'::jsonb) ORDER BY seq ) map, + --tps.jsonb_concat_obj(u.retval||coalesce(v.map,'{}'::jsonb) ORDER BY seq) comb, + tps.jsonb_concat_obj(u.retain||coalesce(v.map,'{}'::jsonb) ORDER BY seq) retain FROM --re-aggregate return values and explude any records where one or more regex failed with a null result agg_rx u - LEFT OUTER JOIN tps.map_rv v ON + INNER JOIN tps.map_rv v ON v.target = u.target AND v.srce = u.srce AND v.retval <@ u.retval GROUP BY u.srce, - u.id - LIMIT 1000 \ No newline at end of file + u.id, + u.rec +) + + +--SELECT * FROM agg_orig LIMIT 10 +--UPDATE tps.trans t SET (map) = (SELECT retain FROM agg_orig WHERE t.id = agg_orig.id); + +UPDATE + tps.trans t +SET + map = o.retain +FROM + agg_orig o +WHERE + o.id = t.id + + +/* +SELECT + retain->>'f20', + rec->>'Description', + COUNT(*) +FROM + agg_orig +GROUP BY + retain->>'f20', + rec->>'Description' +ORDER BY + retain->>'f20', + rec->>'Description' +*/ \ No newline at end of file From 51fbec1244c029739b009f6e1114849be4e46014 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Mon, 23 Oct 2017 10:12:37 -0400 Subject: [PATCH 02/20] push inserts into do block so they can consume local variables, need to setup log inserts --- srce.pgsql | 217 ++++++++++++++++++++++++++++------------------------- 1 file changed, 113 insertions(+), 104 deletions(-) diff --git a/srce.pgsql b/srce.pgsql index 0e8ac0a..f749a77 100644 --- a/srce.pgsql +++ b/srce.pgsql @@ -15,8 +15,13 @@ DO $$ DECLARE _t text; DECLARE _c text; +DECLARE _path text; +DECLARE _srce text; BEGIN + + _path := 'C:\users\ptrowbridge\downloads\lon_loan_ledgerbal.csv'; + _srce := 'PNCO'; ----------------------------------------------------build the column list of the temp table---------------------------------------------------------------- @@ -31,7 +36,7 @@ BEGIN --unwrap the schema definition array LEFT JOIN LATERAL jsonb_populate_recordset(null::tps.srce_defn_schema, defn->'schema') prs ON TRUE WHERE - srce = 'PNCC' + srce = _srce GROUP BY srce; @@ -48,122 +53,126 @@ BEGIN ----------------------------------------------------do the insert------------------------------------------------------------------------------------------- --the column list needs to be dynamic forcing this whole line to be dynamic - _t := format('COPY csv_i (%s) FROM ''C:\Users\ptrowbridge\downloads\transsearchcsv.csv'' WITH (HEADER TRUE,DELIMITER '','', FORMAT CSV, ENCODING ''SQL_ASCII'',QUOTE ''"'');',_c); + _t := format('COPY csv_i (%s) FROM %L WITH (HEADER TRUE,DELIMITER '','', FORMAT CSV, ENCODING ''SQL_ASCII'',QUOTE ''"'');',_c,_path); --RAISE NOTICE '%', _t; EXECUTE _t; + WITH -END -$$; + -------------for each imported row in the COPY table, genereate the json rec, and a column for the json key specified in the srce.defn----------- + pending_list AS ( + SELECT + ---creates a key value pair and then aggregates rows of key value pairs + jsonb_object_agg( + (ae.e::text[])[1], --the key name + (row_to_json(i)::jsonb) #> ae.e::text[] --get the target value from the key from the csv row that has been converted to json + ) json_key, + row_to_json(i)::JSONB rec, + srce, + --ae.rn, + id + FROM + csv_i i + INNER JOIN tps.srce s ON + s.srce = _srce + LEFT JOIN LATERAL JSONB_ARRAY_ELEMENTS_TEXT(defn->'unique_constraint'->'fields') WITH ORDINALITY ae(e, rn) ON TRUE + GROUP BY + i.*, + srce, + id + ORDER BY + id ASC + ) -WITH + -----------create a unique list of keys from staged rows------------------------------------------------------------------------------------------ --------------for each imported row in the COPY table, genereate the json rec, and a column for the json key specified in the srce.defn----------- + , pending_keys AS ( + SELECT DISTINCT + json_key + FROM + pending_list + ) -pending_list AS ( + -----------return unique keys that are not already in tps.trans----------------------------------------------------------------------------------- + + , unmatched_keys AS ( SELECT - ---creates a key value pair and then aggregates rows of key value pairs - jsonb_object_agg( - (ae.e::text[])[1], --the key name - (row_to_json(i)::jsonb) #> ae.e::text[] --get the target value from the key from the csv row that has been converted to json - ) json_key, - row_to_json(i)::JSONB rec, - srce, - --ae.rn, - id + json_key FROM - csv_i i + pending_keys + + EXCEPT + + SELECT DISTINCT + k.json_key + FROM + pending_keys k + INNER JOIN tps.trans t ON + t.rec @> k.json_key + ) + + -----------insert pending rows that have key with no trans match----------------------------------------------------------------------------------- + --need to look into mapping the transactions prior to loading + + , inserted AS ( + INSERT INTO + tps.trans (srce, rec) + SELECT + pl.srce + ,pl.rec + FROM + pending_list pl + INNER JOIN unmatched_keys u ON + u.json_key = pl.json_key + ORDER BY + pl.id ASC + ----this conflict is only if an exact duplicate rec json happens, which will be rejected + ----therefore, records may not be inserted due to ay matches with certain json fields, or if the entire json is a duplicate, reason is not specified + RETURNING * + ) + + -----------list of records not inserted-------------------------------------------------------------------------------------------------------------- + + , not_inserted AS ( + SELECT + srce + ,rec + FROM + pending_list + + EXCEPT ALL + + SELECT + srce + ,rec + FROM + inserted + ) + + --------insert to log------------------------------------------------------------------------------------------------------------------------------------- + --below select should be loaded to the log table + + + + --------summarize records not inserted-------------------+------------------------------------------------------------------------------------------------ + + SELECT + t.srce + ,(ae.e::text[])[1] unq_constr + ,MIN(rec #>> ae.e::text[]) min_text + ,MAX(rec #>> ae.e::text[]) max_text + ,JSONB_PRETTY(JSON_AGG(rec #> ae.e::text[] ORDER BY rec #>> ae.e::text[])::JSONB) + FROM + not_inserted t INNER JOIN tps.srce s ON - s.srce = 'PNCC' + s.srce = t.srce LEFT JOIN LATERAL JSONB_ARRAY_ELEMENTS_TEXT(defn->'unique_constraint'->'fields') WITH ORDINALITY ae(e, rn) ON TRUE GROUP BY - i.*, - srce, - id - ORDER BY - id ASC -) + t.srce + ,(ae.e::text[])[1]; ------------create a unique list of keys from staged rows------------------------------------------------------------------------------------------ - -, pending_keys AS ( - SELECT DISTINCT - json_key - FROM - pending_list -) - ------------return unique keys that are not already in tps.trans----------------------------------------------------------------------------------- - -, unmatched_keys AS ( -SELECT - json_key -FROM - pending_keys - -EXCEPT - -SELECT DISTINCT - k.json_key -FROM - pending_keys k - INNER JOIN tps.trans t ON - t.rec @> k.json_key -) - ------------insert pending rows that have key with no trans match----------------------------------------------------------------------------------- - -, inserted AS ( - INSERT INTO - tps.trans (srce, rec) - SELECT - pl.srce - ,pl.rec - FROM - pending_list pl - INNER JOIN unmatched_keys u ON - u.json_key = pl.json_key - ORDER BY - pl.id ASC - ----this conflict is only if an exact duplicate rec json happens, which will be rejected - ----therefore, records may not be inserted due to ay matches with certain json fields, or if the entire json is a duplicate, reason is not specified - RETURNING * -) - ------------list of records not inserted-------------------------------------------------------------------------------------------------------------- - -, not_inserted AS ( - SELECT - srce - ,rec - FROM - pending_list - - EXCEPT ALL - - SELECT - srce - ,rec - FROM - inserted -) - ---------summarize records not inserted-------------------+------------------------------------------------------------------------------------------------ - -SELECT - t.srce - ,(ae.e::text[])[1] unq_constr - ,MIN(rec #>> ae.e::text[]) min_text - ,MAX(rec #>> ae.e::text[]) max_text - ,JSONB_PRETTY(JSON_AGG(rec #> ae.e::text[] ORDER BY rec #>> ae.e::text[])::JSONB) -FROM - not_inserted t - INNER JOIN tps.srce s ON - s.srce = t.srce - LEFT JOIN LATERAL JSONB_ARRAY_ELEMENTS_TEXT(defn->'unique_constraint'->'fields') WITH ORDINALITY ae(e, rn) ON TRUE -GROUP BY - t.srce - ,(ae.e::text[])[1]; \ No newline at end of file +END +$$; \ No newline at end of file From 57fa123adf3493c1f8f2560104ee7a5c949730ff Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Mon, 23 Oct 2017 10:13:32 -0400 Subject: [PATCH 03/20] add transaction log --- ubm_data.sql | 15 ++++++++++++++- ubm_schema.sql | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/ubm_data.sql b/ubm_data.sql index 96a1f74..238684a 100644 --- a/ubm_data.sql +++ b/ubm_data.sql @@ -147,6 +147,12 @@ INSERT INTO map_rv (srce, target, retval, map) VALUES ('PNCC', 'ACH Debits', '{" INSERT INTO map_rv (srce, target, retval, map) VALUES ('PNCC', 'ACH Debits', '{"ini": "ACH DEBIT RECEIVED", "desc": " DBI COBRA", "compn": " DBI COBRA"}', '{"party": "ADP", "reason": "Payroll Direct Deposit"}'); +-- +-- Data for Name: trans_log; Type: TABLE DATA; Schema: tps; Owner: - +-- + + + SET search_path = evt, pg_catalog; -- @@ -162,7 +168,14 @@ SET search_path = tps, pg_catalog; -- Name: trans_id_seq; Type: SEQUENCE SET; Schema: tps; Owner: - -- -SELECT pg_catalog.setval('trans_id_seq', 1543757, true); +SELECT pg_catalog.setval('trans_id_seq', 1544069, true); + + +-- +-- Name: trans_log_id_seq; Type: SEQUENCE SET; Schema: tps; Owner: - +-- + +SELECT pg_catalog.setval('trans_log_id_seq', 1, false); -- diff --git a/ubm_schema.sql b/ubm_schema.sql index fcda4b0..a81bac7 100644 --- a/ubm_schema.sql +++ b/ubm_schema.sql @@ -237,6 +237,30 @@ ALTER TABLE trans ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY ( ); +-- +-- Name: trans_log; Type: TABLE; Schema: tps; Owner: - +-- + +CREATE TABLE trans_log ( + id integer NOT NULL, + info jsonb +); + + +-- +-- Name: trans_log_id_seq; Type: SEQUENCE; Schema: tps; Owner: - +-- + +ALTER TABLE trans_log ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME trans_log_id_seq + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + + SET search_path = evt, pg_catalog; -- @@ -273,6 +297,14 @@ ALTER TABLE ONLY srce ADD CONSTRAINT srce_pkey PRIMARY KEY (srce); +-- +-- Name: trans_log trans_log_pkey; Type: CONSTRAINT; Schema: tps; Owner: - +-- + +ALTER TABLE ONLY trans_log + ADD CONSTRAINT trans_log_pkey PRIMARY KEY (id); + + -- -- Name: trans trans_pkey; Type: CONSTRAINT; Schema: tps; Owner: - -- From 937cf38af15915ba4d18ec3cf54fd04ee48aa268 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Tue, 24 Oct 2017 17:37:20 -0400 Subject: [PATCH 04/20] updates in progress --- loan_bal.pgsql | 2 +- srce.pgsql | 20 ++++++++++++++------ srce_defn.pgsql | 17 +++++++++++++++++ trans_log_template.pgsql | 16 ++++++++++++++++ ubm_data.sql | 2 +- ubm_schema.sql | 24 ++++++++++++++++++++++++ 6 files changed, 73 insertions(+), 8 deletions(-) create mode 100644 srce_defn.pgsql create mode 100644 trans_log_template.pgsql diff --git a/loan_bal.pgsql b/loan_bal.pgsql index 240c296..60da152 100644 --- a/loan_bal.pgsql +++ b/loan_bal.pgsql @@ -1,4 +1,4 @@ - +\timing SELECT r.*, SUM(r."Advances"+r."Adjustments"-r."Payments") OVER (ORDER BY r."Post Date" asc ,r."Reference #" asc) diff --git a/srce.pgsql b/srce.pgsql index f749a77..6e7e67c 100644 --- a/srce.pgsql +++ b/srce.pgsql @@ -1,3 +1,6 @@ +SET auto_explain.log_min_duration = 0; +SHOW ALL; + \timing /*-------------------------------------------------------- @@ -59,6 +62,12 @@ BEGIN EXECUTE _t; + + +END +$$; + + WITH -------------for each imported row in the COPY table, genereate the json rec, and a column for the json key specified in the srce.defn----------- @@ -77,7 +86,7 @@ BEGIN FROM csv_i i INNER JOIN tps.srce s ON - s.srce = _srce + s.srce = 'PNCO' LEFT JOIN LATERAL JSONB_ARRAY_ELEMENTS_TEXT(defn->'unique_constraint'->'fields') WITH ORDINALITY ae(e, rn) ON TRUE GROUP BY i.*, @@ -155,12 +164,13 @@ BEGIN --------insert to log------------------------------------------------------------------------------------------------------------------------------------- --below select should be loaded to the log table - + --------summarize records not inserted-------------------+------------------------------------------------------------------------------------------------ + SELECT - t.srce + jsonb_build_object('srce', t.srce) srce, ,(ae.e::text[])[1] unq_constr ,MIN(rec #>> ae.e::text[]) min_text ,MAX(rec #>> ae.e::text[]) max_text @@ -173,6 +183,4 @@ BEGIN GROUP BY t.srce ,(ae.e::text[])[1]; - -END -$$; \ No newline at end of file + */ \ No newline at end of file diff --git a/srce_defn.pgsql b/srce_defn.pgsql new file mode 100644 index 0000000..79046fb --- /dev/null +++ b/srce_defn.pgsql @@ -0,0 +1,17 @@ +WITH +ext AS ( +SELECT + srce + ,defn->'unique_constraint'->>'fields' + ,ARRAY(SELECT ae.e::text[] FROM jsonb_array_elements_text(defn->'unique_constraint'->'fields') ae(e)) txa +FROM + tps.srce +) +SELECT + srce + , + public.jsonb_extract(rec,txa) +FROM + tps.trans + INNER JOIN ext ON + trans.srce = ext.srce \ No newline at end of file diff --git a/trans_log_template.pgsql b/trans_log_template.pgsql new file mode 100644 index 0000000..07b958e --- /dev/null +++ b/trans_log_template.pgsql @@ -0,0 +1,16 @@ +SELECT +jsonb_pretty( +$$ +{ + "path":"C:\\users\\ptrowbridge\\downloads\\transsearchcsv.csv" + ,"srce":"PNCC" + ,"stamp":"2017-10-24 08:32:06.599067-04" + ,"inserted":{ + "keys":[ + 1,2,3,4,5,6,7 + ] + ,"summary":"" + } +} +$$::jsonb +) \ No newline at end of file diff --git a/ubm_data.sql b/ubm_data.sql index 238684a..a83a65c 100644 --- a/ubm_data.sql +++ b/ubm_data.sql @@ -168,7 +168,7 @@ SET search_path = tps, pg_catalog; -- Name: trans_id_seq; Type: SEQUENCE SET; Schema: tps; Owner: - -- -SELECT pg_catalog.setval('trans_id_seq', 1544069, true); +SELECT pg_catalog.setval('trans_id_seq', 1544080, true); -- diff --git a/ubm_schema.sql b/ubm_schema.sql index a81bac7..543484f 100644 --- a/ubm_schema.sql +++ b/ubm_schema.sql @@ -119,6 +119,30 @@ CREATE TYPE srce_defn_schema AS ( ); +SET search_path = public, pg_catalog; + +-- +-- Name: jsonb_extract(jsonb, text[]); Type: FUNCTION; Schema: public; Owner: - +-- + +CREATE FUNCTION jsonb_extract(rec jsonb, key_list text[]) RETURNS jsonb + LANGUAGE plpgsql + AS $$ +DECLARE + t text; + j jsonb := '{}'::jsonb; + +BEGIN + FOREACH t IN ARRAY key_list LOOP + j := j || jsonb_build_object(t,rec->t); + END LOOP; + RETURN j; +END; +$$; + + +SET search_path = tps, pg_catalog; + -- -- Name: jsonb_concat(jsonb, jsonb); Type: FUNCTION; Schema: tps; Owner: - -- From 6056491ace8ee3b974d5cf05809d4faa9054656e Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Wed, 25 Oct 2017 09:07:51 -0400 Subject: [PATCH 05/20] merge srce changes from pt branch --- srce.pgsql | 132 +++++++++++++++++++++++++---------------------------- 1 file changed, 63 insertions(+), 69 deletions(-) diff --git a/srce.pgsql b/srce.pgsql index 6e7e67c..0f1b237 100644 --- a/srce.pgsql +++ b/srce.pgsql @@ -1,6 +1,3 @@ -SET auto_explain.log_min_duration = 0; -SHOW ALL; - \timing /*-------------------------------------------------------- @@ -9,8 +6,7 @@ SHOW ALL; 2. get unqiue pending keys 3. see which keys not already in tps.trans 4. insert pending records associated with keys that are not already in trans -5. get list of recors not inserted -6. summarize records not inserted +5. insert summary to log table */--------------------------------------------------------- @@ -23,8 +19,8 @@ DECLARE _srce text; BEGIN - _path := 'C:\users\ptrowbridge\downloads\lon_loan_ledgerbal.csv'; - _srce := 'PNCO'; + _path := 'C:\users\fleet\downloads\dc1024.csv'; + _srce := 'DCARD'; ----------------------------------------------------build the column list of the temp table---------------------------------------------------------------- @@ -35,7 +31,7 @@ BEGIN _t, _c FROM - TPS.srce + tps.srce --unwrap the schema definition array LEFT JOIN LATERAL jsonb_populate_recordset(null::tps.srce_defn_schema, defn->'schema') prs ON TRUE WHERE @@ -62,22 +58,29 @@ BEGIN EXECUTE _t; - - -END -$$; - - WITH + -------------extract the limiter fields to one row per source---------------------------------- + + ext AS ( + SELECT + srce + ,defn->'unique_constraint'->>'fields' + ,ARRAY(SELECT ae.e::text[] FROM jsonb_array_elements_text(defn->'unique_constraint'->'fields') ae(e)) text_array + FROM + tps.srce + WHERE + srce = _srce + --add where clause for targeted source + ) + -------------for each imported row in the COPY table, genereate the json rec, and a column for the json key specified in the srce.defn----------- - pending_list AS ( + ,pending_list AS ( SELECT - ---creates a key value pair and then aggregates rows of key value pairs - jsonb_object_agg( - (ae.e::text[])[1], --the key name - (row_to_json(i)::jsonb) #> ae.e::text[] --get the target value from the key from the csv row that has been converted to json + jsonb_extract( + row_to_json(i)::jsonb + ,ext.text_array ) json_key, row_to_json(i)::JSONB rec, srce, @@ -85,13 +88,8 @@ $$; id FROM csv_i i - INNER JOIN tps.srce s ON - s.srce = 'PNCO' - LEFT JOIN LATERAL JSONB_ARRAY_ELEMENTS_TEXT(defn->'unique_constraint'->'fields') WITH ORDINALITY ae(e, rn) ON TRUE - GROUP BY - i.*, - srce, - id + INNER JOIN ext ON + ext.srce = _srce ORDER BY id ASC ) @@ -105,6 +103,17 @@ $$; pending_list ) + -----------list of keys already loaded to tps----------------------------------------------------------------------------------------------------- + + , matched_keys AS ( + SELECT DISTINCT + k.json_key + FROM + pending_keys k + INNER JOIN tps.trans t ON + t.rec @> k.json_key + ) + -----------return unique keys that are not already in tps.trans----------------------------------------------------------------------------------- , unmatched_keys AS ( @@ -115,12 +124,10 @@ $$; EXCEPT - SELECT DISTINCT - k.json_key + SELECT + json_key FROM - pending_keys k - INNER JOIN tps.trans t ON - t.rec @> k.json_key + matched_keys ) -----------insert pending rows that have key with no trans match----------------------------------------------------------------------------------- @@ -143,44 +150,31 @@ $$; RETURNING * ) - -----------list of records not inserted-------------------------------------------------------------------------------------------------------------- - - , not_inserted AS ( - SELECT - srce - ,rec - FROM - pending_list - - EXCEPT ALL - - SELECT - srce - ,rec - FROM - inserted - ) - - --------insert to log------------------------------------------------------------------------------------------------------------------------------------- - --below select should be loaded to the log table - - - --------summarize records not inserted-------------------+------------------------------------------------------------------------------------------------ - + INSERT INTO + tps.trans_log (info) SELECT - jsonb_build_object('srce', t.srce) srce, - ,(ae.e::text[])[1] unq_constr - ,MIN(rec #>> ae.e::text[]) min_text - ,MAX(rec #>> ae.e::text[]) max_text - ,JSONB_PRETTY(JSON_AGG(rec #> ae.e::text[] ORDER BY rec #>> ae.e::text[])::JSONB) - FROM - not_inserted t - INNER JOIN tps.srce s ON - s.srce = t.srce - LEFT JOIN LATERAL JSONB_ARRAY_ELEMENTS_TEXT(defn->'unique_constraint'->'fields') WITH ORDINALITY ae(e, rn) ON TRUE - GROUP BY - t.srce - ,(ae.e::text[])[1]; - */ \ No newline at end of file + JSONB_BUILD_OBJECT('time_stamp',CURRENT_TIMESTAMP) + ||JSONB_BUILD_OBJECT('srce',_srce) + ||JSONB_BUILD_OBJECT('path',_path) + ||JSONB_BUILD_OBJECT('not_inserted', + ( + SELECT + jsonb_agg(json_key) + FROM + matched_keys + ) + ) + ||JSONB_BUILD_OBJECT('inserted', + ( + SELECT + jsonb_agg(json_key) + FROM + unmatched_keys + ) + ); + +END +$$; + From 65ee6d703b1a26ef68a929d531c79d3859772f2a Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Wed, 25 Oct 2017 10:44:32 -0400 Subject: [PATCH 06/20] modify to print log info --- srce.pgsql | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/srce.pgsql b/srce.pgsql index 0f1b237..251198c 100644 --- a/srce.pgsql +++ b/srce.pgsql @@ -16,11 +16,13 @@ DECLARE _t text; DECLARE _c text; DECLARE _path text; DECLARE _srce text; +DECLARE _log_info text; +DECLARE _log_id text; BEGIN - _path := 'C:\users\fleet\downloads\dc1024.csv'; - _srce := 'DCARD'; + _path := 'C:\users\ptrowbridge\downloads\llcol.csv'; + _srce := 'PNCL'; ----------------------------------------------------build the column list of the temp table---------------------------------------------------------------- @@ -152,6 +154,7 @@ BEGIN --------summarize records not inserted-------------------+------------------------------------------------------------------------------------------------ + , logged AS ( INSERT INTO tps.trans_log (info) SELECT @@ -173,7 +176,20 @@ BEGIN FROM unmatched_keys ) - ); + ) + RETURNING * + ) + + SELECT + id + ,info + INTO + _log_id + ,_log_info + FROM + logged; + + RAISE NOTICE 'import logged under id# %, info: %', _log_id, _log_info; END $$; From 44373bc616797433bd481f813fc9979bb4d21541 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Wed, 25 Oct 2017 10:44:57 -0400 Subject: [PATCH 07/20] copy source template from pt branch --- srce_template.pgsql | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 srce_template.pgsql diff --git a/srce_template.pgsql b/srce_template.pgsql new file mode 100644 index 0000000..c32ace4 --- /dev/null +++ b/srce_template.pgsql @@ -0,0 +1,39 @@ +insert into tps.srce +SELECT +'CAMZ', +$$ +{ + "name": "CAMZ", + "description":"Chase Amazon Credit Card", + "type": "csv", + "schema": [ + { + "key": "Type", + "type": "text" + }, + { + "key": "Trans Date", + "type": "date" + }, + { + "key": "Post Date", + "type": "date" + }, + { + "key": "Description", + "type": "text" + }, + { + "key": "Amount", + "type": "numeric" + } + ], + "unique_constraint": { + "type": "key", + "fields": [ + "{Trans Date}" + ,"{Post Date}" + ] + } +} +$$::JSONB \ No newline at end of file From ba4a404980341a766cda171ca36bbe713520a0f5 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Wed, 25 Oct 2017 10:45:24 -0400 Subject: [PATCH 08/20] added partion by major item, and then sort on import id --- col_balance.pgsql | 9 +++++---- loan_bal.pgsql | 6 ++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/col_balance.pgsql b/col_balance.pgsql index 09e726a..8f812f3 100644 --- a/col_balance.pgsql +++ b/col_balance.pgsql @@ -7,7 +7,7 @@ SELECT WHEN '02IN Raw Material' THEN 13097563.42 WHEN '03IN Finished Goods' THEN 35790696.52 ELSE 0 - END + SUM("Sales"+"Credits & Adjustments"-"Gross Collections") OVER (ORDER BY "PostDate" ASC, rec->>'id' ASC) running_bal + END + SUM("Sales"+"Credits & Adjustments"-"Gross Collections") OVER (PARTITION BY "Schedule#" ORDER BY "Schedule#" ASC, "PostDate" ASC, rec->>'id' ASC) running_bal ,(LEAST("CollateralBalance" - "Ineligible Amount","MaxEligible")*("AdvanceRate"/100))::NUMERIC(20,2) qualified_collateral ,(("CollateralBalance" - "Ineligible Amount")*("AdvanceRate"/100))::NUMERIC(20,2) qualified_collateral_nl FROM @@ -15,7 +15,8 @@ FROM LEFT JOIN LATERAL jsonb_populate_record(null::tps.pncl, rec) r ON TRUE WHERE srce = 'PNCL' - AND "PostDate" >= '2017-10-12' - --rec @> '{"Schedule#":"03IN Finished Goods"}' + --AND rec @> '{"Schedule#":"03IN Finished Goods"}' ORDER BY - r."PostDate" asc \ No newline at end of file + "Schedule#" asc + ,r."PostDate" asc + ,rec->>'id' asc \ No newline at end of file diff --git a/loan_bal.pgsql b/loan_bal.pgsql index 60da152..a149817 100644 --- a/loan_bal.pgsql +++ b/loan_bal.pgsql @@ -1,12 +1,14 @@ \timing SELECT r.*, - SUM(r."Advances"+r."Adjustments"-r."Payments") OVER (ORDER BY r."Post Date" asc ,r."Reference #" asc) + SUM(r."Advances"+r."Adjustments"-r."Payments") OVER (PARTITION BY "Loan#" ORDER BY r."Post Date" asc ,rec->>'id' asc, r."Reference #" asc) FROM tps.trans LEFT JOIN LATERAL jsonb_populate_record(null::tps.pnco, rec) r ON TRUE WHERE rec @> '{"Loan#":"606780191"}' ORDER BY - r."Post Date" asc + r."Loan#" + ,r."Post Date" asc + ,rec->>'id' asc ,r."Reference #" asc From 8929b4eea93940728ff0377002fb486af1275e82 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Wed, 25 Oct 2017 12:05:28 -0400 Subject: [PATCH 09/20] try using correlated subquery for unique list of keys, is pretty slow --- srce_defn.pgsql | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/srce_defn.pgsql b/srce_defn.pgsql index 79046fb..65361c3 100644 --- a/srce_defn.pgsql +++ b/srce_defn.pgsql @@ -1,17 +1,24 @@ +\timing +/* WITH ext AS ( SELECT srce ,defn->'unique_constraint'->>'fields' - ,ARRAY(SELECT ae.e::text[] FROM jsonb_array_elements_text(defn->'unique_constraint'->'fields') ae(e)) txa + ,ARRAY(SELECT ae.e::text[] FROM jsonb_array_elements_text(defn->'unique_constraint'->'fields') ae(e)) text_array FROM tps.srce + --add where clause for targeted source ) -SELECT - srce - , - public.jsonb_extract(rec,txa) +*/ + +SELECT COUNT(*) FROM +( +SELECT DISTINCT + t.srce + ,(SELECT JSONB_OBJECT_agg(ae.e,rec #> ae.e::text[]) FROM jsonb_array_elements_text(defn->'unique_constraint'->'fields') ae(e)) ja FROM - tps.trans - INNER JOIN ext ON - trans.srce = ext.srce \ No newline at end of file + tps.trans t + INNER JOIN tps.srce s ON + s.srce = t.srce +) X \ No newline at end of file From 4a85732890e147f3285d7919a4ded9e69ac90308 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Wed, 25 Oct 2017 12:07:04 -0400 Subject: [PATCH 10/20] edit function to recognize slice of array, ad bank schema to hold types there, add function to json regex definition to accomodate differntiating extracts from replaces --- ubm_data.sql | 21 ++++++++++++++------- ubm_schema.sql | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/ubm_data.sql b/ubm_data.sql index a83a65c..b6aa4be 100644 --- a/ubm_data.sql +++ b/ubm_data.sql @@ -38,11 +38,11 @@ INSERT INTO srce (srce, defn) VALUES ('ADPRP', '{"name": "ADPRP", "type": "csv", -- Data for Name: map_rm; Type: TABLE DATA; Schema: tps; Owner: - -- -INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'ACH Debits', '{"defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "compn", "regex": "Comp Name:(.+?)(?=$| Comp|\\w+?:)", "retain": "y"}, {"key": "{Description}", "field": "adp_comp", "regex": "Cust ID:.*?(B3X|UDV|U7E|U7C|U7H|U7J).*?(?=$|\\w+?:)", "retain": "y"}, {"key": "{Description}", "field": "desc", "regex": "Desc:(.+?) Comp", "retain": "y"}, {"key": "{Description}", "field": "discr", "regex": "Discr:(.+?)(?=$| SEC:|\\w+?:)", "retain": "y"}], "where": [{"Transaction": "ACH Debits"}]}', 2); -INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Trans Type', '{"defn": [{"key": "{AccountName}", "field": "acctn", "regex": "(.*)", "retain": "n"}, {"key": "{Transaction}", "field": "trans", "regex": "(.*)", "retain": "n"}, {"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}], "where": [{}]}', 1); -INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Wires Out', '{"defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "bene", "regex": "BENEFICIARY:(.+?) AC/", "retain": "y"}, {"key": "{Description}", "field": "accts", "regex": "AC/(\\w*) .*AC/(\\w*) ", "retain": "y"}], "where": [{"Transaction": "Intl Money Transfer Debits"}, {"Transaction": "Money Transfer DB - Wire"}]}', 2); -INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Currency', '{"defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "curr1", "regex": ".*(DEBIT|CREDIT).*(USD|CAD).*(?=DEBIT|CREDIT).*(?=USD|CAD).*", "retain": "y"}, {"key": "{Description}", "field": "curr2", "regex": ".*(?=DEBIT|CREDIT).*(?=USD|CAD).*(DEBIT|CREDIT).*(USD|CAD).*", "retain": "y"}], "where": [{"Transaction": "Miscellaneous Credits"}, {"Transaction": "Miscellaneous Debits"}]}', 2); -INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Check Number', '{"defn": [{"key": "{Description}", "field": "checkn", "regex": "[^0-9]*([0-9]*)\\s|$", "retain": "y"}], "where": [{"Transaction": "Checks Paid"}]}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'ACH Debits', '{"defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "compn", "regex": "Comp Name:(.+?)(?=$| Comp|\\w+?:)", "retain": "y"}, {"key": "{Description}", "field": "adp_comp", "regex": "Cust ID:.*?(B3X|UDV|U7E|U7C|U7H|U7J).*?(?=$|\\w+?:)", "retain": "y"}, {"key": "{Description}", "field": "desc", "regex": "Desc:(.+?) Comp", "retain": "y"}, {"key": "{Description}", "field": "discr", "regex": "Discr:(.+?)(?=$| SEC:|\\w+?:)", "retain": "y"}], "where": [{"Transaction": "ACH Debits"}], "function": "extract"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Trans Type', '{"defn": [{"key": "{AccountName}", "field": "acctn", "regex": "(.*)", "retain": "n"}, {"key": "{Transaction}", "field": "trans", "regex": "(.*)", "retain": "n"}, {"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}], "where": [{}], "function": "extract"}', 1); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Wires Out', '{"defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "bene", "regex": "BENEFICIARY:(.+?) AC/", "retain": "y"}, {"key": "{Description}", "field": "accts", "regex": "AC/(\\w*) .*AC/(\\w*) ", "retain": "y"}], "where": [{"Transaction": "Intl Money Transfer Debits"}, {"Transaction": "Money Transfer DB - Wire"}], "function": "extract"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Currency', '{"defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "curr1", "regex": ".*(DEBIT|CREDIT).*(USD|CAD).*(?=DEBIT|CREDIT).*(?=USD|CAD).*", "retain": "y"}, {"key": "{Description}", "field": "curr2", "regex": ".*(?=DEBIT|CREDIT).*(?=USD|CAD).*(DEBIT|CREDIT).*(USD|CAD).*", "retain": "y"}], "where": [{"Transaction": "Miscellaneous Credits"}, {"Transaction": "Miscellaneous Debits"}], "function": "extract"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Check Number', '{"defn": [{"key": "{Description}", "field": "checkn", "regex": "[^0-9]*([0-9]*)\\s|$", "retain": "y"}], "where": [{"Transaction": "Checks Paid"}], "function": "extract"}', 2); -- @@ -145,12 +145,19 @@ INSERT INTO map_rv (srce, target, retval, map) VALUES ('PNCC', 'Trans Type', '{" INSERT INTO map_rv (srce, target, retval, map) VALUES ('PNCC', 'ACH Debits', '{"ini": "ACH DEBIT RECEIVED", "desc": " WAGE GARN", "compn": " ADP WAGE GARN"}', '{"party": "ADP", "reason": "Payroll Garnishments"}'); INSERT INTO map_rv (srce, target, retval, map) VALUES ('PNCC', 'ACH Debits', '{"ini": "ACH DEBIT RECEIVED", "desc": " WAGE PAY", "compn": " ADP WAGE PAY"}', '{"party": "ADP", "reason": "Payroll Direct Deposit"}'); INSERT INTO map_rv (srce, target, retval, map) VALUES ('PNCC', 'ACH Debits', '{"ini": "ACH DEBIT RECEIVED", "desc": " DBI COBRA", "compn": " DBI COBRA"}', '{"party": "ADP", "reason": "Payroll Direct Deposit"}'); +INSERT INTO map_rv (srce, target, retval, map) VALUES ('PNCC', 'Trans Type', '{"ini": "ACH DEBIT RETURN", "acctn": "The HC Operating Company OPERA", "trans": "ACH Debits"}', '{"sign": "-1", "ledger": "manual", "trantype": "Disbursement"}'); +INSERT INTO map_rv (srce, target, retval, map) VALUES ('PNCC', 'Trans Type', '{"ini": "ACH CREDIT SETTLEMENT", "acctn": "The HC Operating Company OPERA", "trans": "ACH Credits"}', '{"sign": "1", "ledger": "manual", "trantype": "Disbursement"}'); -- -- Data for Name: trans_log; Type: TABLE DATA; Schema: tps; Owner: - -- +INSERT INTO trans_log (id, info) VALUES (1, '{"path": "C:\\users\\ptrowbridge\\downloads\\lon_loan_ledgercol.csv", "srce": "PNCL", "inserted": [{"PostDate": "2017-10-17", "Schedule#": "05AR"}, {"PostDate": "2017-10-17", "Schedule#": "MR"}, {"PostDate": "2017-10-20", "Schedule#": "01AR"}, {"PostDate": "2017-10-17", "Schedule#": "03IN Finished Goods"}, {"PostDate": "2017-10-17", "Schedule#": "04AR RS"}, {"PostDate": "2017-10-19", "Schedule#": "01AR"}, {"PostDate": "2017-10-17", "Schedule#": "06AR RS"}, {"PostDate": "2017-10-17", "Schedule#": "02IN Raw Material"}], "time_stamp": "2017-10-25T09:52:10.221392-04:00", "not_inserted": [{"PostDate": "2017-10-18", "Schedule#": "01AR"}, {"PostDate": "2017-10-17", "Schedule#": "01AR"}]}'); +INSERT INTO trans_log (id, info) VALUES (2, '{"path": "C:\\users\\ptrowbridge\\downloads\\transsearchcsv(1).csv", "srce": "PNCC", "inserted": [{"AsOfDate": "2017-10-24"}, {"AsOfDate": "2017-10-23"}], "time_stamp": "2017-10-25T10:04:21.618701-04:00", "not_inserted": [{"AsOfDate": "2017-10-19"}, {"AsOfDate": "2017-10-20"}, {"AsOfDate": "2017-10-18"}]}'); +INSERT INTO trans_log (id, info) VALUES (3, '{"path": "C:\\users\\ptrowbridge\\downloads\\transsearchcsv(1).csv", "srce": "PNCC", "inserted": null, "time_stamp": "2017-10-25T10:08:11.443367-04:00", "not_inserted": [{"AsOfDate": "2017-10-19"}, {"AsOfDate": "2017-10-24"}, {"AsOfDate": "2017-10-20"}, {"AsOfDate": "2017-10-23"}, {"AsOfDate": "2017-10-18"}]}'); +INSERT INTO trans_log (id, info) VALUES (4, '{"path": "C:\\users\\ptrowbridge\\downloads\\llbal.csv", "srce": "PNCO", "inserted": [{"Loan#": "606780191", "Post Date": "2017-10-23", "Effective Date": "2017-10-23"}, {"Loan#": "606780191", "Post Date": "2017-10-24", "Effective Date": "2017-10-24"}], "time_stamp": "2017-10-25T10:13:37.760308-04:00", "not_inserted": [{"Loan#": "606780191", "Post Date": "2017-10-19", "Effective Date": "2017-10-19"}, {"Loan#": "606780191", "Post Date": "2017-10-18", "Effective Date": "2017-10-18"}, {"Loan#": "606780191", "Post Date": "2017-10-20", "Effective Date": "2017-10-20"}]}'); +INSERT INTO trans_log (id, info) VALUES (5, '{"path": "C:\\users\\ptrowbridge\\downloads\\llcol.csv", "srce": "PNCL", "inserted": [{"PostDate": "2017-10-24", "Schedule#": "01AR"}, {"PostDate": "2017-10-23", "Schedule#": "01AR"}], "time_stamp": "2017-10-25T10:14:10.004265-04:00", "not_inserted": [{"PostDate": "2017-10-18", "Schedule#": "01AR"}, {"PostDate": "2017-10-17", "Schedule#": "05AR"}, {"PostDate": "2017-10-17", "Schedule#": "MR"}, {"PostDate": "2017-10-20", "Schedule#": "01AR"}, {"PostDate": "2017-10-17", "Schedule#": "03IN Finished Goods"}, {"PostDate": "2017-10-17", "Schedule#": "04AR RS"}, {"PostDate": "2017-10-19", "Schedule#": "01AR"}, {"PostDate": "2017-10-17", "Schedule#": "01AR"}, {"PostDate": "2017-10-17", "Schedule#": "06AR RS"}, {"PostDate": "2017-10-17", "Schedule#": "02IN Raw Material"}]}'); SET search_path = evt, pg_catalog; @@ -168,14 +175,14 @@ SET search_path = tps, pg_catalog; -- Name: trans_id_seq; Type: SEQUENCE SET; Schema: tps; Owner: - -- -SELECT pg_catalog.setval('trans_id_seq', 1544080, true); +SELECT pg_catalog.setval('trans_id_seq', 1544252, true); -- -- Name: trans_log_id_seq; Type: SEQUENCE SET; Schema: tps; Owner: - -- -SELECT pg_catalog.setval('trans_log_id_seq', 1, false); +SELECT pg_catalog.setval('trans_log_id_seq', 5, true); -- diff --git a/ubm_schema.sql b/ubm_schema.sql index 543484f..dc97c91 100644 --- a/ubm_schema.sql +++ b/ubm_schema.sql @@ -14,6 +14,13 @@ SET check_function_bodies = false; SET client_min_messages = warning; SET row_security = off; +-- +-- Name: bank; Type: SCHEMA; Schema: -; Owner: - +-- + +CREATE SCHEMA bank; + + -- -- Name: evt; Type: SCHEMA; Schema: -; Owner: - -- @@ -70,6 +77,27 @@ CREATE EXTENSION IF NOT EXISTS plprofiler WITH SCHEMA public; COMMENT ON EXTENSION plprofiler IS 'server-side support for profiling PL/pgSQL functions'; +SET search_path = bank, pg_catalog; + +-- +-- Name: pncc; Type: TYPE; Schema: bank; Owner: - +-- + +CREATE TYPE pncc AS ( + "AsOfDate" date, + "BankId" text, + "AccountNumber" text, + "AccountName" text, + "BaiControl" text, + "Currency" text, + "Transaction" text, + "Reference" text, + "Amount" numeric, + "Description" text, + "AdditionalRemittance" text +); + + SET search_path = tps, pg_catalog; -- @@ -129,12 +157,14 @@ CREATE FUNCTION jsonb_extract(rec jsonb, key_list text[]) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE - t text; + t text[]; j jsonb := '{}'::jsonb; BEGIN - FOREACH t IN ARRAY key_list LOOP - j := j || jsonb_build_object(t,rec->t); + FOREACH t SLICE 1 IN ARRAY key_list LOOP + --RAISE NOTICE '%', t; + --RAISE NOTICE '%', t[1]; + j := j || jsonb_build_object(t[1],rec#>t); END LOOP; RETURN j; END; From a01c5c1a04d85f56a7b67bc307d2e6c894d03393 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Wed, 25 Oct 2017 15:21:13 -0400 Subject: [PATCH 11/20] add mapping template --- map_rm_template.pgsql | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 map_rm_template.pgsql diff --git a/map_rm_template.pgsql b/map_rm_template.pgsql new file mode 100644 index 0000000..5be6c32 --- /dev/null +++ b/map_rm_template.pgsql @@ -0,0 +1,31 @@ +DELETE FROM tps.map_rm where target = 'Strip Amount Commas'; +INSERT INTO +tps.map_rm +SELECT * +FROM +(VALUES + ('PNCC', 'Strip Amount Commas', + $j$ + { + "name":"Strip Amount Commas", + "description":"the Amount field come from PNC with commas embeded so it cannot be cast to numeric", + "defn": [ + { + "key": "{Amount}", + "field": "amount", + "regex": ",", + "replace":"", + "flag":"g", + "retain":"y" + } + ], + "function":"replace", + "map":"no", + "where": [ + { + } + ] + } + $j$::jsonb + , 1) +) x; \ No newline at end of file From 1af3f05a1e52c1bd0f4d7f211702cd9eed4c7b47 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Wed, 25 Oct 2017 16:28:42 -0400 Subject: [PATCH 12/20] overhaul of do_map, add seperate column for parse versus mapped, accomodate rexexp_replace --- do_map.pgsql | 99 +++++++++++++++++++++++++++----------------------- ubm_data.sql | 11 +++--- ubm_schema.sql | 1 + 3 files changed, 61 insertions(+), 50 deletions(-) diff --git a/do_map.pgsql b/do_map.pgsql index 7c9fff1..740a892 100644 --- a/do_map.pgsql +++ b/do_map.pgsql @@ -13,33 +13,60 @@ SELECT e.v ->> 'key', (t.rec #> ((e.v ->> 'key')::text[])) ) AS rkey, - jsonb_build_object( - e.v->>'field', - CASE WHEN array_upper(mt.mt,1)=1 - THEN to_json(mt.mt[1]) - ELSE array_to_json(mt.mt) - END - ) retval, - m.seq, - e.v->>'retain' retain + CASE regex->>'map' + WHEN 'yes' THEN + jsonb_build_object( + e.v->>'field', + CASE regex->>'function' + WHEN 'extract' THEN + CASE WHEN array_upper(mt.mt,1)=1 + THEN to_json(mt.mt[1]) + ELSE array_to_json(mt.mt) + END::jsonb + WHEN 'replace' THEN + to_jsonb(rp.rp) + ELSE + '{}'::jsonb + END + ) + ELSE + '{}'::jsonb + END retval, + CASE e.v->>'retain' + WHEN 'y' THEN + jsonb_build_object( + e.v->>'field', + CASE regex->>'function' + WHEN 'extract' THEN + CASE WHEN array_upper(mt.mt,1)=1 + THEN to_json(mt.mt[1]) + ELSE array_to_json(mt.mt) + END::jsonb + WHEN 'replace' THEN + to_jsonb(rp.rp) + ELSE + '{}'::jsonb + END + ) + ELSE + '{}'::jsonb + END retain, + m.seq FROM tps.map_rm m LEFT JOIN LATERAL jsonb_array_elements(m.regex->'where') w(v) ON TRUE - JOIN tps.trans t ON + INNER JOIN tps.trans t ON t.srce = m.srce AND t.rec @> w.v LEFT JOIN LATERAL jsonb_array_elements(m.regex->'defn') WITH ORDINALITY e(v, rn) ON true - LEFT JOIN LATERAL regexp_matches(t.rec #>> ((e.v ->> 'key')::text[]), e.v ->> 'regex'::text) WITH ORDINALITY mt(mt, rn) ON true + LEFT JOIN LATERAL regexp_matches(t.rec #>> ((e.v ->> 'key')::text[]), e.v ->> 'regex'::text) WITH ORDINALITY mt(mt, rn) ON + m.regex->>'function' = 'extract' + LEFT JOIN LATERAL regexp_replace(t.rec #>> ((e.v ->> 'key')::text[]), e.v ->> 'regex'::text, e.v ->> 'replace'::text,e.v ->> 'flag'::text) WITH ORDINALITY rp(rp, rn) ON + m.regex->>'function' = 'replace' WHERE t.map IS NULL - AND t.srce = 'PNCC' ORDER BY - m.srce, - m.seq, - m.target, - t.id, - t.rec, - e.rn + t.id DESC ), ----------aggregate regex back to the target level (may be several targets per row)--------------------------------------------------------------- @@ -53,8 +80,8 @@ agg_rx AS ( rx.rec, tps.jsonb_concat_obj(rx.rkey) rkey, tps.jsonb_concat_obj(rx.retval) AS retval, - tps.jsonb_concat_obj(CASE rx.retain WHEN 'y' THEN rx.retval ELSE '{}'::jsonb END) retain, - rx.seq + tps.jsonb_concat_obj(rx.retain) AS retain, + rx.seq FROM --unwrap json instruction and apply regex using a count per original line for re-aggregation --need to look at integrating regex option like 'g' that would then need aggegated back as an array, or adding the ordinality number to the title @@ -76,13 +103,13 @@ agg_rx AS ( u.id, u.rec, string_agg(u.target,',') target, - tps.jsonb_concat_obj(coalesce(v.map,'{}'::jsonb) ORDER BY seq ) map, - --tps.jsonb_concat_obj(u.retval||coalesce(v.map,'{}'::jsonb) ORDER BY seq) comb, - tps.jsonb_concat_obj(u.retain||coalesce(v.map,'{}'::jsonb) ORDER BY seq) retain + tps.jsonb_concat_obj(u.retval) retval, + tps.jsonb_concat_obj(u.retain) retain, + tps.jsonb_concat_obj(coalesce(v.map,'{}'::jsonb) ORDER BY seq ) map FROM --re-aggregate return values and explude any records where one or more regex failed with a null result agg_rx u - INNER JOIN tps.map_rv v ON + LEFT OUTER JOIN tps.map_rv v ON v.target = u.target AND v.srce = u.srce AND v.retval <@ u.retval @@ -93,30 +120,12 @@ agg_rx AS ( ) ---SELECT * FROM agg_orig LIMIT 10 ---UPDATE tps.trans t SET (map) = (SELECT retain FROM agg_orig WHERE t.id = agg_orig.id); - UPDATE tps.trans t SET - map = o.retain + map = o.map, + parse = o.retain FROM agg_orig o WHERE - o.id = t.id - - -/* -SELECT - retain->>'f20', - rec->>'Description', - COUNT(*) -FROM - agg_orig -GROUP BY - retain->>'f20', - rec->>'Description' -ORDER BY - retain->>'f20', - rec->>'Description' -*/ \ No newline at end of file + o.id = t.id \ No newline at end of file diff --git a/ubm_data.sql b/ubm_data.sql index b6aa4be..6d6a56d 100644 --- a/ubm_data.sql +++ b/ubm_data.sql @@ -38,11 +38,12 @@ INSERT INTO srce (srce, defn) VALUES ('ADPRP', '{"name": "ADPRP", "type": "csv", -- Data for Name: map_rm; Type: TABLE DATA; Schema: tps; Owner: - -- -INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'ACH Debits', '{"defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "compn", "regex": "Comp Name:(.+?)(?=$| Comp|\\w+?:)", "retain": "y"}, {"key": "{Description}", "field": "adp_comp", "regex": "Cust ID:.*?(B3X|UDV|U7E|U7C|U7H|U7J).*?(?=$|\\w+?:)", "retain": "y"}, {"key": "{Description}", "field": "desc", "regex": "Desc:(.+?) Comp", "retain": "y"}, {"key": "{Description}", "field": "discr", "regex": "Discr:(.+?)(?=$| SEC:|\\w+?:)", "retain": "y"}], "where": [{"Transaction": "ACH Debits"}], "function": "extract"}', 2); -INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Trans Type', '{"defn": [{"key": "{AccountName}", "field": "acctn", "regex": "(.*)", "retain": "n"}, {"key": "{Transaction}", "field": "trans", "regex": "(.*)", "retain": "n"}, {"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}], "where": [{}], "function": "extract"}', 1); -INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Wires Out', '{"defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "bene", "regex": "BENEFICIARY:(.+?) AC/", "retain": "y"}, {"key": "{Description}", "field": "accts", "regex": "AC/(\\w*) .*AC/(\\w*) ", "retain": "y"}], "where": [{"Transaction": "Intl Money Transfer Debits"}, {"Transaction": "Money Transfer DB - Wire"}], "function": "extract"}', 2); -INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Currency', '{"defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "curr1", "regex": ".*(DEBIT|CREDIT).*(USD|CAD).*(?=DEBIT|CREDIT).*(?=USD|CAD).*", "retain": "y"}, {"key": "{Description}", "field": "curr2", "regex": ".*(?=DEBIT|CREDIT).*(?=USD|CAD).*(DEBIT|CREDIT).*(USD|CAD).*", "retain": "y"}], "where": [{"Transaction": "Miscellaneous Credits"}, {"Transaction": "Miscellaneous Debits"}], "function": "extract"}', 2); -INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Check Number', '{"defn": [{"key": "{Description}", "field": "checkn", "regex": "[^0-9]*([0-9]*)\\s|$", "retain": "y"}], "where": [{"Transaction": "Checks Paid"}], "function": "extract"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'ACH Debits', '{"map": "yes", "defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "compn", "regex": "Comp Name:(.+?)(?=$| Comp|\\w+?:)", "retain": "y"}, {"key": "{Description}", "field": "adp_comp", "regex": "Cust ID:.*?(B3X|UDV|U7E|U7C|U7H|U7J).*?(?=$|\\w+?:)", "retain": "y"}, {"key": "{Description}", "field": "desc", "regex": "Desc:(.+?) Comp", "retain": "y"}, {"key": "{Description}", "field": "discr", "regex": "Discr:(.+?)(?=$| SEC:|\\w+?:)", "retain": "y"}], "where": [{"Transaction": "ACH Debits"}], "function": "extract"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Trans Type', '{"map": "yes", "defn": [{"key": "{AccountName}", "field": "acctn", "regex": "(.*)", "retain": "n"}, {"key": "{Transaction}", "field": "trans", "regex": "(.*)", "retain": "n"}, {"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}], "where": [{}], "function": "extract"}', 1); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Wires Out', '{"map": "yes", "defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "bene", "regex": "BENEFICIARY:(.+?) AC/", "retain": "y"}, {"key": "{Description}", "field": "accts", "regex": "AC/(\\w*) .*AC/(\\w*) ", "retain": "y"}], "where": [{"Transaction": "Intl Money Transfer Debits"}, {"Transaction": "Money Transfer DB - Wire"}], "function": "extract"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Currency', '{"map": "yes", "defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "curr1", "regex": ".*(DEBIT|CREDIT).*(USD|CAD).*(?=DEBIT|CREDIT).*(?=USD|CAD).*", "retain": "y"}, {"key": "{Description}", "field": "curr2", "regex": ".*(?=DEBIT|CREDIT).*(?=USD|CAD).*(DEBIT|CREDIT).*(USD|CAD).*", "retain": "y"}], "where": [{"Transaction": "Miscellaneous Credits"}, {"Transaction": "Miscellaneous Debits"}], "function": "extract"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Check Number', '{"map": "yes", "defn": [{"key": "{Description}", "field": "checkn", "regex": "[^0-9]*([0-9]*)\\s|$", "retain": "y"}], "where": [{"Transaction": "Checks Paid"}], "function": "extract"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Strip Amount Commas', '{"map": "no", "defn": [{"key": "{Amount}", "flag": "g", "field": "amount", "regex": ",", "retain": "y", "replace": ""}], "name": "Strip Amount Commas", "where": [{}], "function": "replace", "description": "the Amount field come from PNC with commas embeded so it cannot be cast to numeric"}', 1); -- diff --git a/ubm_schema.sql b/ubm_schema.sql index dc97c91..fbb876a 100644 --- a/ubm_schema.sql +++ b/ubm_schema.sql @@ -273,6 +273,7 @@ CREATE TABLE trans ( id integer NOT NULL, srce text, rec jsonb, + parse jsonb, map jsonb ); From fb93f73b6c38d510635567552c5238e77ca9b72b Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Thu, 26 Oct 2017 12:15:58 -0400 Subject: [PATCH 13/20] create another mapping approach to better handle option g on regexp_ functions. --- do_map_g_option.pgsql | 186 ++++++++++++++++++++++++++++++++++++++++++ map_rm_template.pgsql | 32 ++++++++ 2 files changed, 218 insertions(+) create mode 100644 do_map_g_option.pgsql diff --git a/do_map_g_option.pgsql b/do_map_g_option.pgsql new file mode 100644 index 0000000..3f1eb10 --- /dev/null +++ b/do_map_g_option.pgsql @@ -0,0 +1,186 @@ + +WITH + +--------------------apply regex operations to transactions----------------------------------------------------------------------------------- + +rx AS ( +SELECT + t.srce, + t.id, + t.rec, + m.target, + regex->>'map' map_intention, + regex->>'function' regex_function, + e.v ->> 'field' result_key_name, + e.v ->> 'key' target_json_path, + e.v ->> 'flag' regex_options_flag, + e.v->>'retain' retain_result, + e.v->>'regex' regex_expression, + e.rn target_item_number, + COALESCE(mt.rn,rp.rn,1) result_number, + mt.mt rx_match, + rp.rp rx_replace, + CASE regex->>'map' + WHEN 'yes' THEN + e.v->>'field' + ELSE + null + END map_key, + CASE regex->>'map' + WHEN 'yes' THEN + CASE regex->>'function' + WHEN 'extract' THEN + CASE WHEN array_upper(mt.mt,1)=1 + THEN to_json(mt.mt[1]) + ELSE array_to_json(mt.mt) + END::jsonb + WHEN 'replace' THEN + to_jsonb(rp.rp) + ELSE + '{}'::jsonb + END + ELSE + NULL + END map_val, + CASE e.v->>'retain' + WHEN 'y' THEN + e.v->>'field' + ELSE + NULL + END retain_key, + CASE e.v->>'retain' + WHEN 'y' THEN + CASE regex->>'function' + WHEN 'extract' THEN + CASE WHEN array_upper(mt.mt,1)=1 + THEN to_json(mt.mt[1]) + ELSE array_to_json(mt.mt) + END::jsonb + WHEN 'replace' THEN + to_jsonb(rp.rp) + ELSE + '{}'::jsonb + END + ELSE + NULL + END retain_val +FROM + tps.map_rm m + LEFT JOIN LATERAL jsonb_array_elements(m.regex->'where') w(v) ON TRUE + INNER JOIN tps.trans t ON + t.srce = m.srce AND + t.rec @> w.v + LEFT JOIN LATERAL jsonb_array_elements(m.regex->'defn') WITH ORDINALITY e(v, rn) ON true + LEFT JOIN LATERAL regexp_matches(t.rec #>> ((e.v ->> 'key')::text[]), e.v ->> 'regex'::text,COALESCE(e.v ->> 'flag','')) WITH ORDINALITY mt(mt, rn) ON + m.regex->>'function' = 'extract' + LEFT JOIN LATERAL regexp_replace(t.rec #>> ((e.v ->> 'key')::text[]), e.v ->> 'regex'::text, e.v ->> 'replace'::text,e.v ->> 'flag') WITH ORDINALITY rp(rp, rn) ON + m.regex->>'function' = 'replace' +WHERE + t.srce = 'PNCC' +ORDER BY + t.id DESC, + m.target, + e.rn, + COALESCE(mt.rn,rp.rn,1) +) + +, agg_to_target_items AS ( +SELECT + srce + ,id + ,target + ,map_intention + ,regex_function + ,target_item_number + ,result_key_name + ,target_json_path + ,CASE WHEN map_key IS NULL + THEN + NULL + ELSE + jsonb_build_object( + map_key, + CASE WHEN max(result_number) = 1 + THEN + jsonb_agg(map_val ORDER BY result_number) -> 0 + ELSE + jsonb_agg(map_val ORDER BY result_number) + END + ) + END map_val + ,CASE WHEN retain_key IS NULL + THEN + NULL + ELSE + jsonb_build_object( + retain_key, + CASE WHEN max(result_number) = 1 + THEN + jsonb_agg(retain_val ORDER BY result_number) -> 0 + ELSE + jsonb_agg(retain_val ORDER BY result_number) + END + ) + END retain_val +FROM + rx +GROUP BY + srce + ,id + ,target + ,map_intention + ,regex_function + ,target_item_number + ,result_key_name + ,target_json_path + ,map_key + ,retain_key +) + +, agg_to_target AS ( +SELECT + srce + ,id + ,target + ,map_intention + ,tps.jsonb_concat_obj(map_val) map_val + ,tps.jsonb_concat_obj(retain_val) retain_val +FROM + agg_to_target_items +GROUP BY + srce + ,id + ,target + ,map_intention +ORDER BY + id +) + +, link_map AS ( +SELECT + a.srce + ,a.id + ,a.target + ,a.map_intention + ,a.map_val + ,jsonb_strip_nulls(a.retain_val) retain_value + ,v.map +FROM + agg_to_target a + LEFT OUTER JOIN tps.map_rv v ON + v.srce = a.srce AND + v.target = a.target AND + v.retval = a.map_val +) + +SELECT + srce + ,id + ,tps.jsonb_concat_obj(COALESCE(retain_value,'{}'::jsonb)) retain_val + ,tps.jsonb_concat_obj(COALESCE(map,'{}'::jsonb)) map +FROM + link_map +GROUP BY + srce + ,id + \ No newline at end of file diff --git a/map_rm_template.pgsql b/map_rm_template.pgsql index 5be6c32..a15bf58 100644 --- a/map_rm_template.pgsql +++ b/map_rm_template.pgsql @@ -1,3 +1,4 @@ +/* DELETE FROM tps.map_rm where target = 'Strip Amount Commas'; INSERT INTO tps.map_rm @@ -28,4 +29,35 @@ FROM } $j$::jsonb , 1) +) x; +*/ +DELETE FROM tps.map_rm where target = 'Parse Descr'; +INSERT INTO +tps.map_rm +SELECT * +FROM +(VALUES + ('PNCC', 'Parse Descr', + $j$ + { + "name":"Parse Descr", + "description":"parse the description based on at least three capital letters followed by a comma until another set of at lesat 3 capital letters and a comma is encountered", + "defn": [ + { + "key": "{Description}", + "field": "dparse", + "regex": "([A-Z]{3,}?:)(.*)(?=[A-Z]{3,}?:|$)", + "flag":"g", + "retain":"y" + } + ], + "function":"extract", + "map":"no", + "where": [ + { + } + ] + } + $j$::jsonb + , 2) ) x; \ No newline at end of file From bf8e901ffdb2b94ccd51e51d84e73dd279b72be1 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Thu, 26 Oct 2017 13:32:14 -0400 Subject: [PATCH 14/20] add intermediate selects, add coalesce to target aggregation since some items are SQL null, then apply a strip_nulls, but exclude from subsequent step. add update line at end --- do_map_g_option.pgsql | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/do_map_g_option.pgsql b/do_map_g_option.pgsql index 3f1eb10..b4c3a74 100644 --- a/do_map_g_option.pgsql +++ b/do_map_g_option.pgsql @@ -77,6 +77,7 @@ FROM m.regex->>'function' = 'replace' WHERE t.srce = 'PNCC' + --rec @> '{"Description":"CHECK 93013270 086129935"}'::jsonb ORDER BY t.id DESC, m.target, @@ -84,6 +85,8 @@ ORDER BY COALESCE(mt.rn,rp.rn,1) ) +--SELECT * FROM rx + , agg_to_target_items AS ( SELECT srce @@ -137,14 +140,16 @@ GROUP BY ,retain_key ) +--SELECT * FROM agg_to_target_items + , agg_to_target AS ( SELECT srce ,id ,target ,map_intention - ,tps.jsonb_concat_obj(map_val) map_val - ,tps.jsonb_concat_obj(retain_val) retain_val + ,tps.jsonb_concat_obj(COALESCE(map_val,'{}'::JSONB)) map_val + ,jsonb_strip_nulls(tps.jsonb_concat_obj(COALESCE(retain_val,'{}'::JSONB))) retain_val FROM agg_to_target_items GROUP BY @@ -156,6 +161,10 @@ ORDER BY id ) + +--SELECT * FROM agg_to_target + + , link_map AS ( SELECT a.srce @@ -163,7 +172,7 @@ SELECT ,a.target ,a.map_intention ,a.map_val - ,jsonb_strip_nulls(a.retain_val) retain_value + ,a.retain_val retain_value ,v.map FROM agg_to_target a @@ -173,6 +182,9 @@ FROM v.retval = a.map_val ) +--SELECT * FROM link_map + +, agg_to_id AS ( SELECT srce ,id @@ -183,4 +195,18 @@ FROM GROUP BY srce ,id - \ No newline at end of file +) + +--SELECT * FROM agg_to_id + + +UPDATE + tps.trans t +SET + map = o.map, + parse = o.retain_val, + allj = t.rec||o.map||o.retain_val +FROM + agg_to_id o +WHERE + o.id = t.id; \ No newline at end of file From 2448ccf64156470510980878086bb3ed02411cac Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Thu, 26 Oct 2017 13:39:50 -0400 Subject: [PATCH 15/20] add new map, add column for all json, and apply index --- ubm_data.sql | 1 + ubm_schema.sql | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ubm_data.sql b/ubm_data.sql index 6d6a56d..7e9f75d 100644 --- a/ubm_data.sql +++ b/ubm_data.sql @@ -44,6 +44,7 @@ INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Wires Out', '{"ma INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Currency', '{"map": "yes", "defn": [{"key": "{Description}", "field": "ini", "regex": "([\\w].*?)(?=$| -|\\s[0-9].*?|\\s[\\w/]+?:)", "retain": "y"}, {"key": "{Description}", "field": "curr1", "regex": ".*(DEBIT|CREDIT).*(USD|CAD).*(?=DEBIT|CREDIT).*(?=USD|CAD).*", "retain": "y"}, {"key": "{Description}", "field": "curr2", "regex": ".*(?=DEBIT|CREDIT).*(?=USD|CAD).*(DEBIT|CREDIT).*(USD|CAD).*", "retain": "y"}], "where": [{"Transaction": "Miscellaneous Credits"}, {"Transaction": "Miscellaneous Debits"}], "function": "extract"}', 2); INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Check Number', '{"map": "yes", "defn": [{"key": "{Description}", "field": "checkn", "regex": "[^0-9]*([0-9]*)\\s|$", "retain": "y"}], "where": [{"Transaction": "Checks Paid"}], "function": "extract"}', 2); INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Strip Amount Commas', '{"map": "no", "defn": [{"key": "{Amount}", "flag": "g", "field": "amount", "regex": ",", "retain": "y", "replace": ""}], "name": "Strip Amount Commas", "where": [{}], "function": "replace", "description": "the Amount field come from PNC with commas embeded so it cannot be cast to numeric"}', 1); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Parse Descr', '{"map": "no", "defn": [{"key": "{Description}", "flag": "g", "field": "dparse", "regex": "([A-Z]{3,}?:)(.*)(?=[A-Z]{3,}?:|$)", "retain": "y"}], "name": "Parse Descr", "where": [{}], "function": "extract", "description": "parse the description based on at least three capital letters followed by a comma until another set of at lesat 3 capital letters and a comma is encountered"}', 2); -- diff --git a/ubm_schema.sql b/ubm_schema.sql index fbb876a..3f19da5 100644 --- a/ubm_schema.sql +++ b/ubm_schema.sql @@ -274,7 +274,8 @@ CREATE TABLE trans ( srce text, rec jsonb, parse jsonb, - map jsonb + map jsonb, + allj jsonb ); @@ -368,6 +369,13 @@ ALTER TABLE ONLY trans ADD CONSTRAINT trans_pkey PRIMARY KEY (id); +-- +-- Name: trans_allj; Type: INDEX; Schema: tps; Owner: - +-- + +CREATE INDEX trans_allj ON trans USING gin (allj); + + -- -- Name: trans_rec; Type: INDEX; Schema: tps; Owner: - -- From 6a90cc4b32ab3a26f6fc29f200fd62f6b9ff168f Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Thu, 26 Oct 2017 14:31:37 -0400 Subject: [PATCH 16/20] add maps --- map_rm_template.pgsql | 91 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/map_rm_template.pgsql b/map_rm_template.pgsql index a15bf58..631bc57 100644 --- a/map_rm_template.pgsql +++ b/map_rm_template.pgsql @@ -30,7 +30,7 @@ FROM $j$::jsonb , 1) ) x; -*/ + DELETE FROM tps.map_rm where target = 'Parse Descr'; INSERT INTO tps.map_rm @@ -60,4 +60,93 @@ FROM } $j$::jsonb , 2) +) x; + + +DELETE FROM tps.map_rm where target = 'Extract OBI'; +INSERT INTO +tps.map_rm +SELECT * +FROM +(VALUES + ('PNCC', 'Extract OBI', + $j$ + { + "name":"Extract OBI", + "description":"pull out whatever follows OBI in the description until atleast 3 capital letters followed by a colon are encountered", + "defn": [ + { + "key": "{Description}", + "field": "obi", + "regex": "OBI:(.*?)(?=[A-Z]{3,}?:|$)", + "flag":"", + "retain":"y" + } + ], + "function":"extract", + "map":"no", + "where": [ + { + "Transaction":"Money Transfer DB - Wire" + }, + { + "Transaction":"Money Transfer CR-Other" + }, + { + "Transaction":"Intl Money Transfer Debits" + }, + { + "Transaction":"Money Transfer DB - Other" + }, + { + "Transaction":"Money Transfer CR-Wire" + } + ] + } + $j$::jsonb + , 2) +) x; +*/ +DELETE FROM tps.map_rm where target = 'Extract RFB'; +INSERT INTO +tps.map_rm +SELECT * +FROM +(VALUES + ('PNCC', 'Extract RFB', + $j$ + { + "name":"Extract RFB", + "description":"pull out whatever follows RFB in the description until atleast 3 capital letters followed by a colon are encountered", + "defn": [ + { + "key": "{Description}", + "field": "rfb", + "regex": "RFB:(.*?)(?=[A-Z]{3,}?:|$)", + "flag":"", + "retain":"y" + } + ], + "function":"extract", + "map":"no", + "where": [ + { + "Transaction":"Money Transfer DB - Wire" + }, + { + "Transaction":"Money Transfer CR-Other" + }, + { + "Transaction":"Intl Money Transfer Debits" + }, + { + "Transaction":"Money Transfer DB - Other" + }, + { + "Transaction":"Money Transfer CR-Wire" + } + ] + } + $j$::jsonb + , 2) ) x; \ No newline at end of file From 7eba5981ef5a119164a0aa1905d380bc074b5d04 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Thu, 26 Oct 2017 17:34:21 -0400 Subject: [PATCH 17/20] trim retained values --- do_map_g_option.pgsql | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/do_map_g_option.pgsql b/do_map_g_option.pgsql index b4c3a74..31ddc3b 100644 --- a/do_map_g_option.pgsql +++ b/do_map_g_option.pgsql @@ -1,4 +1,4 @@ - +\timing WITH --------------------apply regex operations to transactions----------------------------------------------------------------------------------- @@ -53,11 +53,11 @@ SELECT CASE regex->>'function' WHEN 'extract' THEN CASE WHEN array_upper(mt.mt,1)=1 - THEN to_json(mt.mt[1]) + THEN to_json(trim(mt.mt[1])) ELSE array_to_json(mt.mt) END::jsonb WHEN 'replace' THEN - to_jsonb(rp.rp) + to_jsonb(rtrim(rp.rp)) ELSE '{}'::jsonb END @@ -76,7 +76,8 @@ FROM LEFT JOIN LATERAL regexp_replace(t.rec #>> ((e.v ->> 'key')::text[]), e.v ->> 'regex'::text, e.v ->> 'replace'::text,e.v ->> 'flag') WITH ORDINALITY rp(rp, rn) ON m.regex->>'function' = 'replace' WHERE - t.srce = 'PNCC' + --t.srce = 'PNCC' + rec @> '{"Transaction":"ACH Credits","Transaction":"ACH Debits"}' --rec @> '{"Description":"CHECK 93013270 086129935"}'::jsonb ORDER BY t.id DESC, @@ -197,9 +198,9 @@ GROUP BY ,id ) ---SELECT * FROM agg_to_id - +SELECT srce, id, jsonb_pretty(retain_val), jsonb_pretty(map) FROM agg_to_id +/* UPDATE tps.trans t SET @@ -209,4 +210,5 @@ SET FROM agg_to_id o WHERE - o.id = t.id; \ No newline at end of file + o.id = t.id; +*/ \ No newline at end of file From 9623853387cc5ebecbc97bde36d780e819cba4a3 Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Thu, 26 Oct 2017 17:34:37 -0400 Subject: [PATCH 18/20] add map to parse all ACH transactions --- map_rm_template.pgsql | 103 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) diff --git a/map_rm_template.pgsql b/map_rm_template.pgsql index 631bc57..34dae15 100644 --- a/map_rm_template.pgsql +++ b/map_rm_template.pgsql @@ -106,7 +106,7 @@ FROM $j$::jsonb , 2) ) x; -*/ + DELETE FROM tps.map_rm where target = 'Extract RFB'; INSERT INTO tps.map_rm @@ -149,4 +149,103 @@ FROM } $j$::jsonb , 2) -) x; \ No newline at end of file +) x; +*/ +DELETE FROM tps.map_rm where target = 'Parse ACH'; + +INSERT INTO +tps.map_rm +SELECT * +FROM +(VALUES + ('PNCC', 'Parse ACH', + $j$ + { + "name":"Parse ACH", + "description":"parse select components of the description for ACH Credits Receieved", + "defn": [ + { + "key": "{Description}", + "field":"Comp Name", + "regex": "Comp Name:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", + "flag":"", + "retain":"y" + }, + { + "key": "{Description}", + "field":"Cust ID", + "regex": "Cust ID:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", + "flag":"", + "retain":"y" + }, + { + "key": "{Description}", + "field":"Desc", + "regex": "Desc:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", + "flag":"", + "retain":"y" + }, + { + "key": "{Description}", + "field":"Cust Name", + "regex": "Cust Name:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", + "flag":"", + "retain":"y" + }, + { + "key": "{Description}", + "field":"Batch Discr", + "regex": "Batch Discr:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", + "flag":"", + "retain":"y" + }, + { + "key": "{Description}", + "field":"Comp ID", + "regex": "Comp ID:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", + "flag":"", + "retain":"y" + }, + { + "key": "{Description}", + "field":"Addenda", + "regex": "Addenda:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", + "flag":"", + "retain":"y" + }, + { + "key": "{Description}", + "field":"SETT", + "regex": "SETT:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", + "flag":"", + "retain":"y" + }, + { + "key": "{Description}", + "field":"Date", + "regex": "Date:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", + "flag":"", + "retain":"y" + }, + { + "key": "{Description}", + "field":"Time", + "regex": "Time:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", + "flag":"", + "retain":"y" + } + ], + "function":"extract", + "map":"no", + "where": [ + { + "Transaction":"ACH Credits" + }, + { + "Transaction":"ACH Debits" + } + ] + } + $j$::jsonb + , 2) +) x; From 3924776161ff927b24eaee147d9f003d4708cafb Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Thu, 26 Oct 2017 17:34:45 -0400 Subject: [PATCH 19/20] update readme --- readme.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/readme.md b/readme.md index 6a326a9..b2acb67 100644 --- a/readme.md +++ b/readme.md @@ -4,6 +4,8 @@ Concepts pull various static files into postgres and do basic transformation without losing the original document or getting into custom code for each scenario +the is an in-between for an foreign data wrapper & custom programming + ## Storage all records are jsonb applied mappings are in associated jsonb documents @@ -15,6 +17,9 @@ applied mappings are in associated jsonb documents 1. regular expressions are used to extract pieces of the json objects 2. the results of the regular expressions are bumped up against a list of basic mappings and written to an associated jsonb document +a target represents a whole scenario that needs matched. it can contain several regex expressions. if one fails, then no match is attempted because it coudl result in a false positive based on the @> oeprator used at join time +`this probably isn't correctly implemented` + ## Transformation tools * `COPY` * `regexp_matches()` From 6b08ea8fb8c2501f3dc5d8405cac8660d2c6e9fa Mon Sep 17 00:00:00 2001 From: Paul Trowbridge Date: Thu, 26 Oct 2017 17:36:03 -0400 Subject: [PATCH 20/20] add new maps, need to convert OBI & RFB maps to a single target --- ubm_data.sql | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ubm_data.sql b/ubm_data.sql index 7e9f75d..166a0c3 100644 --- a/ubm_data.sql +++ b/ubm_data.sql @@ -45,6 +45,9 @@ INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Currency', '{"map INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Check Number', '{"map": "yes", "defn": [{"key": "{Description}", "field": "checkn", "regex": "[^0-9]*([0-9]*)\\s|$", "retain": "y"}], "where": [{"Transaction": "Checks Paid"}], "function": "extract"}', 2); INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Strip Amount Commas', '{"map": "no", "defn": [{"key": "{Amount}", "flag": "g", "field": "amount", "regex": ",", "retain": "y", "replace": ""}], "name": "Strip Amount Commas", "where": [{}], "function": "replace", "description": "the Amount field come from PNC with commas embeded so it cannot be cast to numeric"}', 1); INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Parse Descr', '{"map": "no", "defn": [{"key": "{Description}", "flag": "g", "field": "dparse", "regex": "([A-Z]{3,}?:)(.*)(?=[A-Z]{3,}?:|$)", "retain": "y"}], "name": "Parse Descr", "where": [{}], "function": "extract", "description": "parse the description based on at least three capital letters followed by a comma until another set of at lesat 3 capital letters and a comma is encountered"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Extract OBI', '{"map": "no", "defn": [{"key": "{Description}", "flag": "", "field": "obi", "regex": "OBI:(.*?)(?=[A-Z]{3,}?:|$)", "retain": "y"}], "name": "Extract OBI", "where": [{"Transaction": "Money Transfer DB - Wire"}, {"Transaction": "Money Transfer CR-Other"}, {"Transaction": "Intl Money Transfer Debits"}, {"Transaction": "Money Transfer DB - Other"}, {"Transaction": "Money Transfer CR-Wire"}], "function": "extract", "description": "pull out whatever follows OBI in the description until atleast 3 capital letters followed by a colon are encountered"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Extract RFB', '{"map": "no", "defn": [{"key": "{Description}", "flag": "", "field": "rfb", "regex": "RFB:(.*?)(?=[A-Z]{3,}?:|$)", "retain": "y"}], "name": "Extract RFB", "where": [{"Transaction": "Money Transfer DB - Wire"}, {"Transaction": "Money Transfer CR-Other"}, {"Transaction": "Intl Money Transfer Debits"}, {"Transaction": "Money Transfer DB - Other"}, {"Transaction": "Money Transfer CR-Wire"}], "function": "extract", "description": "pull out whatever follows RFB in the description until atleast 3 capital letters followed by a colon are encountered"}', 2); +INSERT INTO map_rm (srce, target, regex, seq) VALUES ('PNCC', 'Parse ACH', '{"map": "no", "defn": [{"key": "{Description}", "flag": "", "field": "Comp Name", "regex": "Comp Name:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", "retain": "y"}, {"key": "{Description}", "flag": "", "field": "Cust ID", "regex": "Cust ID:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", "retain": "y"}, {"key": "{Description}", "flag": "", "field": "Desc", "regex": "Desc:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", "retain": "y"}, {"key": "{Description}", "flag": "", "field": "Cust Name", "regex": "Cust Name:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", "retain": "y"}, {"key": "{Description}", "flag": "", "field": "Batch Discr", "regex": "Batch Discr:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", "retain": "y"}, {"key": "{Description}", "flag": "", "field": "Comp ID", "regex": "Comp ID:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", "retain": "y"}, {"key": "{Description}", "flag": "", "field": "Addenda", "regex": "Addenda:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", "retain": "y"}, {"key": "{Description}", "flag": "", "field": "SETT", "regex": "SETT:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", "retain": "y"}, {"key": "{Description}", "flag": "", "field": "Date", "regex": "Date:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", "retain": "y"}, {"key": "{Description}", "flag": "", "field": "Time", "regex": "Time:(.+?)(?=SEC:|Cust ID:|Desc:|Comp Name:|Comp ID:|Batch Discr:|Cust Name:|Addenda:|SETT:|Date:|Time:|$)", "retain": "y"}], "name": "Parse ACH", "where": [{"Transaction": "ACH Credits"}, {"Transaction": "ACH Debits"}], "function": "extract", "description": "parse select components of the description for ACH Credits Receieved"}', 2); --