Rewrite apply_transformations as set-based CTE chain
Replaces the nested FOR loops (row-by-row, rule-by-rule) with a single SQL CTE chain that processes all records × rules in one pass, mirroring the TPS approach.

CTE chain:
- qualifying → all untransformed records for the source
- rx → apply each rule (extract/replace) to each record
- linked → LEFT JOIN mappings to find mapped output
- rule_output → build per-rule JSONB (with retain support)
- record_additions → merge all rule outputs per record in sequence order
- UPDATE → set transformed = data || additions

Also adds the jsonb_concat_obj aggregate (JSONB merge with ORDER BY support), needed to collapse multiple rule outputs per record into one object.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f7f88bb5cf
commit
4cf5be52e8
@ -66,122 +66,139 @@ $$ LANGUAGE plpgsql;
|
|||||||
|
|
||||||
COMMENT ON FUNCTION import_records IS 'Import records with automatic deduplication';
|
COMMENT ON FUNCTION import_records IS 'Import records with automatic deduplication';
|
||||||
|
|
||||||
|
------------------------------------------------------
|
||||||
|
-- Aggregate: jsonb_concat_obj
|
||||||
|
-- Merge JSONB objects across rows (later rows win on key conflicts)
|
||||||
|
-- Usage: jsonb_concat_obj(col ORDER BY sequence)
|
||||||
|
------------------------------------------------------
|
||||||
|
CREATE OR REPLACE FUNCTION dataflow.jsonb_merge(a JSONB, b JSONB)
RETURNS JSONB
LANGUAGE sql
IMMUTABLE
AS $$
    -- Concatenate two JSONB values with ||; a NULL on either side is
    -- treated as an empty object, so the result is never NULL.
    -- On duplicate keys the right-hand operand (b) wins.
    SELECT COALESCE(a, '{}'::jsonb) || COALESCE(b, '{}'::jsonb)
$$;
|
||||||
|
|
||||||
|
-- Recreate the aggregate from scratch so the script can be re-run.
DROP AGGREGATE IF EXISTS dataflow.jsonb_concat_obj(JSONB);

-- Folds a set of JSONB values into one by repeatedly applying
-- dataflow.jsonb_merge, starting from an empty object.
CREATE AGGREGATE dataflow.jsonb_concat_obj(JSONB) (
    SFUNC    = dataflow.jsonb_merge,
    STYPE    = JSONB,
    INITCOND = '{}'
);
|
||||||
|
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
-- Function: apply_transformations
|
-- Function: apply_transformations
|
||||||
-- Apply all transformation rules to records
|
-- Apply all transformation rules to records (set-based)
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
CREATE OR REPLACE FUNCTION apply_transformations(
    p_source_name TEXT,
    p_record_ids  INTEGER[] DEFAULT NULL  -- NULL = all untransformed
) RETURNS JSON AS $$
    -- Set-based transformation of dataflow.records for one source.
    --
    -- Parameters:
    --   p_source_name  source whose records and rules are processed
    --   p_record_ids   optional list of record ids to restrict the run to;
    --                  NULL processes every untransformed record of the source
    --
    -- Returns: {"success": true, "transformed": <number of records updated>}
    --
    -- Every qualifying record is updated, including records for which no rule
    -- produced output (those get transformed = data).
    WITH
    -- Records of the source that have not been transformed yet
    qualifying AS (
        SELECT id, data
        FROM dataflow.records
        WHERE source_name = p_source_name
          AND transformed IS NULL
          AND (p_record_ids IS NULL OR id = ANY(p_record_ids))
    ),
    -- Apply each enabled rule to each qualifying record that has the rule's field
    rx AS (
        SELECT
            q.id,
            r.name AS rule_name,
            r.sequence,
            r.output_field,
            r.retain,
            CASE r.function_type
                WHEN 'replace' THEN
                    -- COALESCE: regexp_replace returns NULL when flags is NULL,
                    -- which would silently disable the rule.
                    to_jsonb(regexp_replace(
                        q.data ->> r.field,
                        r.pattern,
                        r.replace_value,
                        COALESCE(r.flags, '')
                    ))
                ELSE
                    -- extract: aggregate all matches ('g' flag may yield several).
                    --   no match       -> NULL (rule contributes nothing)
                    --   exactly one    -> the match itself (scalar, or array of captures)
                    --   more than one  -> array of matches in match order
                    (SELECT
                        CASE WHEN cnt = 0 THEN NULL
                             WHEN cnt = 1 THEN agg -> 0
                             ELSE agg
                        END
                     FROM (
                        SELECT
                            count(*) AS cnt,
                            jsonb_agg(
                                -- single capture group -> scalar, several -> array
                                CASE WHEN array_length(mt, 1) = 1
                                     THEN to_jsonb(mt[1])
                                     ELSE to_jsonb(mt)
                                END
                                ORDER BY rn
                            ) AS agg
                        -- COALESCE: a NULL flags argument makes regexp_matches
                        -- return no rows at all.
                        FROM regexp_matches(
                                 q.data ->> r.field,
                                 r.pattern,
                                 COALESCE(r.flags, '')
                             ) WITH ORDINALITY AS m(mt, rn)
                     ) _agg)
            END AS extracted
        FROM qualifying q
        CROSS JOIN dataflow.rules r
        WHERE r.source_name = p_source_name
          AND r.enabled = true
          AND q.data ? r.field
    ),
    -- Join with mappings to find the mapped output for each extracted value
    linked AS (
        SELECT
            rx.id,
            rx.rule_name,
            rx.sequence,
            rx.output_field,
            rx.retain,
            rx.extracted,
            m.output AS mapped
        FROM rx
        LEFT JOIN dataflow.mappings m
            ON  m.source_name = p_source_name
            AND m.rule_name   = rx.rule_name
            AND m.input_value = rx.extracted
        WHERE rx.extracted IS NOT NULL
    ),
    -- Build per-rule output JSONB:
    --   mapped  -> use mapping output; also write output_field if retain = true
    --   no map  -> write extracted value to output_field
    rule_output AS (
        SELECT
            id,
            rule_name,
            sequence,
            CASE
                WHEN mapped IS NOT NULL THEN
                    mapped ||
                    CASE WHEN retain
                         THEN jsonb_build_object(output_field, extracted)
                         ELSE '{}'::jsonb
                    END
                ELSE
                    jsonb_build_object(output_field, extracted)
            END AS output
        FROM linked
    ),
    -- Merge all rule outputs per record in sequence order (higher sequence wins
    -- on key conflict). rule_name breaks ties so the result is deterministic
    -- when two rules share a sequence.
    record_additions AS (
        SELECT
            id,
            dataflow.jsonb_concat_obj(output ORDER BY sequence, rule_name) AS additions
        FROM rule_output
        GROUP BY id
    ),
    -- Update all qualifying records; records with no rule matches get transformed = data
    updated AS (
        UPDATE dataflow.records rec
        SET transformed    = rec.data || COALESCE(ra.additions, '{}'::jsonb),
            transformed_at = CURRENT_TIMESTAMP
        FROM qualifying q
        LEFT JOIN record_additions ra ON ra.id = q.id
        WHERE rec.id = q.id
        RETURNING rec.id
    )
    SELECT json_build_object('success', true, 'transformed', count(*))
    FROM updated
$$ LANGUAGE sql;

COMMENT ON FUNCTION apply_transformations IS 'Apply transformation rules and mappings to records (set-based CTE)';
|
|
||||||
|
|
||||||
------------------------------------------------------
|
------------------------------------------------------
|
||||||
-- Function: get_unmapped_values
|
-- Function: get_unmapped_values
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user