Rewrite apply_transformations as set-based CTE chain

Replaces the nested FOR loops (row-by-row, rule-by-rule) with a single
SQL CTE chain that processes all records × rules in one pass, mirroring
the TPS approach.

CTE chain:
  qualifying      → all untransformed records for the source
  rx              → apply each rule (extract/replace) to each record
  linked          → LEFT JOIN mappings to find mapped output
  rule_output     → build per-rule JSONB (with retain support)
  record_additions → merge all rule outputs per record in sequence order
  UPDATE          → set transformed = data || additions

Also adds jsonb_concat_obj aggregate (jsonb merge with ORDER BY support)
needed to collapse multiple rule outputs per record into one object.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Paul Trowbridge 2026-04-04 21:13:49 -04:00
parent f7f88bb5cf
commit 4cf5be52e8

View File

@ -66,122 +66,139 @@ $$ LANGUAGE plpgsql;
COMMENT ON FUNCTION import_records IS 'Import records with automatic deduplication'; COMMENT ON FUNCTION import_records IS 'Import records with automatic deduplication';
------------------------------------------------------
-- Aggregate: jsonb_concat_obj
-- Merge JSONB objects across rows (later rows win on key conflicts)
-- Usage: jsonb_concat_obj(col ORDER BY sequence)
------------------------------------------------------
-- Transition function: null-safe shallow merge of two jsonb objects.
-- Marked IMMUTABLE + PARALLEL SAFE: pure jsonb concatenation with no
-- side effects, so the planner may use it in parallel aggregation.
CREATE OR REPLACE FUNCTION dataflow.jsonb_merge(a JSONB, b JSONB)
RETURNS JSONB AS $$
SELECT COALESCE(a, '{}'::jsonb) || COALESCE(b, '{}'::jsonb)
$$ LANGUAGE sql IMMUTABLE PARALLEL SAFE;

-- Drop-and-recreate so repeated migration runs stay idempotent
-- (CREATE AGGREGATE has no OR REPLACE before PostgreSQL 12).
DROP AGGREGATE IF EXISTS dataflow.jsonb_concat_obj(JSONB);
CREATE AGGREGATE dataflow.jsonb_concat_obj(JSONB) (
    sfunc    = dataflow.jsonb_merge,
    stype    = JSONB,
    initcond = '{}',
    parallel = safe
);
------------------------------------------------------
-- Function: apply_transformations
-- Apply all transformation rules to records (set-based).
-- Processes every qualifying record x enabled rule in a single
-- statement instead of nested row-by-row loops.
-- Returns: {"success": true, "transformed": <count of updated records>}
------------------------------------------------------
CREATE OR REPLACE FUNCTION apply_transformations(
    p_source_name TEXT,
    p_record_ids  INTEGER[] DEFAULT NULL  -- NULL = all untransformed
) RETURNS JSON AS $$
WITH
-- All untransformed records for the source (optionally limited to p_record_ids)
qualifying AS (
    SELECT id, data
    FROM dataflow.records
    WHERE source_name = p_source_name
      AND transformed IS NULL
      AND (p_record_ids IS NULL OR id = ANY(p_record_ids))
),
-- Apply each enabled rule to each qualifying record that has the required field
rx AS (
    SELECT
        q.id,
        r.name AS rule_name,
        r.sequence,
        r.output_field,
        r.retain,
        CASE r.function_type
            WHEN 'replace' THEN
                -- Pass flags through so 'g' (replace all) works correctly
                to_jsonb(regexp_replace(
                    q.data ->> r.field, r.pattern, r.replace_value, r.flags
                ))
            ELSE
                -- extract: aggregate all matches; single match -> scalar,
                -- multiple matches (e.g. 'g' flag) -> array.
                -- Aggregate first so count and first element are inspected cleanly.
                (SELECT
                    CASE WHEN cnt = 0 THEN NULL
                         WHEN cnt = 1 THEN agg -> 0
                         ELSE agg
                    END
                 FROM (
                    SELECT
                        count(*) AS cnt,
                        jsonb_agg(
                            CASE WHEN array_length(mt, 1) = 1
                                 THEN to_jsonb(mt[1])  -- one capture group -> scalar
                                 ELSE to_jsonb(mt)     -- several captures -> array
                            END
                            ORDER BY rn
                        ) AS agg
                    FROM regexp_matches(q.data ->> r.field, r.pattern, r.flags)
                         WITH ORDINALITY AS m(mt, rn)
                 ) _agg)
        END AS extracted
    FROM qualifying q
    CROSS JOIN dataflow.rules r
    WHERE r.source_name = p_source_name
      AND r.enabled = true
      AND q.data ? r.field
),
-- Join with mappings to find the mapped output for each extracted value
linked AS (
    SELECT
        rx.id,
        rx.sequence,
        rx.output_field,
        rx.retain,
        rx.extracted,
        m.output AS mapped
    FROM rx
    LEFT JOIN dataflow.mappings m
        ON  m.source_name = p_source_name
        AND m.rule_name   = rx.rule_name
        AND m.input_value = rx.extracted
    WHERE rx.extracted IS NOT NULL
),
-- Build per-rule output JSONB:
--   mapped  -> use mapping output; also write output_field when retain = true
--   no map  -> write extracted value to output_field
rule_output AS (
    SELECT
        id,
        sequence,
        CASE
            WHEN mapped IS NOT NULL THEN
                mapped ||
                CASE WHEN retain
                     THEN jsonb_build_object(output_field, extracted)
                     ELSE '{}'::jsonb
                END
            ELSE
                jsonb_build_object(output_field, extracted)
        END AS output
    FROM linked
),
-- Merge all rule outputs per record in sequence order
-- (higher sequence wins on key conflicts)
record_additions AS (
    SELECT
        id,
        dataflow.jsonb_concat_obj(output ORDER BY sequence) AS additions
    FROM rule_output
    GROUP BY id
),
-- Update every qualifying record; records with no rule matches
-- still get transformed = data so they are not reprocessed.
updated AS (
    UPDATE dataflow.records rec
    SET transformed    = rec.data || COALESCE(ra.additions, '{}'::jsonb),
        transformed_at = CURRENT_TIMESTAMP
    FROM qualifying q
    LEFT JOIN record_additions ra ON ra.id = q.id
    WHERE rec.id = q.id
    RETURNING rec.id
)
SELECT json_build_object('success', true, 'transformed', count(*))
FROM updated
$$ LANGUAGE sql;
COMMENT ON FUNCTION apply_transformations IS 'Apply transformation rules and mappings to records (set-based CTE)';
------------------------------------------------------ ------------------------------------------------------
-- Function: get_unmapped_values -- Function: get_unmapped_values