Rewrite apply_transformations as set-based CTE chain

Replaces the nested FOR loops (row-by-row, rule-by-rule) with a single
SQL CTE chain that processes all records × rules in one pass, mirroring
the TPS approach.

CTE chain:
  qualifying      → all untransformed records for the source
  rx              → apply each rule (extract/replace) to each record
  linked          → LEFT JOIN mappings to find mapped output
  rule_output     → build per-rule JSONB (with retain support)
  record_additions → merge all rule outputs per record in sequence order
  UPDATE          → set transformed = data || additions

Also adds jsonb_concat_obj aggregate (jsonb merge with ORDER BY support)
needed to collapse multiple rule outputs per record into one object.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Commit 4cf5be52e8 (parent f7f88bb5cf) by Paul Trowbridge, 2026-04-04 21:13:49 -04:00.

@ -66,122 +66,139 @@ $$ LANGUAGE plpgsql;
-- Documentation for import_records (defined earlier in this file).
COMMENT ON FUNCTION import_records IS 'Import records with automatic deduplication';
------------------------------------------------------
-- Aggregate: jsonb_concat_obj
-- Merge JSONB objects across rows (later rows win on key conflicts)
-- Usage: jsonb_concat_obj(col ORDER BY sequence)
------------------------------------------------------
-- State-transition function: shallow-merge two JSONB objects with ||.
-- NULL inputs are folded to the empty object so a NULL row can never
-- poison the accumulated aggregate state.
CREATE OR REPLACE FUNCTION dataflow.jsonb_merge(a JSONB, b JSONB)
RETURNS JSONB AS $$
    SELECT COALESCE(a, '{}'::jsonb) || COALESCE(b, '{}'::jsonb)
$$ LANGUAGE sql IMMUTABLE;

-- Drop-then-create: CREATE AGGREGATE has no OR REPLACE form on older
-- PostgreSQL releases, so recreate the aggregate idempotently.
DROP AGGREGATE IF EXISTS dataflow.jsonb_concat_obj(JSONB);
CREATE AGGREGATE dataflow.jsonb_concat_obj(JSONB) (
    sfunc    = dataflow.jsonb_merge,
    stype    = JSONB,
    initcond = '{}'
);
------------------------------------------------------
-- Function: apply_transformations
-- Apply all transformation rules to records (set-based)
--
-- Replaces the previous row-by-row plpgsql loops with a single CTE
-- chain that processes all records x rules in one pass:
--   qualifying       -> all untransformed records for the source
--   rx               -> apply each enabled rule (extract/replace) to each record
--   linked           -> LEFT JOIN mappings to find mapped output
--   rule_output      -> per-rule JSONB fragment (with retain support)
--   record_additions -> merge rule outputs per record in sequence order
--   updated          -> UPDATE records: transformed = data || additions
--
-- Returns: JSON {'success': true, 'transformed': <row count>}
------------------------------------------------------
CREATE OR REPLACE FUNCTION apply_transformations(
    p_source_name TEXT,
    p_record_ids INTEGER[] DEFAULT NULL  -- NULL = all untransformed
) RETURNS JSON AS $$
WITH
-- All records to process for this source.
qualifying AS (
    SELECT id, data
    FROM dataflow.records
    WHERE source_name = p_source_name
      AND transformed IS NULL
      AND (p_record_ids IS NULL OR id = ANY(p_record_ids))
),
-- Apply each enabled rule to each qualifying record that has the
-- required input field (q.data ? r.field).
rx AS (
    SELECT
        q.id,
        r.name AS rule_name,
        r.sequence,
        r.output_field,
        r.retain,
        CASE r.function_type
            WHEN 'replace' THEN
                -- Pass flags through so 'g' (replace all) works correctly.
                to_jsonb(regexp_replace(
                    q.data ->> r.field, r.pattern, r.replace_value, r.flags
                ))
            ELSE
                -- extract: aggregate all matches so the 'g' flag returns
                -- every occurrence. Single match -> scalar, multiple -> array.
                -- Aggregate first so count and first element can be inspected.
                (SELECT
                    CASE WHEN cnt = 0 THEN NULL  -- no match: NULL filters row out in "linked"
                         WHEN cnt = 1 THEN agg -> 0
                         ELSE agg
                    END
                 FROM (
                    SELECT
                        count(*) AS cnt,
                        jsonb_agg(
                            -- One capture group -> scalar; several -> array of captures.
                            CASE WHEN array_length(mt, 1) = 1
                                 THEN to_jsonb(mt[1])
                                 ELSE to_jsonb(mt)
                            END
                            ORDER BY rn  -- preserve match order
                        ) AS agg
                    FROM regexp_matches(q.data ->> r.field, r.pattern, r.flags)
                         WITH ORDINALITY AS m(mt, rn)
                 ) _agg)
        END AS extracted
    FROM qualifying q
    CROSS JOIN dataflow.rules r
    WHERE r.source_name = p_source_name
      AND r.enabled = true
      AND q.data ? r.field
),
-- Join with mappings to find the mapped output for each extracted value.
linked AS (
    SELECT
        rx.id,
        rx.sequence,
        rx.output_field,
        rx.retain,
        rx.extracted,
        m.output AS mapped
    FROM rx
    LEFT JOIN dataflow.mappings m
        ON m.source_name = p_source_name
       AND m.rule_name   = rx.rule_name
       AND m.input_value = rx.extracted
    WHERE rx.extracted IS NOT NULL
),
-- Build each rule's output JSONB:
--   mapped -> use the mapping output; also write output_field if retain = true
--   no map -> write the extracted value to output_field
rule_output AS (
    SELECT
        id,
        sequence,
        CASE
            WHEN mapped IS NOT NULL THEN
                mapped ||
                CASE WHEN retain
                     THEN jsonb_build_object(output_field, extracted)
                     ELSE '{}'::jsonb
                END
            ELSE
                jsonb_build_object(output_field, extracted)
        END AS output
    FROM linked
),
-- Merge all rule outputs per record in sequence order
-- (later/higher sequence wins on key conflicts).
record_additions AS (
    SELECT
        id,
        dataflow.jsonb_concat_obj(output ORDER BY sequence) AS additions
    FROM rule_output
    GROUP BY id
),
-- Update all qualifying records; records with no rule matches still get
-- transformed = data (additions coalesces to the empty object).
updated AS (
    UPDATE dataflow.records rec
    SET transformed    = rec.data || COALESCE(ra.additions, '{}'::jsonb),
        transformed_at = CURRENT_TIMESTAMP
    FROM qualifying q
    LEFT JOIN record_additions ra ON ra.id = q.id
    WHERE rec.id = q.id
    RETURNING rec.id
)
SELECT json_build_object('success', true, 'transformed', count(*))
FROM updated
$$ LANGUAGE sql;
COMMENT ON FUNCTION apply_transformations IS 'Apply transformation rules and mappings to records (set-based CTE)';
------------------------------------------------------
-- Function: get_unmapped_values