Rewrite apply_transformations as set-based CTE chain

Replaces the nested FOR loops (row-by-row, rule-by-rule) with a single
SQL CTE chain that processes all records × rules in one pass, mirroring
the TPS approach.

CTE chain:
  qualifying      → all untransformed records for the source
  rx              → apply each rule (extract/replace) to each record
  linked          → LEFT JOIN mappings to find mapped output
  rule_output     → build per-rule JSONB (with retain support)
  record_additions → merge all rule outputs per record in sequence order
  UPDATE          → set transformed = data || additions

Also adds jsonb_concat_obj aggregate (jsonb merge with ORDER BY support)
needed to collapse multiple rule outputs per record into one object.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Commit 4cf5be52e8 (parent f7f88bb5cf) by Paul Trowbridge, 2026-04-04 21:13:49 -04:00.

@ -66,122 +66,139 @@ $$ LANGUAGE plpgsql;
-- Documentation for import_records (defined earlier in this file).
COMMENT ON FUNCTION import_records IS 'Import records with automatic deduplication';
------------------------------------------------------
-- Aggregate: jsonb_concat_obj
-- Merge JSONB objects across rows (later rows win on key conflicts)
-- Usage: jsonb_concat_obj(col ORDER BY sequence)
------------------------------------------------------
-- State-transition function: shallow-merge two JSONB objects with ||.
-- NULL inputs are folded to the empty object so a NULL row can never
-- poison the accumulated aggregate state.
CREATE OR REPLACE FUNCTION dataflow.jsonb_merge(a JSONB, b JSONB)
RETURNS JSONB AS $$
    SELECT COALESCE(a, '{}'::jsonb) || COALESCE(b, '{}'::jsonb)
$$ LANGUAGE sql IMMUTABLE;

-- Drop-then-create: CREATE AGGREGATE has no OR REPLACE form on older
-- PostgreSQL releases, so recreate the aggregate idempotently.
DROP AGGREGATE IF EXISTS dataflow.jsonb_concat_obj(JSONB);
CREATE AGGREGATE dataflow.jsonb_concat_obj(JSONB) (
    sfunc    = dataflow.jsonb_merge,
    stype    = JSONB,
    initcond = '{}'
);
------------------------------------------------------
-- Function: apply_transformations
-- Apply all transformation rules to records (set-based)
--
-- Replaces the previous row-by-row plpgsql loops with a single CTE
-- chain that processes all records x rules in one pass:
--   qualifying       -> all untransformed records for the source
--   rx               -> apply each enabled rule (extract/replace) to each record
--   linked           -> LEFT JOIN mappings to find mapped output
--   rule_output      -> per-rule JSONB fragment (with retain support)
--   record_additions -> merge rule outputs per record in sequence order
--   updated          -> UPDATE records: transformed = data || additions
--
-- Returns: JSON {'success': true, 'transformed': <row count>}
------------------------------------------------------
CREATE OR REPLACE FUNCTION apply_transformations(
    p_source_name TEXT,
    p_record_ids INTEGER[] DEFAULT NULL  -- NULL = all untransformed
) RETURNS JSON AS $$
WITH
-- All records to process for this source.
qualifying AS (
    SELECT id, data
    FROM dataflow.records
    WHERE source_name = p_source_name
      AND transformed IS NULL
      AND (p_record_ids IS NULL OR id = ANY(p_record_ids))
),
-- Apply each enabled rule to each qualifying record that has the
-- required input field (q.data ? r.field).
rx AS (
    SELECT
        q.id,
        r.name AS rule_name,
        r.sequence,
        r.output_field,
        r.retain,
        CASE r.function_type
            WHEN 'replace' THEN
                -- Pass flags through so 'g' (replace all) works correctly.
                to_jsonb(regexp_replace(
                    q.data ->> r.field, r.pattern, r.replace_value, r.flags
                ))
            ELSE
                -- extract: aggregate all matches so the 'g' flag returns
                -- every occurrence. Single match -> scalar, multiple -> array.
                -- Aggregate first so count and first element can be inspected.
                (SELECT
                    CASE WHEN cnt = 0 THEN NULL  -- no match: NULL filters row out in "linked"
                         WHEN cnt = 1 THEN agg -> 0
                         ELSE agg
                    END
                 FROM (
                    SELECT
                        count(*) AS cnt,
                        jsonb_agg(
                            -- One capture group -> scalar; several -> array of captures.
                            CASE WHEN array_length(mt, 1) = 1
                                 THEN to_jsonb(mt[1])
                                 ELSE to_jsonb(mt)
                            END
                            ORDER BY rn  -- preserve match order
                        ) AS agg
                    FROM regexp_matches(q.data ->> r.field, r.pattern, r.flags)
                         WITH ORDINALITY AS m(mt, rn)
                 ) _agg)
        END AS extracted
    FROM qualifying q
    CROSS JOIN dataflow.rules r
    WHERE r.source_name = p_source_name
      AND r.enabled = true
      AND q.data ? r.field
),
-- Join with mappings to find the mapped output for each extracted value.
linked AS (
    SELECT
        rx.id,
        rx.sequence,
        rx.output_field,
        rx.retain,
        rx.extracted,
        m.output AS mapped
    FROM rx
    LEFT JOIN dataflow.mappings m
        ON m.source_name = p_source_name
       AND m.rule_name   = rx.rule_name
       AND m.input_value = rx.extracted
    WHERE rx.extracted IS NOT NULL
),
-- Build each rule's output JSONB:
--   mapped -> use the mapping output; also write output_field if retain = true
--   no map -> write the extracted value to output_field
rule_output AS (
    SELECT
        id,
        sequence,
        CASE
            WHEN mapped IS NOT NULL THEN
                mapped ||
                CASE WHEN retain
                     THEN jsonb_build_object(output_field, extracted)
                     ELSE '{}'::jsonb
                END
            ELSE
                jsonb_build_object(output_field, extracted)
        END AS output
    FROM linked
),
-- Merge all rule outputs per record in sequence order
-- (later/higher sequence wins on key conflicts).
record_additions AS (
    SELECT
        id,
        dataflow.jsonb_concat_obj(output ORDER BY sequence) AS additions
    FROM rule_output
    GROUP BY id
),
-- Update all qualifying records; records with no rule matches still get
-- transformed = data (additions coalesces to the empty object).
updated AS (
    UPDATE dataflow.records rec
    SET transformed    = rec.data || COALESCE(ra.additions, '{}'::jsonb),
        transformed_at = CURRENT_TIMESTAMP
    FROM qualifying q
    LEFT JOIN record_additions ra ON ra.id = q.id
    WHERE rec.id = q.id
    RETURNING rec.id
)
SELECT json_build_object('success', true, 'transformed', count(*))
FROM updated
$$ LANGUAGE sql;
COMMENT ON FUNCTION apply_transformations IS 'Apply transformation rules and mappings to records (set-based CTE)';
------------------------------------------------------
-- Function: get_unmapped_values