dataflow/database/functions.sql
Paul Trowbridge 3be5ccc435 Add TSV export/import backend and update unmapped sample column
- Restore export.tsv and import-csv endpoints to mappings routes
- sample column is always last in export and discarded on import
- get_unmapped_values now returns distinct source field values as sample instead of full raw records

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-04 20:19:51 -04:00

349 lines
12 KiB
PL/PgSQL

--
-- Dataflow Functions
-- Simple, clear functions for import and transformation
--
SET search_path TO dataflow, public;
------------------------------------------------------
-- Function: import_records
-- Import data with automatic deduplication
------------------------------------------------------
CREATE OR REPLACE FUNCTION import_records(
    p_source_name TEXT,
    p_data JSONB -- Array of records
) RETURNS JSON AS $$
-- Import a JSONB array of records for one source, skipping duplicates.
--
-- Parameters:
--   p_source_name  name of a row in dataflow.sources
--   p_data         JSONB array; each element becomes one dataflow.records row.
--                  NULL is treated as an empty batch.
--
-- Returns a JSON object:
--   success: {"success": true, "imported": n, "duplicates": n, "log_id": id}
--   failure: {"success": false, "error": "..."}
--
-- A row is counted as a duplicate when its insert hits a uniqueness conflict
-- on dataflow.records. Every successful call writes one dataflow.import_log row.
DECLARE
    v_dedup_fields TEXT[];
    v_total INTEGER;
    v_inserted INTEGER := 0;
    v_duplicates INTEGER := 0;
    v_log_id INTEGER;
BEGIN
    -- Get dedup fields for this source
    SELECT dedup_fields INTO v_dedup_fields
    FROM dataflow.sources
    WHERE name = p_source_name;

    -- Distinguish "no such source" from "source exists but is not configured";
    -- both are rejected, but they need different messages.
    IF NOT FOUND THEN
        RETURN json_build_object(
            'success', false,
            'error', 'Source not found: ' || COALESCE(p_source_name, '(null)')
        );
    END IF;

    IF v_dedup_fields IS NULL THEN
        RETURN json_build_object(
            'success', false,
            'error', 'Source has no dedup_fields configured: ' || p_source_name
        );
    END IF;

    -- jsonb_array_elements raises on anything but an array; report it in the
    -- same shape as the other failures instead.
    IF p_data IS NOT NULL AND jsonb_typeof(p_data) <> 'array' THEN
        RETURN json_build_object(
            'success', false,
            'error', 'p_data must be a JSON array, got ' || jsonb_typeof(p_data)
        );
    END IF;

    v_total := COALESCE(jsonb_array_length(p_data), 0);

    -- One set-based insert. ON CONFLICT DO NOTHING skips rows that collide with
    -- existing rows or with earlier rows of the same batch, without opening a
    -- subtransaction per record the way a per-row EXCEPTION block does.
    INSERT INTO dataflow.records (source_name, data, dedup_key)
    SELECT
        p_source_name,
        elem.rec,
        dataflow.generate_dedup_key(elem.rec, v_dedup_fields)
    FROM jsonb_array_elements(p_data) WITH ORDINALITY AS elem(rec, pos)
    ORDER BY elem.pos  -- keep array order so the first occurrence wins
    ON CONFLICT DO NOTHING;

    GET DIAGNOSTICS v_inserted = ROW_COUNT;
    v_duplicates := v_total - v_inserted;

    -- Log the import
    INSERT INTO dataflow.import_log (source_name, records_imported, records_duplicate)
    VALUES (p_source_name, v_inserted, v_duplicates)
    RETURNING id INTO v_log_id;

    RETURN json_build_object(
        'success', true,
        'imported', v_inserted,
        'duplicates', v_duplicates,
        'log_id', v_log_id
    );
END;
$$ LANGUAGE plpgsql;
COMMENT ON FUNCTION import_records IS 'Import records with automatic deduplication';
------------------------------------------------------
-- Function: apply_transformations
-- Apply all transformation rules to records
------------------------------------------------------
CREATE OR REPLACE FUNCTION apply_transformations(
    p_source_name TEXT,
    p_record_ids INTEGER[] DEFAULT NULL -- NULL = all untransformed
) RETURNS JSON AS $$
-- Apply the enabled rules of a source to its untransformed records.
--
-- Parameters:
--   p_source_name  source whose records and rules are processed
--   p_record_ids   optional restriction to these record ids; records that
--                  already have a non-NULL transformed value are always skipped
--
-- For each record the result starts as a copy of data; rules run in
-- sequence order and each rule reads its input from the ORIGINAL data
-- (not from the output of earlier rules):
--   'replace' rules  write regexp_replace(...) to output_field
--   any other type   extract with regexp_matches; if a mapping exists for the
--                    extracted value its output is merged into the result,
--                    otherwise the extracted value is written to output_field
--
-- Returns {"success": true, "transformed": <number of records updated>}.
DECLARE
    v_record RECORD;
    v_rule RECORD;
    v_transformed JSONB;
    v_source_value TEXT;
    v_flags TEXT;
    v_replaced TEXT;
    v_match_count BIGINT;
    v_extracted JSONB;
    v_mapping JSONB;
    v_count INTEGER := 0;
BEGIN
    -- Loop through records to transform
    FOR v_record IN
        SELECT id, data
        FROM dataflow.records
        WHERE source_name = p_source_name
          AND (p_record_ids IS NULL OR id = ANY(p_record_ids))
          AND transformed IS NULL
    LOOP
        -- Start with original data
        v_transformed := v_record.data;

        -- Apply each rule in sequence
        FOR v_rule IN
            SELECT name, field, pattern, replace_value, flags, function_type, output_field
            FROM dataflow.rules
            WHERE source_name = p_source_name
              AND enabled = true
            ORDER BY sequence
        LOOP
            v_source_value := v_record.data->>v_rule.field;
            -- The regexp functions are strict: NULL flags would turn the whole
            -- call into NULL / no rows, so treat NULL flags as "no flags".
            v_flags := COALESCE(v_rule.flags, '');

            IF v_rule.function_type = 'replace' THEN
                -- Flags are passed so 'g' (replace all) works correctly
                v_replaced := regexp_replace(
                    v_source_value,
                    v_rule.pattern,
                    v_rule.replace_value,
                    v_flags
                );
                -- jsonb_set is strict: a NULL new value would make the WHOLE
                -- document NULL, discarding earlier rule output and leaving the
                -- record permanently untransformed. Skip the rule instead when
                -- there is nothing to write (e.g. source field missing).
                IF v_replaced IS NOT NULL THEN
                    v_transformed := jsonb_set(
                        v_transformed,
                        ARRAY[v_rule.output_field],
                        to_jsonb(v_replaced)
                    );
                END IF;
            ELSE
                -- extract: use regexp_matches so 'g' flag returns all occurrences
                -- Aggregate directly to JSONB: single capture -> scalar, multi -> array
                SELECT
                    jsonb_agg(
                        CASE WHEN array_length(mt, 1) = 1 THEN to_jsonb(mt[1])
                             ELSE to_jsonb(mt)
                        END
                        ORDER BY rn
                    ),
                    count(*)
                INTO v_extracted, v_match_count
                FROM regexp_matches(
                    v_source_value,
                    v_rule.pattern,
                    v_flags
                ) WITH ORDINALITY AS m(mt, rn);

                IF v_match_count > 0 THEN
                    -- Single match: unwrap the array to get scalar or capture array directly
                    IF v_match_count = 1 THEN
                        v_extracted := v_extracted->0;
                    END IF;
                    -- v_extracted is now: scalar string, array of captures, or array of matches (g)

                    -- Check if there's a mapping for this value
                    -- (SELECT INTO assigns NULL when no row matches)
                    SELECT output INTO v_mapping
                    FROM dataflow.mappings
                    WHERE source_name = p_source_name
                      AND rule_name = v_rule.name
                      AND input_value = v_extracted;

                    IF v_mapping IS NOT NULL THEN
                        -- Apply mapping (merge mapped fields into result)
                        v_transformed := v_transformed || v_mapping;
                    ELSE
                        -- No mapping, store extracted value (scalar or array)
                        v_transformed := jsonb_set(
                            v_transformed,
                            ARRAY[v_rule.output_field],
                            v_extracted
                        );
                    END IF;
                END IF;
            END IF;
        END LOOP;

        -- Update record with transformed data
        UPDATE dataflow.records
        SET transformed = v_transformed,
            transformed_at = CURRENT_TIMESTAMP
        WHERE id = v_record.id;

        v_count := v_count + 1;
    END LOOP;

    RETURN json_build_object(
        'success', true,
        'transformed', v_count
    );
END;
$$ LANGUAGE plpgsql;
COMMENT ON FUNCTION apply_transformations IS 'Apply transformation rules and mappings to records';
------------------------------------------------------
-- Function: get_unmapped_values
-- Find extracted values that need mappings
------------------------------------------------------
DROP FUNCTION IF EXISTS get_unmapped_values(TEXT, TEXT);
CREATE FUNCTION get_unmapped_values(
    p_source_name TEXT,
    p_rule_name TEXT DEFAULT NULL
) RETURNS TABLE (
    rule_name TEXT,
    output_field TEXT,
    source_field TEXT,
    extracted_value JSONB,
    record_count BIGINT,
    sample JSONB
) AS $$
-- List rule outputs found in transformed records that have no mapping row.
--
-- Parameters:
--   p_source_name  source to inspect
--   p_rule_name    optional: restrict to a single rule (NULL = every rule)
--
-- One result row per (rule, extracted value), most frequent first:
--   record_count  number of records carrying that value
--   sample        JSONB array of the distinct raw source-field values
--                 (data->>field) behind it
--
-- NOTE(review): every rule of the source is considered, including disabled
-- and 'replace' rules, although apply_transformations only looks up mappings
-- for enabled non-replace rules - confirm this is intended.
BEGIN
    RETURN QUERY
    WITH candidates AS (
        -- every (record, rule) pair where the record carries the rule's
        -- output field and the original data carries its source field
        SELECT
            rl.name AS rule_name,
            rl.output_field,
            rl.field AS source_field,
            rc.transformed->rl.output_field AS extracted_value,
            rc.data->>rl.field AS source_value
        FROM dataflow.rules AS rl
        INNER JOIN dataflow.records AS rc
            ON rc.source_name = rl.source_name
        WHERE rl.source_name = p_source_name
          AND (p_rule_name IS NULL OR rl.name = p_rule_name)
          AND rc.transformed IS NOT NULL
          AND rc.transformed ? rl.output_field
          AND rc.data ? rl.field
    ),
    unmapped AS (
        -- drop candidates whose value already has a mapping for that rule
        SELECT
            c.rule_name,
            c.output_field,
            c.source_field,
            c.extracted_value,
            c.source_value
        FROM candidates AS c
        WHERE NOT EXISTS (
            SELECT 1
            FROM dataflow.mappings AS mp
            WHERE mp.source_name = p_source_name
              AND mp.rule_name = c.rule_name
              AND mp.input_value = c.extracted_value
        )
    )
    SELECT
        u.rule_name,
        u.output_field,
        u.source_field,
        u.extracted_value,
        count(*) AS record_count,
        jsonb_agg(DISTINCT u.source_value) FILTER (WHERE u.source_value IS NOT NULL) AS sample
    FROM unmapped AS u
    GROUP BY
        u.rule_name,
        u.output_field,
        u.source_field,
        u.extracted_value
    ORDER BY count(*) DESC;
END;
$$ LANGUAGE plpgsql;
COMMENT ON FUNCTION get_unmapped_values IS 'Find extracted values that need mappings defined';
------------------------------------------------------
-- Function: reprocess_records
-- Clear and reapply transformations
------------------------------------------------------
CREATE OR REPLACE FUNCTION reprocess_records(p_source_name TEXT)
RETURNS JSON AS $$
-- Discard the stored transformation results of one source and rebuild them.
--
-- Parameters:
--   p_source_name  source whose records are reset and re-transformed
--
-- Returns whatever dataflow.apply_transformations returns for the source.
DECLARE
    v_result JSON;
BEGIN
    -- Step 1: mark every record of the source as untransformed
    UPDATE dataflow.records AS rec
    SET transformed_at = NULL,
        transformed = NULL
    WHERE rec.source_name = p_source_name;

    -- Step 2: run the rules again over the now-untransformed records
    v_result := dataflow.apply_transformations(p_source_name);
    RETURN v_result;
END;
$$ LANGUAGE plpgsql;
COMMENT ON FUNCTION reprocess_records IS 'Clear and reapply all transformations for a source';
------------------------------------------------------
-- Function: generate_source_view
-- Build a typed flat view in dfv schema
------------------------------------------------------
CREATE OR REPLACE FUNCTION generate_source_view(p_source_name TEXT)
RETURNS JSON AS $$
-- (Re)create the view dfv.<source name> exposing the source's transformed
-- records as flat, typed columns.
--
-- Parameters:
--   p_source_name  row in dataflow.sources whose config->'fields' array
--                  describes the columns; each element is read for the keys
--                  "name", "type" and optionally "expression"
--
-- Column generation per field:
--   with a non-blank expression: every {ref} is replaced by
--       (transformed->>'ref')::numeric and the result is used verbatim
--   type 'date' / 'numeric':     (transformed->>'name') cast to that type
--   anything else:               transformed->>'name' as text
--
-- Returns {"success": true, "view": ..., "sql": ...}, or
-- {"success": false, "error": ...} when the source has no usable fields array.
-- Errors from the generated DDL itself are raised, not caught.
DECLARE
    v_config JSONB;
    v_fields JSONB;
    v_field JSONB;
    v_name TEXT;
    v_expr TEXT;
    v_ref TEXT;
    v_col TEXT;
    v_cols TEXT := '';
    v_sql TEXT;
    v_view TEXT;
BEGIN
    SELECT config INTO v_config
    FROM dataflow.sources
    WHERE name = p_source_name;

    v_fields := v_config->'fields';

    -- Missing source, missing key, or a "fields" value that is not an array.
    -- Checked separately from the length test because jsonb_array_length
    -- raises on non-arrays and OR does not guarantee evaluation order.
    IF jsonb_typeof(v_fields) IS DISTINCT FROM 'array' THEN
        RETURN json_build_object('success', false, 'error', 'No schema fields defined for this source');
    END IF;

    IF jsonb_array_length(v_fields) = 0 THEN
        RETURN json_build_object('success', false, 'error', 'No schema fields defined for this source');
    END IF;

    FOR v_field IN SELECT * FROM jsonb_array_elements(v_fields)
    LOOP
        v_name := v_field->>'name';
        v_expr := v_field->>'expression';

        -- A blank expression would generate " AS name" (invalid SQL), so it is
        -- treated the same as no expression at all.
        IF btrim(v_expr) <> '' THEN
            -- Computed expression: substitute {fieldname} refs with (transformed->>'fieldname')::numeric
            -- e.g. "{Amount} * {sign}" -> "(transformed->>'Amount')::numeric * (transformed->>'sign')::numeric"
            -- Refs are always cast to numeric, whatever the field's own type is.
            -- The replacement text never contains '}', so the loop terminates.
            WHILE v_expr ~ '\{[^}]+\}' LOOP
                v_ref := substring(v_expr FROM '\{([^}]+)\}');
                v_expr := replace(v_expr, '{' || v_ref || '}',
                                  format('(transformed->>%L)::numeric', v_ref));
            END LOOP;
            -- NOTE: the expression text is spliced into the view definition
            -- as-is; it must come from trusted configuration.
            v_col := format('%s AS %I', v_expr, v_name);
        ELSE
            v_col := CASE v_field->>'type'
                WHEN 'date' THEN
                    format('(transformed->>%L)::date AS %I', v_name, v_name)
                WHEN 'numeric' THEN
                    format('(transformed->>%L)::numeric AS %I', v_name, v_name)
                ELSE
                    format('transformed->>%L AS %I', v_name, v_name)
            END;
        END IF;

        IF v_cols != '' THEN
            v_cols := v_cols || ', ';
        END IF;
        v_cols := v_cols || v_col;
    END LOOP;

    CREATE SCHEMA IF NOT EXISTS dfv;
    v_view := 'dfv.' || quote_ident(p_source_name);

    -- Drop first: CREATE OR REPLACE VIEW cannot change existing column types
    EXECUTE format('DROP VIEW IF EXISTS %s', v_view);
    v_sql := format(
        'CREATE VIEW %s AS SELECT %s FROM dataflow.records WHERE source_name = %L AND transformed IS NOT NULL',
        v_view, v_cols, p_source_name
    );
    EXECUTE v_sql;

    RETURN json_build_object('success', true, 'view', v_view, 'sql', v_sql);
END;
$$ LANGUAGE plpgsql;
COMMENT ON FUNCTION generate_source_view IS 'Generate a typed flat view in dfv schema from source config.fields';
------------------------------------------------------
-- Summary
------------------------------------------------------
-- Functions: 5 simple, focused functions
-- 1. import_records - Import with deduplication
-- 2. apply_transformations - Apply rules and mappings
-- 3. get_unmapped_values - Find values needing mappings
-- 4. reprocess_records - Re-transform all records
-- 5. generate_source_view - Build a typed flat view in the dfv schema
--
-- Each function does ONE thing clearly
-- No complex nested CTEs
-- Easy to understand and debug
------------------------------------------------------