-- Changelog (from commit message):
--   - POST /api/sources/suggest: derive source definition from CSV upload
--   - GET /api/sources/:name/import-log: query import history
--   - GET /api/rules/:id/test: test rule pattern against real records
--   - rules: add function_type (extract/replace) and flags columns
--   - get_unmapped_values: include up to 3 sample records per value
--   - npm start now uses nodemon for auto-reload
-- Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
--
-- Dataflow Database Schema
-- Simple, clear structure for data transformation
--

-- Create schema (IF NOT EXISTS makes re-running this file safe)
CREATE SCHEMA IF NOT EXISTS dataflow;

-- Set search path so the unqualified CREATE TABLE statements below land
-- in the dataflow schema.
-- NOTE(review): SET is session-scoped — every connection that queries
-- these tables must set search_path itself (or schema-qualify names);
-- confirm the application does this.
SET search_path TO dataflow, public;
------------------------------------------------------
-- Table: sources
-- Defines data sources and how to deduplicate them
------------------------------------------------------
CREATE TABLE sources (
    -- Natural key; referenced by records, rules, mappings and import_log.
    name TEXT PRIMARY KEY,
    dedup_fields TEXT[] NOT NULL, -- Fields used for deduplication (e.g., ['date', 'amount', 'description'])
    -- Nullable despite the default: an explicit NULL insert is allowed,
    -- so readers should COALESCE(config, '{}'::jsonb).
    config JSONB DEFAULT '{}'::jsonb,
    created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
    -- NOTE(review): nothing in this schema refreshes updated_at on UPDATE;
    -- a trigger or application-side update is required for it to be accurate.
    updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
);

COMMENT ON TABLE sources IS 'Data source definitions';
COMMENT ON COLUMN sources.dedup_fields IS 'Array of field names used to identify duplicate records';
COMMENT ON COLUMN sources.config IS 'Additional source configuration (optional)';
------------------------------------------------------
-- Table: records
-- Stores imported data (raw and transformed)
------------------------------------------------------
CREATE TABLE records (
    id SERIAL PRIMARY KEY,
    -- Deleting a source deletes its records too.
    source_name TEXT NOT NULL REFERENCES sources(name) ON DELETE CASCADE,

    -- Data
    data JSONB NOT NULL,         -- Original imported data
    dedup_key TEXT NOT NULL,     -- Hash of dedup fields for fast lookup (see generate_dedup_key)
    transformed JSONB,           -- Data after transformations applied; NULL until transformed

    -- Metadata
    imported_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
    transformed_at TIMESTAMPTZ,  -- NULL until the record has been transformed

    -- Prevent duplicates. Named explicitly so constraint-violation errors
    -- and future migrations are greppable.
    CONSTRAINT records_source_dedup_key UNIQUE (source_name, dedup_key)
);

COMMENT ON TABLE records IS 'Imported records with raw and transformed data';
COMMENT ON COLUMN records.data IS 'Original data as imported';
COMMENT ON COLUMN records.dedup_key IS 'Hash of deduplication fields for fast duplicate detection';
COMMENT ON COLUMN records.transformed IS 'Data after applying transformation rules';

-- Indexes
-- The UNIQUE constraint above already creates a btree index on
-- (source_name, dedup_key), which also serves lookups on the leading
-- column source_name alone. The former idx_records_source and
-- idx_records_dedup indexes duplicated it and only added write
-- overhead, so they are intentionally omitted.
CREATE INDEX idx_records_data ON records USING gin(data);
CREATE INDEX idx_records_transformed ON records USING gin(transformed);
------------------------------------------------------
-- Table: rules
-- Transformation rules (regex extraction)
------------------------------------------------------
CREATE TABLE rules (
    id SERIAL PRIMARY KEY,
    -- Deleting a source deletes its rules too.
    source_name TEXT NOT NULL REFERENCES sources(name) ON DELETE CASCADE,
    name TEXT NOT NULL,

    -- Rule definition
    field TEXT NOT NULL,          -- Field to extract from (e.g., 'description')
    pattern TEXT NOT NULL,        -- Regex pattern
    output_field TEXT NOT NULL,   -- Name of extracted field (e.g., 'merchant')
    function_type TEXT NOT NULL DEFAULT 'extract',  -- 'extract' or 'replace'
    flags TEXT NOT NULL DEFAULT '',                 -- Regex flags (e.g., 'i' for case-insensitive)

    -- Options
    enabled BOOLEAN DEFAULT true,
    sequence INTEGER DEFAULT 0,   -- Execution order (ascending)

    -- Metadata
    created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,

    -- Rule names are unique per source; this constraint is also the
    -- target of the composite foreign key in mappings.
    CONSTRAINT rules_source_name_key UNIQUE (source_name, name),
    -- Enforce the documented value set instead of trusting callers.
    CONSTRAINT rules_function_type_check CHECK (function_type IN ('extract', 'replace'))
);

COMMENT ON TABLE rules IS 'Transformation rules for extracting data';
COMMENT ON COLUMN rules.field IS 'Source field to apply regex to';
COMMENT ON COLUMN rules.pattern IS 'Regular expression pattern';
COMMENT ON COLUMN rules.output_field IS 'Name of field to store extracted value';
COMMENT ON COLUMN rules.function_type IS 'Transformation type: extract (pull a value out) or replace (substitute matches)';
COMMENT ON COLUMN rules.flags IS 'Regex flags applied with the pattern (e.g., i for case-insensitive)';

-- No separate index on (source_name): the UNIQUE constraint's index on
-- (source_name, name) already serves leading-column lookups, so the
-- former idx_rules_source was redundant write overhead.
------------------------------------------------------
-- Table: mappings
-- Value mappings (extracted value → standardized output)
------------------------------------------------------
CREATE TABLE mappings (
    id SERIAL PRIMARY KEY,
    -- Deleting a source deletes its mappings too.
    source_name TEXT NOT NULL REFERENCES sources(name) ON DELETE CASCADE,
    rule_name TEXT NOT NULL,

    -- Mapping
    input_value TEXT NOT NULL,  -- Extracted value to match
    output JSONB NOT NULL,      -- Standardized output

    -- Metadata
    created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,

    -- One mapping per (source, rule, value). Named explicitly so
    -- constraint-violation errors and migrations are greppable.
    CONSTRAINT mappings_source_rule_input_key UNIQUE (source_name, rule_name, input_value),
    -- Composite FK: deleting a rule cascades to its mappings.
    CONSTRAINT mappings_rule_fk FOREIGN KEY (source_name, rule_name)
        REFERENCES rules(source_name, name) ON DELETE CASCADE
);

COMMENT ON TABLE mappings IS 'Maps extracted values to standardized output';
COMMENT ON COLUMN mappings.input_value IS 'Value extracted by rule';
COMMENT ON COLUMN mappings.output IS 'Standardized output (can contain multiple fields)';

-- No extra indexes: the UNIQUE constraint's index on
-- (source_name, rule_name, input_value) serves exact-match lookups and
-- leading-prefix lookups on (source_name, rule_name). The former
-- idx_mappings_source_rule and idx_mappings_input indexes duplicated it
-- and only slowed writes, so they are intentionally omitted.
------------------------------------------------------
-- Table: import_log
-- Audit trail of imports
------------------------------------------------------
CREATE TABLE import_log (
    id SERIAL PRIMARY KEY,
    -- NOTE(review): ON DELETE CASCADE means deleting a source also erases
    -- its audit history — confirm that is intended for an audit table.
    source_name TEXT NOT NULL REFERENCES sources(name) ON DELETE CASCADE,
    -- Counts per import run; presumably records_duplicate counts rows
    -- skipped via dedup_key matches — confirm against the importer.
    records_imported INTEGER DEFAULT 0,
    records_duplicate INTEGER DEFAULT 0,
    imported_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
);

COMMENT ON TABLE import_log IS 'Audit log of data imports';

-- Indexes for the two expected query shapes: history per source, and
-- history ordered/filtered by time.
CREATE INDEX idx_import_log_source ON import_log(source_name);
CREATE INDEX idx_import_log_timestamp ON import_log(imported_at);
------------------------------------------------------
-- Helper function: Generate dedup key
------------------------------------------------------
-- Returns the MD5 hex digest identifying a record by the values that
-- dedup_fields select out of data.
--
-- Fixes over the naive concatenation version:
--   * NULL / missing fields no longer collide with empty strings:
--     a control-character sentinel (\x01) encodes NULL distinctly.
--   * The '|' separator is escaped inside values (and '\' itself is
--     escaped first), so ('a|b', 'c') and ('a', 'b|c') no longer
--     produce the same key.
--   * Local 'values' renamed: VALUES is a reserved SQL keyword.
-- NOTE: keys produced by this version differ from the unescaped
-- version, so it must not be swapped in over already-populated data.
CREATE OR REPLACE FUNCTION generate_dedup_key(
    data JSONB,
    dedup_fields TEXT[]
) RETURNS TEXT AS $$
DECLARE
    fld TEXT;
    joined TEXT := '';
BEGIN
    -- Concatenate one unambiguous token per dedup field.
    FOREACH fld IN ARRAY dedup_fields LOOP
        joined := joined
            || COALESCE(
                   -- Escape backslashes first, then the separator.
                   replace(replace(data->>fld, '\', '\\'), '|', '\|'),
                   E'\x01'  -- sentinel: field NULL or absent
               )
            || '|';
    END LOOP;

    -- Return MD5 hash of the escaped, concatenated values.
    RETURN md5(joined);
END;
$$ LANGUAGE plpgsql IMMUTABLE;

COMMENT ON FUNCTION generate_dedup_key IS 'Generate hash key from specified fields for deduplication';
------------------------------------------------------
-- Summary
------------------------------------------------------
-- Tables: 5 (sources, records, rules, mappings, import_log)
-- Simple, clear structure
-- JSONB for flexibility
-- Deduplication via hash key
-- All transformations traceable
------------------------------------------------------