dataflow/api/routes/rules.js
Paul Trowbridge 1ed08755c1 Add g flag support and fix regex aggregation in extract rules
- Switch apply_transformations from regexp_match to regexp_matches with
  ORDINALITY, enabling the g flag to return all occurrences as a JSONB array
- Aggregate matches directly to JSONB in lateral subquery to avoid
  text[][] type errors when subscripting array_agg results
- Pass flags as proper third argument to regexp_matches/regexp_replace
  instead of inline (?flags) prefix — the only way g works correctly
- Apply same fix to preview and test endpoints in rules.js
- Add migrate_tps.sql script for migrating data from TPS to Dataflow

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-29 22:48:50 -04:00

234 lines
8.7 KiB
JavaScript

/**
* Rules Routes
* Manage transformation rules
*/
const express = require('express');
module.exports = (pool) => {
const router = express.Router();
// Return every rule that belongs to one source, ordered by sequence and then name.
// Responds with the matching rows as a JSON array; query failures go to next().
router.get('/source/:source_name', async (req, res, next) => {
  const listSql = 'SELECT * FROM rules WHERE source_name = $1 ORDER BY sequence, name';
  try {
    const { rows } = await pool.query(listSql, [req.params.source_name]);
    res.json(rows);
  } catch (err) {
    next(err);
  }
});
// Preview an ad-hoc pattern against real records (no saved rule needed).
//
// Query parameters:
//   source, field, pattern - required; 400 when any of them is missing/empty.
//   flags          - optional; passed as the flags argument of
//                    regexp_matches / regexp_replace ('' when omitted).
//   function_type  - 'replace' runs regexp_replace; any other value
//                    (default 'extract') runs regexp_matches.
//   replace_value  - replacement text used by 'replace' (default '').
//   limit          - maximum number of rows (default 20); must parse to a
//                    non-negative integer, otherwise 400.
//
// Responds with an array of { id, raw_value, extracted_value }, highest ids
// first. For 'extract', extracted_value is NULL with no match, the single
// match when there is exactly one, otherwise a JSON array of all matches
// (a match with one capture group is reduced to that group's value).
router.get('/preview', async (req, res, next) => {
  try {
    const { source, field, pattern, flags, function_type = 'extract', replace_value = '', limit = 20 } = req.query;
    if (!source || !field || !pattern) {
      return res.status(400).json({ error: 'source, field, and pattern are required' });
    }

    // Reject unusable limits here instead of letting PostgreSQL fail on
    // "NaN" or a negative row count and surface as a 500.
    const rowLimit = Number.parseInt(limit, 10);
    if (Number.isNaN(rowLimit) || rowLimit < 0) {
      return res.status(400).json({ error: 'limit must be a non-negative integer' });
    }

    const replaceSql = [
      'SELECT',
      'id,',
      'data->>$1 AS raw_value,',
      'to_jsonb(regexp_replace(data->>$1, $2, $3, $4)) AS extracted_value',
      'FROM records',
      'WHERE source_name = $5 AND data ? $1',
      'ORDER BY id DESC LIMIT $6',
    ].join('\n');

    // Matches are aggregated straight to JSONB inside the lateral subquery;
    // aggregating the text[] rows with array_agg would yield text[][], which
    // cannot be subscripted back into per-match arrays.
    const extractSql = [
      'SELECT',
      'r.id,',
      'r.data->>$1 AS raw_value,',
      'CASE',
      'WHEN agg.match_count = 0 THEN NULL',
      'WHEN agg.match_count = 1 THEN agg.matches->0',
      'ELSE agg.matches',
      'END AS extracted_value',
      'FROM records r',
      'CROSS JOIN LATERAL (',
      'SELECT',
      'jsonb_agg(',
      'CASE WHEN array_length(mt, 1) = 1 THEN to_jsonb(mt[1])',
      'ELSE to_jsonb(mt)',
      'END',
      'ORDER BY rn',
      ') AS matches,',
      'count(*)::int AS match_count',
      'FROM regexp_matches(r.data->>$1, $2, $3) WITH ORDINALITY AS m(mt, rn)',
      ') agg',
      'WHERE r.source_name = $4 AND r.data ? $1',
      'ORDER BY r.id DESC LIMIT $5',
    ].join('\n');

    const isReplace = function_type === 'replace';
    const query = isReplace ? replaceSql : extractSql;
    const params = isReplace
      ? [field, pattern, replace_value, flags || '', source, rowLimit]
      : [field, pattern, flags || '', source, rowLimit];

    const result = await pool.query(query, params);
    res.json(result.rows);
  } catch (err) {
    // The pattern and flags come straight from the caller, so a regex that
    // PostgreSQL rejects is a client error, not a server fault.
    //   2201B = invalid_regular_expression
    //   22023 = invalid_parameter_value (e.g. an unknown regex flag letter)
    if (err.code === '2201B' || err.code === '22023') {
      return res.status(400).json({ error: `Invalid pattern or flags: ${err.message}` });
    }
    next(err);
  }
});
// Test a saved rule against real records.
//
// Loads the rule by :id, then runs its pattern (with its flags) through
// regexp_matches over the records of the rule's source that contain the
// rule's field, highest ids first.
//
// Query parameters:
//   limit - maximum number of records (default 20); must parse to a
//           non-negative integer, otherwise 400.
//
// Responds with { rule: { id, name, field, pattern, output_field },
// results: [{ id, raw_value, extracted_value }] }, or 404 when the rule does
// not exist. extracted_value is NULL with no match, the single match when
// there is exactly one, otherwise a JSON array of all matches (a match with
// one capture group is reduced to that group's value).
//
// NOTE(review): rule.function_type / rule.replace_value are not consulted
// here, so a 'replace' rule is tested as an extract — confirm this is intended.
router.get('/:id/test', async (req, res, next) => {
  try {
    const { limit = 20 } = req.query;

    // Reject unusable limits here instead of letting PostgreSQL fail on
    // "NaN" or a negative row count and surface as a 500.
    const rowLimit = Number.parseInt(limit, 10);
    if (Number.isNaN(rowLimit) || rowLimit < 0) {
      return res.status(400).json({ error: 'limit must be a non-negative integer' });
    }

    const ruleResult = await pool.query(
      'SELECT * FROM rules WHERE id = $1',
      [req.params.id]
    );
    if (ruleResult.rows.length === 0) {
      return res.status(404).json({ error: 'Rule not found' });
    }
    const rule = ruleResult.rows[0];

    // Each regexp_matches row (a text[] of capture groups) is converted to
    // JSONB inside the lateral subquery. Collecting the rows with array_agg
    // instead produces text[][], and subscripting that yields text rather
    // than text[], so array_length(matches[1], 1) and m[1] are type errors.
    const result = await pool.query(
      `SELECT
         r.id,
         r.data->>$1 AS raw_value,
         CASE
           WHEN agg.match_count = 0 THEN NULL
           WHEN agg.match_count = 1 THEN agg.matches->0
           ELSE agg.matches
         END AS extracted_value
       FROM records r
       CROSS JOIN LATERAL (
         SELECT
           jsonb_agg(
             CASE WHEN array_length(mt, 1) = 1 THEN to_jsonb(mt[1])
                  ELSE to_jsonb(mt)
             END
             ORDER BY rn
           ) AS matches,
           count(*)::int AS match_count
         FROM regexp_matches(r.data->>$1, $2, $3) WITH ORDINALITY AS m(mt, rn)
       ) agg
       WHERE r.source_name = $4
         AND r.data ? $1
       ORDER BY r.id DESC
       LIMIT $5`,
      [rule.field, rule.pattern, rule.flags || '', rule.source_name, rowLimit]
    );

    res.json({
      rule: { id: rule.id, name: rule.name, field: rule.field, pattern: rule.pattern, output_field: rule.output_field },
      results: result.rows
    });
  } catch (err) {
    next(err);
  }
});
// Fetch one rule by its id.
// Responds with the rule row, or 404 { error: 'Rule not found' } when no row matches.
router.get('/:id', async (req, res, next) => {
  const findSql = 'SELECT * FROM rules WHERE id = $1';
  try {
    const { rows } = await pool.query(findSql, [req.params.id]);
    if (rows.length === 0) {
      return res.status(404).json({ error: 'Rule not found' });
    }
    res.json(rows[0]);
  } catch (err) {
    next(err);
  }
});
// Create a rule from the JSON request body.
// source_name, name, field, pattern and output_field are mandatory (400 otherwise);
// function_type, when given, must be "extract" or "replace".
// Responds 201 with the inserted row, 409 on a unique violation,
// 404 on a foreign key violation; any other error goes to next().
router.post('/', async (req, res, next) => {
  try {
    const { source_name, name, field, pattern, output_field, function_type, flags, replace_value, enabled, sequence } = req.body;

    const hasRequired = source_name && name && field && pattern && output_field;
    if (!hasRequired) {
      return res.status(400).json({
        error: 'Missing required fields: source_name, name, field, pattern, output_field'
      });
    }

    if (function_type && function_type !== 'extract' && function_type !== 'replace') {
      return res.status(400).json({ error: 'function_type must be "extract" or "replace"' });
    }

    const insertSql = [
      'INSERT INTO rules (source_name, name, field, pattern, output_field, function_type, flags, replace_value, enabled, sequence)',
      'VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)',
      'RETURNING *',
    ].join('\n');

    // Optional columns fall back to their defaults when the body omits them.
    const values = [
      source_name,
      name,
      field,
      pattern,
      output_field,
      function_type || 'extract',
      flags || '',
      replace_value || '',
      enabled !== false,
      sequence || 0,
    ];

    const { rows } = await pool.query(insertSql, values);
    res.status(201).json(rows[0]);
  } catch (err) {
    switch (err.code) {
      case '23505': // Unique violation
        return res.status(409).json({ error: 'Rule already exists for this source' });
      case '23503': // Foreign key violation
        return res.status(404).json({ error: 'Source not found' });
      default:
        next(err);
    }
  }
});
// Update rule
//
// Partially updates the rule identified by :id from the JSON request body.
// Every column is assigned COALESCE(<parameter>, <current value>), so a field
// whose parameter reaches the database as NULL keeps its stored value; as a
// consequence this endpoint cannot reset a column to NULL.
// (node-postgres is expected to send omitted/undefined body fields as NULL.)
//
// Responses:
//   200 - the updated row
//   400 - function_type given but not "extract" or "replace"
//   404 - no rule with that id
//   409 - the update collides with a unique constraint (e.g. renaming onto
//         an existing rule), mirroring the create endpoint
// Any other error is forwarded to next().
router.put('/:id', async (req, res, next) => {
  try {
    const { name, field, pattern, output_field, function_type, flags, replace_value, enabled, sequence } = req.body;
    if (function_type && !['extract', 'replace'].includes(function_type)) {
      return res.status(400).json({ error: 'function_type must be "extract" or "replace"' });
    }

    const updateSql = [
      'UPDATE rules',
      'SET name = COALESCE($2, name),',
      'field = COALESCE($3, field),',
      'pattern = COALESCE($4, pattern),',
      'output_field = COALESCE($5, output_field),',
      'function_type = COALESCE($6, function_type),',
      'flags = COALESCE($7, flags),',
      'replace_value = COALESCE($8, replace_value),',
      'enabled = COALESCE($9, enabled),',
      'sequence = COALESCE($10, sequence)',
      'WHERE id = $1',
      'RETURNING *',
    ].join('\n');

    const result = await pool.query(
      updateSql,
      [req.params.id, name, field, pattern, output_field, function_type, flags, replace_value, enabled, sequence]
    );
    if (result.rows.length === 0) {
      return res.status(404).json({ error: 'Rule not found' });
    }
    res.json(result.rows[0]);
  } catch (err) {
    // Same mapping the create endpoint uses: a unique violation is a
    // conflict caused by the request, not a server fault.
    if (err.code === '23505') { // Unique violation
      return res.status(409).json({ error: 'Rule already exists for this source' });
    }
    next(err);
  }
});
// Remove a rule by id and report what was removed.
// Responds with { success: true, deleted: { id, name } }, or
// 404 { error: 'Rule not found' } when no row was deleted.
router.delete('/:id', async (req, res, next) => {
  const deleteSql = 'DELETE FROM rules WHERE id = $1 RETURNING id, name';
  try {
    const { rows } = await pool.query(deleteSql, [req.params.id]);
    if (rows.length === 0) {
      return res.status(404).json({ error: 'Rule not found' });
    }
    res.json({ success: true, deleted: rows[0] });
  } catch (err) {
    next(err);
  }
});
return router;
};