Initial commit: Pipekit rewrite.

Orchestration layer around the jrunner Java JDBC CLI, replacing the
previous shell-based sync system in .archive/pre-rewrite. Includes
the FastAPI + Jinja web frontend, per-driver adapters (DB2, MSSQL,
PG), wizard-driven module creation with editable dest types and
source-sourced table/column descriptions, watermark/hook CRUD,
and the engine that runs modules end-to-end.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Paul Trowbridge 2026-04-22 00:38:26 -04:00
commit 574ada5258
59 changed files with 9296 additions and 0 deletions

View File

View File

@ -0,0 +1,582 @@
"""Pipekit API — FastAPI application."""
import os
import sys
import secrets
import queue
from typing import Optional
from fastapi import FastAPI, HTTPException, Depends, Query
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBasic, HTTPBasicCredentials
from pydantic import BaseModel
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from engine.db import (
init_db, clear_stale_locks,
# drivers
create_driver, get_driver, list_drivers, delete_driver,
# connections
create_connection, get_connection, list_connections, update_connection, delete_connection,
# modules
create_module, get_module, list_modules, update_module, delete_module,
# watermarks
create_watermark, get_watermark, list_watermarks, update_watermark, delete_watermark,
# hooks
create_hook, get_hook, list_hooks, update_hook, delete_hook,
# groups
create_group, get_group, list_groups, delete_group,
add_group_member, remove_group_member,
# schedules
create_schedule, get_schedule, list_schedules, update_schedule, delete_schedule,
# group runs
list_group_runs, get_group_run,
# runs
list_runs, get_run,
# settings
get_setting, set_setting,
)
from engine.runner import run_module, run_group, preview_module
from engine.introspect import fetch_tables, fetch_columns, propose_module
app = FastAPI(title="Pipekit", version="0.2.0", description="JDBC-based ETL orchestration")
security = HTTPBasic()
@app.on_event("startup")
def startup():
    """Initialize the SQLite schema and seed bootstrap state on boot."""
    init_db()
    # Recover locks left behind by a previous crashed process.
    clear_stale_locks()
    # First boot only: seed default API credentials so the UI is reachable.
    # NOTE(review): well-known default password — operators should change it
    # via POST /settings/api_password before exposing the service.
    if not get_setting("api_username"):
        set_setting("api_username", "admin")
        set_setting("api_password", "pipekit")
def authenticate(credentials: HTTPBasicCredentials = Depends(security)) -> str:
    """Validate HTTP Basic credentials against stored settings.

    Falls back to the bootstrap defaults ("admin"/"pipekit") when the
    settings rows are missing.  Returns the authenticated username.

    Raises:
        HTTPException: 401 with a WWW-Authenticate challenge on mismatch.
    """
    expected_user = get_setting("api_username") or "admin"
    expected_pass = get_setting("api_password") or "pipekit"
    # Compare as UTF-8 bytes (compare_digest raises TypeError on non-ASCII
    # str inputs) and evaluate BOTH digests unconditionally so short-circuit
    # evaluation does not leak which field was wrong via response timing.
    user_ok = secrets.compare_digest(
        credentials.username.encode("utf-8"), expected_user.encode("utf-8"))
    pass_ok = secrets.compare_digest(
        credentials.password.encode("utf-8"), expected_pass.encode("utf-8"))
    if not (user_ok and pass_ok):
        # RFC 7617: a Basic-auth 401 should carry a challenge header so
        # clients know to (re)prompt for credentials.
        raise HTTPException(
            status_code=401,
            detail="Invalid credentials",
            headers={"WWW-Authenticate": "Basic"},
        )
    return credentials.username
# ---------------------------------------------------------------------------
# Pydantic models
# ---------------------------------------------------------------------------
class DriverCreate(BaseModel):
    """Payload for registering a JDBC driver jar."""
    name: str
    jar_file: str
    class_name: str
    url_template: Optional[str] = None
class ConnectionCreate(BaseModel):
    """Payload for creating a database connection."""
    name: str
    jdbc_url: str
    driver_id: Optional[int] = None
    username: Optional[str] = None
    password: Optional[str] = None
    default_dest_connection_id: Optional[int] = None
    default_dest_schema: Optional[str] = None
    notes: Optional[str] = None
class ConnectionUpdate(BaseModel):
    """Partial update for a connection; None fields are left untouched."""
    name: Optional[str] = None
    jdbc_url: Optional[str] = None
    driver_id: Optional[int] = None
    username: Optional[str] = None
    password: Optional[str] = None
    default_dest_connection_id: Optional[int] = None
    default_dest_schema: Optional[str] = None
    notes: Optional[str] = None
class ModuleCreate(BaseModel):
    """Payload for creating a sync module (one source query -> one dest table)."""
    name: str
    source_connection_id: int
    dest_connection_id: int
    dest_table: str
    source_query: str
    merge_strategy: str = "full"
    merge_key: Optional[str] = None
class ModuleUpdate(BaseModel):
    """Partial update for a module; None fields are left untouched."""
    name: Optional[str] = None
    source_connection_id: Optional[int] = None
    dest_connection_id: Optional[int] = None
    dest_table: Optional[str] = None
    source_query: Optional[str] = None
    merge_strategy: Optional[str] = None
    merge_key: Optional[str] = None
    enabled: Optional[bool] = None
class WatermarkCreate(BaseModel):
    """Payload for attaching a watermark (resolver SQL) to a module."""
    module_id: int
    name: str
    connection_id: int
    resolver_sql: str
    default_value: Optional[str] = None
class WatermarkUpdate(BaseModel):
    """Partial update for a watermark; None fields are left untouched."""
    name: Optional[str] = None
    connection_id: Optional[int] = None
    resolver_sql: Optional[str] = None
    default_value: Optional[str] = None
class HookCreate(BaseModel):
    """Payload for attaching a pre/post SQL hook to a module."""
    module_id: int
    sql: str
    run_order: int = 0
    connection_id: Optional[int] = None
    run_on: str = "success"
class HookUpdate(BaseModel):
    """Partial update for a hook; None fields are left untouched."""
    sql: Optional[str] = None
    run_order: Optional[int] = None
    connection_id: Optional[int] = None
    run_on: Optional[str] = None
class GroupCreate(BaseModel):
    """Payload for creating a module group."""
    name: str
class GroupMemberAdd(BaseModel):
    """Payload for adding a module to a group at a given run order."""
    module_id: int
    run_order: int = 0
class ScheduleCreate(BaseModel):
    """Payload for scheduling a group run via cron expression."""
    group_id: int
    cron_expr: str
    enabled: bool = True
class ScheduleUpdate(BaseModel):
    """Partial update for a schedule; None fields are left untouched."""
    cron_expr: Optional[str] = None
    enabled: Optional[bool] = None
class SettingUpdate(BaseModel):
    """Payload for setting a single settings key to a string value."""
    value: str
# ---------------------------------------------------------------------------
# Health
# ---------------------------------------------------------------------------
@app.get("/health")
def health():
    """Liveness probe — reports ok without touching the database or auth."""
    return dict(status="ok")
# ---------------------------------------------------------------------------
# Drivers
# ---------------------------------------------------------------------------
@app.get("/drivers")
def api_list_drivers(user: str = Depends(authenticate)):
    """List all registered JDBC drivers."""
    return list_drivers()
@app.get("/drivers/{driver_id}")
def api_get_driver(driver_id: int, user: str = Depends(authenticate)):
    """Fetch one driver by id; 404 if unknown."""
    d = get_driver(driver_id)
    if not d:
        raise HTTPException(404, "Driver not found")
    return d
@app.post("/drivers", status_code=201)
def api_create_driver(body: DriverCreate, user: str = Depends(authenticate)):
    """Register a new JDBC driver; returns the created row."""
    return create_driver(**body.model_dump())
@app.post("/drivers/{driver_id}/delete")
def api_delete_driver(driver_id: int, user: str = Depends(authenticate)):
    """Delete a driver (POST rather than DELETE for form-friendly UI)."""
    delete_driver(driver_id)
    return {"ok": True}
# ---------------------------------------------------------------------------
# Connections
# ---------------------------------------------------------------------------
@app.get("/connections")
def api_list_connections(user: str = Depends(authenticate)):
    """List all connections."""
    return list_connections()
@app.get("/connections/{conn_id}")
def api_get_connection(conn_id: int, user: str = Depends(authenticate)):
    """Fetch one connection by id; 404 if unknown."""
    c = get_connection(conn_id)
    if not c:
        raise HTTPException(404, "Connection not found")
    return c
@app.post("/connections", status_code=201)
def api_create_connection(body: ConnectionCreate, user: str = Depends(authenticate)):
    """Create a connection; returns the created row."""
    return create_connection(**body.model_dump())
@app.post("/connections/{conn_id}")
def api_update_connection(conn_id: int, body: ConnectionUpdate,
                          user: str = Depends(authenticate)):
    """Partially update a connection; None fields in the body are ignored."""
    return update_connection(conn_id, **body.model_dump(exclude_none=True))
@app.post("/connections/{conn_id}/delete")
def api_delete_connection(conn_id: int, user: str = Depends(authenticate)):
    """Delete a connection."""
    delete_connection(conn_id)
    return {"ok": True}
@app.post("/connections/{conn_id}/test")
def api_test_connection(conn_id: int, user: str = Depends(authenticate)):
    """Smoke-test a connection by running SELECT 1 through jrunner.

    Never raises: failures are reported as {"status": "error", ...} so the
    UI can render the message inline.
    NOTE(review): "SELECT 1" is not valid on every RDBMS (e.g. DB2 wants
    FROM SYSIBM.SYSDUMMY1) — confirm per-driver behavior.
    """
    from engine.introspect import run_jrunner_query
    import time
    # monotonic() is immune to wall-clock jumps (NTP, DST) that could skew
    # or even negate an elapsed measurement taken with time.time().
    start = time.monotonic()
    try:
        run_jrunner_query(conn_id, "SELECT 1")
        elapsed = round(time.monotonic() - start, 2)
        return {"status": "ok", "elapsed_seconds": elapsed}
    except Exception as e:
        elapsed = round(time.monotonic() - start, 2)
        return {"status": "error", "detail": str(e), "elapsed_seconds": elapsed}
# ---------------------------------------------------------------------------
# Introspection
# ---------------------------------------------------------------------------
@app.post("/introspect/tables")
def api_introspect_tables(body: dict, user: str = Depends(authenticate)):
    """List tables on a connection; body: {connection_id, qualifiers?: {schema?}}."""
    conn_id = body["connection_id"]
    qualifiers = body.get("qualifiers", {})
    schema = qualifiers.get("schema")
    tables = fetch_tables(conn_id, schema_filter=schema)
    return [t.to_dict() for t in tables]
@app.post("/introspect/columns")
def api_introspect_columns(body: dict, user: str = Depends(authenticate)):
    """List columns of a table; qualifiers may carry schema/linked_server/linked_db."""
    conn_id = body["connection_id"]
    table_name = body["table_name"]
    qualifiers = body.get("qualifiers", {})
    schema = qualifiers.get("schema", "")
    columns = fetch_columns(conn_id, schema, table_name,
                            linked_server=qualifiers.get("linked_server"),
                            linked_db=qualifiers.get("linked_db"))
    return [c.to_dict() for c in columns]
@app.post("/introspect/propose")
def api_introspect_propose(body: dict, user: str = Depends(authenticate)):
    """Propose a module definition (query + dest) for the given table."""
    conn_id = body["connection_id"]
    table_name = body["table_name"]
    qualifiers = body.get("qualifiers", {})
    schema = qualifiers.get("schema", "")
    return propose_module(conn_id, schema, table_name,
                          dest_schema=qualifiers.get("dest_schema"),
                          linked_server=qualifiers.get("linked_server"),
                          linked_db=qualifiers.get("linked_db"))
# Keep old GET endpoints for backward compat with TUI
@app.get("/connections/{conn_id}/tables")
def api_list_tables(conn_id: int, schema: Optional[str] = None,
                    user: str = Depends(authenticate)):
    """GET variant of /introspect/tables, kept for TUI backward compat."""
    tables = fetch_tables(conn_id, schema_filter=schema)
    return [t.to_dict() for t in tables]
@app.get("/connections/{conn_id}/tables/{schema}.{table}/columns")
def api_list_columns(conn_id: int, schema: str, table: str,
                     user: str = Depends(authenticate)):
    """GET variant of /introspect/columns, kept for TUI backward compat."""
    columns = fetch_columns(conn_id, schema, table)
    return [c.to_dict() for c in columns]
@app.get("/connections/{conn_id}/tables/{schema}.{table}/propose")
def api_propose_module(conn_id: int, schema: str, table: str,
                       dest_schema: Optional[str] = None,
                       linked_server: Optional[str] = None,
                       linked_db: Optional[str] = None,
                       user: str = Depends(authenticate)):
    """GET variant of /introspect/propose, kept for TUI backward compat."""
    return propose_module(conn_id, schema, table, dest_schema,
                          linked_server=linked_server, linked_db=linked_db)
# ---------------------------------------------------------------------------
# Modules
# ---------------------------------------------------------------------------
@app.get("/modules")
def api_list_modules(user: str = Depends(authenticate)):
    """List all modules."""
    return list_modules()
@app.get("/modules/{module_id}")
def api_get_module(module_id: int, user: str = Depends(authenticate)):
    """Fetch one module by id; 404 if unknown."""
    m = get_module(module_id)
    if not m:
        raise HTTPException(404, "Module not found")
    return m
@app.post("/modules", status_code=201)
def api_create_module(body: ModuleCreate, user: str = Depends(authenticate)):
    """Create a module; returns the created row."""
    return create_module(**body.model_dump())
@app.post("/modules/{module_id}")
def api_update_module(module_id: int, body: ModuleUpdate,
                      user: str = Depends(authenticate)):
    """Partially update a module; None fields in the body are ignored."""
    return update_module(module_id, **body.model_dump(exclude_none=True))
@app.post("/modules/{module_id}/delete")
def api_delete_module(module_id: int, user: str = Depends(authenticate)):
    """Delete a module."""
    delete_module(module_id)
    return {"ok": True}
@app.get("/modules/{module_id}/preview")
def api_preview_module(module_id: int, user: str = Depends(authenticate)):
    """Dry-run the module's source query and return a sample of rows."""
    return preview_module(module_id)
@app.get("/modules/{module_id}/columns")
def api_module_columns(module_id: int, user: str = Depends(authenticate)):
    """Parse source query and return column list.

    Scans the module's source_query for "<expr> AS <alias>" pairs, where
    <expr> is either RTRIM(<inner>) or a (possibly bracketed / dotted)
    identifier.  Returns [{source, alias, trimmed}] per match.
    NOTE(review): a regex cannot fully parse SQL — nested function calls or
    aliases without AS will be missed; presumably acceptable for wizard
    display, confirm against the UI's expectations.
    """
    import re
    module = get_module(module_id)
    if not module:
        raise HTTPException(404, "Module not found")
    columns = []
    # group(1) = inner expression of RTRIM(...); group(2) = plain identifier
    # (optionally [bracketed] and schema-qualified); group(3) = the alias.
    for m in re.finditer(
        r'(?:RTRIM\(([^)]+)\)|(\[?["\w#@$]+\]?(?:\.["\w#@$]+)*))\s+AS\s+(\w+)',
        module["source_query"], re.IGNORECASE
    ):
        columns.append({
            "source": (m.group(1) or m.group(2)).strip(),
            "alias": m.group(3),
            # trimmed=True when the column was wrapped in RTRIM().
            "trimmed": bool(m.group(1)),
        })
    return columns
@app.post("/modules/{module_id}/run")
def api_run_module(module_id: int, user: str = Depends(authenticate)):
    """Run a module synchronously and return the run result."""
    return run_module(module_id)
@app.get("/runs/{run_id}/stream")
def api_stream_run(run_id: int, user: str = Depends(authenticate)):
    """SSE stream for watching a run. Placeholder — full impl in async phase."""
    # Deliberate 501 so clients can distinguish "not built yet" from 404.
    raise HTTPException(501, "SSE streaming not yet implemented")
@app.post("/modules/{module_id}/run/stream")
def api_run_module_stream(module_id: int, user: str = Depends(authenticate)):
    """Trigger a sync run and stream jrunner output as text/event-stream.

    The run executes in a daemon thread; each output line is handed to the
    HTTP generator through a queue.  Sentinel prefixes end the stream:
    __DONE__<json result>, __ERROR__<message>, or __TIMEOUT__ after 600 s
    of silence.
    """
    import threading, json
    q = queue.Queue()

    def on_output(line: str):
        q.put(line)

    def run_in_thread():
        try:
            result = run_module(module_id, on_output=on_output)
            q.put(f"__DONE__{json.dumps(result)}")
        except Exception as e:
            q.put(f"__ERROR__{str(e)}")

    threading.Thread(target=run_in_thread, daemon=True).start()

    def sse_event(payload: str) -> str:
        # SSE framing fix: a payload containing "\n" must be emitted as
        # multiple "data:" lines — embedding it raw would let "\n\n" inside
        # the payload terminate the event early and corrupt the stream.
        lines = payload.split("\n")
        return "".join(f"data: {l}\n" for l in lines) + "\n"

    def event_stream():
        while True:
            try:
                line = q.get(timeout=600)
            except queue.Empty:
                # No output for 10 minutes: give up rather than hang forever.
                yield sse_event("__TIMEOUT__")
                return
            yield sse_event(line)
            if line.startswith(("__DONE__", "__ERROR__")):
                return

    return StreamingResponse(event_stream(), media_type="text/event-stream")
@app.get("/modules/{module_id}/runs")
def api_module_runs(module_id: int, limit: int = 50,
                    user: str = Depends(authenticate)):
    """List recent runs for a module (newest-limited by `limit`)."""
    return list_runs(module_id=module_id, limit=limit)
# Keep old path for TUI compat
@app.get("/modules/{module_id}/history")
def api_module_history(module_id: int, limit: int = 50,
                       user: str = Depends(authenticate)):
    """Alias of /modules/{id}/runs kept for TUI backward compat."""
    return list_runs(module_id=module_id, limit=limit)
# ---------------------------------------------------------------------------
# Watermarks
# ---------------------------------------------------------------------------
@app.get("/modules/{module_id}/watermarks")
def api_list_watermarks(module_id: int, user: str = Depends(authenticate)):
    """List watermarks attached to a module."""
    return list_watermarks(module_id)
@app.get("/watermarks/{watermark_id}")
def api_get_watermark(watermark_id: int, user: str = Depends(authenticate)):
    """Fetch one watermark by id; 404 if unknown."""
    w = get_watermark(watermark_id)
    if not w:
        raise HTTPException(404, "Watermark not found")
    return w
@app.post("/watermarks", status_code=201)
def api_create_watermark(body: WatermarkCreate, user: str = Depends(authenticate)):
    """Create a watermark; returns the created row."""
    return create_watermark(**body.model_dump())
@app.post("/watermarks/{watermark_id}")
def api_update_watermark(watermark_id: int, body: WatermarkUpdate,
                         user: str = Depends(authenticate)):
    """Partially update a watermark; None fields in the body are ignored."""
    return update_watermark(watermark_id, **body.model_dump(exclude_none=True))
@app.post("/watermarks/{watermark_id}/delete")
def api_delete_watermark(watermark_id: int, user: str = Depends(authenticate)):
    """Delete a watermark."""
    delete_watermark(watermark_id)
    return {"ok": True}
# ---------------------------------------------------------------------------
# Hooks
# ---------------------------------------------------------------------------
@app.get("/modules/{module_id}/hooks")
def api_list_hooks(module_id: int, user: str = Depends(authenticate)):
    """List hooks attached to a module."""
    return list_hooks(module_id)
@app.get("/hooks/{hook_id}")
def api_get_hook(hook_id: int, user: str = Depends(authenticate)):
    """Fetch one hook by id; 404 if unknown."""
    h = get_hook(hook_id)
    if not h:
        raise HTTPException(404, "Hook not found")
    return h
@app.post("/hooks", status_code=201)
def api_create_hook(body: HookCreate, user: str = Depends(authenticate)):
    """Create a hook; returns the created row."""
    return create_hook(**body.model_dump())
@app.post("/hooks/{hook_id}")
def api_update_hook(hook_id: int, body: HookUpdate,
                    user: str = Depends(authenticate)):
    """Partially update a hook; None fields in the body are ignored."""
    return update_hook(hook_id, **body.model_dump(exclude_none=True))
@app.post("/hooks/{hook_id}/delete")
def api_delete_hook(hook_id: int, user: str = Depends(authenticate)):
    """Delete a hook."""
    delete_hook(hook_id)
    return {"ok": True}
# ---------------------------------------------------------------------------
# Groups
# ---------------------------------------------------------------------------
@app.get("/groups")
def api_list_groups(user: str = Depends(authenticate)):
    """List all groups with members and schedules attached."""
    return list_groups()
@app.get("/groups/{group_id}")
def api_get_group(group_id: int, user: str = Depends(authenticate)):
    """Fetch one group (with members/schedules); 404 if unknown."""
    g = get_group(group_id)
    if not g:
        raise HTTPException(404, "Group not found")
    return g
@app.post("/groups", status_code=201)
def api_create_group(body: GroupCreate, user: str = Depends(authenticate)):
    """Create a group; returns the created row."""
    return create_group(**body.model_dump())
@app.post("/groups/{group_id}/delete")
def api_delete_group(group_id: int, user: str = Depends(authenticate)):
    """Delete a group."""
    delete_group(group_id)
    return {"ok": True}
@app.post("/groups/{group_id}/members", status_code=201)
def api_add_member(group_id: int, body: GroupMemberAdd,
                   user: str = Depends(authenticate)):
    """Add a module to a group at the given run order."""
    return add_group_member(group_id, **body.model_dump())
@app.post("/groups/members/{member_id}/delete")
def api_remove_member(member_id: int, user: str = Depends(authenticate)):
    """Remove a group membership row by its own id (not the module id)."""
    remove_group_member(member_id)
    return {"ok": True}
@app.post("/groups/{group_id}/run")
def api_run_group(group_id: int, user: str = Depends(authenticate)):
    """Run all modules in a group synchronously and return the result."""
    return run_group(group_id)
# ---------------------------------------------------------------------------
# Group Runs
# ---------------------------------------------------------------------------
@app.get("/group-runs")
def api_list_group_runs(group_id: Optional[int] = None, limit: int = 50,
                        user: str = Depends(authenticate)):
    """List group runs, optionally filtered by group."""
    return list_group_runs(group_id=group_id, limit=limit)
@app.get("/group-runs/{group_run_id}")
def api_get_group_run(group_run_id: int, user: str = Depends(authenticate)):
    """Fetch one group run by id; 404 if unknown."""
    gr = get_group_run(group_run_id)
    if not gr:
        raise HTTPException(404, "Group run not found")
    return gr
# ---------------------------------------------------------------------------
# Runs
# ---------------------------------------------------------------------------
@app.get("/runs")
def api_list_runs(module_id: Optional[int] = None, status: Optional[str] = None,
                  limit: int = 50, user: str = Depends(authenticate)):
    """List runs, optionally filtered by module and/or status."""
    return list_runs(module_id=module_id, status=status, limit=limit)
@app.get("/runs/{run_id}")
def api_get_run(run_id: int, user: str = Depends(authenticate)):
    """Fetch one run by id; 404 if unknown."""
    r = get_run(run_id)
    if not r:
        raise HTTPException(404, "Run not found")
    return r
# ---------------------------------------------------------------------------
# Schedules
# ---------------------------------------------------------------------------
@app.get("/schedules")
def api_list_schedules(user: str = Depends(authenticate)):
    """List all schedules."""
    return list_schedules()
@app.get("/schedules/{schedule_id}")
def api_get_schedule(schedule_id: int, user: str = Depends(authenticate)):
    """Fetch one schedule by id; 404 if unknown."""
    s = get_schedule(schedule_id)
    if not s:
        raise HTTPException(404, "Schedule not found")
    return s
@app.post("/schedules", status_code=201)
def api_create_schedule(body: ScheduleCreate, user: str = Depends(authenticate)):
    """Create a schedule; returns the created row."""
    return create_schedule(**body.model_dump())
@app.post("/schedules/{schedule_id}")
def api_update_schedule(schedule_id: int, body: ScheduleUpdate,
                        user: str = Depends(authenticate)):
    """Partially update a schedule; None fields in the body are ignored."""
    return update_schedule(schedule_id, **body.model_dump(exclude_none=True))
@app.post("/schedules/{schedule_id}/delete")
def api_delete_schedule(schedule_id: int, user: str = Depends(authenticate)):
    """Delete a schedule."""
    delete_schedule(schedule_id)
    return {"ok": True}
# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
@app.get("/settings")
def api_get_settings(user: str = Depends(authenticate)):
    """Return all settings rows as a flat {key: value} dict.

    NOTE(review): this exposes api_password to any authenticated caller —
    consider masking secret keys before returning.
    """
    from engine.db import get_conn
    with get_conn() as conn:
        rows = conn.execute("SELECT key, value FROM settings ORDER BY key").fetchall()
        return {r["key"]: r["value"] for r in rows}
@app.post("/settings/{key}")
def api_set_setting(key: str, body: SettingUpdate,
                    user: str = Depends(authenticate)):
    """Upsert one settings key."""
    set_setting(key, body.value)
    return {"ok": True}

View File

@ -0,0 +1,26 @@
"""Load bootstrap config from config.yaml."""
import os
from pathlib import Path
import yaml
CONFIG_PATH = os.environ.get("PIPEKIT_CONFIG", "/opt/pipekit/config.yaml")
def load_config() -> dict:
    """Read and parse the YAML bootstrap config.

    Raises:
        FileNotFoundError: with a descriptive message when the file is absent.
    """
    # EAFP: open directly instead of exists()-then-open, which was racy
    # (TOCTOU) and cost an extra stat call.
    try:
        with open(CONFIG_PATH) as f:
            # NOTE(review): safe_load returns None for an empty file —
            # callers index the result, so an empty config will still fail.
            return yaml.safe_load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Config not found: {Path(CONFIG_PATH)}") from None
_config = None
def get_config() -> dict:
    """Return the process-wide config, loading and caching it on first call."""
    global _config
    if _config is None:
        _config = load_config()
    return _config

View File

View File

@ -0,0 +1,686 @@
"""SQLite database layer for Pipekit."""
import sqlite3
from contextlib import contextmanager
from pathlib import Path
from config import get_config
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS driver (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
jar_file TEXT NOT NULL,
class_name TEXT NOT NULL,
url_template TEXT,
created_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS connection (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
driver_id INTEGER REFERENCES driver(id),
jdbc_url TEXT NOT NULL,
username TEXT,
password TEXT,
default_dest_connection_id INTEGER REFERENCES connection(id),
default_dest_schema TEXT,
notes TEXT,
created_at TEXT DEFAULT (datetime('now')),
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS module (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
source_connection_id INTEGER NOT NULL REFERENCES connection(id),
dest_connection_id INTEGER NOT NULL REFERENCES connection(id),
dest_table TEXT NOT NULL,
source_query TEXT NOT NULL,
merge_strategy TEXT NOT NULL DEFAULT 'full',
merge_key TEXT,
enabled INTEGER DEFAULT 1,
running INTEGER DEFAULT 0,
running_pid TEXT,
running_since TEXT,
created_at TEXT DEFAULT (datetime('now')),
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS watermark (
id INTEGER PRIMARY KEY AUTOINCREMENT,
module_id INTEGER NOT NULL REFERENCES module(id) ON DELETE CASCADE,
name TEXT NOT NULL,
connection_id INTEGER NOT NULL REFERENCES connection(id),
resolver_sql TEXT NOT NULL,
default_value TEXT,
UNIQUE(module_id, name)
);
CREATE TABLE IF NOT EXISTS hook (
id INTEGER PRIMARY KEY AUTOINCREMENT,
module_id INTEGER NOT NULL REFERENCES module(id) ON DELETE CASCADE,
run_order INTEGER NOT NULL DEFAULT 0,
connection_id INTEGER REFERENCES connection(id),
sql TEXT NOT NULL,
run_on TEXT NOT NULL DEFAULT 'success'
);
CREATE TABLE IF NOT EXISTS grp (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS group_member (
id INTEGER PRIMARY KEY AUTOINCREMENT,
group_id INTEGER NOT NULL REFERENCES grp(id) ON DELETE CASCADE,
module_id INTEGER NOT NULL REFERENCES module(id) ON DELETE CASCADE,
run_order INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS schedule (
id INTEGER PRIMARY KEY AUTOINCREMENT,
group_id INTEGER NOT NULL REFERENCES grp(id) ON DELETE CASCADE,
cron_expr TEXT NOT NULL,
enabled INTEGER DEFAULT 1
);
CREATE TABLE IF NOT EXISTS group_run (
id INTEGER PRIMARY KEY AUTOINCREMENT,
group_id INTEGER NOT NULL REFERENCES grp(id),
started_at TEXT DEFAULT (datetime('now')),
finished_at TEXT,
status TEXT NOT NULL DEFAULT 'running',
triggered_by TEXT
);
CREATE TABLE IF NOT EXISTS run_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
module_id INTEGER NOT NULL REFERENCES module(id),
group_run_id INTEGER REFERENCES group_run(id),
started_at TEXT DEFAULT (datetime('now')),
finished_at TEXT,
row_count INTEGER,
status TEXT NOT NULL DEFAULT 'running',
error TEXT,
resolved_source_sql TEXT,
merge_sql TEXT,
watermark_values_json TEXT,
jrunner_stdout TEXT,
jrunner_stderr TEXT,
hook_log TEXT
);
CREATE TABLE IF NOT EXISTS settings (
key TEXT PRIMARY KEY,
value TEXT
);
"""
def get_db_path() -> str:
    """Return the SQLite file path from the 'database' key of the config."""
    cfg = get_config()
    return cfg["database"]
def init_db():
    """Create all tables if they don't exist (idempotent: CREATE IF NOT EXISTS)."""
    with get_conn() as conn:
        conn.executescript(SCHEMA_SQL)
@contextmanager
def get_conn():
    """Get a SQLite connection with row_factory set.

    Yields a connection with dict-style rows and foreign-key enforcement on.
    Commits on clean exit, rolls back on any exception (re-raised), and
    always closes the handle.
    """
    conn = sqlite3.connect(get_db_path())
    conn.row_factory = sqlite3.Row
    # SQLite leaves FK enforcement off per-connection unless enabled.
    conn.execute("PRAGMA foreign_keys = ON")
    try:
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
# ---------------------------------------------------------------------------
# Drivers
# ---------------------------------------------------------------------------
def create_driver(name: str, jar_file: str, class_name: str,
                  url_template: str = None) -> dict:
    """Insert a driver row and return the stored record as a dict."""
    with get_conn() as conn:
        new_id = conn.execute(
            "INSERT INTO driver (name, jar_file, class_name, url_template) "
            "VALUES (?, ?, ?, ?)",
            (name, jar_file, class_name, url_template),
        ).lastrowid
        row = conn.execute("SELECT * FROM driver WHERE id = ?", (new_id,)).fetchone()
        return dict(row)
def get_driver(driver_id: int) -> dict | None:
    """Fetch one driver by id, or None if absent."""
    with get_conn() as conn:
        row = conn.execute("SELECT * FROM driver WHERE id = ?", (driver_id,)).fetchone()
        return dict(row) if row else None
def list_drivers() -> list[dict]:
    """Return every driver row, ordered by name."""
    with get_conn() as conn:
        return [dict(r) for r in conn.execute(
            "SELECT * FROM driver ORDER BY name"
        ).fetchall()]
def delete_driver(driver_id: int):
    """Delete a driver row; silently a no-op if the id does not exist."""
    with get_conn() as conn:
        conn.execute("DELETE FROM driver WHERE id = ?", (driver_id,))
# ---------------------------------------------------------------------------
# Connections
# ---------------------------------------------------------------------------
def create_connection(name: str, jdbc_url: str, driver_id: int = None,
                      username: str = None, password: str = None,
                      default_dest_connection_id: int = None,
                      default_dest_schema: str = None,
                      notes: str = None) -> dict:
    """Insert a connection row and return the stored record as a dict."""
    values = (name, jdbc_url, driver_id, username, password,
              default_dest_connection_id, default_dest_schema, notes)
    with get_conn() as conn:
        new_id = conn.execute(
            "INSERT INTO connection (name, jdbc_url, driver_id, username, password, "
            "default_dest_connection_id, default_dest_schema, notes) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            values,
        ).lastrowid
        row = conn.execute(
            "SELECT * FROM connection WHERE id = ?", (new_id,)
        ).fetchone()
        return dict(row)
def get_connection(conn_id: int) -> dict | None:
    """Fetch one connection by id, or None if absent."""
    with get_conn() as conn:
        row = conn.execute("SELECT * FROM connection WHERE id = ?", (conn_id,)).fetchone()
        return dict(row) if row else None
def list_connections() -> list[dict]:
    """Return every connection row, ordered by name."""
    with get_conn() as conn:
        return [dict(r) for r in conn.execute(
            "SELECT * FROM connection ORDER BY name"
        ).fetchall()]
def update_connection(conn_id: int, **kwargs) -> dict:
    """Partially update a connection; returns the fresh row.

    Only whitelisted column names are interpolated into the SET clause, so
    the f-string SQL is injection-safe.  None values are skipped, which
    means a column cannot be cleared to NULL through this function.
    """
    allowed = {"name", "jdbc_url", "driver_id", "username", "password",
               "default_dest_connection_id", "default_dest_schema", "notes"}
    fields = {k: v for k, v in kwargs.items() if k in allowed and v is not None}
    if not fields:
        return get_connection(conn_id)
    sets = ", ".join(f"{k} = ?" for k in fields)
    vals = list(fields.values())
    with get_conn() as conn:
        conn.execute(
            f"UPDATE connection SET {sets}, updated_at = datetime('now') WHERE id = ?",
            vals + [conn_id],
        )
    return get_connection(conn_id)
def delete_connection(conn_id: int):
    """Delete a connection row; silently a no-op if the id does not exist."""
    with get_conn() as conn:
        conn.execute("DELETE FROM connection WHERE id = ?", (conn_id,))
# ---------------------------------------------------------------------------
# Modules
# ---------------------------------------------------------------------------
def create_module(name: str, source_connection_id: int, dest_connection_id: int,
                  dest_table: str, source_query: str, merge_strategy: str = "full",
                  merge_key: str = None) -> dict:
    """Insert a module row and return the stored record as a dict."""
    values = (name, source_connection_id, dest_connection_id, dest_table,
              source_query, merge_strategy, merge_key)
    with get_conn() as conn:
        new_id = conn.execute(
            "INSERT INTO module (name, source_connection_id, dest_connection_id, "
            "dest_table, source_query, merge_strategy, merge_key) "
            "VALUES (?, ?, ?, ?, ?, ?, ?)",
            values,
        ).lastrowid
        row = conn.execute("SELECT * FROM module WHERE id = ?", (new_id,)).fetchone()
        return dict(row)
def get_module(module_id: int) -> dict | None:
    """Fetch one module by id, or None if absent."""
    with get_conn() as conn:
        row = conn.execute("SELECT * FROM module WHERE id = ?", (module_id,)).fetchone()
        return dict(row) if row else None
def list_modules() -> list[dict]:
    """Return every module row, ordered by name."""
    with get_conn() as conn:
        return [dict(r) for r in conn.execute(
            "SELECT * FROM module ORDER BY name"
        ).fetchall()]
def update_module(module_id: int, **kwargs) -> dict:
    """Partially update a module; returns the fresh row.

    Whitelisted columns only (injection-safe f-string SET clause); None
    values are skipped, so e.g. merge_key cannot be cleared to NULL here.
    """
    allowed = {"name", "source_connection_id", "dest_connection_id", "dest_table",
               "source_query", "merge_strategy", "merge_key", "enabled"}
    fields = {k: v for k, v in kwargs.items() if k in allowed and v is not None}
    if not fields:
        return get_module(module_id)
    sets = ", ".join(f"{k} = ?" for k in fields)
    vals = list(fields.values())
    with get_conn() as conn:
        conn.execute(
            f"UPDATE module SET {sets}, updated_at = datetime('now') WHERE id = ?",
            vals + [module_id],
        )
    return get_module(module_id)
def acquire_module_lock(module_id: int, pid: str) -> bool:
    """Atomically acquire the run lock. Returns True if acquired.

    The guarded UPDATE (WHERE ... AND running = 0) makes test-and-set a
    single statement, so two concurrent callers cannot both win.
    """
    with get_conn() as conn:
        cur = conn.execute(
            "UPDATE module SET running = 1, running_pid = ?, "
            "running_since = datetime('now') "
            "WHERE id = ? AND running = 0",
            (pid, module_id),
        )
        # rowcount is 1 only for the caller that flipped running 0 -> 1.
        return cur.rowcount > 0
def release_module_lock(module_id: int):
    """Release the run lock, clearing the pid/since bookkeeping columns."""
    with get_conn() as conn:
        conn.execute(
            "UPDATE module SET running = 0, running_pid = NULL, "
            "running_since = NULL WHERE id = ?",
            (module_id,),
        )
def clear_stale_locks(max_age_hours: int = 24):
    """Clear run locks held longer than max_age_hours.

    NOTE(review): the original docstring also claimed dead-PID detection,
    but only lock age is checked — running_pid is stored yet never
    inspected here.
    """
    with get_conn() as conn:
        conn.execute(
            "UPDATE module SET running = 0, running_pid = NULL, running_since = NULL "
            "WHERE running = 1 AND running_since < datetime('now', ?)",
            (f"-{max_age_hours} hours",),
        )
def delete_module(module_id: int):
    """Delete a module row (watermarks/hooks cascade via FK ON DELETE CASCADE)."""
    with get_conn() as conn:
        conn.execute("DELETE FROM module WHERE id = ?", (module_id,))
# ---------------------------------------------------------------------------
# Watermarks
# ---------------------------------------------------------------------------
def create_watermark(module_id: int, name: str, connection_id: int,
                     resolver_sql: str, default_value: str = None) -> dict:
    """Insert a watermark row and return the stored record as a dict."""
    with get_conn() as conn:
        new_id = conn.execute(
            "INSERT INTO watermark (module_id, name, connection_id, resolver_sql, "
            "default_value) VALUES (?, ?, ?, ?, ?)",
            (module_id, name, connection_id, resolver_sql, default_value),
        ).lastrowid
        row = conn.execute(
            "SELECT * FROM watermark WHERE id = ?", (new_id,)
        ).fetchone()
        return dict(row)
def get_watermark(watermark_id: int) -> dict | None:
    """Fetch one watermark by id, or None if absent."""
    with get_conn() as conn:
        row = conn.execute(
            "SELECT * FROM watermark WHERE id = ?", (watermark_id,)
        ).fetchone()
        return dict(row) if row else None
def list_watermarks(module_id: int) -> list[dict]:
    """Return a module's watermarks, ordered by name."""
    with get_conn() as conn:
        return [dict(r) for r in conn.execute(
            "SELECT * FROM watermark WHERE module_id = ? ORDER BY name",
            (module_id,),
        ).fetchall()]
def update_watermark(watermark_id: int, **kwargs) -> dict:
    """Partially update a watermark; returns the fresh row.

    Whitelisted columns only (injection-safe f-string SET clause); None
    values are skipped, so default_value cannot be cleared to NULL here.
    """
    allowed = {"name", "connection_id", "resolver_sql", "default_value"}
    fields = {k: v for k, v in kwargs.items() if k in allowed and v is not None}
    if not fields:
        return get_watermark(watermark_id)
    sets = ", ".join(f"{k} = ?" for k in fields)
    vals = list(fields.values())
    with get_conn() as conn:
        conn.execute(
            f"UPDATE watermark SET {sets} WHERE id = ?",
            vals + [watermark_id],
        )
    return get_watermark(watermark_id)
def delete_watermark(watermark_id: int):
    """Delete a watermark row; silently a no-op if the id does not exist."""
    with get_conn() as conn:
        conn.execute("DELETE FROM watermark WHERE id = ?", (watermark_id,))
# ---------------------------------------------------------------------------
# Hooks
# ---------------------------------------------------------------------------
def create_hook(module_id: int, sql: str, run_order: int = 0,
                connection_id: int = None, run_on: str = "success") -> dict:
    """Insert a hook row and return the stored record as a dict."""
    with get_conn() as conn:
        new_id = conn.execute(
            "INSERT INTO hook (module_id, run_order, connection_id, sql, run_on) "
            "VALUES (?, ?, ?, ?, ?)",
            (module_id, run_order, connection_id, sql, run_on),
        ).lastrowid
        row = conn.execute("SELECT * FROM hook WHERE id = ?", (new_id,)).fetchone()
        return dict(row)
def get_hook(hook_id: int) -> dict | None:
    """Fetch one hook by id, or None if absent."""
    with get_conn() as conn:
        row = conn.execute("SELECT * FROM hook WHERE id = ?", (hook_id,)).fetchone()
        return dict(row) if row else None
def list_hooks(module_id: int) -> list[dict]:
    """Return a module's hooks, ordered by run_order."""
    with get_conn() as conn:
        return [dict(r) for r in conn.execute(
            "SELECT * FROM hook WHERE module_id = ? ORDER BY run_order",
            (module_id,),
        ).fetchall()]
def update_hook(hook_id: int, **kwargs) -> dict:
    """Partially update a hook; returns the fresh row.

    Whitelisted columns only (injection-safe f-string SET clause); None
    values are skipped, so connection_id cannot be cleared to NULL here.
    """
    allowed = {"run_order", "connection_id", "sql", "run_on"}
    fields = {k: v for k, v in kwargs.items() if k in allowed and v is not None}
    if not fields:
        return get_hook(hook_id)
    sets = ", ".join(f"{k} = ?" for k in fields)
    vals = list(fields.values())
    with get_conn() as conn:
        conn.execute(f"UPDATE hook SET {sets} WHERE id = ?", vals + [hook_id])
    return get_hook(hook_id)
def delete_hook(hook_id: int):
    """Delete a hook row; silently a no-op if the id does not exist."""
    with get_conn() as conn:
        conn.execute("DELETE FROM hook WHERE id = ?", (hook_id,))
# ---------------------------------------------------------------------------
# Groups
# ---------------------------------------------------------------------------
def create_group(name: str) -> dict:
    """Create a named group and return the newly inserted row as a dict."""
    with get_conn() as conn:
        new_id = conn.execute(
            "INSERT INTO grp (name) VALUES (?)", (name,)
        ).lastrowid
        row = conn.execute("SELECT * FROM grp WHERE id = ?", (new_id,)).fetchone()
        return dict(row)
def get_group(group_id: int) -> dict | None:
    """Fetch a group with its ordered members and schedules attached.

    Returns None when the group does not exist. Members carry the joined
    module name as ``module_name``.
    """
    members_sql = (
        "SELECT gm.*, m.name AS module_name FROM group_member gm "
        "JOIN module m ON gm.module_id = m.id "
        "WHERE gm.group_id = ? ORDER BY gm.run_order"
    )
    schedules_sql = "SELECT * FROM schedule WHERE group_id = ? ORDER BY id"
    with get_conn() as conn:
        base = conn.execute(
            "SELECT * FROM grp WHERE id = ?", (group_id,)
        ).fetchone()
        if base is None:
            return None
        group = dict(base)
        group["members"] = [
            dict(r) for r in conn.execute(members_sql, (group_id,)).fetchall()
        ]
        group["schedules"] = [
            dict(r) for r in conn.execute(schedules_sql, (group_id,)).fetchall()
        ]
    return group
def list_groups() -> list[dict]:
    """Return all groups ordered by name, each with members and schedules.

    The original implementation called get_group() once per group, which
    opened a second database connection per group while the outer one was
    still held (N+1 queries, nested connections). Here everything is loaded
    with three set-based queries on the single connection and stitched
    together in Python, preserving the same output shape and ordering.
    """
    with get_conn() as conn:
        groups = [dict(r) for r in conn.execute(
            "SELECT * FROM grp ORDER BY name"
        ).fetchall()]
        # Members joined with module names, bucketed by group, ordered as
        # get_group() orders them (by run_order within each group).
        members_by_group: dict[int, list[dict]] = {}
        for r in conn.execute(
            "SELECT gm.*, m.name AS module_name FROM group_member gm "
            "JOIN module m ON gm.module_id = m.id "
            "ORDER BY gm.group_id, gm.run_order"
        ).fetchall():
            members_by_group.setdefault(r["group_id"], []).append(dict(r))
        schedules_by_group: dict[int, list[dict]] = {}
        for r in conn.execute(
            "SELECT * FROM schedule ORDER BY group_id, id"
        ).fetchall():
            schedules_by_group.setdefault(r["group_id"], []).append(dict(r))
    for g in groups:
        g["members"] = members_by_group.get(g["id"], [])
        g["schedules"] = schedules_by_group.get(g["id"], [])
    return groups
def delete_group(group_id: int):
    """Delete the group row with the given id (no-op if it does not exist)."""
    with get_conn() as conn:
        conn.execute("DELETE FROM grp WHERE id = ?", [group_id])
def add_group_member(group_id: int, module_id: int, run_order: int = 0) -> dict:
    """Attach a module to a group at the given run_order; return the new row."""
    insert_sql = (
        "INSERT INTO group_member (group_id, module_id, run_order) "
        "VALUES (?, ?, ?)"
    )
    with get_conn() as conn:
        new_id = conn.execute(
            insert_sql, (group_id, module_id, run_order)
        ).lastrowid
        row = conn.execute(
            "SELECT * FROM group_member WHERE id = ?", (new_id,)
        ).fetchone()
        return dict(row)
def remove_group_member(member_id: int):
    """Detach a module from its group by membership id (no-op if absent)."""
    with get_conn() as conn:
        conn.execute("DELETE FROM group_member WHERE id = ?", [member_id])
# ---------------------------------------------------------------------------
# Schedules
# ---------------------------------------------------------------------------
def create_schedule(group_id: int, cron_expr: str, enabled: bool = True) -> dict:
    """Create a cron schedule for a group and return the new row.

    ``enabled`` is stored as 0/1 since SQLite has no boolean type.
    """
    with get_conn() as conn:
        new_id = conn.execute(
            "INSERT INTO schedule (group_id, cron_expr, enabled) VALUES (?, ?, ?)",
            (group_id, cron_expr, int(enabled)),
        ).lastrowid
        row = conn.execute(
            "SELECT * FROM schedule WHERE id = ?", (new_id,)
        ).fetchone()
        return dict(row)
def get_schedule(schedule_id: int) -> dict | None:
    """Fetch a single schedule row, or None when the id is unknown."""
    with get_conn() as conn:
        found = conn.execute(
            "SELECT * FROM schedule WHERE id = ?", (schedule_id,)
        ).fetchone()
    return dict(found) if found else None
def list_schedules() -> list[dict]:
    """Return all schedules with their group name joined in, ordered by group."""
    query = (
        "SELECT s.*, g.name AS group_name FROM schedule s "
        "JOIN grp g ON s.group_id = g.id ORDER BY g.name"
    )
    with get_conn() as conn:
        return [dict(row) for row in conn.execute(query).fetchall()]
def update_schedule(schedule_id: int, **kwargs) -> dict:
    """Update cron_expr and/or enabled on a schedule; return the resulting row.

    Unknown keys and None values are ignored. ``enabled`` is normalized to
    0/1 before storage, matching how create_schedule writes it (the previous
    version stored the raw value, leaving mixed bool/int representations in
    the column).
    """
    allowed = {"cron_expr", "enabled"}
    fields = {k: v for k, v in kwargs.items() if k in allowed and v is not None}
    if not fields:
        return get_schedule(schedule_id)
    if "enabled" in fields:
        # Keep the stored representation consistent with create_schedule.
        fields["enabled"] = int(fields["enabled"])
    sets = ", ".join(f"{k} = ?" for k in fields)
    vals = list(fields.values())
    with get_conn() as conn:
        conn.execute(f"UPDATE schedule SET {sets} WHERE id = ?", vals + [schedule_id])
    return get_schedule(schedule_id)
def delete_schedule(schedule_id: int):
    """Delete the schedule row with the given id (no-op if it does not exist)."""
    with get_conn() as conn:
        conn.execute("DELETE FROM schedule WHERE id = ?", [schedule_id])
# ---------------------------------------------------------------------------
# Group Runs
# ---------------------------------------------------------------------------
def create_group_run(group_id: int, triggered_by: str = None) -> dict:
    """Insert a new group_run row and return it as a dict."""
    with get_conn() as conn:
        new_id = conn.execute(
            "INSERT INTO group_run (group_id, triggered_by) VALUES (?, ?)",
            (group_id, triggered_by),
        ).lastrowid
        row = conn.execute(
            "SELECT * FROM group_run WHERE id = ?", (new_id,)
        ).fetchone()
        return dict(row)
def finish_group_run(group_run_id: int, status: str):
    """Mark a group_run finished: stamp finished_at and record its status."""
    update_sql = (
        "UPDATE group_run SET finished_at = datetime('now'), status = ? "
        "WHERE id = ?"
    )
    with get_conn() as conn:
        conn.execute(update_sql, (status, group_run_id))
def get_group_run(group_run_id: int) -> dict | None:
    """Fetch one group_run with its child run_log rows attached.

    Returns None when the id is unknown. Child runs carry the joined module
    name as ``module_name`` and are ordered by run id.
    """
    runs_sql = (
        "SELECT rl.*, m.name AS module_name FROM run_log rl "
        "JOIN module m ON rl.module_id = m.id "
        "WHERE rl.group_run_id = ? ORDER BY rl.id"
    )
    with get_conn() as conn:
        header = conn.execute(
            "SELECT * FROM group_run WHERE id = ?", (group_run_id,)
        ).fetchone()
        if header is None:
            return None
        result = dict(header)
        result["runs"] = [
            dict(r) for r in conn.execute(runs_sql, (group_run_id,)).fetchall()
        ]
    return result
def list_group_runs(group_id: int = None, limit: int = 50) -> list[dict]:
    """List recent group runs (newest first), optionally for one group only."""
    base = (
        "SELECT gr.*, g.name AS group_name FROM group_run gr "
        "JOIN grp g ON gr.group_id = g.id "
    )
    with get_conn() as conn:
        if group_id:
            cursor = conn.execute(
                base + "WHERE gr.group_id = ? ORDER BY gr.id DESC LIMIT ?",
                (group_id, limit),
            )
        else:
            cursor = conn.execute(
                base + "ORDER BY gr.id DESC LIMIT ?", (limit,)
            )
        return [dict(row) for row in cursor.fetchall()]
# ---------------------------------------------------------------------------
# Run Log
# ---------------------------------------------------------------------------
def create_run(module_id: int, group_run_id: int = None) -> dict:
    """Insert a run_log row for a module (optionally tied to a group run)."""
    with get_conn() as conn:
        new_id = conn.execute(
            "INSERT INTO run_log (module_id, group_run_id) VALUES (?, ?)",
            (module_id, group_run_id),
        ).lastrowid
        row = conn.execute(
            "SELECT * FROM run_log WHERE id = ?", (new_id,)
        ).fetchone()
        return dict(row)
def log_run_sql(run_id: int, resolved_source_sql: str, merge_sql: str = None):
    """Record the resolved source SQL (and optional merge SQL) on a run row."""
    update_sql = (
        "UPDATE run_log SET resolved_source_sql = ?, merge_sql = ? WHERE id = ?"
    )
    with get_conn() as conn:
        conn.execute(update_sql, (resolved_source_sql, merge_sql, run_id))
def log_run_output(run_id: int, jrunner_stdout: str = None,
                   jrunner_stderr: str = None, hook_log: str = None,
                   watermark_values_json: str = None):
    """Persist whichever output fields were provided on a run_log row.

    Arguments left as None are not touched, so callers can update fields
    incrementally during a run. Does nothing when every argument is None.
    """
    candidates = {
        "jrunner_stdout": jrunner_stdout,
        "jrunner_stderr": jrunner_stderr,
        "hook_log": hook_log,
        "watermark_values_json": watermark_values_json,
    }
    updates = {col: val for col, val in candidates.items() if val is not None}
    if not updates:
        return
    assignments = ", ".join(f"{col} = ?" for col in updates)
    with get_conn() as conn:
        conn.execute(
            f"UPDATE run_log SET {assignments} WHERE id = ?",
            [*updates.values(), run_id],
        )
def finish_run(run_id: int, status: str, row_count: int = None, error: str = None):
    """Mark a run finished: stamp finished_at, status, row count, and error."""
    update_sql = (
        "UPDATE run_log SET finished_at = datetime('now'), status = ?, "
        "row_count = ?, error = ? WHERE id = ?"
    )
    with get_conn() as conn:
        conn.execute(update_sql, (status, row_count, error, run_id))
def get_run(run_id: int) -> dict | None:
    """Fetch a single run_log row, or None when the id is unknown."""
    with get_conn() as conn:
        found = conn.execute(
            "SELECT * FROM run_log WHERE id = ?", (run_id,)
        ).fetchone()
    return dict(found) if found else None
def list_runs(module_id: int = None, status: str = None,
              limit: int = 50) -> list[dict]:
    """List recent runs (newest first), optionally filtered by module/status.

    Uses a LEFT JOIN so runs survive even if their module row was deleted;
    ``module_name`` is NULL in that case.
    """
    conditions, params = [], []
    if module_id:
        conditions.append("r.module_id = ?")
        params.append(module_id)
    if status:
        conditions.append("r.status = ?")
        params.append(status)
    where_sql = ("WHERE " + " AND ".join(conditions)) if conditions else ""
    query = (
        "SELECT r.*, m.name AS module_name FROM run_log r "
        "LEFT JOIN module m ON r.module_id = m.id "
        f"{where_sql} ORDER BY r.id DESC LIMIT ?"
    )
    with get_conn() as conn:
        rows = conn.execute(query, [*params, limit]).fetchall()
        return [dict(row) for row in rows]
# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
def get_setting(key: str) -> str | None:
    """Return the stored value for *key*, or None when the key is unset."""
    with get_conn() as conn:
        found = conn.execute(
            "SELECT value FROM settings WHERE key = ?", (key,)
        ).fetchone()
    return found["value"] if found else None
def set_setting(key: str, value: str):
    """Insert or overwrite a settings key/value pair (SQLite upsert)."""
    upsert_sql = (
        "INSERT INTO settings (key, value) VALUES (?, ?) "
        "ON CONFLICT(key) DO UPDATE SET value = excluded.value"
    )
    with get_conn() as conn:
        conn.execute(upsert_sql, (key, value))

View File

@ -0,0 +1,462 @@
"""Introspect source systems — browse tables, fetch columns, generate queries and DDL."""
import csv
import io
import os
import re
import subprocess
import tempfile
from dataclasses import dataclass
from config import get_config
from engine.db import get_connection
@dataclass
class RemoteTable:
schema: str
name: str
table_type: str
linked_server: str = None
linked_db: str = None
@property
def full_name(self) -> str:
if self.linked_server:
return f"[{self.linked_server}].[{self.linked_db}].{self.schema}.{self.name}"
return f"{self.schema}.{self.name}"
@property
def type_label(self) -> str:
mapping = {
"BASE TABLE": "Table", "VIEW": "View",
"P": "Table", "L": "View", "T": "Table", "V": "View",
}
return mapping.get(self.table_type, self.table_type)
def to_dict(self) -> dict:
return {"schema": self.schema, "name": self.name,
"table_type": self.table_type, "type_label": self.type_label,
"full_name": self.full_name,
"linked_server": self.linked_server,
"linked_db": self.linked_db}
@dataclass
class RemoteColumn:
    """A column on a remote table, as reported by the source's catalog query."""
    name: str           # column name exactly as the source reports it
    data_type: str      # source-native type string (e.g. 'varchar', 'DECIMAL')
    position: int       # ordinal position from the catalog
    nullable: bool = True

    def to_dict(self) -> dict:
        """Serialize to a plain dict for JSON responses."""
        return {
            "name": self.name,
            "data_type": self.data_type,
            "position": self.position,
            "nullable": self.nullable,
        }
# ---------------------------------------------------------------------------
# JDBC type to PostgreSQL type mapping
# ---------------------------------------------------------------------------
# Source-type (lowercased, parenthesized length stripped) -> PostgreSQL type.
# Anything not listed falls back to "text" in map_type_pg().
TYPE_MAP_PG = {
    # integers
    "int": "integer", "integer": "integer", "smallint": "smallint",
    "bigint": "bigint", "tinyint": "smallint",
    # floats
    "float": "double precision", "real": "real", "double": "double precision",
    # decimal
    "decimal": "numeric", "numeric": "numeric", "money": "numeric(19,4)",
    "smallmoney": "numeric(10,4)",
    # strings
    "varchar": "text", "char": "text", "nvarchar": "text", "nchar": "text",
    "text": "text", "ntext": "text", "character": "text",
    # dates
    "date": "date", "datetime": "timestamp", "datetime2": "timestamp",
    "smalldatetime": "timestamp", "timestamp": "timestamp",
    "timestamptz": "timestamptz",
    # boolean
    "bit": "boolean",
    # binary
    "binary": "bytea", "varbinary": "bytea", "image": "bytea",
    # uuid
    "uniqueidentifier": "uuid",
}
def map_type_pg(source_type: str) -> str:
    """Map a source column type to a PostgreSQL type; unknown types become text.

    Case-insensitive; any parenthesized precision suffix ('varchar(50)')
    is stripped before lookup.
    """
    base, _, _ = source_type.lower().partition("(")
    return TYPE_MAP_PG.get(base.strip(), "text")
# ---------------------------------------------------------------------------
# jrunner query helper
# ---------------------------------------------------------------------------
def _resolve_password(password: str) -> str:
"""Resolve a password — if it starts with $, look up the env var."""
if password and password.startswith("$"):
return os.environ.get(password[1:], "")
return password or ""
def run_jrunner_query(connection_id: int, sql: str) -> str:
    """Execute *sql* against a saved connection via jrunner CSV mode.

    Writes the SQL to a temp file (jrunner reads queries from disk), invokes
    jrunner with a 60s timeout, and returns its raw stdout.

    Raises:
        ValueError: when the connection id is unknown.
        RuntimeError: when jrunner exits non-zero.
    """
    conn = get_connection(connection_id)
    if conn is None:
        raise ValueError(f"Connection {connection_id} not found")
    jrunner_bin = get_config()["jrunner_path"]
    pw = _resolve_password(conn["password"])
    with tempfile.NamedTemporaryFile(mode="w", suffix=".sql", delete=False) as tmp:
        tmp.write(sql)
        script_path = tmp.name
    try:
        proc = subprocess.run(
            [jrunner_bin,
             "-scu", conn["jdbc_url"],
             "-scn", conn["username"] or "",
             "-scp", pw,
             "-sq", script_path,
             "-f", "csv"],
            capture_output=True, text=True, timeout=60,
        )
    finally:
        os.unlink(script_path)
    if proc.returncode != 0:
        raise RuntimeError(f"jrunner error: {proc.stderr or proc.stdout}")
    return proc.stdout
def _parse_csv(output: str) -> list[list[str]]:
"""Parse CSV output from jrunner, skipping the header."""
reader = csv.reader(io.StringIO(output))
header = next(reader, None)
if not header:
return []
return [row for row in reader if row]
# ---------------------------------------------------------------------------
# Table browsing
# ---------------------------------------------------------------------------
def _detect_source_type(jdbc_url: str) -> str:
"""Detect source type from JDBC URL."""
url = jdbc_url.lower()
if "as400" in url:
return "as400"
if "sqlserver" in url:
return "sqlserver"
if "postgresql" in url:
return "postgresql"
if "clickhouse" in url:
return "clickhouse"
if "mysql" in url:
return "mysql"
return "unknown"
def fetch_tables(connection_id: int, schema_filter: str | None = None) -> list[RemoteTable]:
    """Fetch the list of tables and views visible on a source connection.

    Builds a catalog query appropriate to the detected source type (DB2 for i,
    SQL Server incl. linked servers, PostgreSQL, ClickHouse, MySQL, or a
    generic INFORMATION_SCHEMA fallback), runs it via jrunner, and returns
    RemoteTable entries. For SQL Server linked/local-db paths the returned
    tables carry linked_server/linked_db so callers can build 4-part names.

    Raises:
        ValueError: when the connection id is unknown.

    NOTE(review): schema_filter is interpolated directly into the SQL text
    with f-strings (no parameter binding); a value containing a single quote
    breaks the query. This should only ever receive trusted admin input —
    consider validating/escaping upstream.
    """
    conn = get_connection(connection_id)
    if not conn:
        raise ValueError(f"Connection {connection_id} not found")
    source_type = _detect_source_type(conn["jdbc_url"])
    linked_server = None
    linked_db = None
    if source_type == "as400":
        # DB2 for i catalog; 'Q%' schemas are IBM-supplied system libraries.
        sql = (
            "SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
            "FROM QSYS2.SYSTABLES "
            "WHERE TABLE_SCHEMA NOT LIKE 'Q%' "
        )
        if schema_filter:
            sql += f"AND TABLE_SCHEMA = '{schema_filter}' "
        sql += "ORDER BY TABLE_SCHEMA, TABLE_NAME"
    elif source_type == "sqlserver":
        # Parse schema_filter formats:
        #   "LINKED.DB"        -> linked server + database
        #   "LINKED.DB.SCHEMA" -> linked server + database + schema
        #   ".DB"              -> database only (no linked server)
        #   ".DB.SCHEMA"       -> database + schema
        #   "SCHEMA"           -> schema only (current database)
        linked_schema = None
        local_db = None
        if schema_filter and "." in schema_filter:
            parts = schema_filter.split(".")
            if parts[0] == "":
                # Starts with dot: ".DB" or ".DB.SCHEMA"
                local_db = parts[1] if len(parts) > 1 else None
                linked_schema = parts[2] if len(parts) > 2 else None
            elif len(parts) == 2:
                linked_server, linked_db = parts
            elif len(parts) >= 3:
                linked_server, linked_db, linked_schema = parts[0], parts[1], parts[2]
        if linked_server:
            # Four-part query through the linked server's INFORMATION_SCHEMA.
            sql = (
                f"SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
                f"FROM [{linked_server}].[{linked_db}].INFORMATION_SCHEMA.TABLES "
                f"WHERE TABLE_TYPE IN ('BASE TABLE','VIEW') "
            )
            if linked_schema:
                sql += f"AND TABLE_SCHEMA = '{linked_schema}' "
            sql += "ORDER BY TABLE_SCHEMA, TABLE_NAME"
        elif local_db:
            # Three-part query into another database on the same server.
            sql = (
                f"SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
                f"FROM [{local_db}].INFORMATION_SCHEMA.TABLES "
                f"WHERE TABLE_TYPE IN ('BASE TABLE','VIEW') "
            )
            if linked_schema:
                sql += f"AND TABLE_SCHEMA = '{linked_schema}' "
            sql += "ORDER BY TABLE_SCHEMA, TABLE_NAME"
        else:
            # Current database; schema_filter (if any) is a plain schema name.
            sql = (
                "SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
                "FROM INFORMATION_SCHEMA.TABLES "
                "WHERE TABLE_TYPE IN ('BASE TABLE','VIEW') "
            )
            if schema_filter:
                sql += f"AND TABLE_SCHEMA = '{schema_filter}' "
            sql += "ORDER BY TABLE_SCHEMA, TABLE_NAME"
    elif source_type == "postgresql":
        sql = (
            "SELECT table_schema, table_name, table_type "
            "FROM information_schema.tables "
            "WHERE table_schema NOT IN ('pg_catalog','information_schema') "
        )
        if schema_filter:
            sql += f"AND table_schema = '{schema_filter}' "
        sql += "ORDER BY table_schema, table_name"
    elif source_type == "clickhouse":
        # ClickHouse calls schemas 'databases'; aliases keep the row shape uniform.
        sql = (
            "SELECT database AS TABLE_SCHEMA, name AS TABLE_NAME, engine AS TABLE_TYPE "
            "FROM system.tables "
            "WHERE database NOT IN ('system','INFORMATION_SCHEMA','information_schema') "
        )
        if schema_filter:
            sql += f"AND database = '{schema_filter}' "
        sql += "ORDER BY database, name"
    elif source_type == "mysql":
        sql = (
            "SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
            "FROM INFORMATION_SCHEMA.TABLES "
            "WHERE TABLE_SCHEMA NOT IN ('mysql','information_schema','performance_schema','sys') "
        )
        if schema_filter:
            sql += f"AND TABLE_SCHEMA = '{schema_filter}' "
        sql += "ORDER BY TABLE_SCHEMA, TABLE_NAME"
    else:
        # Generic fallback — INFORMATION_SCHEMA is widely supported
        sql = (
            "SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
            "FROM INFORMATION_SCHEMA.TABLES "
            "ORDER BY TABLE_SCHEMA, TABLE_NAME"
        )
    # For database-only queries, store the db in linked_db so downstream can reference it
    effective_db = linked_db if linked_server else (local_db if source_type == "sqlserver" else None)
    rows = _parse_csv(run_jrunner_query(connection_id, sql))
    return [RemoteTable(schema=r[0].strip(), name=r[1].strip(), table_type=r[2].strip(),
                        linked_server=linked_server if source_type == "sqlserver" else None,
                        linked_db=effective_db)
            for r in rows if len(r) >= 3]
def fetch_columns(connection_id: int, schema: str, table: str,
                  linked_server: str | None = None, linked_db: str | None = None) -> list[RemoteColumn]:
    """Fetch column metadata (name, type, ordinal position) for one table.

    Picks the catalog query per detected source type; linked_server/linked_db
    route SQL Server lookups through a linked server or another database.
    Rows with fewer than 3 fields are skipped. ``nullable`` is not queried
    here and stays at the dataclass default (True).

    Raises:
        ValueError: when the connection id is unknown.

    NOTE(review): schema/table are interpolated into the SQL with f-strings
    (no parameter binding) — trusted input only; single quotes break the query.
    """
    conn = get_connection(connection_id)
    if not conn:
        raise ValueError(f"Connection {connection_id} not found")
    source_type = _detect_source_type(conn["jdbc_url"])
    if source_type == "as400":
        # DB2 for i column catalog.
        sql = (
            f"SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION "
            f"FROM QSYS2.SYSCOLUMNS "
            f"WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}' "
            f"ORDER BY ORDINAL_POSITION"
        )
    elif source_type == "clickhouse":
        sql = (
            f"SELECT name, type, position() "
            f"FROM system.columns "
            f"WHERE database = '{schema}' AND table = '{table}' "
            f"ORDER BY position"
        )
    elif source_type == "sqlserver" and linked_server and linked_db:
        # Four-part path through the linked server.
        sql = (
            f"SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION "
            f"FROM [{linked_server}].[{linked_db}].INFORMATION_SCHEMA.COLUMNS "
            f"WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}' "
            f"ORDER BY ORDINAL_POSITION"
        )
    elif source_type == "sqlserver" and linked_db:
        # Cross-database path on the same server.
        sql = (
            f"SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION "
            f"FROM [{linked_db}].INFORMATION_SCHEMA.COLUMNS "
            f"WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}' "
            f"ORDER BY ORDINAL_POSITION"
        )
    else:
        # Works for SQL Server, PostgreSQL, MySQL
        sql = (
            f"SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION "
            f"FROM INFORMATION_SCHEMA.COLUMNS "
            f"WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}' "
            f"ORDER BY ORDINAL_POSITION"
        )
    rows = _parse_csv(run_jrunner_query(connection_id, sql))
    return [RemoteColumn(name=r[0].strip(), data_type=r[1].strip(), position=int(r[2].strip()))
            for r in rows if len(r) >= 3]
# ---------------------------------------------------------------------------
# Query and DDL generation
# ---------------------------------------------------------------------------
_IDENTIFIER_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
def _needs_quoting(name: str) -> bool:
"""Check if a column name needs quoting (has spaces, special chars, etc.)."""
return not _IDENTIFIER_RE.match(name)
def _safe_alias(name: str) -> str:
"""Generate a safe lowercase alias for a column name.
Replaces special characters with underscores and strips leading/trailing
underscores. If the result still needs quoting, wraps in double quotes.
"""
alias = re.sub(r'[^a-z0-9_]', '_', name.lower())
alias = re.sub(r'_+', '_', alias).strip('_')
if not alias or not _IDENTIFIER_RE.match(alias):
alias = f'"{alias}"'
return alias
def generate_select(connection_id: int, schema: str, table: str,
                    columns: list[RemoteColumn] | None = None,
                    linked_server: str | None = None, linked_db: str | None = None) -> str:
    """Generate a formatted SELECT for a source table, aliasing every column.

    Columns are fetched from the source when not supplied. Each column gets a
    lowercase safe alias; char-type columns are wrapped in RTRIM() for SQL
    Server and AS/400, where fixed-width CHAR fields are blank-padded. The
    FROM clause uses bracketed linked-server / cross-database paths when
    linked_server/linked_db are given.
    """
    if columns is None:
        columns = fetch_columns(connection_id, schema, table,
                                linked_server=linked_server, linked_db=linked_db)
    conn = get_connection(connection_id)
    source_type = _detect_source_type(conn["jdbc_url"])
    # Source types whose values benefit from RTRIM (blank-padded storage).
    text_types = {"varchar", "char", "nvarchar", "nchar", "character", "text", "ntext"}
    lines = ["SELECT"]
    for i, col in enumerate(columns):
        # Leading-comma style: every line after the first starts with ",".
        prefix = "  ," if i > 0 else "   "
        alias = _safe_alias(col.name)
        # Quote source column name if it contains special characters
        # SQL Server uses [brackets], others use "double quotes"
        if _needs_quoting(col.name):
            if source_type == "sqlserver":
                col_ref = f"[{col.name}]"
            else:
                col_ref = f'"{col.name}"'
        else:
            col_ref = col.name
        base_type = col.data_type.lower().split("(")[0].strip()
        # RTRIM text columns for SQL Server and AS/400 (padded char fields)
        if base_type in text_types and source_type in ("sqlserver", "as400"):
            expr = f"RTRIM({col_ref})"
            # :<35 pads the expression so the AS aliases line up in a column.
            lines.append(f"{prefix}{expr:<35} AS {alias}")
        else:
            lines.append(f"{prefix}{col_ref:<35} AS {alias}")
    lines.append("FROM")
    if linked_server and linked_db:
        lines.append(f"    [{linked_server}].[{linked_db}].{schema}.{table}")
    elif linked_db:
        lines.append(f"    [{linked_db}].{schema}.{table}")
    else:
        lines.append(f"    {schema}.{table}")
    return "\n".join(lines)
def generate_dest_ddl(dest_table: str, columns: list[RemoteColumn]) -> str:
    """Build CREATE TABLE IF NOT EXISTS DDL for the PostgreSQL destination.

    Column names are run through _safe_alias and types through map_type_pg,
    with names padded so the type column lines up.
    """
    body = ",\n".join(
        f"    {_safe_alias(col.name):<30} {map_type_pg(col.data_type)}"
        for col in columns
    )
    return "\n".join([
        f"CREATE TABLE IF NOT EXISTS {dest_table} (",
        body,
        ");",
    ])
def propose_module(connection_id: int, schema: str, table: str,
                   dest_schema: str = None,
                   linked_server: str = None, linked_db: str = None) -> dict:
    """
    Given a source table, propose a complete module config:
    - source_query (auto-generated SELECT with RTRIM)
    - dest_table
    - dest_ddl (CREATE TABLE for destination)
    - suggested merge_strategy
    - suggested merge_key (first column)
    - suggested watermark_column (if DEX_ROW_TS or similar found)
    """
    columns = fetch_columns(connection_id, schema, table,
                            linked_server=linked_server, linked_db=linked_db)
    source_query = generate_select(connection_id, schema, table, columns,
                                   linked_server=linked_server, linked_db=linked_db)
    # Destination defaults to the public schema when none was chosen.
    if dest_schema is None:
        dest_schema = "public"
    dest_table = f"{dest_schema}.{table.lower()}"
    dest_ddl = generate_dest_ddl(dest_table, columns)
    # Look for a well-known change-timestamp column; its presence suggests
    # an incremental strategy, otherwise fall back to full reloads.
    lowered_names = {c.name.lower() for c in columns}
    timestamp_col = next(
        (cand for cand in ("dex_row_ts", "modified_date", "updated_at",
                           "last_modified", "modifieddate", "changedate")
         if cand in lowered_names),
        None,
    )
    return {
        "name": table.lower(),
        "source_query": source_query,
        "dest_table": dest_table,
        "dest_ddl": dest_ddl,
        "columns": [c.to_dict() for c in columns],
        "merge_strategy": "incremental" if timestamp_col else "full",
        "merge_key": columns[0].name.lower() if columns else None,
        "watermark_column": timestamp_col,
    }

View File

@ -0,0 +1,491 @@
"""Sync runner — orchestrates jrunner transfers, staging, merge, hooks, logging."""
import json
import os
import re
import subprocess
import tempfile
import logging
from config import get_config
from engine.db import (
get_module, get_connection, get_run, create_run, finish_run,
log_run_sql, log_run_output, list_hooks, list_watermarks,
get_group, acquire_module_lock, release_module_lock,
create_group_run, finish_group_run,
)
from engine.introspect import _resolve_password, fetch_columns, map_type_pg
logger = logging.getLogger("pipekit.runner")
class SyncError(Exception):
    """Raised when any stage of a module sync fails (transfer, staging, query)."""
    pass
def _parse_pg_jdbc_url(jdbc_url: str) -> dict:
"""Extract host, port, dbname from a PostgreSQL JDBC URL."""
m = re.match(r"jdbc:postgresql://([^:/]+)(?::(\d+))?/(\w+)", jdbc_url)
if not m:
return {}
return {"host": m.group(1), "port": m.group(2) or "5432", "dbname": m.group(3)}
def _run_dest_sql(conn_info: dict, sql: str) -> str:
    """Run SQL against a database connection.
    Uses psql for PostgreSQL (supports DDL/DML), jrunner query mode for others.

    The SQL is written to a temp file because both psql (-f) and jrunner (-sq)
    read statements from disk; the file is always removed afterwards.

    Raises:
        SyncError: when psql exits non-zero (PostgreSQL branch only).

    NOTE(review): the non-PostgreSQL branch returns jrunner's stdout without
    checking its return code, unlike _run_jrunner_query below — failures are
    silently swallowed. Confirm whether that is intentional best-effort
    behavior for destination-side SQL.
    """
    password = _resolve_password(conn_info["password"])
    with tempfile.NamedTemporaryFile(mode="w", suffix=".sql", delete=False) as f:
        f.write(sql)
        sql_path = f.name
    try:
        if "postgresql" in conn_info["jdbc_url"].lower():
            pg = _parse_pg_jdbc_url(conn_info["jdbc_url"])
            # psql reads the password from the environment, not argv.
            env = os.environ.copy()
            env["PGPASSWORD"] = password
            result = subprocess.run(
                ["psql",
                 "-h", pg.get("host", "localhost"),
                 "-p", pg.get("port", "5432"),
                 "-U", conn_info["username"] or "",
                 "-d", pg.get("dbname", ""),
                 "-f", sql_path],
                capture_output=True, text=True, timeout=300, env=env,
            )
            if result.returncode != 0:
                raise SyncError(f"psql error: {result.stderr}")
            return result.stdout
        else:
            cfg = get_config()
            jrunner = cfg["jrunner_path"]
            result = subprocess.run(
                [jrunner,
                 "-scu", conn_info["jdbc_url"],
                 "-scn", conn_info["username"] or "",
                 "-scp", password,
                 "-sq", sql_path,
                 "-f", "csv"],
                capture_output=True, text=True, timeout=300,
            )
            return result.stdout
    finally:
        os.unlink(sql_path)
def _run_jrunner_query(conn_info: dict, sql: str) -> str:
    """Run a query via jrunner CSV mode against *conn_info*; return stdout.

    Raises SyncError when jrunner exits non-zero. The 60s timeout matches
    the introspection helpers, not the 300s transfer timeout.
    """
    jrunner_bin = get_config()["jrunner_path"]
    pw = _resolve_password(conn_info["password"])
    with tempfile.NamedTemporaryFile(mode="w", suffix=".sql", delete=False) as tmp:
        tmp.write(sql)
        script_path = tmp.name
    try:
        proc = subprocess.run(
            [jrunner_bin,
             "-scu", conn_info["jdbc_url"],
             "-scn", conn_info["username"] or "",
             "-scp", pw,
             "-sq", script_path,
             "-f", "csv"],
            capture_output=True, text=True, timeout=60,
        )
    finally:
        os.unlink(script_path)
    if proc.returncode != 0:
        raise SyncError(f"jrunner query error: {proc.stderr or proc.stdout}")
    return proc.stdout
def _staging_table_exists(dest_conn: dict, staging_table: str) -> bool:
    """Best-effort check whether *staging_table* already exists at the destination.

    A "schema.table" name is split on the dot; anything else defaults the
    schema to "public". Any query failure is treated as "does not exist".
    """
    pieces = staging_table.split(".")
    schema = pieces[0] if len(pieces) == 2 else "public"
    table = pieces[-1]
    probe_sql = (
        f"SELECT 1 FROM information_schema.tables "
        f"WHERE table_schema = '{schema}' AND table_name = '{table}'"
    )
    try:
        return "1" in _run_dest_sql(dest_conn, probe_sql).strip()
    except Exception:
        return False
def _create_staging_from_source(source_conn: dict, dest_conn: dict,
                                source_query: str, staging_table: str) -> None:
    """Ensure a staging table exists and is empty.

    If the table already exists it is simply truncated. Otherwise the source
    query is wrapped in a zero-row "probe" (dialect-specific TOP/LIMIT/FETCH
    FIRST) and handed to jrunner with a destination target; jrunner's output
    is then scanned for "* name: type" lines to recover the column list,
    which is mapped to PostgreSQL types and turned into CREATE TABLE DDL.

    Raises:
        SyncError: when no columns could be parsed from jrunner's output.
    """
    if _staging_table_exists(dest_conn, staging_table):
        _run_dest_sql(dest_conn, f"TRUNCATE TABLE {staging_table};")
        return
    # Imported here to avoid a module-level import cycle with engine.introspect.
    from engine.introspect import _detect_source_type
    source_type = _detect_source_type(source_conn["jdbc_url"])
    base_query = source_query.rstrip().rstrip(";")
    # Zero-row wrapper per dialect: we only want metadata, not data.
    if source_type == "sqlserver":
        probe_query = f"SELECT TOP 0 * FROM ({base_query}) AS probe0"
    elif source_type == "postgresql":
        probe_query = f"SELECT * FROM ({base_query}) AS probe0 LIMIT 0"
    elif source_type == "as400":
        probe_query = f"SELECT * FROM ({base_query}) AS probe0 FETCH FIRST 0 ROWS ONLY"
    else:
        probe_query = f"SELECT * FROM ({base_query}) AS probe0 WHERE 1=0"
    cfg = get_config()
    jrunner = cfg["jrunner_path"]
    src_pw = _resolve_password(source_conn["password"])
    dst_pw = _resolve_password(dest_conn["password"])
    with tempfile.NamedTemporaryFile(mode="w", suffix=".sql", delete=False) as f:
        f.write(probe_query)
        sql_path = f.name
    try:
        # Return code is deliberately ignored: the transfer is expected to
        # fail (the staging table doesn't exist yet) — we only want the
        # column metadata jrunner prints along the way.
        result = subprocess.run(
            [jrunner,
             "-scu", source_conn["jdbc_url"],
             "-scn", source_conn["username"] or "",
             "-scp", src_pw,
             "-dcu", dest_conn["jdbc_url"],
             "-dcn", dest_conn["username"] or "",
             "-dcp", dst_pw,
             "-dt", staging_table,
             "-sq", sql_path],
            capture_output=True, text=True, timeout=30,
        )
        output = result.stdout + result.stderr
    finally:
        os.unlink(sql_path)
    columns = []
    # Lines of the form "* colname: type" describe the probed result set.
    for m in re.finditer(r"\*\s+(\S+):\s+(\S+)", output):
        col_name = m.group(1).lower()
        col_type = m.group(2)
        pg_type = map_type_pg(col_type)
        columns.append(f"    {col_name:<30} {pg_type}")
    if not columns:
        raise SyncError(f"Could not introspect source columns. jrunner output: {output[:500]}")
    col_defs = ",\n".join(columns)
    ddl = (
        f"DROP TABLE IF EXISTS {staging_table};\n"
        f"CREATE TABLE {staging_table} (\n{col_defs}\n);"
    )
    _run_dest_sql(dest_conn, ddl)
def _run_jdbc_transfer(source_conn: dict, dest_conn: dict, source_query: str,
                       dest_table: str, on_output: callable = None) -> tuple[int, str, str]:
    """Run jrunner to transfer data from source to destination.
    Returns (row_count, stdout, stderr).

    stdout is consumed line-by-line as jrunner runs so *on_output* (if given)
    can stream progress to a caller (e.g. an SSE endpoint); stderr is read
    only after the process exits. No timeout is applied here — transfers can
    legitimately run long.

    Raises:
        SyncError: when jrunner exits non-zero (message includes both streams).
    """
    cfg = get_config()
    jrunner = cfg["jrunner_path"]
    src_pw = _resolve_password(source_conn["password"])
    dst_pw = _resolve_password(dest_conn["password"])
    with tempfile.NamedTemporaryFile(mode="w", suffix=".sql", delete=False) as f:
        f.write(source_query)
        sql_path = f.name
    try:
        proc = subprocess.Popen(
            [jrunner,
             "-scu", source_conn["jdbc_url"],
             "-scn", source_conn["username"] or "",
             "-scp", src_pw,
             "-dcu", dest_conn["jdbc_url"],
             "-dcn", dest_conn["username"] or "",
             "-dcp", dst_pw,
             "-dt", dest_table,
             "-sq", sql_path],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            text=True,
        )
        stdout_lines = []
        # Stream stdout live; each line is forwarded to on_output as it arrives.
        for line in proc.stdout:
            line = line.rstrip("\n")
            stdout_lines.append(line)
            if on_output:
                on_output(line)
        proc.wait()
        stdout = "\n".join(stdout_lines)
        stderr = proc.stderr.read() if proc.stderr else ""
        if proc.returncode != 0:
            raise SyncError(f"jrunner transfer failed: {stdout}\n{stderr}")
        row_count = _parse_row_count(stdout)
        return row_count, stdout, stderr
    finally:
        os.unlink(sql_path)
def _parse_row_count(output: str) -> int:
"""Extract row count from jrunner output."""
for line in output.splitlines():
if "rows written" in line.lower():
m = re.search(r"(\d+)\s*rows written", line, re.IGNORECASE)
if m:
return int(m.group(1))
return 0
def _resolve_watermarks(module_id: int) -> dict[str, str]:
    """Resolve all watermarks for a module. Returns {name: resolved_value}.

    Each watermark's resolver_sql is executed on its own connection via
    jrunner; the value is taken from the first data row of the CSV output
    (line 1, after the header). NULL/empty results fall back to the
    watermark's default_value. If the resolver itself fails, the default is
    used when present; otherwise a SyncError is raised.
    """
    watermarks = list_watermarks(module_id)
    resolved = {}
    for wm in watermarks:
        conn = get_connection(wm["connection_id"])
        if not conn:
            raise SyncError(f"Watermark '{wm['name']}' references missing connection {wm['connection_id']}")
        try:
            output = _run_jrunner_query(conn, wm["resolver_sql"])
            # Take first row, first column
            lines = [l.strip() for l in output.strip().splitlines() if l.strip()]
            # Skip CSV header
            value = lines[1] if len(lines) > 1 else None
            if value:
                # Strip quotes if CSV-wrapped
                value = value.strip('"').strip("'")
            if not value or value.lower() == "null":
                value = wm["default_value"]
            resolved[wm["name"]] = value or ""
        except Exception as e:
            logger.warning(f"Watermark '{wm['name']}' resolver failed: {e}")
            if wm["default_value"]:
                resolved[wm["name"]] = wm["default_value"]
            else:
                raise SyncError(
                    f"Watermark '{wm['name']}' resolver failed and no default: {e}"
                )
    return resolved
def _materialize_query(source_query: str, watermark_values: dict[str, str]) -> str:
"""Substitute {name} placeholders in source_query with resolved values."""
result = source_query
for name, value in watermark_values.items():
result = result.replace(f"{{{name}}}", value)
return result
def preview_module(module_id: int) -> dict:
    """Preview the exact SQL that would be executed for a module.

    Resolves watermarks, materializes the source query, builds the merge SQL,
    and lists the success/always hooks — without running anything against the
    destination. (The previous version also fetched the destination connection
    and never used it; that dead lookup has been removed.)

    Raises:
        SyncError: when the module does not exist.
    """
    module = get_module(module_id)
    if not module:
        raise SyncError(f"Module {module_id} not found")
    staging_table = f"pipekit_staging.{module['name']}"
    # Resolve watermarks and materialize query
    watermark_values = _resolve_watermarks(module_id)
    source_query = _materialize_query(module["source_query"], watermark_values)
    # Merge SQL
    merge_sql = _build_merge_sql(module, staging_table)
    # Hooks that would run on success (failure-only hooks are omitted)
    hook_sql = [
        f"-- hook ({h['run_on']}): {h['sql']}"
        for h in list_hooks(module_id)
        if h["run_on"] in ("success", "always")
    ]
    return {
        "source_query": source_query,
        "base_query": module["source_query"],
        "staging_table": staging_table,
        "merge_sql": merge_sql,
        "hooks": hook_sql,
        "strategy": module["merge_strategy"],
        "watermark_values": watermark_values,
    }
def run_module(module_id: int, group_run_id: int = None,
               on_output: callable = None) -> dict:
    """Execute a single sync module. Returns the run log entry.

    Flow: acquire lock -> resolve watermarks -> materialize query ->
    create and load the staging table -> merge into dest -> success
    hooks -> log. On failure the run is logged with status "error",
    failure hooks are attempted, and the run log row is still returned.

    Args:
        module_id: module to run.
        group_run_id: optional parent group_run id (annotation only).
        on_output: optional callback receiving human-readable progress
            lines (used by the SSE streaming endpoint).

    Raises:
        SyncError: module missing, disabled, already running, or a
            connection row cannot be found.
    """
    module = get_module(module_id)
    if not module:
        raise SyncError(f"Module {module_id} not found")
    if not module["enabled"]:
        raise SyncError(f"Module {module['name']} is disabled")
    # Atomic lock acquisition (UPDATE ... WHERE running=0 semantics)
    pid = str(os.getpid())
    if not acquire_module_lock(module_id, pid):
        raise SyncError(f"Module {module['name']} is already running")
    # The lock is held from here on. Release it in `finally` so it can
    # never leak — previously a raise from finish_run/logging inside the
    # except handler left the module locked forever.
    try:
        source_conn = get_connection(module["source_connection_id"])
        dest_conn = get_connection(module["dest_connection_id"])
        if not source_conn or not dest_conn:
            raise SyncError("Source or destination connection not found")
        run = create_run(module_id, group_run_id)
        run_id = run["id"]
        staging_table = f"pipekit_staging.{module['name']}"
        logger.info(f"Starting sync: {module['name']} (run {run_id})")
        try:
            # 1. Resolve watermarks
            watermark_values = _resolve_watermarks(module_id)
            if watermark_values:
                log_run_output(run_id, watermark_values_json=json.dumps(watermark_values))
            # 2. Materialize source query
            source_query = _materialize_query(module["source_query"], watermark_values)
            log_run_sql(run_id, source_query)
            # 3. Ensure schemas exist (PostgreSQL destinations only)
            if "postgresql" in dest_conn["jdbc_url"].lower():
                dest_schema = module["dest_table"].split(".")[0] if "." in module["dest_table"] else "public"
                setup_sql = (
                    f"CREATE SCHEMA IF NOT EXISTS pipekit_staging;\n"
                    f"CREATE SCHEMA IF NOT EXISTS {dest_schema};\n"
                )
                _run_dest_sql(dest_conn, setup_sql)
            # 4. Create staging table from source metadata
            logger.info(f"Creating staging table {staging_table}")
            if on_output:
                on_output(f"Creating staging table {staging_table}")
            _create_staging_from_source(source_conn, dest_conn, module["source_query"], staging_table)
            # 5. Transfer data to staging table
            logger.info(f"Transferring data to {staging_table}")
            if on_output:
                on_output("Transferring data...")
            row_count, stdout, stderr = _run_jdbc_transfer(
                source_conn, dest_conn, source_query, staging_table, on_output=on_output
            )
            log_run_output(run_id, jrunner_stdout=stdout, jrunner_stderr=stderr)
            if on_output:
                on_output(f"Transferred {row_count} rows")
            logger.info(f"Transferred {row_count} rows")
            # 6. Execute merge strategy
            merge_sql = _build_merge_sql(module, staging_table)
            log_run_sql(run_id, source_query, merge_sql)
            logger.info(f"Executing merge: {module['merge_strategy']}")
            if on_output:
                on_output(f"Executing merge: {module['merge_strategy']}")
            _run_dest_sql(dest_conn, merge_sql)
            # 7. Run success hooks
            hook_log = _run_hooks(module_id, "success", dest_conn)
            if hook_log:
                log_run_output(run_id, hook_log=hook_log)
            finish_run(run_id, "success", row_count)
            logger.info(f"Sync complete: {module['name']} — {row_count} rows")
            return get_run(run_id)
        except Exception as e:
            error_msg = str(e)
            logger.error(f"Sync failed: {module['name']} — {error_msg}")
            # Failure hooks are best-effort; never mask the original error.
            try:
                hook_log = _run_hooks(module_id, "failure", dest_conn)
                if hook_log:
                    log_run_output(run_id, hook_log=hook_log)
            except Exception:
                pass
            finish_run(run_id, "error", error=error_msg)
            return get_run(run_id)
    finally:
        release_module_lock(module_id)
def _build_merge_sql(module: dict, staging_table: str) -> str:
"""Build the merge SQL based on strategy."""
dest_table = module["dest_table"]
strategy = module["merge_strategy"]
merge_key = module["merge_key"]
if strategy == "full":
return (
f"CREATE TABLE IF NOT EXISTS {dest_table} (LIKE {staging_table} INCLUDING ALL);\n"
f"BEGIN;\n"
f"TRUNCATE TABLE {dest_table};\n"
f"INSERT INTO {dest_table} SELECT * FROM {staging_table};\n"
f"COMMIT;\n"
)
elif strategy == "incremental":
create_if = f"CREATE TABLE IF NOT EXISTS {dest_table} (LIKE {staging_table} INCLUDING ALL);\n"
if merge_key:
return (
f"{create_if}"
f"BEGIN;\n"
f"DELETE FROM {dest_table} WHERE {merge_key} IN "
f"(SELECT DISTINCT {merge_key} FROM {staging_table});\n"
f"INSERT INTO {dest_table} SELECT * FROM {staging_table};\n"
f"COMMIT;\n"
)
else:
return f"{create_if}INSERT INTO {dest_table} SELECT * FROM {staging_table};\n"
elif strategy == "append":
return (
f"CREATE TABLE IF NOT EXISTS {dest_table} (LIKE {staging_table} INCLUDING ALL);\n"
f"INSERT INTO {dest_table} SELECT * FROM {staging_table};\n"
)
raise SyncError(f"Unknown merge strategy: {strategy}")
def _run_hooks(module_id: int, run_on: str, dest_conn: dict) -> str:
    """Execute a module's hooks whose trigger matches *run_on* (or 'always').

    A hook runs on its own connection when one is configured, otherwise on
    the destination connection. Hook failures are recorded in the returned
    log but never raised — the merge has already committed.

    Returns:
        Newline-joined log of per-hook outcomes.
    """
    entries = []
    for hook in list_hooks(module_id):
        if hook["run_on"] not in (run_on, "always"):
            continue
        # Prefer the hook's own connection; fall back to the dest connection.
        conn = dest_conn
        if hook["connection_id"]:
            conn = get_connection(hook["connection_id"])
            if not conn:
                entries.append(
                    f"SKIP hook #{hook['id']}: connection {hook['connection_id']} not found"
                )
                continue
        logger.info(f"Running hook: {hook['sql'][:80]}")
        try:
            output = _run_dest_sql(conn, hook["sql"])
            entries.append(f"hook #{hook['id']} OK: {output[:200]}")
        except Exception as e:
            entries.append(f"hook #{hook['id']} FAILED: {e}")
    return "\n".join(entries)
def run_group(group_id: int, triggered_by: str = "manual") -> dict:
    """Execute all modules in a group in order. Stops on first failure.

    Args:
        group_id: group to run.
        triggered_by: annotation stored on the group_run row
            ("manual" or "schedule").

    Returns:
        The finished group_run row with its child runs.

    Raises:
        SyncError: if the group does not exist.
    """
    group = get_group(group_id)
    if not group:
        raise SyncError(f"Group {group_id} not found")
    group_run = create_group_run(group_id, triggered_by=triggered_by)
    group_run_id = group_run["id"]
    final_status = "success"
    try:
        for member in group["members"]:
            run = run_module(member["module_id"], group_run_id=group_run_id)
            if run["status"] == "error":
                logger.error(f"Group {group['name']} stopped: {member['module_name']} failed")
                final_status = "error"
                break
    except SyncError as e:
        # run_module raises (instead of logging a run) when a member is
        # disabled, missing, or already locked. Previously this propagated
        # before finish_group_run, leaving the group_run stuck in
        # "running" forever — record the failure and stop the group.
        logger.error(f"Group {group['name']} stopped: {e}")
        final_status = "error"
    finish_group_run(group_run_id, final_status)
    return get_group_run(group_run_id)

View File

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,100 @@
"""HTTP client for Pipekit API."""
import requests
from requests.auth import HTTPBasicAuth
class PipekitClient:
    """Thin HTTP client for the Pipekit REST API.

    Every method returns the decoded JSON body and raises
    ``requests.HTTPError`` on a non-2xx response.
    """

    def __init__(self, base_url: str = "http://localhost:8100",
                 username: str = "admin", password: str = "pipekit",
                 timeout: float = None):
        """Create a client.

        Args:
            base_url: API root; a trailing slash is stripped.
            username: HTTP Basic username.
            password: HTTP Basic password.
            timeout: per-request timeout in seconds. None (the default,
                matching previous behavior) waits indefinitely — pass a
                value to avoid hanging forever on an unresponsive server.
        """
        self.base_url = base_url.rstrip("/")
        self.auth = HTTPBasicAuth(username, password)
        self.timeout = timeout

    # --- low-level verbs -------------------------------------------------
    def _get(self, path: str, params: dict = None) -> dict | list:
        r = requests.get(f"{self.base_url}{path}", auth=self.auth,
                         params=params, timeout=self.timeout)
        r.raise_for_status()
        return r.json()

    def _post(self, path: str, json: dict = None) -> dict:
        r = requests.post(f"{self.base_url}{path}", auth=self.auth,
                          json=json, timeout=self.timeout)
        r.raise_for_status()
        return r.json()

    def _put(self, path: str, json: dict = None) -> dict:
        r = requests.put(f"{self.base_url}{path}", auth=self.auth,
                         json=json, timeout=self.timeout)
        r.raise_for_status()
        return r.json()

    def _delete(self, path: str) -> dict:
        r = requests.delete(f"{self.base_url}{path}", auth=self.auth,
                            timeout=self.timeout)
        r.raise_for_status()
        return r.json()

    # Connections
    def list_connections(self): return self._get("/connections")
    def create_connection(self, data): return self._post("/connections", data)
    def get_connection(self, id): return self._get(f"/connections/{id}")
    def update_connection(self, id, data): return self._put(f"/connections/{id}", data)
    def delete_connection(self, id): return self._delete(f"/connections/{id}")
    def test_connection(self, id): return self._post(f"/connections/{id}/test")
    # Introspection
    def list_tables(self, conn_id, schema=None):
        params = {"schema": schema} if schema else None
        return self._get(f"/connections/{conn_id}/tables", params)
    def list_columns(self, conn_id, schema, table):
        return self._get(f"/connections/{conn_id}/tables/{schema}.{table}/columns")
    def propose_module(self, conn_id, schema, table, dest_schema=None,
                       linked_server=None, linked_db=None):
        params = {}
        if dest_schema: params["dest_schema"] = dest_schema
        if linked_server: params["linked_server"] = linked_server
        if linked_db: params["linked_db"] = linked_db
        return self._get(f"/connections/{conn_id}/tables/{schema}.{table}/propose", params or None)
    # Modules
    def list_modules(self): return self._get("/modules")
    def create_module(self, data): return self._post("/modules", data)
    def get_module(self, id): return self._get(f"/modules/{id}")
    def update_module(self, id, data): return self._put(f"/modules/{id}", data)
    def delete_module(self, id): return self._delete(f"/modules/{id}")
    def preview_module(self, id): return self._get(f"/modules/{id}/preview")
    def run_module(self, id): return self._post(f"/modules/{id}/run")
    def run_module_stream(self, id):
        """Stream sync output. Yields lines, final line starts with __DONE__ or __ERROR__."""
        r = requests.post(f"{self.base_url}/modules/{id}/run/stream",
                          auth=self.auth, stream=True, timeout=self.timeout)
        r.raise_for_status()
        for line in r.iter_lines(decode_unicode=True):
            if line.startswith("data: "):
                yield line[6:]
    def module_history(self, id): return self._get(f"/modules/{id}/history")
    # Hooks
    def list_hooks(self, module_id): return self._get(f"/modules/{module_id}/hooks")
    def create_hook(self, data): return self._post("/hooks", data)
    def delete_hook(self, id): return self._delete(f"/hooks/{id}")
    # Groups
    def list_groups(self): return self._get("/groups")
    def create_group(self, data): return self._post("/groups", data)
    def get_group(self, id): return self._get(f"/groups/{id}")
    def delete_group(self, id): return self._delete(f"/groups/{id}")
    def add_group_member(self, group_id, data): return self._post(f"/groups/{group_id}/members", data)
    def remove_group_member(self, member_id): return self._delete(f"/groups/members/{member_id}")
    def run_group(self, id): return self._post(f"/groups/{id}/run")
    # Runs
    def list_runs(self, limit=50): return self._get("/runs", {"limit": limit})
    def get_run(self, id): return self._get(f"/runs/{id}")
    # Schedules
    def list_schedules(self): return self._get("/schedules")
    def create_schedule(self, data): return self._post("/schedules", data)
    def update_schedule(self, id, data): return self._put(f"/schedules/{id}", data)
    def delete_schedule(self, id): return self._delete(f"/schedules/{id}")
    # Drivers
    def list_drivers(self): return self._get("/drivers")
    def create_driver(self, data): return self._post("/drivers", data)
    def delete_driver(self, id): return self._delete(f"/drivers/{id}")

12
.gitignore vendored Normal file
View File

@ -0,0 +1,12 @@
__pycache__/
*.py[cod]
*.egg-info/
# Local SQLite database — contains connection rows + user state.
pipekit.db
pipekit.db-journal
pipekit.db-wal
pipekit.db-shm
# Local Claude Code settings.
.claude/settings.local.json

636
SPEC.md Normal file
View File

@ -0,0 +1,636 @@
# Pipekit — Spec
This spec was built from a clean-slate conversation that rederived the
design from first principles. The previous version is archived at
`SPEC_v1_archive.md` for reference.
## Status
**Spec is done.** Ready to move to implementation planning.
One item is intentionally deferred: the **migration plan** for bringing
over the ~90 existing modules from `/opt/sync`. Not needed to start
implementation — Pipekit can be built and tested against new modules
first, and migration can happen later (likely via a parser that walks
`/opt/sync/*/`, extracts `pull.sql` / `insert.sql` / shell wrapper,
infers merge strategy and key, and creates module rows).
## How we got here
Started by asking what was painful about the existing shell-script-based
sync setup. Three things surfaced: authoring new modules is tedious,
observability is poor (no easy way to see what ran, how long, how many
rows, any errors), and there's no central management UI. That framed
Pipekit as an orchestration layer on top of the existing `jrunner` JDBC
tool — not replacing jrunner, wrapping it with the state and UI that
shell scripts can't provide.
Everything in this document was worked out by walking through concrete
examples from the current `/opt/sync` modules (`code`, `qcrh`,
`ffsbglr1`) and asking "what would this look like under the new system?"
When the original spec proposed something that didn't fit (like
"watermark is a single column name"), we redesigned it. The result is a
spec that reflects the actual complexity of real modules, not an
idealized simple-sync model.
---
## Motivation
User has ~90 sync modules in `/opt/sync` today, organized as shell scripts
that wrap `jrunner` (a JDBC bulk-transfer CLI at `/opt/jrunner`). Pain points
that drove this redesign:
- **Authoring is tedious.** Building SQL for new sync modules takes too long —
hand-writing pull.sql, insert.sql, the .sh wrapper, the import table DDL.
- **No observability.** Hard to answer: how often does each module run, how
many rows transfer, what SQL was used, when's the next run, how long does
it take, are tables in a good state, were there errors on the last run and
for which modules.
- **No central management.** Want a TUI like lazygit for browsing, inspecting,
running, configuring modules. User browses with nvim today and wants the TUI
to feel as spatial and navigable as a file tree.
## What jrunner does (and doesn't)
`jrunner` (at `/opt/jrunner`) is a Java CLI that does two things:
1. **Migration mode** — given source connection (`-scu/-scn/-scp`), dest
connection (`-dcu/-dcn/-dcp`), a SQL file (`-sq`), and a dest table (`-dt`),
it streams rows from source to dest with batched INSERTs.
2. **Query mode** — same source flags but no dest flags, outputs query results
to stdout in CSV/TSV. Useful for piping to visidata, less, etc.
It has no merge logic, no scheduling, no state, no awareness of incremental
syncs. It's a dumb pipe. That's the right shape — Pipekit wraps it with the
orchestration layer.
## Architecture
```
jrunner (Java CLI — bulk JDBC transfer + query mode)
engine (Python — orchestrates jrunner, watermarks, merge, hooks, run log)
API (FastAPI — REST, Basic Auth)
TUI / web UI / curl
```
The engine shells out to jrunner for **everything that touches a database** —
bulk transfers, watermark resolver queries, hooks. No separate JDBC layer in
Python. One driver-loading code path, one set of bugs.
The API exists so a web front-end or curl can drive Pipekit, not just the TUI.
## Storage: SQLite
Everything lives in one SQLite file (`pipekit.db`). Why:
- ~90+ modules already exist; flat files don't scale to "show me all modules
  that errored last night" type queries.
- The SQL itself belongs in the database, not as file references — a module is
a self-contained unit and splitting it across rows + files means two things
to keep in sync.
- Single file, copy with `cp`, no server. Schema translates to PostgreSQL later
if ever needed.
User was uneasy about losing filesystem browsing. Resolution: the **TUI is the
file browser**. Inspecting a module feels like `cat`, editing opens `$EDITOR`,
the module list feels like `ls`. For raw access, `sqlite3 pipekit.db` works.
## Module model
A module = one sync job. Fields:
- `name`
- `source_connection_id`, `dest_connection_id`
- `dest_table`
- `staging_table` (auto-managed: `pipekit_staging.{name}`)
- `source_query` — full SQL text with `{watermark_name}` placeholders. Free-form.
- `merge_strategy``full` / `incremental` / `append`
- `merge_key` — destination column(s) used in DELETE before INSERT
- `enabled`
- `running` (lock flag — see locking section)
The source query is **a text blob**. Not split into structured columns. The
TUI offers a column-editor mode that *parses* the SELECT list out of the
stored query, lets you edit it as a table, and *splices the new SELECT list
back in* (preserving CTEs, FROM, WHERE). For queries the parser can't handle
(too complex), the TUI falls back to raw `$EDITOR`. **Raw editing always
works.**
### Merge strategies
Three patterns from existing scripts:
- **full** — TRUNCATE dest, INSERT all from staging
- **incremental** — pull delta via watermark, DELETE rows in dest matching
merge_key, INSERT from staging
- **append** — INSERT only, no deletes
**No upsert.** The DELETE+INSERT approach already handles row-level changes
without needing column-by-column ON CONFLICT UPDATE SET clauses.
### Watermarks (multi, type-agnostic, resolver SQL)
A module can have **multiple named watermarks**. Real example from user: a
query that needs both `{date}` (max modified-timestamp from one table) and
`{number}` (max order number from another) to build a list of changed orders
to repull.
A watermark =
- `name` — placeholder name in the source query
- `connection_id` — which connection runs the resolver (could be dest, source,
or a third)
- `resolver_sql` — free-form SQL. Engine runs it via jrunner query mode, takes
first row's first column as a string.
- `default_value` — used if resolver returns NULL or zero rows
**Type-agnostic.** The engine reads the resolver result as an opaque string and
substitutes it literally. No type coercion. The user controls quoting in the
resolver SQL itself (e.g. wrap in `quote_literal()` if you want `'2610'`,
return raw if you want `2610`).
**Dialect-aware by user.** The user writes the resolver in the connection's
dialect. Engine doesn't translate. Same as today — they already write DB2 in
pull.sql and PG in insert.sql.
**No hidden generation.** Resolved SQL gets **materialized** before each run
and stored on the module record (`next_resolved_query` or similar) so the TUI
can always show "here's exactly what would run next." After the run, the
exact resolved SQL goes into the run_log.
### Hooks
A module can have post-execution hooks for things like
`REFRESH MATERIALIZED VIEW rlarp.cust` or `CALL rlarp.osm_stack_refresh()`.
A hook =
- `module_id`, `run_order`
- `connection_id` — usually dest, but anywhere
- `sql`
- `run_on``success` / `failure` / `always`
Hooks run sequentially after the merge. Failures get logged but don't roll
back the merge (it's already committed).
**No group-level hooks for now.** Decision deferred. The `REFRESH MATERIALIZED
VIEW rlarp.cust` at the end of `codes.sh` would attach to whichever module
logically owns that data, even if not strictly the last in order. Add group
hooks later if it gets painful.
## Engine flow (per module run)
1. **Acquire lock** atomically: `UPDATE module SET running=1 WHERE id=? AND running=0`. If row count is 0, bail with "already running."
2. **Resolve watermarks.** For each watermark: shell out to jrunner query mode against the watermark's connection with its resolver SQL. Take first row's first column as a string. Fall back to `default_value` on NULL/empty.
3. **Materialize the resolved source query.** Substitute `{name}` placeholders in `source_query`. Store on the module record so the TUI can preview.
4. **Truncate staging** (`TRUNCATE pipekit_staging.{module_name}`).
5. **Run jrunner** (migration mode) with the resolved query, target = staging.
6. **Materialize the merge SQL** based on strategy + merge_key.
7. **Run merge** against dest connection (also via jrunner, or whatever path the engine uses for SQL execution).
8. **Run hooks** in order, respecting `run_on`.
9. **Write `run_log` entry** with everything (see below).
10. **Release lock** in a `finally` block — always runs, even on error.
## Locking
The `running` flag on the module is the lock. The atomic UPDATE-with-WHERE
above ensures no race window. Belt-and-suspenders for stuck locks:
- **PID-based.** Store the API process PID/UUID on the lock. On API startup,
clear locks owned by PIDs that no longer exist.
- **Time-based backstop.** On startup, also clear locks held longer than some
absurd threshold (e.g. 24h).
Lock is enforced regardless of trigger source — scheduler, group runner,
ad-hoc single-module, ad-hoc group run. All paths hit the same atomic check.
**No separate group lock needed.** If a group runner tries to start a module
that's already locked, it fails on that module and stops the group (per
stop-on-failure rule).
## Run log / observability
Two tables:
```
group_run(
id, group_id, started_at, finished_at, status, triggered_by
-- triggered_by: schedule | manual | null
)
run_log(
id,
module_id,
group_run_id, -- nullable; set when run as part of a group
started_at, finished_at,
row_count,
status, -- running | success | error | cancelled
error,
resolved_source_sql, -- exact SQL that ran on source
merge_sql, -- exact merge SQL that ran on dest
watermark_values_json, -- {prev_period: "'2610'", ...}
jrunner_stdout,
jrunner_stderr,
hook_log
)
```
Module history is **independent of group context** — `WHERE module_id=?` shows
every run, scheduled or manual, group or standalone. The `group_run_id` is
just an annotation.
**Run detail screen** (in TUI) shows: timing, status, row count, trigger
context, watermark values, plus keys to open in `$EDITOR`:
- `s` — resolved source SQL
- `m` — merge SQL
- `h` — hook output
- `o` — jrunner stdout/stderr
**Global run log** (`L` from main screen) — sortable, filterable across all
modules and groups. Answer "show me everything that errored in the last 24
hours" in two keystrokes.
## Groups and scheduling
```
grp(id, name)
group_member(id, group_id, module_id, run_order)
-- many-to-many; same module can live in multiple groups with different run_orders
schedule(id, group_id, cron_expr, enabled)
-- a group can have 0..N schedules
```
**Sequential execution, stop on failure.** Mirrors the `set -e` behavior of
existing orchestrator scripts.
**Many-to-many membership.** Junction table is needed anyway for `run_order`,
so many-to-many costs nothing extra. Unique constraint can be added later if
ever needed.
**Schedule attaches to groups, not modules.** Matches the user's mental model
and avoids a huge cron-list. Individual modules can still be run ad-hoc.
**Scheduler.** Background thread inside the API process. Wakes every minute,
evaluates all enabled schedules, fires any whose cron matches. A scheduled
fire and a manual fire use the same code path — only `triggered_by` differs.
**Ad-hoc runs:**
- `POST /modules/{id}/run` — single module
- `POST /groups/{id}/run` — whole group sequentially
Both create normal run_log entries.
## Connections and credentials
```
driver(id, name, jar_file, class_name, url_template)
connection(
id,
name,
driver_id,
jdbc_url,
username,
password,
default_dest_connection_id, -- nullable; wizard default when this is source
default_dest_schema, -- nullable; wizard default when this is source
notes,
created_at, updated_at
)
```
**Credentials = env var references.** The `password` column stores something
like `$DB2PW`. Engine resolves at runtime by reading the env var. Passwords
never live in the database. Matches existing setup
(`/opt/sync/.env` + shell scripts) and keeps `pipekit.db` safe to copy/back-up.
Test-connection: engine runs a trivial query (`SELECT 1` or equivalent) via
jrunner against the connection. Confirms URL, credentials, driver all work.
**jrunner handles all SQL execution** — bulk transfers (migration mode) and
single-value queries for watermark resolvers / hooks (query mode). Trade-off:
~100ms JVM spawn per resolver call, but one tool, one set of bugs, one
driver-loading path.
## Bootstrap / install hygiene
Pipekit verifies jrunner exists on startup (configurable path in
`config.yaml`). If missing, surfaces a clear error pointing at
`/opt/jrunner/deploy.sh`.
**`pipekit doctor`** CLI command — checks jrunner present, jrunner version,
drivers loadable, database accessible, all configured connections testable.
First thing to run after a `git pull`.
**Packaging.** Start loose-coupled (install jrunner separately, point Pipekit
at it). Bundle later if/when the two-step gets annoying.
## New module wizard
The centerpiece for fixing the authoring pain. Goal: from "I want to sync
table X from connection Y" to "module created, query previewed, ready to
test-run" in under a minute.
### Step 1 — Source
Pick source connection. Filter by schema. Search tables incrementally. The
TUI calls jrunner in query mode against the source's INFORMATION_SCHEMA
equivalent (DB2: `SYSIBM.SYSTABLES`, SQL Server / PG: `INFORMATION_SCHEMA.TABLES`).
### Step 2 — Columns
The engine introspects the chosen table. Proposes one row per column with:
- **In/out toggle** (default all on; toggle off the noise like `dcfut*` futures)
- **Default alias** — lowercase, special chars stripped: `DCORD#``dcord`
- **Default source expression** — bare column for most types; `RTRIM(col)` for
char/varchar; `CASE WHEN col IN ('0001-01-01','9999-12-31') THEN NULL ELSE col END`
for date (sentinel-NULL pattern from existing modules)
- **Default dest type** — mapped from source: `INT`→`integer`, `DECIMAL(15,4)`→`numeric(15,4)`,
`CHAR(40)`→`text`, `DATE`→`date`, etc.
`e` opens an edit modal for one row to override alias / expression / type.
Most of the time you accept defaults.
### Step 3 — Destination & merge
Pick dest connection. Dest table defaults to
`{source_conn.default_dest_schema}.{lowercase_source_table_name}`. Pick
merge strategy. Pick merge key from a dropdown of dest column names. Add
zero or more watermarks via a sub-form.
**Multiple destinations are real** (e.g. PG → SQL Server). The wizard
doesn't assume one dest. Each source connection has a
`default_dest_connection_id` + `default_dest_schema` pair that
pre-populate Step 3. Both are nullable; fallback is last-used dest.
### Step 1 — Source (driver-dependent browse form)
Different drivers need different scope fields ("qualifiers") to identify a
table. DB2 needs just `schema`. SQL Server can need up to three:
`linked_server`, `database`, `schema` (any combination — linked server
optional, database optional, schema defaults to `dbo`). This is because
SQL Server can reference tables in other databases on the same server, or
tables on entirely different servers via linked servers — and the FROM
clause syntax changes (`schema.table`, `db.schema.table`,
`[linked].[db].[schema].[table]`).
Each driver exposes:
```python
class Driver:
def browse_fields(self) -> list[BrowseField]:
"""Qualifier fields for the wizard's Step 1 form."""
def list_tables(self, **qualifiers) -> list[Table]:
"""INFORMATION_SCHEMA query using whatever qualifiers are set."""
def get_columns(self, table_name: str, **qualifiers) -> list[Column]:
"""Column lookup for a specific table."""
def qualified_table_name(self, table_name: str, **qualifiers) -> str:
"""FROM-clause identifier. Wizard-time only."""
def map_type(self, source_type) -> str: ...
def default_expression(self, source_type, column_name) -> str: ...
def quote_identifier(self, name) -> str: ...
```
Textual renders Step 1 dynamically from `browse_fields()`. The wizard
calls `qualified_table_name()` once to bake the FROM clause into the
stored source query. **Linked servers / qualifiers are not first-class in
Pipekit** — they exist only as syntax inside the generated FROM. Nothing
is persisted on the module about how the table was qualified at author
time. If you later need to add a column, you type the expression and
alias by hand in the column editor — no re-browsing needed.
### Step 4 — Preview
Show the generated source query, generated staging DDL, generated merge SQL.
Everything visible. `e` to drop into `$EDITOR` for free-form fixes. `c` to
create — writes the module row, creates the staging table on dest, offers a
test-run.
### Per-driver capability needed
Each driver module (`engine/drivers/db2.py`, etc.) implements:
- `list_tables(schema_filter)` — SQL template for INFORMATION_SCHEMA
- `get_columns(schema, table)` — column name, type, length, nullable
- `map_type(source_type)` → dest type
- `default_expression(source_type, column_name)` → wrap in RTRIM, CASE, etc.
- `quote_identifier(name)``"DCORD#"` (DB2/PG) vs `[DCORD#]` (MSSQL)
Defaults are **opinions hardcoded in driver modules** for now. Lift to a
`driver_default` table later if configurability is ever needed.
### Wizard scope (what it does NOT do)
- **No CTE-based queries.** Wizard generates simple `SELECT cols FROM table WHERE watermark`. For complex queries (like `ffsbglr1`), create with the wizard and edit the source query post-creation via `e`.
- **No multi-watermark wizard.** Single watermark. Add more after.
- **No hooks in the wizard.** Add hooks from the module detail screen.
- **No group assignment in the wizard.** Assign separately.
These are intentional. The wizard handles the 80% case fast. The 20% cases
are post-creation edits where you already have a working module to start from.
## TUI — main screen sketch
```
Pipekit
─────────────────────────────────────────────────
▼ s7830956 (AS/400 DB2)
✔ code full 2m ago 1,204 rows 0.8s
✔ name full 2m ago 892 rows 0.6s
✔ qcrh incr 2m ago 1,031 rows 3.2s
✗ qcri incr 2m ago — err
○ cust full disabled
▼ usmidsql01 (SQL Server)
✔ live_quotes full 2m ago 340 rows 1.1s
Groups
pricing 9 modules cron 0 20 2 * * * next: 2:20am
codes 26 modules cron 0 0 2 * * * next: 2:00am
```
Modules grouped by source connection (mirrors today's directory layout).
Status / strategy / last-run / row-count / duration on each line. Groups at
the bottom with schedules and next-fire times.
`i` inspect, `r` run, `l` history, `L` global log, `n` new module, `c`
connections, `/` search, `j/k` navigate, `q` quit. Should feel like lazygit /
nvim file tree.
### Module detail (i)
Top: module info (strategy, merge key, watermark, dest table, staging table,
enabled, last/next run). Middle: column table (parsed from source query).
Bottom: keybindings.
Keys open things in `$EDITOR` (read-only):
- `q` — next resolved source SQL
- `m` — merge SQL
- `b` — base query template (with placeholders)
- `e` — edit base query (writable)
- `w` — watermarks
- `h` — hooks
- `c` — column editor (parsed from query)
- `r` — run
- `l` — history
## API surface
**REST over HTTP**, FastAPI, HTTP Basic Auth on all endpoints except
`/health`. In practice the API only uses **GET (reads) and POST
(writes)** — PUT/DELETE avoided to keep the mental model simple.
### Resource CRUD
Every core table (connection, driver, module, watermark, hook, group,
group_member, schedule) gets the same URL pattern:
```
GET /things list (with filter query params)
GET /things/{id} read one
POST /things create
POST /things/{id} update
POST /things/{id}/delete delete
```
JSON shape = snake_case matching database columns. ISO 8601 timestamps.
Integer IDs. No transformation layer between SQL and JSON.
### Operation endpoints
Anything with side effects or that composes multiple steps:
```
POST /connections/{id}/test run SELECT 1 via jrunner; return ok/fail/elapsed
GET /modules/{id}/preview return next resolved source SQL + merge SQL
(runs watermark resolvers but does NOT sync)
GET /modules/{id}/columns parse source query, return column list
POST /modules/{id}/run start async run; return {run_id} immediately
POST /groups/{id}/run start async group run; return {group_run_id}
POST /modules/{id}/cancel cancel running module (release lock, kill jrunner)
POST /groups/{id}/cancel cancel running group
GET /runs list runs (filter: ?module_id= ?status= ?since=)
GET /runs/{id} run detail (SQL, stdout/stderr, hook output)
GET /runs/{id}/stream Server-Sent Events: live log + status
GET /group-runs list group runs
GET /group-runs/{id} group run with child module runs
GET /modules/{id}/runs shortcut: runs for one module
```
### Introspection endpoints (wizard backend)
```
POST /introspect/tables body: {connection_id, qualifiers: {...}}
POST /introspect/columns body: {connection_id, table_name, qualifiers}
POST /introspect/propose body: {connection_id, table_name, qualifiers}
returns a ready-to-POST module JSON
```
`propose` is curl-able — you can generate a module proposal, tweak the
JSON, then POST it to `/modules` to create. No TUI required.
### System endpoints
```
GET /health liveness only, no auth required
GET /doctor full check (jrunner, drivers, db, connections, scheduler)
powers `pipekit doctor` CLI
GET /settings
POST /settings/{key}
```
### Async runs + SSE
`POST /modules/{id}/run` does NOT block. It atomically acquires the
module lock, kicks off the sync in a background task, and returns
`{"run_id": 4892}` immediately.
Two ways to watch a run after that:
1. **Polling**`GET /runs/{id}` returns the run_log row; keep hitting
it until `status != running`. Simple, works anywhere.
2. **Streaming**`GET /runs/{id}/stream` opens a Server-Sent Events
connection. The server pushes event lines as things happen — log
lines, row-count updates, final status. The TUI uses this for the
run watch screen. curl supports it with `-N` (no buffering).
SSE is plain HTTP with a long-lived connection, not WebSockets. It is
simpler to implement than WebSockets, works natively in browsers
(`EventSource` in JS), and works in curl for debugging.
Splitting `start` from `watch` (two endpoints) means:
- Cron-triggered runs don't have to watch
- Curl scripting can fire-and-forget
- TUI can reconnect to an already-running sync if it crashes mid-run
### Auth
HTTP Basic. Username/password in the `settings` table. Single-user tool
for now; swap to JWT later if multi-user is ever needed, without
breaking URL structure.
### TUI = HTTP client
The TUI never touches SQLite directly. Every screen reads from an
endpoint. This guarantees zero behavioral drift between TUI and any
future web UI, and makes the API the single source of truth for
behavior.
## Open questions still to answer
1. ~~**Wizard defaults match user's mental model?**~~ Confirmed — RTRIM,
sentinel-date NULL, lowercased aliases are fine for now.
2. ~~**Dest table default?**~~ Resolved — per-source connection
`default_dest_connection_id` + `default_dest_schema`.
3. ~~**API surface.**~~ Resolved — REST, GET/POST only, async runs, SSE
for live output, CRUD + operations + introspection mix.
4. **Migration plan.** Deferred. Would involve a parser that walks
`/opt/sync/*/`, extracts pull.sql / insert.sql / sh wrapper, infers
merge strategy and key, creates module rows.
## Decisions log (fast reference)
| Decision | Choice |
|---|---|
| Storage | SQLite, single file |
| Where SQL lives | In the database (text blobs), not files |
| Source query shape | Free text with `{watermark}` placeholders |
| Columns | Parsed from query; not separate rows; wizard auto-introspects on create |
| Watermarks | Multiple per module, type-agnostic, free-form resolver SQL |
| Merge strategies | full / incremental / append (no upsert) |
| Hooks | Per-module, post-merge, run_on success/failure/always |
| Group hooks | Deferred — not needed yet |
| Group membership | Many-to-many (junction table for run_order anyway) |
| Group execution | Sequential, stop on failure |
| Schedules | Attach to groups; multiple schedules per group allowed |
| Locking | Atomic UPDATE on `module.running`; PID + time-based stale clearing |
| Credentials | Env var references (`$DB2PW`); resolved at runtime |
| SQL execution | Everything via jrunner (migration + query mode) |
| Materialized SQL | Always — resolved source SQL stored before run + after run |
| Install | Loose-coupled to jrunner for now; bundle later |
| TUI feel | Like lazygit / nvim file tree; spatial, keyboard-driven |
| Authoring | Wizard handles 80% case; post-creation editing handles the rest |
| Multiple destinations | Supported. Source conn holds `default_dest_connection_id` + `default_dest_schema` for wizard prepopulation |
| Driver browse fields | Per-driver qualifier set (`schema` for DB2/PG, up to `linked_server`/`database`/`schema` for MSSQL) |
| Linked servers | Not first-class; only affect FROM-clause syntax at author time; not persisted on module |
| API style | REST, GET for reads, POST for writes, no PUT/DELETE |
| Run model | Async — POST /run returns run_id immediately; watch via polling or SSE stream |
| Live output | Server-Sent Events (SSE) — plain HTTP, curl-friendly, browser-native |
| Auth | HTTP Basic, single user, creds in settings table |
| TUI ↔ backend | TUI is an HTTP client; never touches SQLite directly |

485
SPEC_v1_archive.md Normal file
View File

@ -0,0 +1,485 @@
# Pipekit — ETL Tool Specification
## Overview
A lightweight, JDBC-based ETL tool for syncing tables between source systems and a PostgreSQL destination (or other JDBC destinations). Config-driven, no boilerplate scripts. Managed via TUI, API, or future web UI.
## Architecture
```
jrunner (JDBC transfer engine — existing Java app)
^
engine (Python — orchestrates jrunner, manages staging, merge, DDL, logging)
^
API (FastAPI — REST interface, Basic Auth)
^
TUI / Web UI / external callers
```
## Core Concepts
| Concept | Description |
|----------------|-----------------------------------------------------------------------------|
| **Connection** | A JDBC source or destination — URL, driver class, credentials |
| **Driver** | A JDBC driver jar registered with the system |
| **Module** | A sync job — source query + destination table + merge strategy |
| **Hook** | Post-sync SQL action run against the destination (e.g. refresh mat view) |
| **Group** | An ordered list of modules that run together |
| **Schedule** | A cron expression tied to a group |
| **Run** | A single execution — tracked with timing, row count, status, error, SQL |
## Bootstrap Config (only file on disk)
```yaml
# /opt/pipekit/config.yaml
database: /opt/pipekit/pipekit.db # SQLite — self-contained, no external DB required
jrunner_path: /usr/local/bin/jrunner
driver_dir: /opt/pipekit/drivers/
api_port: 8100
smtp: # optional, for failure notifications
host: smtp.example.com
port: 587
from: etl@example.com
to: admin@example.com
```
Everything else lives in SQLite (`pipekit.db`). No external database dependency for config — destinations can be PostgreSQL, SQL Server, or anything with a JDBC driver.
## Column Identity Model
A module's source query defines column mappings from source to destination. This is the central design constraint — every column has two identities:
| Context | Name | Example | Where used |
|---------|------|---------|------------|
| **Source column** | The original column name in the source system | `DCORD#`, `DCODAT` | Source query SELECT, WHERE clauses against the source |
| **Destination column** | The alias in the SELECT, which becomes the column name in staging and dest tables | `dcord`, `dcodat` | Staging table DDL, merge SQL, destination queries |
### Rules
1. The **source query** maps source → destination: `SELECT "DCORD#" AS dcord ...`
2. **`merge_key`** references the **destination column name** — it's used in merge SQL that runs against PostgreSQL (e.g. `DELETE FROM dest WHERE dcord IN (SELECT dcord FROM staging)`)
3. **`watermark_column`** references the **destination column name** — the engine looks up `MAX(watermark_column)` in the destination table, then must translate it back to the source column name to build the WHERE clause against the source
4. The **watermark WHERE clause** must use the **source column name** — e.g. `WHERE "DCORD#" > 12345`, not `WHERE dcord > 12345` (the source system doesn't know the alias)
5. The engine maintains a **column mapping** (alias → source expression) parsed from the source query to perform this translation
### Column Mapping Derivation
The source query is parsed to extract the mapping:
```sql
SELECT
"DCORD#" AS dcord -- source: "DCORD#", dest: dcord
,RTRIM(DCOTYP) AS dcotyp -- source: DCOTYP, dest: dcotyp (trimmed)
,DCODAT AS dcodat -- source: DCODAT, dest: dcodat
FROM LGDAT.QCRH
```
From this, the engine derives:
- `dcord``"DCORD#"` (used for WHERE clause on source)
- `dcotyp``DCOTYP` (the unwrapped column, without RTRIM)
- `dcodat``DCODAT`
When building an incremental WHERE clause for watermark column `dcord`:
1. Query dest: `SELECT MAX(dcord) FROM sync.qcrh``12345`
2. Look up source expression for `dcord``"DCORD#"`
3. Build: `WHERE "DCORD#" > 12345`
### Special Character Handling
Source columns with special characters (`#`, `@`, `$`, spaces) are:
- **Quoted in the source query** using platform-appropriate syntax: `[DCORD#]` (SQL Server), `"DCORD#"` (DB2/PostgreSQL)
- **Aliased to safe names** that are valid unquoted PostgreSQL identifiers: `dcord`, `company_name`
- The alias generation (`_safe_alias`) strips special characters, lowercases, and replaces non-alphanumeric chars with underscores
## Database Schema
All tables in SQLite (`pipekit.db`). Same schema works if migrated to PostgreSQL later.
### connection
| Column | Type | Description |
|------------------|---------|--------------------------------------------------|
| id | integer PK | Auto-increment |
| name | text | Human-readable label |
| jdbc_url | text | JDBC connection string |
| driver_id | integer | FK to driver |
| username | text | |
| password | text | Env var reference (e.g. `$DB2PW`) resolved at runtime |
| supports_deletes | boolean | Whether destination supports DELETE/UPDATE |
| created_at | text | ISO datetime |
| updated_at | text | ISO datetime |
### driver
| Column | Type | Description |
|--------------|---------|--------------------------------------------------|
| id | integer PK | Auto-increment |
| name | text | e.g. "SQL Server", "AS/400 DB2" |
| jar_file | text | Filename in driver_dir |
| class_name | text | JDBC driver class |
| url_template | text | e.g. `jdbc:sqlserver://{host};databaseName={db}` |
### module
| Column | Type | Description |
|---------------------|---------|-------------------------------------------------|
| id | integer PK | Auto-increment |
| name | text | Module identifier (unique) |
| source_connection_id| integer | FK to connection |
| dest_connection_id | integer | FK to connection |
| dest_table | text | Fully qualified destination (schema.table) |
| source_query | text | The SELECT query to run against the source |
| merge_strategy | text | `full`, `incremental`, `append`, `upsert` |
| merge_key | text | **Destination** column name for merge operations |
| watermark_column | text | **Destination** column name for incremental watermark. If null, falls back to merge_key |
| key_sync | boolean | After incremental, reconcile keys and delete orphans |
| key_sync_query | text | Optional custom query to fetch source keys |
| full_refresh_cron | text | Optional cron for periodic full refresh |
| enabled | boolean | Whether the module is active |
| running | boolean | Lock flag — set during execution |
| created_at | text | ISO datetime |
| updated_at | text | ISO datetime |
### hook
| Column | Type | Description |
|-----------|---------|------------------------------------------------------|
| id | integer PK | Auto-increment |
| module_id | integer | FK to module (CASCADE delete) |
| run_order | integer | Execution order |
| sql | text | SQL to execute against destination |
| run_on | text | `success`, `failure`, `always` |
### grp (group)
| Column | Type | Description |
|--------|---------|--------------------|
| id | integer PK | Auto-increment |
| name | text | e.g. "pricing" |
### group_member
| Column | Type | Description |
|-----------|---------|----------------------------|
| id | integer PK | Auto-increment |
| group_id | integer | FK to grp (CASCADE) |
| module_id | integer | FK to module (CASCADE) |
| run_order | integer | Execution order in group |
### schedule
| Column | Type | Description |
|-----------|---------|-------------------------------------|
| id | integer PK | Auto-increment |
| group_id | integer | FK to grp (CASCADE) |
| cron_expr | text | Cron expression (e.g. `0 2 * * *`) |
| enabled | boolean | |
### run_log
| Column | Type | Description |
|--------------|---------|----------------------------------------------------------|
| id | integer PK | Auto-increment |
| module_id | integer | FK to module |
| group_id | integer | FK to grp (nullable — null if run manually) |
| started_at | text | ISO datetime |
| finished_at | text | ISO datetime |
| row_count | integer | |
| status | text | `running`, `success`, `error`, `cancelled` |
| error | text | Error message if failed |
| source_query | text | The exact source SQL executed (with resolved WHERE) |
| merge_sql | text | The exact merge SQL executed against destination |
### module_history
| Column | Type | Description |
|-------------|---------|-------------------------------------|
| id | integer PK | Auto-increment |
| module_id | integer | FK to module (CASCADE) |
| source_query| text | Previous query text |
| changed_at | text | ISO datetime |
### settings
| Column | Type | Description |
|--------|------|-------------------------------|
| key | text PK | e.g. `smtp_host` |
| value | text | |
## Merge Strategies
| Strategy | Behavior |
|---------------|-----------------------------------------------------------------------|
| `full` | Transfer all rows to staging, TRUNCATE dest, INSERT from staging |
| `incremental` | Query dest for MAX(watermark), build WHERE clause using source column name, transfer delta, DELETE matching rows by merge_key, INSERT from staging |
| `append` | Transfer, INSERT into dest, no deletes |
| `upsert` | Transfer, INSERT ON CONFLICT(merge_key) DO UPDATE |
### Incremental Sync Flow (detailed)
1. Resolve watermark column: use `watermark_column`, fall back to `merge_key`
2. Query destination: `SELECT MAX({watermark_col}) FROM {dest_table}`
3. Parse the result — handle NULL (empty table), numeric values, date/text values
4. Parse source query to find the source expression for the watermark alias
5. Build WHERE clause using the **source expression** (not the alias):
- Numeric watermark: `WHERE "DCORD#" > 12345`
- Date/text watermark: `WHERE DEX_ROW_TS >= '2026-04-01 00:00:00'`
6. Append WHERE clause to the base source query
7. Transfer delta rows to staging
8. Merge: DELETE from dest WHERE merge_key IN (SELECT merge_key FROM staging), then INSERT
9. Run hooks
**NULL watermark handling**: If `MAX(watermark)` returns NULL (empty dest table or psql null representation like `∅`), skip the WHERE clause entirely — pull all rows.
### Handling Source Deletes
Incremental strategies only detect new/changed rows — not rows deleted from the source. Two mechanisms address this:
**1. Key reconciliation (`key_sync`)** — optional per module. After the incremental load, pull all primary key values from the source (lightweight query), compare against destination, and delete any destination rows whose key is not in the source.
**2. Periodic full refresh (`full_refresh_cron`)** — optional per module. A cron expression that triggers a full refresh on a different cadence than the incremental schedule.
### Destination-Aware Merge
The engine checks `connection.supports_deletes`:
- If true: DELETE + INSERT merge works normally
- If false: incremental/upsert fall back to insert-only, relying on the destination's dedup engine (e.g. ClickHouse ReplacingMergeTree)
## Staging Table Management
- Named `pipekit_staging.{module_name}` (persistent across runs)
- If table exists: TRUNCATE before transfer
- If table doesn't exist: probe source for column metadata (0-row jrunner transfer), create table with mapped PostgreSQL types
- Probe always uses the **base source query** (no WHERE clause) to avoid comment/subquery issues
- Left in place after runs (success or failure) for debugging
- Schemas `pipekit_staging` and destination schema auto-created if missing
## Source Introspection
The engine can browse source systems via jrunner query mode against INFORMATION_SCHEMA (or equivalent):
- **Table browsing**: list tables/views filtered by schema
- **Column metadata**: column names, types, positions
- **Linked server support** (SQL Server): query tables on linked servers via 4-part naming
- **Cross-database** (SQL Server): specify a different database than the connection default
- **Auto-propose**: given a source table, generate complete module config:
- SELECT query with RTRIM on text columns, safe aliases for special characters
- Platform-aware identifier quoting (`[brackets]` for SQL Server, `"double quotes"` for DB2/others)
- Destination DDL with mapped PostgreSQL types
- Suggested merge strategy, key, and watermark column
### Source Type Detection
Detected from JDBC URL: `as400`, `sqlserver`, `postgresql`, `clickhouse`, `mysql`
### Type Mapping (source → PostgreSQL)
varchar/char/nvarchar/nchar/text → text, int/integer → integer, bigint → bigint, decimal/numeric → numeric, float/double → double precision, date → date, datetime/timestamp → timestamp, bit → boolean, binary/varbinary → bytea, uniqueidentifier → uuid
## API Endpoints
```
# Auth: HTTP Basic Auth on all endpoints
# Connections
GET /connections
POST /connections
GET /connections/{id}
PUT /connections/{id}
DELETE /connections/{id}
POST /connections/{id}/test
GET /connections/{id}/tables?schema=
GET /connections/{id}/tables/{schema}.{table}/columns
GET /connections/{id}/tables/{schema}.{table}/propose
# Modules
GET /modules
POST /modules
GET /modules/{id}
PUT /modules/{id}
DELETE /modules/{id}
GET /modules/{id}/preview
GET /modules/{id}/dest-columns
POST /modules/{id}/run
POST /modules/{id}/run/stream
GET /modules/{id}/history
# Hooks
GET /modules/{module_id}/hooks
POST /hooks
DELETE /hooks/{id}
# Groups
GET /groups
POST /groups
GET /groups/{id}
DELETE /groups/{id}
POST /groups/{id}/members
DELETE /groups/members/{id}
POST /groups/{id}/run
# Runs
GET /runs
GET /runs/{id}
# Drivers
GET /drivers
POST /drivers
DELETE /drivers/{id}
# Schedules
GET /schedules
POST /schedules
PUT /schedules/{id}
DELETE /schedules/{id}
```
## TUI
### Main Screen
Module tree grouped by source connection. Icons: `✔` enabled, `○` disabled, `▶` running.
| Key | Action |
|-----|--------|
| `i` | Inspect module |
| `r` | Run selected module |
| `l` | Module run history |
| `L` | Global run log (all modules) |
| `n` | New module wizard |
| `c` | Manage connections |
| `/` | Search modules |
| `j/k` | Navigate |
| `g/G` | Top/bottom |
| `F5` | Refresh |
| `q` | Quit |
### Module Detail Screen (i)
Top section: module info (strategy, merge key, watermark, dest table, staging table, enabled, updated).
Middle section: column table showing source column, destination alias, and whether RTRIM is applied.
Bottom: footer with keybindings. **No SQL visible by default** — all SQL opens in `$EDITOR` (read-only) via keybindings:
| Key | Opens in editor |
|-----|-----------------|
| `q` | Next source SQL — the resolved query that would execute on next run (with WHERE clause) |
| `m` | Merge SQL — the staging-to-dest merge statements |
| `h` | Post-merge hooks |
| `b` | Base query template — the stored SELECT before watermark WHERE is appended |
| `e` | Edit base query (writable) |
| `s` | Module settings (opens edit screen) |
| `r` | Run sync |
| `l` | Run history |
### Module Settings Screen (s)
Full edit form matching the new module wizard layout:
- Module name, source/dest connections, dest table
- Merge strategy (radio buttons)
- Merge key and watermark column (searchable dropdowns populated from source query aliases = destination column names)
- Enabled toggle
Source query is **not** on this screen — use `e` from the detail screen to edit it in `$EDITOR`.
### New Module Wizard (n)
- Source/destination connection selection
- Table browser: linked server, database, schema filter fields + Load button
- Real-time search/filter over loaded tables (DataTable)
- Auto-propose on table selection (generates query, DDL, strategy suggestions)
- Merge strategy, key, watermark, dest table fields
### History Screens (l, L)
Run table with status, rows, timing, error. Below: **separate** panels for source query and merge SQL (not combined). Error shown as red text. `v` opens selected run's SQL in editor. `esc` closes.
### Run Screen (r)
Streaming jrunner output via SSE. Shows real-time transfer progress.
## Concurrency Control
Each module has a `running` flag. Before starting a sync:
1. Check if module is already running — reject if so
2. Set `running = true`
3. Execute sync
4. Set `running = false` on success or failure
## Error Handling
- On module failure: log error to run_log, stop group execution
- No automatic retries
- Staging tables preserved for debugging
- Generated SQL logged to run_log for post-mortem analysis
## Security
- API: HTTP Basic Auth (username/password stored in settings table)
- Connection passwords: stored as env var references (e.g. `$DB2PW`) resolved at runtime
## Deployment
- Single directory install (`/opt/pipekit/`)
- Bootstrap config file (`config.yaml`)
- SQLite database (`pipekit.db`) — created on first run
- JDBC drivers directory
- Python dependencies via pip/venv
- Portable: copy the directory and you've moved the whole install
## Directory Structure
```
/opt/pipekit/
config.yaml # bootstrap config (only file-based config)
pipekit.db # SQLite — all config, queries, run history
drivers/ # JDBC .jar files
engine/
db.py # SQLite schema + CRUD operations
runner.py # Sync orchestration (staging, transfer, merge, hooks)
introspect.py # Source browsing, query generation, type mapping
api/
main.py # FastAPI app
tui/
app.py # Textual TUI
client.py # HTTP client for API
requirements.txt
```
## jrunner Fixes
- **NVARCHAR/NCHAR/NTEXT/NCLOB quoting** — added case labels to jrunner's INSERT builder type switch so Unicode string types get quoted correctly.
## Migration Path from Current Setup
1. Create connections for s7830956, usmidsql01, gpserver, localhost PostgreSQL
2. Import existing modules — parse shell scripts to extract query, dest table, strategy
3. Import orchestrators as groups
4. Set up schedules to match current crontab
5. Verify runs produce same results
6. Decommission shell scripts and cron entries
## TODO
- [ ] **Implement column mapping for watermark WHERE clause** — parse source query to build alias → source expression map, use source expression (not alias) in incremental WHERE clauses
- [ ] **Cancel running sync** — track PID, add cancel endpoint + TUI binding
- [ ] **Scheduler** — background thread in the API process evaluating cron expressions every minute
- [ ] **Email notifications** — SMTP on failure
- [ ] **Upsert + incremental combo** — pull only changed rows, then INSERT ON CONFLICT UPDATE
- [ ] **Module history — full audit** — expand module_history to track all field changes, store as JSON diff
### Resolved
- **Persistent staging tables**`pipekit_staging.{name}`, truncated before each run, left in place after
- **Global run log in TUI**`L` from main screen
- **Connection pooling** — not needed at current scale
- **Scheduler location** — built into the API process (background thread)
- **module_history scope** — track all field changes
- **`timestamp_column` renamed to `watermark_column`** — reflects actual purpose (any monotonic value, not just timestamps)
## Known Issues
- **Watermark WHERE clause uses alias instead of source column name**`WHERE dcord > 12345` should be `WHERE "DCORD#" > 12345`. Blocked on implementing the column mapping (top TODO item).
- **psql null display**`MAX()` on empty table can render as `∅` depending on locale. The null check must handle this.
- **Merge key stored as `dcord#` vs alias `dcord`** — historical data may have source column names stored where alias was intended. Merge key should always be the destination column name.

4
bin/pipekit Executable file
View File

@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Thin launcher: run `pipekit` from anywhere.
# -e: abort on error, -u: unset vars are errors, -o pipefail: pipelines fail fast.
set -euo pipefail
# exec replaces this shell with the Python process, so signals (SIGTERM etc.)
# reach pipekit directly instead of stopping at a wrapper shell.
exec python3 -m pipekit "$@"

9
config.yaml Normal file
View File

@ -0,0 +1,9 @@
database: /opt/pipekit/pipekit.db
jrunner_path: /usr/local/bin/jrunner
driver_dir: /opt/pipekit/drivers/
api_port: 8100
# smtp:
# host: smtp.example.com
# port: 587
# from: etl@example.com
# to: admin@example.com

1
pipekit/__init__.py Normal file
View File

@ -0,0 +1 @@
# Package version — surfaced by the API app factory (FastAPI version=__version__).
__version__ = "0.1.0"

3
pipekit/__main__.py Normal file
View File

@ -0,0 +1,3 @@
"""Entry point for ``python -m pipekit`` — delegates to the CLI."""
from .cli import main
# SystemExit propagates main()'s return value as the process exit code.
raise SystemExit(main())

3
pipekit/api/__init__.py Normal file
View File

@ -0,0 +1,3 @@
"""API package surface — the app factory is the only public name."""
from .app import create_app
__all__ = ["create_app"]

25
pipekit/api/app.py Normal file
View File

@ -0,0 +1,25 @@
"""FastAPI app factory.
JSON endpoints live under ``/api``. HTML pages (added in a later
increment) will live at ``/``. Keeping them separate avoids
content-negotiation complexity and keeps the API curl-testable.
"""
from __future__ import annotations
from fastapi import FastAPI
from .. import __version__, db, jrunner
from ..web import mount_web
from .routes import connections, introspect, modules, runs, system
def create_app() -> FastAPI:
    """Build and wire the FastAPI application.

    The system router (health/doctor-style endpoints) is mounted without a
    prefix; every JSON resource router lives under ``/api``; the web (HTML)
    layer is mounted last.
    """
    application = FastAPI(title="Pipekit", version=__version__)
    application.include_router(system.router)
    # All JSON resource routers share the /api prefix (keeps them curl-testable
    # and separate from the HTML pages, per the module docstring).
    for api_router in (connections.router, introspect.router,
                       modules.router, runs.router):
        application.include_router(api_router, prefix="/api")
    mount_web(application)
    return application

50
pipekit/api/auth.py Normal file
View File

@ -0,0 +1,50 @@
"""HTTP Basic auth. Credentials live in the ``settings`` table.
Auth is disabled by default so the API is usable out-of-the-box on
localhost. Flip it on per SPEC.md §"Auth" by setting
``api_auth_enabled: true`` in config.yaml and seeding the two settings::
pipekit set-password admin
The secret never leaves pipekit.db.
"""
from __future__ import annotations
import secrets
from fastapi import Depends, HTTPException, status
from fastapi.security import HTTPBasic, HTTPBasicCredentials
from .. import repo
from ..config import get_config
# auto_error=False: a request with no credentials reaches require_auth as
# None instead of an immediate 401 — needed so auth can be a no-op when
# api_auth_enabled is false in config.
_security = HTTPBasic(auto_error=False)
def require_auth(
    credentials: HTTPBasicCredentials | None = Depends(_security),
) -> str | None:
    """Return the authenticated username, or raise 401.

    Returns None (no identity) when auth is disabled in config.yaml.
    Fail-closed: if auth is enabled but api_user/api_pass settings are
    unset, every request is rejected rather than silently allowed.
    """
    enabled = bool(get_config().get("api_auth_enabled", False))
    if not enabled:
        return None
    expected_user = repo.get_setting("api_user") or ""
    expected_pass = repo.get_setting("api_pass") or ""
    # Missing credentials OR missing configured secrets → 401 with a
    # WWW-Authenticate challenge so browsers/curl prompt for Basic auth.
    if not credentials or not expected_user or not expected_pass:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="authentication required",
            headers={"WWW-Authenticate": "Basic"},
        )
    # compare_digest is constant-time; both comparisons run before branching
    # so the username check's outcome isn't observable via timing.
    user_ok = secrets.compare_digest(credentials.username, expected_user)
    pass_ok = secrets.compare_digest(credentials.password, expected_pass)
    if not (user_ok and pass_ok):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="invalid credentials",
            headers={"WWW-Authenticate": "Basic"},
        )
    return credentials.username

View File

View File

@ -0,0 +1,94 @@
"""Drivers + connections CRUD. Mirrors SPEC.md §"Resource CRUD" — GET/POST only."""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException
from ... import repo
from ..auth import require_auth
router = APIRouter(tags=["connections"], dependencies=[Depends(require_auth)])
# ---- drivers ----
@router.get("/drivers")
def list_drivers() -> list[dict]:
return repo.list_drivers()
@router.post("/drivers")
def create_driver(payload: dict) -> dict:
_require_fields(payload, ["name", "kind", "jar_file", "class_name"])
return repo.create_driver(
name=payload["name"], kind=payload["kind"],
jar_file=payload["jar_file"], class_name=payload["class_name"],
url_template=payload.get("url_template"),
)
# ---- connections ----
@router.get("/connections")
def list_connections() -> list[dict]:
return repo.list_connections()
@router.get("/connections/{connection_id}")
def get_connection(connection_id: int) -> dict:
conn = repo.get_connection(connection_id)
if conn is None:
raise HTTPException(404, f"connection id={connection_id} not found")
return conn
@router.post("/connections")
def create_connection(payload: dict) -> dict:
_require_fields(payload, ["name", "driver_id", "jdbc_url"])
return repo.create_connection(
name=payload["name"],
driver_id=payload["driver_id"],
jdbc_url=payload["jdbc_url"],
username=payload.get("username"),
password=payload.get("password"),
default_dest_connection_id=payload.get("default_dest_connection_id"),
default_dest_schema=payload.get("default_dest_schema"),
notes=payload.get("notes"),
)
@router.patch("/connections/{connection_id}")
def update_connection(connection_id: int, payload: dict) -> dict:
conn = repo.get_connection(connection_id)
if conn is None:
raise HTTPException(404, f"connection id={connection_id} not found")
return repo.update_connection(
connection_id,
name=payload.get("name"),
driver_id=int(payload["driver_id"]) if payload.get("driver_id") else None,
jdbc_url=payload.get("jdbc_url"),
username=payload.get("username"),
password=payload.get("password"),
default_dest_connection_id=int(payload["default_dest_connection_id"])
if payload.get("default_dest_connection_id") else None,
default_dest_schema=payload.get("default_dest_schema"),
notes=payload.get("notes"),
)
@router.delete("/connections/{connection_id}")
def delete_connection(connection_id: int) -> dict:
conn = repo.get_connection(connection_id)
if conn is None:
raise HTTPException(404, f"connection id={connection_id} not found")
try:
repo.delete_connection(connection_id)
except repo.ConnectionInUse as e:
raise HTTPException(409, str(e))
return {"deleted": connection_id}
def _require_fields(payload: dict, fields: list[str]) -> None:
missing = [f for f in fields if payload.get(f) in (None, "")]
if missing:
raise HTTPException(400, f"missing required fields: {', '.join(missing)}")

View File

@ -0,0 +1,94 @@
"""Introspection endpoints — back the wizard's remote-browsing steps.
Per-driver capabilities (SPEC.md §"Per-driver capability needed"):
- GET /api/drivers/{kind}/browse_fields qualifier schema
- GET /api/introspect/tables list tables/views
- GET /api/introspect/columns list columns for one table
All three go through the :class:`Driver` registry so the wizard never
branches on which database kind it's talking to.
"""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Query
from ... import drivers, jrunner, repo
from ..auth import require_auth
router = APIRouter(tags=["introspect"], dependencies=[Depends(require_auth)])
@router.get("/drivers/{kind}/browse_fields")
def driver_browse_fields(kind: str) -> list[dict]:
try:
drv = drivers.get_driver(kind)
except ValueError as e:
raise HTTPException(404, str(e))
return [
{"name": f.name, "label": f.label, "required": f.required,
"default": f.default, "help": f.help}
for f in drv.browse_fields()
]
@router.get("/introspect/tables")
def introspect_tables(connection_id: int = Query(...),
qualifier: list[str] = Query(default=[])) -> list[dict]:
"""List tables/views on the remote. `qualifier` entries are `name=value` pairs."""
conn, drv = _load_conn_and_driver(connection_id)
quals = _parse_qualifiers(qualifier, drv)
try:
tables = drv.list_tables(conn, **quals)
except (jrunner.JrunnerError, ValueError) as e:
raise HTTPException(502, f"list_tables failed: {e}")
return [t.to_dict() for t in tables]
@router.get("/introspect/columns")
def introspect_columns(connection_id: int = Query(...),
table: str = Query(...),
qualifier: list[str] = Query(default=[])) -> list[dict]:
conn, drv = _load_conn_and_driver(connection_id)
quals = _parse_qualifiers(qualifier, drv)
try:
cols = drv.get_columns(conn, table, **quals)
except (jrunner.JrunnerError, ValueError) as e:
raise HTTPException(502, f"get_columns failed: {e}")
return [c.to_dict() for c in cols]
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _load_conn_and_driver(connection_id: int):
    """Resolve a connection row plus its Driver adapter, or raise an HTTP error.

    404: unknown connection id. 500: the row points at a driver row that is
    missing, or a driver kind the registry no longer provides (both are data
    integrity problems, not client errors).
    """
    conn = repo.get_connection(connection_id)
    if conn is None:
        raise HTTPException(404, f"connection id={connection_id} not found")
    drow = repo.get_driver_row(conn["driver_id"])
    if drow is None:
        raise HTTPException(500, f"connection {connection_id} references missing driver")
    try:
        drv = drivers.get_driver(drow["kind"])
    except ValueError as e:
        # Chain the cause so logs keep the registry's message (PEP 3134 / B904).
        raise HTTPException(500, str(e)) from e
    return conn, drv
def _parse_qualifiers(pairs: list[str], drv: drivers.Driver) -> dict:
"""Turn ['schema=FOO', 'database=BAR'] into {'schema': 'FOO', ...},
restricted to names the driver declared in browse_fields()."""
allowed = {f.name for f in drv.browse_fields()}
out: dict = {}
for p in pairs:
if "=" not in p:
raise HTTPException(400, f"bad qualifier {p!r} — expected name=value")
name, _, value = p.partition("=")
name = name.strip()
if name not in allowed:
raise HTTPException(400, f"unknown qualifier {name!r} for driver "
f"{drv.kind} (allowed: {sorted(allowed)})")
if value:
out[name] = value
return out

View File

@ -0,0 +1,216 @@
"""Modules + operations (run, preview). Per SPEC.md §"Operation endpoints"."""
from __future__ import annotations
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from ... import engine, repo
from ...engine import watermark
from ...engine.merge import MergeError, build_merge_sql
from ..auth import require_auth
router = APIRouter(tags=["modules"], dependencies=[Depends(require_auth)])
@router.get("/modules")
def list_modules() -> list[dict]:
    """Every module row; watermarks/hooks are only attached by GET /modules/{id}."""
    return repo.list_modules()
@router.get("/modules/{module_id}")
def get_module(module_id: int) -> dict:
    """One module with its watermarks and hooks nested in, or 404."""
    module = repo.get_module(module_id)
    if module is None:
        raise HTTPException(404, f"module id={module_id} not found")
    module.update(
        watermarks=repo.list_watermarks(module_id),
        hooks=repo.list_hooks(module_id),
    )
    return module
@router.post("/modules")
def create_module(payload: dict) -> dict:
    """Create a module; merge settings are optional (strategy defaults to 'full')."""
    required = ("name", "source_connection_id", "dest_connection_id",
                "dest_table", "source_query")
    missing = [field for field in required if payload.get(field) in (None, "")]
    if missing:
        raise HTTPException(400, f"missing required fields: {', '.join(missing)}")
    kwargs = {field: payload[field] for field in required}
    kwargs["merge_strategy"] = payload.get("merge_strategy", "full")
    kwargs["merge_key"] = payload.get("merge_key")
    kwargs["staging_table"] = payload.get("staging_table")
    return repo.create_module(**kwargs)
@router.get("/modules/{module_id}/preview")
def preview_module(module_id: int) -> dict:
    """Resolve watermarks, build merge SQL. No sync — safe to poke."""
    module = repo.get_module(module_id)
    if module is None:
        raise HTTPException(404, f"module id={module_id} not found")
    try:
        wm_values = watermark.resolve_watermarks(module)
    except Exception as e:  # noqa: BLE001
        raise HTTPException(502, f"watermark resolver failed: {e}")
    resolved = watermark.materialise(module["source_query"], wm_values)
    try:
        merge_sql = build_merge_sql(
            strategy=module["merge_strategy"],
            dest_table=module["dest_table"],
            staging_table=module["staging_table"],
            merge_key=module["merge_key"],
        )
    except MergeError as e:
        raise HTTPException(400, str(e))
    return {
        "module_id": module_id,
        "watermark_values": wm_values,
        "resolved_source_sql": resolved,
        "merge_sql": merge_sql,
    }
@router.post("/modules/{module_id}/run")
def run_module(module_id: int, background: BackgroundTasks,
               dry_run: bool = False) -> dict:
    """Kick off a run. Returns run_id immediately (SPEC.md §"Async runs")."""
    if repo.get_module(module_id) is None:
        raise HTTPException(404, f"module id={module_id} not found")
    run_id = repo.create_run(module_id)
    background.add_task(_run_in_background, module_id, run_id, dry_run)
    return {"run_id": run_id}
def _run_in_background(module_id: int, run_id: int, dry_run: bool) -> None:
    """BackgroundTasks target: execute the run and always record an outcome.

    A background task has no caller to surface exceptions to, so every
    failure must be written back onto the run row — previously only
    LockBusy was handled, and any other exception left the run stuck in
    its initial status forever.
    """
    try:
        engine.run_module(module_id, run_id=run_id, dry_run=dry_run)
    except engine.LockBusy as e:
        repo.finish_run(run_id, status="error", error=str(e))
    except Exception as e:  # noqa: BLE001 — last resort: record, don't lose the run
        repo.finish_run(run_id, status="error", error=f"{type(e).__name__}: {e}")
# ---------------------------------------------------------------------------
# Watermarks — scoped to a module
# ---------------------------------------------------------------------------
# Fields a watermark POST must supply with non-empty values.
_WATERMARK_REQUIRED = ["name", "connection_id", "resolver_sql"]
@router.get("/modules/{module_id}/watermarks")
def list_watermarks(module_id: int) -> list[dict]:
    """All watermarks of one module; 404 when the module is unknown."""
    module = repo.get_module(module_id)
    if module is None:
        raise HTTPException(404, f"module id={module_id} not found")
    return repo.list_watermarks(module_id)
@router.post("/modules/{module_id}/watermarks")
def create_watermark(module_id: int, payload: dict) -> dict:
    """Attach a watermark to a module; see _WATERMARK_REQUIRED for mandatory fields."""
    if repo.get_module(module_id) is None:
        raise HTTPException(404, f"module id={module_id} not found")
    missing = [field for field in _WATERMARK_REQUIRED
               if payload.get(field) in (None, "")]
    if missing:
        raise HTTPException(400, f"missing required fields: {', '.join(missing)}")
    return repo.create_watermark(
        module_id=module_id,
        name=payload["name"],
        connection_id=int(payload["connection_id"]),
        resolver_sql=payload["resolver_sql"],
        default_value=payload.get("default_value"),
    )
@router.get("/watermarks/{watermark_id}")
def get_watermark(watermark_id: int) -> dict:
    """One watermark row, or 404."""
    row = repo.get_watermark(watermark_id)
    if row is None:
        raise HTTPException(404, f"watermark id={watermark_id} not found")
    return row
@router.patch("/watermarks/{watermark_id}")
def update_watermark(watermark_id: int, payload: dict) -> dict:
    """Partial update of a watermark; absent fields stay unchanged."""
    if repo.get_watermark(watermark_id) is None:
        raise HTTPException(404, f"watermark id={watermark_id} not found")
    raw_conn = payload.get("connection_id")
    return repo.update_watermark(
        watermark_id,
        name=payload.get("name"),
        connection_id=int(raw_conn) if raw_conn else None,
        resolver_sql=payload.get("resolver_sql"),
        default_value=payload.get("default_value"),
    )
@router.delete("/watermarks/{watermark_id}")
def delete_watermark(watermark_id: int) -> dict:
    """Delete one watermark; echoes the id on success, 404 otherwise."""
    deleted = repo.delete_watermark(watermark_id)
    if not deleted:
        raise HTTPException(404, f"watermark id={watermark_id} not found")
    return {"deleted": watermark_id}
# ---------------------------------------------------------------------------
# Hooks — scoped to a module
# ---------------------------------------------------------------------------
# Lifecycle stages a hook may be bound to via its run_on field.
_VALID_RUN_ON = {"success", "failure", "always"}
@router.get("/modules/{module_id}/hooks")
def list_hooks(module_id: int) -> list[dict]:
    """All hooks of one module; 404 when the module is unknown."""
    module = repo.get_module(module_id)
    if module is None:
        raise HTTPException(404, f"module id={module_id} not found")
    return repo.list_hooks(module_id)
@router.post("/modules/{module_id}/hooks")
def create_hook(module_id: int, payload: dict) -> dict:
    """Attach a SQL hook; `sql` is required, run_on defaults to 'success'."""
    if repo.get_module(module_id) is None:
        raise HTTPException(404, f"module id={module_id} not found")
    if not payload.get("sql"):
        raise HTTPException(400, "missing required field: sql")
    run_on = payload.get("run_on", "success")
    if run_on not in _VALID_RUN_ON:
        raise HTTPException(400, f"run_on must be one of {sorted(_VALID_RUN_ON)}")
    raw_conn = payload.get("connection_id")
    return repo.create_hook(
        module_id=module_id,
        sql=payload["sql"],
        run_order=int(payload.get("run_order", 0)),
        connection_id=int(raw_conn) if raw_conn else None,
        run_on=run_on,
    )
@router.get("/hooks/{hook_id}")
def get_hook(hook_id: int) -> dict:
    """One hook row, or 404."""
    hook = repo.get_hook(hook_id)
    if hook is None:
        raise HTTPException(404, f"hook id={hook_id} not found")
    return hook
@router.patch("/hooks/{hook_id}")
def update_hook(hook_id: int, payload: dict) -> dict:
    """Partial update of a hook; run_on is validated when supplied."""
    if repo.get_hook(hook_id) is None:
        raise HTTPException(404, f"hook id={hook_id} not found")
    run_on = payload.get("run_on")
    if run_on is not None and run_on not in _VALID_RUN_ON:
        raise HTTPException(400, f"run_on must be one of {sorted(_VALID_RUN_ON)}")
    raw_order = payload.get("run_order")
    raw_conn = payload.get("connection_id")
    return repo.update_hook(
        hook_id,
        run_order=int(raw_order) if raw_order is not None else None,
        connection_id=int(raw_conn) if raw_conn else None,
        sql=payload.get("sql"),
        run_on=run_on,
    )
@router.delete("/hooks/{hook_id}")
def delete_hook(hook_id: int) -> dict:
    """Delete one hook; echoes the id on success, 404 otherwise."""
    deleted = repo.delete_hook(hook_id)
    if not deleted:
        raise HTTPException(404, f"hook id={hook_id} not found")
    return {"deleted": hook_id}

View File

@ -0,0 +1,32 @@
"""Run log reads. Writes happen inside the engine."""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Query
from ... import repo
from ..auth import require_auth
router = APIRouter(tags=["runs"], dependencies=[Depends(require_auth)])
@router.get("/runs")
def list_runs(
    module_id: int | None = Query(None),
    status: str | None = Query(None),
    limit: int = Query(50, ge=1, le=500),
) -> list[dict]:
    """Run log entries, optionally filtered by module and/or status; limit 1-500."""
    return repo.list_runs(module_id=module_id, status=status, limit=limit)
@router.get("/runs/{run_id}")
def get_run(run_id: int) -> dict:
    """One run-log row, or 404."""
    run = repo.get_run(run_id)
    if run is None:
        raise HTTPException(404, f"run id={run_id} not found")
    return run
@router.get("/modules/{module_id}/runs")
def list_module_runs(module_id: int,
                     limit: int = Query(50, ge=1, le=500)) -> list[dict]:
    """Runs for one module.

    Fixes two inconsistencies with the sibling endpoints: an unknown
    module now 404s (matching the module-scoped watermark/hook routes),
    and `limit` is bounded 1-500 exactly like GET /runs.
    """
    if repo.get_module(module_id) is None:
        raise HTTPException(404, f"module id={module_id} not found")
    return repo.list_runs(module_id=module_id, limit=limit)

View File

@ -0,0 +1,25 @@
"""Health + doctor endpoints. /health is unauthenticated (SPEC.md §"System endpoints")."""
from __future__ import annotations
from fastapi import APIRouter
from ... import db, jrunner
router = APIRouter(tags=["system"])
@router.get("/health")
def health() -> dict:
    """Liveness probe — deliberately unauthenticated (this router has no auth dependency)."""
    return {"status": "ok"}
@router.get("/api/doctor")
def doctor() -> dict:
    """Readiness report: jrunner availability plus database health, with details."""
    checks = []
    for name, (ok, detail) in (("jrunner", jrunner.version()),
                               ("database", db.ping())):
        checks.append({"name": name, "ok": ok, "detail": detail})
    return {"ok": all(c["ok"] for c in checks), "checks": checks}

175
pipekit/cli.py Normal file
View File

@ -0,0 +1,175 @@
"""Pipekit CLI — `pipekit doctor`, `pipekit init`, later `serve` and `tui`."""
from __future__ import annotations
import argparse
import sys
from . import __version__
from . import db, drivers, engine, jrunner, repo
from .config import get_config
def cmd_init(args) -> int:
    """`pipekit init` — create/upgrade the SQLite schema at the configured path."""
    db.init_db()
    print(f"initialised {get_config().database}")
    return 0
def cmd_doctor(args) -> int:
    """`pipekit doctor` — check config, jrunner and database; exit 0 iff all pass."""
    checks: list[tuple[str, bool, str]] = []
    try:
        cfg = get_config()
    except Exception as e:
        # Without a config the other probes cannot run; report and bail.
        checks.append(("config", False, f"{type(e).__name__}: {e}"))
        _report(checks)
        return 1
    checks.append(("config", True, str(cfg.source)))
    for name, probe in (("jrunner", jrunner.version), ("database", db.ping)):
        ok, msg = probe()
        checks.append((name, ok, msg))
    return _report(checks)
def cmd_drivers_list(args) -> int:
    """`pipekit drivers list` — print every registered driver kind and label.

    Bug fix: the bare `max(...)` raised ValueError when the registry was
    empty; `default=0` keeps the command safe in that case.
    """
    kinds = drivers.available_kinds()
    width = max((len(kind) for kind, _ in kinds), default=0)
    print("available drivers:")
    for kind, label in kinds:
        print(f"  {kind.ljust(width)}  {label}")
    return 0
def cmd_drivers_show(args) -> int:
    """`pipekit drivers show KIND` — dump one driver's wizard browse fields."""
    try:
        d = drivers.get_driver(args.kind)
    except ValueError as e:
        print(f"error: {e}")
        return 1
    fields = d.browse_fields()
    print(f"driver: {d.kind}  {d.label}")
    print(f"wizard browse fields ({len(fields)}):")
    for f in fields:
        req = "required" if f.required else "optional"
        default = f"  default={f.default!r}" if f.default else ""
        # Bug fix: the help text used to be concatenated directly onto the
        # preceding token with no separator; give it its own leading gap.
        help_ = f"  {f.help}" if f.help else ""
        print(f"  {f.name:<16} {req:<8} [{f.label}]{default}{help_}")
    return 0
def cmd_run(args) -> int:
    """`pipekit run MODULE [--dry-run]` — run one module synchronously.

    Prints a one-line outcome summary, then the resolved source SQL,
    merge SQL and captured error text when present. Exit code is 0 only
    when the run finished with status "success".
    """
    module = repo.get_module_by_name(args.module)
    if module is None:
        print(f"error: module {args.module!r} not found")
        return 1
    try:
        outcome = engine.run_module(module["id"], dry_run=args.dry_run)
    except engine.LockBusy as e:
        # LockBusy → report and exit nonzero instead of raising.
        print(f"busy: {e}")
        return 1
    tag = "DRY RUN — no jrunner calls made" if args.dry_run else ""
    # Adjacent f-strings concatenate first, so .rstrip() trims the trailing
    # space left behind when `tag` is empty.
    print(f"run_id={outcome.run_id} status={outcome.status} "
          f"rows={outcome.row_count} {tag}".rstrip())
    print()
    if outcome.resolved_source_sql:
        print("-- resolved source SQL --")
        print(outcome.resolved_source_sql)
        print()
    if outcome.merge_sql:
        print("-- merge SQL --")
        print(outcome.merge_sql)
        print()
    if outcome.error:
        print("-- error --")
        print(outcome.error)
    return 0 if outcome.status == "success" else 1
def cmd_serve(args) -> int:
    """`pipekit serve` — run the FastAPI app under uvicorn (blocking)."""
    # Imported here so the web stack is only loaded for this sub-command.
    import uvicorn
    from .api import create_app

    port = args.port or get_config().api_port
    uvicorn.run(create_app(), host=args.host, port=port, reload=args.reload)
    return 0
def cmd_set_password(args) -> int:
    """`pipekit set-password USER` — store Basic Auth credentials in settings.

    NOTE(review): the password is persisted as-is in the settings table;
    confirm whether hashing is expected before exposing the API publicly.
    """
    import getpass
    pw = getpass.getpass(f"password for {args.username}: ")
    if not pw:
        print("error: empty password")
        return 1
    repo.set_setting("api_user", args.username)
    repo.set_setting("api_pass", pw)
    print(f"credentials saved for user {args.username!r}")
    print("(set `api_auth_enabled: true` in config.yaml to enforce)")
    return 0
def _report(checks) -> int:
width = max(len(name) for name, _, _ in checks)
failures = 0
for name, ok, msg in checks:
mark = "OK " if ok else "FAIL"
print(f" [{mark}] {name.ljust(width)} {msg}")
if not ok:
failures += 1
print()
if failures:
print(f"{failures} check(s) failed")
return 1
print("all checks passed")
return 0
def main(argv: list[str] | None = None) -> int:
    """Build the argparse tree and dispatch to the chosen sub-command.

    Returns the sub-command's exit code; `argv=None` means sys.argv.
    """
    p = argparse.ArgumentParser(prog="pipekit")
    p.add_argument("--version", action="version", version=f"pipekit {__version__}")
    sub = p.add_subparsers(dest="cmd", required=True)
    p_init = sub.add_parser("init", help="create/upgrade the SQLite schema")
    p_init.set_defaults(func=cmd_init)
    p_doc = sub.add_parser("doctor", help="check config, jrunner, database")
    p_doc.set_defaults(func=cmd_doctor)
    # `drivers` has its own nested sub-commands: list / show.
    p_drv = sub.add_parser("drivers", help="inspect the driver registry")
    drv_sub = p_drv.add_subparsers(dest="drv_cmd", required=True)
    p_drv_list = drv_sub.add_parser("list", help="list available drivers")
    p_drv_list.set_defaults(func=cmd_drivers_list)
    p_drv_show = drv_sub.add_parser("show", help="show a driver's wizard browse fields")
    p_drv_show.add_argument("kind", help="one of the kinds from `pipekit drivers list`")
    p_drv_show.set_defaults(func=cmd_drivers_show)
    p_run = sub.add_parser("run", help="run a module by name (synchronous)")
    p_run.add_argument("module", help="module name")
    p_run.add_argument("--dry-run", action="store_true",
                       help="build SQL but do not invoke jrunner")
    p_run.set_defaults(func=cmd_run)
    p_serve = sub.add_parser("serve", help="start the HTTP API")
    p_serve.add_argument("--host", default="127.0.0.1")
    p_serve.add_argument("--port", type=int, default=None,
                         help="defaults to config.yaml api_port")
    p_serve.add_argument("--reload", action="store_true")
    p_serve.set_defaults(func=cmd_serve)
    p_pw = sub.add_parser("set-password", help="set API Basic Auth credentials")
    p_pw.add_argument("username")
    p_pw.set_defaults(func=cmd_set_password)
    args = p.parse_args(argv)
    # Subparsers are required and every one sets `func`, so this is always present.
    return args.func(args)
if __name__ == "__main__":  # script entry point
    sys.exit(main())

41
pipekit/config.py Normal file
View File

@ -0,0 +1,41 @@
"""Bootstrap config loaded from config.yaml."""
from __future__ import annotations
import os
from functools import lru_cache
from pathlib import Path
import yaml
# Fallback config location when $PIPEKIT_CONFIG is unset (see get_config).
DEFAULT_PATH = "/opt/pipekit/config.yaml"
class Config:
    """Read-only accessor over the parsed config.yaml mapping.

    `source` records where the data was loaded from so diagnostics
    (e.g. `pipekit doctor`) can point at the file.
    """

    def __init__(self, data: dict, source: Path):
        self._data = data
        self.source = source

    @property
    def database(self) -> Path:
        """Path of the SQLite database file (required key 'database')."""
        raw = self._data["database"]
        return Path(raw)

    @property
    def jrunner_path(self) -> Path:
        """Path of the jrunner binary/JAR (required key 'jrunner_path')."""
        raw = self._data["jrunner_path"]
        return Path(raw)

    @property
    def api_port(self) -> int:
        """Port for `pipekit serve`; falls back to 8100."""
        raw = self._data.get("api_port", 8100)
        return int(raw)

    def get(self, key: str, default=None):
        """Raw access for optional keys without a dedicated property."""
        return self._data.get(key, default)
@lru_cache(maxsize=1)
def get_config() -> Config:
    """Load and cache the config file for the process lifetime.

    The path comes from $PIPEKIT_CONFIG when set, else DEFAULT_PATH.
    Because of lru_cache(maxsize=1), edits to config.yaml are not picked
    up until restart.

    Raises:
        FileNotFoundError: when no file exists at the resolved path.
    """
    path = Path(os.environ.get("PIPEKIT_CONFIG", DEFAULT_PATH))
    if not path.exists():
        raise FileNotFoundError(f"Pipekit config not found: {path}")
    with open(path) as f:
        # An empty YAML file parses to None; normalise to {} so Config works.
        return Config(yaml.safe_load(f) or {}, path)

76
pipekit/db.py Normal file
View File

@ -0,0 +1,76 @@
"""SQLite connection + schema init.
Higher-level CRUD helpers live in later modules (per resource). This module
only owns: opening a connection, committing transactions, and creating the
schema from schema.sql.
"""
from __future__ import annotations
import sqlite3
from contextlib import contextmanager
from pathlib import Path
from .config import get_config
# DDL file shipped alongside this module; executed by init_db().
SCHEMA_PATH = Path(__file__).parent / "schema.sql"
def init_db(db_path: Path | None = None) -> None:
    """Create or upgrade the schema at `db_path` (default: configured DB path).

    Parent directories are created as needed; schema.sql is executed first,
    then the idempotent column migrations, then one commit."""
    target = db_path or get_config().database
    target.parent.mkdir(parents=True, exist_ok=True)
    schema_sql = SCHEMA_PATH.read_text()
    conn = sqlite3.connect(target)
    try:
        conn.executescript(schema_sql)
        _apply_migrations(conn)
        conn.commit()
    finally:
        conn.close()
def _apply_migrations(conn: sqlite3.Connection) -> None:
"""Idempotent ALTERs for columns added after initial release. SQLite has
no IF NOT EXISTS on ADD COLUMN, so we introspect first."""
cols = {r[1] for r in conn.execute("PRAGMA table_info(module)")}
if "columns_json" not in cols:
conn.execute("ALTER TABLE module ADD COLUMN columns_json TEXT")
if "dest_description" not in cols:
conn.execute("ALTER TABLE module ADD COLUMN dest_description TEXT")
@contextmanager
def connect(db_path: Path | None = None):
path = db_path or get_config().database
conn = sqlite3.connect(path)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA foreign_keys = ON")
try:
yield conn
conn.commit()
except Exception:
conn.rollback()
raise
finally:
conn.close()
def ping() -> tuple[bool, str]:
    """Return (ok, message). Used by pipekit doctor."""
    try:
        path = get_config().database
        if not path.exists():
            return False, f"database file missing: {path} (run `pipekit init`)"
        with connect(path) as c:
            rows = c.execute(
                "SELECT name FROM sqlite_master WHERE type='table' "
                "AND name NOT LIKE 'sqlite_%' ORDER BY name"
            )
            tables = [r[0] for r in rows]
        expected = {"connection", "driver", "grp", "group_member", "group_run",
                    "hook", "module", "run_log", "schedule", "settings", "watermark"}
        missing = expected - set(tables)
        if missing:
            return False, f"schema incomplete — missing: {', '.join(sorted(missing))}"
        return True, f"{path} ({len(tables)} tables)"
    except Exception as e:
        # Doctor output must never raise; fold any failure into the message.
        return False, f"{type(e).__name__}: {e}"

View File

@ -0,0 +1,32 @@
"""Driver registry — one :class:`Driver` instance per kind."""
from __future__ import annotations
from .base import (BrowseField, Driver, RemoteColumn, RemoteTable,
validate_identifier)
from .db2 import DB2Driver
from .mssql import MSSQLDriver
from .pg import PGDriver
# One shared, stateless instance per driver kind; keys are each class's
# `kind` attribute, which must match driver.kind values stored in the DB.
_REGISTRY: dict[str, Driver] = {
    DB2Driver.kind: DB2Driver(),
    MSSQLDriver.kind: MSSQLDriver(),
    PGDriver.kind: PGDriver(),
}
def get_driver(kind: str) -> Driver:
    """Return the shared Driver for `kind`; ValueError lists the known kinds."""
    if kind in _REGISTRY:
        return _REGISTRY[kind]
    known = ", ".join(sorted(_REGISTRY))
    raise ValueError(f"unknown driver kind {kind!r} (known: {known})")
def available_kinds() -> list[tuple[str, str]]:
    """Return [(kind, label), ...] for every registered driver."""
    return [(driver.kind, driver.label) for driver in _REGISTRY.values()]
# Public surface of the drivers package.
__all__ = ["BrowseField", "Driver", "RemoteColumn", "RemoteTable",
           "validate_identifier", "get_driver", "available_kinds"]

149
pipekit/drivers/base.py Normal file
View File

@ -0,0 +1,149 @@
"""The Driver contract.
Every database kind (DB2, MSSQL, Postgres, ...) implements :class:`Driver`
so the rest of Pipekit (wizard, engine, API) never branches on which
database it is talking to. See SPEC.md §"Per-driver capability needed".
"""
from __future__ import annotations
import abc
import re
from dataclasses import dataclass, field
from typing import ClassVar
from .. import jrunner
# ---------------------------------------------------------------------------
# Plain data shapes returned by every driver
# ---------------------------------------------------------------------------
@dataclass
class BrowseField:
    """One qualifier field rendered by the wizard's Step-1 form.

    DB2 exposes `[schema]`; MSSQL exposes `[linked_server, database, schema]`.
    The TUI renders whatever the driver returns, so the wizard code does not
    need to know which database kind is underneath.
    """
    name: str                    # machine name; also the accepted qualifier key
    label: str                   # human-readable form label
    required: bool = False
    default: str | None = None   # pre-filled value, e.g. "dbo"
    help: str | None = None      # short hint shown next to the input
@dataclass
class RemoteTable:
    """A table or view discovered on the remote side."""

    schema: str
    name: str
    kind: str        # "table" | "view"
    full_name: str   # already qualified for a FROM clause

    def to_dict(self) -> dict:
        """JSON-friendly form for the API layer."""
        return {attr: getattr(self, attr)
                for attr in ("schema", "name", "kind", "full_name")}
@dataclass
class RemoteColumn:
name: str
type_raw: str # e.g. "DECIMAL(15,4)", "CHAR", "VARCHAR(40)"
position: int
nullable: bool = True
description: str | None = None # source-side column remark, if any
def to_dict(self) -> dict:
return {"name": self.name, "type_raw": self.type_raw,
"position": self.position, "nullable": self.nullable,
"description": self.description}
# ---------------------------------------------------------------------------
# Identifier safety — jrunner has no bind params, so qualifier values get
# interpolated into SQL. Accept only characters real databases use in
# identifiers; reject everything else before it reaches a query.
# ---------------------------------------------------------------------------
_SAFE_IDENT = re.compile(r"^[A-Za-z_][A-Za-z0-9_$#]*$")


def validate_identifier(value: str, field_name: str = "identifier") -> str:
    """Return `value` unchanged when it matches the safe-identifier pattern.

    Raises ValueError otherwise — including for non-string input."""
    if isinstance(value, str) and _SAFE_IDENT.match(value):
        return value
    raise ValueError(f"invalid {field_name}: {value!r}")
# ---------------------------------------------------------------------------
# The Driver contract
# ---------------------------------------------------------------------------
class Driver(abc.ABC):
    """Stateless per-dialect adapter.

    Connection info (url/user/password) is passed in to the methods that
    need to run SQL; everything else is pure logic.
    """
    kind: ClassVar[str]   # "db2" | "mssql" | "pg" — must match driver.kind in DB
    label: ClassVar[str]  # human-readable for the TUI
    # ---- Wizard Step 1 ----
    @abc.abstractmethod
    def browse_fields(self) -> list[BrowseField]:
        """Qualifier fields the wizard needs to scope a table search."""
    @abc.abstractmethod
    def list_tables(self, conn: dict, **qualifiers) -> list[RemoteTable]:
        """Fetch tables/views matching the qualifiers."""
    @abc.abstractmethod
    def get_columns(self, conn: dict, table: str, **qualifiers) -> list[RemoteColumn]:
        """Fetch column metadata for one table."""
    def describe_table(self, conn: dict, table: str, **qualifiers) -> str | None:
        """Return the source-side table-level description/remark, or None.

        Default implementation returns None — drivers opt in by overriding."""
        return None
    @abc.abstractmethod
    def qualified_table_name(self, table: str, **qualifiers) -> str:
        """Build the FROM-clause identifier (e.g. 'RLDBF12.QCUSTCDT' or
        '[link].[db].[dbo].[orders]'). Wizard-time only — the result is baked
        into `module.source_query` and never re-derived."""
    # ---- Dialect-specific SQL shaping ----
    @abc.abstractmethod
    def quote_identifier(self, name: str) -> str:
        """Wrap a column/table name in the dialect's quoting scheme if needed."""
    @abc.abstractmethod
    def default_expression(self, type_raw: str, column_name: str) -> str:
        """Default source-side expression for a column. Usually the bare
        column; but char types get RTRIM, sentinel-dated columns get a CASE
        that maps '0001-01-01'/'9999-12-31' to NULL, etc."""
    @abc.abstractmethod
    def map_type(self, type_raw: str) -> str:
        """Map a source type string to the destination DDL type. Current
        target assumption is PostgreSQL; generalise later if needed."""
    def build_create_table_sql(self, qualified_table: str,
                               columns: list[dict]) -> str:
        """Generate CREATE TABLE IF NOT EXISTS SQL for a destination table.

        ``columns`` is a list of ``{dest_name, dest_type}`` dicts.
        Default implementation raises — only destination drivers (PG today)
        need to implement it."""
        raise NotImplementedError(
            f"driver {self.kind!r} does not implement build_create_table_sql "
            "(not a supported destination)")
    # ---- Shared helper ----
    def query(self, conn: dict, sql: str) -> jrunner.QueryResult:
        """Run `sql` in jrunner query mode against `conn`.

        `conn` is a connection-row mapping: 'jdbc_url' is required,
        'username'/'password' are optional (read with .get())."""
        return jrunner.query(
            conn["jdbc_url"], conn.get("username"), conn.get("password"), sql,
        )

145
pipekit/drivers/db2.py Normal file
View File

@ -0,0 +1,145 @@
"""IBM i / DB2 for i driver (jt400)."""
from __future__ import annotations
from .base import (BrowseField, Driver, RemoteColumn, RemoteTable,
validate_identifier)
# Char-ish source types whose values get RTRIM'd in default_expression()
# (fixed-width fields are typically space-padded — hence the trim).
_TEXT_TYPES = {"char", "varchar", "nchar", "nvarchar", "graphic", "vargraphic",
               "clob", "nclob"}
# Types that receive the sentinel-date-to-NULL CASE in default_expression().
_DATE_TYPES = {"date"}
# DB2-for-i base type -> destination (PostgreSQL) DDL type; see map_type().
_TYPE_MAP = {
    "smallint": "smallint", "integer": "integer", "int": "integer",
    "bigint": "bigint",
    "decimal": "numeric", "numeric": "numeric",
    "real": "real", "float": "double precision", "double": "double precision",
    "char": "text", "varchar": "text", "nchar": "text", "nvarchar": "text",
    "graphic": "text", "vargraphic": "text", "clob": "text", "nclob": "text",
    "date": "date", "time": "time", "timestamp": "timestamp",
    "blob": "bytea", "binary": "bytea", "varbinary": "bytea",
    "rowid": "text",
}
_SAFE_IDENT_CHARS = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
def _base(type_raw: str) -> str:
return type_raw.lower().split("(", 1)[0].strip()
def _needs_quoting(name: str) -> bool:
return bool(name) and (not name[0].isalpha() and name[0] != "_"
or any(c not in _SAFE_IDENT_CHARS for c in name))
class DB2Driver(Driver):
    """Driver for IBM i / DB2 for i, introspecting via the QSYS2 catalog views."""

    kind = "db2"
    label = "IBM i / DB2 for i"

    def browse_fields(self) -> list[BrowseField]:
        """Single required qualifier: the library/schema to browse."""
        return [
            BrowseField(name="schema", label="Schema / library",
                        required=True,
                        help="e.g. RLDBF12"),
        ]

    def list_tables(self, conn, *, schema: str) -> list[RemoteTable]:
        """Tables and views in one schema, read from QSYS2.SYSTABLES."""
        # schema is interpolated into SQL (jrunner has no bind params), so it
        # must pass the safe-identifier check first.
        validate_identifier(schema, "schema")
        sql = (
            "SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
            "FROM QSYS2.SYSTABLES "
            f"WHERE TABLE_SCHEMA = '{schema}' "
            "ORDER BY TABLE_NAME"
        )
        result = self.query(conn, sql)
        tables: list[RemoteTable] = []
        for row in result.rows:
            if len(row) < 3:  # skip malformed/short rows instead of raising
                continue
            sch, name, ttype = row[0].strip(), row[1].strip(), row[2].strip()
            # TABLE_TYPE codes 'L' and 'V' are presented as views; every other
            # code is treated as a table.
            kind = "view" if ttype in ("L", "V") else "table"
            tables.append(RemoteTable(
                schema=sch, name=name, kind=kind,
                full_name=self.qualified_table_name(name, schema=sch),
            ))
        return tables

    def get_columns(self, conn, table: str, *, schema: str) -> list[RemoteColumn]:
        """Column metadata (including remarks) from QSYS2.SYSCOLUMNS."""
        validate_identifier(schema, "schema")
        validate_identifier(table, "table")
        sql = (
            "SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION, IS_NULLABLE, "
            "       LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, "
            "       COALESCE(COLUMN_TEXT, COLUMN_HEADING, '') "
            "FROM QSYS2.SYSCOLUMNS "
            f"WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}' "
            "ORDER BY ORDINAL_POSITION"
        )
        result = self.query(conn, sql)
        cols: list[RemoteColumn] = []
        for row in result.rows:
            if len(row) < 4:  # skip malformed/short rows instead of raising
                continue
            name, dtype, pos, nullable = [c.strip() for c in row[:4]]
            # Trailing cells are optional; tolerate rows that omit them.
            length = row[4].strip() if len(row) > 4 else ""
            prec = row[5].strip() if len(row) > 5 else ""
            scale = row[6].strip() if len(row) > 6 else ""
            desc = row[7].strip() if len(row) > 7 else ""
            type_raw = _format_type(dtype, length, prec, scale)
            cols.append(RemoteColumn(
                name=name, type_raw=type_raw,
                position=int(pos), nullable=(nullable.upper() == "Y"),
                description=desc or None,
            ))
        return cols

    def describe_table(self, conn, table: str, *, schema: str) -> str | None:
        """Table-level remark (TABLE_TEXT/LONG_COMMENT), or None when blank."""
        validate_identifier(schema, "schema")
        validate_identifier(table, "table")
        sql = (
            "SELECT COALESCE(TABLE_TEXT, LONG_COMMENT, '') "
            "FROM QSYS2.SYSTABLES "
            f"WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}' "
            "FETCH FIRST 1 ROWS ONLY"
        )
        result = self.query(conn, sql)
        if not result.rows or not result.rows[0]:
            return None
        v = result.rows[0][0].strip()
        return v or None

    def qualified_table_name(self, table: str, *, schema: str) -> str:
        """'SCHEMA.TABLE', each part quoted only when necessary."""
        return f"{self.quote_identifier(schema)}.{self.quote_identifier(table)}"

    def quote_identifier(self, name: str) -> str:
        """Double-quote `name` (escaping embedded quotes) only when required."""
        if _needs_quoting(name):
            return '"' + name.replace('"', '""') + '"'
        return name

    def default_expression(self, type_raw: str, column_name: str) -> str:
        """RTRIM for char-ish types; sentinel dates mapped to NULL; else bare column."""
        col = self.quote_identifier(column_name)
        base = _base(type_raw)
        if base in _TEXT_TYPES:
            return f"RTRIM({col})"
        if base in _DATE_TYPES:
            return (f"CASE WHEN {col} IN (DATE('0001-01-01'), DATE('9999-12-31')) "
                    f"THEN NULL ELSE {col} END")
        return col

    def map_type(self, type_raw: str) -> str:
        """Destination (PostgreSQL) type; numeric keeps its '(p,s)' when present."""
        base = _base(type_raw)
        mapped = _TYPE_MAP.get(base, "text")
        if mapped == "numeric" and "(" in type_raw:
            return "numeric" + type_raw[type_raw.index("("):]
        return mapped
def _format_type(dtype: str, length: str, prec: str, scale: str) -> str:
base = dtype.upper()
if base in ("DECIMAL", "NUMERIC") and prec:
return f"{base}({prec},{scale or '0'})"
if base in ("CHAR", "VARCHAR", "NCHAR", "NVARCHAR",
"GRAPHIC", "VARGRAPHIC") and length:
return f"{base}({length})"
return base

228
pipekit/drivers/mssql.py Normal file
View File

@ -0,0 +1,228 @@
"""Microsoft SQL Server driver (mssql-jdbc).
Structured qualifiers instead of the pre-rewrite dotted-string hack: each
field linked server, database, schema is a separate form input, and
only the ones the user fills in show up in the generated FROM clause.
"""
from __future__ import annotations
from .base import (BrowseField, Driver, RemoteColumn, RemoteTable,
validate_identifier)
# Source types whose default expression is RTRIM(col); see default_expression().
_TEXT_TYPES = {"char", "varchar", "nchar", "nvarchar", "text", "ntext"}
# MSSQL base type -> destination (PostgreSQL) DDL type; see map_type().
_TYPE_MAP = {
    "tinyint": "smallint", "smallint": "smallint",
    "int": "integer", "integer": "integer", "bigint": "bigint",
    "decimal": "numeric", "numeric": "numeric",
    "money": "numeric(19,4)", "smallmoney": "numeric(10,4)",
    "real": "real", "float": "double precision",
    "char": "text", "varchar": "text", "nchar": "text", "nvarchar": "text",
    "text": "text", "ntext": "text",
    "date": "date", "datetime": "timestamp", "datetime2": "timestamp",
    "smalldatetime": "timestamp", "datetimeoffset": "timestamptz",
    "time": "time",
    "bit": "boolean",
    "binary": "bytea", "varbinary": "bytea", "image": "bytea",
    "uniqueidentifier": "uuid",
}
def _base(type_raw: str) -> str:
return type_raw.lower().split("(", 1)[0].strip()
class MSSQLDriver(Driver):
kind = "mssql"
label = "Microsoft SQL Server"
    def browse_fields(self) -> list[BrowseField]:
        """Wizard Step-1 qualifiers: all optional; schema pre-filled with 'dbo'."""
        return [
            BrowseField(name="linked_server", label="Linked server",
                        required=False,
                        help="only for cross-server lookups; usually blank"),
            BrowseField(name="database", label="Database",
                        required=False,
                        help="leave blank to use the connection's current DB"),
            BrowseField(name="schema", label="Schema",
                        required=False, default="dbo"),
        ]
    def list_tables(
        self, conn, *, linked_server: str | None = None,
        database: str | None = None, schema: str | None = None,
    ) -> list[RemoteTable]:
        """List base tables and views via INFORMATION_SCHEMA.TABLES.

        All qualifiers are optional; each supplied one is identifier-
        validated before being interpolated (jrunner has no bind params).
        The returned full_name bakes in whatever qualifiers were given.
        """
        self._validate(linked_server, database, schema)
        prefix = self._info_schema_prefix(linked_server, database)
        where = ["TABLE_TYPE IN ('BASE TABLE','VIEW')"]
        if schema:
            where.append(f"TABLE_SCHEMA = '{schema}'")
        sql = (
            f"SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
            f"FROM {prefix}INFORMATION_SCHEMA.TABLES "
            f"WHERE {' AND '.join(where)} "
            f"ORDER BY TABLE_SCHEMA, TABLE_NAME"
        )
        result = self.query(conn, sql)
        tables: list[RemoteTable] = []
        for row in result.rows:
            if len(row) < 3:  # skip malformed/short rows instead of raising
                continue
            sch, name, ttype = row[0].strip(), row[1].strip(), row[2].strip()
            kind = "view" if ttype.upper() == "VIEW" else "table"
            tables.append(RemoteTable(
                schema=sch, name=name, kind=kind,
                full_name=self.qualified_table_name(
                    name, schema=sch, database=database,
                    linked_server=linked_server),
            ))
        return tables
    def get_columns(
        self, conn, table: str, *, linked_server: str | None = None,
        database: str | None = None, schema: str | None = None,
    ) -> list[RemoteColumn]:
        """Column metadata from INFORMATION_SCHEMA.COLUMNS, plus
        MS_Description remarks when the target is the local server."""
        validate_identifier(table, "table")
        self._validate(linked_server, database, schema)
        prefix = self._info_schema_prefix(linked_server, database)
        where = [f"TABLE_NAME = '{table}'"]
        if schema:
            where.append(f"TABLE_SCHEMA = '{schema}'")
        sql = (
            f"SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION, IS_NULLABLE, "
            f"       CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE "
            f"FROM {prefix}INFORMATION_SCHEMA.COLUMNS "
            f"WHERE {' AND '.join(where)} "
            f"ORDER BY ORDINAL_POSITION"
        )
        result = self.query(conn, sql)
        cols: list[RemoteColumn] = []
        for row in result.rows:
            if len(row) < 4:  # skip malformed/short rows instead of raising
                continue
            name, dtype, pos, nullable = [c.strip() for c in row[:4]]
            # Trailing cells are optional; tolerate rows that omit them.
            length = row[4].strip() if len(row) > 4 else ""
            prec = row[5].strip() if len(row) > 5 else ""
            scale = row[6].strip() if len(row) > 6 else ""
            type_raw = _format_type(dtype, length, prec, scale)
            cols.append(RemoteColumn(
                name=name, type_raw=type_raw,
                position=int(pos), nullable=(nullable.upper() == "YES"),
            ))
        # Extended-property descriptions live in sys.extended_properties,
        # which isn't available over a linked-server call from this side.
        if not linked_server:
            descs = self._column_descriptions(conn, table, database=database,
                                              schema=schema or "dbo")
            for c in cols:
                c.description = descs.get(c.name) or None
        return cols
    def describe_table(
        self, conn, table: str, *, linked_server: str | None = None,
        database: str | None = None, schema: str | None = None,
    ) -> str | None:
        """Table-level MS_Description extended property, or None.

        Linked-server targets always return None: sys.extended_properties
        isn't reachable across a linked-server call from this side.
        """
        validate_identifier(table, "table")
        self._validate(linked_server, database, schema)
        if linked_server:
            return None
        sch = schema or "dbo"
        db_prefix = f"[{database}]." if database else ""
        sql = (
            f"SELECT CAST(ep.value AS NVARCHAR(MAX)) "
            f"FROM {db_prefix}sys.extended_properties ep "
            f"JOIN {db_prefix}sys.tables t ON t.object_id = ep.major_id "
            f"JOIN {db_prefix}sys.schemas s ON s.schema_id = t.schema_id "
            f"WHERE ep.class = 1 AND ep.minor_id = 0 "
            f"AND ep.name = 'MS_Description' "
            f"AND s.name = '{sch}' AND t.name = '{table}'"
        )
        result = self.query(conn, sql)
        if not result.rows or not result.rows[0]:
            return None
        v = result.rows[0][0].strip()
        return v or None
def _column_descriptions(
self, conn, table: str, *, database: str | None, schema: str,
) -> dict[str, str]:
db_prefix = f"[{database}]." if database else ""
sql = (
f"SELECT c.name, CAST(ep.value AS NVARCHAR(MAX)) "
f"FROM {db_prefix}sys.extended_properties ep "
f"JOIN {db_prefix}sys.columns c "
f" ON c.object_id = ep.major_id AND c.column_id = ep.minor_id "
f"JOIN {db_prefix}sys.tables t ON t.object_id = c.object_id "
f"JOIN {db_prefix}sys.schemas s ON s.schema_id = t.schema_id "
f"WHERE ep.class = 1 AND ep.name = 'MS_Description' "
f"AND s.name = '{schema}' AND t.name = '{table}'"
)
result = self.query(conn, sql)
out: dict[str, str] = {}
for row in result.rows:
if len(row) < 2:
continue
name = row[0].strip()
desc = row[1].strip()
if name and desc:
out[name] = desc
return out
def qualified_table_name(
self, table: str, *, linked_server: str | None = None,
database: str | None = None, schema: str | None = None,
) -> str:
parts = []
if linked_server:
parts.append(self.quote_identifier(linked_server))
parts.append(self.quote_identifier(database or ""))
elif database:
parts.append(self.quote_identifier(database))
parts.append(self.quote_identifier(schema or "dbo"))
parts.append(self.quote_identifier(table))
return ".".join(parts)
def quote_identifier(self, name: str) -> str:
if not name:
return ""
return "[" + name.replace("]", "]]") + "]"
def default_expression(self, type_raw: str, column_name: str) -> str:
col = self.quote_identifier(column_name)
if _base(type_raw) in _TEXT_TYPES:
return f"RTRIM({col})"
return col
def map_type(self, type_raw: str) -> str:
base = _base(type_raw)
mapped = _TYPE_MAP.get(base, "text")
if mapped == "numeric" and "(" in type_raw:
return "numeric" + type_raw[type_raw.index("("):]
return mapped
# ---- helpers ----
def _validate(self, linked_server, database, schema):
if linked_server:
validate_identifier(linked_server, "linked_server")
if database:
validate_identifier(database, "database")
if schema:
validate_identifier(schema, "schema")
def _info_schema_prefix(self, linked_server, database) -> str:
if linked_server:
return f"[{linked_server}].[{database or ''}]."
if database:
return f"[{database}]."
return ""
def _format_type(dtype: str, length: str, prec: str, scale: str) -> str:
base = dtype.upper()
if base in ("DECIMAL", "NUMERIC") and prec:
return f"{base}({prec},{scale or '0'})"
if base in ("CHAR", "VARCHAR", "NCHAR", "NVARCHAR") and length and length != "-1":
return f"{base}({length})"
return base

167
pipekit/drivers/pg.py Normal file
View File

@ -0,0 +1,167 @@
"""PostgreSQL driver (also used as a destination target)."""
from __future__ import annotations
from .base import (BrowseField, Driver, RemoteColumn, RemoteTable,
validate_identifier)
# Source-type -> dest-type map used by PGDriver.map_type.
_TYPE_MAP = {
    # Mostly identity — PG is the usual destination target, so mapping a PG
    # source to PG dest is near-passthrough.
    # Integer family (including internal aliases int2/int4/int8).
    "smallint": "smallint", "integer": "integer", "bigint": "bigint",
    "int": "integer", "int2": "smallint", "int4": "integer", "int8": "bigint",
    # Exact and inexact numerics.
    "numeric": "numeric", "decimal": "numeric",
    "real": "real", "double precision": "double precision",
    "float4": "real", "float8": "double precision",
    # All character types collapse to text on the dest side.
    "text": "text", "varchar": "text", "char": "text", "bpchar": "text",
    "character varying": "text", "character": "text",
    # Dates and times.
    "date": "date", "timestamp": "timestamp",
    "timestamp without time zone": "timestamp",
    "timestamp with time zone": "timestamptz", "timestamptz": "timestamptz",
    "time": "time",
    # Everything else passes through unchanged.
    "boolean": "boolean", "bool": "boolean",
    "bytea": "bytea",
    "uuid": "uuid",
    "json": "json", "jsonb": "jsonb",
}
def _base(type_raw: str) -> str:
return type_raw.lower().split("(", 1)[0].strip()
class PGDriver(Driver):
    """Driver for PostgreSQL sources (and Pipekit's usual destination target)."""

    kind = "pg"
    label = "PostgreSQL"

    def browse_fields(self) -> list[BrowseField]:
        """Wizard browse form: a single optional schema field (default 'public')."""
        return [
            BrowseField(name="schema", label="Schema",
                        required=False, default="public"),
        ]

    def list_tables(self, conn, *, schema: str | None = None) -> list[RemoteTable]:
        """List user tables and views, optionally restricted to one schema.

        System schemas are always excluded. *schema* is validated before
        being interpolated into the WHERE clause.
        """
        if schema:
            validate_identifier(schema, "schema")
        where = ["table_schema NOT IN ('pg_catalog','information_schema')"]
        if schema:
            where.append(f"table_schema = '{schema}'")
        sql = (
            "SELECT table_schema, table_name, table_type "
            "FROM information_schema.tables "
            f"WHERE {' AND '.join(where)} "
            "ORDER BY table_schema, table_name"
        )
        result = self.query(conn, sql)
        tables: list[RemoteTable] = []
        for row in result.rows:
            if len(row) < 3:
                continue  # malformed CSV line — skip rather than crash
            sch, name, ttype = row[0].strip(), row[1].strip(), row[2].strip()
            kind = "view" if ttype.upper() == "VIEW" else "table"
            tables.append(RemoteTable(
                schema=sch, name=name, kind=kind,
                full_name=self.qualified_table_name(name, schema=sch),
            ))
        return tables

    def get_columns(
        self, conn, table: str, *, schema: str | None = None,
    ) -> list[RemoteColumn]:
        """Columns of one table in ordinal order, with col_description comments."""
        validate_identifier(table, "table")
        if schema:
            validate_identifier(schema, "schema")
        sch = schema or "public"
        where = [f"c.table_name = '{table}'", f"c.table_schema = '{sch}'"]
        # col_description() needs the table's regclass; quote_ident guards
        # mixed-case names inside the cast.
        sql = (
            "SELECT c.column_name, c.data_type, c.ordinal_position, c.is_nullable, "
            "  c.character_maximum_length, c.numeric_precision, c.numeric_scale, "
            "  COALESCE(pg_catalog.col_description("
            "    (quote_ident(c.table_schema) || '.' || quote_ident(c.table_name))::regclass, "
            "    c.ordinal_position::int), '') "
            "FROM information_schema.columns c "
            f"WHERE {' AND '.join(where)} "
            "ORDER BY c.ordinal_position"
        )
        result = self.query(conn, sql)
        cols: list[RemoteColumn] = []
        for row in result.rows:
            if len(row) < 4:
                continue  # malformed CSV line — skip it
            name, dtype, pos, nullable = [c.strip() for c in row[:4]]
            length = row[4].strip() if len(row) > 4 else ""
            prec = row[5].strip() if len(row) > 5 else ""
            scale = row[6].strip() if len(row) > 6 else ""
            desc = row[7].strip() if len(row) > 7 else ""
            type_raw = _format_type(dtype, length, prec, scale)
            cols.append(RemoteColumn(
                name=name, type_raw=type_raw,
                position=int(pos), nullable=(nullable.upper() == "YES"),
                description=desc or None,
            ))
        return cols

    def describe_table(
        self, conn, table: str, *, schema: str | None = None,
    ) -> str | None:
        """Return the table's COMMENT (obj_description), or None when absent.

        NOTE(review): the ::regclass cast raises if the table does not
        exist — confirm callers only pass tables seen via list_tables.
        """
        validate_identifier(table, "table")
        if schema:
            validate_identifier(schema, "schema")
        sch = schema or "public"
        sql = (
            "SELECT COALESCE(pg_catalog.obj_description("
            f"  (quote_ident('{sch}') || '.' || quote_ident('{table}'))::regclass, "
            "  'pg_class'), '')"
        )
        result = self.query(conn, sql)
        if not result.rows or not result.rows[0]:
            return None
        v = result.rows[0][0].strip()
        return v or None

    def qualified_table_name(
        self, table: str, *, schema: str | None = None,
    ) -> str:
        """schema.table with each part quoted as needed (schema defaults to public)."""
        sch = schema or "public"
        return f"{self.quote_identifier(sch)}.{self.quote_identifier(table)}"

    def quote_identifier(self, name: str) -> str:
        """Double-quote *name* unless it is already a safe lower-case identifier.

        NOTE(review): reserved words that happen to be lower-case alnum
        (e.g. "user", "order") are left unquoted — confirm acceptable.
        """
        if name and name.islower() and name.replace("_", "").isalnum() and not name[0].isdigit():
            return name
        return '"' + name.replace('"', '""') + '"'

    def default_expression(self, type_raw: str, column_name: str) -> str:
        # PG doesn't pad char types and has honest NULLs — no shaping needed.
        return self.quote_identifier(column_name)

    def map_type(self, type_raw: str) -> str:
        """Dest type for a PG source type; numeric keeps its (precision,scale)."""
        base = _base(type_raw)
        mapped = _TYPE_MAP.get(base, "text")
        if mapped == "numeric" and "(" in type_raw:
            return "numeric" + type_raw[type_raw.index("("):]
        return mapped

    def build_create_table_sql(self, qualified_table: str,
                               columns: list[dict]) -> str:
        """CREATE TABLE IF NOT EXISTS DDL for the module's dest table.

        Raises ValueError on an empty column list or a missing dest_type.
        NOTE(review): dest_type is interpolated into the DDL unvalidated —
        it is user-editable in the wizard, so a malicious value could
        inject SQL; consider validating against a type whitelist.
        """
        if not columns:
            raise ValueError("no columns provided for CREATE TABLE")
        lines = []
        for c in columns:
            name = c["dest_name"]
            validate_identifier(name, "dest column name")
            dtype = (c.get("dest_type") or "text").strip()
            if not dtype:
                raise ValueError(f"column {name!r} has no dest_type")
            lines.append(f"    {self.quote_identifier(name)} {dtype}")
        body = ",\n".join(lines)
        return f"CREATE TABLE IF NOT EXISTS {qualified_table} (\n{body}\n);"
def _format_type(dtype: str, length: str, prec: str, scale: str) -> str:
base = dtype.lower()
if base in ("numeric", "decimal") and prec:
return f"{base}({prec},{scale or '0'})"
if base in ("character varying", "character") and length:
return f"{base}({length})"
return base

View File

@ -0,0 +1,3 @@
# Public engine surface: callers use `from pipekit.engine import run_module`.
from .runner import LockBusy, RunOutcome, run_module
__all__ = ["LockBusy", "RunOutcome", "run_module"]

47
pipekit/engine/merge.py Normal file
View File

@ -0,0 +1,47 @@
"""Build the SQL that merges staging → dest for one module.
Three strategies (from SPEC.md §"Merge strategies"):
* ``full`` TRUNCATE dest; INSERT from staging
* ``incremental`` DELETE rows in dest matching merge_key, then INSERT
* ``append`` INSERT only
Generated SQL targets PostgreSQL — the 95% destination in the user's
setup. Moving this into a dest-driver method is a one-line refactor when
a non-PG destination appears.
"""
from __future__ import annotations
class MergeError(ValueError):
pass
def build_merge_sql(*, strategy: str, dest_table: str, staging_table: str,
merge_key: str | None) -> str:
if strategy == "full":
return f"TRUNCATE TABLE {dest_table};\nINSERT INTO {dest_table} SELECT * FROM {staging_table};"
if strategy == "append":
return f"INSERT INTO {dest_table} SELECT * FROM {staging_table};"
if strategy == "incremental":
if not merge_key:
raise MergeError("incremental merge requires merge_key")
keys = [k.strip() for k in merge_key.split(",") if k.strip()]
if not keys:
raise MergeError(f"merge_key is empty after parsing: {merge_key!r}")
if len(keys) == 1:
k = keys[0]
delete = (f"DELETE FROM {dest_table} "
f"WHERE {k} IN (SELECT {k} FROM {staging_table});")
else:
tuple_cols = "(" + ", ".join(keys) + ")"
select_cols = ", ".join(keys)
delete = (f"DELETE FROM {dest_table} "
f"WHERE {tuple_cols} IN (SELECT {select_cols} FROM {staging_table});")
insert = f"INSERT INTO {dest_table} SELECT * FROM {staging_table};"
return delete + "\n" + insert
raise MergeError(f"unknown merge strategy: {strategy!r}")

168
pipekit/engine/runner.py Normal file
View File

@ -0,0 +1,168 @@
"""Orchestrate one module run, per SPEC.md §"Engine flow".
Steps:
1. acquire lock atomically (repo.acquire_module_lock)
2. resolve watermarks (watermark.resolve_watermarks)
3. materialise source query, persist preview (watermark.materialise + repo)
4. ensure staging table exists on dest (CREATE TABLE IF NOT EXISTS ... LIKE dest)
5. jrunner migrate source staging (jrunner.migrate clears staging internally)
6. build merge SQL (merge.build_merge_sql)
7. run merge SQL on dest (jrunner.run_dest_sql)
8. run hooks in order, honouring run_on (jrunner.run_dest_sql)
9. write run_log row (repo.finish_run)
10. release lock (always) (repo.release_module_lock)
"""
from __future__ import annotations
import os
import traceback
from dataclasses import dataclass
from .. import jrunner, repo
from . import merge, watermark
@dataclass
class RunOutcome:
    """Immutable summary of one module run, returned by run_module()."""
    run_id: int
    status: str  # success | error | cancelled
    row_count: int | None  # rows migrated into staging; None on dry run / early failure
    error: str | None  # "Type: message" plus traceback on failure, else None
    resolved_source_sql: str | None  # source query after watermark substitution
    merge_sql: str | None  # dest-side merge statement(s), if they were built
class LockBusy(RuntimeError):
    """Raised when a module is already running (this call lost the lock race)."""
def run_module(module_id: int, *, group_run_id: int | None = None,
               dry_run: bool = False, run_id: int | None = None) -> RunOutcome:
    """Run one module end-to-end. In dry-run mode, SQL is generated and
    stored on the run_log but no jrunner calls are made.

    If ``run_id`` is provided, that run_log row is reused — this lets
    async callers (the API) reserve a run_id before the run starts so
    they can return it to the client immediately.

    Raises ValueError for an unknown module and LockBusy when another
    process holds the module's run lock (the run row is finished as an
    error first). All other failures are captured into the returned
    RunOutcome rather than raised.
    """
    module = repo.get_module(module_id)
    if module is None:
        raise ValueError(f"module id={module_id} not found")
    if run_id is None:
        run_id = repo.create_run(module_id, group_run_id=group_run_id)
    # Owner string "pid:run_id" lets clear_stale_locks spot dead PIDs later.
    lock_owner = f"{os.getpid()}:{run_id}"
    if not repo.acquire_module_lock(module_id, lock_owner):
        repo.finish_run(run_id, status="error", error="already running")
        raise LockBusy(f"module {module['name']!r} is already running")
    resolved_sql: str | None = None
    merge_sql: str | None = None
    row_count: int | None = None
    # Pessimistic default: only the success paths flip this, so the finally
    # block records an error for any exit we didn't explicitly bless.
    status = "error"
    error: str | None = None
    try:
        source_conn = repo.get_connection(module["source_connection_id"])
        dest_conn = repo.get_connection(module["dest_connection_id"])
        if source_conn is None or dest_conn is None:
            raise ValueError("source or dest connection missing")
        # 2-3. watermarks + materialised source query (persisted for the UI
        # even if a later step fails).
        wm_values = watermark.resolve_watermarks(module, use_defaults_only=dry_run)
        resolved_sql = watermark.materialise(module["source_query"], wm_values)
        repo.set_next_resolved_query(module_id, resolved_sql)
        repo.log_run_sql(run_id, resolved_source_sql=resolved_sql,
                         watermark_values=wm_values)
        # 6. merge SQL (built now so it's visible on run_log even if migrate fails)
        merge_sql = merge.build_merge_sql(
            strategy=module["merge_strategy"],
            dest_table=module["dest_table"],
            staging_table=module["staging_table"],
            merge_key=module["merge_key"],
        )
        repo.log_run_sql(run_id, merge_sql=merge_sql)
        if dry_run:
            status = "success"
            return RunOutcome(run_id, status, None, None, resolved_sql, merge_sql)
        # 4. ensure staging table exists on dest. Mirror the real dest schema
        # so jrunner's auto-DELETE and the subsequent merge INSERT both find
        # a table to work on. Idempotent — no-op after first run.
        staging_schema, _, _ = module["staging_table"].partition(".")
        if staging_schema and staging_schema != module["staging_table"]:
            jrunner.run_dest_sql(
                dest_conn, f"CREATE SCHEMA IF NOT EXISTS {staging_schema};")
        jrunner.run_dest_sql(
            dest_conn,
            f"CREATE TABLE IF NOT EXISTS {module['staging_table']} "
            f"(LIKE {module['dest_table']} INCLUDING ALL);",
        )
        # 5. migrate source → staging. jrunner does its own `DELETE FROM staging`
        # before loading, so we don't need a separate TRUNCATE.
        migrate_result = jrunner.migrate(
            source_conn=source_conn, dest_conn=dest_conn,
            sql=resolved_sql, dest_table=module["staging_table"],
            clear=False,
        )
        row_count = migrate_result.row_count
        repo.log_run_output(run_id, jrunner_stdout=migrate_result.stdout,
                            jrunner_stderr=migrate_result.stderr)
        # 7. merge
        jrunner.run_dest_sql(dest_conn, merge_sql)
        # 8. hooks (success path so far)
        hook_log = _run_hooks(module_id, fail_fast=True, run_on_set={"success", "always"})
        if hook_log:
            repo.log_run_output(run_id, hook_log=hook_log)
        status = "success"
        return RunOutcome(run_id, status, row_count, None, resolved_sql, merge_sql)
    except Exception as e:  # noqa: BLE001
        error = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
        # Failure-path hooks, if any. Never let these mask the real error.
        try:
            hook_log = _run_hooks(module_id, fail_fast=False,
                                  run_on_set={"failure", "always"})
            if hook_log:
                repo.log_run_output(run_id, hook_log=hook_log)
        except Exception:  # noqa: BLE001, S110
            pass
        return RunOutcome(run_id, "error", row_count, error, resolved_sql, merge_sql)
    finally:
        # 9-10. always record the outcome and free the lock, success or not.
        repo.finish_run(run_id, status=status, row_count=row_count, error=error)
        repo.release_module_lock(module_id)
def _run_hooks(module_id: int, *, fail_fast: bool, run_on_set: set[str]) -> str:
    """Execute the module's hooks whose ``run_on`` is in *run_on_set*.

    Returns a text log of what ran. With fail_fast, the first missing
    connection or SQL error is raised; otherwise errors are logged and
    the remaining hooks still run.
    """
    selected = [h for h in repo.list_hooks(module_id) if h["run_on"] in run_on_set]
    log: list[str] = []
    for hook in selected:
        conn = repo.get_connection(hook["connection_id"]) if hook["connection_id"] else None
        target = conn["name"] if conn else f"connection id={hook['connection_id']}"
        log.append(f"-- hook run_order={hook['run_order']} on={hook['run_on']} target={target}")
        if conn is None:
            log.append(" SKIP: connection not found")
            if fail_fast:
                raise RuntimeError(f"hook connection {hook['connection_id']} not found")
            continue
        try:
            jrunner.run_dest_sql(conn, hook["sql"])
        except Exception as exc:  # noqa: BLE001
            log.append(f" ERROR: {exc}")
            if fail_fast:
                raise
        else:
            log.append(" OK")
    return "\n".join(log)

View File

@ -0,0 +1,53 @@
"""Resolve a module's watermarks and substitute them into its source query.
One resolver = one query run via jrunner query mode against the
watermark's connection (often dest, sometimes source, occasionally a
third). The first row's first column is used as an opaque string; the
user controls quoting inside the resolver SQL itself (see SPEC.md
§"Watermarks — type-agnostic"). NULL/empty falls back to ``default_value``.
"""
from __future__ import annotations
from .. import jrunner, repo
def resolve_watermarks(module: dict, *, use_defaults_only: bool = False) -> dict[str, str]:
    """Return ``{watermark_name: resolved_value}`` for every watermark on the module.

    ``use_defaults_only`` is the dry-run shortcut: skip jrunner entirely
    and hand back each watermark's ``default_value``, so the resolved
    query can be previewed without touching any database.
    """
    resolved: dict[str, str] = {}
    for wm in repo.list_watermarks(module["id"]):
        name = wm["name"]
        if use_defaults_only:
            resolved[name] = wm["default_value"] or ""
            continue
        conn = repo.get_connection(wm["connection_id"])
        if conn is None:
            raise WatermarkError(
                f"watermark {name!r}: connection id={wm['connection_id']} not found")
        try:
            result = jrunner.query(conn["jdbc_url"], conn.get("username"),
                                   conn.get("password"), wm["resolver_sql"])
        except jrunner.JrunnerError as e:
            raise WatermarkError(
                f"watermark {name!r} resolver failed: {e}") from e
        first = result.first_value()
        # NULL or empty resolver result falls back to the default.
        resolved[name] = first if first else (wm["default_value"] or "")
    return resolved
def materialise(source_query: str, values: dict[str, str]) -> str:
    """Substitute ``{name}`` placeholders in the query with resolved values.

    Substitution is a single left-to-right pass: a resolved value that
    itself contains ``{other}`` is NOT re-substituted. (The previous
    sequential str.replace loop would re-expand such values, letting one
    watermark's data leak into another's placeholder.) Placeholders with
    no entry in *values* are left untouched, as before.
    """
    import re
    if not values:
        return source_query
    # Alternation of the exact literal placeholders; escaped so names with
    # regex metacharacters stay literal.
    pattern = re.compile("|".join(re.escape("{" + name + "}") for name in values))
    return pattern.sub(lambda m: values[m.group(0)[1:-1]], source_query)
class WatermarkError(RuntimeError):
    """A watermark could not be resolved (missing connection or resolver failure)."""
    pass

209
pipekit/jrunner.py Normal file
View File

@ -0,0 +1,209 @@
"""Thin wrapper around the `jrunner` Java CLI.
Pipekit uses jrunner for two things:
* **migration mode** bulk streaming from source to dest (handled by the
engine; not in this file yet).
* **query mode** single-result queries for watermark resolvers and for
wizard introspection. Implemented here via :func:`query`.
Passwords are stored as env-var references (e.g. `"$DB2PW"`) per spec;
:func:`resolve_password` expands them at call time so secrets never land in
the database. (Note: the expanded value is still passed to jrunner on argv.)
"""
from __future__ import annotations
import csv
import io
import os
import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from pathlib import Path
from .config import get_config
@dataclass
class QueryResult:
columns: list[str]
rows: list[list[str]]
stdout: str
stderr: str
def first_value(self) -> str | None:
if not self.rows or not self.rows[0]:
return None
return self.rows[0][0]
@dataclass
class MigrateResult:
    """Outcome of one jrunner migration-mode invocation."""
    row_count: int | None  # parsed out of stdout/stderr; None if no count was printed
    stdout: str
    stderr: str
def resolve_password(raw: str | None) -> str:
if not raw:
return ""
if raw.startswith("$"):
return os.environ.get(raw[1:], "")
return raw
# Force the JVM (and jt400 specifically) into non-interactive mode. Without
# this, jt400 pops up an AWT signon dialog when the password is empty/wrong
# — which crashes with HeadlessException on a server.
# Merged into JAVA_TOOL_OPTIONS by _subprocess_env() for every jrunner call.
_HEADLESS_JAVA_OPTS = (
    "-Djava.awt.headless=true "
    "-Dcom.ibm.as400.access.AS400.guiAvailable=false"
)
def _subprocess_env() -> dict:
    """Copy of os.environ with the headless JVM flags merged into JAVA_TOOL_OPTIONS."""
    env = dict(os.environ)
    current = env.get("JAVA_TOOL_OPTIONS", "").strip()
    parts = [current, _HEADLESS_JAVA_OPTS] if current else [_HEADLESS_JAVA_OPTS]
    env["JAVA_TOOL_OPTIONS"] = " ".join(parts)
    return env
def jrunner_path() -> Path:
    """Configured filesystem path of the jrunner CLI binary."""
    return get_config().jrunner_path
def version() -> tuple[bool, str]:
    """Probe the jrunner binary; return (ok, message) for `pipekit doctor`."""
    path = jrunner_path()
    if not shutil.which(str(path)) and not path.exists():
        return False, f"jrunner not found at {path} (see /opt/jrunner/deploy.sh)"
    try:
        proc = subprocess.run([str(path), "--help"], capture_output=True,
                              text=True, timeout=10)
        output = proc.stdout or proc.stderr
        banner = output.splitlines()[0] if output else ""
        if "jrunner" in banner.lower():
            return True, banner.strip()
        return True, f"found at {path}"
    except Exception as e:  # doctor boundary: report, never raise
        return False, f"{type(e).__name__}: {e}"
def query(
    jdbc_url: str,
    username: str | None,
    password: str | None,
    sql: str,
    *,
    timeout: int = 60,
    trim: bool = True,
) -> QueryResult:
    """Run `sql` in jrunner query mode and parse CSV output.

    The SQL is written to a temp file (jrunner takes a file path, not
    inline SQL) which is always removed afterwards. Raises JrunnerError
    on a non-zero exit; subprocess.TimeoutExpired propagates.

    NOTE(review): the resolved password goes on argv (-scp), visible to
    local `ps`; the env-var indirection protects the database, not the
    process list. Confirm whether jrunner has a safer password channel.
    """
    path = jrunner_path()
    pw = resolve_password(password)
    with tempfile.NamedTemporaryFile("w", suffix=".sql", delete=False) as f:
        f.write(sql)
        sql_path = f.name
    try:
        argv = [str(path),
                "-scu", jdbc_url,
                "-scn", username or "",
                "-scp", pw,
                "-sq", sql_path,
                "-f", "csv"]
        if trim:
            # Position is arbitrary for flags; insert after the program name.
            argv.insert(1, "-t")
        r = subprocess.run(argv, capture_output=True, text=True,
                           timeout=timeout, env=_subprocess_env())
    finally:
        os.unlink(sql_path)
    if r.returncode != 0:
        raise JrunnerError(r.stderr.strip() or r.stdout.strip(),
                           stdout=r.stdout, stderr=r.stderr)
    # First CSV line is the header; drop completely empty rows.
    reader = csv.reader(io.StringIO(r.stdout))
    header = next(reader, [])
    rows = [row for row in reader if row]
    return QueryResult(columns=header, rows=rows, stdout=r.stdout, stderr=r.stderr)
def migrate(
    source_conn: dict,
    dest_conn: dict,
    sql: str,
    dest_table: str,
    *,
    clear: bool = False,
    trim: bool = True,
    timeout: int = 3600,
) -> MigrateResult:
    """Stream `sql` results from source into `dest_table` via jrunner migration mode.

    *source_conn*/*dest_conn* are connection rows (jdbc_url/username/password
    keys); passwords are env-expanded just before the call. Raises
    JrunnerError on a non-zero exit. The row count is best-effort parsed
    from jrunner's combined output and may be None.
    """
    path = jrunner_path()
    # jrunner reads the source query from a file, not argv.
    with tempfile.NamedTemporaryFile("w", suffix=".sql", delete=False) as f:
        f.write(sql)
        sql_path = f.name
    try:
        argv = [str(path),
                "-scu", source_conn["jdbc_url"],
                "-scn", source_conn.get("username") or "",
                "-scp", resolve_password(source_conn.get("password")),
                "-dcu", dest_conn["jdbc_url"],
                "-dcn", dest_conn.get("username") or "",
                "-dcp", resolve_password(dest_conn.get("password")),
                "-sq", sql_path,
                "-dt", dest_table]
        if trim:
            argv.append("-t")
        if clear:
            argv.append("-c")
        r = subprocess.run(argv, capture_output=True, text=True,
                           timeout=timeout, env=_subprocess_env())
    finally:
        # Always remove the temp SQL file, even on timeout/failure.
        os.unlink(sql_path)
    if r.returncode != 0:
        raise JrunnerError(r.stderr.strip() or r.stdout.strip(),
                           stdout=r.stdout, stderr=r.stderr)
    return MigrateResult(
        row_count=_parse_row_count(r.stdout + "\n" + r.stderr),
        stdout=r.stdout, stderr=r.stderr,
    )
def run_dest_sql(conn: dict, sql: str, *, timeout: int = 600) -> QueryResult:
    """Execute arbitrary SQL (DDL/DML/SELECT) on *conn* via jrunner query mode.

    Used for merge SQL, staging DDL, hooks, etc. Trimming is disabled so
    any result text comes back untouched.
    """
    return query(
        conn["jdbc_url"],
        conn.get("username"),
        conn.get("password"),
        sql,
        timeout=timeout,
        trim=False,
    )
_ROW_COUNT_PATTERNS = (
re.compile(r"(\d+)\s+rows?\s+(?:inserted|transferred|migrated|written)", re.I),
re.compile(r"inserted\s+(\d+)\s+rows?", re.I),
re.compile(r"rows?:\s*(\d+)", re.I),
)
def _parse_row_count(text: str) -> int | None:
for pat in _ROW_COUNT_PATTERNS:
m = pat.search(text)
if m:
try:
return int(m.group(1))
except ValueError:
pass
return None
class JrunnerError(RuntimeError):
    """jrunner exited non-zero; carries its captured output for diagnostics."""

    def __init__(self, message: str, *, stdout: str = "", stderr: str = ""):
        super().__init__(message)
        # Full captured streams, for run_log and error surfaces.
        self.stdout = stdout
        self.stderr = stderr

435
pipekit/repo.py Normal file
View File

@ -0,0 +1,435 @@
"""Repository — every piece of SQL against pipekit.db lives here.
Keeping all reads/writes in one module means the engine, API, and TUI
share one mental model of the data. Helpers are thin; they return plain
dicts (from ``sqlite3.Row``) so callers never have to think about the
database layer.
"""
from __future__ import annotations
import json
from typing import Any
from . import db
def _row(r) -> dict | None:
return dict(r) if r else None
# ---------------------------------------------------------------------------
# Driver rows (the registered JDBC drivers — jar + class + kind)
# ---------------------------------------------------------------------------
def create_driver(*, name: str, kind: str, jar_file: str, class_name: str,
                  url_template: str | None = None) -> dict:
    """Insert one driver row and return it as a dict."""
    with db.connect() as con:
        inserted = con.execute(
            "INSERT INTO driver (name, kind, jar_file, class_name, url_template) "
            "VALUES (?, ?, ?, ?, ?)",
            (name, kind, jar_file, class_name, url_template),
        )
        fresh = con.execute("SELECT * FROM driver WHERE id=?",
                            (inserted.lastrowid,)).fetchone()
        return _row(fresh)
def list_drivers() -> list[dict]:
    """All driver rows, ordered by name."""
    with db.connect() as con:
        rows = con.execute("SELECT * FROM driver ORDER BY name")
        return [dict(r) for r in rows]
def get_driver_row(driver_id: int) -> dict | None:
    """One driver row by id, or None."""
    with db.connect() as con:
        found = con.execute("SELECT * FROM driver WHERE id=?",
                            (driver_id,)).fetchone()
        return _row(found)
# ---------------------------------------------------------------------------
# Connections
# ---------------------------------------------------------------------------
def create_connection(*, name: str, driver_id: int, jdbc_url: str,
                      username: str | None = None, password: str | None = None,
                      default_dest_connection_id: int | None = None,
                      default_dest_schema: str | None = None,
                      notes: str | None = None) -> dict:
    """Insert one connection row and return it as a dict.

    `password` is stored as given — by convention an env-var reference
    like "$DB2PW", expanded only at jrunner call time.
    """
    with db.connect() as con:
        inserted = con.execute(
            "INSERT INTO connection (name, driver_id, jdbc_url, username, password, "
            "default_dest_connection_id, default_dest_schema, notes) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (name, driver_id, jdbc_url, username, password,
             default_dest_connection_id, default_dest_schema, notes),
        )
        fresh = con.execute("SELECT * FROM connection WHERE id=?",
                            (inserted.lastrowid,)).fetchone()
        return _row(fresh)
def get_connection(connection_id: int) -> dict | None:
    """One connection row by id, or None."""
    with db.connect() as con:
        found = con.execute("SELECT * FROM connection WHERE id=?",
                            (connection_id,)).fetchone()
        return _row(found)
def get_connection_by_name(name: str) -> dict | None:
    """One connection row by unique name, or None."""
    with db.connect() as con:
        found = con.execute("SELECT * FROM connection WHERE name=?",
                            (name,)).fetchone()
        return _row(found)
def list_connections() -> list[dict]:
    """All connection rows, ordered by name."""
    with db.connect() as con:
        rows = con.execute("SELECT * FROM connection ORDER BY name")
        return [dict(r) for r in rows]
def update_connection(connection_id: int, *, name: str | None = None,
                      driver_id: int | None = None, jdbc_url: str | None = None,
                      username: str | None = None, password: str | None = None,
                      default_dest_connection_id: int | None = None,
                      default_dest_schema: str | None = None,
                      notes: str | None = None) -> dict | None:
    """Patch the given columns and return the fresh row (None = row missing).

    None arguments mean "leave unchanged" — NOTE(review): that also means
    a column cannot be set back to NULL through this helper.
    """
    candidates = {
        "name": name, "driver_id": driver_id, "jdbc_url": jdbc_url,
        "username": username, "password": password,
        "default_dest_connection_id": default_dest_connection_id,
        "default_dest_schema": default_dest_schema, "notes": notes,
    }
    changes = {col: val for col, val in candidates.items() if val is not None}
    if not changes:
        return get_connection(connection_id)
    assignments = [f"{col}=?" for col in changes] + ["updated_at=datetime('now')"]
    params = [*changes.values(), connection_id]
    with db.connect() as con:
        con.execute(f"UPDATE connection SET {', '.join(assignments)} WHERE id=?",
                    params)
    return get_connection(connection_id)
class ConnectionInUse(RuntimeError):
    """Raised by delete_connection when modules still reference it."""
def delete_connection(connection_id: int) -> bool:
    """Delete a connection; True if a row was removed.

    Raises ConnectionInUse when any module (source/dest), connection
    default-dest, watermark, or hook still references it.
    """
    ref_columns = (("module", "source_connection_id"),
                   ("module", "dest_connection_id"),
                   ("connection", "default_dest_connection_id"),
                   ("watermark", "connection_id"),
                   ("hook", "connection_id"))
    with db.connect() as con:
        in_use: list[str] = []
        for table, col in ref_columns:
            # Table/column names come from the fixed tuple above, not input.
            count = con.execute(
                f"SELECT COUNT(*) FROM {table} WHERE {col}=?",
                (connection_id,),
            ).fetchone()[0]
            if count:
                in_use.append(f"{table}.{col} ({count})")
        if in_use:
            raise ConnectionInUse(
                f"connection id={connection_id} still referenced: {', '.join(in_use)}")
        deleted = con.execute("DELETE FROM connection WHERE id=?",
                              (connection_id,))
        return deleted.rowcount > 0
# ---------------------------------------------------------------------------
# Modules
# ---------------------------------------------------------------------------
def create_module(*, name: str, source_connection_id: int,
                  dest_connection_id: int, dest_table: str, source_query: str,
                  merge_strategy: str = "full", merge_key: str | None = None,
                  staging_table: str | None = None,
                  columns: list[dict] | None = None,
                  dest_description: str | None = None) -> dict:
    """Insert one module row and return it as a dict.

    Defaults the staging table to pipekit_staging.<name>; the column spec
    is stored as JSON.
    """
    staging = staging_table or f"pipekit_staging.{name}"
    cols_json = json.dumps(columns) if columns else None
    with db.connect() as con:
        inserted = con.execute(
            "INSERT INTO module (name, source_connection_id, dest_connection_id, "
            "dest_table, staging_table, source_query, merge_strategy, merge_key, "
            "columns_json, dest_description) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (name, source_connection_id, dest_connection_id, dest_table,
             staging, source_query, merge_strategy, merge_key, cols_json,
             dest_description),
        )
        fresh = con.execute("SELECT * FROM module WHERE id=?",
                            (inserted.lastrowid,)).fetchone()
        return _row(fresh)
def get_module(module_id: int) -> dict | None:
    """One module row by id, or None."""
    with db.connect() as con:
        found = con.execute("SELECT * FROM module WHERE id=?",
                            (module_id,)).fetchone()
        return _row(found)
def get_module_by_name(name: str) -> dict | None:
    """One module row by unique name, or None."""
    with db.connect() as con:
        found = con.execute("SELECT * FROM module WHERE name=?",
                            (name,)).fetchone()
        return _row(found)
def list_modules() -> list[dict]:
    """All module rows, ordered by name."""
    with db.connect() as con:
        rows = con.execute("SELECT * FROM module ORDER BY name")
        return [dict(r) for r in rows]
def set_next_resolved_query(module_id: int, sql: str) -> None:
    """Persist the most recently materialised source query for UI preview."""
    with db.connect() as con:
        con.execute(
            "UPDATE module SET next_resolved_query=?, "
            "updated_at=datetime('now') WHERE id=?",
            (sql, module_id),
        )
# ---------------------------------------------------------------------------
# Watermarks
# ---------------------------------------------------------------------------
def create_watermark(*, module_id: int, name: str, connection_id: int,
                     resolver_sql: str, default_value: str | None = None) -> dict:
    """Insert one watermark row for a module and return it as a dict."""
    with db.connect() as con:
        inserted = con.execute(
            "INSERT INTO watermark (module_id, name, connection_id, resolver_sql, "
            "default_value) VALUES (?, ?, ?, ?, ?)",
            (module_id, name, connection_id, resolver_sql, default_value),
        )
        fresh = con.execute("SELECT * FROM watermark WHERE id=?",
                            (inserted.lastrowid,)).fetchone()
        return _row(fresh)
def list_watermarks(module_id: int) -> list[dict]:
    """All watermarks of one module, ordered by name."""
    with db.connect() as con:
        rows = con.execute(
            "SELECT * FROM watermark WHERE module_id=? ORDER BY name",
            (module_id,))
        return [dict(r) for r in rows]
def get_watermark(watermark_id: int) -> dict | None:
    """One watermark row by id, or None."""
    with db.connect() as con:
        found = con.execute("SELECT * FROM watermark WHERE id=?",
                            (watermark_id,)).fetchone()
        return _row(found)
def update_watermark(watermark_id: int, *, name: str | None = None,
                     connection_id: int | None = None,
                     resolver_sql: str | None = None,
                     default_value: str | None = None) -> dict | None:
    """Patch the given watermark columns (None = unchanged); return the row."""
    changes = {col: val for col, val in (
        ("name", name), ("connection_id", connection_id),
        ("resolver_sql", resolver_sql), ("default_value", default_value),
    ) if val is not None}
    if changes:
        assignments = ", ".join(f"{col}=?" for col in changes)
        with db.connect() as con:
            con.execute(f"UPDATE watermark SET {assignments} WHERE id=?",
                        [*changes.values(), watermark_id])
    return get_watermark(watermark_id)
def delete_watermark(watermark_id: int) -> bool:
    """Delete one watermark; True if a row was removed."""
    with db.connect() as con:
        deleted = con.execute("DELETE FROM watermark WHERE id=?",
                              (watermark_id,))
        return deleted.rowcount > 0
# ---------------------------------------------------------------------------
# Hooks
# ---------------------------------------------------------------------------
def create_hook(*, module_id: int, sql: str, run_order: int = 0,
                connection_id: int | None = None,
                run_on: str = "success") -> dict:
    """Insert one hook row for a module and return it as a dict."""
    with db.connect() as con:
        inserted = con.execute(
            "INSERT INTO hook (module_id, run_order, connection_id, sql, run_on) "
            "VALUES (?, ?, ?, ?, ?)",
            (module_id, run_order, connection_id, sql, run_on),
        )
        fresh = con.execute("SELECT * FROM hook WHERE id=?",
                            (inserted.lastrowid,)).fetchone()
        return _row(fresh)
def list_hooks(module_id: int) -> list[dict]:
    """All hooks of one module, in run_order."""
    with db.connect() as con:
        rows = con.execute(
            "SELECT * FROM hook WHERE module_id=? ORDER BY run_order",
            (module_id,))
        return [dict(r) for r in rows]
def get_hook(hook_id: int) -> dict | None:
    """One hook row by id, or None."""
    with db.connect() as con:
        found = con.execute("SELECT * FROM hook WHERE id=?",
                            (hook_id,)).fetchone()
        return _row(found)
def update_hook(hook_id: int, *, run_order: int | None = None,
                connection_id: int | None = None, sql: str | None = None,
                run_on: str | None = None) -> dict | None:
    """Patch the given hook columns (None = unchanged); return the row."""
    changes = {col: val for col, val in (
        ("run_order", run_order), ("connection_id", connection_id),
        ("sql", sql), ("run_on", run_on),
    ) if val is not None}
    if changes:
        assignments = ", ".join(f"{col}=?" for col in changes)
        with db.connect() as con:
            con.execute(f"UPDATE hook SET {assignments} WHERE id=?",
                        [*changes.values(), hook_id])
    return get_hook(hook_id)
def delete_hook(hook_id: int) -> bool:
    """Remove a hook row; True when something was actually deleted."""
    with db.connect() as c:
        deleted = c.execute(
            "DELETE FROM hook WHERE id=?", (hook_id,)).rowcount
    return deleted > 0
# ---------------------------------------------------------------------------
# Locking
# ---------------------------------------------------------------------------
def acquire_module_lock(module_id: int, pid: str) -> bool:
    """Try to claim the per-module run lock.

    A single UPDATE guarded by ``running=0`` makes the claim atomic:
    exactly one concurrent caller flips the flag and sees rowcount == 1.
    """
    claim = ("UPDATE module SET running=1, running_pid=?, "
             "running_since=datetime('now') "
             "WHERE id=? AND running=0")
    with db.connect() as c:
        won = c.execute(claim, (pid, module_id)).rowcount
    return won > 0
def release_module_lock(module_id: int) -> None:
    """Unconditionally drop the run lock for a module."""
    release = ("UPDATE module SET running=0, running_pid=NULL, "
               "running_since=NULL WHERE id=?")
    with db.connect() as c:
        c.execute(release, (module_id,))
def clear_stale_locks(max_age_hours: int = 24, live_pids: set[int] | None = None) -> int:
    """Release locks older than max_age_hours OR held by a dead PID.

    PID-based cleanup requires the caller to pass the current set of live
    PIDs — the repository has no business querying /proc.

    Returns the number of locks released (age-based plus PID-based).
    """
    cleared = 0
    with db.connect() as c:
        # Phase 1: age-based — anything locked longer than max_age_hours.
        cur = c.execute(
            "UPDATE module SET running=0, running_pid=NULL, running_since=NULL "
            "WHERE running=1 AND running_since < datetime('now', ?)",
            (f"-{max_age_hours} hours",),
        )
        cleared += cur.rowcount
        # Phase 2 (optional): PID-based — runs after phase 1, so modules
        # already released above are no longer selected here.
        if live_pids is not None:
            locked = [dict(r) for r in c.execute(
                "SELECT id, running_pid FROM module WHERE running=1 AND running_pid IS NOT NULL")]
            dead_ids = []
            for row in locked:
                # running_pid may carry a suffix after ":"; the leading part
                # is the numeric PID.
                pid_str = (row["running_pid"] or "").split(":", 1)[0]
                try:
                    if int(pid_str) not in live_pids:
                        dead_ids.append(row["id"])
                except ValueError:
                    # Unparseable PID — treat the lock as dead.
                    dead_ids.append(row["id"])
            for mid in dead_ids:
                c.execute("UPDATE module SET running=0, running_pid=NULL, "
                          "running_since=NULL WHERE id=?", (mid,))
                cleared += 1
    return cleared
# ---------------------------------------------------------------------------
# Run log
# ---------------------------------------------------------------------------
def create_run(module_id: int, *, group_run_id: int | None = None) -> int:
    """Open a new run_log row and return its id."""
    insert = "INSERT INTO run_log (module_id, group_run_id) VALUES (?, ?)"
    with db.connect() as c:
        new_id = c.execute(insert, (module_id, group_run_id)).lastrowid
    return int(new_id)
def log_run_sql(run_id: int, *, resolved_source_sql: str | None = None,
                merge_sql: str | None = None,
                watermark_values: dict[str, Any] | None = None) -> None:
    """Record the SQL artefacts of a run; None arguments are left untouched."""
    updates: dict[str, Any] = {}
    if resolved_source_sql is not None:
        updates["resolved_source_sql"] = resolved_source_sql
    if merge_sql is not None:
        updates["merge_sql"] = merge_sql
    if watermark_values is not None:
        updates["watermark_values_json"] = json.dumps(watermark_values)
    if not updates:
        return
    assignments = ", ".join(f"{col}=?" for col in updates)
    with db.connect() as c:
        c.execute(f"UPDATE run_log SET {assignments} WHERE id=?",
                  [*updates.values(), run_id])
def log_run_output(run_id: int, *, jrunner_stdout: str | None = None,
                   jrunner_stderr: str | None = None,
                   hook_log: str | None = None) -> None:
    """Attach captured process/hook output to a run; None args are skipped."""
    updates: dict[str, str] = {}
    if jrunner_stdout is not None:
        updates["jrunner_stdout"] = jrunner_stdout
    if jrunner_stderr is not None:
        updates["jrunner_stderr"] = jrunner_stderr
    if hook_log is not None:
        updates["hook_log"] = hook_log
    if not updates:
        return
    assignments = ", ".join(f"{col}=?" for col in updates)
    with db.connect() as c:
        c.execute(f"UPDATE run_log SET {assignments} WHERE id=?",
                  [*updates.values(), run_id])
def finish_run(run_id: int, *, status: str, row_count: int | None = None,
               error: str | None = None) -> None:
    """Stamp finished_at and close out a run_log row with its final status."""
    finalise = ("UPDATE run_log SET finished_at=datetime('now'), status=?, "
                "row_count=?, error=? WHERE id=?")
    with db.connect() as c:
        c.execute(finalise, (status, row_count, error, run_id))
def get_run(run_id: int) -> dict | None:
    """Fetch one run_log row by id, or None when absent."""
    with db.connect() as c:
        row = c.execute("SELECT * FROM run_log WHERE id=?", (run_id,)).fetchone()
    return _row(row)
def get_setting(key: str) -> str | None:
    """Look up a settings value; None when the key is absent."""
    with db.connect() as c:
        row = c.execute("SELECT value FROM settings WHERE key=?", (key,)).fetchone()
    if row is None:
        return None
    return row["value"]
def set_setting(key: str, value: str) -> None:
    """Upsert a settings key (SQLite ON CONFLICT ... DO UPDATE)."""
    upsert = ("INSERT INTO settings (key, value) VALUES (?, ?) "
              "ON CONFLICT(key) DO UPDATE SET value=excluded.value")
    with db.connect() as c:
        c.execute(upsert, (key, value))
def list_runs(*, module_id: int | None = None, status: str | None = None,
              limit: int = 50) -> list[dict]:
    """Most recent runs first, optionally filtered by module and/or status.

    Each row carries the joined module name as ``module_name``.
    """
    conditions: list[str] = []
    params: list = []
    if module_id is not None:
        conditions.append("r.module_id=?")
        params.append(module_id)
    if status is not None:
        conditions.append("r.status=?")
        params.append(status)
    where_sql = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    query = (
        "SELECT r.*, m.name AS module_name FROM run_log r "
        "LEFT JOIN module m ON r.module_id=m.id "
        f"{where_sql} ORDER BY r.id DESC LIMIT ?"
    )
    with db.connect() as c:
        return [dict(r) for r in c.execute(query, [*params, limit])]

121
pipekit/schema.sql Normal file
View File

@ -0,0 +1,121 @@
-- Pipekit schema. Single source of truth — read by pipekit.db.init_db().
-- See SPEC.md sections: "Module model", "Run log / observability",
-- "Groups and scheduling", "Connections and credentials".
-- Registered JDBC drivers; `kind` selects the Python adapter class.
CREATE TABLE IF NOT EXISTS driver (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL UNIQUE,
    kind TEXT NOT NULL, -- db2 | mssql | pg | ... (picks the Driver class)
    jar_file TEXT NOT NULL,
    class_name TEXT NOT NULL,
    url_template TEXT,
    created_at TEXT DEFAULT (datetime('now'))
);
-- A named database endpoint. default_dest_* pre-fill the wizard.
CREATE TABLE IF NOT EXISTS connection (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL UNIQUE,
    driver_id INTEGER NOT NULL REFERENCES driver(id),
    jdbc_url TEXT NOT NULL,
    username TEXT,
    password TEXT, -- env-var reference, e.g. "$DB2PW"
    default_dest_connection_id INTEGER REFERENCES connection(id),
    default_dest_schema TEXT,
    notes TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now'))
);
-- One source->dest sync unit. running/running_pid/running_since form the
-- run lock claimed atomically by db.acquire_module_lock().
CREATE TABLE IF NOT EXISTS module (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL UNIQUE,
    source_connection_id INTEGER NOT NULL REFERENCES connection(id),
    dest_connection_id INTEGER NOT NULL REFERENCES connection(id),
    dest_table TEXT NOT NULL,
    staging_table TEXT NOT NULL, -- pipekit_staging.{name}
    source_query TEXT NOT NULL, -- free text with {watermark} placeholders
    merge_strategy TEXT NOT NULL DEFAULT 'full' CHECK (merge_strategy IN ('full','incremental','append')),
    merge_key TEXT,
    enabled INTEGER NOT NULL DEFAULT 1,
    running INTEGER NOT NULL DEFAULT 0,
    running_pid TEXT,
    running_since TEXT,
    next_resolved_query TEXT, -- materialised before each run for TUI preview
    columns_json TEXT, -- [{source_name, source_type, dest_name, dest_type, description}, ...]
    dest_description TEXT, -- COMMENT ON TABLE value, also shown in the UI
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now'))
);
-- Named {watermark} resolvers for a module's source_query.
CREATE TABLE IF NOT EXISTS watermark (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    module_id INTEGER NOT NULL REFERENCES module(id) ON DELETE CASCADE,
    name TEXT NOT NULL,
    connection_id INTEGER NOT NULL REFERENCES connection(id),
    resolver_sql TEXT NOT NULL,
    default_value TEXT,
    UNIQUE(module_id, name)
);
-- SQL run after a module finishes, gated by run_on.
CREATE TABLE IF NOT EXISTS hook (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    module_id INTEGER NOT NULL REFERENCES module(id) ON DELETE CASCADE,
    run_order INTEGER NOT NULL DEFAULT 0,
    connection_id INTEGER REFERENCES connection(id),
    sql TEXT NOT NULL,
    run_on TEXT NOT NULL DEFAULT 'success' CHECK (run_on IN ('success','failure','always'))
);
-- "grp" because GROUP is a reserved word in SQL.
CREATE TABLE IF NOT EXISTS grp (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS group_member (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    group_id INTEGER NOT NULL REFERENCES grp(id) ON DELETE CASCADE,
    module_id INTEGER NOT NULL REFERENCES module(id) ON DELETE CASCADE,
    run_order INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS schedule (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    group_id INTEGER NOT NULL REFERENCES grp(id) ON DELETE CASCADE,
    cron_expr TEXT NOT NULL,
    enabled INTEGER NOT NULL DEFAULT 1
);
-- One execution of a whole group; member runs link back via
-- run_log.group_run_id.
CREATE TABLE IF NOT EXISTS group_run (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    group_id INTEGER NOT NULL REFERENCES grp(id),
    started_at TEXT DEFAULT (datetime('now')),
    finished_at TEXT,
    status TEXT NOT NULL DEFAULT 'running' CHECK (status IN ('running','success','error','cancelled')),
    triggered_by TEXT -- schedule | manual | null
);
-- One execution of a module, with full SQL + process output captured.
CREATE TABLE IF NOT EXISTS run_log (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    module_id INTEGER NOT NULL REFERENCES module(id),
    group_run_id INTEGER REFERENCES group_run(id),
    started_at TEXT DEFAULT (datetime('now')),
    finished_at TEXT,
    row_count INTEGER,
    status TEXT NOT NULL DEFAULT 'running' CHECK (status IN ('running','success','error','cancelled')),
    error TEXT,
    resolved_source_sql TEXT,
    merge_sql TEXT,
    watermark_values_json TEXT,
    jrunner_stdout TEXT,
    jrunner_stderr TEXT,
    hook_log TEXT
);
CREATE INDEX IF NOT EXISTS idx_run_log_module ON run_log(module_id, id DESC);
CREATE INDEX IF NOT EXISTS idx_run_log_status ON run_log(status, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_run_log_group_run ON run_log(group_run_id);
-- Simple key/value store (see db.get_setting / db.set_setting).
CREATE TABLE IF NOT EXISTS settings (
    key TEXT PRIMARY KEY,
    value TEXT
);

3
pipekit/web/__init__.py Normal file
View File

@ -0,0 +1,3 @@
from .app import mount_web
__all__ = ["mount_web"]

681
pipekit/web/app.py Normal file
View File

@ -0,0 +1,681 @@
"""HTML page handlers — the web frontend to Pipekit.
Mounted onto the FastAPI app by :func:`mount_web`. Pages live at
``/``, ``/modules/{id}``, ``/connections``, ``/runs``, ``/runs/{id}``.
JSON API stays at ``/api/*``.
Follows the UI design bar recorded in memory/feedback_tui_design.md:
bordered panels, structured layouts, pickers over free text. First
increment is read-heavy (pages render state + a Run button). The
wizard, editors, and SSE-driven live run watch come next.
"""
from __future__ import annotations
from pathlib import Path
from fastapi import APIRouter, FastAPI, HTTPException, Query, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from .. import __version__, drivers, engine, jrunner, repo
from ..config import get_config
from ..engine import watermark
from ..engine.merge import MergeError, build_merge_sql
_WEB_DIR = Path(__file__).parent
_templates = Jinja2Templates(directory=_WEB_DIR / "templates")
def mount_web(app: FastAPI) -> None:
    """Attach HTML pages + /static onto a FastAPI app.

    Static assets are mounted first, then the page router (which is
    created with include_in_schema=False, so it stays out of OpenAPI).
    """
    app.mount("/static", StaticFiles(directory=_WEB_DIR / "static"), name="static")
    app.include_router(_router)
_router = APIRouter(include_in_schema=False)
def _ctx(**extra) -> dict:
    """Base template context; callers may override any key via **extra."""
    context: dict = {"version": __version__, "flash": None}
    context.update(extra)
    return context
# ---------------------------------------------------------------------------
# Modules — home page
# ---------------------------------------------------------------------------
@_router.get("/", response_class=HTMLResponse)
def home(request: Request):
    """Module index: every module with its last-run summary, grouped by
    source connection (name, driver kind)."""
    modules = repo.list_modules()
    conns_by_id = {c["id"]: c for c in repo.list_connections()}
    drivers_by_id = {d["id"]: d for d in repo.list_drivers()}
    # attach last-run summary to each module
    # NOTE(review): this is one list_runs query per module (N+1) — fine for
    # small module counts, revisit if the list grows.
    for m in modules:
        recent = repo.list_runs(module_id=m["id"], limit=1)
        if recent:
            last = recent[0]
            m["last_run_at"] = last["started_at"]
            m["last_status"] = last["status"]
            m["last_row_count"] = last["row_count"]
        else:
            m["last_run_at"] = None
            m["last_status"] = None
            m["last_row_count"] = None
    # group by source connection
    grouped: dict[tuple[str, str], list] = {}
    for m in modules:
        src = conns_by_id.get(m["source_connection_id"], {})
        drv = drivers_by_id.get(src.get("driver_id"), {}) if src else {}
        key = (src.get("name", "(unknown)"), drv.get("kind", "?"))
        grouped.setdefault(key, []).append(m)
    # stable ordering for the template: sort groups by (name, kind)
    grouped_list = [(name, kind, mods)
                    for (name, kind), mods in sorted(grouped.items())]
    return _templates.TemplateResponse(
        request,
        "modules_index.html",
        _ctx(total=len(modules), grouped=grouped_list),
    )
@_router.get("/modules/{module_id}", response_class=HTMLResponse)
def module_detail(request: Request, module_id: int):
    """Single-module page: connections, watermarks, hooks, recent runs,
    declared columns, and a best-effort SQL preview.

    The preview resolves watermarks from defaults only (no source queries)
    and never fails the page — errors surface as preview_error instead.
    """
    import json as _json
    module = repo.get_module(module_id)
    if module is None:
        raise HTTPException(404, f"module id={module_id} not found")
    source = repo.get_connection(module["source_connection_id"])
    dest = repo.get_connection(module["dest_connection_id"])
    watermarks = repo.list_watermarks(module_id)
    hooks = repo.list_hooks(module_id)
    recent_runs = repo.list_runs(module_id=module_id, limit=10)
    schema_cols: list[dict] = []
    if module.get("columns_json"):
        try:
            schema_cols = _json.loads(module["columns_json"])
        except (ValueError, TypeError):
            # Corrupt columns_json — render the page without the column table.
            schema_cols = []
    preview = None
    preview_error: str | None = None
    try:
        wm_values = watermark.resolve_watermarks(module, use_defaults_only=True)
        merge_sql = build_merge_sql(
            strategy=module["merge_strategy"],
            dest_table=module["dest_table"],
            staging_table=module["staging_table"],
            merge_key=module["merge_key"],
        )
        preview = {
            "watermark_values": wm_values,
            "resolved_source_sql": watermark.materialise(module["source_query"], wm_values),
            "merge_sql": merge_sql,
        }
    except MergeError as e:
        preview_error = str(e)
    except Exception as e:  # noqa: BLE001
        preview_error = f"{type(e).__name__}: {e}"
    return _templates.TemplateResponse(
        request,
        "module_detail.html",
        _ctx(module=module, source_conn=source or {}, dest_conn=dest or {},
             watermarks=watermarks, hooks=hooks, recent_runs=recent_runs,
             preview=preview, preview_error=preview_error,
             schema_cols=schema_cols),
    )
@_router.post("/modules/{module_id}/run")
async def module_run_action(module_id: int, request: Request):
    """Kick off a (possibly dry) run and redirect to its run page.

    The run_log row is created up front so there is always a page to land
    on, even when the engine refuses to start.
    """
    form = await request.form()
    dry = form.get("dry_run") == "1"
    if repo.get_module(module_id) is None:
        raise HTTPException(404, f"module id={module_id} not found")
    run_id = repo.create_run(module_id)
    try:
        engine.run_module(module_id, run_id=run_id, dry_run=dry)
    except engine.LockBusy as e:
        # Another process holds the module lock — record why this run
        # never started instead of leaving it dangling.
        repo.finish_run(run_id, status="error", error=str(e))
    except Exception as e:  # noqa: BLE001
        # Bug fix: an unexpected engine failure previously left the freshly
        # created run_log row stuck in status='running' forever. Close it
        # out (unless the engine already finalised it itself) and re-raise
        # so the client still sees the 500.
        current = repo.get_run(run_id)
        if current is not None and current["status"] == "running":
            repo.finish_run(run_id, status="error",
                            error=f"{type(e).__name__}: {e}")
        raise
    return RedirectResponse(url=f"/runs/{run_id}", status_code=303)
# ---------------------------------------------------------------------------
# Wizard — guided new-module flow (per SPEC.md §"Wizard")
# ---------------------------------------------------------------------------
def _driver_for_conn(conn: dict):
    """Resolve the Driver adapter for a connection row, or None when the
    driver row is missing."""
    driver_row = repo.get_driver_row(conn["driver_id"])
    if not driver_row:
        return None
    return drivers.get_driver(driver_row["kind"])
@_router.get("/wizard", response_class=HTMLResponse)
def wizard_step1(request: Request):
    """Step 1 — pick the source connection."""
    drivers_by_id = {d["id"]: d for d in repo.list_drivers()}
    conns = repo.list_connections()
    # Decorate each connection with its driver's kind/name for the picker.
    for conn in conns:
        drv = drivers_by_id.get(conn["driver_id"])
        conn["driver_kind"] = drv["kind"] if drv else "?"
        conn["driver_label"] = drv["name"] if drv else "?"
    return _templates.TemplateResponse(
        request, "wizard_step1.html", _ctx(connections=conns, step=1))
@_router.get("/wizard/tables", response_class=HTMLResponse)
def wizard_step2(request: Request,
                 source_connection_id: int = Query(...)):
    """Step 2 — enter qualifier fields, browse tables.

    Tables are only fetched when all required qualifier fields are filled
    AND the user explicitly clicked Browse (?browse=1); a plain page load
    just renders the form.
    """
    conn = repo.get_connection(source_connection_id)
    if conn is None:
        raise HTTPException(404, f"connection id={source_connection_id} not found")
    drv = _driver_for_conn(conn)
    if drv is None:
        raise HTTPException(500, "driver row missing for connection")
    browse = drv.browse_fields()
    # Collect qualifier values from the querystring — each browse_field
    # name maps to a top-level query param.
    qvals: dict = {}
    qp = dict(request.query_params)
    for f in browse:
        v = qp.get(f.name)
        if v:
            qvals[f.name] = v
        elif f.default:
            # Fall back to the driver-declared default when not supplied.
            qvals[f.name] = f.default
    tables: list[dict] = []
    fetch_error: str | None = None
    required_ok = all(qvals.get(f.name) for f in browse if f.required)
    should_fetch = required_ok and qp.get("browse") == "1"
    if should_fetch:
        try:
            tables = [t.to_dict() for t in drv.list_tables(conn, **qvals)]
        except (jrunner.JrunnerError, ValueError) as e:
            fetch_error = str(e)
        except Exception as e:  # noqa: BLE001
            # Render any other failure inline rather than 500-ing the wizard.
            fetch_error = f"{type(e).__name__}: {e}"
    return _templates.TemplateResponse(
        request,
        "wizard_step2.html",
        _ctx(step=2, connection=conn, driver_kind=drv.kind,
             browse_fields=browse, qvals=qvals, tables=tables,
             fetch_error=fetch_error, required_ok=required_ok,
             attempted=should_fetch),
    )
@_router.get("/wizard/columns", response_class=HTMLResponse)
def wizard_step3(request: Request,
                 source_connection_id: int = Query(...),
                 table: str = Query(...),
                 table_schema: str = Query("")):
    """Step 3 — pick columns, merge config, destination.

    Columns come from the source driver with proposed dest name/type/
    description pre-filled; destination connections are limited to ones
    whose driver kind is 'pg'.
    """
    conn = repo.get_connection(source_connection_id)
    if conn is None:
        raise HTTPException(404, f"connection id={source_connection_id} not found")
    drv = _driver_for_conn(conn)
    if drv is None:
        raise HTTPException(500, "driver row missing for connection")
    # Re-collect the step-2 qualifier values carried over in the querystring.
    qvals: dict = {}
    qp = dict(request.query_params)
    for f in drv.browse_fields():
        v = qp.get(f.name)
        if v:
            qvals[f.name] = v
    columns: list[dict] = []
    fetch_error: str | None = None
    table_description: str | None = None
    try:
        for c in drv.get_columns(conn, table, **qvals):
            d = c.to_dict()
            # Editable proposals: lower-cased name, mapped dest type, and
            # the source-side description when the driver exposes one.
            d["default_dest_name"] = c.name.lower()
            d["default_dest_type"] = drv.map_type(c.type_raw)
            d["default_description"] = c.description or ""
            columns.append(d)
        table_description = drv.describe_table(conn, table, **qvals) or ""
    except (jrunner.JrunnerError, ValueError) as e:
        fetch_error = str(e)
    except Exception as e:  # noqa: BLE001
        fetch_error = f"{type(e).__name__}: {e}"
    drivers_by_id = {d["id"]: d for d in repo.list_drivers()}
    # Only Postgres connections can be a destination.
    dest_conns = [
        c for c in repo.list_connections()
        if drivers_by_id.get(c["driver_id"], {}).get("kind") == "pg"
    ]
    qualified = drv.qualified_table_name(table, **qvals) if not fetch_error else table
    default_module_name = (table_schema + "_" + table).lower() if table_schema else table.lower()
    default_dest_conn_id = conn.get("default_dest_connection_id")
    default_dest_schema = conn.get("default_dest_schema") or ""
    return _templates.TemplateResponse(
        request,
        "wizard_step3.html",
        _ctx(step=3, connection=conn, all_connections=dest_conns,
             driver_kind=drv.kind, qvals=qvals, table=table, table_schema=table_schema,
             qualified_table=qualified, columns=columns,
             table_description=table_description,
             fetch_error=fetch_error, default_module_name=default_module_name,
             default_dest_conn_id=default_dest_conn_id,
             default_dest_schema=default_dest_schema),
    )
@_router.post("/wizard/create")
async def wizard_create(request: Request):
    """Step 4 — build source_query from picks, create the module,
    and provision the destination schema + table.

    Order matters: the destination objects are created first via jrunner,
    and the module row is only inserted after provisioning succeeds.
    """
    form = await request.form()
    source_connection_id = int(form["source_connection_id"])
    dest_connection_id = int(form["dest_connection_id"])
    table = form["table"]
    module_name = form["module_name"].strip()
    dest_table = form["dest_table"].strip()
    merge_strategy = form.get("merge_strategy", "full")
    merge_key = (form.get("merge_key") or "").strip() or None
    staging_table = (form.get("staging_table") or "").strip() or None
    dest_description = (form.get("dest_description") or "").strip() or None
    picked = form.getlist("col")
    src_conn = repo.get_connection(source_connection_id)
    if src_conn is None:
        raise HTTPException(404, f"connection id={source_connection_id} not found")
    src_drv = _driver_for_conn(src_conn)
    if src_drv is None:
        raise HTTPException(500, "driver row missing for source connection")
    dest_conn = repo.get_connection(dest_connection_id)
    if dest_conn is None:
        raise HTTPException(404, f"connection id={dest_connection_id} not found")
    dest_drv = _driver_for_conn(dest_conn)
    if dest_drv is None:
        raise HTTPException(500, "driver row missing for dest connection")
    # Qualifier values (schema/library/etc.) round-tripped through the form.
    qvals: dict = {}
    for f in src_drv.browse_fields():
        v = form.get(f.name)
        if v:
            qvals[f.name] = v
    # Re-fetch source columns and intersect with the user's picks; unknown
    # names in `picked` are silently dropped.
    all_cols = src_drv.get_columns(src_conn, table, **qvals)
    by_name = {c.name: c for c in all_cols}
    chosen = []
    for name in picked:
        if name not in by_name:
            continue
        src_col = by_name[name]
        # Per-column editable fields use the dest_*__{source_name} convention.
        dest_name = (form.get(f"dest_name__{name}") or "").strip()
        dest_type = (form.get(f"dest_type__{name}") or "").strip()
        desc = (form.get(f"dest_desc__{name}") or "").strip() or None
        if not dest_name or not dest_type:
            raise HTTPException(400, f"column {name!r} missing dest_name or dest_type")
        chosen.append({
            "source_name": src_col.name,
            "source_type": src_col.type_raw,
            "dest_name": dest_name,
            "dest_type": dest_type,
            "description": desc,
        })
    if not chosen:
        raise HTTPException(400, "no columns selected")
    # Assemble the SELECT: driver-specific source expression AS quoted dest name.
    qualified_source = src_drv.qualified_table_name(table, **qvals)
    select_list = ",\n    ".join(
        f"{src_drv.default_expression(c['source_type'], c['source_name'])} AS "
        f"{dest_drv.quote_identifier(c['dest_name'])}"
        for c in chosen
    )
    source_query = f"SELECT\n    {select_list}\nFROM {qualified_source}"
    # dest_table may be "schema.table" or bare; bare names land in public.
    dest_schema, _, dest_table_bare = dest_table.partition(".")
    if not dest_table_bare:
        dest_schema, dest_table_bare = "public", dest_schema
    qualified_dest = dest_drv.qualified_table_name(dest_table_bare, schema=dest_schema)
    try:
        create_table_sql = dest_drv.build_create_table_sql(qualified_dest, chosen)
    except NotImplementedError as e:
        raise HTTPException(400, str(e))
    # Provision schema, table, and COMMENT ON metadata on the destination.
    try:
        jrunner.run_dest_sql(
            dest_conn,
            f"CREATE SCHEMA IF NOT EXISTS {dest_drv.quote_identifier(dest_schema)};",
        )
        jrunner.run_dest_sql(dest_conn, create_table_sql)
        comment_sql = _build_comment_sql(dest_drv, qualified_dest,
                                         dest_description, chosen)
        if comment_sql:
            jrunner.run_dest_sql(dest_conn, comment_sql)
    except jrunner.JrunnerError as e:
        raise HTTPException(500, f"dest provisioning failed: {e}")
    module = repo.create_module(
        name=module_name,
        source_connection_id=source_connection_id,
        dest_connection_id=dest_connection_id,
        dest_table=dest_table,
        source_query=source_query,
        merge_strategy=merge_strategy,
        merge_key=merge_key,
        staging_table=staging_table,
        columns=chosen,
        dest_description=dest_description,
    )
    return RedirectResponse(url=f"/modules/{module['id']}", status_code=303)
def _sql_str(v: str) -> str:
"""SQL string literal — PG-style single-quote escaping."""
return "'" + v.replace("'", "''") + "'"
def _build_comment_sql(dest_drv, qualified_dest: str,
                       table_description: str | None,
                       columns: list[dict]) -> str:
    """COMMENT ON statements for the table plus every described column;
    empty string when there is nothing to comment."""
    statements: list[str] = []
    if table_description:
        statements.append(
            f"COMMENT ON TABLE {qualified_dest} IS {_sql_str(table_description)};"
        )
    for col in columns:
        description = col.get("description")
        if description:
            quoted_col = dest_drv.quote_identifier(col["dest_name"])
            statements.append(
                f"COMMENT ON COLUMN {qualified_dest}.{quoted_col} IS {_sql_str(description)};"
            )
    return "\n".join(statements)
# ---------------------------------------------------------------------------
# Connections
# ---------------------------------------------------------------------------
@_router.get("/connections", response_class=HTMLResponse)
def connections_index(request: Request):
    """Connection list, with each row decorated with its driver kind."""
    all_drivers = repo.list_drivers()
    by_id = {d["id"]: d for d in all_drivers}
    conns = repo.list_connections()
    for conn in conns:
        drv = by_id.get(conn["driver_id"])
        conn["driver_kind"] = drv["kind"] if drv else "?"
    return _templates.TemplateResponse(
        request,
        "connections.html",
        _ctx(connections=conns, drivers=all_drivers),
    )
@_router.get("/connections/new", response_class=HTMLResponse)
def connection_new(request: Request):
    """Blank connection form; it POSTs back to /connections."""
    context = _ctx(
        connection=None,
        drivers=repo.list_drivers(),
        connections=repo.list_connections(),
        form_action="/connections",
        cancel_url="/connections",
    )
    return _templates.TemplateResponse(request, "connection_form.html", context)
@_router.post("/connections")
async def connection_create(request: Request):
    """Create a connection from the submitted form, then back to the list."""
    form = await request.form()

    def opt(field: str) -> str | None:
        # Optional text field: strip whitespace, empty becomes None.
        return (form.get(field) or "").strip() or None

    ddc = form.get("default_dest_connection_id")
    repo.create_connection(
        name=form["name"].strip(),
        driver_id=int(form["driver_id"]),
        jdbc_url=form["jdbc_url"].strip(),
        username=opt("username"),
        password=opt("password"),
        default_dest_connection_id=int(ddc) if ddc else None,
        default_dest_schema=opt("default_dest_schema"),
        notes=opt("notes"),
    )
    return RedirectResponse(url="/connections", status_code=303)
@_router.get("/connections/{connection_id}/edit", response_class=HTMLResponse)
def connection_edit(request: Request, connection_id: int):
    """Pre-filled edit form for an existing connection; 404 when unknown."""
    conn = repo.get_connection(connection_id)
    if conn is None:
        raise HTTPException(404, f"connection id={connection_id} not found")
    context = _ctx(
        connection=conn,
        drivers=repo.list_drivers(),
        connections=repo.list_connections(),
        form_action=f"/connections/{connection_id}",
        cancel_url="/connections",
    )
    return _templates.TemplateResponse(request, "connection_form.html", context)
@_router.post("/connections/{connection_id}")
async def connection_update(request: Request, connection_id: int):
    """Apply form edits to an existing connection, then back to the list."""
    if repo.get_connection(connection_id) is None:
        raise HTTPException(404, f"connection id={connection_id} not found")
    form = await request.form()

    def opt(field: str) -> str | None:
        # Optional text field: strip whitespace, empty becomes None.
        return (form.get(field) or "").strip() or None

    ddc = form.get("default_dest_connection_id")
    repo.update_connection(
        connection_id,
        name=form["name"].strip(),
        driver_id=int(form["driver_id"]),
        jdbc_url=form["jdbc_url"].strip(),
        username=opt("username"),
        password=opt("password"),
        default_dest_connection_id=int(ddc) if ddc else None,
        default_dest_schema=opt("default_dest_schema"),
        notes=opt("notes"),
    )
    return RedirectResponse(url="/connections", status_code=303)
@_router.post("/connections/{connection_id}/delete")
def connection_delete(connection_id: int):
    """Delete a connection; 409 when the repo reports it is still in use."""
    if repo.get_connection(connection_id) is None:
        raise HTTPException(404, f"connection id={connection_id} not found")
    try:
        repo.delete_connection(connection_id)
    except repo.ConnectionInUse as e:
        raise HTTPException(409, str(e))
    return RedirectResponse(url="/connections", status_code=303)
# ---------------------------------------------------------------------------
# Runs
# ---------------------------------------------------------------------------
@_router.get("/runs", response_class=HTMLResponse)
def runs_index(request: Request,
               module_id: int | None = Query(None),
               limit: int = Query(50, ge=1, le=500)):
    """Recent runs, optionally narrowed to a single module."""
    module_filter = repo.get_module(module_id) if module_id else None
    recent = repo.list_runs(module_id=module_id, limit=limit)
    return _templates.TemplateResponse(
        request,
        "runs.html",
        _ctx(runs=recent, module_filter=module_filter),
    )
@_router.get("/runs/{run_id}", response_class=HTMLResponse)
def run_detail(request: Request, run_id: int):
    """Single run with full logs; the owning module's name is resolved
    for the page header (falls back to '?')."""
    run = repo.get_run(run_id)
    if run is None:
        raise HTTPException(404, f"run id={run_id} not found")
    owner = repo.get_module(run["module_id"])
    run["module_name"] = owner["name"] if owner else "?"
    return _templates.TemplateResponse(request, "run_detail.html", _ctx(run=run))
# ---------------------------------------------------------------------------
# Watermarks — add/edit/delete forms on module detail
# ---------------------------------------------------------------------------
@_router.get("/modules/{module_id}/watermarks/new", response_class=HTMLResponse)
def watermark_new(request: Request, module_id: int):
    """Blank watermark form for a module; 404 when the module is unknown."""
    module = repo.get_module(module_id)
    if module is None:
        raise HTTPException(404, f"module id={module_id} not found")
    context = _ctx(
        module=module,
        watermark=None,
        connections=repo.list_connections(),
        form_action=f"/modules/{module_id}/watermarks",
        cancel_url=f"/modules/{module_id}",
    )
    return _templates.TemplateResponse(request, "watermark_form.html", context)
@_router.post("/modules/{module_id}/watermarks")
async def watermark_create(request: Request, module_id: int):
    """Create a watermark from the form, then back to the module page."""
    if repo.get_module(module_id) is None:
        raise HTTPException(404, f"module id={module_id} not found")
    form = await request.form()
    cleaned_default = (form.get("default_value") or "").strip() or None
    repo.create_watermark(
        module_id=module_id,
        name=form["name"].strip(),
        connection_id=int(form["connection_id"]),
        resolver_sql=form["resolver_sql"],
        default_value=cleaned_default,
    )
    return RedirectResponse(url=f"/modules/{module_id}", status_code=303)
@_router.get("/watermarks/{watermark_id}/edit", response_class=HTMLResponse)
def watermark_edit(request: Request, watermark_id: int):
    """Pre-filled edit form for an existing watermark."""
    wm = repo.get_watermark(watermark_id)
    if wm is None:
        raise HTTPException(404, f"watermark id={watermark_id} not found")
    # NOTE(review): module is assumed to exist here (watermark.module_id is
    # a NOT NULL FK with ON DELETE CASCADE); if it were ever None, the
    # module['id'] subscript below would raise TypeError — confirm.
    module = repo.get_module(wm["module_id"])
    return _templates.TemplateResponse(
        request,
        "watermark_form.html",
        _ctx(module=module, watermark=wm, connections=repo.list_connections(),
             form_action=f"/watermarks/{watermark_id}",
             cancel_url=f"/modules/{module['id']}"),
    )
@_router.post("/watermarks/{watermark_id}")
async def watermark_update(request: Request, watermark_id: int):
    """Apply form edits to a watermark, then back to its module page."""
    wm = repo.get_watermark(watermark_id)
    if wm is None:
        raise HTTPException(404, f"watermark id={watermark_id} not found")
    form = await request.form()
    cleaned_default = (form.get("default_value") or "").strip() or None
    repo.update_watermark(
        watermark_id,
        name=form["name"].strip(),
        connection_id=int(form["connection_id"]),
        resolver_sql=form["resolver_sql"],
        default_value=cleaned_default,
    )
    back_to = f"/modules/{wm['module_id']}"
    return RedirectResponse(url=back_to, status_code=303)
@_router.post("/watermarks/{watermark_id}/delete")
def watermark_delete(watermark_id: int):
    """Delete a watermark, then return to the module it belonged to."""
    wm = repo.get_watermark(watermark_id)
    if wm is None:
        raise HTTPException(404, f"watermark id={watermark_id} not found")
    owner_id = wm["module_id"]
    repo.delete_watermark(watermark_id)
    return RedirectResponse(url=f"/modules/{owner_id}", status_code=303)
# ---------------------------------------------------------------------------
# Hooks — add/edit/delete forms on module detail
# ---------------------------------------------------------------------------
@_router.get("/modules/{module_id}/hooks/new", response_class=HTMLResponse)
def hook_new(request: Request, module_id: int):
    """Blank hook form for a module; 404 when the module is unknown."""
    module = repo.get_module(module_id)
    if module is None:
        raise HTTPException(404, f"module id={module_id} not found")
    return _templates.TemplateResponse(
        request,
        "hook_form.html",
        _ctx(module=module, hook=None, connections=repo.list_connections(),
             form_action=f"/modules/{module_id}/hooks",
             cancel_url=f"/modules/{module_id}"),
    )
@_router.post("/modules/{module_id}/hooks")
async def hook_create(request: Request, module_id: int):
    """Create a hook from the form, then back to the module page."""
    if repo.get_module(module_id) is None:
        raise HTTPException(404, f"module id={module_id} not found")
    form = await request.form()
    raw_conn = form.get("connection_id")
    repo.create_hook(
        module_id=module_id,
        sql=form["sql"],
        run_order=int(form.get("run_order") or 0),
        connection_id=int(raw_conn) if raw_conn else None,
        run_on=form.get("run_on", "success"),
    )
    return RedirectResponse(url=f"/modules/{module_id}", status_code=303)
@_router.get("/hooks/{hook_id}/edit", response_class=HTMLResponse)
def hook_edit(request: Request, hook_id: int):
    """Pre-filled edit form for an existing hook."""
    hook = repo.get_hook(hook_id)
    if hook is None:
        raise HTTPException(404, f"hook id={hook_id} not found")
    # NOTE(review): module is assumed to exist (hook.module_id is a NOT NULL
    # FK with ON DELETE CASCADE); if None, module['id'] below would raise
    # TypeError — confirm.
    module = repo.get_module(hook["module_id"])
    return _templates.TemplateResponse(
        request,
        "hook_form.html",
        _ctx(module=module, hook=hook, connections=repo.list_connections(),
             form_action=f"/hooks/{hook_id}",
             cancel_url=f"/modules/{module['id']}"),
    )
@_router.post("/hooks/{hook_id}")
async def hook_update(request: Request, hook_id: int):
    """Apply form edits to a hook, then back to its module page.

    NOTE(review): when the connection picker is left empty we pass
    connection_id=None, and repo.update_hook appears to treat None as
    "leave unchanged" — so a previously set connection cannot be cleared
    from this form. Verify this is intended.
    """
    hook = repo.get_hook(hook_id)
    if hook is None:
        raise HTTPException(404, f"hook id={hook_id} not found")
    form = await request.form()
    conn_id = form.get("connection_id")
    repo.update_hook(
        hook_id,
        sql=form["sql"],
        run_order=int(form.get("run_order") or 0),
        connection_id=int(conn_id) if conn_id else None,
        run_on=form.get("run_on", "success"),
    )
    return RedirectResponse(url=f"/modules/{hook['module_id']}", status_code=303)
@_router.post("/hooks/{hook_id}/delete")
def hook_delete(hook_id: int):
    """Delete a hook, then return to the module it belonged to."""
    hook = repo.get_hook(hook_id)
    if hook is None:
        raise HTTPException(404, f"hook id={hook_id} not found")
    owner_id = hook["module_id"]
    repo.delete_hook(hook_id)
    return RedirectResponse(url=f"/modules/{owner_id}", status_code=303)

View File

@ -0,0 +1,279 @@
/* Pipekit web structured, bordered, terminal-inspired.
   Design bar (per user feedback):
   - Every logical region has a visible border + title.
   - Pickers and structured lists over free-text inputs.
   - Layout directs flow; nothing floats. */
/* Design tokens: dark palette + font stacks. All components reference these
   custom properties — change colors here, not per-rule. */
:root {
  --bg: #111418;
  --surface: #181c22;
  --border: #2a3038;
  --border-strong: #3d4652;
  --text: #d7dce3;
  --text-muted: #8b95a2;
  --accent: #6fa8dc;
  --success: #78c679;
  --danger: #e57373;
  --warning: #e1b467;
  --mono: "JetBrains Mono", "Fira Code", "Consolas", "Courier New", monospace;
  --sans: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
}
* { box-sizing: border-box; }
body {
  margin: 0;
  background: var(--bg);
  color: var(--text);
  font-family: var(--sans);
  font-size: 14px;
  line-height: 1.45;
}
a { color: var(--accent); text-decoration: none; }
a:hover { text-decoration: underline; }
code, pre, .mono { font-family: var(--mono); font-size: 13px; }
/* Fixed-position-free top navigation bar (brand + nav links + version). */
header.topbar {
  display: flex;
  align-items: center;
  gap: 1.5rem;
  padding: 0.6rem 1.2rem;
  background: var(--surface);
  border-bottom: 1px solid var(--border-strong);
}
header.topbar .brand {
  font-weight: 700;
  letter-spacing: 0.05em;
}
header.topbar nav {
  display: flex;
  gap: 1rem;
}
header.topbar nav a {
  color: var(--text-muted);
  padding: 0.2rem 0.5rem;
  border-radius: 3px;
}
header.topbar nav a.active,
header.topbar nav a:hover {
  color: var(--text);
  background: var(--border);
  text-decoration: none;
}
header.topbar .right { margin-left: auto; color: var(--text-muted); font-size: 12px; }
main {
  max-width: 1200px;
  margin: 1rem auto;
  padding: 0 1.2rem;
}
/* Bordered panels — the bread and butter. */
.panel {
  background: var(--surface);
  border: 1px solid var(--border);
  border-radius: 4px;
  margin-bottom: 1rem;
}
.panel > header {
  padding: 0.45rem 0.9rem;
  border-bottom: 1px solid var(--border);
  background: #1d222a;
  font-weight: 600;
  letter-spacing: 0.02em;
  display: flex;
  align-items: center;
  gap: 0.6rem;
}
.panel > header .subtitle {
  color: var(--text-muted);
  font-weight: 400;
  font-size: 12px;
}
.panel > .body { padding: 0.8rem 0.9rem; }
.panel > .body.tight { padding: 0; }
.panel > footer {
  padding: 0.5rem 0.9rem;
  border-top: 1px solid var(--border);
  background: #15191f;
  font-size: 12px;
  color: var(--text-muted);
}
/* Tables */
table.grid {
  width: 100%;
  border-collapse: collapse;
}
table.grid th, table.grid td {
  padding: 0.4rem 0.7rem;
  border-bottom: 1px solid var(--border);
  text-align: left;
  vertical-align: top;
}
table.grid th {
  color: var(--text-muted);
  font-weight: 500;
  font-size: 12px;
  letter-spacing: 0.04em;
  text-transform: uppercase;
  background: #15191f;
  border-bottom-color: var(--border-strong);
}
table.grid tr:last-child td { border-bottom: none; }
table.grid tr:hover td { background: #1c2128; }
/* Status pills */
.pill {
  display: inline-block;
  padding: 0.05rem 0.5rem;
  border-radius: 10px;
  font-size: 11px;
  font-weight: 600;
  letter-spacing: 0.04em;
  text-transform: uppercase;
  border: 1px solid currentColor;
  color: var(--text-muted);
}
/* Pill color variants keyed by run/module status strings rendered by the
   templates (e.g. class="pill {{ r.status }}"). */
.pill.ok, .pill.success { color: var(--success); }
.pill.err, .pill.error { color: var(--danger); }
.pill.running { color: var(--accent); }
.pill.disabled { color: var(--text-muted); }
.pill.warning { color: var(--warning); }
/* Labeled key-value rows (used in detail views) */
dl.keyval {
  display: grid;
  grid-template-columns: 10rem 1fr;
  gap: 0.3rem 1rem;
  margin: 0;
}
dl.keyval dt { color: var(--text-muted); }
dl.keyval dd { margin: 0; }
/* SQL blocks */
pre.sql {
  background: #0f1216;
  border: 1px solid var(--border);
  border-radius: 3px;
  padding: 0.7rem 0.9rem;
  margin: 0;
  white-space: pre-wrap;
  overflow-x: auto;
  color: #c6d0da;
}
/* Buttons and forms */
button, .btn {
  background: var(--border);
  border: 1px solid var(--border-strong);
  color: var(--text);
  padding: 0.35rem 0.9rem;
  border-radius: 3px;
  font-family: inherit;
  font-size: 13px;
  cursor: pointer;
}
button:hover, .btn:hover { background: var(--border-strong); }
button.primary { background: #22303f; border-color: #3d5273; color: #cfe0f5; }
button.primary:hover { background: #2b3d52; }
button.ghost { background: transparent; }
form.inline { display: inline; }
.actions { display: flex; gap: 0.5rem; flex-wrap: wrap; }
/* Empty-state */
.empty {
  padding: 1.5rem;
  text-align: center;
  color: var(--text-muted);
}
/* Group heading inside a panel (used on the module list) */
.group-head {
  padding: 0.4rem 0.9rem;
  color: var(--text-muted);
  background: #141820;
  font-size: 12px;
  letter-spacing: 0.05em;
  text-transform: uppercase;
  border-bottom: 1px solid var(--border);
}
/* Two-column layout helper */
.two-col {
  display: grid;
  grid-template-columns: 2fr 1fr;
  gap: 1rem;
}
@media (max-width: 900px) {
  .two-col { grid-template-columns: 1fr; }
}
/* Form controls — inputs, selects, textarea. Match bordered panel look. */
input[type="text"], input[type="number"], input[type="password"],
select, textarea {
  background: #0f1216;
  border: 1px solid var(--border-strong);
  color: var(--text);
  padding: 0.35rem 0.6rem;
  border-radius: 3px;
  font-family: inherit;
  font-size: 13px;
  min-width: 14rem;
}
input:focus, select:focus, textarea:focus {
  outline: 1px solid var(--accent);
  border-color: var(--accent);
}
textarea { font-family: var(--mono); min-width: 100%; }
/* Two-column label/control rows used by every form template. */
label.field {
  display: grid;
  grid-template-columns: 10rem 1fr;
  align-items: center;
  gap: 0.5rem 1rem;
  margin-bottom: 0.6rem;
}
label.field .help { grid-column: 2; color: var(--text-muted); font-size: 12px; }
/* Step indicator */
.steps {
  display: flex;
  gap: 0;
  margin-bottom: 1rem;
  border: 1px solid var(--border);
  border-radius: 4px;
  overflow: hidden;
  background: var(--surface);
}
.steps .step {
  flex: 1;
  padding: 0.5rem 0.9rem;
  color: var(--text-muted);
  font-size: 12px;
  letter-spacing: 0.04em;
  text-transform: uppercase;
  border-right: 1px solid var(--border);
}
.steps .step:last-child { border-right: none; }
.steps .step.active { color: var(--text); background: #1d222a; }
.steps .step.done { color: var(--success); }
.steps .step .num { font-weight: 700; margin-right: 0.4rem; }
/* Radio/checkbox-in-row tables */
table.picker td.pick { width: 2.5rem; text-align: center; }
table.picker input[type="radio"],
table.picker input[type="checkbox"] { margin: 0; }
table.picker tbody tr { cursor: pointer; }
table.picker tbody tr:hover td { background: #1c2128; }
/* Flash messages */
.flash {
  padding: 0.5rem 0.9rem;
  margin-bottom: 1rem;
  border-radius: 3px;
  border: 1px solid var(--border-strong);
  background: #1d222a;
}
.flash.ok { border-color: #2f6b35; background: #16261a; color: #b6dcb8; }
.flash.err { border-color: #6b2f2f; background: #261616; color: #dcb6b6; }

View File

@ -0,0 +1,12 @@
{# Step indicator shared by all wizard pages. `step` is 1..4. #}
{# NOTE(review): only three steps are rendered below; presumably step == 4
   (final create) marks all three as done — confirm against the wizard routes. #}
<div class="steps">
  <div class="step {% if step == 1 %}active{% elif step > 1 %}done{% endif %}">
    <span class="num">1</span> source connection
  </div>
  <div class="step {% if step == 2 %}active{% elif step > 2 %}done{% endif %}">
    <span class="num">2</span> browse tables
  </div>
  <div class="step {% if step == 3 %}active{% elif step > 3 %}done{% endif %}">
    <span class="num">3</span> columns &amp; config
  </div>
</div>

View File

@ -0,0 +1,26 @@
<!doctype html>
{# Base layout: top nav bar, optional flash banner, and the `content` block.
   Child templates set `section` to highlight the matching nav link. #}
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>{% block title %}Pipekit{% endblock %}</title>
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="stylesheet" href="/static/style.css">
</head>
<body>
<header class="topbar">
  <span class="brand">PIPEKIT</span>
  <nav>
    <a href="/" class="{% if section == 'modules' %}active{% endif %}">Modules</a>
    <a href="/connections" class="{% if section == 'connections' %}active{% endif %}">Connections</a>
    <a href="/runs" class="{% if section == 'runs' %}active{% endif %}">Runs</a>
  </nav>
  {# `version` is expected in the template context; /docs is FastAPI's API UI. #}
  <span class="right">v{{ version }} &middot; <a href="/docs">API docs</a></span>
</header>
<main>
  {% if flash %}
    <div class="flash {{ flash.kind }}">{{ flash.message }}</div>
  {% endif %}
  {% block content %}{% endblock %}
</main>
</body>
</html>

View File

@ -0,0 +1,111 @@
{% extends "base.html" %}
{% set section = "connections" %}
{# Create/edit form for a connection. `connection` is None for the create
   case; every field value falls back to '' when creating. #}
{% block title %}{% if connection %}Edit connection{% else %}New connection{% endif %} — Pipekit{% endblock %}
{% block content %}
<div class="panel">
  <header>
    {% if connection %}
      Edit connection &middot; {{ connection.name }}
    {% else %}
      New connection
    {% endif %}
    <span class="subtitle">jdbc endpoint + credentials</span>
    <span style="margin-left:auto"><a href="{{ cancel_url }}">&larr; back</a></span>
  </header>
  <div class="body">
    <form method="post" action="{{ form_action }}">
      <label class="field">
        <span>name</span>
        <input type="text" name="name" required
               value="{{ connection.name if connection else '' }}">
        <span class="help">short identifier, used in module and watermark links</span>
      </label>
      <label class="field">
        <span>driver</span>
        <select name="driver_id" required>
          {% for d in drivers %}
            <option value="{{ d.id }}"
                    {% if connection and d.id == connection.driver_id %}selected{% endif %}>
              {{ d.name }} &mdash; {{ d.kind }}
            </option>
          {% endfor %}
        </select>
        <span class="help">jar + class registered in the driver table</span>
      </label>
      {# Static cheat-sheet of URL shapes per driver kind, shown above the url field. #}
      <div class="panel" style="margin:0.5rem 0 0.8rem;background:#0f1216">
        <header style="padding:0.3rem 0.7rem">JDBC url format by driver
          <span class="subtitle">pick the line matching the selected driver</span>
        </header>
        <div class="body" style="padding:0.5rem 0.9rem">
          <dl class="keyval" style="grid-template-columns:5rem 1fr;gap:0.2rem 1rem">
            <dt>db2</dt>   <dd class="mono">jdbc:as400://HOST;libraries=LIB1,LIB2;naming=system;translate%20binary=true</dd>
            <dt>mssql</dt> <dd class="mono">jdbc:sqlserver://HOST:1433;databaseName=DB;encrypt=false</dd>
            <dt>pg</dt>    <dd class="mono">jdbc:postgresql://HOST:5432/DATABASE</dd>
          </dl>
        </div>
      </div>
      <label class="field">
        <span>JDBC url</span>
        <input type="text" name="jdbc_url" required
               value="{{ connection.jdbc_url if connection else '' }}"
               placeholder="jdbc:as400://...">
        <span class="help">must start with <code>jdbc:</code> — driver-specific query params after the host</span>
      </label>
      <label class="field">
        <span>username</span>
        <input type="text" name="username"
               value="{{ connection.username if connection else '' }}">
      </label>
      <label class="field">
        <span>password</span>
        <input type="text" name="password"
               value="{{ connection.password if connection else '' }}"
               placeholder="$DB2PW">
        <span class="help">store as an env-var reference like <code>$DB2PW</code> — resolved at run time, never logged</span>
      </label>
      <label class="field">
        <span>default dest connection</span>
        <select name="default_dest_connection_id">
          <option value=""></option>
          {# A connection cannot be its own default destination, so skip self. #}
          {% for c in connections %}
            {% if not connection or c.id != connection.id %}
              <option value="{{ c.id }}"
                      {% if connection and c.id == connection.default_dest_connection_id %}selected{% endif %}>
                {{ c.name }}
              </option>
            {% endif %}
          {% endfor %}
        </select>
        <span class="help">pre-selected as destination when this is the source of a new module</span>
      </label>
      <label class="field">
        <span>default dest schema</span>
        <input type="text" name="default_dest_schema"
               value="{{ connection.default_dest_schema if connection else '' }}"
               placeholder="e.g. rlarp">
        <span class="help">prefix for dest_table in the new-module wizard</span>
      </label>
      <label class="field">
        <span>notes</span>
        <textarea name="notes" rows="3">{{ connection.notes if connection else '' }}</textarea>
      </label>
      <div class="actions" style="justify-content:flex-end;margin-top:0.8rem">
        <a class="btn ghost" href="{{ cancel_url }}">cancel</a>
        <button type="submit" class="primary">
          {% if connection %}save changes{% else %}create connection{% endif %}
        </button>
      </div>
    </form>
  </div>
</div>
{% endblock %}

View File

@ -0,0 +1,74 @@
{% extends "base.html" %}
{% set section = "connections" %}
{# Connections index: a table of connections plus a read-only list of
   registered JDBC drivers underneath. #}
{% block title %}Connections — Pipekit{% endblock %}
{% block content %}
<div class="panel">
  <header>
    Connections
    <span class="subtitle">{{ connections|length }} total</span>
    <span style="margin-left:auto">
      <a class="btn" href="/connections/new">New connection…</a>
    </span>
  </header>
  <div class="body tight">
    {% if connections %}
    <table class="grid">
      <thead>
        <tr>
          <th>name</th>
          <th>driver</th>
          <th>jdbc url</th>
          <th>default dest</th>
          <th></th>
        </tr>
      </thead>
      <tbody>
        {% for c in connections %}
        <tr>
          <td><strong>{{ c.name }}</strong></td>
          <td class="mono">{{ c.driver_kind }}</td>
          <td class="mono" style="max-width:26rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap">{{ c.jdbc_url }}</td>
          <td class="mono">{{ c.default_dest_schema or '—' }}</td>
          <td style="text-align:right;white-space:nowrap">
            <a href="/connections/{{ c.id }}/edit">edit</a> ·
            {# Delete is a POST form (not a link) so it can't be crawled/prefetched. #}
            <form class="inline" method="post" action="/connections/{{ c.id }}/delete"
                  onsubmit="return confirm('Delete connection {{ c.name }}?')">
              <button class="ghost" type="submit" style="padding:0;border:none;color:var(--danger)">delete</button>
            </form>
          </td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
    {% else %}
    <div class="empty">No connections yet.</div>
    {% endif %}
  </div>
</div>
<div class="panel">
  <header>Registered drivers
    <span class="subtitle">{{ drivers|length }} JDBC drivers available</span>
  </header>
  <div class="body tight">
    {% if drivers %}
    <table class="grid">
      <thead><tr><th>kind</th><th>name</th><th>jar</th><th>class</th></tr></thead>
      <tbody>
        {% for d in drivers %}
        <tr>
          <td class="mono">{{ d.kind }}</td>
          <td>{{ d.name }}</td>
          <td class="mono" style="max-width:28rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap">{{ d.jar_file }}</td>
          <td class="mono">{{ d.class_name }}</td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
    {% else %}
    <div class="empty">No drivers registered.</div>
    {% endif %}
  </div>
</div>
{% endblock %}

View File

@ -0,0 +1,63 @@
{% extends "base.html" %}
{% set section = "modules" %}
{# Create/edit form for a post-merge hook. `hook` is None on create;
   `form_action` and `cancel_url` are supplied by the route. #}
{% block title %}{% if hook %}Edit hook{% else %}New hook{% endif %} — {{ module.name }}{% endblock %}
{% block content %}
<div class="panel">
  <header>
    {% if hook %}Edit hook &middot; #{{ hook.id }}{% else %}New hook for {{ module.name }}{% endif %}
    <span class="subtitle">SQL run after the merge, in order</span>
    <span style="margin-left:auto"><a href="{{ cancel_url }}">&larr; back to module</a></span>
  </header>
  <div class="body">
    <form method="post" action="{{ form_action }}">
      <label class="field">
        <span>run order</span>
        <input type="number" name="run_order" min="0" step="1"
               value="{{ hook.run_order if hook else 0 }}">
        <span class="help">lower runs first</span>
      </label>
      <label class="field">
        <span>run on</span>
        <select name="run_on">
          {# Default for a new hook is 'success'. #}
          {% for opt in ['success', 'failure', 'always'] %}
            <option value="{{ opt }}"
                    {% if (hook and hook.run_on == opt) or (not hook and opt == 'success') %}selected{% endif %}>
              {{ opt }}
            </option>
          {% endfor %}
        </select>
        <span class="help">success = only after merge succeeds; always = even on error</span>
      </label>
      <label class="field">
        <span>connection</span>
        <select name="connection_id">
          <option value="">— use module destination —</option>
          {% for c in connections %}
            <option value="{{ c.id }}"
                    {% if hook and c.id == hook.connection_id %}selected{% endif %}>
              {{ c.name }}
            </option>
          {% endfor %}
        </select>
        <span class="help">leave blank to run against the module's destination connection</span>
      </label>
      <label class="field">
        <span>SQL</span>
        <textarea name="sql" rows="8" required>{{ hook.sql if hook else '' }}</textarea>
        <span class="help">e.g. <code>ANALYZE rlarp.mytable;</code></span>
      </label>
      <div class="actions" style="justify-content:flex-end;margin-top:0.8rem">
        <a class="btn ghost" href="{{ cancel_url }}">cancel</a>
        <button type="submit" class="primary">
          {% if hook %}save changes{% else %}create hook{% endif %}
        </button>
      </div>
    </form>
  </div>
</div>
{% endblock %}

View File

@ -0,0 +1,203 @@
{% extends "base.html" %}
{% set section = "modules" %}
{# Module detail page. Left column: source query, schema, and the resolved
   SQL preview. Right column: watermarks, hooks, and recent runs.
   Context: module, source_conn, dest_conn, schema_cols, preview (may be
   None), preview_error, watermarks, hooks, recent_runs. #}
{% block title %}{{ module.name }} — Pipekit{% endblock %}
{% block content %}
<div class="panel">
  <header>
    {{ module.name }}
    <span class="subtitle">
      module #{{ module.id }}
      {% if module.running %}<span class="pill running">running</span>{% endif %}
      {% if not module.enabled %}<span class="pill disabled">disabled</span>{% endif %}
    </span>
    <span style="margin-left:auto" class="actions">
      <form class="inline" method="post" action="/modules/{{ module.id }}/run">
        <button class="primary" type="submit">Run now</button>
      </form>
      {# Same endpoint; the dry_run flag is the only difference. #}
      <form class="inline" method="post" action="/modules/{{ module.id }}/run">
        <input type="hidden" name="dry_run" value="1">
        <button type="submit">Dry run</button>
      </form>
    </span>
  </header>
  <div class="body">
    <dl class="keyval">
      <dt>source</dt>      <dd>{{ source_conn.name }} <span style="opacity:.6" class="mono">({{ source_conn.jdbc_url }})</span></dd>
      <dt>destination</dt> <dd>{{ dest_conn.name }} <span style="opacity:.6" class="mono">({{ dest_conn.jdbc_url }})</span></dd>
      <dt>dest table</dt>  <dd class="mono">{{ module.dest_table }}</dd>
      <dt>staging table</dt> <dd class="mono">{{ module.staging_table }}</dd>
      <dt>merge strategy</dt> <dd class="mono">{{ module.merge_strategy }}</dd>
      <dt>merge key</dt>   <dd class="mono">{{ module.merge_key or "—" }}</dd>
    </dl>
  </div>
</div>
<div class="two-col">
  <div>
    <div class="panel">
      <header>Source query
        <span class="subtitle">free text — edit opens in $EDITOR (TODO)</span>
      </header>
      <div class="body"><pre class="sql">{{ module.source_query }}</pre></div>
    </div>
    {% if schema_cols or module.dest_description %}
    <div class="panel">
      <header>Schema
        <span class="subtitle">{{ schema_cols|length }} column{{ 's' if schema_cols|length != 1 else '' }}</span>
      </header>
      <div class="body tight">
        {% if module.dest_description %}
          <p style="margin:0 0 0.6rem 0">{{ module.dest_description }}</p>
        {% endif %}
        {% if schema_cols %}
        <table class="grid">
          <thead>
            <tr>
              <th>source</th>
              <th>dest</th>
              <th>type</th>
              <th>description</th>
            </tr>
          </thead>
          <tbody>
            {% for c in schema_cols %}
            <tr>
              <td class="mono">{{ c.source_name }}</td>
              <td class="mono">{{ c.dest_name }}</td>
              <td class="mono" style="color:var(--text-muted)">{{ c.dest_type }}</td>
              <td>{{ c.description or '' }}</td>
            </tr>
            {% endfor %}
          </tbody>
        </table>
        {% endif %}
      </div>
    </div>
    {% endif %}
    {% if preview %}
    <div class="panel">
      <header>Next resolved source SQL
        <span class="subtitle">watermarks substituted — this is what will run</span>
      </header>
      <div class="body"><pre class="sql">{{ preview.resolved_source_sql }}</pre></div>
    </div>
    <div class="panel">
      <header>Merge SQL
        <span class="subtitle">runs against destination after staging is loaded</span>
      </header>
      <div class="body"><pre class="sql">{{ preview.merge_sql }}</pre></div>
    </div>
    {% else %}
    <div class="panel"><header>Preview</header>
      <div class="body empty">
        {% if preview_error %}
          <span class="pill err">error</span> {{ preview_error }}
        {% else %}
          No preview available.
        {% endif %}
      </div>
    </div>
    {% endif %}
  </div>
  <div>
    <div class="panel">
      <header>Watermarks
        <span class="subtitle">{{ watermarks|length }}</span>
        <span style="margin-left:auto">
          <a class="btn" href="/modules/{{ module.id }}/watermarks/new">+ add</a>
        </span>
      </header>
      <div class="body tight">
        {% if watermarks %}
        <table class="grid">
          <thead><tr><th>name</th><th>resolved</th><th>default</th><th></th></tr></thead>
          <tbody>
            {% for w in watermarks %}
            <tr>
              <td class="mono">{{ w.name }}</td>
              {# Resolved value only exists when the preview succeeded. #}
              <td class="mono">{{ (preview.watermark_values.get(w.name) if preview else '') or '—' }}</td>
              <td class="mono">{{ w.default_value or '—' }}</td>
              <td style="white-space:nowrap">
                <a href="/watermarks/{{ w.id }}/edit">edit</a> ·
                <form class="inline" method="post" action="/watermarks/{{ w.id }}/delete"
                      onsubmit="return confirm('Delete watermark {{ w.name }}?')">
                  <button class="ghost" type="submit" style="padding:0;border:none;color:var(--danger)">delete</button>
                </form>
              </td>
            </tr>
            {% endfor %}
          </tbody>
        </table>
        {% else %}
        <div class="empty">None — this module doesn't use watermarks.</div>
        {% endif %}
      </div>
    </div>
    <div class="panel">
      <header>Hooks
        <span class="subtitle">{{ hooks|length }} post-merge</span>
        <span style="margin-left:auto">
          <a class="btn" href="/modules/{{ module.id }}/hooks/new">+ add</a>
        </span>
      </header>
      <div class="body tight">
        {% if hooks %}
        <table class="grid">
          <thead><tr><th style="width:3em">#</th><th>when</th><th>sql</th><th></th></tr></thead>
          <tbody>
            {% for h in hooks %}
            <tr>
              <td class="mono">{{ h.run_order }}</td>
              <td><span class="pill">{{ h.run_on }}</span></td>
              <td class="mono" style="max-width:22rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap">{{ h.sql }}</td>
              <td style="white-space:nowrap">
                <a href="/hooks/{{ h.id }}/edit">edit</a> ·
                <form class="inline" method="post" action="/hooks/{{ h.id }}/delete"
                      onsubmit="return confirm('Delete hook #{{ h.id }}?')">
                  <button class="ghost" type="submit" style="padding:0;border:none;color:var(--danger)">delete</button>
                </form>
              </td>
            </tr>
            {% endfor %}
          </tbody>
        </table>
        {% else %}
        <div class="empty">No hooks.</div>
        {% endif %}
      </div>
    </div>
    <div class="panel">
      <header>Recent runs
        <span class="subtitle">last {{ recent_runs|length }}</span>
        <span style="margin-left:auto"><a href="/runs?module_id={{ module.id }}">all →</a></span>
      </header>
      <div class="body tight">
        {% if recent_runs %}
        <table class="grid">
          <thead><tr><th>id</th><th>started</th><th>status</th><th>rows</th></tr></thead>
          <tbody>
            {% for r in recent_runs %}
            <tr>
              <td><a href="/runs/{{ r.id }}">#{{ r.id }}</a></td>
              <td class="mono">{{ r.started_at }}</td>
              <td><span class="pill {{ r.status }}">{{ r.status }}</span></td>
              <td class="mono">{{ r.row_count if r.row_count is not none else "—" }}</td>
            </tr>
            {% endfor %}
          </tbody>
        </table>
        {% else %}
        <div class="empty">No runs yet.</div>
        {% endif %}
      </div>
    </div>
  </div>
</div>
{% endblock %}

View File

@ -0,0 +1,71 @@
{% extends "base.html" %}
{% set section = "modules" %}
{# Module index, grouped by source connection. `grouped` is a list of
   (conn_name, driver_label, modules) triples; `total` is the overall count. #}
{% block title %}Modules — Pipekit{% endblock %}
{% block content %}
<div class="panel">
  <header>
    Modules
    <span class="subtitle">{{ total }} total · grouped by source connection</span>
    <span style="margin-left:auto">
      <a class="btn" href="/wizard">New module…</a>
    </span>
  </header>
  <div class="body tight">
    {% if grouped %}
      {% for conn_name, driver_label, modules in grouped %}
      <div class="group-head">{{ conn_name }} <span style="opacity:.7">({{ driver_label }})</span></div>
      <table class="grid">
        <thead>
          <tr>
            <th style="width:30%">name</th>
            <th>strategy</th>
            <th>dest</th>
            <th>last run</th>
            <th style="width:9em">status</th>
            <th style="width:7em">rows</th>
            <th></th>
          </tr>
        </thead>
        <tbody>
          {% for m in modules %}
          <tr>
            <td><a href="/modules/{{ m.id }}"><strong>{{ m.name }}</strong></a></td>
            <td class="mono">{{ m.merge_strategy }}</td>
            <td class="mono">{{ m.dest_table }}</td>
            <td class="mono">{{ m.last_run_at or "—" }}</td>
            <td>
              {# Status precedence: running > disabled > last run status > never ran. #}
              {% if m.running %}
                <span class="pill running">running</span>
              {% elif not m.enabled %}
                <span class="pill disabled">disabled</span>
              {% elif m.last_status %}
                <span class="pill {{ m.last_status }}">{{ m.last_status }}</span>
              {% else %}
                <span class="pill">never ran</span>
              {% endif %}
            </td>
            <td class="mono">{{ m.last_row_count if m.last_row_count is not none else "—" }}</td>
            <td style="text-align:right">
              <form class="inline" method="post" action="/modules/{{ m.id }}/run">
                <button type="submit">Run</button>
              </form>
              <form class="inline" method="post" action="/modules/{{ m.id }}/run">
                <input type="hidden" name="dry_run" value="1">
                <button type="submit" class="ghost">Dry run</button>
              </form>
            </td>
          </tr>
          {% endfor %}
        </tbody>
      </table>
      {% endfor %}
    {% else %}
      <div class="empty">
        No modules yet.<br>
        <a class="btn" href="/wizard" style="margin-top:0.7rem; display:inline-block">Create one</a>
      </div>
    {% endif %}
  </div>
</div>
{% endblock %}

View File

@ -0,0 +1,56 @@
{% extends "base.html" %}
{% set section = "runs" %}
{# Single-run detail: summary panel, then optional panels for resolved SQL,
   merge SQL, jrunner stdout/stderr, and hook log — shown only if present. #}
{% block title %}Run #{{ run.id }} — Pipekit{% endblock %}
{% block content %}
<div class="panel">
  <header>
    Run #{{ run.id }}
    <span class="subtitle">
      <a href="/modules/{{ run.module_id }}">{{ run.module_name }}</a> ·
      started {{ run.started_at }}
    </span>
    <span style="margin-left:auto"><span class="pill {{ run.status }}">{{ run.status }}</span></span>
  </header>
  <div class="body">
    <dl class="keyval">
      <dt>started</dt>  <dd class="mono">{{ run.started_at }}</dd>
      <dt>finished</dt> <dd class="mono">{{ run.finished_at or '—' }}</dd>
      <dt>rows</dt>     <dd class="mono">{{ run.row_count if run.row_count is not none else '—' }}</dd>
      <dt>watermarks</dt><dd class="mono">{{ run.watermark_values_json or '—' }}</dd>
      {% if run.error %}<dt>error</dt><dd class="mono" style="color:var(--danger)">{{ run.error }}</dd>{% endif %}
    </dl>
  </div>
</div>
{% if run.resolved_source_sql %}
<div class="panel">
  <header>Resolved source SQL</header>
  <div class="body"><pre class="sql">{{ run.resolved_source_sql }}</pre></div>
</div>
{% endif %}
{% if run.merge_sql %}
<div class="panel">
  <header>Merge SQL</header>
  <div class="body"><pre class="sql">{{ run.merge_sql }}</pre></div>
</div>
{% endif %}
{% if run.jrunner_stdout or run.jrunner_stderr %}
<div class="panel">
  <header>jrunner output</header>
  <div class="body">
    {% if run.jrunner_stdout %}<div style="color:var(--text-muted)">stdout</div><pre class="sql">{{ run.jrunner_stdout }}</pre>{% endif %}
    {% if run.jrunner_stderr %}<div style="color:var(--text-muted);margin-top:0.6rem">stderr</div><pre class="sql">{{ run.jrunner_stderr }}</pre>{% endif %}
  </div>
</div>
{% endif %}
{% if run.hook_log %}
<div class="panel">
  <header>Hook log</header>
  <div class="body"><pre class="sql">{{ run.hook_log }}</pre></div>
</div>
{% endif %}
{% endblock %}

View File

@ -0,0 +1,50 @@
{% extends "base.html" %}
{% set section = "runs" %}
{# Run history table; `module_filter` (a module row or None) scopes the list
   to one module and adds a clear-filter link. #}
{% block title %}Runs — Pipekit{% endblock %}
{% block content %}
<div class="panel">
  <header>
    Runs
    <span class="subtitle">
      {% if module_filter %}for module {{ module_filter.name }} · {% endif %}
      last {{ runs|length }}
    </span>
    {% if module_filter %}
      <span style="margin-left:auto"><a href="/runs">clear filter</a></span>
    {% endif %}
  </header>
  <div class="body tight">
    {% if runs %}
    <table class="grid">
      <thead>
        <tr>
          <th style="width:5em">id</th>
          <th>module</th>
          <th>started</th>
          <th>finished</th>
          <th style="width:8em">status</th>
          <th style="width:7em">rows</th>
          <th>error</th>
        </tr>
      </thead>
      <tbody>
        {% for r in runs %}
        <tr>
          <td><a href="/runs/{{ r.id }}">#{{ r.id }}</a></td>
          <td><a href="/modules/{{ r.module_id }}">{{ r.module_name }}</a></td>
          <td class="mono">{{ r.started_at }}</td>
          <td class="mono">{{ r.finished_at or '—' }}</td>
          <td><span class="pill {{ r.status }}">{{ r.status }}</span></td>
          <td class="mono">{{ r.row_count if r.row_count is not none else "—" }}</td>
          <td class="mono" style="max-width:22rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap">{{ r.error or '' }}</td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
    {% else %}
    <div class="empty">No runs yet.</div>
    {% endif %}
  </div>
</div>
{% endblock %}

View File

@ -0,0 +1,56 @@
{% extends "base.html" %}
{% set section = "modules" %}
{# Create/edit form for a watermark. `watermark` is None on create. #}
{% block title %}{% if watermark %}Edit watermark{% else %}New watermark{% endif %} — {{ module.name }}{% endblock %}
{% block content %}
<div class="panel">
  <header>
    {% if watermark %}Edit watermark &middot; {{ watermark.name }}{% else %}New watermark for {{ module.name }}{% endif %}
    <span class="subtitle">resolved before each run; value substituted into source_query</span>
    <span style="margin-left:auto"><a href="{{ cancel_url }}">&larr; back to module</a></span>
  </header>
  <div class="body">
    <form method="post" action="{{ form_action }}">
      <label class="field">
        <span>name</span>
        <input type="text" name="name" required
               value="{{ watermark.name if watermark else '' }}">
        {# {% raw %} keeps the literal {name} placeholder from being parsed by Jinja. #}
        <span class="help">referenced in source_query as <code>{% raw %}{name}{% endraw %}</code></span>
      </label>
      <label class="field">
        <span>resolver connection</span>
        <select name="connection_id" required>
          {% for c in connections %}
            <option value="{{ c.id }}"
                    {% if watermark and c.id == watermark.connection_id %}selected{% endif %}>
              {{ c.name }}
            </option>
          {% endfor %}
        </select>
        <span class="help">database the resolver_sql runs against (usually the destination)</span>
      </label>
      <label class="field">
        <span>resolver SQL</span>
        <textarea name="resolver_sql" rows="4" required>{{ watermark.resolver_sql if watermark else '' }}</textarea>
        <span class="help">must return exactly one row with one column, e.g. <code>SELECT MAX(modified_at) FROM rlarp.mytable</code></span>
      </label>
      <label class="field">
        <span>default value</span>
        <input type="text" name="default_value"
               value="{{ watermark.default_value if watermark else '' }}">
        <span class="help">used if resolver returns NULL (first run, empty dest). Leave blank to fail instead.</span>
      </label>
      <div class="actions" style="justify-content:flex-end;margin-top:0.8rem">
        <a class="btn ghost" href="{{ cancel_url }}">cancel</a>
        <button type="submit" class="primary">
          {% if watermark %}save changes{% else %}create watermark{% endif %}
        </button>
      </div>
    </form>
  </div>
</div>
{% endblock %}

View File

@ -0,0 +1,51 @@
{% extends "base.html" %}
{% set section = "modules" %}
{# Wizard step 1: pick a source connection. Submits a GET to /wizard/tables
   with source_connection_id from the selected radio row. #}
{% block title %}New module — step 1{% endblock %}
{% block content %}
{% include "_wizard_steps.html" %}
<div class="panel">
  <header>
    Step 1 — pick a source connection
    <span class="subtitle">the database we'll copy data from</span>
  </header>
  <div class="body tight">
    {% if connections %}
    <form method="get" action="/wizard/tables">
      <table class="grid picker">
        <thead>
          <tr>
            <th class="pick"></th>
            <th>name</th>
            <th>driver</th>
            <th>jdbc_url</th>
          </tr>
        </thead>
        <tbody>
          {% for c in connections %}
          {# Clicking anywhere on the row selects its radio button. #}
          <tr onclick="document.getElementById('conn-{{ c.id }}').checked=true">
            <td class="pick">
              <input type="radio" id="conn-{{ c.id }}" name="source_connection_id"
                     value="{{ c.id }}" {% if loop.first %}checked{% endif %}>
            </td>
            <td>{{ c.name }}</td>
            <td class="mono">{{ c.driver_kind }}</td>
            <td class="mono" style="color:var(--text-muted)">{{ c.jdbc_url }}</td>
          </tr>
          {% endfor %}
        </tbody>
      </table>
      <div class="body" style="display:flex;justify-content:flex-end;gap:0.5rem">
        <a class="btn ghost" href="/">cancel</a>
        <button type="submit" class="primary">next &rarr;</button>
      </div>
    </form>
    {% else %}
    <div class="empty">
      No connections yet. <a href="/connections">Add one</a> to begin.
    </div>
    {% endif %}
  </div>
</div>
{% endblock %}

View File

@ -0,0 +1,164 @@
{% extends "base.html" %}
{% set section = "modules" %}
{% block title %}New module — step 2{% endblock %}
{% block content %}
{% include "_wizard_steps.html" %}
<div class="panel">
<header>
Step 2 — browse tables on {{ connection.name }}
<span class="subtitle">driver: {{ driver_kind }}</span>
<span style="margin-left:auto"><a href="/wizard">&larr; change connection</a></span>
</header>
<div class="body">
<form method="get" action="/wizard/tables">
<input type="hidden" name="source_connection_id" value="{{ connection.id }}">
<input type="hidden" name="browse" value="1">
{% for f in browse_fields %}
<label class="field">
<span>{{ f.label }}{% if f.required %} *{% endif %}</span>
<input type="text" name="{{ f.name }}"
value="{{ qvals.get(f.name, '') }}"
{% if f.required %}required{% endif %}
placeholder="{{ f.default or '' }}">
{% if f.help %}<span class="help">{{ f.help }}</span>{% endif %}
</label>
{% endfor %}
<div class="actions" style="margin-top:0.8rem">
<button type="submit" class="primary">browse &rarr;</button>
</div>
</form>
</div>
</div>
{% if fetch_error %}
<div class="panel">
<header>Browse failed</header>
<div class="body"><pre class="sql" style="color:var(--danger)">{{ fetch_error }}</pre></div>
</div>
{% elif attempted %}
<div class="panel">
<header>
Tables
<span class="subtitle" id="tbl-count">{{ tables|length }} found</span>
{% if tables %}
<span style="margin-left:auto;display:flex;gap:0.5rem;align-items:center">
<input type="text" id="tbl-filter" form="wizard-next-form"
placeholder="filter (regex, case-insensitive)"
autocomplete="off" spellcheck="false"
style="min-width:22rem;font-family:var(--mono);font-size:12px">
<span id="tbl-filter-err" style="color:var(--danger);font-size:12px;display:none">invalid regex — substring match</span>
<button type="submit" class="primary" form="wizard-next-form">next &rarr;</button>
</span>
{% endif %}
</header>
<div class="body tight">
{% if tables %}
<form method="get" action="/wizard/columns" id="wizard-next-form">
<input type="hidden" name="source_connection_id" value="{{ connection.id }}">
{% for name, val in qvals.items() %}
<input type="hidden" name="{{ name }}" value="{{ val }}">
{% endfor %}
<table class="grid picker" id="tbl-grid">
<thead>
<tr>
<th class="pick"></th>
<th style="width:5em">kind</th>
<th>schema</th>
<th>name</th>
<th>qualified</th>
</tr>
</thead>
<tbody>
{% for t in tables %}
<tr data-match="{{ t.schema }}.{{ t.name }}"
onclick="document.getElementById('tbl-{{ loop.index }}').checked=true">
<td class="pick">
<input type="radio" id="tbl-{{ loop.index }}"
name="table" value="{{ t.name }}"
{% if loop.first %}checked{% endif %}
data-schema="{{ t.schema }}">
</td>
<td class="mono">{{ t.kind }}</td>
<td class="mono">{{ t.schema }}</td>
<td class="mono">{{ t.name }}</td>
<td class="mono" style="color:var(--text-muted)">{{ t.full_name }}</td>
</tr>
{% endfor %}
</tbody>
</table>
<input type="hidden" name="table_schema" id="table_schema" value="{{ tables[0].schema if tables else '' }}">
<div class="body" style="display:flex;justify-content:flex-end;gap:0.5rem">
<button type="submit" class="primary">next &rarr;</button>
</div>
</form>
<script>
// Step-2 wizard behavior:
//  1. keep the hidden `table_schema` field in sync with the selected radio,
//  2. live-filter the table rows by regex (falling back to case-insensitive
//     substring matching when the pattern is invalid).
(function () {
    var radios = document.querySelectorAll('input[name="table"]');
    var tschema = document.getElementById('table_schema');
    // Mirror the chosen table's schema into the hidden field posted to step 3.
    radios.forEach(function (r) {
        r.addEventListener('change', function () {
            tschema.value = r.dataset.schema || '';
        });
    });
    var totalRows = {{ tables|length }};
    var input = document.getElementById('tbl-filter');
    var errTag = document.getElementById('tbl-filter-err');
    var countTag = document.getElementById('tbl-count');
    var rows = document.querySelectorAll('#tbl-grid tbody tr');
    function applyFilter() {
        var q = input.value;
        var rx = null;
        errTag.style.display = 'none';
        input.style.borderColor = '';
        if (q) {
            // Invalid regex: leave rx null so the loop below falls back to
            // substring matching, and flag the input so the user knows why.
            try { rx = new RegExp(q, 'i'); }
            catch (e) {
                errTag.style.display = '';
                input.style.borderColor = 'var(--danger)';
            }
        }
        // Hoisted out of the row loop: the substring fallback compares
        // case-insensitively against this once-lowered needle.
        var qLower = q.toLowerCase();
        var visible = 0, firstVisibleRadio = null;
        rows.forEach(function (tr) {
            var s = tr.getAttribute('data-match') || '';
            var show = !q
                || (rx ? rx.test(s) : s.toLowerCase().indexOf(qLower) !== -1);
            tr.style.display = show ? '' : 'none';
            if (show) {
                visible++;
                if (!firstVisibleRadio) firstVisibleRadio = tr.querySelector('input[type="radio"]');
            }
        });
        countTag.textContent = q
            ? visible + ' of ' + totalRows + ' shown'
            : totalRows + ' found';
        // If the current selection is hidden, promote the first visible one
        // and keep the hidden schema field consistent with it.
        var selected = document.querySelector('input[name="table"]:checked');
        if ((!selected || selected.closest('tr').style.display === 'none') && firstVisibleRadio) {
            firstVisibleRadio.checked = true;
            tschema.value = firstVisibleRadio.dataset.schema || '';
        }
    }
    input.addEventListener('input', applyFilter);
    // Focus the filter by default so it's keyboard-first.
    input.focus();
})();
</script>
{% else %}
<div class="empty">No tables matched those qualifiers.</div>
{% endif %}
</div>
</div>
{% elif not required_ok %}
<div class="panel">
<header>Waiting</header>
<div class="body"><div class="empty">Fill in the required qualifier(s) above, then click Browse.</div></div>
</div>
{% endif %}
{% endblock %}

View File

@ -0,0 +1,174 @@
{% extends "base.html" %}
{% set section = "modules" %}
{% block title %}New module — step 3{% endblock %}
{% block content %}
{# Wizard step 3: pick/rename columns for {{ qualified_table }} and configure
   the destination + merge strategy, then POST everything to /wizard/create.
   Jinja {# … #} comments render to nothing, so output bytes are unchanged. #}
{% include "_wizard_steps.html" %}
<div class="panel">
<header>
Step 3 — choose columns &amp; configure merge
<span class="subtitle">{{ qualified_table }}</span>
{# Back-link re-enters step 2 with the same connection + qualifier values. #}
<span style="margin-left:auto"><a href="/wizard/tables?source_connection_id={{ connection.id }}{% for k,v in qvals.items() %}&amp;{{ k }}={{ v }}{% endfor %}&amp;browse=1">&larr; different table</a></span>
</header>
<div class="body">
{% if fetch_error %}
<pre class="sql" style="color:var(--danger)">{{ fetch_error }}</pre>
{% endif %}
</div>
</div>
{# The whole form is suppressed when column introspection failed. #}
{% if not fetch_error %}
<form method="post" action="/wizard/create">
{# Carry step-1/2 selections through as hidden fields. #}
<input type="hidden" name="source_connection_id" value="{{ connection.id }}">
<input type="hidden" name="table" value="{{ table }}">
{% for k, v in qvals.items() %}
<input type="hidden" name="{{ k }}" value="{{ v }}">
{% endfor %}
<div class="two-col">
<div class="panel">
<header>
Columns
<span class="subtitle">{{ columns|length }} total — uncheck to exclude</span>
<span style="margin-left:auto">
<button type="button" class="ghost" onclick="toggleAll(true)">all</button>
<button type="button" class="ghost" onclick="toggleAll(false)">none</button>
</span>
</header>
<div class="body tight">
<table class="grid picker">
<thead>
<tr>
<th class="pick"></th>
<th style="width:3em">#</th>
<th>source name</th>
<th>source type</th>
<th style="width:3em">null?</th>
<th>dest name</th>
<th>dest type</th>
<th>description</th>
</tr>
</thead>
<tbody>
{# One row per introspected column; per-column edits are posted as
   dest_name__/dest_type__/dest_desc__ keyed by the source column name.
   Clicking anywhere in the row (except an input) toggles inclusion. #}
{% for c in columns %}
<tr onclick="var cb=document.getElementById('col-{{ loop.index }}'); if(event.target.tagName!=='INPUT') cb.checked=!cb.checked">
<td class="pick">
<input type="checkbox" id="col-{{ loop.index }}"
class="col-check" name="col" value="{{ c.name }}" checked>
</td>
<td class="mono">{{ c.position }}</td>
<td class="mono">{{ c.name }}</td>
<td class="mono" style="color:var(--text-muted)">{{ c.type_raw }}</td>
<td class="mono">{{ 'Y' if c.nullable else 'N' }}</td>
<td>
<input type="text" class="mono"
name="dest_name__{{ c.name }}"
value="{{ c.default_dest_name }}"
style="width:100%;font-size:12px">
</td>
<td>
<input type="text" class="mono"
name="dest_type__{{ c.name }}"
value="{{ c.default_dest_type }}"
style="width:100%;font-size:12px">
</td>
<td>
<input type="text"
name="dest_desc__{{ c.name }}"
value="{{ c.default_description }}"
style="width:100%;font-size:12px">
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
<div>
<div class="panel">
<header>Module</header>
<div class="body">
<label class="field">
<span>name</span>
<input type="text" name="module_name" required
value="{{ default_module_name }}">
<span class="help">used in the URL and as the default staging table name</span>
</label>
</div>
</div>
<div class="panel">
<header>Destination</header>
<div class="body">
<label class="field">
<span>connection</span>
<select name="dest_connection_id" required>
{% for c in all_connections %}
<option value="{{ c.id }}"
{% if default_dest_conn_id and c.id == default_dest_conn_id %}selected{% endif %}>
{{ c.name }}
</option>
{% endfor %}
</select>
</label>
<label class="field">
<span>dest table</span>
<input type="text" name="dest_table" required
value="{{ (default_dest_schema + '.' + default_module_name) if default_dest_schema else default_module_name }}">
<span class="help">fully-qualified (schema.table) in the destination DB</span>
</label>
<label class="field">
<span>staging table</span>
<input type="text" name="staging_table"
placeholder="pipekit_staging.{{ default_module_name }}">
<span class="help">optional — defaults to pipekit_staging.&lt;name&gt;</span>
</label>
<label class="field">
<span>table description</span>
<textarea name="dest_description" rows="2"
style="width:100%">{{ table_description }}</textarea>
<span class="help">emitted as COMMENT ON TABLE after CREATE</span>
</label>
</div>
</div>
<div class="panel">
<header>Merge</header>
<div class="body">
<label class="field">
<span>strategy</span>
{# The merge-key field (#mkf) is only shown for the incremental strategy. #}
<select name="merge_strategy" id="merge_strategy"
onchange="document.getElementById('mkf').style.display = this.value==='incremental' ? '' : 'none'">
<option value="full">full (truncate + insert)</option>
<option value="incremental">incremental (delete by key + insert)</option>
<option value="append">append (insert only)</option>
</select>
</label>
<label class="field" id="mkf" style="display:none">
<span>merge key</span>
<input type="text" name="merge_key" placeholder="e.g. id or id,version">
<span class="help">column name(s) used for the DELETE predicate</span>
</label>
</div>
</div>
<div class="panel">
<header>Create</header>
<div class="body" style="display:flex;justify-content:flex-end;gap:0.5rem">
<a class="btn ghost" href="/">cancel</a>
<button type="submit" class="primary">create module</button>
</div>
</div>
</div>
</div>
</form>
<script>
// Check or uncheck every column-inclusion checkbox at once.
function toggleAll(val) {
document.querySelectorAll('.col-check').forEach(function (cb) { cb.checked = val; });
}
</script>
{% endif %}
{% endblock %}

View File

@ -0,0 +1,23 @@
{% extends "base.html" %}
{% set section = "modules" %}
{% block title %}New module — Pipekit{% endblock %}
{% block content %}
{# Placeholder page: per its own text the interactive wizard is "the next
   increment", so this only shows a curl recipe for seeding a module through
   the JSON API. `port` is interpolated into the example URL. #}
<div class="panel">
<header>New module — wizard</header>
<div class="body">
<p>The wizard (pick connection → pick table → confirm columns → generate module) is the next increment.</p>
<p>Today you can seed a module via the JSON API:</p>
<pre class="sql">curl -X POST http://localhost:{{ port }}/api/modules \
-H 'Content-Type: application/json' \
-d '{
"name": "my_module",
"source_connection_id": 1,
"dest_connection_id": 2,
"dest_table": "rlarp.my_module",
"source_query": "SELECT * FROM SOMEWHERE",
"merge_strategy": "full"
}'</pre>
</div>
</div>
{% endblock %}

6
requirements.txt Normal file
View File

@ -0,0 +1,6 @@
# Pipekit runtime dependencies.
# Web framework serving the JSON API and HTML frontend (see api app module).
fastapi>=0.115
# ASGI server used to run the FastAPI app.
uvicorn[standard]>=0.30
# Form-data parsing for FastAPI (the wizard templates POST HTML forms).
python-multipart>=0.0.20
# Template engine for the web frontend.
jinja2>=3.1
# YAML support — usage not visible in this chunk; TODO confirm where loaded.
pyyaml>=6.0
# HTTP client — usage not visible in this chunk; TODO confirm (tests/engine?).
httpx>=0.27