pipekit/.archive/pre-rewrite/engine/introspect.py
Orchestration layer around the jrunner Java JDBC CLI, replacing the
previous shell-based sync system in .archive/pre-rewrite. Includes
the FastAPI + Jinja web frontend, per-driver adapters (DB2, MSSQL,
PG), wizard-driven module creation with editable dest types and
source-sourced table/column descriptions, watermark/hook CRUD,
and the engine that runs modules end-to-end.

"""Introspect source systems — browse tables, fetch columns, generate queries and DDL."""
import csv
import io
import os
import re
import subprocess
import tempfile
from dataclasses import dataclass
from config import get_config
from engine.db import get_connection

@dataclass
class RemoteTable:
    schema: str
    name: str
    table_type: str
    linked_server: str | None = None
    linked_db: str | None = None

    @property
    def full_name(self) -> str:
        if self.linked_server:
            return f"[{self.linked_server}].[{self.linked_db}].{self.schema}.{self.name}"
        return f"{self.schema}.{self.name}"

    @property
    def type_label(self) -> str:
        # INFORMATION_SCHEMA types plus the single-letter DB2 for i codes
        # (P = physical file, T = table, L = logical file, V = view).
        mapping = {
            "BASE TABLE": "Table", "VIEW": "View",
            "P": "Table", "L": "View", "T": "Table", "V": "View",
        }
        return mapping.get(self.table_type, self.table_type)

    def to_dict(self) -> dict:
        return {"schema": self.schema, "name": self.name,
                "table_type": self.table_type, "type_label": self.type_label,
                "full_name": self.full_name,
                "linked_server": self.linked_server,
                "linked_db": self.linked_db}


@dataclass
class RemoteColumn:
    name: str
    data_type: str
    position: int
    nullable: bool = True

    def to_dict(self) -> dict:
        return {"name": self.name, "data_type": self.data_type,
                "position": self.position, "nullable": self.nullable}

# ---------------------------------------------------------------------------
# JDBC type to PostgreSQL type mapping
# ---------------------------------------------------------------------------
TYPE_MAP_PG = {
# integers
"int": "integer", "integer": "integer", "smallint": "smallint",
"bigint": "bigint", "tinyint": "smallint",
# floats
"float": "double precision", "real": "real", "double": "double precision",
# decimal
"decimal": "numeric", "numeric": "numeric", "money": "numeric(19,4)",
"smallmoney": "numeric(10,4)",
# strings
"varchar": "text", "char": "text", "nvarchar": "text", "nchar": "text",
"text": "text", "ntext": "text", "character": "text",
    # dates (note: SQL Server's "timestamp" is rowversion, a binary value;
    # this mapping assumes the ANSI datetime meaning)
    "date": "date", "datetime": "timestamp", "datetime2": "timestamp",
    "smalldatetime": "timestamp", "timestamp": "timestamp",
    "timestamptz": "timestamptz",
# boolean
"bit": "boolean",
# binary
"binary": "bytea", "varbinary": "bytea", "image": "bytea",
# uuid
"uniqueidentifier": "uuid",
}

def map_type_pg(source_type: str) -> str:
    """Map a source column type to a PostgreSQL type."""
    base = source_type.lower().split("(")[0].strip()
    return TYPE_MAP_PG.get(base, "text")
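
# Illustrative calls: map_type_pg("VARCHAR(255)") -> "text",
# map_type_pg("DECIMAL(19,4)") -> "numeric"; unrecognized types
# (e.g. "geography") fall back to "text".
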
# ---------------------------------------------------------------------------
# jrunner query helper
# ---------------------------------------------------------------------------
def _resolve_password(password: str) -> str:
"""Resolve a password — if it starts with $, look up the env var."""
if password and password.startswith("$"):
return os.environ.get(password[1:], "")
return password or ""
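
# Example: a stored password of "$PIPEKIT_SRC_PW" (hypothetical variable
# name) resolves to os.environ["PIPEKIT_SRC_PW"]; literal passwords pass
# through unchanged.
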
def run_jrunner_query(connection_id: int, sql: str) -> str:
"""Run a query via jrunner in CSV mode and return raw output."""
conn = get_connection(connection_id)
if not conn:
raise ValueError(f"Connection {connection_id} not found")
cfg = get_config()
jrunner = cfg["jrunner_path"]
password = _resolve_password(conn["password"])
with tempfile.NamedTemporaryFile(mode="w", suffix=".sql", delete=False) as f:
f.write(sql)
sql_path = f.name
try:
result = subprocess.run(
[jrunner,
"-scu", conn["jdbc_url"],
"-scn", conn["username"] or "",
"-scp", password,
"-sq", sql_path,
"-f", "csv"],
capture_output=True, text=True, timeout=60,
)
if result.returncode != 0:
raise RuntimeError(f"jrunner error: {result.stderr or result.stdout}")
return result.stdout
finally:
os.unlink(sql_path)
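
# The subprocess call above is equivalent to running (flag names as used in
# run_jrunner_query; values hypothetical):
#   jrunner -scu "jdbc:..." -scn USER -scp SECRET -sq /tmp/query.sql -f csv
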
def _parse_csv(output: str) -> list[list[str]]:
"""Parse CSV output from jrunner, skipping the header."""
reader = csv.reader(io.StringIO(output))
header = next(reader, None)
if not header:
return []
return [row for row in reader if row]
# ---------------------------------------------------------------------------
# Table browsing
# ---------------------------------------------------------------------------
def _detect_source_type(jdbc_url: str) -> str:
"""Detect source type from JDBC URL."""
url = jdbc_url.lower()
if "as400" in url:
return "as400"
if "sqlserver" in url:
return "sqlserver"
if "postgresql" in url:
return "postgresql"
if "clickhouse" in url:
return "clickhouse"
if "mysql" in url:
return "mysql"
return "unknown"
def fetch_tables(connection_id: int, schema_filter: str | None = None) -> list[RemoteTable]:
    """Fetch the list of tables and views from a source connection.

    Note: schema_filter is interpolated directly into the metadata query,
    so callers must pass trusted values.
    """
    conn = get_connection(connection_id)
    if not conn:
        raise ValueError(f"Connection {connection_id} not found")
    source_type = _detect_source_type(conn["jdbc_url"])
    linked_server = None
    linked_db = None
    local_db = None
if source_type == "as400":
sql = (
"SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
"FROM QSYS2.SYSTABLES "
"WHERE TABLE_SCHEMA NOT LIKE 'Q%' "
)
if schema_filter:
sql += f"AND TABLE_SCHEMA = '{schema_filter}' "
sql += "ORDER BY TABLE_SCHEMA, TABLE_NAME"
elif source_type == "sqlserver":
# Parse schema_filter formats:
# "LINKED.DB" -> linked server + database
# "LINKED.DB.SCHEMA" -> linked server + database + schema
# ".DB" -> database only (no linked server)
# ".DB.SCHEMA" -> database + schema
# "SCHEMA" -> schema only (current database)
        linked_schema = None
if schema_filter and "." in schema_filter:
parts = schema_filter.split(".")
if parts[0] == "":
# Starts with dot: ".DB" or ".DB.SCHEMA"
local_db = parts[1] if len(parts) > 1 else None
linked_schema = parts[2] if len(parts) > 2 else None
elif len(parts) == 2:
linked_server, linked_db = parts
elif len(parts) >= 3:
linked_server, linked_db, linked_schema = parts[0], parts[1], parts[2]
if linked_server:
sql = (
f"SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
f"FROM [{linked_server}].[{linked_db}].INFORMATION_SCHEMA.TABLES "
f"WHERE TABLE_TYPE IN ('BASE TABLE','VIEW') "
)
if linked_schema:
sql += f"AND TABLE_SCHEMA = '{linked_schema}' "
sql += "ORDER BY TABLE_SCHEMA, TABLE_NAME"
elif local_db:
sql = (
f"SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
f"FROM [{local_db}].INFORMATION_SCHEMA.TABLES "
f"WHERE TABLE_TYPE IN ('BASE TABLE','VIEW') "
)
if linked_schema:
sql += f"AND TABLE_SCHEMA = '{linked_schema}' "
sql += "ORDER BY TABLE_SCHEMA, TABLE_NAME"
else:
sql = (
"SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
"FROM INFORMATION_SCHEMA.TABLES "
"WHERE TABLE_TYPE IN ('BASE TABLE','VIEW') "
)
if schema_filter:
sql += f"AND TABLE_SCHEMA = '{schema_filter}' "
sql += "ORDER BY TABLE_SCHEMA, TABLE_NAME"
elif source_type == "postgresql":
sql = (
"SELECT table_schema, table_name, table_type "
"FROM information_schema.tables "
"WHERE table_schema NOT IN ('pg_catalog','information_schema') "
)
if schema_filter:
sql += f"AND table_schema = '{schema_filter}' "
sql += "ORDER BY table_schema, table_name"
elif source_type == "clickhouse":
sql = (
"SELECT database AS TABLE_SCHEMA, name AS TABLE_NAME, engine AS TABLE_TYPE "
"FROM system.tables "
"WHERE database NOT IN ('system','INFORMATION_SCHEMA','information_schema') "
)
if schema_filter:
sql += f"AND database = '{schema_filter}' "
sql += "ORDER BY database, name"
elif source_type == "mysql":
sql = (
"SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
"FROM INFORMATION_SCHEMA.TABLES "
"WHERE TABLE_SCHEMA NOT IN ('mysql','information_schema','performance_schema','sys') "
)
if schema_filter:
sql += f"AND TABLE_SCHEMA = '{schema_filter}' "
sql += "ORDER BY TABLE_SCHEMA, TABLE_NAME"
else:
# Generic fallback — INFORMATION_SCHEMA is widely supported
sql = (
"SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE "
"FROM INFORMATION_SCHEMA.TABLES "
"ORDER BY TABLE_SCHEMA, TABLE_NAME"
)
# For database-only queries, store the db in linked_db so downstream can reference it
effective_db = linked_db if linked_server else (local_db if source_type == "sqlserver" else None)
rows = _parse_csv(run_jrunner_query(connection_id, sql))
return [RemoteTable(schema=r[0].strip(), name=r[1].strip(), table_type=r[2].strip(),
linked_server=linked_server if source_type == "sqlserver" else None,
linked_db=effective_db)
for r in rows if len(r) >= 3]
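
# Example: fetch_tables(3, "APLUS83") might return entries like
# RemoteTable(schema="APLUS83", name="CUSTMAST", table_type="P")
# (connection id and library name hypothetical).
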
def fetch_columns(connection_id: int, schema: str, table: str,
                  linked_server: str | None = None,
                  linked_db: str | None = None) -> list[RemoteColumn]:
"""Fetch column metadata for a specific table."""
conn = get_connection(connection_id)
if not conn:
raise ValueError(f"Connection {connection_id} not found")
source_type = _detect_source_type(conn["jdbc_url"])
if source_type == "as400":
sql = (
f"SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION "
f"FROM QSYS2.SYSCOLUMNS "
f"WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}' "
f"ORDER BY ORDINAL_POSITION"
)
elif source_type == "clickhouse":
sql = (
f"SELECT name, type, position() "
f"FROM system.columns "
f"WHERE database = '{schema}' AND table = '{table}' "
f"ORDER BY position"
)
elif source_type == "sqlserver" and linked_server and linked_db:
sql = (
f"SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION "
f"FROM [{linked_server}].[{linked_db}].INFORMATION_SCHEMA.COLUMNS "
f"WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}' "
f"ORDER BY ORDINAL_POSITION"
)
elif source_type == "sqlserver" and linked_db:
sql = (
f"SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION "
f"FROM [{linked_db}].INFORMATION_SCHEMA.COLUMNS "
f"WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}' "
f"ORDER BY ORDINAL_POSITION"
)
else:
# Works for SQL Server, PostgreSQL, MySQL
sql = (
f"SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION "
f"FROM INFORMATION_SCHEMA.COLUMNS "
f"WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table}' "
f"ORDER BY ORDINAL_POSITION"
)
rows = _parse_csv(run_jrunner_query(connection_id, sql))
return [RemoteColumn(name=r[0].strip(), data_type=r[1].strip(), position=int(r[2].strip()))
for r in rows if len(r) >= 3]
# ---------------------------------------------------------------------------
# Query and DDL generation
# ---------------------------------------------------------------------------
_IDENTIFIER_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')

def _needs_quoting(name: str) -> bool:
    """Check if a column name needs quoting (spaces, special chars, leading digit)."""
    return not _IDENTIFIER_RE.match(name)


def _safe_alias(name: str) -> str:
    """Generate a safe lowercase alias for a column name.

    Replaces special characters with underscores and strips leading/trailing
    underscores. If the result still needs quoting (e.g. it starts with a
    digit), wraps it in double quotes.
    """
    alias = re.sub(r'[^a-z0-9_]', '_', name.lower())
    alias = re.sub(r'_+', '_', alias).strip('_')
    if not alias:
        # Nothing survived sanitizing (a name made entirely of symbols);
        # fall back to quoting the lowercased original rather than emitting "".
        return f'"{name.lower()}"'
    if not _IDENTIFIER_RE.match(alias):
        alias = f'"{alias}"'
    return alias
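
# Examples: _safe_alias("Order ID") -> "order_id", _safe_alias("Qty%") -> "qty",
# _safe_alias("2 Col") -> '"2_col"'.
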
def generate_select(connection_id: int, schema: str, table: str,
                    columns: list[RemoteColumn] | None = None,
                    linked_server: str | None = None,
                    linked_db: str | None = None) -> str:
"""Generate a SELECT query from column metadata."""
if columns is None:
columns = fetch_columns(connection_id, schema, table,
linked_server=linked_server, linked_db=linked_db)
    conn = get_connection(connection_id)
    if not conn:
        raise ValueError(f"Connection {connection_id} not found")
source_type = _detect_source_type(conn["jdbc_url"])
text_types = {"varchar", "char", "nvarchar", "nchar", "character", "text", "ntext"}
lines = ["SELECT"]
    for i, col in enumerate(columns):
        # Leading-comma style; pad the first line so column expressions align.
        prefix = " ," if i > 0 else "  "
alias = _safe_alias(col.name)
# Quote source column name if it contains special characters
# SQL Server uses [brackets], others use "double quotes"
if _needs_quoting(col.name):
if source_type == "sqlserver":
col_ref = f"[{col.name}]"
else:
col_ref = f'"{col.name}"'
else:
col_ref = col.name
base_type = col.data_type.lower().split("(")[0].strip()
# RTRIM text columns for SQL Server and AS/400 (padded char fields)
if base_type in text_types and source_type in ("sqlserver", "as400"):
expr = f"RTRIM({col_ref})"
lines.append(f"{prefix}{expr:<35} AS {alias}")
else:
lines.append(f"{prefix}{col_ref:<35} AS {alias}")
lines.append("FROM")
if linked_server and linked_db:
lines.append(f" [{linked_server}].[{linked_db}].{schema}.{table}")
elif linked_db:
lines.append(f" [{linked_db}].{schema}.{table}")
else:
lines.append(f" {schema}.{table}")
return "\n".join(lines)
def generate_dest_ddl(dest_table: str, columns: list[RemoteColumn]) -> str:
    """Generate CREATE TABLE DDL for the destination (PostgreSQL)."""
    lines = [f"CREATE TABLE IF NOT EXISTS {dest_table} ("]
    col_lines = []
    for col in columns:
        pg_type = map_type_pg(col.data_type)
        col_name = _safe_alias(col.name)
        col_lines.append(f"    {col_name:<30} {pg_type}")
    lines.append(",\n".join(col_lines))
    lines.append(");")
    return "\n".join(lines)
def propose_module(connection_id: int, schema: str, table: str,
                   dest_schema: str | None = None,
                   linked_server: str | None = None,
                   linked_db: str | None = None) -> dict:
    """
    Given a source table, propose a complete module config:
    - source_query (auto-generated SELECT with RTRIM)
    - dest_table
    - dest_ddl (CREATE TABLE for the destination)
    - suggested merge_strategy
    - suggested merge_key (first column)
    - suggested watermark_column (if DEX_ROW_TS or a similar column is found)
    """
columns = fetch_columns(connection_id, schema, table,
linked_server=linked_server, linked_db=linked_db)
source_query = generate_select(connection_id, schema, table, columns,
linked_server=linked_server, linked_db=linked_db)
# Propose destination table name
if dest_schema is None:
dest_schema = "public"
dest_table = f"{dest_schema}.{table.lower()}"
# Generate DDL
dest_ddl = generate_dest_ddl(dest_table, columns)
# Suggest merge strategy based on columns present
col_names_lower = [c.name.lower() for c in columns]
timestamp_col = None
for candidate in ["dex_row_ts", "modified_date", "updated_at", "last_modified",
"modifieddate", "changedate"]:
if candidate in col_names_lower:
timestamp_col = candidate
break
    # Use the sanitized alias so the suggested key matches the destination column name
    merge_key = _safe_alias(columns[0].name) if columns else None
if timestamp_col:
strategy = "incremental"
else:
strategy = "full"
return {
"name": table.lower(),
"source_query": source_query,
"dest_table": dest_table,
"dest_ddl": dest_ddl,
"columns": [c.to_dict() for c in columns],
"merge_strategy": strategy,
"merge_key": merge_key,
"watermark_column": timestamp_col,
}
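

if __name__ == "__main__":
    # Minimal smoke-test sketch; the connection id, schema, and table are
    # hypothetical, and a configured jrunner plus a reachable source is assumed.
    import json
    print(json.dumps(propose_module(1, "dbo", "ORDERS"), indent=2))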