"""
clean_names_xml.py  --  strip junk defined names from an .xlsx via direct XML
surgery (NOT via Excel).

WHY THIS EXISTS
---------------
The "Segment Financials" workbook accumulated ~13k junk defined names (Bloomberg
BLPH*, SAP BEx*, Lotus ___PRN2/__123Graph, etc.). Deleting them with a VBA macro
and then letting EXCEL SAVE repeatedly corrupted the file: Excel's save
garbage-collected dependent parts (first the pivotCacheRecords, then
xl/connections.xml -> orphaned the TB/SalesData query tables). That workbook is a
web of Power Query outputs (TB & SalesData are query tables), connections, the
data model (_xlcn.LinkedTable_*), external links, and pivot caches.

This script edits ONLY xl/workbook.xml's <definedNames> block and copies every
other part byte-for-byte into a new file. Excel never re-saves, so nothing gets
garbage-collected. Result opens clean.

WHAT IT KEEPS
-------------
  * names in KEEP (your real user-defined names)
  * anything starting with _xl  (Excel-reserved: _xlnm.* print/filter,
    _xlcn.* data-model connections, _xlpm.*, _xludf.*, ...)
  * anything starting with ExternalData  (query-table external-data ranges)
  * any name REFERENCED elsewhere in the package (worksheets, tables,
    queryTables, charts, pivotTables, externalLinks, connections) -- so we
    never orphan a feature that points at a name.
Everything else is dropped.

USAGE
-----
  python clean_names_xml.py SRC.xlsx OUT.xlsx [Name1 Name2 ...]
  # if no names given, defaults to the Segment Financials trio below.
Non-destructive: reads SRC, writes a NEW OUT; never touches SRC. Close OUT in
Excel before re-running (Windows file lock).
"""
import zipfile, re, os, sys

DEFAULT_KEEP = {"Report_Date", "Value_Base", "FSPR_Date"}


def clean(src, out, keep):
    zin = zipfile.ZipFile(src, "r")
    wb = zin.read("xl/workbook.xml").decode("utf-8")

    m = re.search(r"<definedNames>.*?</definedNames>", wb, re.S)
    if not m:
        print("No <definedNames> block found - nothing to do.")
        return
    block = m.group(0)
    entries = re.findall(r"<definedName\b[^>]*>.*?</definedName>", block, re.S)
    print("definedName entries found:", len(entries))

    # names REFERENCED anywhere structural (not workbook.xml, not bulk cell text)
    tok = re.compile(r"[A-Za-z_\\][A-Za-z0-9_.\\]*")
    referenced = set()
    SCAN = ("xl/worksheets/", "xl/charts/", "xl/pivotTables/",
            "xl/tables/", "xl/queryTables/", "xl/externalLinks/")
    for nm in zin.namelist():
        if nm.endswith(".xml") and nm != "xl/workbook.xml" \
                and (nm.startswith(SCAN) or "connections" in nm):
            referenced |= set(tok.findall(zin.read(nm).decode("utf-8", "ignore")))

    kept, dropped, kept_ref = [], 0, []
    for e in entries:
        mm = re.search(r'name="([^"]*)"', e)
        name = mm.group(1) if mm else ""
        if (name in keep or name.startswith("_xl")
                or name.startswith("ExternalData") or name in referenced):
            kept.append(e)
            if name not in keep and not name.startswith("_xl"):
                kept_ref.append(name)
        else:
            dropped += 1
    print("keeping:", len(kept), "| dropping:", dropped)
    print("kept because referenced/external:", sorted(set(kept_ref)))

    wb_new = wb[:m.start()] + "<definedNames>" + "".join(kept) + "</definedNames>" + wb[m.end():]

    if os.path.exists(out):
        os.remove(out)
    zout = zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED)
    for item in zin.infolist():
        data = wb_new.encode("utf-8") if item.filename == "xl/workbook.xml" else zin.read(item.filename)
        zi = zipfile.ZipInfo(item.filename, date_time=item.date_time)
        zi.compress_type = item.compress_type
        zi.external_attr = item.external_attr
        zout.writestr(zi, data)
    zout.close()
    zin.close()
    print("WROTE:", out, "| size:", os.path.getsize(out))


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: python clean_names_xml.py SRC.xlsx OUT.xlsx [KeepName ...]")
        sys.exit(1)
    src, out = sys.argv[1], sys.argv[2]
    keep = set(sys.argv[3:]) if len(sys.argv) > 3 else DEFAULT_KEEP
    clean(src, out, keep)