- BuildEbitdaBridge.bas: waterfall bridge tab (2026E->2027 AOP) from Slide 13 - clean_names_xml.py: strip junk defined names via direct XML surgery (Excel save corrupts this workbook's query tables/pivot caches) - CleanDefinedNames.bas: skip all _xl* reserved names; copy survivors to clipboard Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
101 lines
4.1 KiB
Python
101 lines
4.1 KiB
Python
"""
|
|
clean_names_xml.py -- strip junk defined names from an .xlsx via direct XML
|
|
surgery (NOT via Excel).
|
|
|
|
WHY THIS EXISTS
|
|
---------------
|
|
The "Segment Financials" workbook accumulated ~13k junk defined names (Bloomberg
|
|
BLPH*, SAP BEx*, Lotus ___PRN2/__123Graph, etc.). Deleting them with a VBA macro
|
|
and then letting EXCEL SAVE repeatedly corrupted the file: Excel's save
|
|
garbage-collected dependent parts (first the pivotCacheRecords, then
|
|
xl/connections.xml -> orphaned the TB/SalesData query tables). That workbook is a
|
|
web of Power Query outputs (TB & SalesData are query tables), connections, the
|
|
data model (_xlcn.LinkedTable_*), external links, and pivot caches.
|
|
|
|
This script edits ONLY xl/workbook.xml's <definedNames> block and copies every
|
|
other part byte-for-byte into a new file. Excel never re-saves, so nothing gets
|
|
garbage-collected. Result opens clean.
|
|
|
|
WHAT IT KEEPS
|
|
-------------
|
|
* names in KEEP (your real user-defined names)
|
|
* anything starting with _xl (Excel-reserved: _xlnm.* print/filter,
|
|
_xlcn.* data-model connections, _xlpm.*, _xludf.*, ...)
|
|
* anything starting with ExternalData (query-table external-data ranges)
|
|
* any name REFERENCED elsewhere in the package (worksheets, tables,
|
|
queryTables, charts, pivotTables, externalLinks, connections) -- so we
|
|
never orphan a feature that points at a name.
|
|
Everything else is dropped.
|
|
|
|
USAGE
|
|
-----
|
|
python clean_names_xml.py SRC.xlsx OUT.xlsx [Name1 Name2 ...]
|
|
# if no names given, defaults to the Segment Financials trio below.
|
|
Non-destructive: reads SRC, writes a NEW OUT; never touches SRC. Close OUT in
|
|
Excel before re-running (Windows file lock).
|
|
"""
|
|
import zipfile, re, os, sys
|
|
|
|
# Defined names kept unconditionally when no names are given on the command line.
DEFAULT_KEEP = {"Report_Date", "Value_Base", "FSPR_Date"}
|
|
|
|
|
|
def clean(src, out, keep):
    """Copy the workbook at *src* to *out*, dropping unneeded defined names.

    Only ``xl/workbook.xml`` is rewritten. Every other package member is
    written to *out* with its original payload, timestamp, compression
    method and external attributes.

    A defined name survives when it is in *keep*, starts with ``_xl`` or
    ``ExternalData``, or appears as a token in any worksheet, chart, pivot
    table, table, query table, external link or connections part.

    Args:
        src: Path of the .xlsx package to read. It is never modified.
        out: Path of the .xlsx package to write; an existing file is replaced.
        keep: Container of defined-name strings to keep unconditionally.

    Returns:
        None. Progress is reported with ``print``. When the workbook has no
        ``<definedNames>`` block, nothing is written.

    Raises:
        ValueError: If *out* refers to the same file as *src*.
    """
    # Removing/truncating OUT when it is SRC would destroy the input before
    # it has been copied, so refuse up front.
    if os.path.exists(out) and os.path.samefile(src, out):
        raise ValueError("OUT must be a different file from SRC: %r" % (out,))

    # The context manager closes the input archive on every exit path,
    # including the early "nothing to do" return and any exception.
    with zipfile.ZipFile(src, "r") as zin:
        wb = zin.read("xl/workbook.xml").decode("utf-8")

        m = re.search(r"<definedNames>.*?</definedNames>", wb, re.S)
        if not m:
            print("No <definedNames> block found - nothing to do.")
            return
        entries = re.findall(
            r"<definedName\b[^>]*>.*?</definedName>", m.group(0), re.S
        )
        print("definedName entries found:", len(entries))

        # Collect every identifier-like token from the parts that can point
        # at a defined name (workbook.xml itself is excluded).
        tok = re.compile(r"[A-Za-z_\\][A-Za-z0-9_.\\]*")
        scan_prefixes = ("xl/worksheets/", "xl/charts/", "xl/pivotTables/",
                         "xl/tables/", "xl/queryTables/", "xl/externalLinks/")
        referenced = set()
        for nm in zin.namelist():
            if (nm.endswith(".xml") and nm != "xl/workbook.xml"
                    and (nm.startswith(scan_prefixes) or "connections" in nm)):
                referenced.update(
                    tok.findall(zin.read(nm).decode("utf-8", "ignore"))
                )

        name_attr = re.compile(r'name="([^"]*)"')
        kept, dropped, kept_ref = [], 0, []
        for e in entries:
            mm = name_attr.search(e)
            name = mm.group(1) if mm else ""
            reserved = name.startswith("_xl")
            if (name in keep or reserved
                    or name.startswith("ExternalData") or name in referenced):
                kept.append(e)
                # Reported separately: survivors that were not asked for.
                if name not in keep and not reserved:
                    kept_ref.append(name)
            else:
                dropped += 1
        print("keeping:", len(kept), "| dropping:", dropped)
        print("kept because referenced/external:", sorted(set(kept_ref)))

        # An empty <definedNames/> element is not schema-valid (the
        # SpreadsheetML schema expects at least one <definedName> child),
        # so drop the whole element when nothing survives.
        if kept:
            new_block = "<definedNames>" + "".join(kept) + "</definedNames>"
        else:
            new_block = ""
        wb_new = wb[:m.start()] + new_block + wb[m.end():]

        if os.path.exists(out):
            os.remove(out)
        with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename == "xl/workbook.xml":
                    data = wb_new.encode("utf-8")
                else:
                    data = zin.read(item.filename)
                # Fresh ZipInfo per member so each part keeps its own
                # timestamp, compression method and attributes.
                zi = zipfile.ZipInfo(item.filename, date_time=item.date_time)
                zi.compress_type = item.compress_type
                zi.external_attr = item.external_attr
                zout.writestr(zi, data)

    print("WROTE:", out, "| size:", os.path.getsize(out))
|
|
|
|
|
|
if __name__ == "__main__":
    # Command line: SRC.xlsx OUT.xlsx followed by zero or more names to keep.
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("usage: python clean_names_xml.py SRC.xlsx OUT.xlsx [KeepName ...]")
        sys.exit(1)
    src, out, *requested = cli_args
    # Fall back to the built-in keep list when no names were supplied.
    keep = set(requested) if requested else DEFAULT_KEEP
    clean(src, out, keep)
|