""" clean_names_xml.py -- strip junk defined names from an .xlsx via direct XML surgery (NOT via Excel). WHY THIS EXISTS --------------- The "Segment Financials" workbook accumulated ~13k junk defined names (Bloomberg BLPH*, SAP BEx*, Lotus ___PRN2/__123Graph, etc.). Deleting them with a VBA macro and then letting EXCEL SAVE repeatedly corrupted the file: Excel's save garbage-collected dependent parts (first the pivotCacheRecords, then xl/connections.xml -> orphaned the TB/SalesData query tables). That workbook is a web of Power Query outputs (TB & SalesData are query tables), connections, the data model (_xlcn.LinkedTable_*), external links, and pivot caches. This script edits ONLY xl/workbook.xml's block and copies every other part byte-for-byte into a new file. Excel never re-saves, so nothing gets garbage-collected. Result opens clean. WHAT IT KEEPS ------------- * names in KEEP (your real user-defined names) * anything starting with _xl (Excel-reserved: _xlnm.* print/filter, _xlcn.* data-model connections, _xlpm.*, _xludf.*, ...) * anything starting with ExternalData (query-table external-data ranges) * any name REFERENCED elsewhere in the package (worksheets, tables, queryTables, charts, pivotTables, externalLinks, connections) -- so we never orphan a feature that points at a name. Everything else is dropped. USAGE ----- python clean_names_xml.py SRC.xlsx OUT.xlsx [Name1 Name2 ...] # if no names given, defaults to the Segment Financials trio below. Non-destructive: reads SRC, writes a NEW OUT; never touches SRC. Close OUT in Excel before re-running (Windows file lock). """ import zipfile, re, os, sys DEFAULT_KEEP = {"Report_Date", "Value_Base", "FSPR_Date"} def clean(src, out, keep): zin = zipfile.ZipFile(src, "r") wb = zin.read("xl/workbook.xml").decode("utf-8") m = re.search(r".*?", wb, re.S) if not m: print("No block found - nothing to do.") return block = m.group(0) entries = re.findall(r"]*>.*?", block, re.S) print("definedName entries found:", len(entries)) # names REFERENCED anywhere structural (not workbook.xml, not bulk cell text) tok = re.compile(r"[A-Za-z_\\][A-Za-z0-9_.\\]*") referenced = set() SCAN = ("xl/worksheets/", "xl/charts/", "xl/pivotTables/", "xl/tables/", "xl/queryTables/", "xl/externalLinks/") for nm in zin.namelist(): if nm.endswith(".xml") and nm != "xl/workbook.xml" \ and (nm.startswith(SCAN) or "connections" in nm): referenced |= set(tok.findall(zin.read(nm).decode("utf-8", "ignore"))) kept, dropped, kept_ref = [], 0, [] for e in entries: mm = re.search(r'name="([^"]*)"', e) name = mm.group(1) if mm else "" if (name in keep or name.startswith("_xl") or name.startswith("ExternalData") or name in referenced): kept.append(e) if name not in keep and not name.startswith("_xl"): kept_ref.append(name) else: dropped += 1 print("keeping:", len(kept), "| dropping:", dropped) print("kept because referenced/external:", sorted(set(kept_ref))) wb_new = wb[:m.start()] + "" + "".join(kept) + "" + wb[m.end():] if os.path.exists(out): os.remove(out) zout = zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) for item in zin.infolist(): data = wb_new.encode("utf-8") if item.filename == "xl/workbook.xml" else zin.read(item.filename) zi = zipfile.ZipInfo(item.filename, date_time=item.date_time) zi.compress_type = item.compress_type zi.external_attr = item.external_attr zout.writestr(zi, data) zout.close() zin.close() print("WROTE:", out, "| size:", os.path.getsize(out)) if __name__ == "__main__": if len(sys.argv) < 3: print("usage: python clean_names_xml.py SRC.xlsx OUT.xlsx [KeepName ...]") sys.exit(1) src, out = sys.argv[1], sys.argv[2] keep = set(sys.argv[3:]) if len(sys.argv) > 3 else DEFAULT_KEEP clean(src, out, keep)