NinoCode/Tool_Box/ExcelSpreadsheetToDict.py

"""

ExcelSpreadsheetToDict

Created by:
Emanuel Borges
09.09.2025

A library for manipulating Excel Spreadsheets.  Created to compare privilege logs.

"""


from typing import Dict, Iterable, Optional, Tuple
from collections import namedtuple
from win32com import client as win32

version = "2.0"

class ExcelToDictIngestor:
    def __init__(self):
        self._excel_app = None  ##  Will be ceated on demand


    @staticmethod
    def _normalize_col_letter(col: str) -> str:
        if not isinstance(col, str) or not col.isalpha():
            raise ValueError(f"Invalid Excel column letter: {col!r}")
        return col.upper()


    @staticmethod
    def _cel_to_string(val) -> str:
        if val is None:
            return ""
        return str(val)

    @staticmethod
    def _normalize_single(val) -> str:
        return ExcelToDictIngestor._cel_to_string(val).strip().upper()

    @staticmethod
    def _normalize_multi(val,delimiter: str) -> list:
        raw = ExcelToDictIngestor._cel_to_string(val)
        parts = [p.strip().upper() for p in raw.split(delimiter) if p]
        ## These next two lines deduplicate the values in the multi value cells but i'm reconsidering this.  If the original had dups, I want the dict to have dups.
        #uniq = sorted({p for p in parts if p})
        #return uniq
        return sorted(parts) 

    def _ensure_app(self, visible: bool):
        if self._excel_app is None:
            self._excel_app = win32.Dispatch("Excel.Application")
        self._excel_app.Visible = bool(visible)


    def _close_app(self):
        if self._excel_app is not None:
            try:
                self._excel_app.Quit()
            finally:
                self._excel_app = None


    def load(self, filepath: str, doc_id_col: str, columns: Iterable[Tuple[str, bool]], start_row: int = 2, end_row: Optional[int] = None, sheet_name: Optional[str] = None, delimiter: str = ";", visible: bool = False,) -> Dict[str, tuple]:
        doc_id_col = self._normalize_col_letter(doc_id_col)
        col_specs = [(self._normalize_col_letter(c), bool(is_multi)) for c, is_multi in columns]
        field_names = [c for c, _ in col_specs]
        RowTuple = namedtuple("Row", field_names)

        data: Dict[str, RowTuple] = {}

        self._ensure_app(visible=visible)
        wb = None
        try:
            wb = self._excel_app.Workbooks.Open(filepath)
            ws = wb.Worksheets(sheet_name) if sheet_name else wb.ActiveSheet

            if end_row is None:
                used = ws.UsedRange
                end_row = used.Row + used.Rows.Count -1

            for row in range(start_row, end_row +1):
                doc_id_val = ws.Range(f"{doc_id_col}{row}").Value
                doc_id = self._normalize_single(doc_id_val)
                if not doc_id:
                    continue
                
                row_values = []
                for col_letter, is_multi in col_specs:
                    cell_val = ws.Range(f"{col_letter}{row}").Value
                    if is_multi:
                        row_values.append(self._normalize_multi(cell_val, delimiter))
                    else:
                        row_values.append(self._normalize_single(cell_val))
                
                data[doc_id] = RowTuple(*row_values)
                
            wb.Close(SaveChanges = False)
            wb = None
        finally:
            if wb is not None:
                try:
                    wb.Close(SaveChanges = False)
                except Exception:
                    pass
            if not visible:
                self._close_app()
                
        return data
Revision:	942
Committed:	Tue Sep 9 17:03:07 2025 UTC (6 months, 2 weeks ago) by nino.borges
Content type:	text/x-python
File size:	3770 byte(s)
Log Message:	Originally _normalize_multi would deduplicate the values in the cell using a set. I changed my mind and changed this to allow duplicates. If the original cell had dups, I want the dictionary cell to have dups.
#	Content
1	"""
2
3	ExcelSpreadsheetToDict
4
5	Created by:
6	Emanuel Borges
7	09.09.2025
8
9	A library for manipulating Excel Spreadsheets. Created to compare privilege logs.
10
11	"""
12
13
14	from typing import Dict, Iterable, Optional, Tuple
15	from collections import namedtuple
16	from win32com import client as win32
17
18	version = "2.0"
19
20	class ExcelToDictIngestor:
21	def __init__(self):
22	self._excel_app = None ## Will be ceated on demand
23
24
25	@staticmethod
26	def _normalize_col_letter(col: str) -> str:
27	if not isinstance(col, str) or not col.isalpha():
28	raise ValueError(f"Invalid Excel column letter: {col!r}")
29	return col.upper()
30
31
32	@staticmethod
33	def _cel_to_string(val) -> str:
34	if val is None:
35	return ""
36	return str(val)
37
38	@staticmethod
39	def _normalize_single(val) -> str:
40	return ExcelToDictIngestor._cel_to_string(val).strip().upper()
41
42	@staticmethod
43	def _normalize_multi(val,delimiter: str) -> list:
44	raw = ExcelToDictIngestor._cel_to_string(val)
45	parts = [p.strip().upper() for p in raw.split(delimiter) if p]
46	## These next two lines deduplicate the values in the multi value cells but i'm reconsidering this. If the original had dups, I want the dict to have dups.
47	#uniq = sorted({p for p in parts if p})
48	#return uniq
49	return sorted(parts)
50
51	def _ensure_app(self, visible: bool):
52	if self._excel_app is None:
53	self._excel_app = win32.Dispatch("Excel.Application")
54	self._excel_app.Visible = bool(visible)
55
56
57	def _close_app(self):
58	if self._excel_app is not None:
59	try:
60	self._excel_app.Quit()
61	finally:
62	self._excel_app = None
63
64
65
66	def load(self, filepath: str, doc_id_col: str, columns: Iterable[Tuple[str, bool]], start_row: int = 2, end_row: Optional[int] = None, sheet_name: Optional[str] = None, delimiter: str = ";", visible: bool = False,) -> Dict[str, tuple]:
67	doc_id_col = self._normalize_col_letter(doc_id_col)
68	col_specs = [(self._normalize_col_letter(c), bool(is_multi)) for c, is_multi in columns]
69	field_names = [c for c, _ in col_specs]
70	RowTuple = namedtuple("Row", field_names)
71
72	data: Dict[str, RowTuple] = {}
73
74	self._ensure_app(visible=visible)
75	wb = None
76	try:
77	wb = self._excel_app.Workbooks.Open(filepath)
78	ws = wb.Worksheets(sheet_name) if sheet_name else wb.ActiveSheet
79
80	if end_row is None:
81	used = ws.UsedRange
82	end_row = used.Row + used.Rows.Count -1
83
84	for row in range(start_row, end_row +1):
85	doc_id_val = ws.Range(f"{doc_id_col}{row}").Value
86	doc_id = self._normalize_single(doc_id_val)
87	if not doc_id:
88	continue
89
90	row_values = []
91	for col_letter, is_multi in col_specs:
92	cell_val = ws.Range(f"{col_letter}{row}").Value
93	if is_multi:
94	row_values.append(self._normalize_multi(cell_val, delimiter))
95	else:
96	row_values.append(self._normalize_single(cell_val))
97
98	data[doc_id] = RowTuple(*row_values)
99
100	wb.Close(SaveChanges = False)
101	wb = None
102	finally:
103	if wb is not None:
104	try:
105	wb.Close(SaveChanges = False)
106	except Exception:
107	pass
108	if not visible:
109	self._close_app()
110
111	return data