ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NS-CategoricalGroupedLogCreator.py
Revision: 971
Committed: Wed Feb 11 22:20:19 2026 UTC (6 weeks, 2 days ago) by nino.borges
Content type: text/x-python
File size: 13323 byte(s)
Log Message:
version .3 adds support for document type to the grouped log.

File Contents

# Content
1 """
2 NS-CategoricalGroupedLogCreator
3
4 Created by:
5 Emanuel Borges
6 02.09.2026
7
8 This program will create the grouped concatenated categorical log where there is a row only for each unique
9 priv description.
10
11 NOTE:
12 - This implementation uses win32com / pywin32 and requires Windows + Microsoft Excel installed.
13 - Also note that I'm using DispatchEx instead of just normal Dispatch for win32.com. This is due to some crashing weirdness that is happening on this worklaptop...
14 - I import pythoncom for some belt-and-suspenders. Mostly not needed.
15 """
16
17 from __future__ import annotations
18
19 from dataclasses import dataclass, field
20 from datetime import date, datetime
21 from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
22
23 import pythoncom
24 from win32com.client import DispatchEx
25
26
27 version = "0.3"
28
29
30 # ----------------------------
31 # My Helpers
32 # ----------------------------
33
34 def col_letter_to_index(col: str) -> int:
35 """
36 Convert an Excel column letter (e.g., 'A', 'Z', 'AA') to a 1-based index.
37 """
38 col = col.strip().upper()
39 if not col.isalpha():
40 raise ValueError(f"Invalid column letter: {col!r}")
41
42 idx = 0
43 for ch in col:
44 idx = idx * 26 + (ord(ch) - ord("A") + 1)
45 return idx
46
47
48 def normalize_key(s: Any) -> str:
49 """
50 Normalize dictionary key input (Privilege Description).
51 Adjust this if you later want casefolding, whitespace collapsing, etc.
52 """
53 if s is None:
54 return ""
55 return str(s).strip()
56
57
58 def excel_value_to_date(v: Any) -> Optional[date]:
59 """
60 Attempt to convert a win32com Excel cell value into a Python date.
61 Common cases:
62 - datetime (pywin32 often gives datetime)
63 - date
64 - string like '2025-01-31' or '1/31/2025'
65 - empty / None
66 """
67 if v in (None, ""):
68 return None
69
70 if isinstance(v, datetime):
71 return v.date()
72 if isinstance(v, date):
73 return v
74
75 ## If it comes back as a string, try a couple common formats.
76 if isinstance(v, str):
77 s = v.strip()
78 if not s:
79 return None
80 for fmt in ("%Y-%m-%d", "%m/%d/%Y", "%m/%d/%y"):
81 try:
82 return datetime.strptime(s, fmt).date()
83 except ValueError:
84 pass
85 ## If we can't parse, just skip (or raise if you prefer strict behavior)
86 return None
87
88 ## Otherwise, unknown type - skip.
89 return None
90
91
92 def split_people(cell_value: Any, delimiter: str = ";") -> List[str]:
93 """
94 Split a participant cell into individual participants.
95 """
96 if cell_value in (None, ""):
97 return []
98 s = str(cell_value).strip()
99 if not s:
100 return []
101 return [p.strip() for p in s.split(delimiter) if p.strip()]
102
103
104 def split_priv_types(cell_value: Any, delimiter: str = ";") -> List[str]:
105 """
106 Split a privilege type cell into individual privilege types (if multi-valued).
107 """
108 if cell_value in (None, ""):
109 return []
110 s = str(cell_value).strip()
111 if not s:
112 return []
113 return [t.strip() for t in s.split(delimiter) if t.strip()]
114
115
116 # ----------------------------
117 # The Data model
118 # ----------------------------
119
120 @dataclass
121 class DocValues:
122 BegDate_val: Optional[date] = None
123 EndDate_val: Optional[date] = None
124 Participants_val: Set[str] = field(default_factory=set)
125 PrivilegeTypes_val: Set[str] = field(default_factory=set)
126 DocumentTypes_val: Set[str] = field(default_factory=set)
127 DocCount_val: int = 0
128
129 def update(
130 self,
131 doc_date: Optional[date],
132 participants: Iterable[str],
133 privilege_types: Iterable[str],
134 document_types: Iterable[str],
135 ) -> None:
136 ## Count
137 self.DocCount_val += 1
138
139 ## Dates
140 if doc_date is not None:
141 if self.BegDate_val is None or doc_date < self.BegDate_val:
142 self.BegDate_val = doc_date
143 if self.EndDate_val is None or doc_date > self.EndDate_val:
144 self.EndDate_val = doc_date
145
146 ## Participants
147 for p in participants:
148 p = str(p).strip()
149 if p:
150 self.Participants_val.add(p)
151
152 ## Privilege Types
153 for t in privilege_types:
154 t = str(t).strip()
155 if t:
156 self.PrivilegeTypes_val.add(t)
157
158
159 ## Document Types
160 for d in document_types:
161 d = str(d).strip()
162 if d:
163 self.DocumentTypes_val.add(d)
164
165
166 # ----------------------------
167 # The Main class
168 # ----------------------------
169
170 class CreateGroupedCategoricalLog:
171 def __init__(
172 self,
173 pathToExcelSpreadsheet: str,
174 privDescriptionColumnLetter: str,
175 docDatesColumnLettersList: List[str],
176 participantsColumnLettersList: List[str],
177 privilegeTypeColumnLetter: str,
178 documentTypeColumnLetter: str,
179 *,
180 sheet_name: Optional[str] = None,
181 header_row: int = 1,
182 data_start_row: int = 2,
183 totalNumberOfRows: Optional[int] = None,
184 participants_delimiter: str = ";",
185 privilege_types_delimiter: str = ";",
186 ) -> None:
187 """
188 privDescriptionColumnLetter, documentTypeColumnLetter, and privilegeTypeColumnLetter should each be a single column letter value.
189
190 docDatesColumnLettersList:
191 list of column letters; program uses the FIRST non-empty date cell among them.
192
193 participantsColumnLettersList:
194 list of all column letters that contain participants; participants are aggregated into a dedup set.
195
196 totalNumberOfRows:
197 optional override; if None, program uses Excel UsedRange to determine last row.
198
199 sheet_name:
200 optional; if None, uses the first worksheet.
201 """
202 self.pathToExcelSpreadsheet = pathToExcelSpreadsheet
203 self.sheet_name = sheet_name
204 self.header_row = header_row
205 self.data_start_row = data_start_row
206 self.totalNumberOfRows = totalNumberOfRows
207
208 self.privDescriptionColumnLetter = privDescriptionColumnLetter
209 self.docDatesColumnLettersList = docDatesColumnLettersList
210 self.participantsColumnLettersList = participantsColumnLettersList
211 self.privilegeTypeColumnLetter = privilegeTypeColumnLetter
212 self.documentTypeColumnLetter = documentTypeColumnLetter
213
214 self.participants_delimiter = participants_delimiter
215 self.privilege_types_delimiter = privilege_types_delimiter
216
217 ## The matrix, keyed by Privilege Description
218 self.catLogMatrix: Dict[str, DocValues] = {}
219
220 ## ---------- Excel COM plumbing ----------
221
222 def _open_excel(self):
223 pythoncom.CoInitialize()
224 xlApp = DispatchEx("Excel.Application")
225 xlApp.Visible = False
226 xlApp.DisplayAlerts = False
227 return xlApp
228
229 def ingest_spreadsheet(self) -> None:
230 """
231 Read the input privilege log spreadsheet and populate self.catLogMatrix.
232 """
233 xlApp = self._open_excel()
234 wb = None
235
236 try:
237 wb = xlApp.Workbooks.Open(self.pathToExcelSpreadsheet)
238
239 if self.sheet_name:
240 ws = wb.Worksheets(self.sheet_name)
241 else:
242 ws = wb.Worksheets(1)
243
244 ## Determine last row
245 if self.totalNumberOfRows is not None:
246 last_row = self.totalNumberOfRows
247 else:
248 ## UsedRange can be imperfect if sheet has formatting far down,
249 ## but is generally fine for privilege logs.
250 last_row = ws.UsedRange.Rows.Count
251
252 priv_col = col_letter_to_index(self.privDescriptionColumnLetter)
253 priv_type_col = col_letter_to_index(self.privilegeTypeColumnLetter)
254 doc_type_col = col_letter_to_index(self.documentTypeColumnLetter)
255 date_cols = [col_letter_to_index(c) for c in self.docDatesColumnLettersList]
256 part_cols = [col_letter_to_index(c) for c in self.participantsColumnLettersList]
257
258 ## Loop rows
259 for row in range(self.data_start_row, last_row + 1):
260 priv_desc_raw = ws.Cells(row, priv_col).Value
261 priv_desc = normalize_key(priv_desc_raw)
262
263 if not priv_desc:
264 ## Skip empty rows
265 continue
266
267 ## Find first non-empty doc date among the configured columns
268 doc_dt: Optional[date] = None
269 for dc in date_cols:
270 v = ws.Cells(row, dc).Value
271 doc_dt = excel_value_to_date(v)
272 if doc_dt is not None:
273 break
274
275 ## Participants across multiple columns
276 participants: List[str] = []
277 for pc in part_cols:
278 participants.extend(
279 split_people(ws.Cells(row, pc).Value, delimiter=self.participants_delimiter)
280 )
281
282 ## Privilege type(s)
283 priv_types = split_priv_types(
284 ws.Cells(row, priv_type_col).Value,
285 delimiter=self.privilege_types_delimiter,
286 )
287
288 ## Update matrix
289 dv = self.catLogMatrix.setdefault(priv_desc, DocValues())
290 dv.update(doc_date=doc_dt, participants=participants, privilege_types=priv_types)
291
292 finally:
293 if wb is not None:
294 wb.Close(SaveChanges=False)
295 xlApp.Quit()
296
297 def export_grouped_log(self, output_path: str) -> None:
298 """
299 Export the grouped results to a new Excel spreadsheet.
300 """
301 xlApp = self._open_excel()
302 wb_out = None
303
304 try:
305 wb_out = xlApp.Workbooks.Add()
306 wb_out.Activate()
307 ws_out = wb_out.Worksheets(1)
308 ws_out.Activate()
309 ws_out.Name = "Grouped Log"
310
311 headers = [
312 "Privilege Description",
313 "BegDate",
314 "EndDate",
315 "Participants (Deduped)",
316 "Privilege Types (Deduped)",
317 "Document Types (Deduped)",
318 "DocCount",
319 ]
320
321 ## Write headers
322 for c, h in enumerate(headers, start=1):
323 ws_out.Cells(1, c).Value = h
324 ws_out.Cells(1, c).Font.Bold = True
325
326 ## Write rows
327 row_out = 2
328 for priv_desc in sorted(self.catLogMatrix.keys()):
329 dv = self.catLogMatrix[priv_desc]
330
331 participants_sorted = sorted(dv.Participants_val)
332 priv_types_sorted = sorted(dv.PrivilegeTypes_val)
333
334 ws_out.Cells(row_out, 1).Value = priv_desc
335 ws_out.Cells(row_out, 2).Value = dv.BegDate_val.isoformat() if dv.BegDate_val else ""
336 ws_out.Cells(row_out, 3).Value = dv.EndDate_val.isoformat() if dv.EndDate_val else ""
337 ws_out.Cells(row_out, 4).Value = "; ".join(participants_sorted)
338 ws_out.Cells(row_out, 5).Value = "; ".join(priv_types_sorted)
339 ws_out.Cells(row_out, 6).Value = dv.DocCount_val
340
341 ## Wrap the long text columns
342 ws_out.Cells(row_out, 1).WrapText = True
343 ws_out.Cells(row_out, 4).WrapText = True
344 ws_out.Cells(row_out, 5).WrapText = True
345
346 row_out += 1
347
348 ## AutoFit
349 ws_out.Columns.AutoFit()
350
351 ## Save
352 wb_out.SaveAs(output_path)
353
354 finally:
355 if wb_out is not None:
356 wb_out.Close(SaveChanges=True)
357 xlApp.Quit()
358
359
360 if __name__ == "__main__":
361 inputSpreadsheetLog = r"C:\Users\eborges\foo.xlsx"
362 outputSpreadsheetLog = r"C:\Users\eborges\bar.xlsx"
363
364 privDescriptionColumnLetter = "O"
365 docDatesColumnLettersList = ["C", "D"] # first non-empty wins!
366 participantsColumnLettersList = ["G", "H", "I", "J", "K", "L"] # Author/From/To/CC/BCC
367 privilegeTypeColumnLetter = "N"
368 documentTypeColumnLetter = "E"
369
370 grouper = CreateGroupedCategoricalLog(
371 pathToExcelSpreadsheet=inputSpreadsheetLog,
372 privDescriptionColumnLetter=privDescriptionColumnLetter,
373 docDatesColumnLettersList=docDatesColumnLettersList,
374 participantsColumnLettersList=participantsColumnLettersList,
375 privilegeTypeColumnLetter=privilegeTypeColumnLetter,
376 documentTypeColumnLetter=documentTypeColumnLetter,
377 sheet_name=None, ## or "Sheet1"
378 header_row=1,
379 data_start_row=2,
380 totalNumberOfRows=None, ## or set an int if you want to override UsedRange
381 participants_delimiter=";",
382 privilege_types_delimiter=";",
383 )
384
385 grouper.ingest_spreadsheet()
386 grouper.export_grouped_log(outputSpreadsheetLog)
387 print(f"Done. Grouped rows: {len(grouper.catLogMatrix)}")