ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_NamesNormQC.py
Revision: 967
Committed: Thu Jan 8 21:27:51 2026 UTC (2 months, 2 weeks ago) by nino.borges
Content type: text/x-python
File size: 46354 byte(s)
Log Message:
First working version of the build_similar_names function.  Some QC was done and it does appear to be working.

File Contents

# User Rev Content
1 nino.borges 834 """
2    
3 nino.borges 847 Amazon_NamesNormQC
4 nino.borges 834
5     Created by:
6     Emanuel Borges
7     11.21.2024
8    
9 nino.borges 847 This Library will assist with the process of performing Names Normalization QC on the Amazon privilege logs.
10 nino.borges 834
11     """
12    
13     import os, uuid, pickle, re
14 nino.borges 847 #import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
15 nino.borges 877 from dataclasses import dataclass, field, fields
16 nino.borges 966 from typing import List, Tuple, Optional, Set, Dict
17     from collections import namedtuple, defaultdict
18 nino.borges 834 from win32com.client import Dispatch
19    
20    
@dataclass
class Person:
    """A single person record from the attorney list.

    Every descriptive attribute is optional and defaults to None, so a record
    can be built from whichever columns are available. A random UUID is
    generated for ``_id`` when none is supplied.
    """

    first_name: Optional[str] = None
    last_name: Optional[str] = None
    alt_first_names: Optional[str] = None
    alt_surnames: Optional[str] = None
    middle_initial: Optional[str] = None
    work_email_address: Optional[str] = None
    alt_work_email_address: Optional[str] = None
    _id: uuid.UUID = field(default_factory=uuid.uuid4)
    is_attorney: Optional[str] = None
    split_role_date_range: Optional[str] = None
    sidley_validated: Optional[str] = None
    category: Optional[str] = None
    organization: Optional[str] = None
    job_title: Optional[str] = None
    business_title: Optional[str] = None
    full_name_preferred: Optional[str] = None
    login: Optional[str] = None
    department_fine: Optional[str] = None
    addressed_during_caag: Optional[str] = None
    #last_updated: Optional[str] = None
    full_name_overide: Optional[str] = None
    ## Only gather unique_attorney_row_number from the attorney and split role attorney tabs. NEVER from downgrades.
    unique_attorney_row_number: Optional[str] = None
    ## Saved as a list of tuple pairs (startdate, enddate). None is allowed for now; this may later be
    ## changed to an empty list created through a default_factory.
    dates_as_counsel: Optional[List[Tuple[str, str]]] = None

    name_variants: Set[str] = field(default_factory=set, repr=False)
    ## Other person ids (the ``_id`` values) that share at least one name variant with this person.
    similar_names: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Strip and uppercase the populated text fields listed below.

        Fields not listed here (full_name_overide, unique_attorney_row_number,
        dates_as_counsel, ...) are stored exactly as supplied.
        """
        normalized_field_names = (
            "first_name",
            "last_name",
            "alt_first_names",
            "alt_surnames",
            "middle_initial",
            "work_email_address",
            "alt_work_email_address",
            "is_attorney",
            "split_role_date_range",
            "sidley_validated",
            "category",
            "organization",
            "job_title",
            "business_title",
            "full_name_preferred",
            "login",
            "department_fine",
            "addressed_during_caag",
        )
        for field_name in normalized_field_names:
            raw_value = getattr(self, field_name)
            # Empty strings and None are left untouched.
            if raw_value:
                setattr(self, field_name, raw_value.strip().upper())
93 nino.borges 834
@dataclass
class PeopleList:
    """Collection of Person records with lookup, search-string and name-collision helpers."""

    people: List[Person] = field(default_factory=list)
    ## International email domains. Because the object is pickled this is kept as an instance
    ## attribute (a dataclass field) rather than a class attribute.
    internationalEmailDomainsSet: Set[str] = field(default_factory=set)

    def add_person(self, person: Person):
        """Append person to the list."""
        self.people.append(person)

    def search_by_email(self, emailAddress: str) -> Optional[Person]:
        """Return the first person whose work or alt work email equals emailAddress, else None.

        Assumes email addresses are unique.
        """
        for person in self.people:
            if person.work_email_address == emailAddress:
                return person
            if person.alt_work_email_address == emailAddress:
                return person
        return None

    def search_by_unique_attorney_row_number(self, uniqueAttorneyRowNumber: str) -> Optional[Person]:
        """Return the first person with a matching unique_attorney_row_number, else None.

        Assumes row numbers are unique.
        """
        for person in self.people:
            if person.unique_attorney_row_number == uniqueAttorneyRowNumber:
                return person
        return None

    def search_by_id(self, idNumber):
        """Return the first person whose _id equals idNumber, else None.

        idNumber must be a UUID object, e.g. UUID('7414f78c-8289-4c9f-bd49-a5aaac35545f').
        """
        for person in self.people:
            if person._id == idNumber:
                return person
        return None

    def return_list_of_matching_values(self, fieldName, value: str):
        """Return every person whose attribute fieldName equals value."""
        return [person for person in self.people if getattr(person, fieldName) == value]

    def return_list_of_partial_email_matches(self, emailAddress: str) -> List[Person]:
        """Return every person whose work or alt email has the same user name part as emailAddress.

        The user name part is everything before the first "@". Each person is
        returned at most once, and both addresses are always considered.
        """
        emailAddressPart = emailAddress.split("@")[0]
        matchingPeopleList = []
        for person in self.people:
            for candidate in (person.work_email_address, person.alt_work_email_address):
                if candidate and candidate.split("@")[0] == emailAddressPart:
                    matchingPeopleList.append(person)
                    break
        return matchingPeopleList

    def return_soppy_search_list(self, fieldName, value: str):
        """Sloppy search: return every person where value is contained IN attribute fieldName.

        Be very careful using this, because it can return a lot of false positives.
        """
        matchingPeopleList = []
        for person in self.people:
            fieldValue = getattr(person, fieldName)
            if fieldValue is not None and value in fieldValue:
                matchingPeopleList.append(person)
        return matchingPeopleList

    def list_people(self):
        """Print every person in the list."""
        for person in self.people:
            print(person)

    def update_full_Name_overide(self, emailAddress: str, fullNameOverideValue) -> None:
        """Set full_name_overide (uppercased) on every person whose work email matches emailAddress.

        Prints a warning when the last name parsed from the override differs
        from the person's last_name, and when no person matched at all.
        """
        targetAddress = emailAddress.upper()
        valueUpdated = False
        for person in self.people:
            if person.work_email_address != targetAddress:
                continue
            person.full_name_overide = fullNameOverideValue.upper()
            valueUpdated = True
            ## Give a quick warning as the override value is added if the last name differs.
            if "," in fullNameOverideValue:
                lastName = fullNameOverideValue.split(",")[0]
            else:
                lastName = fullNameOverideValue.split(" ")[-1]
            if lastName.upper() != person.last_name:
                print(f"WARNING: Overide last name value {lastName.upper()} does not match {person.last_name}.")
        if not valueUpdated:
            print(f"WARNING: No email address match for {emailAddress} found.")

    @staticmethod
    def _assemble_str_text(namesList, emailAddrList, withinPhrase):
        """Join the name terms (with withinPhrase) and email terms (with OR) into one search string.

        Returns "NONE" when there are neither names nor email addresses.
        """
        if namesList:
            namesText = withinPhrase.join(namesList)
            if not emailAddrList:
                return namesText
            emailText = ' OR '.join(emailAddrList)
            if len(emailAddrList) > 1:
                return f"({namesText}) OR ({emailText})"
            return f"({namesText}) OR {emailText}"
        if emailAddrList:
            return ' OR '.join(emailAddrList)
        return "NONE"

    @staticmethod
    def _split_name_values(primary, alternates):
        """Return the primary name followed by the ";\\n"-separated alternates, stripped, without blanks or dupes."""
        candidates = [primary] if primary else []
        if alternates:
            candidates.extend(alternates.split(";\n"))
        cleanedValues = []
        for candidate in candidates:
            cleaned = candidate.strip()
            if cleaned and cleaned not in cleanedValues:
                cleanedValues.append(cleaned)
        return cleanedValues

    def return_str_string(self, person):
        """Return the STR search string for a given person (last W/3 first, OR'd with the email addresses)."""
        namesList = [name for name in (person.last_name, person.first_name) if name]
        emailAddrList = [addr for addr in (person.work_email_address, person.alt_work_email_address) if addr]
        return self._assemble_str_text(namesList, emailAddrList, ' W/3 ')

    def return_str_string2(self, person, includeInternationalDomains = False):
        """Return the STR search string for a given person, including alternate surnames and the preferred full name.

        If includeInternationalDomains is true and one of the person's addresses
        uses a domain from internationalEmailDomainsSet, that address is repeated
        for every international domain. The person object is not modified, and
        terms are emitted in sorted order so the result is reproducible.
        """
        lastNamesSet = set(self._split_name_values(person.last_name, person.alt_surnames))
        firstNamesSet = set()
        if person.first_name:
            firstNamesSet.add(person.first_name)

        preferredName = person.full_name_preferred
        if preferredName:
            ## Normalize on a local copy; the commas inside these markers would break the "Last, First" split.
            preferredName = preferredName.replace('(SHE, HER)', "(SHE HER)")
            preferredName = preferredName.replace(",(LEGAL),", " (LEGAL),")
            if "," in preferredName:
                parsedLast, parsedFirst = preferredName.split(",", 1)
            else:
                ## No comma: treat the final word as the last name and everything before it as the first name.
                parsedFirst, _, parsedLast = preferredName.rpartition(" ")
            if parsedLast.strip():
                lastNamesSet.add(parsedLast.strip())
            if parsedFirst.strip():
                firstNamesSet.add(parsedFirst.strip())

        namesList = []
        for nameSet in (lastNamesSet, firstNamesSet):
            orderedNames = sorted(nameSet)
            if len(orderedNames) > 1:
                namesList.append(f'({" OR ".join(orderedNames)})')
            elif orderedNames:
                namesList.append(orderedNames[0])

        ## The proximity distance is the total number of space-separated words across all first and
        ## last name values; an empty group still counts as one word.
        withinWordCount = 0
        for nameSet in (firstNamesSet, lastNamesSet):
            withinWordCount += sum(len(name.split(" ")) for name in nameSet) if nameSet else 1
        withinPhrase = f' W/{str(withinWordCount)} '

        ## Assemble the email addresses, possibly adding the international domain variations.
        emailAddrSet = {addr for addr in (person.work_email_address, person.alt_work_email_address) if addr}
        if includeInternationalDomains:
            for addr in list(emailAddrSet):
                localPart, atSign, addrDomain = addr.rpartition("@")
                if addrDomain in self.internationalEmailDomainsSet:
                    for intDomain in self.internationalEmailDomainsSet:
                        ## Swap only the domain; the local part is left alone.
                        emailAddrSet.add(f"{localPart}{atSign}{intDomain}")
        emailAddrList = sorted(emailAddrSet)

        strText = self._assemble_str_text(namesList, emailAddrList, withinPhrase)

        ## Only consider the login if it's longer than 4 characters and not already part of a name term.
        ## This cutoff may be raised in the future.
        if person.login and len(person.login) > 4 and person.login not in str(namesList):
            strText = strText + f" OR {person.login.lower()}"
        return strText

    def return_person_all_name_variations(self, person):
        """Return a list of all generated full-name variations for person.

        Variations are built from every (last, first) pairing of the primary
        and alternate names, with and without the middle initial, plus each
        ";\\n"-separated value of full_name_preferred. A missing first or last
        name contributes no pairings.
        """
        lasts = self._split_name_values(person.last_name, person.alt_surnames)
        firsts = self._split_name_values(person.first_name, person.alt_first_names)
        middle = person.middle_initial.replace(".", "").strip() if person.middle_initial else None

        combos = set()  ## Using a set here to avoid dupes.
        for last in lasts:
            for first in firsts:
                ## Some basic combinations
                combos.add(f"{first} {last}")
                combos.add(f"{last} {first}")
                combos.add(f"{last}, {first}")

                ## Include middle initial variations if it exists
                if middle:
                    combos.add(f"{first} {middle} {last}")
                    combos.add(f"{last} {first} {middle}")
                    combos.add(f"{last}, {first} {middle}")
                    combos.add(f"{first} {middle}. {last}")
                    combos.add(f"{last} {first} {middle}.")
                    combos.add(f"{last}, {first} {middle}.")

        if person.full_name_preferred:
            for preferredValue in person.full_name_preferred.split(";\n"):
                if preferredValue.strip():
                    combos.add(preferredValue.strip())
        return sorted(combos)

    def build_similar_names(self, people: List[Person], keep_details: bool = False) -> Optional[Dict[uuid.UUID, Dict[uuid.UUID, Set[str]]]]:
        """Populate name_variants and similar_names on every person in people.

        Two people are "similar" when they share at least one name variant.
        Each person's similar_names becomes the sorted list of the other
        people's _id values.

        Returns None unless keep_details is true, in which case it returns
        {person_id: {other_person_id: {shared variants}}}.
        """
        for p in people:
            p.name_variants = set(self.return_person_all_name_variations(p))

        ## Inverted index: name variant -> ids of everyone who has that variant.
        index: Dict[str, Set[uuid.UUID]] = defaultdict(set)
        for p in people:
            for v in p.name_variants:
                index[v].add(p._id)

        details: Optional[Dict[uuid.UUID, Dict[uuid.UUID, Set[str]]]] = {} if keep_details else None

        for p in people:
            collisions: Set[uuid.UUID] = set()
            if keep_details:
                details.setdefault(p._id, {})
            for v in p.name_variants:
                for other_id in index[v]:
                    if other_id == p._id:
                        continue
                    collisions.add(other_id)
                    if keep_details:
                        details[p._id].setdefault(other_id, set()).add(v)
            p.similar_names = sorted(collisions)

        return details
374    
375 nino.borges 834 class NamesVerification(object):
376     """A class for automating the process of performing QC on the names within the Amazon privilege logs."""
377 nino.borges 966 version = '0.14.0'
378 nino.borges 834
379    
def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName,fullNameOveridesFileName, forceNewPklFile = False, Encoding = 'UTF8'):
    """Initializes the data structures. cleanedDatExportFileName should be the full path to the file.
    Assumes the first row of the data file is the header and first column is DocID.
    Assumes the MAL is a spreadsheet (for now).
    The MAL is cached in a pkl file (same path as the MAL, with a .pkl extension) for performance reasons.
    The pkl is used when it exists, unless forceNewPklFile is set to true; otherwise the MAL is re-ingested
    from the spreadsheet, the full name overrides and similar-name links are applied, and a new pkl is written.
    cleanedDatExportFileName and Encoding are not used by this part of the initializer."""
    pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"

    print("Initializing data structures...")
    ## os.path.exists is only consulted when a rebuild has not been forced.
    usePickle = (not forceNewPklFile) and os.path.exists(pklFileName)
    if usePickle:
        print("Loading MAL structure from pickle file...")
        self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
        print("MAL structure loaded.")
    else:
        if not forceNewPklFile:
            print("Pickle file doesnt exist.")
        print("Creating MAL structure...")
        self.malPeopleList = PeopleList()
        self.__IngestMALSpreadsheet(masterAttorneyListFileName)
        print("MAL structure created.")
        print("Loading full name overide values...")
        self.__LoadFullNameOverideValues(fullNameOveridesFileName)
        print("Full name overide values loaded.")
        print("Analyzing for similar name links...")
        self.malPeopleList.build_similar_names(self.malPeopleList.people)
        print("Similar names links added.")
        print("Creating pickle backup...")
        self.__SaveMalToPkl(pklFileName)
        print("Pickle backup created.")
423    
424     ## self.malPeopleList = PeopleList()
425     ##
426     ## print("Creating MAL structure...")
427     ## self.__IngestMALSpreadsheet(masterAttorneyListFileName)
428     ## print("MAL structure created.")
429     ## print("Creating pickle backup...")
430    
431    
432    
433    
434    
def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
    """Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass.

    The Attorneys and Downgrades tabs each add one Person per row. The Split Role Attorneys tab only
    updates dates_as_counsel on a person already loaded from the Attorneys tab (matched by row number).
    Finally the international email domains are read from column A, rows 2-21, of the _data tab.
    The workbook is closed even when ingestion fails part way through.
    """
    ## There doesn't seem to be a consistent value in the "row" column in the MAL, so these ranges are set here to avoid gap issues.
    ## excelTabParametersList should always be an ordered list because order matters: the split role tab
    ## looks up people created from the attorneys tab.
    excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":11106, "beginColNumber":1, "endColNumber":20},
                              {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":817, "beginColNumber":1, "endColNumber":19},
                              {"tabName":"Split Role Attorneys", "beginRowNumber":2, "endRowNumber":47, "beginColNumber":1, "endColNumber":10}]

    ## Person field -> column header, per tab. The columns are not consistent between tabs
    ## (note the leading space in the Attorneys ' Validated by OC?' header).
    attorneyColumns = {"is_attorney": 'Is Attorney',
                       "split_role_date_range": 'Split Role - Dates as Counsel',
                       "sidley_validated": ' Validated by OC?',
                       "category": 'Category',
                       "organization": 'Organization',
                       "last_name": 'Last Name',
                       "first_name": 'First Name',
                       "alt_first_names": 'Alt First Names',
                       "alt_surnames": 'Alt Surnames',
                       "middle_initial": 'Middle Initial',
                       "work_email_address": 'Work Email',
                       "alt_work_email_address": 'Alt Work Email',
                       "job_title": 'Job Title',
                       "business_title": 'Business Title',
                       "full_name_preferred": 'Full Name (Preferred)',
                       "login": 'Login',
                       "department_fine": 'Department (Fine)',
                       "unique_attorney_row_number": 'Row',
                       "addressed_during_caag": 'Comments'}
    ## Make sure to NOT grab the unique attorney row number from the downgrades tab.
    downgradeColumns = {"is_attorney": 'Is Attorney',
                        "sidley_validated": 'Validated by OC?',
                        "organization": 'Organization',
                        "last_name": 'Last Name',
                        "first_name": 'First Name',
                        "alt_first_names": 'Alt First Names',
                        "alt_surnames": 'Alt Surnames',
                        "middle_initial": 'Middle Initial',
                        "work_email_address": 'Work Email',
                        "alt_work_email_address": 'Alt Work Email',
                        "job_title": 'Job Title',
                        "business_title": 'Business Title',
                        "full_name_preferred": 'Full Name (Preferred)',
                        "login": 'Login',
                        "department_fine": 'Department (Fine)',
                        "addressed_during_caag": 'Addressed during CAAG'}
    personColumnsByTab = {'Attorneys': attorneyColumns, 'Downgrades': downgradeColumns}

    xlApp = Dispatch('Excel.Application')
    xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
    try:
        for excelTab in excelTabParametersList:
            tabName = excelTab['tabName']
            sht = xlBook.Worksheets(tabName)
            print(f"Ingesting sheet {excelTab['tabName']}.")
            ## Row 1 holds the headers; remember which column each header lives in.
            excelFieldPositionMatrix = {}
            for col in range(excelTab['beginColNumber'], excelTab['endColNumber'] + 1):
                excelFieldPositionMatrix[sht.Cells(1, col).Value] = col

            for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] + 1):
                if row == 5000:
                    print("5,000 row mark reached.")
                elif row == 10000:
                    print("10,000 row mark reached.")

                ## TODO: Refactor the excelTabParametersList later. Didn't realize columns were not consistent.
                if tabName in personColumnsByTab:
                    personValues = {}
                    for fieldName, header in personColumnsByTab[tabName].items():
                        personValues[fieldName] = sht.Cells(row, excelFieldPositionMatrix[header]).Value
                    self.malPeopleList.add_person(Person(**personValues))

                elif tabName == 'Split Role Attorneys':
                    unique_attorney_row_number = sht.Cells(row, excelFieldPositionMatrix['Attorney Row']).Value
                    matchedPerson = self.malPeopleList.search_by_unique_attorney_row_number(unique_attorney_row_number)
                    if matchedPerson:
                        datesAsCounselValue = sht.Cells(row, excelFieldPositionMatrix['Dates as Counsel']).Value
                        if not datesAsCounselValue:
                            ## Blank cell: nothing to record for this person.
                            continue
                        ## dates_as_counsel entries should always be a two string value tuple (startdate,enddate).
                        datesAsCounselList = []
                        ## First get rid of any extra data that is on a new line. Date ranges should not be separated by newline.
                        datesAsCounselValue = datesAsCounselValue.split("\n")[0]
                        ## Next split the ranges by semicolon.
                        for dateRange in datesAsCounselValue.split(";"):
                            if not dateRange.strip():
                                ## Tolerate a trailing or doubled semicolon.
                                continue
                            ## Split out the start and end, allowing non-date words (current, present, etc), forced to uppercase.
                            ## Raises ValueError if the range does not contain exactly one hyphen.
                            counselStartDate, counselEndDate = dateRange.split("-")
                            datesAsCounselList.append((counselStartDate.upper().strip(), counselEndDate.upper().strip()))
                        matchedPerson.dates_as_counsel = datesAsCounselList

                elif tabName == '_data':
                    ## This tab contains some additional data tables; it is ingested separately below.
                    pass

                else:
                    print(f"ERROR UNKNOWN TAB! {excelTab['tabName']} HAVE NEEDED TAB NAMES CHANGED?")

        ## Now grab any additional data tables from the _data tab.
        print("Main tabs ingested. Now ingesting additional data tables.")
        sht = xlBook.Worksheets('_data')
        for row in range(2, 22):
            intDomainValue = sht.Cells(row, "A").Value
            if intDomainValue:
                self.malPeopleList.internationalEmailDomainsSet.add(intDomainValue.strip().upper())
    finally:
        xlBook.Close()
547    
def __SaveMalToPkl(self, pklFileName):
    """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
    ## The with-block guarantees the handle is closed even if pickling fails.
    with open(pklFileName, 'wb') as outputFile:
        pickle.dump(self.malPeopleList, outputFile)
553    
def __LoadMalFromPkl(self, pklFileName):
    """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons.

    Returns whatever object was pickled into pklFileName.
    """
    ## NOTE(review): unpickling can execute arbitrary code; only load pkl files this tool created itself.
    with open(pklFileName, 'rb') as contents:
        return pickle.load(contents)
560    
def __LoadFullNameOverideValues(self, fullNameOveridesFileName):
    """Pseudo-private method which will update the MAL people list object with the full name overide values.

    Each non-blank line of the file is expected to be "emailAddress|fullNameOverideValue".
    Raises ValueError if a non-blank line contains no "|" separator.
    """
    with open(fullNameOveridesFileName) as contents:
        for line in contents:
            line = line.replace("\n", "")
            if not line.strip():
                ## Skip blank lines (e.g. a trailing newline at the end of the file).
                continue
            ## Split on the first pipe only, so a pipe inside the name value does not break the unpack.
            emailAddress, fullNameOverideValue = line.split("|", 1)
            self.malPeopleList.update_full_Name_overide(emailAddress, fullNameOverideValue)
569    
def SmartDedupeSet(self, currentSet):
    """Lowercase every value in currentSet and return the results as a new set, so values that differ only by case collapse into one."""
    return {entry.lower() for entry in currentSet}
576    
def RunMalEmailAddressIntegrityCheck(self):
    """This method performs an integrity check on the MAL by analyzing and looking for duplicate email addresses.

    For each person the alt address is examined first, then the work address. An issue is printed when an
    address was already seen in the same field ("is a dupe!") or in the other field
    ("is a dupe of an workAddr." / "is a dupe of an altAddr."). Nothing is returned.
    """
    ## Sets give constant-time membership tests for the addresses seen so far.
    seenWorkAddrs = set()
    seenAltAddrs = set()
    print("Performing MAL email address integrity check...")
    for person in self.malPeopleList.people:
        altAddr = person.alt_work_email_address
        workAddr = person.work_email_address
        if altAddr is not None:
            altAddr = altAddr.strip()
            if altAddr in seenWorkAddrs:
                print(f"ISSUE:{altAddr} is a dupe of an workAddr.")
            if altAddr in seenAltAddrs:
                print(f"ISSUE:{altAddr} is a dupe!")
            else:
                seenAltAddrs.add(altAddr)
        if workAddr is not None:
            workAddr = workAddr.strip()
            if workAddr in seenAltAddrs:
                print(f"ISSUE:{workAddr} is a dupe of an altAddr.")
            if workAddr in seenWorkAddrs:
                print(f"ISSUE:{workAddr} is a dupe!")
            else:
                seenWorkAddrs.add(workAddr)
    print("\nEmail address integrity check complete.\n\n")
602 nino.borges 853
def RunMalEmailOutsideEmailFieldsIntegrityCheck(self):
    """Flag MAL values that contain an "@" but live in a non-email field.

    Every field declared on the Person dataclass is inspected for each
    person in self.malPeopleList.people, apart from the fields named in
    fieldsToExcludeList.  A non-empty value containing "@" is reported on
    standard output together with the field name and the person's
    unique_attorney_row_number.  Nothing is returned.
    """
    ## The test is deliberately simple for now: the mere presence of the @ symbol.
    ## Editable list of fields to leave out of the scan, notably the ones that are expected to hold email addresses.
    fieldsToExcludeList = ['work_email_address', 'alt_work_email_address','_id','dates_as_counsel','unique_attorney_row_number']
    print("Performing MAL email addresses outside of email address fields integrity check...")
    fieldsToSearchList = [
        fieldObject.name
        for fieldObject in fields(Person)
        if fieldObject.name not in fieldsToExcludeList
    ]
    for person in self.malPeopleList.people:
        for fieldName in fieldsToSearchList:
            candidate = getattr(person, fieldName)
            ## Empty / None values are skipped before the "@" test.
            if candidate and "@" in candidate:
                print(f"ISSUE: The email address {candidate} exists in the non-email field {fieldName} for unique row# {person.unique_attorney_row_number}.")
    print("\nEmail addresss outside of email fields integrity check complete.\n\n")
622    
623 nino.borges 853
def RunRowNumberIntegrityCheck(self):
    """Check the hard-coded row numbers of the attorneys in the MAL for blanks and gaps.

    People whose is_attorney value is 'YES' or 'SPLIT ROLE' are gathered via
    self.malPeopleList.return_list_of_matching_values.  For each of them:

    * a missing (None) unique_attorney_row_number produces a WARNING line;
    * every other value is converted with int() and collected.

    The collected numbers are then compared with the full run from the
    smallest to the largest value; any integers absent from that run are
    printed as a set, otherwise a "no gaps" message is printed.  If no row
    numbers were collected at all a WARNING is printed instead.

    Raises ValueError if a row number cannot be converted with int().
    Nothing is returned; all findings go to standard output.
    """
    print("Performing MAL hard-coded row number integrity check...")
    ## The equivalent check for non-attorneys is currently disabled and kept below for reference.
    ## (The list initialisation is shown hoisted out of the loop, matching the active attorney check.)
    ## First let's return all non-attorneys and confirm the hard-coded row number is in the 50000 range and look for gaps.
    ## nonAttorneyPeopleList = self.malPeopleList.return_list_of_matching_values('is_attorney','NO')
    ## print(f"Analyzing all {len(nonAttorneyPeopleList)} non-attorneys items...")
    ## ## Gather all non-attorneys and add hc row number to a list, looking for any that are missing a value
    ## hcRowNumberList = []
    ## for nonAttorneyPerson in nonAttorneyPeopleList:
    ##     hcRowNumber = nonAttorneyPerson.unique_attorney_row_number
    ##     if hcRowNumber == None:
    ##         print(f"WARNING: Empty hard coded row number for {nonAttorneyPerson.first_name} {nonAttorneyPerson.last_name} in the Downgrades Tab.")
    ##     else:
    ##         hcRowNumberList.append(int(hcRowNumber))
    ## ## Next export a list of the missing numbers
    ## hcRowNumberList.sort()
    ## compareSet = set(range(hcRowNumberList[0], hcRowNumberList[-1]))
    ## downgradeDiffs = compareSet - set(hcRowNumberList)
    ## print(downgradeDiffs)

    ## Attorneys, including split role.
    attorneyPeopleList = self.malPeopleList.return_list_of_matching_values('is_attorney','YES')
    splitRolePeopleList = self.malPeopleList.return_list_of_matching_values('is_attorney','SPLIT ROLE')
    ## Build a third list (PEP 448 unpacking) so neither returned list is modified.
    fullAttorneyPeopleList = [*attorneyPeopleList,*splitRolePeopleList]
    print(f"Analyzing all {len(fullAttorneyPeopleList)} attorneys items...")

    ## The accumulator must live outside the loop: resetting it per person
    ## would leave only the last row number and hide every gap.
    hcRowNumberList = []
    for attorneyPerson in fullAttorneyPeopleList:
        hcRowNumber = attorneyPerson.unique_attorney_row_number
        if hcRowNumber is None:
            print(f"WARNING: Empty hard coded row number for {attorneyPerson.first_name} {attorneyPerson.last_name} in the Attorneys Tab.")
        else:
            hcRowNumberList.append(int(hcRowNumber))

    ## Nothing to compare when there are no attorneys or every row number is blank.
    if not hcRowNumberList:
        print("WARNING: No hard coded row numbers were found in the Attorneys Tab.")
        return

    ## Any integer between the lowest and highest row number that never appears is a gap.
    compareSet = set(range(min(hcRowNumberList), max(hcRowNumberList)))
    attorneyDiffs = compareSet - set(hcRowNumberList)
    if attorneyDiffs:
        print(attorneyDiffs)
    else:
        print("There are no gaps in the hard coded row numbers in the Attorneys tab.")
665 nino.borges 853
def ExportFullSTRList(self, attorneyOnly = True, outputFileName = r"C:\Test_Dir\Amazon\Attorneys_SR_STR_v2.txt"):
    """Write one STR line per person to a UTF-8 text file.

    attorneyOnly   -- when true (the default) only people whose is_attorney
                      value is 'YES' or 'SPLIT ROLE' are exported, attorneys
                      first and split role after them.  When false every
                      entry in self.malPeopleList.people is exported.
    outputFileName -- path of the file to create or overwrite.  Defaults to
                      the location this method has always written to.

    Each line is produced by self.malPeopleList.return_str_string2(person)
    followed by a newline.  Nothing is returned.
    """
    if attorneyOnly:
        ## Build a fresh list so the lists handed back by the people list are never modified in place.
        peopleToExport = [
            *self.malPeopleList.return_list_of_matching_values('is_attorney','YES'),
            *self.malPeopleList.return_list_of_matching_values('is_attorney','SPLIT ROLE'),
        ]
    else:
        peopleToExport = self.malPeopleList.people
    ## The with-block guarantees the file is closed even if building a line fails.
    with open(outputFileName, 'w', encoding='UTF-8') as outputFile:
        for person in peopleToExport:
            outputText = self.malPeopleList.return_str_string2(person)
            outputFile.write(outputText + "\n")
677 nino.borges 853
if __name__ == "__main__":
    ## Running this module directly does nothing at present.
    ## The comment block that follows is a disabled driver script retained for reference.
    pass
680     ## cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\VEAS\VEAS_Log_Data_Export_Converted.txt"
681     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
682     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\TEST-PLOG.txt"
683     ## #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
684     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
685     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
686     ## #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.11.06(7045550.3).xlsx"
687     ## masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.1(7045413.15).xlsx"
688     ## #masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"
689     ## #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides.txt"
690     ## #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides - Copy.txt"
691     ## fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
692     ##
693     ##
694     ##
695     ## nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
696     ## #nv.malPeopleList.list_people()
697     ##
698     ## qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
699     ## print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
700     ## #print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
701     ## workList = qcP.metadataValuesDict.keys()
702     ## outputFile = open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w')
703     ## for docID in workList:
704     ## #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
705     ## #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
706     ## #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['ccValues']
707     ## #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['ccValues']
708     ## #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['bccValues']
709     ## #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['bccValues']
710     ## metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
711     ## formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
712     ## formattedAttorneyValues = set()
713     ## for formattedValue in formattedFieldValues:
714     ## if "*" in formattedValue:
715     ## formattedAttorneyValues.add(formattedValue.upper())
716     ##
717     ## if metadataFieldValues:
718     ## matchedMetadataValues = set()
719     ## for nameItem in metadataFieldValues:
720     ## ## First test to see if there is a valid email address.
721     ## resultSet = set()
722     ## results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
723     ## if results:
724     ## for result in results:
725     ## resultSet.add(result)
726     ## if len(resultSet) >1:
727     ## resultSet = nv.SmartDedupeSet(resultSet)
728     ## if len(resultSet) >1:
729     ## print("ERROR multiple email **unique** email addresses in one item.")
730     ## print(resultSet)
731     ## print("\n")
732     ## else:
733     ## personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
734     ## if personMatch:
735     ## if personMatch.full_name_overide:
736     ## fullName = personMatch.full_name_overide
737     ## elif personMatch.full_name_preferred:
738     ## #print(personMatch.full_name_preferred)
739     ## ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
740     ## fullPreferredName = personMatch.full_name_preferred
741     ## fullPreferredName = fullPreferredName.replace('(LEGAL)','')
742     ## fullPreferredName = fullPreferredName.replace('(SHE, HER)','')
743     ## fullPreferredName = fullPreferredName.replace('(SHE HER)','')
744     ## preferedLastName, preferedFirstName = fullPreferredName.split(',')
745     ## preferedLastName = preferedLastName.strip()
746     ## preferedFirstName = preferedFirstName.strip()
747     ## preferedFirstName = preferedFirstName.split(" ")[0]
748     ## fullName = f"{preferedFirstName} {preferedLastName}"
749     ## #fullName = f"{preferedLastName}, {preferedFirstName}"
750     ## else:
751     ## fullName = f"{personMatch.first_name} {personMatch.last_name}"
752     ## #fullName = f"{personMatch.last_name}, {personMatch.first_name}"
753     ## if personMatch.is_attorney == 'YES':
754     ## #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name}* ({personMatch.work_email_address.split('@')[-1]})\n")
755     ## matchedMetadataValues.add(f"{fullName}* ({personMatch.work_email_address.split('@')[-1]})")
756     ## else:
757     ## #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name} ({personMatch.work_email_address.split('@')[-1]})\n")
758     ## matchedMetadataValues.add(f"{fullName} ({personMatch.work_email_address.split('@')[-1]})")
759     ## else:
760     ## outputFile.write(f"{docID} contains a non-email item {nameItem}\n\n")
761     ## missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
762     ## missingFromMeta = formattedAttorneyValues - matchedMetadataValues
763     ## if missingFromFormatted:
764     ## for missingItem in missingFromFormatted:
765     ## outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
766     ## if missingFromMeta:
767     ## for missingItem in missingFromMeta:
768     ## outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
769     ## if missingFromFormatted:
770     ## outputFile.write("\n")
771     ## elif missingFromMeta:
772     ## outputFile.write("\n")
773     ## outputFile.close()
774 nino.borges 834