ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_NamesNormQC.py
Revision: 980
Committed: Fri Feb 20 18:54:49 2026 UTC (5 weeks, 1 day ago) by nino.borges
Content type: text/x-python
File size: 47799 byte(s)
Log Message:
Added support for the similar names exclusions in this version.15

File Contents

# User Rev Content
1 nino.borges 834 """
2    
3 nino.borges 847 Amazon_NamesNormQC
4 nino.borges 834
5     Created by:
6     Emanuel Borges
7     11.21.2024
8    
9 nino.borges 847 This Library will assist with the process of performing Names Normalization QC on the Amazon privilege logs.
10 nino.borges 834
11     """
12    
13     import os, uuid, pickle, re
14 nino.borges 847 #import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
15 nino.borges 877 from dataclasses import dataclass, field, fields
16 nino.borges 966 from typing import List, Tuple, Optional, Set, Dict
17     from collections import namedtuple, defaultdict
18 nino.borges 834 from win32com.client import Dispatch
19    
20    
@dataclass
class Person:
    """One row of the master attorney list (MAL).

    All fields are optional. The free-text fields named in
    ``_UPPERCASE_FIELDS`` are stripped and upper-cased by ``__post_init__``;
    every other field is stored exactly as it was passed in.
    """

    first_name: Optional[str] = None
    last_name: Optional[str] = None
    alt_first_names: Optional[str] = None
    alt_surnames: Optional[str] = None
    middle_initial: Optional[str] = None
    work_email_address: Optional[str] = None
    alt_work_email_address: Optional[str] = None
    ## Generated automatically for every new instance.
    _id: uuid.UUID = field(default_factory=uuid.uuid4)
    is_attorney: Optional[str] = None
    split_role_date_range: Optional[str] = None
    sidley_validated: Optional[str] = None
    category: Optional[str] = None
    organization: Optional[str] = None
    job_title: Optional[str] = None
    business_title: Optional[str] = None
    full_name_preferred: Optional[str] = None
    login: Optional[str] = None
    department_fine: Optional[str] = None
    addressed_during_caag: Optional[str] = None
    full_name_overide: Optional[str] = None
    ## Only gather unique_attorney_row_number from the attorney and split role attorney tabs. NEVER from downgrades.
    unique_attorney_row_number: Optional[str] = None
    ## Saved as a list of tuple pairs (startdate, enddate). None is allowed for now.
    dates_as_counsel: Optional[List[Tuple[str, str]]] = None

    name_variants: Set[str] = field(default_factory=set, repr=False)
    ## List of other person _id values with at least one shared name variant.
    similar_names: List[uuid.UUID] = field(default_factory=list)

    ## Not annotated, so the dataclass machinery does not treat it as a field.
    _UPPERCASE_FIELDS = (
        "first_name",
        "last_name",
        "alt_first_names",
        "alt_surnames",
        "middle_initial",
        "work_email_address",
        "alt_work_email_address",
        "is_attorney",
        "split_role_date_range",
        "sidley_validated",
        "category",
        "organization",
        "job_title",
        "business_title",
        "full_name_preferred",
        "login",
        "department_fine",
        "addressed_during_caag",
    )

    def __post_init__(self):
        """Strip and upper-case every populated field listed in _UPPERCASE_FIELDS."""
        for fieldName in self._UPPERCASE_FIELDS:
            currentValue = getattr(self, fieldName)
            ## None and empty strings are left untouched.
            if currentValue:
                setattr(self, fieldName, currentValue.strip().upper())
93 nino.borges 834
@dataclass
class PeopleList:
    """Collection of Person records with lookup and search-string helpers.

    The international domain set is kept as a dataclass field (an instance
    attribute) rather than a class attribute because instances are pickled.
    """

    people: List["Person"] = field(default_factory=list)
    ## International email domains for Amazon.
    internationalEmailDomainsSet: Set[str] = field(default_factory=set)

    @staticmethod
    def _split_multi_value(rawValue):
        """Split a ';\\n' delimited value into stripped, non-empty parts."""
        if not rawValue:
            return []
        return [part.strip() for part in rawValue.split(";\n") if part.strip()]

    def add_person(self, person: "Person"):
        """Append person to the list."""
        self.people.append(person)

    def search_by_email(self, emailAddress: str) -> Optional["Person"]:
        """Return the first person whose work or alt work email equals emailAddress.

        Assumes email addresses are unique. Returns None when nothing matches
        or when emailAddress is empty/None (so that people without an email
        address are never returned by accident).
        """
        if not emailAddress:
            return None
        for person in self.people:
            if emailAddress in (person.work_email_address, person.alt_work_email_address):
                return person
        return None

    def search_by_unique_attorney_row_number(self, uniqueAttorneyRowNumber: str) -> Optional["Person"]:
        """Return the first person with the given unique attorney row number.

        Assumes the row numbers are unique. A None argument returns None
        instead of matching the first person that has no row number.
        """
        if uniqueAttorneyRowNumber is None:
            return None
        for person in self.people:
            if person.unique_attorney_row_number == uniqueAttorneyRowNumber:
                return person
        return None

    def search_by_id(self, idNumber):
        """Return the first person whose _id equals idNumber, else None.

        idNumber must be in the format UUID('7414f78c-8289-4c9f-bd49-a5aaac35545f').
        """
        for person in self.people:
            if person._id == idNumber:
                return person
        return None

    def return_list_of_matching_values(self, fieldName, value: str):
        """Return every person whose fieldName attribute equals value."""
        return [person for person in self.people if getattr(person, fieldName) == value]

    def return_list_of_partial_email_matches(self, emailAddress: str) -> List["Person"]:
        """Return every person whose work or alt work email has the same user name part.

        Only the text before the first "@" is compared. Each person is
        returned at most once.
        """
        emailAddressPart = emailAddress.split("@")[0]
        matchingPeopleList = []
        for person in self.people:
            candidateAddresses = (person.work_email_address, person.alt_work_email_address)
            if any(addr and addr.split("@")[0] == emailAddressPart for addr in candidateAddresses):
                matchingPeopleList.append(person)
        return matchingPeopleList

    def return_soppy_search_list(self, fieldName, value: str):
        """Sloppy search: return every person where value is IN the fieldName attribute.

        Be very careful in using this because you can get a lot of false positives.
        """
        matchingPeopleList = []
        for person in self.people:
            fieldValue = getattr(person, fieldName)
            if fieldValue is not None and value in fieldValue:
                matchingPeopleList.append(person)
        return matchingPeopleList

    def list_people(self):
        """Print every person in the list."""
        for person in self.people:
            print(person)

    def update_full_Name_overide(self, emailAddress: str, fullNameOverideValue) -> None:
        """Set full_name_overide on every person whose work email matches emailAddress.

        Prints a warning when the last name found in the override value differs
        from the person's last name, and when no person matched at all.
        """
        lookupAddress = emailAddress.strip().upper()
        overideValue = fullNameOverideValue.upper()
        ## "LAST, FIRST" uses the text before the comma; otherwise use the final word.
        if "," in fullNameOverideValue:
            lastName = fullNameOverideValue.split(",")[0]
        else:
            lastName = fullNameOverideValue.strip().split(" ")[-1]
        lastName = lastName.strip().upper()

        valueUpdated = False
        for person in self.people:
            if person.work_email_address == lookupAddress:
                person.full_name_overide = overideValue
                valueUpdated = True
                if lastName != person.last_name:
                    print(f"WARNING: Overide last name value {lastName} does not match {person.last_name}.")
        if not valueUpdated:
            print(f"WARNING: No email address match for {emailAddress} found.")

    def return_str_string(self, person):
        """Return the STR search string for a given person ("NONE" when there is nothing to search on)."""
        namesList = [name for name in (person.last_name, person.first_name) if name]
        emailAddrList = [addr for addr in (person.work_email_address, person.alt_work_email_address) if addr]
        namesText = " W/3 ".join(namesList)
        emailsText = " OR ".join(emailAddrList)
        if namesList and emailAddrList:
            if len(emailAddrList) > 1:
                return f"({namesText}) OR ({emailsText})"
            return f"({namesText}) OR {emailsText}"
        if namesList:
            return namesText
        if emailAddrList:
            return emailsText
        return "NONE"

    def return_str_string2(self, person, includeInternationalDomains = False, includeSimilarNamesExclusion = True):
        """Return the extended STR search string for a given person.

        Names come from the first/last name, the alternate names and the
        preferred full name. If includeInternationalDomains is true, every
        email address whose domain is in internationalEmailDomainsSet is also
        emitted with each of the other international domains. If
        includeSimilarNamesExclusion is true and the person has similar_names,
        a trailing NOT W/2 clause with those people's email addresses is added.
        The person object is not modified.
        """
        firstNamesSet = set()
        lastNamesSet = set()

        if person.last_name:
            lastNamesSet.add(person.last_name)
        lastNamesSet.update(self._split_multi_value(person.alt_surnames))
        if person.first_name:
            firstNamesSet.add(person.first_name)
        firstNamesSet.update(self._split_multi_value(person.alt_first_names))

        if person.full_name_preferred:
            ## Work on a local copy so the stored preferred name is left alone.
            preferredName = person.full_name_preferred.replace('(SHE, HER)', "(SHE HER)")
            preferredName = preferredName.replace(",(LEGAL),", " (LEGAL),")
            if "," in preferredName:
                parsedLast, parsedFirst = preferredName.split(",", 1)
            else:
                nameParts = preferredName.split(" ")
                parsedLast = nameParts[-1]
                parsedFirst = " ".join(nameParts[:-1])
            if parsedLast.strip():
                lastNamesSet.add(parsedLast.strip())
            if parsedFirst.strip():
                firstNamesSet.add(parsedFirst.strip())

        ## Within all name values, replace any open or close parenthesis with a space.
        ## Sorted so the generated string is the same on every run.
        firstNames = sorted({fn.replace("(", " ").replace(")", " ") for fn in firstNamesSet})
        lastNames = sorted({ln.replace("(", " ").replace(")", " ") for ln in lastNamesSet})

        namesList = []
        for nameGroup in (lastNames, firstNames):
            if len(nameGroup) > 1:
                namesList.append(f'({" OR ".join(nameGroup)})')
            elif nameGroup:
                namesList.append(nameGroup[0])

        ## Proximity distance = total number of space separated tokens across all names.
        withinWordCount = sum(len(name.split(" ")) for name in firstNames + lastNames)
        withinPhrase = f' W/{withinWordCount} '

        ## Assembling the email addresses and possibly adding the international domain parts.
        emailAddrSet = {addr for addr in (person.work_email_address, person.alt_work_email_address) if addr}
        if includeInternationalDomains:
            for addr in list(emailAddrSet):
                localPart, atSign, addrDomain = addr.rpartition("@")
                if addrDomain in self.internationalEmailDomainsSet:
                    for intDomain in self.internationalEmailDomainsSet:
                        emailAddrSet.add(f"{localPart}{atSign}{intDomain}")
        emailAddrList = sorted(emailAddrSet)

        namesText = withinPhrase.join(namesList)
        emailsText = " OR ".join(emailAddrList)
        if namesList and emailAddrList:
            if len(emailAddrList) > 1:
                strText = f"({namesText}) OR ({emailsText})"
            else:
                strText = f"({namesText}) OR {emailsText}"
        elif namesList:
            strText = namesText
        elif emailAddrList:
            strText = emailsText
        else:
            strText = "NONE"

        ## Only consider the login if it's longer than 4 characters and not already part of the names.
        if person.login and len(person.login) > 4 and person.login not in str(namesList):
            strText = strText + f" OR {person.login.lower()}"

        if includeSimilarNamesExclusion and person.similar_names:
            similarNamesEmailList = []
            for sID in person.similar_names:
                sPerson = self.search_by_id(sID)
                if sPerson is None:
                    ## The id is not in this list; nothing to exclude for it.
                    continue
                for addr in (sPerson.work_email_address, sPerson.alt_work_email_address):
                    if addr:
                        similarNamesEmailList.append(addr)
            ## An empty NOT clause would be an invalid search, so only add it when there are addresses.
            if similarNamesEmailList:
                strText = f"({strText}) NOT W/2 ({' OR '.join(similarNamesEmailList)})"

        return strText

    def return_person_all_name_variations(self, person):
        """Return a sorted list of full name variations for a person.

        Every combination of (last name + alt surnames) and (first name + alt
        first names) is produced, with middle initial variations when one
        exists, plus every entry of the preferred full name. Missing first or
        last names are skipped rather than rendered as the text "None".
        """
        lasts = []
        if person.last_name:
            lasts.append(person.last_name.strip())
        lasts += self._split_multi_value(person.alt_surnames)

        firsts = []
        if person.first_name:
            firsts.append(person.first_name.strip())
        firsts += self._split_multi_value(person.alt_first_names)

        middle = person.middle_initial.replace(".", "").strip() if person.middle_initial else None

        combos = set()  ## Using a set here to avoid dupes.
        for last in lasts:
            for first in firsts:
                ## Some basic combinations
                combos.add(f"{first} {last}")
                combos.add(f"{last} {first}")
                combos.add(f"{last}, {first}")
                ## Include middle initial variations if it exists
                if middle:
                    combos.add(f"{first} {middle} {last}")
                    combos.add(f"{last} {first} {middle}")
                    combos.add(f"{last}, {first} {middle}")
                    combos.add(f"{first} {middle}. {last}")
                    combos.add(f"{last} {first} {middle}.")
                    combos.add(f"{last}, {first} {middle}.")

        combos.update(self._split_multi_value(person.full_name_preferred))
        return sorted(combos)

    def build_similar_names(self, people: List["Person"], keep_details: bool = False) -> Optional[Dict[uuid.UUID, Dict[uuid.UUID, Set[str]]]]:
        """Populate name_variants and similar_names on every person in people.

        Two people are "similar" when they share at least one name variant.
        Returns None unless keep_details is true, in which case it returns
        {person _id: {other person _id: {shared variants}}}.
        """
        for p in people:
            p.name_variants = self.return_person_all_name_variations(p)

        ## variant text -> ids of every person that produces it
        index = defaultdict(set)
        for p in people:
            for v in p.name_variants:
                index[v].add(p._id)

        details = {} if keep_details else None

        for p in people:
            collisions = set()
            if keep_details:
                details.setdefault(p._id, {})
            for v in p.name_variants:
                for other_id in index[v]:
                    if other_id == p._id:
                        continue
                    collisions.add(other_id)
                    if keep_details:
                        details[p._id].setdefault(other_id, set()).add(v)
            p.similar_names = sorted(collisions)

        return details
400    
401 nino.borges 834 class NamesVerification(object):
402     """A class for automating the process of performing QC on the names within the Amazon privilege logs."""
403 nino.borges 980 version = '0.15.0'
404 nino.borges 834
405    
def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName,fullNameOveridesFileName, forceNewPklFile = False, Encoding = 'UTF8'):
    """Initializes the data structures. cleanedDatExportFileName should be the full path to the file.
    Assumes the first row of the data file is the header and first column is DocID.
    Assumes the MAL is a spreadsheet (for now).
    MAL gets saved to a pkl file (same path as the MAL, with a .pkl extension) for performance reasons.
    The pkl will be used unless forceNewPklFile is set to true or the pkl file does not exist.
    cleanedDatExportFileName and Encoding are accepted but not used within this method."""
    pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"

    print("Initializing data structures...")
    if not forceNewPklFile and os.path.exists(pklFileName):
        print("Loading MAL structure from pickle file...")
        self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
        print("MAL structure loaded.")
    else:
        if not forceNewPklFile:
            print("Pickle file doesnt exist.")
        ## Rebuild from the source files, then refresh the pickle backup.
        print("Creating MAL structure...")
        self.malPeopleList = PeopleList()
        self.__IngestMALSpreadsheet(masterAttorneyListFileName)
        print("MAL structure created.")
        print("Loading full name overide values...")
        self.__LoadFullNameOverideValues(fullNameOveridesFileName)
        print("Full name overide values loaded.")
        print("Analyzing for similar name links...")
        self.malPeopleList.build_similar_names(self.malPeopleList.people)
        print("Similar names links added.")
        print("Creating pickle backup...")
        self.__SaveMalToPkl(pklFileName)
        print("Pickle backup created.")
456    
457    
458    
459    
460    
def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
    """Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass.

    The Attorneys and Downgrades tabs each add one Person per row. The Split
    Role Attorneys tab sets dates_as_counsel on the already ingested person
    with the matching unique attorney row number. Finally the international
    email domains are read from column A, rows 2-21, of the _data tab.
    The workbook is closed even when ingestion fails part way through.
    """
    ## There doesnt seem to be a consistent value in the "row" column in the MAL, so the row/column extents are set here to avoid gap issues.
    ## excelTabParametersList should always be an ordered list because order matters: split roles look up people added by the Attorneys tab.
    excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":11117, "beginColNumber":1, "endColNumber":20},
                              {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":822, "beginColNumber":1, "endColNumber":19},
                              {"tabName":"Split Role Attorneys", "beginRowNumber":2, "endRowNumber":49, "beginColNumber":1, "endColNumber":10}]

    ## Person field name -> spreadsheet header text, per tab. Columns are not consistent between tabs.
    ## Note the Attorneys header ' Validated by OC?' starts with a space.
    attorneyFieldHeaders = {"is_attorney": 'Is Attorney',
                            "split_role_date_range": 'Split Role - Dates as Counsel',
                            "sidley_validated": ' Validated by OC?',
                            "category": 'Category',
                            "organization": 'Organization',
                            "last_name": 'Last Name',
                            "first_name": 'First Name',
                            "alt_first_names": 'Alt First Names',
                            "alt_surnames": 'Alt Surnames',
                            "middle_initial": 'Middle Initial',
                            "work_email_address": 'Work Email',
                            "alt_work_email_address": 'Alt Work Email',
                            "job_title": 'Job Title',
                            "business_title": 'Business Title',
                            "full_name_preferred": 'Full Name (Preferred)',
                            "login": 'Login',
                            "department_fine": 'Department (Fine)',
                            "unique_attorney_row_number": 'Row',
                            "addressed_during_caag": 'Comments'}
    ## Make sure to NOT grab the unique attorney row number from the Downgrades tab.
    downgradeFieldHeaders = {"is_attorney": 'Is Attorney',
                             "sidley_validated": 'Validated by OC?',
                             "organization": 'Organization',
                             "last_name": 'Last Name',
                             "first_name": 'First Name',
                             "alt_first_names": 'Alt First Names',
                             "alt_surnames": 'Alt Surnames',
                             "middle_initial": 'Middle Initial',
                             "work_email_address": 'Work Email',
                             "alt_work_email_address": 'Alt Work Email',
                             "job_title": 'Job Title',
                             "business_title": 'Business Title',
                             "full_name_preferred": 'Full Name (Preferred)',
                             "login": 'Login',
                             "department_fine": 'Department (Fine)',
                             "addressed_during_caag": 'Addressed during CAAG'}
    personTabFieldHeaders = {"Attorneys": attorneyFieldHeaders, "Downgrades": downgradeFieldHeaders}

    xlApp = Dispatch('Excel.Application')
    xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
    try:
        for excelTab in excelTabParametersList:
            tabName = excelTab['tabName']
            sht = xlBook.Worksheets(tabName)
            print(f"Ingesting sheet {tabName}.")
            ## Header text (row 1) -> column number for this tab.
            excelFieldPositionMatrix = {}
            for col in range(excelTab['beginColNumber'], excelTab['endColNumber'] + 1):
                excelFieldPositionMatrix[sht.Cells(1, col).Value] = col

            for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] + 1):
                if row == 5000:
                    print("5,000 row mark reached.")
                elif row == 10000:
                    print("10,000 row mark reached.")

                if tabName in personTabFieldHeaders:
                    cellValues = {fieldName: sht.Cells(row, excelFieldPositionMatrix[header]).Value
                                  for fieldName, header in personTabFieldHeaders[tabName].items()}
                    self.malPeopleList.add_person(Person(**cellValues))

                elif tabName == 'Split Role Attorneys':
                    unique_attorney_row_number = sht.Cells(row, excelFieldPositionMatrix['Attorney Row']).Value
                    if unique_attorney_row_number is None:
                        ## A blank cell must not match people that simply have no row number.
                        continue
                    matchedPerson = self.malPeopleList.search_by_unique_attorney_row_number(unique_attorney_row_number)
                    if not matchedPerson:
                        continue
                    datesAsCounselValue = sht.Cells(row, excelFieldPositionMatrix['Dates as Counsel']).Value
                    if not datesAsCounselValue:
                        continue
                    ## First get rid of any extra data that is on a new line. Note that they shouldnt be separating the date ranges by newline.
                    datesAsCounselValue = datesAsCounselValue.split("\n")[0]
                    ## dates_as_counsel entries should always be a two string value tuple (startdate, enddate).
                    datesAsCounselList = []
                    ## Next split the ranges by semicolon.
                    for dateRange in datesAsCounselValue.split(";"):
                        if not dateRange.strip():
                            continue
                        ## Split out the start and end, allowing non-date words (current, present, etc) but forcing uppercase.
                        rangeParts = dateRange.split("-")
                        if len(rangeParts) != 2:
                            print(f"WARNING: Could not parse dates as counsel value {dateRange!r} on row {row}.")
                            continue
                        datesAsCounselList.append((rangeParts[0].upper().strip(), rangeParts[1].upper().strip()))
                    matchedPerson.dates_as_counsel = datesAsCounselList

                elif tabName == '_data':
                    ## This tab contains additional data tables and is ingested separately below.
                    pass

                else:
                    print(f"ERROR UNKNOWN TAB! {tabName} HAVE NEEDED TAB NAMES CHANGED?")

        ## Now grab any additional data tables from the _data tab.
        print("Main tabs ingested. Now ingesting additional data tables.")
        sht = xlBook.Worksheets('_data')
        for row in range(2, 22):
            intDomainValue = sht.Cells(row, "A").Value
            if intDomainValue:
                self.malPeopleList.internationalEmailDomainsSet.add(intDomainValue.strip().upper())
    finally:
        xlBook.Close()
573    
574     def __SaveMalToPkl(self, pklFileName):
575     """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
576     outputFile = open(pklFileName,'wb')
577     pickle.dump(self.malPeopleList,outputFile)
578     outputFile.close()
579    
580     def __LoadMalFromPkl(self, pklFileName):
581     """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
582     contents = open(pklFileName, 'rb')
583     obj = pickle.load(contents)
584     contents.close()
585     return obj
586    
587 nino.borges 837 def __LoadFullNameOverideValues(self, fullNameOveridesFileName):
588     """Pseudo-private method which will update the MAL people list object with the full name overide values."""
589     contents = open(fullNameOveridesFileName).readlines()
590     for line in contents:
591     line = line.replace("\n","")
592     emailAddress,fullNameOverideValue = line.split("|")
593 nino.borges 844
594 nino.borges 837 self.malPeopleList.update_full_Name_overide(emailAddress, fullNameOverideValue)
595    
def SmartDedupeSet(self, currentSet):
    """Return a new set holding the lower-cased form of every value in currentSet, so values that differ only by case collapse into one."""
    return {entry.lower() for entry in currentSet}
602    
def RunMalEmailAddressIntegrityCheck(self):
    """This method performs an integrity check on the MAL by analyzing and looking for duplicate email addresses.

    Walks ``self.malPeopleList.people`` in order and, for each person, compares the
    stripped alternate and work addresses against the addresses seen so far.  A line
    starting with "ISSUE:" is printed when an address repeats within its own field or
    also appears in the other field.  Nothing is returned; results go to stdout.
    """
    ## Sets give constant-time membership checks; only the addresses themselves are needed.
    seenWorkAddrs = set()
    seenAltAddrs = set()
    print("Performing MAL email address integrity check...")
    for person in self.malPeopleList.people:
        altAddr = person.alt_work_email_address
        workAddr = person.work_email_address
        ## The alternate address is handled first, so a person whose work address
        ## equals their own alternate address is reported by the work-address check.
        if altAddr is not None:
            altAddr = altAddr.strip()
            if altAddr in seenWorkAddrs:
                print(f"ISSUE:{altAddr} is a dupe of an workAddr.")
            if altAddr in seenAltAddrs:
                print(f"ISSUE:{altAddr} is a dupe!")
            else:
                seenAltAddrs.add(altAddr)
        if workAddr is not None:
            workAddr = workAddr.strip()
            if workAddr in seenAltAddrs:
                print(f"ISSUE:{workAddr} is a dupe of an altAddr.")
            if workAddr in seenWorkAddrs:
                print(f"ISSUE:{workAddr} is a dupe!")
            else:
                seenWorkAddrs.add(workAddr)
    print("\nEmail address integrity check complete.\n\n")
628 nino.borges 853
def RunMalEmailOutsideEmailFieldsIntegrityCheck(self):
    """Integrity check on the MAL that reports values containing an "@" in Person fields other than the email address fields.

    Every field declared on the Person dataclass is scanned for each entry in
    ``self.malPeopleList.people``, except the fields named in the skip list below.
    Findings are printed as "ISSUE:" lines; nothing is returned.
    """
    ## Editable skip list: fields left out of the scan, including the ones that are expected to hold email addresses.
    skippedFieldNames = ['work_email_address', 'alt_work_email_address','_id','dates_as_counsel','unique_attorney_row_number']
    print("Performing MAL email addresses outside of email address fields integrity check...")
    namesToScan = [personField.name for personField in fields(Person) if personField.name not in skippedFieldNames]
    for person in self.malPeopleList.people:
        for fieldName in namesToScan:
            testValue = getattr(person, fieldName)
            ## For now the presence of the @ symbol is the whole test; empty values are ignored.
            if testValue and "@" in testValue:
                print(f"ISSUE: The email address {testValue} exists in the non-email field {fieldName} for unique row# {person.unique_attorney_row_number}.")
    print("\nEmail addresss outside of email fields integrity check complete.\n\n")
648    
649 nino.borges 853
def RunRowNumberIntegrityCheck(self):
    """This method performs an integrity check on the hard-coded row numbers of the attorney entries in the MAL.

    People whose ``is_attorney`` value is 'YES' or 'SPLIT ROLE' are gathered via
    ``self.malPeopleList.return_list_of_matching_values``.  A WARNING line is printed
    for each one whose ``unique_attorney_row_number`` is None; the remaining numbers
    are converted with ``int()`` and checked for gaps between the lowest and highest
    value.  The set of missing numbers (or a "no gaps" message) is printed; nothing
    is returned.
    """
    print("Performing MAL hard-coded row number integrity check...")
    ## NOTE: only attorney and split role entries are analyzed here; the same
    ## check for non-attorneys (is_attorney 'NO') is not currently performed.
    attorneyPeopleList = self.malPeopleList.return_list_of_matching_values('is_attorney','YES')
    splitRolePeopleList = self.malPeopleList.return_list_of_matching_values('is_attorney','SPLIT ROLE')
    ## Creating a third list using the newer list joining from pep 448
    fullAttorneyPeopleList = [*attorneyPeopleList, *splitRolePeopleList]
    print(f"Analyzing all {len(fullAttorneyPeopleList)} attorneys items...")
    ## Gather all attorneys and add hc row number to a list, looking for any that are missing a value.
    ## The list must be created once, before the loop, so it accumulates every person's number.
    hcRowNumberList = []
    for attorneyPerson in fullAttorneyPeopleList:
        hcRowNumber = attorneyPerson.unique_attorney_row_number
        if hcRowNumber is None:
            print(f"WARNING: Empty hard coded row number for {attorneyPerson.first_name} {attorneyPerson.last_name} in the Attorneys Tab.")
        else:
            hcRowNumberList.append(int(hcRowNumber))
    if not hcRowNumberList:
        ## Without at least one number there is no range to compare against.
        print("No hard coded row numbers were found in the Attorneys tab, so there is nothing to check for gaps.")
        return
    ## Next export a list of the missing numbers
    hcRowNumberSet = set(hcRowNumberList)
    attorneyDiffs = set(range(min(hcRowNumberSet), max(hcRowNumberSet))) - hcRowNumberSet
    if attorneyDiffs:
        print(attorneyDiffs)
    else:
        print("There are no gaps in the hard coded row numbers in the Attorneys tab.")
691 nino.borges 853
def ExportFullSTRList(self, attorneyOnly = True, outputFileName = r"C:\Test_Dir\Amazon\Attorneys_SR_STR_v3.txt"):
    """Exports a full STR file for entries in the data class, one line per person.

    Args:
        attorneyOnly: When true (the default), only people whose ``is_attorney`` value
            is 'YES' or 'SPLIT ROLE' are exported.  When false, every entry in
            ``self.malPeopleList.people`` is exported.
        outputFileName: Path of the UTF-8 text file to write.  Defaults to the
            location this method has always written to.

    Each line is the value returned by ``self.malPeopleList.return_str_string2`` for
    that person, followed by a newline.
    """
    ## Collect the people before opening the file, so a lookup failure does not
    ## leave a truncated output file behind.
    if attorneyOnly:
        attorneyPeopleList = self.malPeopleList.return_list_of_matching_values('is_attorney','YES')
        splitRolePeopleList = self.malPeopleList.return_list_of_matching_values('is_attorney','SPLIT ROLE')
        ## Build a new list instead of extending the returned one in place.
        peopleToExport = [*attorneyPeopleList, *splitRolePeopleList]
    else:
        peopleToExport = list(self.malPeopleList.people)
    with open(outputFileName, 'w', encoding='UTF-8') as outputFile:
        for person in peopleToExport:
            outputText = self.malPeopleList.return_str_string2(person)
            outputFile.write(outputText + "\n")
703 nino.borges 853
if __name__ == "__main__":
    ## Running this module directly does nothing; the earlier driver script that follows is kept commented out.
    pass
706     ## cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\VEAS\VEAS_Log_Data_Export_Converted.txt"
707     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
708     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\TEST-PLOG.txt"
709     ## #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
710     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
711     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
712     ## #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.11.06(7045550.3).xlsx"
713     ## masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.1(7045413.15).xlsx"
714     ## #masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"
715     ## #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides.txt"
716     ## #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides - Copy.txt"
717     ## fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
718     ##
719     ##
720     ##
721     ## nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
722     ## #nv.malPeopleList.list_people()
723     ##
724     ## qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
725     ## print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
726     ## #print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
727     ## workList = qcP.metadataValuesDict.keys()
728     ## outputFile = open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w')
729     ## for docID in workList:
730     ## #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
731     ## #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
732     ## #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['ccValues']
733     ## #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['ccValues']
734     ## #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['bccValues']
735     ## #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['bccValues']
736     ## metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
737     ## formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
738     ## formattedAttorneyValues = set()
739     ## for formattedValue in formattedFieldValues:
740     ## if "*" in formattedValue:
741     ## formattedAttorneyValues.add(formattedValue.upper())
742     ##
743     ## if metadataFieldValues:
744     ## matchedMetadataValues = set()
745     ## for nameItem in metadataFieldValues:
746     ## ## First test to see if there is a valid email address.
747     ## resultSet = set()
748     ## results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
749     ## if results:
750     ## for result in results:
751     ## resultSet.add(result)
752     ## if len(resultSet) >1:
753     ## resultSet = nv.SmartDedupeSet(resultSet)
754     ## if len(resultSet) >1:
755     ## print("ERROR multiple email **unique** email addresses in one item.")
756     ## print(resultSet)
757     ## print("\n")
758     ## else:
759     ## personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
760     ## if personMatch:
761     ## if personMatch.full_name_overide:
762     ## fullName = personMatch.full_name_overide
763     ## elif personMatch.full_name_preferred:
764     ## #print(personMatch.full_name_preferred)
765     ## ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
766     ## fullPreferredName = personMatch.full_name_preferred
767     ## fullPreferredName = fullPreferredName.replace('(LEGAL)','')
768     ## fullPreferredName = fullPreferredName.replace('(SHE, HER)','')
769     ## fullPreferredName = fullPreferredName.replace('(SHE HER)','')
770     ## preferedLastName, preferedFirstName = fullPreferredName.split(',')
771     ## preferedLastName = preferedLastName.strip()
772     ## preferedFirstName = preferedFirstName.strip()
773     ## preferedFirstName = preferedFirstName.split(" ")[0]
774     ## fullName = f"{preferedFirstName} {preferedLastName}"
775     ## #fullName = f"{preferedLastName}, {preferedFirstName}"
776     ## else:
777     ## fullName = f"{personMatch.first_name} {personMatch.last_name}"
778     ## #fullName = f"{personMatch.last_name}, {personMatch.first_name}"
779     ## if personMatch.is_attorney == 'YES':
780     ## #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name}* ({personMatch.work_email_address.split('@')[-1]})\n")
781     ## matchedMetadataValues.add(f"{fullName}* ({personMatch.work_email_address.split('@')[-1]})")
782     ## else:
783     ## #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name} ({personMatch.work_email_address.split('@')[-1]})\n")
784     ## matchedMetadataValues.add(f"{fullName} ({personMatch.work_email_address.split('@')[-1]})")
785     ## else:
786     ## outputFile.write(f"{docID} contains a non-email item {nameItem}\n\n")
787     ## missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
788     ## missingFromMeta = formattedAttorneyValues - matchedMetadataValues
789     ## if missingFromFormatted:
790     ## for missingItem in missingFromFormatted:
791     ## outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
792     ## if missingFromMeta:
793     ## for missingItem in missingFromMeta:
794     ## outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
795     ## if missingFromFormatted:
796     ## outputFile.write("\n")
797     ## elif missingFromMeta:
798     ## outputFile.write("\n")
799     ## outputFile.close()
800 nino.borges 834