ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_NamesNormQC.py
Revision: 844
Committed: Fri Dec 6 21:56:09 2024 UTC (15 months, 2 weeks ago) by nino.borges
Content type: text/x-python
File size: 23349 byte(s)
Log Message:
This version fixes a bug where it wasn't importing the Downgrades tab because of missing logic. It also adds a pseudo-private method that does smarter deduplication of a set, because I wanted to make sure I only had unique addresses in the initial RE result when checking for that as an error. If I have 3 email addresses where only one should exist (like in the From field) but they are all the same email address, I wanted to make sure I wasn't raising a warning because they appear non-unique; if there is more than one unique address, that is a warning. Finally, it also adds support for providing a warning if the override name has a different last name from the MAL for the same record. I'm now using Consilio values for my overrides, but there is concern they could be completely wrong. So far I'm actually finding them more accurate and not totally off; however, I still want to be warned.

File Contents

# User Rev Content
1 nino.borges 834 """
2    
3     Amazon-NamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     11.21.2024
8    
9     This program will assist with the process of performing Names Normalization QC on the Amazon privilege logs.
10    
11     """
12    
13     import os, uuid, pickle, re
14     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
15     from dataclasses import dataclass, field
16     from typing import List, Optional
17     from collections import namedtuple
18     from win32com.client import Dispatch
19    
20    
@dataclass
class Person:
    """One person record from the Master Attorney List (MAL).

    Every text attribute listed in ``_NORMALIZED_FIELDS`` is stripped of
    surrounding whitespace and upper-cased when the instance is created, so
    later comparisons can be done on upper-cased text.
    """

    first_name: Optional[str] = None
    last_name: Optional[str] = None
    work_email_address: Optional[str] = None
    alt_work_email_address: Optional[str] = None
    # Random identifier generated per instance; never read from the spreadsheet.
    _id: uuid.UUID = field(default_factory=uuid.uuid4)
    is_attorney: Optional[str] = None
    split_role_date_range: Optional[str] = None
    sidley_validated: Optional[str] = None
    category: Optional[str] = None
    organization: Optional[str] = None
    job_title: Optional[str] = None
    business_title: Optional[str] = None
    full_name_preferred: Optional[str] = None
    login: Optional[str] = None
    department_fine: Optional[str] = None
    addressed_during_caag: Optional[str] = None
    #last_updated: Optional[str] = None
    full_name_overide: Optional[str] = None

    # Names of the attributes that __post_init__ normalizes. This has no type
    # annotation on purpose, so the dataclass machinery does not treat it as
    # a field.
    _NORMALIZED_FIELDS = (
        "first_name",
        "last_name",
        "work_email_address",
        "alt_work_email_address",
        "is_attorney",
        "split_role_date_range",
        "sidley_validated",
        "category",
        "organization",
        "job_title",
        "business_title",
        "full_name_preferred",
        "login",
        "department_fine",
        "addressed_during_caag",
        "full_name_overide",
    )

    def __post_init__(self):
        """Strip and upper-case every string field.

        Only ``str`` values are touched. ``None`` and non-text values (for
        example a number or date handed back by a spreadsheet cell) are left
        exactly as given instead of raising ``AttributeError``.
        """
        for fieldName in self._NORMALIZED_FIELDS:
            value = getattr(self, fieldName)
            if isinstance(value, str):
                setattr(self, fieldName, value.strip().upper())
76 nino.borges 834
@dataclass
class PeopleList:
    """An in-memory collection of Person records with email-based lookups."""

    # "Person" is quoted so this class does not depend on definition order.
    people: List["Person"] = field(default_factory=list)

    def add_person(self, person: "Person"):
        """Append person to the collection."""
        self.people.append(person)

    def search_by_email(self, emailAddress: str) -> Optional["Person"]:
        """Return the first person whose work email equals emailAddress, else None.

        The lookup value is stripped and upper-cased before comparing, which
        matches how update_full_Name_overide treats its email argument. Only
        the work email is compared, never the alternate work email.
        """
        if not emailAddress:
            return None
        wantedEmail = emailAddress.strip().upper()
        for person in self.people:
            if person.work_email_address == wantedEmail:
                return person
        return None

    def list_people(self):
        """Print every person in the collection, one per line."""
        for person in self.people:
            print(person)

    def update_full_Name_overide(self, emailAddress: str, fullNameOverideValue) -> Optional["Person"]:
        """Store an upper-cased full-name override on every person with this work email.

        A warning is printed for each updated person whose last name differs
        from the last name found in the override value. The last name is the
        text before the first comma when the value contains one, otherwise
        the final space-separated word.

        Returns the first person updated, or None when no work email matched.
        """
        wantedEmail = emailAddress.strip().upper()
        overrideValue = fullNameOverideValue.strip().upper()
        if "," in overrideValue:
            # Strip so that "SMITH , JOHN" still compares equal to "SMITH".
            overrideLastName = overrideValue.split(",")[0].strip()
        else:
            overrideLastName = overrideValue.split(" ")[-1]

        firstUpdated = None
        for person in self.people:
            if person.work_email_address != wantedEmail:
                continue
            person.full_name_overide = overrideValue
            # Give a quick warning as the override is stored if the last name differs.
            if overrideLastName != person.last_name:
                print(f"WARNING: Overide last name value {overrideLastName} does not match {person.last_name}.")
            if firstUpdated is None:
                firstUpdated = person
        return firstUpdated
109 nino.borges 834
110 nino.borges 837
class NamesVerification(object):
    """A class for automating the process of performing QC on the names within the Amazon privilege logs."""
    version = '0.5.0'

    def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName, forceNewPklFile = False, Encoding = 'UTF8'):
        """Build self.malPeopleList, from a pickle cache when one is available.

        masterAttorneyListFileName is the full path to the MAL spreadsheet.
        The cache lives next to it, with the extension replaced by ".pkl".

        When the cache exists and forceNewPklFile is false, the people list is
        loaded from the cache and NEITHER the spreadsheet NOR the overrides
        file is read. Pass forceNewPklFile=True after editing either of them.
        Otherwise the spreadsheet is ingested, the full name overrides are
        applied, and the result is written to the cache.

        cleanedDatExportFileName and Encoding are accepted for interface
        compatibility but are not used by this method.
        """
        pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"

        print("Initializing data structures...")
        if not forceNewPklFile and os.path.exists(pklFileName):
            print("Loading MAL structure from pickle file...")
            self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
            print("MAL structure loaded.")
            return

        if not forceNewPklFile:
            print("Pickle file doesnt exist.")
        print("Creating MAL structure...")
        self.malPeopleList = PeopleList()
        self.__IngestMALSpreadsheet(masterAttorneyListFileName)
        print("MAL structure created.")
        print("Loading full name overide values...")
        self.__LoadFullNameOverideValues(fullNameOveridesFileName)
        print("Full name overide values loaded.")
        print("Creating pickle backup...")
        self.__SaveMalToPkl(pklFileName)
        print("Pickle backup created.")

    def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
        """Pseudo-private method which opens the MAL workbook in Excel and adds one Person per data row.

        Header text is read from row 1 of each tab and stripped of surrounding
        whitespace before use, so a header such as " Validated by OC?" and
        "Validated by OC?" resolve to the same column.

        Raises KeyError when a tab lacks one of the headers it is expected to have.
        """
        # Spreadsheet header -> Person attribute, for the columns read from every tab.
        commonHeaderToField = {
            "Is Attorney": "is_attorney",
            "Split Role - Attorney Capacity Date Range": "split_role_date_range",
            "Validated by OC?": "sidley_validated",
            "Organization": "organization",
            "Last Name": "last_name",
            "First Name": "first_name",
            "Work Email": "work_email_address",
            "Alt Work Email": "alt_work_email_address",
            "Job Title": "job_title",
            "Business Title": "business_title",
            "Full Name (Preferred)": "full_name_preferred",
            "Login": "login",
            "Department (Fine)": "department_fine",
            "Addressed during CAAG": "addressed_during_caag",
        }
        # "Category" is only read from the Attorneys tab.
        attorneysHeaderToField = dict(commonHeaderToField)
        attorneysHeaderToField["Category"] = "category"

        # There doesn't seem to be a consistent value in the "row" column in the MAL,
        # so the row and column bounds are fixed here to avoid gap issues.
        excelTabParametersList = [
            {"tabName": "Attorneys", "beginRowNumber": 2, "endRowNumber": 10919, "beginColNumber": 2, "endColNumber": 17,
             "headerToField": attorneysHeaderToField},
            {"tabName": "Downgrades", "beginRowNumber": 2, "endRowNumber": 572, "beginColNumber": 2, "endColNumber": 16,
             "headerToField": commonHeaderToField},
        ]

        xlApp = Dispatch('Excel.Application')
        xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
        try:
            for excelTab in excelTabParametersList:
                sht = xlBook.Worksheets(excelTab['tabName'])
                print(f"Ingesting sheet {excelTab['tabName']}.")

                # Map each header found in row 1 to its column number.
                excelFieldPositionMatrix = {}
                for col in range(excelTab['beginColNumber'], excelTab['endColNumber'] + 1):
                    header = sht.Cells(1, col).Value
                    if isinstance(header, str):
                        header = header.strip()
                    excelFieldPositionMatrix[header] = col

                headerToField = excelTab['headerToField']
                missingHeaders = [header for header in headerToField if header not in excelFieldPositionMatrix]
                if missingHeaders:
                    raise KeyError(f"Sheet {excelTab['tabName']} is missing expected column(s): {missingHeaders}")

                for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] + 1):
                    cellValues = {fieldName: sht.Cells(row, excelFieldPositionMatrix[header]).Value
                                  for header, fieldName in headerToField.items()}
                    self.malPeopleList.add_person(Person(**cellValues))
        finally:
            # Always release the workbook, even when ingestion fails part-way.
            # Excel itself is deliberately not quit here.
            xlBook.Close()

    def __SaveMalToPkl(self, pklFileName):
        """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
        with open(pklFileName, 'wb') as outputFile:
            pickle.dump(self.malPeopleList, outputFile)

    def __LoadMalFromPkl(self, pklFileName):
        """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
        # NOTE(review): pickle.load can execute arbitrary code; only load cache files this program wrote.
        with open(pklFileName, 'rb') as contents:
            return pickle.load(contents)

    def __LoadFullNameOverideValues(self, fullNameOveridesFileName):
        """Pseudo-private method which will update the MAL people list object with the full name overide values.

        Each non-blank line must have the form "email|full name". Blank lines
        are skipped, and whitespace around either part is ignored.

        Raises ValueError when a non-blank line does not contain exactly one "|".
        """
        with open(fullNameOveridesFileName) as contents:
            for lineNumber, line in enumerate(contents, start=1):
                line = line.strip()
                if not line:
                    continue
                parts = line.split("|")
                if len(parts) != 2:
                    raise ValueError(f"{fullNameOveridesFileName} line {lineNumber}: expected 'email|full name', got {line!r}")
                emailAddress, fullNameOverideValue = parts
                self.malPeopleList.update_full_Name_overide(emailAddress.strip(), fullNameOverideValue.strip())

    def SmartDedupeSet(self, currentSet):
        """Return a new set holding the lower-cased form of every value in currentSet.

        Values that differ only by letter case collapse into a single entry.
        """
        return {val.lower() for val in currentSet}
255    
if __name__ == '__main__':
    # Script entry point: compare the names derived from one metadata field
    # against the attorney names in the matching formatted field, for every
    # document in the export, and write the differences to a text report.

    # Input and output locations. Edit these to run against a different export or MAL.
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\VEAS\VEAS_Log_Data_Export_Converted.txt"
    masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.1(7045413.15).xlsx"
    fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
    outputFileName = r"C:\Test_Dir\Amazon\NameNormOutputText.txt"

    # Which field to QC. One of 'toValues', 'ccValues', 'bccValues', 'fromValues'.
    fieldToCheck = 'fromValues'

    # Parenthetical notes in the preferred name that never appear in the formatted field.
    preferredNameNoise = ('(LEGAL)', '(SHE, HER)', '(SHE HER)')

    nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
    #nv.malPeopleList.list_people()

    qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
    print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
    workList = qcP.metadataValuesDict.keys()

    # "with" guarantees the report is flushed and closed even if a record raises.
    with open(outputFileName, 'w') as outputFile:
        for docID in workList:
            metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()[fieldToCheck]
            formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()[fieldToCheck]

            # Only formatted values flagged with "*" are collected for comparison.
            formattedAttorneyValues = set()
            for formattedValue in formattedFieldValues:
                if "*" in formattedValue:
                    formattedAttorneyValues.add(formattedValue.upper())

            if not metadataFieldValues:
                continue

            matchedMetadataValues = set()
            for nameItem in metadataFieldValues:
                # First test to see if there is a valid email address.
                results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
                if not results:
                    outputFile.write(f"{docID} contains a non-email item {nameItem}\n\n")
                    continue

                resultSet = set(results)
                if len(resultSet) > 1:
                    # The same address in different letter case is not a real conflict.
                    resultSet = nv.SmartDedupeSet(resultSet)
                if len(resultSet) > 1:
                    print("ERROR multiple email **unique** email addresses in one item.")
                    print(resultSet)
                    print("\n")
                    continue

                personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
                if not personMatch:
                    continue

                # Name precedence: explicit override, then preferred name, then first + last.
                fullName = f"{personMatch.first_name} {personMatch.last_name}"
                if personMatch.full_name_overide:
                    fullName = personMatch.full_name_overide
                elif personMatch.full_name_preferred:
                    fullPreferredName = personMatch.full_name_preferred
                    for noise in preferredNameNoise:
                        fullPreferredName = fullPreferredName.replace(noise, '')
                    nameParts = fullPreferredName.split(',')
                    # Only a "LAST, FIRST ..." shape can be rearranged safely;
                    # anything else keeps the first + last fallback instead of crashing.
                    if len(nameParts) == 2:
                        preferedLastName = nameParts[0].strip()
                        preferedFirstName = nameParts[1].strip().split(" ")[0]
                        fullName = f"{preferedFirstName} {preferedLastName}"

                emailDomain = personMatch.work_email_address.split('@')[-1]
                if personMatch.is_attorney == 'YES':
                    matchedMetadataValues.add(f"{fullName}* ({emailDomain})")
                else:
                    matchedMetadataValues.add(f"{fullName} ({emailDomain})")

            # NOTE(review): matchedMetadataValues includes non-attorneys while
            # formattedAttorneyValues only holds "*" entries, so every matched
            # non-attorney is reported as missing from the formatted field.
            # Confirm that this is intended.
            missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
            missingFromMeta = formattedAttorneyValues - matchedMetadataValues
            # Sorted so the report is identical from run to run.
            for missingItem in sorted(missingFromFormatted):
                outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
            for missingItem in sorted(missingFromMeta):
                outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
            if missingFromFormatted or missingFromMeta:
                outputFile.write("\n")
351    
352    
353    
354    
355     ## people_list = PeopleList()
356     ## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "fooBar@gmail.com", altWorkEmailAddress = ""))
357     ## people_list.add_person(Person(firstName = "Gary", lastName = "Cooper", workEmailAddress = "", altWorkEmailAddress = "spam.eggs@hotmail.com"))
358     ## people_list.add_person(Person(firstName = "", lastName = "", workEmailAddress = "noname@gmail.com", altWorkEmailAddress = ""))
359     ## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "eggs@outlook.com", altWorkEmailAddress = ""))
360     ## print("\nAll People:")
361     ## people_list.list_people()
362     ## print("\nSearching...")
363     ## result = people_list.search_by_email('fooBar@gmail.com')
364     ## print(result if result else "email not found.")