ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_NamesNormQC.py
Revision: 846
Committed: Tue Dec 10 17:22:55 2024 UTC (15 months, 2 weeks ago) by nino.borges
Content type: text/x-python
File size: 23675 byte(s)
Log Message:
They changed a column name to 'comments', so updated dataclass. also added some warning messages when the consilio override ingestion cant find a matching email address in the MAL.  I'm seeing many of these.

File Contents

# User Rev Content
1 nino.borges 834 """
2    
3     Amazon-NamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     11.21.2024
8    
9     This program will assist with the process of performing Names Normalization QC on the Amazon privilege logs.
10    
11     """
12    
13     import os, uuid, pickle, re
14     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
15     from dataclasses import dataclass, field
16     from typing import List, Optional
17     from collections import namedtuple
18     from win32com.client import Dispatch
19    
20    
21     @dataclass
22     class Person:
23     first_name: Optional[str] = None
24     last_name: Optional[str] = None
25     work_email_address: Optional[str] = None
26     alt_work_email_address: Optional[str] = None
27     _id: uuid.UUID = field(default_factory=uuid.uuid4)
28     is_attorney: Optional[str] = None
29     split_role_date_range: Optional[str] = None
30     sidley_validated: Optional[str] = None
31     category: Optional[str] = None
32     organization: Optional[str] = None
33     job_title: Optional[str] = None
34     business_title: Optional[str] = None
35     full_name_preferred: Optional[str] = None
36     login: Optional[str] = None
37     department_fine: Optional[str] = None
38     addressed_during_caag: Optional[str] = None
39 nino.borges 838 #last_updated: Optional[str] = None
40 nino.borges 837 full_name_overide: Optional[str] = None
41 nino.borges 834
42     def __post_init__(self):
43     """Convert all string fields to uppercase."""
44     if self.first_name:
45 nino.borges 836 self.first_name = self.first_name.strip().upper()
46 nino.borges 834 if self.last_name:
47 nino.borges 836 self.last_name = self.last_name.strip().upper()
48 nino.borges 834 if self.work_email_address:
49 nino.borges 836 self.work_email_address = self.work_email_address.strip().upper()
50 nino.borges 834 if self.alt_work_email_address:
51 nino.borges 836 self.alt_work_email_address = self.alt_work_email_address.strip().upper()
52 nino.borges 834 if self.is_attorney:
53 nino.borges 836 self.is_attorney = self.is_attorney.strip().upper()
54 nino.borges 834 if self.split_role_date_range:
55 nino.borges 836 self.split_role_date_range = self.split_role_date_range.strip().upper()
56 nino.borges 834 if self.sidley_validated:
57 nino.borges 836 self.sidley_validated = self.sidley_validated.strip().upper()
58 nino.borges 834 if self.category:
59 nino.borges 836 self.category = self.category.strip().upper()
60 nino.borges 834 if self.organization:
61 nino.borges 836 self.organization = self.organization.strip().upper()
62 nino.borges 834 if self.job_title:
63 nino.borges 836 self.job_title = self.job_title.strip().upper()
64 nino.borges 834 if self.business_title:
65 nino.borges 836 self.business_title = self.business_title.strip().upper()
66 nino.borges 834 if self.full_name_preferred:
67 nino.borges 836 self.full_name_preferred = self.full_name_preferred.strip().upper()
68 nino.borges 834 if self.login:
69 nino.borges 836 self.login = self.login.strip().upper()
70 nino.borges 834 if self.department_fine:
71 nino.borges 836 self.department_fine = self.department_fine.strip().upper()
72 nino.borges 834 if self.addressed_during_caag:
73 nino.borges 836 self.addressed_during_caag = self.addressed_during_caag.strip().upper()
74 nino.borges 838 #if self.last_updated:
75     # self.last_updated = self.last_updated.strip().upper()
76 nino.borges 834
77     @dataclass
78     class PeopleList:
79     people: List[Person] = field(default_factory=list)
80    
81     def add_person(self, person: Person):
82     self.people.append(person)
83     #print(f"Added person: {person}")
84    
85    
86     def search_by_email(self, emailAddress:str) -> Optional[Person]:
87     for person in self.people:
88     if person.work_email_address == emailAddress:
89     return person
90     return None
91    
92     def list_people(self):
93     for person in self.people:
94     print(person)
95    
96 nino.borges 837 def update_full_Name_overide(self, emailAddress:str, fullNameOverideValue) -> Optional[Person]:
97 nino.borges 846 valueUpdated = False
98 nino.borges 837 for person in self.people:
99     if person.work_email_address == emailAddress.upper():
100     person.full_name_overide = fullNameOverideValue.upper()
101 nino.borges 846 valueUpdated = True
102 nino.borges 844 ## Give a quik warning as you add the override value into the database if the last name differs.
103     if "," in fullNameOverideValue:
104     lastName = fullNameOverideValue.split(",")[0]
105     else:
106     lastName = fullNameOverideValue.split(" ")[-1]
107     if lastName.upper() == person.last_name:
108     pass
109     else:
110     print(f"WARNING: Overide last name value {lastName.upper()} does not match {person.last_name}.")
111 nino.borges 846 if valueUpdated == False:
112     print(f"WARNING: No email address match for {emailAddress} found.")
113 nino.borges 834
114 nino.borges 837
115 nino.borges 834 class NamesVerification(object):
116     """A class for automating the process of performing QC on the names within the Amazon privilege logs."""
117 nino.borges 846 version = '0.6.0'
118 nino.borges 834
119    
120 nino.borges 837 def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName,fullNameOveridesFileName, forceNewPklFile = False, Encoding = 'UTF8'):
121 nino.borges 834 """Initializes the data structures. cleanedDatExportFileName should be the full path to the file.
122     Assumes the first row of the data file is the header and first column is DocID.
123     Assumes the MAL is a spreadsheet (for now).
124     MAL gets saved to a pkl file for performance reasons. pkl will be used unless forceNewPklFile is set to true"""
125     pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"
126    
127     print("Initializing data structures...")
128     if forceNewPklFile:
129     print("Creating MAL structure...")
130     self.malPeopleList = PeopleList()
131     self.__IngestMALSpreadsheet(masterAttorneyListFileName)
132     print("MAL structure created.")
133 nino.borges 837 print("Loading full name overide values...")
134     self.__LoadFullNameOverideValues(fullNameOveridesFileName)
135     print("Full name overide values loaded.")
136 nino.borges 834 print("Creating pickle backup...")
137     self.__SaveMalToPkl(pklFileName)
138     print("Pickle backup created.")
139     else:
140     if os.path.exists(pklFileName):
141     print("Loading MAL structure from pickle file...")
142     self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
143     print("MAL structure loaded.")
144     else:
145     print("Pickle file doesnt exist.")
146     print("Creating MAL structure...")
147     self.malPeopleList = PeopleList()
148     self.__IngestMALSpreadsheet(masterAttorneyListFileName)
149     print("MAL structure created.")
150 nino.borges 837 print("Loading full name overide values...")
151     self.__LoadFullNameOverideValues(fullNameOveridesFileName)
152     print("Full name overide values loaded.")
153 nino.borges 834 print("Creating pickle backup...")
154     self.__SaveMalToPkl(pklFileName)
155     print("Pickle backup created.")
156    
157     ## self.malPeopleList = PeopleList()
158     ##
159     ## print("Creating MAL structure...")
160     ## self.__IngestMALSpreadsheet(masterAttorneyListFileName)
161     ## print("MAL structure created.")
162     ## print("Creating pickle backup...")
163    
164    
165    
166    
167    
168     def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
169     """Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass."""
170     ## There doenst seem to be a consistent value in the "row" column in the MAL, so setting these parameters here to avoid gap issues.
171    
172 nino.borges 837 excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":10919, "beginColNumber":2, "endColNumber":17},
173     {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":572, "beginColNumber":2, "endColNumber":16}]
174 nino.borges 834
175     # excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":16},
176     # {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":15}]
177    
178     spreadsheetFileMappingMatrix = {"First Name":"first_name", "Last Name":"last_name", "Work Email":"work_email_address", "Alt Work Email":"alt_work_email_address", "Is Attorney": "is_attorney",
179 nino.borges 837 "Split Role - Attorney Capacity Date Range":"split_role_date_range", " Validated by OC??":"sidley_validated", "Category": "category", "Organization":"organization", "Job Title":"job_title",
180     "Business Title":"business_title", "Full Name (Preferred)":"full_name_preferred", "Login":"login", "Department (Fine)":"department_fine", "Addressed during CAAG":"addressed_during_caag",
181     "Last Updated":"last_updated"}
182 nino.borges 834
183     xlApp = Dispatch('Excel.Application')
184     xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
185    
186     for excelTab in excelTabParametersList:
187     sht = xlBook.Worksheets(excelTab['tabName'])
188     print(f"Ingesting sheet {excelTab['tabName']}.")
189     excelFieldPositionMatrix = {}
190     for col in range (excelTab['beginColNumber'], excelTab['endColNumber'] +1):
191     excelFieldPositionMatrix[sht.Cells(1,col).Value] = col
192     for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] +1):
193     #print(row)
194     ## TODO: Refactor the excelTabParametersList later. Didnt realize columns were not consistent.
195     if excelTab['tabName'] == 'Attorneys':
196     self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
197     split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role - Attorney Capacity Date Range']).Value,
198 nino.borges 837 sidley_validated = sht.Cells(row,excelFieldPositionMatrix[' Validated by OC?']).Value,
199 nino.borges 834 category = sht.Cells(row,excelFieldPositionMatrix['Category']).Value,
200     organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
201     last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
202     first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
203     work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
204     alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
205     job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
206     business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
207     full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
208     login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
209     department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
210 nino.borges 846 addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Comments']).Value))
211     #addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))
212 nino.borges 838 #last_updated = sht.Cells(row,excelFieldPositionMatrix['Last Updated']).Value ))
213 nino.borges 844
214     else:
215     self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
216     split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role - Attorney Capacity Date Range']).Value,
217     sidley_validated = sht.Cells(row,excelFieldPositionMatrix['Validated by OC?']).Value,
218     organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
219     last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
220     first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
221     work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
222     alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
223     job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
224     business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
225     full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
226     login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
227     department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
228     addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))
229 nino.borges 834
230     xlBook.Close()
231    
232     def __SaveMalToPkl(self, pklFileName):
233     """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
234     outputFile = open(pklFileName,'wb')
235     pickle.dump(self.malPeopleList,outputFile)
236     outputFile.close()
237    
238     def __LoadMalFromPkl(self, pklFileName):
239     """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
240     contents = open(pklFileName, 'rb')
241     obj = pickle.load(contents)
242     contents.close()
243     return obj
244    
245 nino.borges 837 def __LoadFullNameOverideValues(self, fullNameOveridesFileName):
246     """Pseudo-private method which will update the MAL people list object with the full name overide values."""
247     contents = open(fullNameOveridesFileName).readlines()
248     for line in contents:
249     line = line.replace("\n","")
250     emailAddress,fullNameOverideValue = line.split("|")
251 nino.borges 844
252 nino.borges 837 self.malPeopleList.update_full_Name_overide(emailAddress, fullNameOverideValue)
253    
254 nino.borges 844 def SmartDedupeSet(self, currentSet):
255     """Pseudo-private method that attempts to do some additional deduplication of the values in a set by lowering all values and deduplicating. Returns a lowered deduplicated set."""
256     newSet = set()
257     for val in currentSet:
258     newSet.add(val.lower())
259     return newSet
260    
261 nino.borges 834 if __name__ == '__main__':
262 nino.borges 844 cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\VEAS\VEAS_Log_Data_Export_Converted.txt"
263     #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
264     #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\TEST-PLOG.txt"
265     #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
266 nino.borges 837 #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
267 nino.borges 835 #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
268 nino.borges 837 #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.11.06(7045550.3).xlsx"
269 nino.borges 844 masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.1(7045413.15).xlsx"
270 nino.borges 834 #masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"
271 nino.borges 844 #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides.txt"
272     #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides - Copy.txt"
273     fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
274 nino.borges 834
275    
276    
277 nino.borges 837 nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
278 nino.borges 834 #nv.malPeopleList.list_people()
279    
280     qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
281 nino.borges 837 print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
282 nino.borges 844 #print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
283 nino.borges 834 workList = qcP.metadataValuesDict.keys()
284     outputFile = open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w')
285     for docID in workList:
286 nino.borges 844 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
287     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
288 nino.borges 837 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['ccValues']
289     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['ccValues']
290     #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['bccValues']
291     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['bccValues']
292 nino.borges 844 metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
293     formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
294 nino.borges 835 formattedAttorneyValues = set()
295     for formattedValue in formattedFieldValues:
296     if "*" in formattedValue:
297     formattedAttorneyValues.add(formattedValue.upper())
298 nino.borges 834
299     if metadataFieldValues:
300 nino.borges 835 matchedMetadataValues = set()
301 nino.borges 834 for nameItem in metadataFieldValues:
302     ## First test to see if there is a valid email address.
303     resultSet = set()
304     results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
305     if results:
306     for result in results:
307     resultSet.add(result)
308     if len(resultSet) >1:
309 nino.borges 844 resultSet = nv.SmartDedupeSet(resultSet)
310     if len(resultSet) >1:
311     print("ERROR multiple email **unique** email addresses in one item.")
312 nino.borges 838 print(resultSet)
313     print("\n")
314 nino.borges 834 else:
315     personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
316     if personMatch:
317 nino.borges 837 if personMatch.full_name_overide:
318     fullName = personMatch.full_name_overide
319     elif personMatch.full_name_preferred:
320     #print(personMatch.full_name_preferred)
321 nino.borges 835 ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
322     fullPreferredName = personMatch.full_name_preferred
323     fullPreferredName = fullPreferredName.replace('(LEGAL)','')
324     fullPreferredName = fullPreferredName.replace('(SHE, HER)','')
325     fullPreferredName = fullPreferredName.replace('(SHE HER)','')
326     preferedLastName, preferedFirstName = fullPreferredName.split(',')
327     preferedLastName = preferedLastName.strip()
328     preferedFirstName = preferedFirstName.strip()
329     preferedFirstName = preferedFirstName.split(" ")[0]
330     fullName = f"{preferedFirstName} {preferedLastName}"
331 nino.borges 844 #fullName = f"{preferedLastName}, {preferedFirstName}"
332 nino.borges 835 else:
333     fullName = f"{personMatch.first_name} {personMatch.last_name}"
334 nino.borges 844 #fullName = f"{personMatch.last_name}, {personMatch.first_name}"
335 nino.borges 834 if personMatch.is_attorney == 'YES':
336 nino.borges 835 #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name}* ({personMatch.work_email_address.split('@')[-1]})\n")
337     matchedMetadataValues.add(f"{fullName}* ({personMatch.work_email_address.split('@')[-1]})")
338 nino.borges 834 else:
339 nino.borges 835 #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name} ({personMatch.work_email_address.split('@')[-1]})\n")
340     matchedMetadataValues.add(f"{fullName} ({personMatch.work_email_address.split('@')[-1]})")
341 nino.borges 834 else:
342 nino.borges 836 outputFile.write(f"{docID} contains a non-email item {nameItem}\n\n")
343 nino.borges 835 missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
344     missingFromMeta = formattedAttorneyValues - matchedMetadataValues
345     if missingFromFormatted:
346     for missingItem in missingFromFormatted:
347     outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
348     if missingFromMeta:
349     for missingItem in missingFromMeta:
350     outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
351     if missingFromFormatted:
352     outputFile.write("\n")
353     elif missingFromMeta:
354     outputFile.write("\n")
355 nino.borges 834 outputFile.close()
356    
357    
358    
359    
360     ## people_list = PeopleList()
361     ## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "fooBar@gmail.com", altWorkEmailAddress = ""))
362     ## people_list.add_person(Person(firstName = "Gary", lastName = "Cooper", workEmailAddress = "", altWorkEmailAddress = "spam.eggs@hotmail.com"))
363     ## people_list.add_person(Person(firstName = "", lastName = "", workEmailAddress = "noname@gmail.com", altWorkEmailAddress = ""))
364     ## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "eggs@outlook.com", altWorkEmailAddress = ""))
365     ## print("\nAll People:")
366     ## people_list.list_people()
367     ## print("\nSearching...")
368     ## result = people_list.search_by_email('fooBar@gmail.com')
369     ## print(result if result else "email not found.")