ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_NamesNormQC.py
Revision: 846
Committed: Tue Dec 10 17:22:55 2024 UTC (15 months, 2 weeks ago) by nino.borges
Content type: text/x-python
File size: 23675 byte(s)
Log Message:
They changed a column name to 'comments', so updated dataclass. also added some warning messages when the consilio override ingestion cant find a matching email address in the MAL.  I'm seeing many of these.

File Contents

# Content
1 """
2
3 Amazon-NamesNormQC
4
5 Created by:
6 Emanuel Borges
7 11.21.2024
8
9 This program will assist with the process of performing Names Normalization QC on the Amazon privilege logs.
10
11 """
12
13 import os, uuid, pickle, re
14 import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
15 from dataclasses import dataclass, field
16 from typing import List, Optional
17 from collections import namedtuple
18 from win32com.client import Dispatch
19
20
21 @dataclass
22 class Person:
23 first_name: Optional[str] = None
24 last_name: Optional[str] = None
25 work_email_address: Optional[str] = None
26 alt_work_email_address: Optional[str] = None
27 _id: uuid.UUID = field(default_factory=uuid.uuid4)
28 is_attorney: Optional[str] = None
29 split_role_date_range: Optional[str] = None
30 sidley_validated: Optional[str] = None
31 category: Optional[str] = None
32 organization: Optional[str] = None
33 job_title: Optional[str] = None
34 business_title: Optional[str] = None
35 full_name_preferred: Optional[str] = None
36 login: Optional[str] = None
37 department_fine: Optional[str] = None
38 addressed_during_caag: Optional[str] = None
39 #last_updated: Optional[str] = None
40 full_name_overide: Optional[str] = None
41
42 def __post_init__(self):
43 """Convert all string fields to uppercase."""
44 if self.first_name:
45 self.first_name = self.first_name.strip().upper()
46 if self.last_name:
47 self.last_name = self.last_name.strip().upper()
48 if self.work_email_address:
49 self.work_email_address = self.work_email_address.strip().upper()
50 if self.alt_work_email_address:
51 self.alt_work_email_address = self.alt_work_email_address.strip().upper()
52 if self.is_attorney:
53 self.is_attorney = self.is_attorney.strip().upper()
54 if self.split_role_date_range:
55 self.split_role_date_range = self.split_role_date_range.strip().upper()
56 if self.sidley_validated:
57 self.sidley_validated = self.sidley_validated.strip().upper()
58 if self.category:
59 self.category = self.category.strip().upper()
60 if self.organization:
61 self.organization = self.organization.strip().upper()
62 if self.job_title:
63 self.job_title = self.job_title.strip().upper()
64 if self.business_title:
65 self.business_title = self.business_title.strip().upper()
66 if self.full_name_preferred:
67 self.full_name_preferred = self.full_name_preferred.strip().upper()
68 if self.login:
69 self.login = self.login.strip().upper()
70 if self.department_fine:
71 self.department_fine = self.department_fine.strip().upper()
72 if self.addressed_during_caag:
73 self.addressed_during_caag = self.addressed_during_caag.strip().upper()
74 #if self.last_updated:
75 # self.last_updated = self.last_updated.strip().upper()
76
77 @dataclass
78 class PeopleList:
79 people: List[Person] = field(default_factory=list)
80
81 def add_person(self, person: Person):
82 self.people.append(person)
83 #print(f"Added person: {person}")
84
85
86 def search_by_email(self, emailAddress:str) -> Optional[Person]:
87 for person in self.people:
88 if person.work_email_address == emailAddress:
89 return person
90 return None
91
92 def list_people(self):
93 for person in self.people:
94 print(person)
95
96 def update_full_Name_overide(self, emailAddress:str, fullNameOverideValue) -> Optional[Person]:
97 valueUpdated = False
98 for person in self.people:
99 if person.work_email_address == emailAddress.upper():
100 person.full_name_overide = fullNameOverideValue.upper()
101 valueUpdated = True
102 ## Give a quik warning as you add the override value into the database if the last name differs.
103 if "," in fullNameOverideValue:
104 lastName = fullNameOverideValue.split(",")[0]
105 else:
106 lastName = fullNameOverideValue.split(" ")[-1]
107 if lastName.upper() == person.last_name:
108 pass
109 else:
110 print(f"WARNING: Overide last name value {lastName.upper()} does not match {person.last_name}.")
111 if valueUpdated == False:
112 print(f"WARNING: No email address match for {emailAddress} found.")
113
114
115 class NamesVerification(object):
116 """A class for automating the process of performing QC on the names within the Amazon privilege logs."""
117 version = '0.6.0'
118
119
120 def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName,fullNameOveridesFileName, forceNewPklFile = False, Encoding = 'UTF8'):
121 """Initializes the data structures. cleanedDatExportFileName should be the full path to the file.
122 Assumes the first row of the data file is the header and first column is DocID.
123 Assumes the MAL is a spreadsheet (for now).
124 MAL gets saved to a pkl file for performance reasons. pkl will be used unless forceNewPklFile is set to true"""
125 pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"
126
127 print("Initializing data structures...")
128 if forceNewPklFile:
129 print("Creating MAL structure...")
130 self.malPeopleList = PeopleList()
131 self.__IngestMALSpreadsheet(masterAttorneyListFileName)
132 print("MAL structure created.")
133 print("Loading full name overide values...")
134 self.__LoadFullNameOverideValues(fullNameOveridesFileName)
135 print("Full name overide values loaded.")
136 print("Creating pickle backup...")
137 self.__SaveMalToPkl(pklFileName)
138 print("Pickle backup created.")
139 else:
140 if os.path.exists(pklFileName):
141 print("Loading MAL structure from pickle file...")
142 self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
143 print("MAL structure loaded.")
144 else:
145 print("Pickle file doesnt exist.")
146 print("Creating MAL structure...")
147 self.malPeopleList = PeopleList()
148 self.__IngestMALSpreadsheet(masterAttorneyListFileName)
149 print("MAL structure created.")
150 print("Loading full name overide values...")
151 self.__LoadFullNameOverideValues(fullNameOveridesFileName)
152 print("Full name overide values loaded.")
153 print("Creating pickle backup...")
154 self.__SaveMalToPkl(pklFileName)
155 print("Pickle backup created.")
156
157 ## self.malPeopleList = PeopleList()
158 ##
159 ## print("Creating MAL structure...")
160 ## self.__IngestMALSpreadsheet(masterAttorneyListFileName)
161 ## print("MAL structure created.")
162 ## print("Creating pickle backup...")
163
164
165
166
167
168 def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
169 """Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass."""
170 ## There doenst seem to be a consistent value in the "row" column in the MAL, so setting these parameters here to avoid gap issues.
171
172 excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":10919, "beginColNumber":2, "endColNumber":17},
173 {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":572, "beginColNumber":2, "endColNumber":16}]
174
175 # excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":16},
176 # {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":15}]
177
178 spreadsheetFileMappingMatrix = {"First Name":"first_name", "Last Name":"last_name", "Work Email":"work_email_address", "Alt Work Email":"alt_work_email_address", "Is Attorney": "is_attorney",
179 "Split Role - Attorney Capacity Date Range":"split_role_date_range", " Validated by OC??":"sidley_validated", "Category": "category", "Organization":"organization", "Job Title":"job_title",
180 "Business Title":"business_title", "Full Name (Preferred)":"full_name_preferred", "Login":"login", "Department (Fine)":"department_fine", "Addressed during CAAG":"addressed_during_caag",
181 "Last Updated":"last_updated"}
182
183 xlApp = Dispatch('Excel.Application')
184 xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
185
186 for excelTab in excelTabParametersList:
187 sht = xlBook.Worksheets(excelTab['tabName'])
188 print(f"Ingesting sheet {excelTab['tabName']}.")
189 excelFieldPositionMatrix = {}
190 for col in range (excelTab['beginColNumber'], excelTab['endColNumber'] +1):
191 excelFieldPositionMatrix[sht.Cells(1,col).Value] = col
192 for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] +1):
193 #print(row)
194 ## TODO: Refactor the excelTabParametersList later. Didnt realize columns were not consistent.
195 if excelTab['tabName'] == 'Attorneys':
196 self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
197 split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role - Attorney Capacity Date Range']).Value,
198 sidley_validated = sht.Cells(row,excelFieldPositionMatrix[' Validated by OC?']).Value,
199 category = sht.Cells(row,excelFieldPositionMatrix['Category']).Value,
200 organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
201 last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
202 first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
203 work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
204 alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
205 job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
206 business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
207 full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
208 login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
209 department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
210 addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Comments']).Value))
211 #addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))
212 #last_updated = sht.Cells(row,excelFieldPositionMatrix['Last Updated']).Value ))
213
214 else:
215 self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
216 split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role - Attorney Capacity Date Range']).Value,
217 sidley_validated = sht.Cells(row,excelFieldPositionMatrix['Validated by OC?']).Value,
218 organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
219 last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
220 first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
221 work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
222 alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
223 job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
224 business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
225 full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
226 login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
227 department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
228 addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))
229
230 xlBook.Close()
231
232 def __SaveMalToPkl(self, pklFileName):
233 """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
234 outputFile = open(pklFileName,'wb')
235 pickle.dump(self.malPeopleList,outputFile)
236 outputFile.close()
237
238 def __LoadMalFromPkl(self, pklFileName):
239 """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
240 contents = open(pklFileName, 'rb')
241 obj = pickle.load(contents)
242 contents.close()
243 return obj
244
245 def __LoadFullNameOverideValues(self, fullNameOveridesFileName):
246 """Pseudo-private method which will update the MAL people list object with the full name overide values."""
247 contents = open(fullNameOveridesFileName).readlines()
248 for line in contents:
249 line = line.replace("\n","")
250 emailAddress,fullNameOverideValue = line.split("|")
251
252 self.malPeopleList.update_full_Name_overide(emailAddress, fullNameOverideValue)
253
254 def SmartDedupeSet(self, currentSet):
255 """Pseudo-private method that attempts to do some additional deduplication of the values in a set by lowering all values and deduplicating. Returns a lowered deduplicated set."""
256 newSet = set()
257 for val in currentSet:
258 newSet.add(val.lower())
259 return newSet
260
261 if __name__ == '__main__':
262 cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\VEAS\VEAS_Log_Data_Export_Converted.txt"
263 #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
264 #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\TEST-PLOG.txt"
265 #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
266 #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
267 #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
268 #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.11.06(7045550.3).xlsx"
269 masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.1(7045413.15).xlsx"
270 #masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"
271 #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides.txt"
272 #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides - Copy.txt"
273 fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
274
275
276
277 nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
278 #nv.malPeopleList.list_people()
279
280 qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
281 print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
282 #print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
283 workList = qcP.metadataValuesDict.keys()
284 outputFile = open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w')
285 for docID in workList:
286 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
287 #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
288 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['ccValues']
289 #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['ccValues']
290 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['bccValues']
291 #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['bccValues']
292 metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
293 formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
294 formattedAttorneyValues = set()
295 for formattedValue in formattedFieldValues:
296 if "*" in formattedValue:
297 formattedAttorneyValues.add(formattedValue.upper())
298
299 if metadataFieldValues:
300 matchedMetadataValues = set()
301 for nameItem in metadataFieldValues:
302 ## First test to see if there is a valid email address.
303 resultSet = set()
304 results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
305 if results:
306 for result in results:
307 resultSet.add(result)
308 if len(resultSet) >1:
309 resultSet = nv.SmartDedupeSet(resultSet)
310 if len(resultSet) >1:
311 print("ERROR multiple email **unique** email addresses in one item.")
312 print(resultSet)
313 print("\n")
314 else:
315 personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
316 if personMatch:
317 if personMatch.full_name_overide:
318 fullName = personMatch.full_name_overide
319 elif personMatch.full_name_preferred:
320 #print(personMatch.full_name_preferred)
321 ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
322 fullPreferredName = personMatch.full_name_preferred
323 fullPreferredName = fullPreferredName.replace('(LEGAL)','')
324 fullPreferredName = fullPreferredName.replace('(SHE, HER)','')
325 fullPreferredName = fullPreferredName.replace('(SHE HER)','')
326 preferedLastName, preferedFirstName = fullPreferredName.split(',')
327 preferedLastName = preferedLastName.strip()
328 preferedFirstName = preferedFirstName.strip()
329 preferedFirstName = preferedFirstName.split(" ")[0]
330 fullName = f"{preferedFirstName} {preferedLastName}"
331 #fullName = f"{preferedLastName}, {preferedFirstName}"
332 else:
333 fullName = f"{personMatch.first_name} {personMatch.last_name}"
334 #fullName = f"{personMatch.last_name}, {personMatch.first_name}"
335 if personMatch.is_attorney == 'YES':
336 #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name}* ({personMatch.work_email_address.split('@')[-1]})\n")
337 matchedMetadataValues.add(f"{fullName}* ({personMatch.work_email_address.split('@')[-1]})")
338 else:
339 #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name} ({personMatch.work_email_address.split('@')[-1]})\n")
340 matchedMetadataValues.add(f"{fullName} ({personMatch.work_email_address.split('@')[-1]})")
341 else:
342 outputFile.write(f"{docID} contains a non-email item {nameItem}\n\n")
343 missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
344 missingFromMeta = formattedAttorneyValues - matchedMetadataValues
345 if missingFromFormatted:
346 for missingItem in missingFromFormatted:
347 outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
348 if missingFromMeta:
349 for missingItem in missingFromMeta:
350 outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
351 if missingFromFormatted:
352 outputFile.write("\n")
353 elif missingFromMeta:
354 outputFile.write("\n")
355 outputFile.close()
356
357
358
359
360 ## people_list = PeopleList()
361 ## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "fooBar@gmail.com", altWorkEmailAddress = ""))
362 ## people_list.add_person(Person(firstName = "Gary", lastName = "Cooper", workEmailAddress = "", altWorkEmailAddress = "spam.eggs@hotmail.com"))
363 ## people_list.add_person(Person(firstName = "", lastName = "", workEmailAddress = "noname@gmail.com", altWorkEmailAddress = ""))
364 ## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "eggs@outlook.com", altWorkEmailAddress = ""))
365 ## print("\nAll People:")
366 ## people_list.list_people()
367 ## print("\nSearching...")
368 ## result = people_list.search_by_email('fooBar@gmail.com')
369 ## print(result if result else "email not found.")