ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_NamesNormQC.py
Revision: 847
Committed: Tue Dec 10 17:59:30 2024 UTC (15 months, 2 weeks ago) by nino.borges
Content type: text/x-python
File size: 23052 byte(s)
Log Message:
Finally changed this so that it's a library, so I removed the run code at the bottom.  Programs will now be able to call this library now that the pickle will work properly.

File Contents

# User Rev Content
1 nino.borges 834 """
2    
3 nino.borges 847 Amazon_NamesNormQC
4 nino.borges 834
5     Created by:
6     Emanuel Borges
7     11.21.2024
8    
9 nino.borges 847 This Library will assist with the process of performing Names Normalization QC on the Amazon privilege logs.
10 nino.borges 834
11     """
12    
13     import os, uuid, pickle, re
14 nino.borges 847 #import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
15 nino.borges 834 from dataclasses import dataclass, field
16     from typing import List, Optional
17     from collections import namedtuple
18     from win32com.client import Dispatch
19    
20    
21     @dataclass
22     class Person:
23     first_name: Optional[str] = None
24     last_name: Optional[str] = None
25     work_email_address: Optional[str] = None
26     alt_work_email_address: Optional[str] = None
27     _id: uuid.UUID = field(default_factory=uuid.uuid4)
28     is_attorney: Optional[str] = None
29     split_role_date_range: Optional[str] = None
30     sidley_validated: Optional[str] = None
31     category: Optional[str] = None
32     organization: Optional[str] = None
33     job_title: Optional[str] = None
34     business_title: Optional[str] = None
35     full_name_preferred: Optional[str] = None
36     login: Optional[str] = None
37     department_fine: Optional[str] = None
38     addressed_during_caag: Optional[str] = None
39 nino.borges 838 #last_updated: Optional[str] = None
40 nino.borges 837 full_name_overide: Optional[str] = None
41 nino.borges 834
42     def __post_init__(self):
43     """Convert all string fields to uppercase."""
44     if self.first_name:
45 nino.borges 836 self.first_name = self.first_name.strip().upper()
46 nino.borges 834 if self.last_name:
47 nino.borges 836 self.last_name = self.last_name.strip().upper()
48 nino.borges 834 if self.work_email_address:
49 nino.borges 836 self.work_email_address = self.work_email_address.strip().upper()
50 nino.borges 834 if self.alt_work_email_address:
51 nino.borges 836 self.alt_work_email_address = self.alt_work_email_address.strip().upper()
52 nino.borges 834 if self.is_attorney:
53 nino.borges 836 self.is_attorney = self.is_attorney.strip().upper()
54 nino.borges 834 if self.split_role_date_range:
55 nino.borges 836 self.split_role_date_range = self.split_role_date_range.strip().upper()
56 nino.borges 834 if self.sidley_validated:
57 nino.borges 836 self.sidley_validated = self.sidley_validated.strip().upper()
58 nino.borges 834 if self.category:
59 nino.borges 836 self.category = self.category.strip().upper()
60 nino.borges 834 if self.organization:
61 nino.borges 836 self.organization = self.organization.strip().upper()
62 nino.borges 834 if self.job_title:
63 nino.borges 836 self.job_title = self.job_title.strip().upper()
64 nino.borges 834 if self.business_title:
65 nino.borges 836 self.business_title = self.business_title.strip().upper()
66 nino.borges 834 if self.full_name_preferred:
67 nino.borges 836 self.full_name_preferred = self.full_name_preferred.strip().upper()
68 nino.borges 834 if self.login:
69 nino.borges 836 self.login = self.login.strip().upper()
70 nino.borges 834 if self.department_fine:
71 nino.borges 836 self.department_fine = self.department_fine.strip().upper()
72 nino.borges 834 if self.addressed_during_caag:
73 nino.borges 836 self.addressed_during_caag = self.addressed_during_caag.strip().upper()
74 nino.borges 838 #if self.last_updated:
75     # self.last_updated = self.last_updated.strip().upper()
76 nino.borges 834
77     @dataclass
78     class PeopleList:
79     people: List[Person] = field(default_factory=list)
80    
81     def add_person(self, person: Person):
82     self.people.append(person)
83     #print(f"Added person: {person}")
84    
85    
86     def search_by_email(self, emailAddress:str) -> Optional[Person]:
87     for person in self.people:
88     if person.work_email_address == emailAddress:
89     return person
90     return None
91    
92     def list_people(self):
93     for person in self.people:
94     print(person)
95    
96 nino.borges 837 def update_full_Name_overide(self, emailAddress:str, fullNameOverideValue) -> Optional[Person]:
97 nino.borges 846 valueUpdated = False
98 nino.borges 837 for person in self.people:
99     if person.work_email_address == emailAddress.upper():
100     person.full_name_overide = fullNameOverideValue.upper()
101 nino.borges 846 valueUpdated = True
102 nino.borges 844 ## Give a quik warning as you add the override value into the database if the last name differs.
103     if "," in fullNameOverideValue:
104     lastName = fullNameOverideValue.split(",")[0]
105     else:
106     lastName = fullNameOverideValue.split(" ")[-1]
107     if lastName.upper() == person.last_name:
108     pass
109     else:
110     print(f"WARNING: Overide last name value {lastName.upper()} does not match {person.last_name}.")
111 nino.borges 846 if valueUpdated == False:
112     print(f"WARNING: No email address match for {emailAddress} found.")
113 nino.borges 834
114 nino.borges 837
115 nino.borges 834 class NamesVerification(object):
116     """A class for automating the process of performing QC on the names within the Amazon privilege logs."""
117 nino.borges 847 version = '0.7.0'
118 nino.borges 834
119    
120 nino.borges 837 def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName,fullNameOveridesFileName, forceNewPklFile = False, Encoding = 'UTF8'):
121 nino.borges 834 """Initializes the data structures. cleanedDatExportFileName should be the full path to the file.
122     Assumes the first row of the data file is the header and first column is DocID.
123     Assumes the MAL is a spreadsheet (for now).
124     MAL gets saved to a pkl file for performance reasons. pkl will be used unless forceNewPklFile is set to true"""
125     pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"
126    
127     print("Initializing data structures...")
128     if forceNewPklFile:
129     print("Creating MAL structure...")
130     self.malPeopleList = PeopleList()
131     self.__IngestMALSpreadsheet(masterAttorneyListFileName)
132     print("MAL structure created.")
133 nino.borges 837 print("Loading full name overide values...")
134     self.__LoadFullNameOverideValues(fullNameOveridesFileName)
135     print("Full name overide values loaded.")
136 nino.borges 834 print("Creating pickle backup...")
137     self.__SaveMalToPkl(pklFileName)
138     print("Pickle backup created.")
139     else:
140     if os.path.exists(pklFileName):
141     print("Loading MAL structure from pickle file...")
142     self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
143     print("MAL structure loaded.")
144     else:
145     print("Pickle file doesnt exist.")
146     print("Creating MAL structure...")
147     self.malPeopleList = PeopleList()
148     self.__IngestMALSpreadsheet(masterAttorneyListFileName)
149     print("MAL structure created.")
150 nino.borges 837 print("Loading full name overide values...")
151     self.__LoadFullNameOverideValues(fullNameOveridesFileName)
152     print("Full name overide values loaded.")
153 nino.borges 834 print("Creating pickle backup...")
154     self.__SaveMalToPkl(pklFileName)
155     print("Pickle backup created.")
156    
157     ## self.malPeopleList = PeopleList()
158     ##
159     ## print("Creating MAL structure...")
160     ## self.__IngestMALSpreadsheet(masterAttorneyListFileName)
161     ## print("MAL structure created.")
162     ## print("Creating pickle backup...")
163    
164    
165    
166    
167    
168     def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
169     """Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass."""
170     ## There doenst seem to be a consistent value in the "row" column in the MAL, so setting these parameters here to avoid gap issues.
171    
172 nino.borges 837 excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":10919, "beginColNumber":2, "endColNumber":17},
173     {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":572, "beginColNumber":2, "endColNumber":16}]
174 nino.borges 834
175     # excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":16},
176     # {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":15}]
177    
178     spreadsheetFileMappingMatrix = {"First Name":"first_name", "Last Name":"last_name", "Work Email":"work_email_address", "Alt Work Email":"alt_work_email_address", "Is Attorney": "is_attorney",
179 nino.borges 837 "Split Role - Attorney Capacity Date Range":"split_role_date_range", " Validated by OC??":"sidley_validated", "Category": "category", "Organization":"organization", "Job Title":"job_title",
180     "Business Title":"business_title", "Full Name (Preferred)":"full_name_preferred", "Login":"login", "Department (Fine)":"department_fine", "Addressed during CAAG":"addressed_during_caag",
181     "Last Updated":"last_updated"}
182 nino.borges 834
183     xlApp = Dispatch('Excel.Application')
184     xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
185    
186     for excelTab in excelTabParametersList:
187     sht = xlBook.Worksheets(excelTab['tabName'])
188     print(f"Ingesting sheet {excelTab['tabName']}.")
189     excelFieldPositionMatrix = {}
190     for col in range (excelTab['beginColNumber'], excelTab['endColNumber'] +1):
191     excelFieldPositionMatrix[sht.Cells(1,col).Value] = col
192     for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] +1):
193     #print(row)
194     ## TODO: Refactor the excelTabParametersList later. Didnt realize columns were not consistent.
195     if excelTab['tabName'] == 'Attorneys':
196     self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
197     split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role - Attorney Capacity Date Range']).Value,
198 nino.borges 837 sidley_validated = sht.Cells(row,excelFieldPositionMatrix[' Validated by OC?']).Value,
199 nino.borges 834 category = sht.Cells(row,excelFieldPositionMatrix['Category']).Value,
200     organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
201     last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
202     first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
203     work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
204     alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
205     job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
206     business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
207     full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
208     login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
209     department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
210 nino.borges 847 #addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Comments']).Value))
211     addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))
212 nino.borges 838 #last_updated = sht.Cells(row,excelFieldPositionMatrix['Last Updated']).Value ))
213 nino.borges 844
214     else:
215     self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
216     split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role - Attorney Capacity Date Range']).Value,
217     sidley_validated = sht.Cells(row,excelFieldPositionMatrix['Validated by OC?']).Value,
218     organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
219     last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
220     first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
221     work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
222     alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
223     job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
224     business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
225     full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
226     login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
227     department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
228     addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))
229 nino.borges 834
230     xlBook.Close()
231    
232     def __SaveMalToPkl(self, pklFileName):
233     """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
234     outputFile = open(pklFileName,'wb')
235     pickle.dump(self.malPeopleList,outputFile)
236     outputFile.close()
237    
238     def __LoadMalFromPkl(self, pklFileName):
239     """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
240     contents = open(pklFileName, 'rb')
241     obj = pickle.load(contents)
242     contents.close()
243     return obj
244    
245 nino.borges 837 def __LoadFullNameOverideValues(self, fullNameOveridesFileName):
246     """Pseudo-private method which will update the MAL people list object with the full name overide values."""
247     contents = open(fullNameOveridesFileName).readlines()
248     for line in contents:
249     line = line.replace("\n","")
250     emailAddress,fullNameOverideValue = line.split("|")
251 nino.borges 844
252 nino.borges 837 self.malPeopleList.update_full_Name_overide(emailAddress, fullNameOverideValue)
253    
254 nino.borges 844 def SmartDedupeSet(self, currentSet):
255     """Pseudo-private method that attempts to do some additional deduplication of the values in a set by lowering all values and deduplicating. Returns a lowered deduplicated set."""
256     newSet = set()
257     for val in currentSet:
258     newSet.add(val.lower())
259     return newSet
260    
261 nino.borges 834 if __name__ == '__main__':
262 nino.borges 847 pass
263     ## cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\VEAS\VEAS_Log_Data_Export_Converted.txt"
264     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
265     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\TEST-PLOG.txt"
266     ## #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
267     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
268     ## #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
269     ## #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.11.06(7045550.3).xlsx"
270     ## masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.1(7045413.15).xlsx"
271     ## #masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"
272     ## #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides.txt"
273     ## #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides - Copy.txt"
274     ## fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
275     ##
276     ##
277     ##
278     ## nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
279     ## #nv.malPeopleList.list_people()
280     ##
281     ## qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
282     ## print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
283     ## #print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
284     ## workList = qcP.metadataValuesDict.keys()
285     ## outputFile = open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w')
286     ## for docID in workList:
287     ## #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
288     ## #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
289     ## #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['ccValues']
290     ## #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['ccValues']
291     ## #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['bccValues']
292     ## #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['bccValues']
293     ## metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
294     ## formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
295     ## formattedAttorneyValues = set()
296     ## for formattedValue in formattedFieldValues:
297     ## if "*" in formattedValue:
298     ## formattedAttorneyValues.add(formattedValue.upper())
299     ##
300     ## if metadataFieldValues:
301     ## matchedMetadataValues = set()
302     ## for nameItem in metadataFieldValues:
303     ## ## First test to see if there is a valid email address.
304     ## resultSet = set()
305     ## results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
306     ## if results:
307     ## for result in results:
308     ## resultSet.add(result)
309     ## if len(resultSet) >1:
310     ## resultSet = nv.SmartDedupeSet(resultSet)
311     ## if len(resultSet) >1:
312     ## print("ERROR multiple email **unique** email addresses in one item.")
313     ## print(resultSet)
314     ## print("\n")
315     ## else:
316     ## personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
317     ## if personMatch:
318     ## if personMatch.full_name_overide:
319     ## fullName = personMatch.full_name_overide
320     ## elif personMatch.full_name_preferred:
321     ## #print(personMatch.full_name_preferred)
322     ## ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
323     ## fullPreferredName = personMatch.full_name_preferred
324     ## fullPreferredName = fullPreferredName.replace('(LEGAL)','')
325     ## fullPreferredName = fullPreferredName.replace('(SHE, HER)','')
326     ## fullPreferredName = fullPreferredName.replace('(SHE HER)','')
327     ## preferedLastName, preferedFirstName = fullPreferredName.split(',')
328     ## preferedLastName = preferedLastName.strip()
329     ## preferedFirstName = preferedFirstName.strip()
330     ## preferedFirstName = preferedFirstName.split(" ")[0]
331     ## fullName = f"{preferedFirstName} {preferedLastName}"
332     ## #fullName = f"{preferedLastName}, {preferedFirstName}"
333     ## else:
334     ## fullName = f"{personMatch.first_name} {personMatch.last_name}"
335     ## #fullName = f"{personMatch.last_name}, {personMatch.first_name}"
336     ## if personMatch.is_attorney == 'YES':
337     ## #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name}* ({personMatch.work_email_address.split('@')[-1]})\n")
338     ## matchedMetadataValues.add(f"{fullName}* ({personMatch.work_email_address.split('@')[-1]})")
339     ## else:
340     ## #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name} ({personMatch.work_email_address.split('@')[-1]})\n")
341     ## matchedMetadataValues.add(f"{fullName} ({personMatch.work_email_address.split('@')[-1]})")
342     ## else:
343     ## outputFile.write(f"{docID} contains a non-email item {nameItem}\n\n")
344     ## missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
345     ## missingFromMeta = formattedAttorneyValues - matchedMetadataValues
346     ## if missingFromFormatted:
347     ## for missingItem in missingFromFormatted:
348     ## outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
349     ## if missingFromMeta:
350     ## for missingItem in missingFromMeta:
351     ## outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
352     ## if missingFromFormatted:
353     ## outputFile.write("\n")
354     ## elif missingFromMeta:
355     ## outputFile.write("\n")
356     ## outputFile.close()
357 nino.borges 834