ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_NamesNormQC.py
Revision: 835
Committed: Wed Nov 27 16:08:34 2024 UTC (15 months, 4 weeks ago) by nino.borges
Content type: text/x-python
File size: 16353 byte(s)
Log Message:
Added support to do the actual compares and output the issue values to a log file, further refining the logic.  Also added support for manually adjusting some of the values that were inconsistent in the MAL.

File Contents

# User Rev Content
1 nino.borges 834 """
2    
3     Amazon-NamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     11.21.2024
8    
9     This program will assist with the process of performing Names Normalization QC on the Amazon privilege logs.
10    
11     """
12    
13     import os, uuid, pickle, re
14     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
15     from dataclasses import dataclass, field
16     from typing import List, Optional
17     from collections import namedtuple
18     from win32com.client import Dispatch
19    
20    
@dataclass
class Person:
    """A single person record, as held in a PeopleList.

    All descriptive fields default to None.  After construction every field
    that holds a string is converted to upper case so that later comparisons
    can be done with plain equality.  ``_id`` is a freshly generated UUID
    unless one is supplied.
    """
    first_name: Optional[str] = None
    last_name: Optional[str] = None
    work_email_address: Optional[str] = None
    alt_work_email_address: Optional[str] = None
    _id: uuid.UUID = field(default_factory=uuid.uuid4)
    is_attorney: Optional[str] = None
    split_role_date_range: Optional[str] = None
    sidley_validated: Optional[str] = None
    category: Optional[str] = None
    organization: Optional[str] = None
    job_title: Optional[str] = None
    business_title: Optional[str] = None
    full_name_preferred: Optional[str] = None
    login: Optional[str] = None
    department_fine: Optional[str] = None
    addressed_during_caag: Optional[str] = None

    def __post_init__(self):
        """Convert all string fields to uppercase.

        Values are filled straight from spreadsheet cells elsewhere in this
        module, and a cell value is not guaranteed to be text (it can be a
        number or a date).  Only real strings are upper-cased; anything else
        is stored unchanged instead of raising AttributeError.
        """
        # Snapshot the items so the instance dict is not mutated while it is
        # being iterated.
        for attributeName, value in tuple(vars(self).items()):
            if attributeName == "_id":
                # The identifier is never case-normalised.
                continue
            if isinstance(value, str):
                setattr(self, attributeName, value.upper())
72    
@dataclass
class PeopleList:
    """A simple in-memory collection of Person records."""
    people: List[Person] = field(default_factory=list)

    def add_person(self, person: Person):
        """Append ``person`` to the end of the list."""
        self.people.append(person)

    def search_by_email(self, emailAddress: str) -> Optional[Person]:
        """Return the first person whose work email equals ``emailAddress``.

        Person stores its email addresses in upper case, so the query is
        upper-cased here as well; callers no longer have to remember to do it
        (callers that already pass upper case are unaffected).  An empty or
        None query returns None rather than matching a person who simply has
        no work email recorded.  Only ``work_email_address`` is consulted.
        """
        if not emailAddress:
            return None
        wantedAddress = emailAddress.upper()
        for person in self.people:
            if person.work_email_address == wantedAddress:
                return person
        return None

    def list_people(self):
        """Print every person in the list, one per line."""
        for person in self.people:
            print(person)
91    
92    
class NamesVerification(object):
    """A class for automating the process of performing QC on the names within the Amazon privilege logs."""
    version = '0.2.0'

    def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName, forceNewPklFile = False, Encoding = 'UTF8'):
        """Initializes the data structures.

        cleanedDatExportFileName should be the full path to the file.
        Assumes the first row of the data file is the header and first column is DocID.
        Assumes the MAL is a spreadsheet (for now).
        MAL gets saved to a pkl file for performance reasons.  The pkl (same
        path as the MAL, with a .pkl extension) will be used unless
        forceNewPklFile is set to true or the pkl does not exist yet.

        cleanedDatExportFileName and Encoding are accepted but are not read
        anywhere in this class as it stands.
        """
        pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"

        print("Initializing data structures...")
        if not forceNewPklFile and os.path.exists(pklFileName):
            print("Loading MAL structure from pickle file...")
            self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
            print("MAL structure loaded.")
        else:
            if not forceNewPklFile:
                print("Pickle file doesnt exist.")
            self.__BuildMalAndPickle(masterAttorneyListFileName, pklFileName)

    def __BuildMalAndPickle(self, masterAttorneyListFileName, pklFileName):
        """Pseudo-private method which will build the MAL people list from the spreadsheet and then save it to the pkl file."""
        print("Creating MAL structure...")
        self.malPeopleList = PeopleList()
        self.__IngestMALSpreadsheet(masterAttorneyListFileName)
        print("MAL structure created.")
        print("Creating pickle backup...")
        self.__SaveMalToPkl(pklFileName)
        print("Pickle backup created.")

    def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
        """Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass.

        Excel is driven through COM (win32com Dispatch).  The workbook is
        closed even when ingestion fails part-way through.
        """
        ## There doesnt seem to be a consistent value in the "row" column in the MAL, so setting these parameters here to avoid gap issues.
        excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":10909, "beginColNumber":2, "endColNumber":16},
                                  {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":565, "beginColNumber":2, "endColNumber":15}]

        ## Header text in row 1 of the sheet -> Person attribute that receives the cell value.
        spreadsheetFileMappingMatrix = {"First Name":"first_name", "Last Name":"last_name", "Work Email":"work_email_address", "Alt Work Email":"alt_work_email_address", "Is Attorney": "is_attorney",
                                        "Split Role - Attorney Capacity Date Range":"split_role_date_range", "Sidley Validated?":"sidley_validated", "Category": "category", "Organization":"organization", "Job Title":"job_title",
                                        "Business Title":"business_title", "Full Name (Preferred)":"full_name_preferred", "Login":"login", "Department (Fine)":"department_fine", "Addressed during CAAG":"addressed_during_caag"}

        xlApp = Dispatch('Excel.Application')
        xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
        try:
            for excelTab in excelTabParametersList:
                sht = xlBook.Worksheets(excelTab['tabName'])
                print(f"Ingesting sheet {excelTab['tabName']}.")

                ## TODO: Refactor the excelTabParametersList later. Didnt realize columns were not consistent.
                ## NOTE(review): only the Attorneys tab is turned into Person objects; no handling for
                ## the other tab is present in this revision, so its rows are not walked at all.
                if excelTab['tabName'] != 'Attorneys':
                    continue

                ## Map each header found in row 1 to its column number.
                excelFieldPositionMatrix = {}
                for col in range(excelTab['beginColNumber'], excelTab['endColNumber'] + 1):
                    excelFieldPositionMatrix[sht.Cells(1, col).Value] = col

                for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] + 1):
                    personValues = {}
                    for headerText, attributeName in spreadsheetFileMappingMatrix.items():
                        personValues[attributeName] = sht.Cells(row, excelFieldPositionMatrix[headerText]).Value
                    self.malPeopleList.add_person(Person(**personValues))
        finally:
            xlBook.Close()

    def __SaveMalToPkl(self, pklFileName):
        """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
        with open(pklFileName, 'wb') as outputFile:
            pickle.dump(self.malPeopleList, outputFile)

    def __LoadMalFromPkl(self, pklFileName):
        """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
        ## NOTE: unpickling can execute arbitrary code, so only load pkl files written by this program.
        with open(pklFileName, 'rb') as contents:
            return pickle.load(contents)
198    
199    
200     if __name__ == '__main__':
201     #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
202     cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
203 nino.borges 835 #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
204 nino.borges 834 masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.11.06(7045550.3).xlsx"
205     #masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"
206    
207    
208    
209     nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName)
210     #nv.malPeopleList.list_people()
211    
212     qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
213 nino.borges 835 #print(nv.malPeopleList.search_by_email('joyshine@amazon.com'.upper()))
214 nino.borges 834 workList = qcP.metadataValuesDict.keys()
215     outputFile = open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w')
216     for docID in workList:
217     metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
218     formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
219 nino.borges 835 formattedAttorneyValues = set()
220     for formattedValue in formattedFieldValues:
221     if "*" in formattedValue:
222     formattedAttorneyValues.add(formattedValue.upper())
223 nino.borges 834
224     if metadataFieldValues:
225 nino.borges 835 matchedMetadataValues = set()
226 nino.borges 834 for nameItem in metadataFieldValues:
227     ## First test to see if there is a valid email address.
228     resultSet = set()
229     results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
230     if results:
231     for result in results:
232     resultSet.add(result)
233     if len(resultSet) >1:
234     print("ERROR multiple email unique email addresses in one item.")
235     else:
236     personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
237     if personMatch:
238 nino.borges 835 if personMatch.full_name_preferred:
239     ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
240     fullPreferredName = personMatch.full_name_preferred
241     fullPreferredName = fullPreferredName.replace('(LEGAL)','')
242     fullPreferredName = fullPreferredName.replace('(SHE, HER)','')
243     fullPreferredName = fullPreferredName.replace('(SHE HER)','')
244     preferedLastName, preferedFirstName = fullPreferredName.split(',')
245     preferedLastName = preferedLastName.strip()
246     preferedFirstName = preferedFirstName.strip()
247     preferedFirstName = preferedFirstName.split(" ")[0]
248     fullName = f"{preferedFirstName} {preferedLastName}"
249     else:
250     fullName = f"{personMatch.first_name} {personMatch.last_name}"
251 nino.borges 834 if personMatch.is_attorney == 'YES':
252 nino.borges 835 #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name}* ({personMatch.work_email_address.split('@')[-1]})\n")
253     matchedMetadataValues.add(f"{fullName}* ({personMatch.work_email_address.split('@')[-1]})")
254 nino.borges 834 else:
255 nino.borges 835 #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name} ({personMatch.work_email_address.split('@')[-1]})\n")
256     matchedMetadataValues.add(f"{fullName} ({personMatch.work_email_address.split('@')[-1]})")
257 nino.borges 834 else:
258     outputFile.write(f"{docID} contains a non-email item {nameItem}\n")
259 nino.borges 835 missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
260     missingFromMeta = formattedAttorneyValues - matchedMetadataValues
261     if missingFromFormatted:
262     for missingItem in missingFromFormatted:
263     outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
264     if missingFromMeta:
265     for missingItem in missingFromMeta:
266     outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
267     if missingFromFormatted:
268     outputFile.write("\n")
269     elif missingFromMeta:
270     outputFile.write("\n")
271 nino.borges 834 outputFile.close()
272    
273    
274    
275    
276     ## people_list = PeopleList()
277     ## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "fooBar@gmail.com", altWorkEmailAddress = ""))
278     ## people_list.add_person(Person(firstName = "Gary", lastName = "Cooper", workEmailAddress = "", altWorkEmailAddress = "spam.eggs@hotmail.com"))
279     ## people_list.add_person(Person(firstName = "", lastName = "", workEmailAddress = "noname@gmail.com", altWorkEmailAddress = ""))
280     ## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "eggs@outlook.com", altWorkEmailAddress = ""))
281     ## print("\nAll People:")
282     ## people_list.list_people()
283     ## print("\nSearching...")
284     ## result = people_list.search_by_email('fooBar@gmail.com')
285     ## print(result if result else "email not found.")