ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_NamesNormQC.py
Revision: 835
Committed: Wed Nov 27 16:08:34 2024 UTC (15 months, 4 weeks ago) by nino.borges
Content type: text/x-python
File size: 16353 byte(s)
Log Message:
Added support for performing the actual comparisons and writing the issue values to a log file, further refining the logic.  Also added support for manually adjusting some of the values where the MAL is inconsistent.

File Contents

# Content
1 """
2
3 Amazon-NamesNormQC
4
5 Created by:
6 Emanuel Borges
7 11.21.2024
8
9 This program will assist with the process of performing Names Normalization QC on the Amazon privilege logs.
10
11 """
12
13 import os, uuid, pickle, re
14 import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
15 from dataclasses import dataclass, field
16 from typing import List, Optional
17 from collections import namedtuple
18 from win32com.client import Dispatch
19
20
@dataclass
class Person:
    """A single person record as read from the Master Attorney List (MAL).

    Every attribute is optional and defaults to None, except _id which gets a
    freshly generated UUID per instance.  String attributes are normalized to
    uppercase right after construction (see __post_init__), so lookups against
    a Person should use uppercase values.
    """
    first_name: Optional[str] = None
    last_name: Optional[str] = None
    work_email_address: Optional[str] = None
    alt_work_email_address: Optional[str] = None
    _id: uuid.UUID = field(default_factory=uuid.uuid4)
    is_attorney: Optional[str] = None
    split_role_date_range: Optional[str] = None
    sidley_validated: Optional[str] = None
    category: Optional[str] = None
    organization: Optional[str] = None
    job_title: Optional[str] = None
    business_title: Optional[str] = None
    full_name_preferred: Optional[str] = None
    login: Optional[str] = None
    department_fine: Optional[str] = None
    addressed_during_caag: Optional[str] = None

    def __post_init__(self):
        """Convert all string fields to uppercase.

        Only str values are touched.  None, the UUID in _id, and any other
        non-string value (for example a numeric cell value handed over by the
        spreadsheet ingest) are left as they are instead of raising
        AttributeError on a missing .upper().
        """
        ## Snapshot the items first so the attribute dict is not being iterated while it is updated.
        for fieldName, fieldValue in list(vars(self).items()):
            if isinstance(fieldValue, str):
                setattr(self, fieldName, fieldValue.upper())
72
@dataclass
class PeopleList:
    """An in-memory collection of Person records with a simple email lookup."""
    ## Forward-reference strings keep the annotations lazy; they do not change runtime behavior.
    people: List["Person"] = field(default_factory=list)

    def add_person(self, person: "Person"):
        """Append person to the end of the list."""
        self.people.append(person)

    def search_by_email(self, emailAddress: str) -> Optional["Person"]:
        """Return the first person whose work_email_address equals emailAddress, or None.

        The comparison is case-insensitive on the query side: Person uppercases its
        email addresses on construction, so the query is uppercased here as well.
        A blank or None query never matches anyone (it would otherwise "find" the
        first person that simply has no work email on file).
        Only work_email_address is checked; alt_work_email_address is not searched.
        """
        if not emailAddress:
            return None
        wantedAddress = emailAddress.upper()
        for person in self.people:
            if person.work_email_address == wantedAddress:
                return person
        return None

    def list_people(self):
        """Print every person in the list, one per line."""
        for person in self.people:
            print(person)
91
92
class NamesVerification(object):
    """A class for automating the process of performing QC on the names within the Amazon privilege logs."""
    version = '0.2.0'

    def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName, forceNewPklFile = False, Encoding = 'UTF8'):
        """Initializes the data structures.  cleanedDatExportFileName should be the full path to the file.
        Assumes the first row of the data file is the header and first column is DocID.
        Assumes the MAL is a spreadsheet (for now).
        MAL gets saved to a pkl file for performance reasons.  pkl will be used unless forceNewPklFile is set to true.

        The pkl file lives next to the MAL, with the same base name and a .pkl extension.
        cleanedDatExportFileName and Encoding are accepted but not used by this constructor yet.
        The resulting people list is available as self.malPeopleList."""
        pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"

        print("Initializing data structures...")
        if not forceNewPklFile:
            if os.path.exists(pklFileName):
                print("Loading MAL structure from pickle file...")
                self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
                print("MAL structure loaded.")
                return
            print("Pickle file doesnt exist.")
        self.__BuildMalAndPickle(masterAttorneyListFileName, pklFileName)

    def __BuildMalAndPickle(self, masterAttorneyListFileName, pklFileName):
        """Pseudo-private method which will build the MAL people list from the spreadsheet and then back it up to the pkl file."""
        print("Creating MAL structure...")
        self.malPeopleList = PeopleList()
        self.__IngestMALSpreadsheet(masterAttorneyListFileName)
        print("MAL structure created.")
        print("Creating pickle backup...")
        self.__SaveMalToPkl(pklFileName)
        print("Pickle backup created.")

    def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
        """Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass.

        Column positions are looked up by the header text in row 1 of each tab, so a missing
        header raises KeyError.  The workbook is closed even if the ingest fails part way."""
        ## There doesnt seem to be a consistent value in the "row" column in the MAL, so setting these parameters here to avoid gap issues.
        excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":10909, "beginColNumber":2, "endColNumber":16},
                                  {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":565, "beginColNumber":2, "endColNumber":15}]

        ## Spreadsheet header text -> Person attribute name.
        spreadsheetFileMappingMatrix = {"First Name":"first_name", "Last Name":"last_name", "Work Email":"work_email_address", "Alt Work Email":"alt_work_email_address", "Is Attorney": "is_attorney",
                                        "Split Role - Attorney Capacity Date Range":"split_role_date_range", "Sidley Validated?":"sidley_validated", "Category": "category", "Organization":"organization", "Job Title":"job_title",
                                        "Business Title":"business_title", "Full Name (Preferred)":"full_name_preferred", "Login":"login", "Department (Fine)":"department_fine", "Addressed during CAAG":"addressed_during_caag"}

        xlApp = Dispatch('Excel.Application')
        xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
        try:
            for excelTab in excelTabParametersList:
                sht = xlBook.Worksheets(excelTab['tabName'])
                print(f"Ingesting sheet {excelTab['tabName']}.")
                excelFieldPositionMatrix = {}
                for col in range(excelTab['beginColNumber'], excelTab['endColNumber'] + 1):
                    excelFieldPositionMatrix[sht.Cells(1, col).Value] = col

                ## TODO: Refactor the excelTabParametersList later.  Didnt realize columns were not consistent.
                ## Only the Attorneys tab is turned into Person records for now; other tabs are opened and
                ## their headers read, but their rows are not ingested.
                if excelTab['tabName'] != 'Attorneys':
                    continue

                for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] + 1):
                    personValues = {attributeName: sht.Cells(row, excelFieldPositionMatrix[headerName]).Value
                                    for headerName, attributeName in spreadsheetFileMappingMatrix.items()}
                    self.malPeopleList.add_person(Person(**personValues))
        finally:
            xlBook.Close()

    def __SaveMalToPkl(self, pklFileName):
        """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
        with open(pklFileName, 'wb') as outputFile:
            pickle.dump(self.malPeopleList, outputFile)

    def __LoadMalFromPkl(self, pklFileName):
        """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
        ## NOTE: pickle.load will execute whatever the file tells it to, so only point this at pkl files this program wrote.
        with open(pklFileName, 'rb') as contents:
            return pickle.load(contents)
198
199
def preferred_display_name(personMatch):
    """Return the "FIRST LAST" name used when comparing a MAL person against the formatted field.

    When full_name_preferred is set and, after cleanup, has the shape "LAST, FIRST ...",
    the result is the first word after the comma followed by the text before the comma.
    In every other case (no preferred name, or not exactly one comma) the result falls back
    to first_name and last_name instead of raising on the unpack."""
    fullPreferredName = personMatch.full_name_preferred
    if fullPreferredName:
        ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        ## '(SHE, HER)' has to be removed before splitting because it carries a comma of its own.
        for marker in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(marker, '')
        nameParts = fullPreferredName.split(',')
        if len(nameParts) == 2:
            preferedLastName = nameParts[0].strip()
            preferedFirstName = nameParts[1].strip().split(" ")[0]
            return f"{preferedFirstName} {preferedLastName}"
    return f"{personMatch.first_name} {personMatch.last_name}"


def expected_formatted_value(personMatch):
    """Return the value expected in the formatted field for personMatch: "NAME (DOMAIN)", with a * after the name for attorneys."""
    attorneyMarker = '*' if personMatch.is_attorney == 'YES' else ''
    emailDomain = personMatch.work_email_address.split('@')[-1]
    return f"{preferred_display_name(personMatch)}{attorneyMarker} ({emailDomain})"


if __name__ == '__main__':
    #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
    masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.11.06(7045550.3).xlsx"
    #masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"

    nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName)
    #nv.malPeopleList.list_people()

    qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
    #print(nv.malPeopleList.search_by_email('joyshine@amazon.com'.upper()))
    workList = qcP.metadataValuesDict.keys()

    ## The with block guarantees the log is flushed and closed even if a document blows up part way through.
    with open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w') as outputFile:
        for docID in workList:
            metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
            formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
            ## Only formatted values carrying the * marker take part in the comparison.
            formattedAttorneyValues = {formattedValue.upper() for formattedValue in formattedFieldValues if "*" in formattedValue}

            ## NOTE(review): the nesting below was reconstructed; the comparison is assumed to run once per
            ## document and only when the document has metadata values -- confirm against the original.
            if not metadataFieldValues:
                continue

            matchedMetadataValues = set()
            for nameItem in metadataFieldValues:
                ## First test to see if there is a valid email address.
                resultSet = set(re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem))
                if not resultSet:
                    outputFile.write(f"{docID} contains a non-email item {nameItem}\n")
                    continue
                if len(resultSet) > 1:
                    print("ERROR multiple email unique email addresses in one item.")
                    continue
                personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
                if personMatch:
                    matchedMetadataValues.add(expected_formatted_value(personMatch))

            missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
            missingFromMeta = formattedAttorneyValues - matchedMetadataValues
            for missingItem in missingFromFormatted:
                outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
            for missingItem in missingFromMeta:
                outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
            ## Blank line between documents that had at least one finding.
            if missingFromFormatted or missingFromMeta:
                outputFile.write("\n")
272
273
274
275
276 ## people_list = PeopleList()
277 ## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "fooBar@gmail.com", altWorkEmailAddress = ""))
278 ## people_list.add_person(Person(firstName = "Gary", lastName = "Cooper", workEmailAddress = "", altWorkEmailAddress = "spam.eggs@hotmail.com"))
279 ## people_list.add_person(Person(firstName = "", lastName = "", workEmailAddress = "noname@gmail.com", altWorkEmailAddress = ""))
280 ## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "eggs@outlook.com", altWorkEmailAddress = ""))
281 ## print("\nAll People:")
282 ## people_list.list_people()
283 ## print("\nSearching...")
284 ## result = people_list.search_by_email('fooBar@gmail.com')
285 ## print(result if result else "email not found.")