ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 860
Committed: Fri Dec 13 18:22:14 2024 UTC (15 months, 1 week ago) by nino.borges
Content type: text/x-python
File size: 25483 byte(s)
Log Message:
This version finishes the evaluation of the split-role case in the email address match section and reports on the results by adding an issue.  Also added a part at the end of the email address match section to catch the case where I can match an email address in the metadata and the person is an attorney, but there is no matching value in the formatted field.  This is a potential missing attorney, and it is a high-confidence flag because I had the email address.

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC but it will perform a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file but for now i'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
19 nino.borges 860 version = '0.7.0'
20 nino.borges 855
21     issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Build every (fullname, parenthetical) pair that could represent this person in the formatted field.

    personMatch is the result of a person match.  This function reads its
    work_email_address, alt_work_email_address, full_name_overide,
    full_name_preferred, last_name and first_name attributes.

    The parenthetical is the domain portion (text after the last '@') of each
    email address on the record.  The candidate full names are, in order: the
    full name override, "FIRST LAST" derived from the "LAST, FIRST ..." preferred
    name, and "FIRST LAST" (or just LAST) from the individual name fields.

    A preferred name that does not contain exactly one comma cannot be split
    into last/first reliably, so it is reported with an ERROR print and skipped
    instead of raising and stopping the whole run.

    returns deduplicated list of tuple pairs (fullname, parenthetical), in the order described above."""
    ## Collect the distinct email domains, keeping first-seen order.
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if emailAddress:
            allDomainsList.append(emailAddress.split('@')[-1])
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Collect the candidate full names, in priority order.
    candidateNamesList = []
    if personMatch.full_name_overide:
        candidateNamesList.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Remove information that is just never in the formatted field.  This has to happen before
        ## the comma test below because '(SHE, HER)' carries a comma of its own.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')
        if fullPreferredName.count(",") == 1:
            preferedLastName, preferedFirstName = fullPreferredName.split(',')
            preferedLastName = preferedLastName.strip()
            ## Only the first word after the comma is kept as the first name.
            preferedFirstName = preferedFirstName.strip().split(" ")[0]
            candidateNamesList.append(f"{preferedFirstName} {preferedLastName}")
        else:
            ## Either no comma at all or more than one; neither can be parsed as "LAST, FIRST".
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            candidateNamesList.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            candidateNamesList.append(f"{personMatch.last_name}")

    ## Pair every candidate name with every domain, then deduplicate by using dict (keeps order).
    allPossibleVariationsList = [(fullName, domain) for fullName in candidateNamesList for domain in allDomainsList]
    return list(dict.fromkeys(allPossibleVariationsList))
69    
70    
def AddToIssuesList(docID,issueMessage):
    """This function will add a single issue to the issues matrix.

    issuesMatrix is the module-level dict of docID -> list of issue messages.
    The list for a docID is created the first time that docID is seen and the
    messages are kept in the order they were added."""
    ## setdefault does the lookup-or-create in one dict operation, so there is no need to copy and scan the keys.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
77 nino.borges 855
78    
79    
def WasAttorneyOnDate(documentDate, datesAsCounsel):
    """Return True if documentDate falls inside any of the (start, end) ranges in datesAsCounsel.

    documentDate is a datetime.date.  datesAsCounsel is an iterable of
    (startText, endText) string pairs in either m/d/Y or m/Y form.  A m/Y start
    is taken as the first day of that month and a m/Y end as the last day of
    that month.  An end of "CURRENT" or "PRESENT" means today.  Both ends of a
    range are inclusive.

    Raises ValueError (from strptime/int) if a date string is not in one of
    those forms."""
    for startText, endText in datesAsCounsel:
        if startText.count("/") < 2:
            startText = startText.replace("/", "/1/")
        startDate = datetime.datetime.strptime(startText, '%m/%d/%Y').date()

        if endText in ("CURRENT", "PRESENT"):
            endText = datetime.datetime.today().strftime('%m/%d/%Y')
        if endText.count("/") < 2:
            ## monthrange takes (year, month) and its second item is the number of days in that month.
            endParts = endText.split("/")
            lastDayOfMonth = calendar.monthrange(int(endParts[1]), int(endParts[0]))[1]
            endText = endText.replace("/", f"/{lastDayOfMonth}/")
        endDate = datetime.datetime.strptime(endText, '%m/%d/%Y').date()

        if startDate <= documentDate <= endDate:
            return True
    return False


def SplitMetadataName(nameValue):
    """Split a cleaned metadata name into (firstName, lastName), or return None if it cannot be split.

    "LAST, FIRST" is split on the first comma and both parts are stripped.
    Otherwise the value is split on the first space: the text before it is the
    first name and everything after it is the last name.  A value with neither
    a comma nor a space returns None."""
    if "," in nameValue:
        lastName, firstName = nameValue.split(",", 1)
        return firstName.strip(), lastName.strip()
    if " " in nameValue:
        firstName, lastName = nameValue.split(" ", 1)
        return firstName, lastName
    return None


def ReconcileVariationsWithFormatted(docID, variationsList, expectAttorney, formattedFieldValues, confidenceLabel, sourceLabel):
    """Try each (fullname, parenthetical) variation against the formatted values and consume any that match.

    For an expected attorney the correct formatted value is "NAME* (DOMAIN)" and
    "NAME (DOMAIN)" is a potential upgrade; for a non-attorney it is the other
    way around and the starred value is a potential downgrade.  Every matched
    value is removed from formattedFieldValues (the list is modified in place).
    A wrong-designation match is also reported through AddToIssuesList using
    confidenceLabel ("High" or "Low").  sourceLabel only affects the console
    warning printed when more than one variation matches.

    Returns True if at least one variation matched, otherwise False."""
    matchFlag = False
    for fullName, domain in variationsList:
        starredValue = f"{fullName}* ({domain})"
        plainValue = f"{fullName} ({domain})"
        if expectAttorney:
            correctValue, wrongValue, wrongKind = starredValue, plainValue, "Upgrade"
        else:
            correctValue, wrongValue, wrongKind = plainValue, starredValue, "downgrade"

        ## The correctly designated value is checked first, exactly as an if/elif would.
        if correctValue in formattedFieldValues:
            foundValue, isBadMatch = correctValue, False
        elif wrongValue in formattedFieldValues:
            foundValue, isBadMatch = wrongValue, True
        else:
            continue

        if matchFlag:
            print(f"WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE {sourceLabel} VALUE??")
        formattedFieldValues.remove(foundValue)
        matchFlag = True
        if isBadMatch:
            ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
            AddToIssuesList(docID, f"{foundValue} in To Field is {confidenceLabel} Confidence Potential {wrongKind}")
    return matchFlag


if __name__ == '__main__':
    ## Deep names norm QC of the "to" values: for every document, each metadata value is matched to a
    ## person in the master attorney list (MAL), by email address first and by name as a fallback, and
    ## that person's possible formatted representations are checked against the formatted field.
    ## Issues are collected per DocID through AddToIssuesList and written out at the end.
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
    masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
    fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
    outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"

    nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)

    qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)

    print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
    print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")

    workList = qcP.metadataValuesDict.keys()
    for docID in workList:
        ## This will change once you start itterating across all of the field values names
        metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
        ## The formatted values are compared in uppercase.  Matched values get removed from this list as
        ## we go, so whatever is left at the end was never matched to a metadata value.
        formattedFieldValues = [xVal.upper() for xVal in qcP.formattedValuesDict[docID]._asdict()['toValues']]

        for val in metadataFieldValues:
            ## First try to locate an email address in this val and if found, try to find that in the MAL.
            results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
            if not results:
                ## No email address could be extracted from this val.
                AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
                continue

            ## Use some smart deduplication to remove duplicates.
            results = nv.SmartDedupeSet(results)
            ## The name fallback below strips text out of the value; that stripping carries over from one
            ## email address to the next within the same value.
            nameText = val
            for result in results:
                ## Try to find a match in the MAL by email (work email and alt email).
                personMatch = nv.malPeopleList.search_by_email(result.upper())
                if personMatch:
                    ## Person match found in MAL. Now try to match a value in the formatted field.
                    allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
                    if personMatch.is_attorney == 'YES':
                        expectAttorney = True
                    elif personMatch.is_attorney == 'NO':
                        expectAttorney = False
                    elif allPossibleVariationsList:
                        ## Split role: the designation depends on whether the document date falls inside one of
                        ## the date ranges where this person was counsel.  Worked out once per person, and only
                        ## when there is something to compare, so the date is not parsed needlessly.
                        documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
                        documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
                        expectAttorney = WasAttorneyOnDate(documentDateValue, personMatch.dates_as_counsel)
                    else:
                        expectAttorney = False

                    matchFlag = ReconcileVariationsWithFormatted(docID, allPossibleVariationsList, expectAttorney, formattedFieldValues, "High", "EMAIL")

                    ## Attorney found in metadata by email but MISSING FROM FORMATTED.
                    if not matchFlag and personMatch.is_attorney == 'YES':
                        if allPossibleVariationsList:
                            ## Report using the last variation that was tried.
                            missingLabel = f"{allPossibleVariationsList[-1][0]} ({allPossibleVariationsList[-1][1]})"
                        else:
                            ## Nothing could be built from the MAL row, so fall back to the email address itself.
                            missingLabel = result.upper()
                        AddToIssuesList(docID,f"{missingLabel} in Metadata To Field and did not directly match value in formatted however is a HIGH Confidence Potential Attorney")
                    continue

                ## Person match, using email, not found in MAL. Try extracting a name from this metadata value and try matching the MAL using that.
                nameText = nameText.upper()
                ## Deal with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
                if "(LEGAL)" in nameText:
                    ## Attempt to only remove the email parenthetical, including the now empty paren.
                    nameText = nameText.replace(result.upper(),"")
                    nameText = nameText.replace("()",'')
                else:
                    ## Remove all parenthicals, including any character in that paren, from value.
                    nameText = re.sub(r"\([^)]*\)","",nameText)
                nameText = nameText.strip()

                ## With the email address and the paren stripped out, only move forward if anything still exists
                ## and it can be split into a first and last name.
                if not nameText:
                    continue
                parsedName = SplitMetadataName(nameText)
                if parsedName is None:
                    continue
                firstName, lastName = parsedName

                ## With the name now parsed, search for all MAL rows that match on the last name and keep the
                ## ones that also match on first name, keyed by _id.
                personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
                if not personMatchList:
                    continue
                possiblePeopleMatchesMatrix = {}
                for candidate in personMatchList:
                    if candidate.first_name == firstName:
                        possiblePeopleMatchesMatrix[candidate._id] = candidate

                ## Only a single possible person is handled with a simple match attempt.
                ## TODO: with more than 1 we need to test for conflicting designations in the list of possible matches.
                ## TODO: Need to ask Eli whether a name that matches nobody in the MAL should be flagged.
                if len(possiblePeopleMatchesMatrix) != 1:
                    continue
                ## Use the row that actually matched on first name, not whichever row the search listed last.
                matchedPerson = next(iter(possiblePeopleMatchesMatrix.values()))

                allPossibleVariationsList = GatherAllPossibleVariations(matchedPerson)
                ## TODO: will need to split this out to include split role soon. For now anything other than YES is treated as non attorney.
                matchedIsAttorney = matchedPerson.is_attorney == 'YES'
                matchFlag = ReconcileVariationsWithFormatted(docID, allPossibleVariationsList, matchedIsAttorney, formattedFieldValues, "Low", "name")

                ## It failed to match a value at all in the formatted field HOWEVER, this is an attorney so flag it
                ## once as a low confidence flag.
                if matchedIsAttorney and not matchFlag:
                    if allPossibleVariationsList:
                        missingLabel = f"{allPossibleVariationsList[-1][0]} ({allPossibleVariationsList[-1][1]})"
                    else:
                        ## Nothing could be built from the MAL row, so fall back to the cleaned metadata name.
                        missingLabel = nameText
                    AddToIssuesList(docID,f"{missingLabel} in To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")

        ## Since you itterated over the metadata values but didnt itterate over the formatted values, check for any remaining formatted values that exist in the list
        for val in formattedFieldValues:
            ## From Eliu: the Highest risk is the * values because these are the potential overdesignations, so only
            ## those are reported for now. In a perfect world we would check both.
            if "*" in val:
                ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
                AddToIssuesList(docID,f"{val} in To Field is an attorney but couldnt be matched to any metadata value.")

    ## Now just unpack and write the issues, per DocID, to the output file separated by semicolon.
    with open(outputFileName,'w') as outputFile:
        for docID, docIssues in issuesMatrix.items():
            outputFile.write(f"{docID}|{';'.join(docIssues)}\n")