ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 861
Committed: Fri Dec 13 19:50:55 2024 UTC (15 months, 1 week ago) by nino.borges
Content type: text/x-python
File size: 26233 byte(s)
Log Message:
I had to move the for loop inside the "if allPossibleVariationsList" check because I was hitting an issue where the MAL lookup returned None for some values while the code was still using the older variation variables. Doing it this way, and no longer referring to the variation variables in the matchFlag check, works better. That was a terrible bug.

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC but it will perform a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file but for now i'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
# Version string of this script.
version = "0.7.0"

# Issues collected during a run, keyed by docID; each value is the list of
# issue messages appended by AddToIssuesList and written out at the end.
issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Build every (full name, email domain) pair that may appear in the formatted field.

    personMatch is the result of a person match.  The attributes read here are
    work_email_address, alt_work_email_address, full_name_overide,
    full_name_preferred, last_name and first_name.

    Names are gathered, in this order, from:
      1. the full name override, used as-is;
      2. the preferred name, expected as "LAST, FIRST ..." and rewritten to
         "FIRST LAST" (only the first word after the comma is kept);
      3. first_name + last_name, or last_name alone when there is no first name.

    Returns a deduplicated list of (fullname, parenthetical) tuples in
    first-seen order.  The list is empty when the person has no email address,
    because every pair needs a domain for the parenthetical.
    """
    ## Collect the domain of each known email address, deduplicated in order.
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if emailAddress:
            allDomainsList.append(emailAddress.split('@')[-1])
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Collect every candidate full name before pairing it with the domains.
    candidateNames = []

    if personMatch.full_name_overide:
        candidateNames.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')
        ## Only an unambiguous "LAST, FIRST" can be parsed.  A name with several commas
        ## used to crash the tuple unpacking; it is now reported like any other
        ## unparseable name and skipped.
        if fullPreferredName.count(",") == 1:
            preferedLastName, preferedFirstName = fullPreferredName.split(',')
            preferedLastName = preferedLastName.strip()
            preferedFirstName = preferedFirstName.strip().split(" ")[0]
            candidateNames.append(f"{preferedFirstName} {preferedLastName}")
        else:
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            candidateNames.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            candidateNames.append(f"{personMatch.last_name}")

    ## Pair every name with every domain, then deduplicate by using dict to keep first-seen order.
    allPossibleVariationsList = [
        (fullName, domain)
        for fullName in candidateNames
        for domain in allDomainsList
    ]
    return list(dict.fromkeys(allPossibleVariationsList))
69    
70    
def AddToIssuesList(docID,issueMessage):
    """Record a single issue message for docID in the module-level issuesMatrix.

    The first issue for a docID creates its list; later issues are appended to
    it in call order.
    """
    ## setdefault does the lookup-or-create in one step, instead of building a
    ## list of every key just to test membership.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
77 nino.borges 855
78    
79    
def _SplitMetadataName(nameText):
    """Split a cleaned metadata name into a (firstName, lastName) tuple.

    "LAST, FIRST" is split on the first comma and both parts are stripped.
    Otherwise the text is split on the first space, with everything after that
    space taken as the last name.  Returns None when the text has neither a
    comma nor a space, so the caller can skip the value instead of reusing
    names left over from a previous value.
    """
    if "," in nameText:
        ## partition never raises, even when the text holds more than one comma.
        lastName, _, firstName = nameText.partition(",")
        return firstName.strip(), lastName.strip()
    if " " in nameText:
        firstName, lastName = nameText.split(" ", 1)
        return firstName, lastName
    return None


def _WasAttorneyOnDate(documentDate, datesAsCounsel):
    """Return True when documentDate falls inside any (start, end) counsel range.

    datesAsCounsel is an iterable of (start, end) text pairs.  Full dates are
    parsed as m/d/YYYY.  A value with fewer than two slashes is treated as
    month/year: a start is moved to the first day of that month and an end to
    its last day.  An end of "CURRENT" or "PRESENT" means today.  Both ends of
    a range are inclusive.
    """
    for startText, endText in datesAsCounsel:
        if startText.count("/") < 2:
            startText = startText.replace("/", "/1/")
        startDate = datetime.datetime.strptime(startText, '%m/%d/%Y').date()

        if endText in ("CURRENT", "PRESENT"):
            endDate = datetime.datetime.today().date()
        else:
            if endText.count("/") < 2:
                monthAndYear = endText.split("/")
                ## monthrange takes (year, month); its second item is the last day of that month.
                missingDayValue = calendar.monthrange(int(monthAndYear[1]), int(monthAndYear[0]))[1]
                endText = endText.replace("/", f"/{missingDayValue}/")
            endDate = datetime.datetime.strptime(endText, '%m/%d/%Y').date()

        if startDate <= documentDate <= endDate:
            return True
    return False


def _ReconcileVariation(docID, formattedFieldValues, variationPair, expectStar, alreadyMatched, confidence, sourceLabel):
    """Try to consume one name variation from the formatted values.

    variationPair is a (fullname, parenthetical) tuple.  Two spellings are
    looked for: "NAME* (DOMAIN)" (attorney designation) and "NAME (DOMAIN)".
    expectStar says which one is the correct designation for this person.

    If the correct spelling is present it is removed from formattedFieldValues.
    If only the other spelling is present it is removed and an issue is logged
    as a potential Upgrade (star missing) or downgrade (star not expected),
    labelled with the given confidence ("High" or "Low").  alreadyMatched only
    triggers a console warning that a second match was found for the same
    metadata value; sourceLabel is the word used in that warning.

    Returns True when either spelling was found and removed, else False.
    """
    starredValue = f"{variationPair[0]}* ({variationPair[1]})"
    plainValue = f"{variationPair[0]} ({variationPair[1]})"
    if expectStar:
        correctValue, wrongValue, verdict = starredValue, plainValue, "Upgrade"
    else:
        correctValue, wrongValue, verdict = plainValue, starredValue, "downgrade"

    if correctValue in formattedFieldValues:
        ## This variation was found in the list of formatted values, which is fine, so just remove it.
        foundValue = correctValue
        issueMessage = None
    elif wrongValue in formattedFieldValues:
        ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
        ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
        foundValue = wrongValue
        issueMessage = f"{wrongValue} in To Field is {confidence} Confidence Potential {verdict}"
    else:
        return False

    if alreadyMatched:
        print(f"WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE {sourceLabel} VALUE??")
    formattedFieldValues.remove(foundValue)
    if issueMessage:
        AddToIssuesList(docID, issueMessage)
    return True


if __name__ == '__main__':
    ## Script entry point: for every document, reconcile the metadata "To" values against the
    ## formatted "To" values using the Master Attorney List (MAL), collect issues per docID,
    ## then write them to the output file.
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
    masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
    fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
    outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"

    nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)

    qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)

    print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
    print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")

    workList = qcP.metadataValuesDict.keys()
    for docID in workList:
        metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
        formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
        ## remember to convert all values in formattedFieldValues to uppercase (perhaps eventually do some of the formatted cleaning that eli mentioned.
        ## This is a fresh list, so matched values can be removed from it as they are found.
        formattedFieldValues = [xVal.upper() for xVal in formattedFieldValues]
        ## This will change once you start iterating across all of the field values names
        currentMetadataValues = metadataFieldValues
        for val in currentMetadataValues:
            ## First try to locate an email address in this val and if found, try to find that in the MAL.
            results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
            if not results:
                ## No email address could be extracted from this val.
                AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
                continue

            ## Use some smart deduplication to remove duplicates.
            results = nv.SmartDedupeSet(results)
            for result in results:
                ## Try to find a match in the MAL by email. There shouldnt be rows with duplicative email addresses.
                personMatch = nv.malPeopleList.search_by_email(result.upper())
                if personMatch:
                    ## Person match found in MAL. Now try to match a value in the formatted field by pulling various values from the MAL.
                    ## For each of these match attempts, try using the correct designation and incorrect designation (* vs no *) and note that.
                    allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
                    matchFlag = False
                    if allPossibleVariationsList:
                        ## Decide once per person which designation is the correct one.
                        if personMatch.is_attorney == 'YES':
                            expectStar = True
                        elif personMatch.is_attorney == 'NO':
                            expectStar = False
                        else:
                            ## This means they are a split role, so the designation depends on whether the
                            ## document date is between the dates where this person was an attorney.
                            documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
                            documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
                            expectStar = _WasAttorneyOnDate(documentDateValue, personMatch.dates_as_counsel)

                        for variationPair in allPossibleVariationsList:
                            if _ReconcileVariation(docID, formattedFieldValues, variationPair, expectStar, matchFlag, "High", "EMAIL"):
                                matchFlag = True

                    ## Test the matchFlag here (outside of the for loop, and without touching any variation variables):
                    ## if it's false but the person is an attorney, FLAG AN ATTORNEY found in meta but MISSING FROM FORMATTED
                    if not matchFlag and personMatch.is_attorney == 'YES':
                        AddToIssuesList(docID,f"{val} in Metadata To Field and did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")
                    continue

                ## Person match, using email, not found in MAL. Try extracting a name from this metadata value and try matching the MAL using that.
                ## NOTE(review): val is rebound here, exactly as before, so any later email result for the same
                ## metadata value sees the cleaned text rather than the raw value.
                val = val.upper()
                ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
                if "(LEGAL)" in val:
                    ## Attempt to only remove the email parenthetical, including the now empty paren.
                    val = val.replace(result.upper(),"")
                    val = val.replace("()",'')
                else:
                    ## Remove all parentheticals, including any character in that paren, from value.
                    val = re.sub(r"\([^)]*\)","",val)
                val = val.strip()

                ## with the email address and the paren stripped out of the val, only move forward if anything still exists.
                if not val:
                    continue

                parsedName = _SplitMetadataName(val)
                if parsedName is None:
                    ## A single token cannot be split into first/last name, so there is nothing to look up.
                    continue
                firstName, lastName = parsedName

                ## With the name now parsed, try searching for all values that match on the last name.
                personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
                if not personMatchList:
                    continue

                ## Keep every candidate whose first name also matches, keyed by _id so repeated rows for one person count once.
                possiblePeopleMatchesMatrix = {}
                for candidatePerson in personMatchList:
                    if candidatePerson.first_name == firstName:
                        possiblePeopleMatchesMatrix[candidatePerson._id] = candidatePerson

                ## If the list of possible matches is just 1, we are okay doing a simple match attempt. More than 1 would
                ## need a test for conflicting designations, and none at all is not flagged.
                ## TODO: Need to ask Eli whether a value with no first+last name match needs to be flagged.
                if len(possiblePeopleMatchesMatrix) != 1:
                    continue

                ## Use the person that actually matched on first name, not whichever row the lookup returned last.
                personMatch = next(iter(possiblePeopleMatchesMatrix.values()))
                allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
                matchFlag = False
                for variationPair in allPossibleVariationsList:
                    if personMatch.is_attorney == 'YES':
                        if _ReconcileVariation(docID, formattedFieldValues, variationPair, True, matchFlag, "Low", "name"):
                            matchFlag = True
                        else:
                            ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
                            ## NOTE(review): this is logged per non-matching variation, even when another variation of the same person matches - confirm that is intended.
                            AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
                    else:
                        ## TODO: will need to split this out to include split role soon.
                        if _ReconcileVariation(docID, formattedFieldValues, variationPair, False, matchFlag, "Low", "EMAIL"):
                            matchFlag = True

        ## Since you iterated over the metadata values but didnt iterate over the formatted values, check for any remaining formatted values that exist in the list
        for val in formattedFieldValues:
            ## TODO: Confirm with Eli but we should only report these remaining values if they have a *
            ## From Eliu: the Highest risk is the * values because these are the potential overdesignations so yes but in a perfect world we would check both.
            if "*" in val:
                ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
                AddToIssuesList(docID,f"{val} in Formatted To Field is an attorney but couldnt be matched to any value in metadata field.")

    ## Now just unpack and write the issues, per DocID, to the output file separated by semicolon.
    ## The with-block closes the file even if a write fails.
    with open(outputFileName,'w') as outputFile:
        for docID, docIssues in issuesMatrix.items():
            outputFile.write(f"{docID}|{';'.join(docIssues)}\n")