ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 863
Committed: Fri Dec 13 22:11:54 2024 UTC (15 months, 1 week ago) by nino.borges
Content type: text/x-python
File size: 48579 byte(s)
Log Message:
Copied, almost word for word, the first-name/last-name matching piece from the email branch (used when the email can't be matched in the MAL) into the branch where no email address could be extracted at all, since that case also needs to be matched by first name and last name.

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC but it will perform a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file but for now i'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
# Version string for this script.
version = '0.9.0'

# Module-level store of QC findings: maps a docID to the list of issue messages
# recorded for it (populated by AddToIssuesList, written out at the end of the run).
issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Build every (fullName, emailDomain) pair that may appear in the formatted field for a person.

    personMatch is the result of a person match. The attributes read here are
    work_email_address, alt_work_email_address, full_name_overide,
    full_name_preferred, last_name and first_name.

    Candidate names are produced, in this order, from:
      1. full_name_overide, used as-is.
      2. full_name_preferred ("LAST, FIRST ..."), rewritten as "FIRST LAST" after
         removing the (LEGAL) / (SHE, HER) / (SHE HER) markers.
      3. first_name + last_name (or last_name alone when there is no first name).

    Each candidate name is paired with every distinct email domain of the person.
    If the person has no email address there are no domains, so the result is empty.

    Returns a deduplicated list of (fullName, domain) tuples, first occurrence kept.
    """
    ## Every distinct domain this person has an email address under (order preserved).
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if emailAddress:
            allDomainsList.append(f"{emailAddress.split('@')[-1]}")
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Collect the candidate names first, then pair them with the domains in one place.
    candidateNamesList = []

    if personMatch.full_name_overide:
        candidateNamesList.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Remove information that is just never in the formatted field.
        fullPreferredName = personMatch.full_name_preferred
        for marker in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(marker, '')
        if "," in fullPreferredName:
            ## Split on the FIRST comma only so names with additional commas
            ## (suffixes, titles) no longer raise ValueError on unpacking.
            preferedLastName, _, remainder = fullPreferredName.partition(',')
            preferedLastName = preferedLastName.strip()
            ## Only the first token after the comma is treated as the first name.
            preferedFirstName = remainder.strip().split(" ")[0].rstrip(",")
            candidateNamesList.append(f"{preferedFirstName} {preferedLastName}")
        else:
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            candidateNamesList.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            candidateNamesList.append(f"{personMatch.last_name}")

    allPossibleVariationsList = [
        (fullName, domain)
        for fullName in candidateNamesList
        for domain in allDomainsList
    ]

    ## Now return a deduplicated list by using dict to deduplicate.
    return list(dict.fromkeys(allPossibleVariationsList))
69    
70    
def AddToIssuesList(docID,issueMessage):
    """Add a single issue to the module-level issues matrix.

    docID is the key in issuesMatrix; issueMessage is appended to that key's
    list, and the list is created on the first issue for the document.
    Returns None.
    """
    ## setdefault does the lookup-or-create in one dict operation instead of
    ## materialising and scanning a list of all keys on every call.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
77 nino.borges 855
78    
79    
def ParseCounselDate(dateText, isEndDate=False):
    """Convert one boundary of a dates_as_counsel range into a datetime.date.

    Accepts 'M/D/YYYY' and the shortened 'M/YYYY'. A shortened start date is
    taken as the first day of the month, a shortened end date as the last day
    of the month. For end dates only, the words CURRENT and PRESENT mean today.
    Raises ValueError when the text cannot be parsed.
    """
    if isEndDate and dateText in ("CURRENT", "PRESENT"):
        return datetime.date.today()
    if dateText.count("/") < 2:
        monthText, _, yearText = dateText.partition("/")
        if isEndDate:
            ## monthrange(year, month)[1] is the number of days in that month.
            dayValue = calendar.monthrange(int(yearText), int(monthText))[1]
        else:
            dayValue = 1
        dateText = f"{monthText}/{dayValue}/{yearText}"
    return datetime.datetime.strptime(dateText, '%m/%d/%Y').date()


def WasAttorneyOnDate(datesAsCounsel, documentDate):
    """Return True if documentDate falls inside any (start, end) range, boundaries included."""
    for startText, endText in datesAsCounsel:
        if ParseCounselDate(startText) <= documentDate <= ParseCounselDate(endText, isEndDate=True):
            return True
    return False


def ShouldBeStarred(personMatch, qcP, docID):
    """Decide whether this person should carry the attorney '*' on this document.

    is_attorney 'YES' -> True and 'NO' -> False. Any other value is treated as a
    split role: the document date (read from qcP.additionalValuesDict) is tested
    against the person's dates_as_counsel ranges.
    """
    if personMatch.is_attorney == 'YES':
        return True
    if personMatch.is_attorney == 'NO':
        return False
    documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
    documentDateValue = datetime.datetime.strptime(documentDateValue, '%m/%d/%Y').date()
    return WasAttorneyOnDate(personMatch.dates_as_counsel, documentDateValue)


def ReconcileWithFormatted(variations, expectStar, formattedFieldValues, confidenceLabel, sourceLabel):
    """Match a person's name variations against the formatted values of one document.

    variations            list of (fullName, domain) pairs to look for.
    expectStar            True when the correct formatted entry is 'NAME* (DOMAIN)'.
    formattedFieldValues  list of formatted values; every matched entry is REMOVED in place.
    confidenceLabel       text placed in issue messages ('High' or 'Low').
    sourceLabel           text placed in the duplicate-match warning ('EMAIL' or 'name').

    Returns (matched, issues): matched is True when any variation was found (with
    or without the right designation); issues holds one message for every entry
    that was found with the wrong designation.
    """
    matched = False
    issues = []
    for fullName, domain in variations:
        starredValue = f"{fullName}* ({domain})"
        plainValue = f"{fullName} ({domain})"
        if expectStar:
            correctValue, wrongValue, direction = starredValue, plainValue, "Upgrade"
        else:
            correctValue, wrongValue, direction = plainValue, starredValue, "downgrade"

        if correctValue in formattedFieldValues:
            foundValue, isWrongDesignation = correctValue, False
        elif wrongValue in formattedFieldValues:
            foundValue, isWrongDesignation = wrongValue, True
        else:
            continue

        if matched:
            print(f"WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE {sourceLabel} VALUE??")
        ## Found in the formatted values, so remove it; whatever is left at the end is unmatched.
        formattedFieldValues.remove(foundValue)
        matched = True
        if isWrongDesignation:
            ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
            issues.append(f"{foundValue} in To Field is {confidenceLabel} Confidence Potential {direction}")
    return matched, issues


def CleanMetadataName(val, emailText=None):
    """Upper-case a metadata value and strip the parts that are not the person's name.

    When the value contains (LEGAL), only the email text (if one is given) and
    the empty '()' it leaves behind are removed, so the (LEGAL) marker survives.
    Otherwise every parenthetical, including its contents, is removed.
    """
    val = val.upper()
    if "(LEGAL)" in val:
        if emailText:
            val = val.replace(emailText.upper(), "")
            val = val.replace("()", '')
    else:
        val = re.sub(r"\([^)]*\)", "", val)
    return val.strip()


def SplitPersonName(nameText):
    """Split a cleaned name into (firstName, lastName); return None when it cannot be split.

    'LAST, FIRST' is split on the first comma. Otherwise the text is split on the
    first space and everything after it is taken as the last name. A single
    token with neither separator returns None instead of leaving the names unbound.
    """
    if "," in nameText:
        lastName, _, firstName = nameText.partition(",")
        return firstName.strip(), lastName.strip()
    if " " in nameText:
        firstName, lastName = nameText.split(" ", 1)
        return firstName.strip(), lastName.strip()
    return None


def FindPeopleByName(malPeopleList, firstName, lastName):
    """Return the MAL people (one per distinct _id) whose last AND first name both match.

    The matching person objects themselves are returned so the caller works with
    the person that actually matched, not whichever row was iterated last.
    """
    matchesById = {}
    for candidate in malPeopleList.return_list_of_matching_values('last_name', lastName) or []:
        if candidate.first_name == firstName:
            matchesById.setdefault(candidate._id, candidate)
    return list(matchesById.values())


def ReconcileByName(docID, val, emailText, formattedFieldValues, nv, qcP):
    """Low-confidence path: match a metadata value to the MAL by first and last name.

    Used when no email address could be extracted from val, or when the extracted
    email (emailText) had no match in the MAL. Records issues via AddToIssuesList.
    """
    origVal = val.upper()
    cleanedVal = CleanMetadataName(val, emailText)
    ## Only move forward if anything still exists once the parentheticals are gone.
    if not cleanedVal:
        return
    parsedName = SplitPersonName(cleanedVal)
    if parsedName is None:
        return
    firstName, lastName = parsedName

    peopleList = FindPeopleByName(nv.malPeopleList, firstName, lastName)
    if not peopleList:
        ## TODO: Need to ask Eli whether values with no first/last name match should be flagged.
        return
    if len(peopleList) > 1:
        print(f"WARNING: more than one match in MAL for this first name and last name combo {cleanedVal}")
        ## TODO: Add support here for more than one first name last name match in MAL.
        return

    personMatch = peopleList[0]
    allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
    if not allPossibleVariationsList:
        return
    expectStar = ShouldBeStarred(personMatch, qcP, docID)
    matched, issues = ReconcileWithFormatted(allPossibleVariationsList, expectStar, formattedFieldValues, "Low", "name")
    for issueMessage in issues:
        AddToIssuesList(docID, issueMessage)
    ## Report a missing attorney once, and only when NO variation matched at all.
    if not matched and personMatch.is_attorney == 'YES':
        AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")


def ProcessMetadataValue(docID, val, formattedFieldValues, nv, qcP):
    """Reconcile one metadata value of a document against that document's formatted values.

    First tries to pull email addresses out of val and match each one in the MAL
    (high confidence). Falls back to first/last name matching (low confidence)
    when no email can be extracted or the email is not in the MAL.
    """
    results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
    if not results:
        ## No email address could be extracted from this val.
        ReconcileByName(docID, val, None, formattedFieldValues, nv, qcP)
        return

    results = nv.SmartDedupeSet(results)
    if len(results) > 1:
        print(f"WARNING: more than one unique email address found in this value: {results}")
    for result in results:
        personMatch = nv.malPeopleList.search_by_email(result.upper())
        if not personMatch:
            ## Person match, using email, not found in MAL; try by name instead.
            ReconcileByName(docID, val, result, formattedFieldValues, nv, qcP)
            continue

        allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
        matched = False
        if allPossibleVariationsList:
            expectStar = ShouldBeStarred(personMatch, qcP, docID)
            matched, issues = ReconcileWithFormatted(allPossibleVariationsList, expectStar, formattedFieldValues, "High", "EMAIL")
            for issueMessage in issues:
                AddToIssuesList(docID, issueMessage)
        ## Attorney found in metadata but missing from formatted.
        if not matched and personMatch.is_attorney == 'YES':
            AddToIssuesList(docID,f"{val} in Metadata To Field and did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")


def RunDeepNamesNormQC(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName, outputFileName):
    """Run the deep names-normalization QC and write one 'docID|issue;issue' line per flagged document."""
    nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
    qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)

    print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
    print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")

    for docID in qcP.metadataValuesDict:
        metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
        ## Formatted values are compared in upper case; this list is consumed as matches are found.
        formattedFieldValues = [xVal.upper() for xVal in qcP.formattedValuesDict[docID]._asdict()['toValues']]

        for val in metadataFieldValues:
            ProcessMetadataValue(docID, val, formattedFieldValues, nv, qcP)

        ## Anything still in the formatted list was never matched to a metadata value.
        ## Only the * values are reported because those are the potential overdesignations.
        for remainingVal in formattedFieldValues:
            if "*" in remainingVal:
                ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
                AddToIssuesList(docID,f"{remainingVal} in Formatted To Field is an attorney but couldnt be matched to any value in metadata field.")

    ## Now just unpack and write the issues, per DocID, to the output file separated by semicolon.
    with open(outputFileName,'w') as outputFile:
        for docID, issueMessages in issuesMatrix.items():
            outputFile.write(f"{docID}|{';'.join(issueMessages)}\n")


if __name__ == '__main__':
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
    masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
    fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
    outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"

    RunDeepNamesNormQC(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName, outputFileName)