ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 864
Committed: Fri Dec 13 22:42:46 2024 UTC (15 months, 1 week ago) by nino.borges
Content type: text/x-python
File size: 49778 byte(s)
Log Message:
Fixed an annoying bug where, although I was checking to make sure the first name and last name both exist in the person I'm about to test when I need to match first name and last name only, I was looking at the full list of matching last names, so the report was wrong.

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC but it will perform a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file but for now I'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
19 nino.borges 864 version = '0.9.1'
20 nino.borges 855
21     issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Takes a personMatch, which is the results of a person match, and attempts to make all possible name match variations that may exist in the formatted field.

    The attributes read from personMatch are work_email_address, alt_work_email_address,
    full_name_overide, full_name_preferred, first_name and last_name. Any of them may be empty.

    returns deduplicated list of tuple pairs (fullname, parenthetical), in first-seen order, where the
    parenthetical is the text after the last '@' of one of the person's email addresses. If the person
    has no email address at all, no pairs can be built and the list is empty.
    """
    ## Gather the domain of each email address that exists, deduplicating while keeping order.
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if emailAddress:
            allDomainsList.append(emailAddress.split('@')[-1])
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Collect every candidate full name, in priority order: override, preferred, then first + last.
    candidateNamesList = []

    if personMatch.full_name_overide:
        candidateNamesList.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        ## Note '(SHE, HER)' holds a comma, so it must be removed before the name is split on commas.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')

        ## Only a "LAST, FIRST" shape (exactly one comma) can be parsed. Anything else, including a name with
        ## extra commas that used to raise ValueError on unpacking, is reported and skipped.
        nameParts = fullPreferredName.split(',')
        if len(nameParts) == 2:
            preferedLastName = nameParts[0].strip()
            ## Keep only the first word of the first name portion (drops middle names and trailing text).
            preferedFirstName = nameParts[1].strip().split(" ")[0]
            candidateNamesList.append(f"{preferedFirstName} {preferedLastName}")
        else:
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            candidateNamesList.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            candidateNamesList.append(f"{personMatch.last_name}")

    ## Pair every candidate name with every domain, then return a deduplicated list by using dict to deduplicate.
    allPossibleVariationsList = [(fullName, domain) for fullName in candidateNamesList for domain in allDomainsList]
    return list(dict.fromkeys(allPossibleVariationsList))
69    
70    
def AddToIssuesList(docID,issueMessage):
    """This function will add a single issue to the issues matrix.

    docID is the key in the module-level issuesMatrix dictionary and issueMessage is appended to the
    list of issues held under that key. The list is created the first time a docID is seen.
    """
    ## setdefault does the lookup directly on the dict, instead of building a list of every key on each call.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
77 nino.borges 855
78    
79    
80     if __name__ == '__main__':
81     cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
82     masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
83     fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
84     outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"
85    
86    
87     nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
88    
89     qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
90    
91     #issuesMatrix = {}
92    
93     print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
94     print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")
95    
96     workList = qcP.metadataValuesDict.keys()
97     for docID in workList:
98     metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
99     formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
100 nino.borges 863 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['docAuthor']
101     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['docAuthor']
102 nino.borges 855 ## remember to convert all values in formattedFieldValues to uppercase (perhaps eventually do some of the formatted cleaning that eli mentioned.
103     formattedFieldValues = [xVal.upper() for xVal in formattedFieldValues]
104     ## This will change once you start itterating acroll all of the field values names
105     currentMetadataValues = metadataFieldValues
106     for val in currentMetadataValues:
107     ## First try to locate an email address in this val and if found, try to find that in the MAL.
108     results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
109     if results:
110     ## Use some smart deduplication to remove duplicates.
111     results = nv.SmartDedupeSet(results)
112 nino.borges 863 if len(results) > 1:
113     print(f"WARNING: more than one unique email address found in this value: {results}")
114 nino.borges 855 for result in results:
115     ## Try to find a match in the MAL by email. There shouldnt rows with duplicative email addresses.
116     ## TODO:DONE: Update search_by_email to search both workemail and alt email.
117    
118     personMatch = nv.malPeopleList.search_by_email(result.upper())
119     if personMatch:
120     ## Person match found in MAL. Now try to match a value in the formatted field by pulling various values from the MAL.
121     ## For each of these match attempts, try using the correct designation and incorrect designation (* vs no *) and note that.
122     allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
123     matchFlag = False
124 nino.borges 861 if allPossibleVariationsList:
125     for variationPair in allPossibleVariationsList:
126     if personMatch.is_attorney == 'YES':
127 nino.borges 860 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
128     ## This variation was found in the list of formatted values, which is fine, so just remove it.
129     if matchFlag:
130     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
131 nino.borges 861
132 nino.borges 860 formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
133     matchFlag = True
134 nino.borges 861
135    
136 nino.borges 860 elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
137     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
138     if matchFlag:
139     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
140     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
141     matchFlag = True
142     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
143     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
144    
145 nino.borges 861
146     elif personMatch.is_attorney == 'NO':
147 nino.borges 860 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
148     if matchFlag:
149     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
150     ## This variation was found in the list of formatted values, which is fine, so just remove it.
151     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
152     matchFlag = True
153     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
154     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
155     if matchFlag:
156     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
157     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
158     matchFlag = True
159     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
160     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
161 nino.borges 861 else:
162     ## This means they are a split role, so additional work will need to be done with the dates.
163     ## First, determin if this document date is between the dates where this person was an attorney
164     wasAttorneyAtThatTime = False
165     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
166     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
167     #print(f"\ndocumentDateValue is {documentDateValue}")
168     personWasAttorneyDates = personMatch.dates_as_counsel
169     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
170     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
171     if wasAttorneyStartDate.count("/") < 2:
172     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
173     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
174    
175     if wasAttorneyEndDate == "CURRENT":
176     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
177     elif wasAttorneyEndDate == "PRESENT":
178     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
179     if wasAttorneyEndDate.count("/") < 2:
180     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
181     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
182     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
183 nino.borges 860
184 nino.borges 861 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
185     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
186     wasAttorneyAtThatTime = True
187    
188     ## if wasAttorneyAtThatTime:
189     ## print("Person WAS attorney at this doc date.")
190     ## else:
191     ## print("Person WAS NOT attorney at this doc date.")
192    
193     ## Person's role at the time of the document has been determined, so now do the same checks as above.
194     if wasAttorneyAtThatTime:
195     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
196     ## This variation was found in the list of formatted values, which is fine, so just remove it.
197     if matchFlag:
198     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
199     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
200     matchFlag = True
201     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
202     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
203     if matchFlag:
204     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
205     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
206     matchFlag = True
207     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
208     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
209    
210     else:
211     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
212     if matchFlag:
213     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
214     ## This variation was found in the list of formatted values, which is fine, so just remove it.
215     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
216     matchFlag = True
217     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
218     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
219     if matchFlag:
220     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
221     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
222     matchFlag = True
223     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
224     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
225    
226     ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY found in meta but MISSING FROM FORMATTED
227     if matchFlag:
228     pass
229     else:
230 nino.borges 860 if personMatch.is_attorney == 'YES':
231 nino.borges 861 AddToIssuesList(docID,f"{val} in Metadata To Field and did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")
232 nino.borges 855
233     else:
234 nino.borges 863 ## Person match, using email, not found in MAL.
235     ## Try extracting a name from this metadata value and try matching the MAL using that.
236 nino.borges 855 val = val.upper()
237 nino.borges 863 origVal = val
238 nino.borges 855 ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
239     if "(LEGAL)" in val:
240     ## Attempt to only remove the email parenthetical, including the now empty paren.
241     val = val.replace(result.upper(),"")
242     val = val.replace("()",'')
243     #val = val.replace(")","")
244     else:
245     ## Remove all parenthicals, including any character in that paren, from value.
246     val = re.sub(r"\([^)]*\)","",val)
247 nino.borges 860
248 nino.borges 855 val = val.strip()
249     ## with the email address and the paren stripped out of the val, only move forward if anything still exists.
250     if val:
251     ## if there is a comma, parse to last name, first name
252     if "," in val:
253     lastName, firstName = val.split(",")
254     lastName = lastName.strip()
255     firstName = firstName.strip()
256     elif " " in val:
257     ## For now, try just splitting by the first space and take everything after as the first name.
258     firstName, lastName = val.split(" ",1)
259     ## With the name now parse, try searching for all values that match on the last name.
260 nino.borges 864
261 nino.borges 855 personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
262     if personMatchList:
263     possiblePeopleMatchesMatrix = {}
264     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
265     for personMatch in personMatchList:
266     if personMatch.first_name == firstName:
267     ## This is a personMatch that matches the first and last name
268     possiblePeopleMatchesMatrix[personMatch._id] = 1
269     if possiblePeopleMatchesMatrix.keys():
270     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
271     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
272 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
273     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
274    
275 nino.borges 855 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
276     matchFlag = False
277 nino.borges 861 if allPossibleVariationsList:
278     for variationPair in allPossibleVariationsList:
279     if personMatch.is_attorney == 'YES':
280     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
281     ## This variation was found in the list of formatted values, which is fine, so just remove it.
282     if matchFlag:
283     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
284     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
285     matchFlag = True
286     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
287     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
288     if matchFlag:
289     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
290     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
291     matchFlag = True
292     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
293     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
294     else:
295     ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
296 nino.borges 863 #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
297     AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
298 nino.borges 861
299 nino.borges 862 elif personMatch.is_attorney == 'NO':
300 nino.borges 861 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
301     if matchFlag:
302     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
303     ## This variation was found in the list of formatted values, which is fine, so just remove it.
304     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
305     matchFlag = True
306     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
307     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
308     if matchFlag:
309     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
310     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
311     matchFlag = True
312     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
313     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
314 nino.borges 862 else:
315     ## This means they are a split role, so additional work will need to be done with the dates.
316     ## First, determin if this document date is between the dates where this person was an attorney
317     wasAttorneyAtThatTime = False
318     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
319     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
320     #print(f"\ndocumentDateValue is {documentDateValue}")
321     personWasAttorneyDates = personMatch.dates_as_counsel
322     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
323     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
324     if wasAttorneyStartDate.count("/") < 2:
325     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
326     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
327    
328     if wasAttorneyEndDate == "CURRENT":
329     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
330     elif wasAttorneyEndDate == "PRESENT":
331     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
332     if wasAttorneyEndDate.count("/") < 2:
333     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
334     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
335     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
336    
337     #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
338     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
339     wasAttorneyAtThatTime = True
340    
341     ## if wasAttorneyAtThatTime:
342     ## print("Person WAS attorney at this doc date.")
343     ## else:
344     ## print("Person WAS NOT attorney at this doc date.")
345    
346     ## Person's role at the time of the document has been determined, so now do the same checks as above.
347     if wasAttorneyAtThatTime:
348     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
349     ## This variation was found in the list of formatted values, which is fine, so just remove it.
350     if matchFlag:
351     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
352     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
353     matchFlag = True
354     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
355     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
356     if matchFlag:
357     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
358     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
359     matchFlag = True
360     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
361     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
362    
363     else:
364     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
365     if matchFlag:
366     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
367     ## This variation was found in the list of formatted values, which is fine, so just remove it.
368     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
369     matchFlag = True
370     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
371     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
372     if matchFlag:
373     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
374     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
375     matchFlag = True
376     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
377     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
378 nino.borges 863 else:
379     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
380     ## TODO: Add support here for more than one first name last name match in MAL.
381 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
382     ## Dont try to compare these against the formatted because that could lead to eliminating possitive or false positives.
383 nino.borges 855
384     else:
385     ## TODO: Need to ask Eli if I dont a match by checking first and last name for a match if it's needed to flag these.
386 nino.borges 857 #AddToIssuesList(docID,f"first name: {firstName} - last name: {lastName} is an email in metadata that I couldnt match in MAL")
387 nino.borges 855 pass
388 nino.borges 857
389 nino.borges 855 else:
390     ## No email address could be extracted from this val. Try extracting a name from this metadata value and try matching the MAL using that.
391 nino.borges 863 #AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
392     val = val.upper()
393     origVal = val
394     ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
395     if "(LEGAL)" in val:
396     pass
397     else:
398     ## Remove all parenthicals, including any character in that paren, from value.
399     val = re.sub(r"\([^)]*\)","",val)
400    
401     val = val.strip()
402     ## with the paren information stripped out of the val, only move forward if anything still exists.
403     if val:
404     ## if there is a comma, parse to last name, first name
405     if "," in val:
406     lastName, firstName = val.split(",")
407     lastName = lastName.strip()
408     firstName = firstName.strip()
409     elif " " in val:
410     ## For now, try just splitting by the first space and take everything after as the first name.
411     firstName, lastName = val.split(" ",1)
412     ## With the name now parse, try searching for all values that match on the last name.
413    
414     personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
415     if personMatchList:
416     possiblePeopleMatchesMatrix = {}
417     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
418     for personMatch in personMatchList:
419     if personMatch.first_name == firstName:
420     ## This is a personMatch that matches the first and last name
421     possiblePeopleMatchesMatrix[personMatch._id] = 1
422     if possiblePeopleMatchesMatrix.keys():
423     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
424     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
425 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
426     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
427    
428 nino.borges 863 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
429     matchFlag = False
430     if allPossibleVariationsList:
431     for variationPair in allPossibleVariationsList:
432     if personMatch.is_attorney == 'YES':
433     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
434     ## This variation was found in the list of formatted values, which is fine, so just remove it.
435     if matchFlag:
436     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
437     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
438     matchFlag = True
439     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
440     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
441     if matchFlag:
442     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
443     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
444     matchFlag = True
445     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
446     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
447     else:
448     ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
449     #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
450     AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
451 nino.borges 857
452 nino.borges 863 elif personMatch.is_attorney == 'NO':
453     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
454     if matchFlag:
455     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
456     ## This variation was found in the list of formatted values, which is fine, so just remove it.
457     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
458     matchFlag = True
459     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
460     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
461     if matchFlag:
462     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
463     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
464     matchFlag = True
465     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
466     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
467     else:
468     ## This means they are a split role, so additional work will need to be done with the dates.
469     ## First, determin if this document date is between the dates where this person was an attorney
470     wasAttorneyAtThatTime = False
471     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
472     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
473     #print(f"\ndocumentDateValue is {documentDateValue}")
474     personWasAttorneyDates = personMatch.dates_as_counsel
475     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
476     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
477     if wasAttorneyStartDate.count("/") < 2:
478     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
479     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
480    
481     if wasAttorneyEndDate == "CURRENT":
482     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
483     elif wasAttorneyEndDate == "PRESENT":
484     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
485     if wasAttorneyEndDate.count("/") < 2:
486     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
487     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
488     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
489    
490     #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
491     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
492     wasAttorneyAtThatTime = True
493    
494     ## if wasAttorneyAtThatTime:
495     ## print("Person WAS attorney at this doc date.")
496     ## else:
497     ## print("Person WAS NOT attorney at this doc date.")
498    
499     ## Person's role at the time of the document has been determined, so now do the same checks as above.
500     if wasAttorneyAtThatTime:
501     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
502     ## This variation was found in the list of formatted values, which is fine, so just remove it.
503     if matchFlag:
504     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
505     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
506     matchFlag = True
507     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
508     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
509     if matchFlag:
510     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
511     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
512     matchFlag = True
513     ## TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
514     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
515    
516     else:
517     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
518     if matchFlag:
519     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
520     ## This variation was found in the list of formatted values, which is fine, so just remove it.
521     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
522     matchFlag = True
523     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
524     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
525     if matchFlag:
526     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
527     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
528     matchFlag = True
529     ## TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
530     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
531     else:
532     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
533     ## TODO: Add support here for more than one first name last name match in MAL.
534 nino.borges 864 ## ELI said to just test whether the group has a mix of attorney and non-attorney and, if so, we should report that as needing a manual check.
535     ## Don't try to compare these against the formatted values because that could lead to eliminating positives or false positives.
536 nino.borges 863
537    
538    
539 nino.borges 855
540     ## Since you iterated over the metadata values but didn't iterate over the formatted values, check for any remaining formatted values that exist in the list.
541     if formattedFieldValues:
542     for val in formattedFieldValues:
543     ## TODO: Confirm with Eli but we should only report these remaining values if they have a *
544     ## From Eliu: the Highest risk is the * values because these are the potential overdesignations so yes but in a perfect world we would check both.
545     if "*" in val:
546 nino.borges 856 ## TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
547 nino.borges 861 AddToIssuesList(docID,f"{val} in Formatted To Field is an attorney but couldnt be matched to any value in metadata field.")
548 nino.borges 855
549    
550 nino.borges 857
551 nino.borges 855 ## Now just unpack and write the issues, per DocID, to the output file separated by semicolon.
552     outputFile = open(outputFileName,'w')
553     for docID in list(issuesMatrix.keys()):
554     outputFile.write(f"{docID}|{';'.join(issuesMatrix[docID])}\n")
555     outputFile.close()