ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 866
Committed: Sat Dec 14 01:55:21 2024 UTC (15 months, 1 week ago) by nino.borges
Content type: text/x-python
File size: 52329 byte(s)
Log Message:
Fixed a bug where it was writing more than one non match attempt to the issues.  This was because of that extra else.  This was incorrect and the else belonged at the very end of cycling through the different variations, not during.  Putting this after all the variations have run but still no match fixed the issue.

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9    This program is similar to Amazon_PerformNamesNormQC, but it performs a deeper level of names-normalization QC. I may eventually replace Amazon_PerformNamesNormQC with this file, but for now I'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
19 nino.borges 866 version = '0.11.0'
20 nino.borges 855
21     issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Build every (full name, email domain) pair that may appear in the formatted field for a person.

    Args:
        personMatch: A person record.  The attributes read here are work_email_address,
            alt_work_email_address, full_name_overide, full_name_preferred, first_name
            and last_name; any of them may be empty.

    Returns:
        A deduplicated list of (fullname, parenthetical) tuples in first-seen order.
        The list is empty when the person has no email address to take a domain from.
    """
    ## Collect the candidate parenthetical domains from the work and the alternate work email.
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if not emailAddress:
            continue
        allDomainsList.append(emailAddress.split('@')[-1])
        ## After talking to Eli, we decided that all of these amazon.com.uk or amazon.it are related domains,
        ## so we should feel confident that we can add amazon.com to the list of possible domains.
        ## NOTE(review): this test is case sensitive; it presumably relies on the MAL emails being uppercase - confirm.
        if "@AMAZON." in emailAddress:
            allDomainsList.append("AMAZON.COM")
    ## dict.fromkeys deduplicates while keeping the first-seen order.
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Collect the candidate full names, in priority order: override, preferred, then first + last.
    candidateNames = []

    if personMatch.full_name_overide:
        candidateNames.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')
        fullPreferredName = fullPreferredName.replace(',,', ',')

        ## Only a single "LAST, FIRST" comma can be parsed.  Anything else (no comma, or more than
        ## one comma) is reported instead of raising ValueError on the unpack and killing the run.
        nameParts = fullPreferredName.split(',')
        if len(nameParts) == 2:
            preferedLastName = nameParts[0].strip()
            ## Keep only the first token of the first name (drops middle names / initials).
            preferedFirstName = nameParts[1].strip().split(" ")[0]
            candidateNames.append(f"{preferedFirstName} {preferedLastName}")
        else:
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            candidateNames.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            candidateNames.append(f"{personMatch.last_name}")

    ## Pair every name with every domain, then return a deduplicated list by using dict to deduplicate.
    allPossibleVariationsList = [
        (fullName, domain)
        for fullName in candidateNames
        for domain in allDomainsList
    ]
    return list(dict.fromkeys(allPossibleVariationsList))
78    
79    
def AddToIssuesList(docID, issueMessage):
    """Add a single issue to the module-level issues matrix.

    Args:
        docID: Key of the document the issue belongs to.
        issueMessage: The issue text to record for that document.

    The issues matrix maps each docID to the list of its issue messages, in the
    order they were added.  The first issue for a docID creates its list.
    """
    ## setdefault does the lookup-or-create in one hashed step, instead of copying
    ## every key into a list and scanning it on each call.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
86 nino.borges 855
87    
88    
89     if __name__ == '__main__':
90     cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
91     masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
92     fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
93     outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"
94    
95    
96     nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
97    
98     qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
99    
100     #issuesMatrix = {}
101    
102     print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
103     print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")
104    
105     workList = qcP.metadataValuesDict.keys()
106     for docID in workList:
107 nino.borges 866 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
108     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
109     #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
110     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
111     metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['docAuthor']
112     formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['docAuthor']
113 nino.borges 855 ## remember to convert all values in formattedFieldValues to uppercase (perhaps eventually do some of the formatted cleaning that eli mentioned.
114     formattedFieldValues = [xVal.upper() for xVal in formattedFieldValues]
115     ## This will change once you start itterating acroll all of the field values names
116     currentMetadataValues = metadataFieldValues
117     for val in currentMetadataValues:
118     ## First try to locate an email address in this val and if found, try to find that in the MAL.
119     results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
120     if results:
121     ## Use some smart deduplication to remove duplicates.
122     results = nv.SmartDedupeSet(results)
123 nino.borges 863 if len(results) > 1:
124     print(f"WARNING: more than one unique email address found in this value: {results}")
125 nino.borges 855 for result in results:
126     ## Try to find a match in the MAL by email. There shouldnt rows with duplicative email addresses.
127     ## TODO:DONE: Update search_by_email to search both workemail and alt email.
128    
129     personMatch = nv.malPeopleList.search_by_email(result.upper())
130     if personMatch:
131     ## Person match found in MAL. Now try to match a value in the formatted field by pulling various values from the MAL.
132     ## For each of these match attempts, try using the correct designation and incorrect designation (* vs no *) and note that.
133     allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
134     matchFlag = False
135 nino.borges 861 if allPossibleVariationsList:
136     for variationPair in allPossibleVariationsList:
137     if personMatch.is_attorney == 'YES':
138 nino.borges 860 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
139     ## This variation was found in the list of formatted values, which is fine, so just remove it.
140     if matchFlag:
141     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
142 nino.borges 861
143 nino.borges 860 formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
144     matchFlag = True
145 nino.borges 861
146    
147 nino.borges 860 elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
148     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
149     if matchFlag:
150     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
151     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
152     matchFlag = True
153     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
154     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
155    
156 nino.borges 861
157     elif personMatch.is_attorney == 'NO':
158 nino.borges 860 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
159     if matchFlag:
160     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
161     ## This variation was found in the list of formatted values, which is fine, so just remove it.
162     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
163     matchFlag = True
164     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
165     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
166     if matchFlag:
167     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
168     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
169     matchFlag = True
170     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
171     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
172 nino.borges 861 else:
173     ## This means they are a split role, so additional work will need to be done with the dates.
174     ## First, determin if this document date is between the dates where this person was an attorney
175     wasAttorneyAtThatTime = False
176     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
177     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
178     #print(f"\ndocumentDateValue is {documentDateValue}")
179     personWasAttorneyDates = personMatch.dates_as_counsel
180     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
181     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
182     if wasAttorneyStartDate.count("/") < 2:
183     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
184     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
185    
186     if wasAttorneyEndDate == "CURRENT":
187     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
188     elif wasAttorneyEndDate == "PRESENT":
189     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
190     if wasAttorneyEndDate.count("/") < 2:
191     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
192     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
193     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
194 nino.borges 860
195 nino.borges 861 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
196     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
197     wasAttorneyAtThatTime = True
198    
199     ## if wasAttorneyAtThatTime:
200     ## print("Person WAS attorney at this doc date.")
201     ## else:
202     ## print("Person WAS NOT attorney at this doc date.")
203    
204     ## Person's role at the time of the document has been determined, so now do the same checks as above.
205     if wasAttorneyAtThatTime:
206     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
207     ## This variation was found in the list of formatted values, which is fine, so just remove it.
208     if matchFlag:
209     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
210     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
211     matchFlag = True
212     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
213     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
214     if matchFlag:
215     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
216     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
217     matchFlag = True
218     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
219     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
220    
221     else:
222     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
223     if matchFlag:
224     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
225     ## This variation was found in the list of formatted values, which is fine, so just remove it.
226     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
227     matchFlag = True
228     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
229     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
230     if matchFlag:
231     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
232     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
233     matchFlag = True
234     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
235     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
236    
237     ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY found in meta but MISSING FROM FORMATTED
238     if matchFlag:
239     pass
240     else:
241 nino.borges 860 if personMatch.is_attorney == 'YES':
242 nino.borges 866 AddToIssuesList(docID,f"{val} in Metadata To Field did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")
243 nino.borges 855
244     else:
245 nino.borges 863 ## Person match, using email, not found in MAL.
246     ## Try extracting a name from this metadata value and try matching the MAL using that.
247 nino.borges 855 val = val.upper()
248 nino.borges 863 origVal = val
249 nino.borges 866 if docID == "H95472-0163-004394":
250     print(origVal,val)
251 nino.borges 855 ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
252     if "(LEGAL)" in val:
253     ## Attempt to only remove the email parenthetical, including the now empty paren.
254     val = val.replace(result.upper(),"")
255     val = val.replace("()",'')
256     #val = val.replace(")","")
257     else:
258     ## Remove all parenthicals, including any character in that paren, from value.
259     val = re.sub(r"\([^)]*\)","",val)
260 nino.borges 860
261 nino.borges 855 val = val.strip()
262     ## with the email address and the paren stripped out of the val, only move forward if anything still exists.
263     if val:
264     ## if there is a comma, parse to last name, first name
265     if "," in val:
266     lastName, firstName = val.split(",")
267     lastName = lastName.strip()
268     firstName = firstName.strip()
269     elif " " in val:
270     ## For now, try just splitting by the first space and take everything after as the first name.
271     firstName, lastName = val.split(" ",1)
272     ## With the name now parse, try searching for all values that match on the last name.
273 nino.borges 864
274 nino.borges 855 personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
275     if personMatchList:
276     possiblePeopleMatchesMatrix = {}
277     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
278     for personMatch in personMatchList:
279     if personMatch.first_name == firstName:
280     ## This is a personMatch that matches the first and last name
281     possiblePeopleMatchesMatrix[personMatch._id] = 1
282     if possiblePeopleMatchesMatrix.keys():
283     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
284     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
285 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
286     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
287    
288 nino.borges 855 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
289 nino.borges 865
290 nino.borges 855 matchFlag = False
291 nino.borges 861 if allPossibleVariationsList:
292     for variationPair in allPossibleVariationsList:
293     if personMatch.is_attorney == 'YES':
294 nino.borges 865 if personMatch.last_name == "MANEK":
295     print(variationPair)
296 nino.borges 861 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
297     ## This variation was found in the list of formatted values, which is fine, so just remove it.
298     if matchFlag:
299     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
300     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
301     matchFlag = True
302     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
303     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
304     if matchFlag:
305     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
306     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
307     matchFlag = True
308     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
309     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
310 nino.borges 866 ## else:
311     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
312     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
313     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
314 nino.borges 861
315 nino.borges 862 elif personMatch.is_attorney == 'NO':
316 nino.borges 861 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
317     if matchFlag:
318     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
319     ## This variation was found in the list of formatted values, which is fine, so just remove it.
320     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
321     matchFlag = True
322     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
323     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
324     if matchFlag:
325     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
326     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
327     matchFlag = True
328     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
329     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
330 nino.borges 862 else:
331     ## This means they are a split role, so additional work will need to be done with the dates.
332     ## First, determin if this document date is between the dates where this person was an attorney
333     wasAttorneyAtThatTime = False
334     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
335     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
336     #print(f"\ndocumentDateValue is {documentDateValue}")
337     personWasAttorneyDates = personMatch.dates_as_counsel
338     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
339     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
340     if wasAttorneyStartDate.count("/") < 2:
341     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
342     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
343    
344     if wasAttorneyEndDate == "CURRENT":
345     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
346     elif wasAttorneyEndDate == "PRESENT":
347     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
348     if wasAttorneyEndDate.count("/") < 2:
349     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
350     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
351     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
352    
353     #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
354     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
355     wasAttorneyAtThatTime = True
356    
357     ## if wasAttorneyAtThatTime:
358     ## print("Person WAS attorney at this doc date.")
359     ## else:
360     ## print("Person WAS NOT attorney at this doc date.")
361    
362     ## Person's role at the time of the document has been determined, so now do the same checks as above.
363     if wasAttorneyAtThatTime:
364     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
365     ## This variation was found in the list of formatted values, which is fine, so just remove it.
366     if matchFlag:
367     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
368     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
369     matchFlag = True
370     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
371     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
372     if matchFlag:
373     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
374     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
375     matchFlag = True
376     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
377     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
378    
379     else:
380     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
381     if matchFlag:
382     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
383     ## This variation was found in the list of formatted values, which is fine, so just remove it.
384     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
385     matchFlag = True
386     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
387     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
388     if matchFlag:
389     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
390     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
391     matchFlag = True
392     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
393     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
394 nino.borges 866
395     ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
396     if matchFlag:
397     pass
398     else:
399     if personMatch.is_attorney == 'YES':
400     AddToIssuesList(docID,f"{origVal} in Metadata To Field and did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
401 nino.borges 863 else:
402     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
403     ## TODO: Add support here for more than one first name last name match in MAL.
404 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
405     ## Dont try to compare these against the formatted because that could lead to eliminating possitive or false positives.
406 nino.borges 855
407     else:
408     ## TODO: Need to ask Eli if I dont a match by checking first and last name for a match if it's needed to flag these.
409 nino.borges 857 #AddToIssuesList(docID,f"first name: {firstName} - last name: {lastName} is an email in metadata that I couldnt match in MAL")
410 nino.borges 855 pass
411 nino.borges 857
412 nino.borges 855 else:
413     ## No email address could be extracted from this val. Try extracting a name from this metadata value and try matching the MAL using that.
414 nino.borges 863 #AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
415     val = val.upper()
416     origVal = val
417     ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
418     if "(LEGAL)" in val:
419     pass
420     else:
421     ## Remove all parenthicals, including any character in that paren, from value.
422     val = re.sub(r"\([^)]*\)","",val)
423    
424     val = val.strip()
425     ## with the paren information stripped out of the val, only move forward if anything still exists.
426     if val:
427     ## if there is a comma, parse to last name, first name
428     if "," in val:
429     lastName, firstName = val.split(",")
430     lastName = lastName.strip()
431     firstName = firstName.strip()
432     elif " " in val:
433     ## For now, try just splitting by the first space and take everything after as the first name.
434     firstName, lastName = val.split(" ",1)
435     ## With the name now parse, try searching for all values that match on the last name.
436    
437     personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
438     if personMatchList:
439     possiblePeopleMatchesMatrix = {}
440     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
441     for personMatch in personMatchList:
442     if personMatch.first_name == firstName:
443     ## This is a personMatch that matches the first and last name
444     possiblePeopleMatchesMatrix[personMatch._id] = 1
445     if possiblePeopleMatchesMatrix.keys():
446     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
447     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
448 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
449     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
450    
451 nino.borges 863 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
452     matchFlag = False
453     if allPossibleVariationsList:
454     for variationPair in allPossibleVariationsList:
455     if personMatch.is_attorney == 'YES':
456     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
457     ## This variation was found in the list of formatted values, which is fine, so just remove it.
458     if matchFlag:
459     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
460     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
461     matchFlag = True
462     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
463     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
464     if matchFlag:
465     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
466     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
467     matchFlag = True
468     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
469     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
470 nino.borges 866 ## else:
471     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
472     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
473     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
474 nino.borges 857
475 nino.borges 863 elif personMatch.is_attorney == 'NO':
476     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
477     if matchFlag:
478     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
479     ## This variation was found in the list of formatted values, which is fine, so just remove it.
480     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
481     matchFlag = True
482     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
483     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
484     if matchFlag:
485     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
486     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
487     matchFlag = True
488     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
489     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
490     else:
491     ## This means they are a split role, so additional work will need to be done with the dates.
492     ## First, determine if this document date is between the dates where this person was an attorney
493     wasAttorneyAtThatTime = False
494     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
495     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
496     #print(f"\ndocumentDateValue is {documentDateValue}")
497     personWasAttorneyDates = personMatch.dates_as_counsel
498     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
499     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
500     if wasAttorneyStartDate.count("/") < 2:
501     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
502     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
503    
504     if wasAttorneyEndDate == "CURRENT":
505     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
506     elif wasAttorneyEndDate == "PRESENT":
507     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
508     if wasAttorneyEndDate.count("/") < 2:
509     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
510     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
511     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
512    
513     #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
514     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
515     wasAttorneyAtThatTime = True
516    
517     ## if wasAttorneyAtThatTime:
518     ## print("Person WAS attorney at this doc date.")
519     ## else:
520     ## print("Person WAS NOT attorney at this doc date.")
521    
522     ## Person's role at the time of the document has been determined, so now do the same checks as above.
523     if wasAttorneyAtThatTime:
524     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
525     ## This variation was found in the list of formatted values, which is fine, so just remove it.
526     if matchFlag:
527     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
528     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
529     matchFlag = True
530     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
531     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
532     if matchFlag:
533     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
534     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
535     matchFlag = True
536     ## TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
537     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
538    
539     else:
540     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
541     if matchFlag:
542     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
543     ## This variation was found in the list of formatted values, which is fine, so just remove it.
544     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
545     matchFlag = True
546     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
547     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
548     if matchFlag:
549     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
550     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
551     matchFlag = True
552     ## TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
553     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
554 nino.borges 866
555     ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
556     if matchFlag:
557     pass
558     else:
559     if personMatch.is_attorney == 'YES':
560     AddToIssuesList(docID,f"{origVal} in Metadata To Field did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
561 nino.borges 863 else:
562     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
563     ## TODO: Add support here for more than one first name last name match in MAL.
564 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
565     ## Don't try to compare these against the formatted because that could lead to eliminating positive or false positives.
566 nino.borges 863
567    
568    
569 nino.borges 855
570     ## Since you iterated over the metadata values but didn't iterate over the formatted values, check for any remaining formatted values that exist in the list
571     if formattedFieldValues:
572     for val in formattedFieldValues:
573     ## TODO: Confirm with Eli but we should only report these remaining values if they have a *
574     ## From Eliu: the Highest risk is the * values because these are the potential overdesignations so yes but in a perfect world we would check both.
575     if "*" in val:
576 nino.borges 856 ## TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
577 nino.borges 861 AddToIssuesList(docID,f"{val} in Formatted To Field is an attorney but couldnt be matched to any value in metadata field.")
578 nino.borges 855
579    
580 nino.borges 857
581 nino.borges 855 ## Now just unpack and write the issues, per DocID, to the output file separated by semicolon.
582     outputFile = open(outputFileName,'w')
583     for docID in list(issuesMatrix.keys()):
584     outputFile.write(f"{docID}|{';'.join(issuesMatrix[docID])}\n")
585     outputFile.close()