ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 867
Committed: Sat Dec 14 02:01:57 2024 UTC (15 months, 1 week ago) by nino.borges
Content type: text/x-python
File size: 52602 byte(s)
Log Message:
In the author field, when falling back to a name search because the email doesn't match, the email address was surviving instead of being removed, because in this field the email address is not always inside parentheses. Added an extra line to make sure it's removed before trying to parse the name.

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC but it will perform a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file but for now i'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
## Version string for this script.
version = '0.12.0'

## Module-level store of QC findings: maps a docID to the list of issue messages
## recorded for that document. Entries are added by AddToIssuesList.
issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Takes a personMatch, which is the results of a person match, and attempts to make all
    possible name match variations that may exist in the formatted field.

    The attributes read from personMatch are work_email_address, alt_work_email_address,
    full_name_overide, full_name_preferred, first_name and last_name.

    Returns a deduplicated list of tuple pairs (fullname, parenthetical), in first-seen order,
    where the parenthetical is an email domain taken from the person's email addresses.
    When the person has no email address there is no domain to pair with, so the list is empty.
    """
    ## Gather the domain of every email address on the record.
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if not emailAddress:
            continue
        allDomainsList.append(emailAddress.split('@')[-1])
        ## After talking to Eli, we decided that all of these amazon.com.uk or amazon.it are related domains,
        ## so we should feel confident that we can add amazon.com to the list of possible domains.
        if "@AMAZON." in emailAddress:
            allDomainsList.append("AMAZON.COM")
    ## dict.fromkeys deduplicates while keeping the original order.
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Gather the candidate full names, in priority order: override, preferred, then first + last.
    allNamesList = []

    if personMatch.full_name_overide:
        allNamesList.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')
        fullPreferredName = fullPreferredName.replace(',,', ',')

        nameParts = fullPreferredName.split(',')
        if len(nameParts) == 2:
            preferedLastName = nameParts[0].strip()
            ## Only the first token of the first-name portion is used (drops middle names/initials).
            preferedFirstName = nameParts[1].strip().split(" ")[0]
            allNamesList.append(f"{preferedFirstName} {preferedLastName}")
        else:
            ## Either no comma at all or more than one comma remains. Neither can be split into
            ## exactly "last, first", so report it and move on rather than raising on the unpack.
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            allNamesList.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            allNamesList.append(f"{personMatch.last_name}")

    ## Pair every candidate name with every domain, then return a deduplicated list by using dict to deduplicate.
    allPossibleVariationsList = [(fullName, domain) for fullName in allNamesList for domain in allDomainsList]
    return list(dict.fromkeys(allPossibleVariationsList))
78    
79    
def AddToIssuesList(docID,issueMessage):
    """This function will add a single issue to the issues matrix.

    docID is the key used in the module-level issuesMatrix dictionary and issueMessage is
    appended to the list of messages stored under that key. The list is created the first
    time a docID is seen. Nothing is returned.
    """
    ## setdefault does the lookup-or-create in a single dictionary operation, instead of
    ## building a list of every key on each call just to test membership.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
86 nino.borges 855
87    
88    
89     if __name__ == '__main__':
90     cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
91     masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
92     fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
93     outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"
94    
95    
96     nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
97    
98     qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
99    
100     #issuesMatrix = {}
101    
102     print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
103     print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")
104    
105     workList = qcP.metadataValuesDict.keys()
106     for docID in workList:
107 nino.borges 866 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
108     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
109     #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
110     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
111 nino.borges 867 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['ccValues']
112     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['ccValues']
113     metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['bccValues']
114     formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['bccValues']
115     #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['docAuthor']
116     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['docAuthor']
117 nino.borges 855 ## remember to convert all values in formattedFieldValues to uppercase (perhaps eventually do some of the formatted cleaning that eli mentioned.
118     formattedFieldValues = [xVal.upper() for xVal in formattedFieldValues]
119     ## This will change once you start itterating acroll all of the field values names
120     currentMetadataValues = metadataFieldValues
121     for val in currentMetadataValues:
122     ## First try to locate an email address in this val and if found, try to find that in the MAL.
123     results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
124     if results:
125     ## Use some smart deduplication to remove duplicates.
126     results = nv.SmartDedupeSet(results)
127 nino.borges 863 if len(results) > 1:
128     print(f"WARNING: more than one unique email address found in this value: {results}")
129 nino.borges 855 for result in results:
130     ## Try to find a match in the MAL by email. There shouldnt rows with duplicative email addresses.
131     ## TODO:DONE: Update search_by_email to search both workemail and alt email.
132    
133     personMatch = nv.malPeopleList.search_by_email(result.upper())
134     if personMatch:
135     ## Person match found in MAL. Now try to match a value in the formatted field by pulling various values from the MAL.
136     ## For each of these match attempts, try using the correct designation and incorrect designation (* vs no *) and note that.
137     allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
138     matchFlag = False
139 nino.borges 861 if allPossibleVariationsList:
140     for variationPair in allPossibleVariationsList:
141     if personMatch.is_attorney == 'YES':
142 nino.borges 860 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
143     ## This variation was found in the list of formatted values, which is fine, so just remove it.
144     if matchFlag:
145     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
146 nino.borges 861
147 nino.borges 860 formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
148     matchFlag = True
149 nino.borges 861
150    
151 nino.borges 860 elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
152     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
153     if matchFlag:
154     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
155     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
156     matchFlag = True
157     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
158     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
159    
160 nino.borges 861
161     elif personMatch.is_attorney == 'NO':
162 nino.borges 860 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
163     if matchFlag:
164     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
165     ## This variation was found in the list of formatted values, which is fine, so just remove it.
166     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
167     matchFlag = True
168     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
169     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
170     if matchFlag:
171     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
172     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
173     matchFlag = True
174     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
175     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
176 nino.borges 861 else:
177     ## This means they are a split role, so additional work will need to be done with the dates.
178     ## First, determin if this document date is between the dates where this person was an attorney
179     wasAttorneyAtThatTime = False
180     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
181     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
182     #print(f"\ndocumentDateValue is {documentDateValue}")
183     personWasAttorneyDates = personMatch.dates_as_counsel
184     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
185     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
186     if wasAttorneyStartDate.count("/") < 2:
187     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
188     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
189    
190     if wasAttorneyEndDate == "CURRENT":
191     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
192     elif wasAttorneyEndDate == "PRESENT":
193     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
194     if wasAttorneyEndDate.count("/") < 2:
195     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
196     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
197     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
198 nino.borges 860
199 nino.borges 861 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
200     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
201     wasAttorneyAtThatTime = True
202    
203     ## if wasAttorneyAtThatTime:
204     ## print("Person WAS attorney at this doc date.")
205     ## else:
206     ## print("Person WAS NOT attorney at this doc date.")
207    
208     ## Person's role at the time of the document has been determined, so now do the same checks as above.
209     if wasAttorneyAtThatTime:
210     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
211     ## This variation was found in the list of formatted values, which is fine, so just remove it.
212     if matchFlag:
213     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
214     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
215     matchFlag = True
216     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
217     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
218     if matchFlag:
219     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
220     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
221     matchFlag = True
222     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
223     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
224    
225     else:
226     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
227     if matchFlag:
228     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
229     ## This variation was found in the list of formatted values, which is fine, so just remove it.
230     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
231     matchFlag = True
232     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
233     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
234     if matchFlag:
235     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
236     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
237     matchFlag = True
238     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
239     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
240    
241     ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY found in meta but MISSING FROM FORMATTED
242     if matchFlag:
243     pass
244     else:
245 nino.borges 860 if personMatch.is_attorney == 'YES':
246 nino.borges 866 AddToIssuesList(docID,f"{val} in Metadata To Field did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")
247 nino.borges 855
248     else:
249 nino.borges 863 ## Person match, using email, not found in MAL.
250     ## Try extracting a name from this metadata value and try matching the MAL using that.
251 nino.borges 855 val = val.upper()
252 nino.borges 863 origVal = val
253 nino.borges 867
254 nino.borges 855 ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
255     if "(LEGAL)" in val:
256     ## Attempt to only remove the email parenthetical, including the now empty paren.
257     val = val.replace(result.upper(),"")
258     val = val.replace("()",'')
259     #val = val.replace(")","")
260     else:
261     ## Remove all parenthicals, including any character in that paren, from value.
262     val = re.sub(r"\([^)]*\)","",val)
263 nino.borges 867 val = val.replace(result.upper(),"")
264 nino.borges 855 val = val.strip()
265     ## with the email address and the paren stripped out of the val, only move forward if anything still exists.
266     if val:
267     ## if there is a comma, parse to last name, first name
268     if "," in val:
269     lastName, firstName = val.split(",")
270     lastName = lastName.strip()
271     firstName = firstName.strip()
272     elif " " in val:
273     ## For now, try just splitting by the first space and take everything after as the first name.
274     firstName, lastName = val.split(" ",1)
275     ## With the name now parse, try searching for all values that match on the last name.
276 nino.borges 864
277 nino.borges 855 personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
278     if personMatchList:
279     possiblePeopleMatchesMatrix = {}
280     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
281     for personMatch in personMatchList:
282     if personMatch.first_name == firstName:
283     ## This is a personMatch that matches the first and last name
284     possiblePeopleMatchesMatrix[personMatch._id] = 1
285     if possiblePeopleMatchesMatrix.keys():
286     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
287     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
288 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
289     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
290    
291 nino.borges 855 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
292 nino.borges 865
293 nino.borges 855 matchFlag = False
294 nino.borges 861 if allPossibleVariationsList:
295     for variationPair in allPossibleVariationsList:
296     if personMatch.is_attorney == 'YES':
297 nino.borges 865 if personMatch.last_name == "MANEK":
298     print(variationPair)
299 nino.borges 861 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
300     ## This variation was found in the list of formatted values, which is fine, so just remove it.
301     if matchFlag:
302     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
303     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
304     matchFlag = True
305     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
306     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
307     if matchFlag:
308     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
309     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
310     matchFlag = True
311     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
312     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
313 nino.borges 866 ## else:
314     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
315     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
316     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
317 nino.borges 861
318 nino.borges 862 elif personMatch.is_attorney == 'NO':
319 nino.borges 861 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
320     if matchFlag:
321     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
322     ## This variation was found in the list of formatted values, which is fine, so just remove it.
323     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
324     matchFlag = True
325     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
326     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
327     if matchFlag:
328     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
329     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
330     matchFlag = True
331     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
332     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
333 nino.borges 862 else:
334     ## This means they are a split role, so additional work will need to be done with the dates.
335     ## First, determin if this document date is between the dates where this person was an attorney
336     wasAttorneyAtThatTime = False
337     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
338     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
339     #print(f"\ndocumentDateValue is {documentDateValue}")
340     personWasAttorneyDates = personMatch.dates_as_counsel
341     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
342     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
343     if wasAttorneyStartDate.count("/") < 2:
344     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
345     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
346    
347     if wasAttorneyEndDate == "CURRENT":
348     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
349     elif wasAttorneyEndDate == "PRESENT":
350     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
351     if wasAttorneyEndDate.count("/") < 2:
352     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
353     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
354     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
355    
356     #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
357     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
358     wasAttorneyAtThatTime = True
359    
360     ## if wasAttorneyAtThatTime:
361     ## print("Person WAS attorney at this doc date.")
362     ## else:
363     ## print("Person WAS NOT attorney at this doc date.")
364    
365     ## Person's role at the time of the document has been determined, so now do the same checks as above.
366     if wasAttorneyAtThatTime:
367     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
368     ## This variation was found in the list of formatted values, which is fine, so just remove it.
369     if matchFlag:
370     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
371     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
372     matchFlag = True
373     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
374     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
375     if matchFlag:
376     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
377     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
378     matchFlag = True
379     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
380     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
381    
382     else:
383     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
384     if matchFlag:
385     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
386     ## This variation was found in the list of formatted values, which is fine, so just remove it.
387     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
388     matchFlag = True
389     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
390     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
391     if matchFlag:
392     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
393     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
394     matchFlag = True
395     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
396     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
397 nino.borges 866
398     ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
399     if matchFlag:
400     pass
401     else:
402     if personMatch.is_attorney == 'YES':
403     AddToIssuesList(docID,f"{origVal} in Metadata To Field and did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
404 nino.borges 863 else:
405     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
406     ## TODO: Add support here for more than one first name last name match in MAL.
407 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
408     ## Dont try to compare these against the formatted because that could lead to eliminating possitive or false positives.
409 nino.borges 855
410     else:
411     ## TODO: Need to ask Eli if I dont a match by checking first and last name for a match if it's needed to flag these.
412 nino.borges 857 #AddToIssuesList(docID,f"first name: {firstName} - last name: {lastName} is an email in metadata that I couldnt match in MAL")
413 nino.borges 855 pass
414 nino.borges 857
415 nino.borges 855 else:
416     ## No email address could be extracted from this val. Try extracting a name from this metadata value and try matching the MAL using that.
417 nino.borges 863 #AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
418     val = val.upper()
419     origVal = val
420     ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
421     if "(LEGAL)" in val:
422     pass
423     else:
424     ## Remove all parenthicals, including any character in that paren, from value.
425     val = re.sub(r"\([^)]*\)","",val)
426    
427     val = val.strip()
428     ## with the paren information stripped out of the val, only move forward if anything still exists.
429     if val:
430     ## if there is a comma, parse to last name, first name
431     if "," in val:
432     lastName, firstName = val.split(",")
433     lastName = lastName.strip()
434     firstName = firstName.strip()
435     elif " " in val:
436     ## For now, try just splitting by the first space and take everything after as the first name.
437     firstName, lastName = val.split(" ",1)
438     ## With the name now parse, try searching for all values that match on the last name.
439    
440     personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
441     if personMatchList:
442     possiblePeopleMatchesMatrix = {}
443     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
444     for personMatch in personMatchList:
445     if personMatch.first_name == firstName:
446     ## This is a personMatch that matches the first and last name
447     possiblePeopleMatchesMatrix[personMatch._id] = 1
448     if possiblePeopleMatchesMatrix.keys():
449     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
450     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
451 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
452     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
453    
454 nino.borges 863 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
455     matchFlag = False
456     if allPossibleVariationsList:
457     for variationPair in allPossibleVariationsList:
458     if personMatch.is_attorney == 'YES':
459     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
460     ## This variation was found in the list of formatted values, which is fine, so just remove it.
461     if matchFlag:
462     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
463     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
464     matchFlag = True
465     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
466     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
467     if matchFlag:
468     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
469     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
470     matchFlag = True
471     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
472     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
473 nino.borges 866 ## else:
474     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
475     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
476     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
477 nino.borges 857
478 nino.borges 863 elif personMatch.is_attorney == 'NO':
479     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
480     if matchFlag:
481     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
482     ## This variation was found in the list of formatted values, which is fine, so just remove it.
483     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
484     matchFlag = True
485     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
486     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
487     if matchFlag:
488     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
489     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
490     matchFlag = True
491     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
492     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
493     else:
494     ## This means they are a split role, so additional work will need to be done with the dates.
495     ## First, determine if this document date is between the dates where this person was an attorney
496     wasAttorneyAtThatTime = False
497     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
498     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
499     #print(f"\ndocumentDateValue is {documentDateValue}")
500     personWasAttorneyDates = personMatch.dates_as_counsel
501     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
502     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
503     if wasAttorneyStartDate.count("/") < 2:
504     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
505     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
506    
507     if wasAttorneyEndDate == "CURRENT":
508     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
509     elif wasAttorneyEndDate == "PRESENT":
510     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
511     if wasAttorneyEndDate.count("/") < 2:
512     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
513     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
514     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
515    
516     #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
517     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
518     wasAttorneyAtThatTime = True
519    
520     ## if wasAttorneyAtThatTime:
521     ## print("Person WAS attorney at this doc date.")
522     ## else:
523     ## print("Person WAS NOT attorney at this doc date.")
524    
525     ## Person's role at the time of the document has been determined, so now do the same checks as above.
526     if wasAttorneyAtThatTime:
527     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
528     ## This variation was found in the list of formatted values, which is fine, so just remove it.
529     if matchFlag:
530     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
531     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
532     matchFlag = True
533     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
534     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
535     if matchFlag:
536     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
537     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
538     matchFlag = True
539     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
540     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
541    
542     else:
543     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
544     if matchFlag:
545     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
546     ## This variation was found in the list of formatted values, which is fine, so just remove it.
547     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
548     matchFlag = True
549     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
550     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
551     if matchFlag:
552     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
553     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
554     matchFlag = True
555     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
556     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
557 nino.borges 866
558     ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
559     if matchFlag:
560     pass
561     else:
562     if personMatch.is_attorney == 'YES':
563     AddToIssuesList(docID,f"{origVal} in Metadata To Field did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
564 nino.borges 863 else:
565     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
566     ## TODO: Add support here for more than one first name last name match in MAL.
567 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
568     ## Don't try to compare these against the formatted because that could lead to eliminating positives or false positives.
569 nino.borges 863
570    
571    
572 nino.borges 855
573     ## Since you iterated over the metadata values but didn't iterate over the formatted values, check for any remaining formatted values that exist in the list
574     if formattedFieldValues:
575     for val in formattedFieldValues:
576     ## TODO: Confirm with Eli but we should only report these remaining values if they have a *
577     ## From Eliu: the Highest risk is the * values because these are the potential overdesignations so yes but in a perfect world we would check both.
578     if "*" in val:
579 nino.borges 856 ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
580 nino.borges 861 AddToIssuesList(docID,f"{val} in Formatted To Field is an attorney but couldnt be matched to any value in metadata field.")
581 nino.borges 855
582    
583 nino.borges 857
## Now unpack and write the issues, per DocID, to the output file.
## Each output line has the form "<docID>|<issue>;<issue>;..." (issues joined by semicolon).
## The with-block guarantees the file is flushed and closed even if a write fails part way through,
## which the previous explicit open()/close() pair did not.
with open(outputFileName,'w') as outputFile:
    ## Iterating items() avoids copying the key list and a second lookup per DocID.
    for docID, docIssues in issuesMatrix.items():
        outputFile.write(f"{docID}|{';'.join(docIssues)}\n")