ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 865
Committed: Sat Dec 14 00:36:53 2024 UTC (15 months, 1 week ago) by nino.borges
Content type: text/x-python
File size: 50535 byte(s)
Log Message:
Updated the variations function to force add AMAZON.COM if there is an @amazon. in the domain name because Eli said I could be confident that the same email name will show up as the amazon.co.uk, amazon.it, etc.  This should increase my matches.

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC but it will perform a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file but for now I'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
19 nino.borges 865 version = '0.10.0'
20 nino.borges 855
21     issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Takes a personMatch, which is the results of a person match, and builds every name match variation that may exist in the formatted field.

    The following attributes are read from personMatch, and any of them may be empty/None:
    work_email_address, alt_work_email_address, full_name_overide, full_name_preferred, first_name, last_name.

    Every candidate full name is paired with every candidate email domain.
    Returns a deduplicated list of tuple pairs (fullname, parenthetical), in first-seen order.
    The list is empty when the person has no email address, because each pair needs a domain."""
    ## Gather the domain of every email address on record for this person.
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if not emailAddress:
            continue
        allDomainsList.append(emailAddress.split('@')[-1])
        ## After talking to Eli, we decided that all of these amazon.co.uk or amazon.it are related domains,
        ## so we should feel confident that we can add amazon.com to the list of possible domains.
        ## NOTE(review): this test is case sensitive; presumably the MAL email values are stored uppercase - confirm.
        if "@AMAZON." in emailAddress:
            allDomainsList.append("AMAZON.COM")
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Gather the candidate full names, in priority order: overide, preferred, then first + last.
    allFullNamesList = []
    if personMatch.full_name_overide:
        allFullNamesList.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')
        if "," in fullPreferredName:
            ## Only split on the FIRST comma.  A value such as "DOE, JOHN, JR" used to raise a ValueError
            ## because a plain split(',') returned more than two parts.
            preferedLastName, _, preferedFirstName = fullPreferredName.partition(',')
            preferedLastName = preferedLastName.strip()
            ## Keep only the first word of the first name and drop any comma left hanging on it.
            preferedFirstName = preferedFirstName.strip().split(" ")[0].rstrip(',')
            allFullNamesList.append(f"{preferedFirstName} {preferedLastName}")
        else:
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            allFullNamesList.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            allFullNamesList.append(f"{personMatch.last_name}")

    ## Pair every name with every domain, keeping the same order as before (all domains for the first name, then the next name).
    allPossibleVariationsList = [(fullName, domain) for fullName in allFullNamesList for domain in allDomainsList]

    ## Now return a deduplicated list by using dict to deduplicate.
    return list(dict.fromkeys(allPossibleVariationsList))
77    
78    
def AddToIssuesList(docID,issueMessage):
    """This function will add a single issue to the issues matrix.

    docID is the key in the module-level issuesMatrix dictionary and issueMessage is appended to the
    list of messages held for that document.  The list is created the first time a docID is seen.
    Nothing is returned."""
    ## setdefault does the find-or-create as one dictionary operation, rather than copying
    ## every key into a new list on each call just to test membership.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
85 nino.borges 855
86    
87    
88     if __name__ == '__main__':
89     cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
90     masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
91     fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
92     outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"
93    
94    
95     nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
96    
97     qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
98    
99     #issuesMatrix = {}
100    
101     print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
102     print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")
103    
104     workList = qcP.metadataValuesDict.keys()
105     for docID in workList:
106     metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
107     formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
108 nino.borges 863 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['docAuthor']
109     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['docAuthor']
110 nino.borges 855 ## remember to convert all values in formattedFieldValues to uppercase (perhaps eventually do some of the formatted cleaning that eli mentioned.
111     formattedFieldValues = [xVal.upper() for xVal in formattedFieldValues]
112     ## This will change once you start itterating acroll all of the field values names
113     currentMetadataValues = metadataFieldValues
114     for val in currentMetadataValues:
115     ## First try to locate an email address in this val and if found, try to find that in the MAL.
116     results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
117     if results:
118     ## Use some smart deduplication to remove duplicates.
119     results = nv.SmartDedupeSet(results)
120 nino.borges 863 if len(results) > 1:
121     print(f"WARNING: more than one unique email address found in this value: {results}")
122 nino.borges 855 for result in results:
123     ## Try to find a match in the MAL by email. There shouldnt rows with duplicative email addresses.
124     ## TODO:DONE: Update search_by_email to search both workemail and alt email.
125    
126     personMatch = nv.malPeopleList.search_by_email(result.upper())
127     if personMatch:
128     ## Person match found in MAL. Now try to match a value in the formatted field by pulling various values from the MAL.
129     ## For each of these match attempts, try using the correct designation and incorrect designation (* vs no *) and note that.
130     allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
131     matchFlag = False
132 nino.borges 861 if allPossibleVariationsList:
133     for variationPair in allPossibleVariationsList:
134     if personMatch.is_attorney == 'YES':
135 nino.borges 860 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
136     ## This variation was found in the list of formatted values, which is fine, so just remove it.
137     if matchFlag:
138     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
139 nino.borges 861
140 nino.borges 860 formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
141     matchFlag = True
142 nino.borges 861
143    
144 nino.borges 860 elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
145     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
146     if matchFlag:
147     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
148     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
149     matchFlag = True
150     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
151     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
152    
153 nino.borges 861
154     elif personMatch.is_attorney == 'NO':
155 nino.borges 860 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
156     if matchFlag:
157     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
158     ## This variation was found in the list of formatted values, which is fine, so just remove it.
159     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
160     matchFlag = True
161     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
162     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
163     if matchFlag:
164     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
165     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
166     matchFlag = True
167     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
168     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
169 nino.borges 861 else:
170     ## This means they are a split role, so additional work will need to be done with the dates.
171     ## First, determin if this document date is between the dates where this person was an attorney
172     wasAttorneyAtThatTime = False
173     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
174     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
175     #print(f"\ndocumentDateValue is {documentDateValue}")
176     personWasAttorneyDates = personMatch.dates_as_counsel
177     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
178     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
179     if wasAttorneyStartDate.count("/") < 2:
180     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
181     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
182    
183     if wasAttorneyEndDate == "CURRENT":
184     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
185     elif wasAttorneyEndDate == "PRESENT":
186     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
187     if wasAttorneyEndDate.count("/") < 2:
188     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
189     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
190     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
191 nino.borges 860
192 nino.borges 861 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
193     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
194     wasAttorneyAtThatTime = True
195    
196     ## if wasAttorneyAtThatTime:
197     ## print("Person WAS attorney at this doc date.")
198     ## else:
199     ## print("Person WAS NOT attorney at this doc date.")
200    
201     ## Person's role at the time of the document has been determined, so now do the same checks as above.
202     if wasAttorneyAtThatTime:
203     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
204     ## This variation was found in the list of formatted values, which is fine, so just remove it.
205     if matchFlag:
206     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
207     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
208     matchFlag = True
209     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
210     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
211     if matchFlag:
212     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
213     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
214     matchFlag = True
215     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
216     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
217    
218     else:
219     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
220     if matchFlag:
221     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
222     ## This variation was found in the list of formatted values, which is fine, so just remove it.
223     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
224     matchFlag = True
225     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
226     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
227     if matchFlag:
228     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
229     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
230     matchFlag = True
231     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
232     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
233    
234     ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY found in meta but MISSING FROM FORMATTED
235     if matchFlag:
236     pass
237     else:
238 nino.borges 860 if personMatch.is_attorney == 'YES':
239 nino.borges 861 AddToIssuesList(docID,f"{val} in Metadata To Field and did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")
240 nino.borges 855
241     else:
242 nino.borges 863 ## Person match, using email, not found in MAL.
243     ## Try extracting a name from this metadata value and try matching the MAL using that.
244 nino.borges 855 val = val.upper()
245 nino.borges 863 origVal = val
246 nino.borges 855 ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
247     if "(LEGAL)" in val:
248     ## Attempt to only remove the email parenthetical, including the now empty paren.
249     val = val.replace(result.upper(),"")
250     val = val.replace("()",'')
251     #val = val.replace(")","")
252     else:
253     ## Remove all parenthicals, including any character in that paren, from value.
254     val = re.sub(r"\([^)]*\)","",val)
255 nino.borges 860
256 nino.borges 855 val = val.strip()
257     ## with the email address and the paren stripped out of the val, only move forward if anything still exists.
258     if val:
259     ## if there is a comma, parse to last name, first name
260     if "," in val:
261     lastName, firstName = val.split(",")
262     lastName = lastName.strip()
263     firstName = firstName.strip()
264     elif " " in val:
265     ## For now, try just splitting by the first space and take everything after as the first name.
266     firstName, lastName = val.split(" ",1)
267     ## With the name now parse, try searching for all values that match on the last name.
268 nino.borges 864
269 nino.borges 855 personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
270     if personMatchList:
271     possiblePeopleMatchesMatrix = {}
272     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
273     for personMatch in personMatchList:
274     if personMatch.first_name == firstName:
275     ## This is a personMatch that matches the first and last name
276     possiblePeopleMatchesMatrix[personMatch._id] = 1
277     if possiblePeopleMatchesMatrix.keys():
278     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
279     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
280 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
281     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
282    
283 nino.borges 855 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
284 nino.borges 865
285 nino.borges 855 matchFlag = False
286 nino.borges 861 if allPossibleVariationsList:
287     for variationPair in allPossibleVariationsList:
288     if personMatch.is_attorney == 'YES':
289 nino.borges 865 if personMatch.last_name == "MANEK":
290     print(variationPair)
291 nino.borges 861 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
292     ## This variation was found in the list of formatted values, which is fine, so just remove it.
293     if matchFlag:
294     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
295     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
296     matchFlag = True
297     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
298     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
299     if matchFlag:
300     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
301     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
302     matchFlag = True
303     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
304     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
305     else:
306     ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
307 nino.borges 863 #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
308     AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
309 nino.borges 861
310 nino.borges 862 elif personMatch.is_attorney == 'NO':
311 nino.borges 861 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
312     if matchFlag:
313     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
314     ## This variation was found in the list of formatted values, which is fine, so just remove it.
315     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
316     matchFlag = True
317     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
318     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
319     if matchFlag:
320     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
321     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
322     matchFlag = True
323     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
324     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
325 nino.borges 862 else:
326     ## This means they are a split role, so additional work will need to be done with the dates.
327     ## First, determin if this document date is between the dates where this person was an attorney
328     wasAttorneyAtThatTime = False
329     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
330     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
331     #print(f"\ndocumentDateValue is {documentDateValue}")
332     personWasAttorneyDates = personMatch.dates_as_counsel
333     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
334     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
335     if wasAttorneyStartDate.count("/") < 2:
336     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
337     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
338    
339     if wasAttorneyEndDate == "CURRENT":
340     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
341     elif wasAttorneyEndDate == "PRESENT":
342     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
343     if wasAttorneyEndDate.count("/") < 2:
344     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
345     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
346     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
347    
348     #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
349     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
350     wasAttorneyAtThatTime = True
351    
352     ## if wasAttorneyAtThatTime:
353     ## print("Person WAS attorney at this doc date.")
354     ## else:
355     ## print("Person WAS NOT attorney at this doc date.")
356    
357     ## Person's role at the time of the document has been determined, so now do the same checks as above.
358     if wasAttorneyAtThatTime:
359     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
360     ## This variation was found in the list of formatted values, which is fine, so just remove it.
361     if matchFlag:
362     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
363     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
364     matchFlag = True
365     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
366     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
367     if matchFlag:
368     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
369     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
370     matchFlag = True
371     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
372     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
373    
374     else:
375     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
376     if matchFlag:
377     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
378     ## This variation was found in the list of formatted values, which is fine, so just remove it.
379     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
380     matchFlag = True
381     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
382     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
383     if matchFlag:
384     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
385     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
386     matchFlag = True
387     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
388     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
389 nino.borges 863 else:
390     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
391     ## TODO: Add support here for more than one first name last name match in MAL.
392 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
393     ## Dont try to compare these against the formatted because that could lead to eliminating possitive or false positives.
394 nino.borges 855
395     else:
396     ## TODO: Need to ask Eli if I dont a match by checking first and last name for a match if it's needed to flag these.
397 nino.borges 857 #AddToIssuesList(docID,f"first name: {firstName} - last name: {lastName} is an email in metadata that I couldnt match in MAL")
398 nino.borges 855 pass
399 nino.borges 857
400 nino.borges 855 else:
401     ## No email address could be extracted from this val. Try extracting a name from this metadata value and try matching the MAL using that.
402 nino.borges 863 #AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
403     val = val.upper()
404     origVal = val
405     ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
406     if "(LEGAL)" in val:
407     pass
408     else:
409     ## Remove all parenthicals, including any character in that paren, from value.
410     val = re.sub(r"\([^)]*\)","",val)
411    
412     val = val.strip()
413     ## with the paren information stripped out of the val, only move forward if anything still exists.
414     if val:
415     ## if there is a comma, parse to last name, first name
416     if "," in val:
417     lastName, firstName = val.split(",")
418     lastName = lastName.strip()
419     firstName = firstName.strip()
420     elif " " in val:
421     ## For now, try just splitting by the first space and take everything after as the first name.
422     firstName, lastName = val.split(" ",1)
423     ## With the name now parse, try searching for all values that match on the last name.
424    
425     personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
426     if personMatchList:
427     possiblePeopleMatchesMatrix = {}
428     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
429     for personMatch in personMatchList:
430     if personMatch.first_name == firstName:
431     ## This is a personMatch that matches the first and last name
432     possiblePeopleMatchesMatrix[personMatch._id] = 1
433     if possiblePeopleMatchesMatrix.keys():
434     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
435     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
436 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
437     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
438    
439 nino.borges 863 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
440     matchFlag = False
441     if allPossibleVariationsList:
442     for variationPair in allPossibleVariationsList:
443     if personMatch.is_attorney == 'YES':
444     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
445     ## This variation was found in the list of formatted values, which is fine, so just remove it.
446     if matchFlag:
447     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
448     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
449     matchFlag = True
450     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
451     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
452     if matchFlag:
453     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
454     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
455     matchFlag = True
456     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
457     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
458     else:
459     ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
460     #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
461     AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
462 nino.borges 857
463 nino.borges 863 elif personMatch.is_attorney == 'NO':
464     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
465     if matchFlag:
466     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
467     ## This variation was found in the list of formatted values, which is fine, so just remove it.
468     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
469     matchFlag = True
470     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
471     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
472     if matchFlag:
473     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
474     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
475     matchFlag = True
476     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
477     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
478     else:
479     ## This means they are a split role, so additional work will need to be done with the dates.
480     ## First, determin if this document date is between the dates where this person was an attorney
481     wasAttorneyAtThatTime = False
482     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
483     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
484     #print(f"\ndocumentDateValue is {documentDateValue}")
485     personWasAttorneyDates = personMatch.dates_as_counsel
486     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
487     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
488     if wasAttorneyStartDate.count("/") < 2:
489     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
490     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
491    
492     if wasAttorneyEndDate == "CURRENT":
493     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
494     elif wasAttorneyEndDate == "PRESENT":
495     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
496     if wasAttorneyEndDate.count("/") < 2:
497     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
498     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
499     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
500    
501     #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
502     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
503     wasAttorneyAtThatTime = True
504    
505     ## if wasAttorneyAtThatTime:
506     ## print("Person WAS attorney at this doc date.")
507     ## else:
508     ## print("Person WAS NOT attorney at this doc date.")
509    
510     ## Person's role at the time of the document has been determined, so now do the same checks as above.
511     if wasAttorneyAtThatTime:
512     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
513     ## This variation was found in the list of formatted values, which is fine, so just remove it.
514     if matchFlag:
515     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
516     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
517     matchFlag = True
518     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
519     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
520     if matchFlag:
521     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
522     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
523     matchFlag = True
524     ## TODO: change the hard-coded "To Field" here once you change to iterate over all field groups.
525     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
526    
527     else:
528     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
529     if matchFlag:
530     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
531     ## This variation was found in the list of formatted values, which is fine, so just remove it.
532     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
533     matchFlag = True
534     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
535     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
536     if matchFlag:
537     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
538     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
539     matchFlag = True
540     ## TODO: change the hard-coded "To Field" here once you change to iterate over all field groups.
541     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
542     else:
543     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
544     ## TODO: Add support here for more than one first name last name match in MAL.
545 nino.borges 864 ## ELI said to just test whether the group has a mix of attorney and non-attorney and, if so, report that as a "needs manual check".
546     ## Don't try to compare these against the formatted values because that could lead to eliminating positives or false positives.
547 nino.borges 863
548    
549    
550 nino.borges 855
551     ## Since you iterated over the metadata values but didn't iterate over the formatted values, check for any remaining formatted values that exist in the list.
552     if formattedFieldValues:
553     for val in formattedFieldValues:
554     ## TODO: Confirm with Eli but we should only report these remaining values if they have a *
555     ## From Eliu: the Highest risk is the * values because these are the potential overdesignations so yes but in a perfect world we would check both.
556     if "*" in val:
557 nino.borges 856 ## TODO: change the hard-coded "To Field" here once you change to iterate over all field groups.
558 nino.borges 861 AddToIssuesList(docID,f"{val} in Formatted To Field is an attorney but couldnt be matched to any value in metadata field.")
559 nino.borges 855
560    
561 nino.borges 857
## Write the collected issues to the output file, one line per DocID in the
## form "<docID>|<issue1>;<issue2>;...".
## The context manager guarantees the file handle is closed (and buffered data
## flushed) even if a write or join raises part-way through.
with open(outputFileName, 'w') as outputFile:
    for docID, docIssues in issuesMatrix.items():
        outputFile.write(f"{docID}|{';'.join(docIssues)}\n")