ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 868
Committed: Tue Dec 17 17:59:43 2024 UTC (15 months, 1 week ago) by nino.borges
Content type: text/x-python
File size: 56179 byte(s)
Log Message:
This version adds logic for instances where there is a split-role attorney but there is no date value in the priv log, which I didn't initially think would ever happen but found instances where it does.  It also adds support for situations where the name can't be parsed because there are multiple commas in the metadata name value.

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC but it will perform a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file but for now i'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
19 nino.borges 868 version = '0.13.0'
20 nino.borges 855
21     issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Takes a personMatch, which is the results of a person match, and attempts to make all possible
    name match variations that may exist in the formatted field.

    The following attributes are read from personMatch: work_email_address, alt_work_email_address,
    full_name_overide, full_name_preferred, last_name and first_name.  Any of them may be empty.

    Returns a deduplicated list of tuple pairs (fullname, parenthetical), in first-seen order, where the
    parenthetical is an email domain.  The list is empty when no email address is present, because there
    is then no domain to pair a name with.

    A preferred name that does not contain exactly one comma (after the cleanup below) cannot be parsed
    into last name / first name; it is reported with an ERROR message and skipped instead of raising."""
    ## Collect the domain of every email address on record, work address first.
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if not emailAddress:
            continue
        allDomainsList.append(emailAddress.split('@')[-1])
        ## After talking to Eli, we decided that all of these amazon.com.uk or amazon.it are related domains,
        ## so we should feel confident that we can add amazon.com to the list of possible domains.
        if "@AMAZON." in emailAddress:
            allDomainsList.append("AMAZON.COM")
    ## Deduplicate while keeping the order in which the domains were found.
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Collect every candidate full name, in priority order: override, preferred, then first + last.
    possibleFullNames = []

    if personMatch.full_name_overide:
        possibleFullNames.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')
        fullPreferredName = fullPreferredName.replace(',,', ',')
        if fullPreferredName.count(',') == 1:
            preferedLastName, preferedFirstName = fullPreferredName.split(',')
            preferedLastName = preferedLastName.strip()
            ## Only the first token of the first-name portion is used (drops middle names / initials).
            preferedFirstName = preferedFirstName.strip().split(" ")[0]
            possibleFullNames.append(f"{preferedFirstName} {preferedLastName}")
        else:
            ## Zero commas or more than one comma: the "last, first" layout cannot be trusted,
            ## so report it and move on rather than crash on the tuple unpacking.
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            possibleFullNames.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            possibleFullNames.append(f"{personMatch.last_name}")

    ## Pair every candidate name with every domain.
    allPossibleVariationsList = [
        (fullName, domain)
        for fullName in possibleFullNames
        for domain in allDomainsList
    ]

    ## Now return a deduplicated list by using dict to deduplicate.
    return list(dict.fromkeys(allPossibleVariationsList))
76    
77    
def AddToIssuesList(docID,issueMessage):
    """This function will add a single issue to the issues matrix.

    issuesMatrix is the module-level dict that maps a docID to the list of issue messages recorded
    for that document.  The message is appended to the existing list for docID, or a new
    single-item list is started if this is the first issue for that docID.  Returns nothing."""
    ## setdefault does the lookup and the "create if missing" in a single dict operation,
    ## without building a list of all keys on every call.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
84 nino.borges 855
85    
86    
87     if __name__ == '__main__':
88 nino.borges 868 cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241215\PrivLogExports\PrivLogExport_20241211_CAAG_Converted.txt"
89     #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
90     #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
91     masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241216-0954).xlsx"
92     fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\CAAG-MasterAttorneyList\FullNameOverides.txt"
93 nino.borges 855 outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"
94    
95    
96     nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
97    
98     qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
99    
100     #issuesMatrix = {}
101    
102     print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
103     print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")
104    
105     workList = qcP.metadataValuesDict.keys()
106     for docID in workList:
107 nino.borges 866 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
108     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
109     #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
110     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
111 nino.borges 867 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['ccValues']
112     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['ccValues']
113 nino.borges 868 #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['bccValues']
114     #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['bccValues']
115     metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['docAuthor']
116     formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['docAuthor']
117 nino.borges 855 ## remember to convert all values in formattedFieldValues to uppercase (perhaps eventually do some of the formatted cleaning that eli mentioned.
118     formattedFieldValues = [xVal.upper() for xVal in formattedFieldValues]
119     ## This will change once you start itterating acroll all of the field values names
120     currentMetadataValues = metadataFieldValues
121     for val in currentMetadataValues:
122     ## First try to locate an email address in this val and if found, try to find that in the MAL.
123     results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
124     if results:
125     ## Use some smart deduplication to remove duplicates.
126     results = nv.SmartDedupeSet(results)
127 nino.borges 863 if len(results) > 1:
128     print(f"WARNING: more than one unique email address found in this value: {results}")
129 nino.borges 855 for result in results:
130     ## Try to find a match in the MAL by email. There shouldnt rows with duplicative email addresses.
131     ## TODO:DONE: Update search_by_email to search both workemail and alt email.
132    
133     personMatch = nv.malPeopleList.search_by_email(result.upper())
134     if personMatch:
135     ## Person match found in MAL. Now try to match a value in the formatted field by pulling various values from the MAL.
136     ## For each of these match attempts, try using the correct designation and incorrect designation (* vs no *) and note that.
137     allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
138     matchFlag = False
139 nino.borges 861 if allPossibleVariationsList:
140     for variationPair in allPossibleVariationsList:
141     if personMatch.is_attorney == 'YES':
142 nino.borges 860 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
143     ## This variation was found in the list of formatted values, which is fine, so just remove it.
144     if matchFlag:
145     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
146 nino.borges 861
147 nino.borges 860 formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
148     matchFlag = True
149 nino.borges 861
150    
151 nino.borges 860 elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
152     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
153     if matchFlag:
154     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
155     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
156     matchFlag = True
157     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
158     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
159    
160 nino.borges 861
161     elif personMatch.is_attorney == 'NO':
162 nino.borges 860 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
163     if matchFlag:
164     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
165     ## This variation was found in the list of formatted values, which is fine, so just remove it.
166     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
167     matchFlag = True
168     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
169     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
170     if matchFlag:
171     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
172     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
173     matchFlag = True
174     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
175     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
176 nino.borges 861 else:
177     ## This means they are a split role, so additional work will need to be done with the dates.
178 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding issues where this doenst always exist.
179     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
180     ## Second, determin if this document date is between the dates where this person was an attorney
181     wasAttorneyAtThatTime = False
182     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
183     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
184     #print(f"\ndocumentDateValue is {documentDateValue}")
185     personWasAttorneyDates = personMatch.dates_as_counsel
186     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
187     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
188     if wasAttorneyStartDate.count("/") < 2:
189     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
190     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
191    
192     if wasAttorneyEndDate == "CURRENT":
193     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
194     elif wasAttorneyEndDate == "PRESENT":
195     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
196     if wasAttorneyEndDate.count("/") < 2:
197     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
198     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
199     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
200 nino.borges 861
201 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
202     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
203     wasAttorneyAtThatTime = True
204    
205     ## if wasAttorneyAtThatTime:
206     ## print("Person WAS attorney at this doc date.")
207     ## else:
208     ## print("Person WAS NOT attorney at this doc date.")
209    
210     ## Person's role at the time of the document has been determined, so now do the same checks as above.
211     if wasAttorneyAtThatTime:
212     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
213     ## This variation was found in the list of formatted values, which is fine, so just remove it.
214     if matchFlag:
215     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
216     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
217     matchFlag = True
218     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
219     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
220     if matchFlag:
221     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
222     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
223     matchFlag = True
224     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
225     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
226    
227     else:
228     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
229     if matchFlag:
230     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
231     ## This variation was found in the list of formatted values, which is fine, so just remove it.
232     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
233     matchFlag = True
234     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
235     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
236     if matchFlag:
237     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
238     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
239     matchFlag = True
240     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
241     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
242 nino.borges 861 else:
243 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
244     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
245 nino.borges 861 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY found in meta but MISSING FROM FORMATTED
246     if matchFlag:
247     pass
248     else:
249 nino.borges 860 if personMatch.is_attorney == 'YES':
250 nino.borges 866 AddToIssuesList(docID,f"{val} in Metadata To Field did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")
251 nino.borges 855
252     else:
253 nino.borges 863 ## Person match, using email, not found in MAL.
254     ## Try extracting a name from this metadata value and try matching the MAL using that.
255 nino.borges 855 val = val.upper()
256 nino.borges 863 origVal = val
257 nino.borges 867
258 nino.borges 855 ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
259     if "(LEGAL)" in val:
260     ## Attempt to only remove the email parenthetical, including the now empty paren.
261     val = val.replace(result.upper(),"")
262     val = val.replace("()",'')
263     #val = val.replace(")","")
264     else:
265     ## Remove all parenthicals, including any character in that paren, from value.
266     val = re.sub(r"\([^)]*\)","",val)
267 nino.borges 867 val = val.replace(result.upper(),"")
268 nino.borges 855 val = val.strip()
269     ## with the email address and the paren stripped out of the val, only move forward if anything still exists.
270     if val:
271     ## if there is a comma, parse to last name, first name
272     if "," in val:
273 nino.borges 868 if val.count(",") < 2:
274     lastName, firstName = val.split(",")
275     else:
276     ## This is here to catch some malformatted values in the metadata. this will never match.
277     lastName, firstName = val.split(",", maxsplit = 1)
278     print(f"WARNING: Malformed metadata value found: {val}")
279 nino.borges 855 lastName = lastName.strip()
280     firstName = firstName.strip()
281     elif " " in val:
282     ## For now, try just splitting by the first space and take everything after as the first name.
283     firstName, lastName = val.split(" ",1)
284     ## With the name now parse, try searching for all values that match on the last name.
285 nino.borges 864
286 nino.borges 855 personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
287     if personMatchList:
288     possiblePeopleMatchesMatrix = {}
289     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
290     for personMatch in personMatchList:
291     if personMatch.first_name == firstName:
292     ## This is a personMatch that matches the first and last name
293     possiblePeopleMatchesMatrix[personMatch._id] = 1
294     if possiblePeopleMatchesMatrix.keys():
295     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
296     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
297 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
298     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
299    
300 nino.borges 855 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
301 nino.borges 865
302 nino.borges 855 matchFlag = False
303 nino.borges 861 if allPossibleVariationsList:
304     for variationPair in allPossibleVariationsList:
305     if personMatch.is_attorney == 'YES':
306 nino.borges 868 #if personMatch.last_name == "MANEK":
307     #print(variationPair)
308 nino.borges 861 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
309     ## This variation was found in the list of formatted values, which is fine, so just remove it.
310     if matchFlag:
311     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
312     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
313     matchFlag = True
314     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
315     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
316     if matchFlag:
317     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
318     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
319     matchFlag = True
320     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
321     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
322 nino.borges 866 ## else:
323     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
324     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
325     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
326 nino.borges 861
327 nino.borges 862 elif personMatch.is_attorney == 'NO':
328 nino.borges 861 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
329     if matchFlag:
330     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
331     ## This variation was found in the list of formatted values, which is fine, so just remove it.
332     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
333     matchFlag = True
334     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
335     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
336     if matchFlag:
337     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
338     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
339     matchFlag = True
340     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
341     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
342 nino.borges 862 else:
343     ## This means they are a split role, so additional work will need to be done with the dates.
344 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding issues where this doenst always exist.
345     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
346     ## Second, determin if this document date is between the dates where this person was an attorney
347     wasAttorneyAtThatTime = False
348     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
349     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
350     #print(f"\ndocumentDateValue is {documentDateValue}")
351     personWasAttorneyDates = personMatch.dates_as_counsel
352     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
353     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
354     if wasAttorneyStartDate.count("/") < 2:
355     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
356     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
357    
358     if wasAttorneyEndDate == "CURRENT":
359     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
360     elif wasAttorneyEndDate == "PRESENT":
361     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
362     if wasAttorneyEndDate.count("/") < 2:
363     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
364     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
365     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
366 nino.borges 862
367 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
368     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
369     wasAttorneyAtThatTime = True
370    
371     ## if wasAttorneyAtThatTime:
372     ## print("Person WAS attorney at this doc date.")
373     ## else:
374     ## print("Person WAS NOT attorney at this doc date.")
375    
376     ## Person's role at the time of the document has been determined, so now do the same checks as above.
377     if wasAttorneyAtThatTime:
378     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
379     ## This variation was found in the list of formatted values, which is fine, so just remove it.
380     if matchFlag:
381     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
382     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
383     matchFlag = True
384     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
385     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
386     if matchFlag:
387     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
388     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
389     matchFlag = True
390     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
391     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
392    
393     else:
394     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
395     if matchFlag:
396     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
397     ## This variation was found in the list of formatted values, which is fine, so just remove it.
398     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
399     matchFlag = True
400     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
401     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
402     if matchFlag:
403     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
404     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
405     matchFlag = True
406     ## TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
407     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
408 nino.borges 862 else:
409 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
410     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
411 nino.borges 866 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
412     if matchFlag:
413     pass
414     else:
415     if personMatch.is_attorney == 'YES':
416     AddToIssuesList(docID,f"{origVal} in Metadata To Field and did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
417 nino.borges 863 else:
418     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
419     ## TODO: Add support here for more than one first name last name match in MAL.
420 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
421     ## Dont try to compare these against the formatted because that could lead to eliminating possitive or false positives.
422 nino.borges 855
423     else:
424     ## TODO: Need to ask Eli if I dont a match by checking first and last name for a match if it's needed to flag these.
425 nino.borges 857 #AddToIssuesList(docID,f"first name: {firstName} - last name: {lastName} is an email in metadata that I couldnt match in MAL")
426 nino.borges 855 pass
427 nino.borges 857
428 nino.borges 855 else:
429     ## No email address could be extracted from this val. Try extracting a name from this metadata value and try matching the MAL using that.
430 nino.borges 863 #AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
431     val = val.upper()
432     origVal = val
433     ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
434     if "(LEGAL)" in val:
435     pass
436     else:
437     ## Remove all parenthicals, including any character in that paren, from value.
438     val = re.sub(r"\([^)]*\)","",val)
439    
440     val = val.strip()
441     ## with the paren information stripped out of the val, only move forward if anything still exists.
442     if val:
443     ## if there is a comma, parse to last name, first name
444     if "," in val:
445 nino.borges 868 if val.count(",") < 2:
446     lastName, firstName = val.split(",")
447     else:
448     ## This is here to catch some malformatted values in the metadata. this will never match.
449     lastName, firstName = val.split(",", maxsplit = 1)
450     print(f"WARNING: Malformed metadata value found: {val}")
451 nino.borges 863 lastName = lastName.strip()
452     firstName = firstName.strip()
453     elif " " in val:
454     ## For now, try just splitting by the first space and take everything after as the first name.
455     firstName, lastName = val.split(" ",1)
456     ## With the name now parse, try searching for all values that match on the last name.
457    
458     personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
459     if personMatchList:
460     possiblePeopleMatchesMatrix = {}
461     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
462     for personMatch in personMatchList:
463     if personMatch.first_name == firstName:
464     ## This is a personMatch that matches the first and last name
465     possiblePeopleMatchesMatrix[personMatch._id] = 1
466     if possiblePeopleMatchesMatrix.keys():
467     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
468     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
469 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
470     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
471    
472 nino.borges 863 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
473     matchFlag = False
474     if allPossibleVariationsList:
475     for variationPair in allPossibleVariationsList:
476     if personMatch.is_attorney == 'YES':
477     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
478     ## This variation was found in the list of formatted values, which is fine, so just remove it.
479     if matchFlag:
480     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
481     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
482     matchFlag = True
483     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
484     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
485     if matchFlag:
486     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
487     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
488     matchFlag = True
489     ## TODO: change the hard-coded "To Field" here once you change to iterate over all field groups.
490     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
491 nino.borges 866 ## else:
492     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
493     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
494     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
495 nino.borges 857
496 nino.borges 863 elif personMatch.is_attorney == 'NO':
497     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
498     if matchFlag:
499     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
500     ## This variation was found in the list of formatted values, which is fine, so just remove it.
501     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
502     matchFlag = True
503     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
504     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
505     if matchFlag:
506     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
507     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
508     matchFlag = True
509     ## TODO: change the hard-coded "To Field" here once you change to iterate over all field groups.
510     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
511     else:
512     ## This means they are a split role, so additional work will need to be done with the dates.
513 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding issues where this doesn't always exist.
514     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
515     ## Second, determine if this document date is between the dates where this person was an attorney
516     wasAttorneyAtThatTime = False
517     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
518     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
519     #print(f"\ndocumentDateValue is {documentDateValue}")
520     personWasAttorneyDates = personMatch.dates_as_counsel
521     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
522     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
523     if wasAttorneyStartDate.count("/") < 2:
524     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
525     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
526    
527     if wasAttorneyEndDate == "CURRENT":
528     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
529     elif wasAttorneyEndDate == "PRESENT":
530     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
531     if wasAttorneyEndDate.count("/") < 2:
532     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
533     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
534     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
535 nino.borges 863
536 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
537     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
538     wasAttorneyAtThatTime = True
539    
540     ## if wasAttorneyAtThatTime:
541     ## print("Person WAS attorney at this doc date.")
542     ## else:
543     ## print("Person WAS NOT attorney at this doc date.")
544    
545     ## Person's role at the time of the document has been determined, so now do the same checks as above.
546     if wasAttorneyAtThatTime:
547     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
548     ## This variation was found in the list of formatted values, which is fine, so just remove it.
549     if matchFlag:
550     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
551     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
552     matchFlag = True
553     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
554     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
555     if matchFlag:
556     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
557     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
558     matchFlag = True
559     ## TODO: change the hard-coded "To Field" here once you change to iterate over all field groups.
560     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
561    
562     else:
563     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
564     if matchFlag:
565     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
566     ## This variation was found in the list of formatted values, which is fine, so just remove it.
567     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
568     matchFlag = True
569     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
570     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
571     if matchFlag:
572     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
573     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
574     matchFlag = True
575     ## TODO: change the hard-coded "To Field" here once you change to iterate over all field groups.
576     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
577 nino.borges 863 else:
578 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
579     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
580 nino.borges 866 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
581     if matchFlag:
582     pass
583     else:
584     if personMatch.is_attorney == 'YES':
585     AddToIssuesList(docID,f"{origVal} in Metadata To Field did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
586 nino.borges 863 else:
587     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
588     ## TODO: Add support here for more than one first name last name match in MAL.
589 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
590     ## Don't try to compare these against the formatted values because that could lead to eliminating positives or false positives.
591 nino.borges 863
592    
593    
594 nino.borges 855
595     ## Since you iterated over the metadata values but didn't iterate over the formatted values, check for any remaining formatted values that exist in the list
596     if formattedFieldValues:
597     for val in formattedFieldValues:
598     ## TODO: Confirm with Eli but we should only report these remaining values if they have a *
599     ## From Eliu: the Highest risk is the * values because these are the potential overdesignations so yes but in a perfect world we would check both.
600     if "*" in val:
601 nino.borges 856 ## TODO: change the hard-coded "To Field" here once you change to iterate over all field groups.
602 nino.borges 861 AddToIssuesList(docID,f"{val} in Formatted To Field is an attorney but couldnt be matched to any value in metadata field.")
603 nino.borges 855
604    
605 nino.borges 857
606 nino.borges 855 ## Now just unpack and write the issues, per DocID, to the output file separated by semicolon.
607     outputFile = open(outputFileName,'w')
608     for docID in list(issuesMatrix.keys()):
609     outputFile.write(f"{docID}|{';'.join(issuesMatrix[docID])}\n")
610     outputFile.close()