ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 872
Committed: Mon Dec 23 22:53:20 2024 UTC (15 months ago) by nino.borges
Content type: text/x-python
File size: 56866 byte(s)
Log Message:
Added support for writing the "To Field" or "From Field" information to the issue messages, in advance of possibly making this a single report.  Also going to eventually make this iterate over a list of fields instead of doing them one by one.

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC but it will perform a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file but for now i'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
19 nino.borges 872 version = '0.14.0'
20 nino.borges 855
21     issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Build every (full name, email domain) pair that may exist in the formatted field.

    personMatch is the result of a person match. This function reads its
    work_email_address, alt_work_email_address, full_name_overide,
    full_name_preferred, last_name and first_name attributes.

    Returns a deduplicated list of tuple pairs (fullname, parenthetical), in
    first-seen order. Every pair needs a domain, so the list is empty when
    neither email address is populated.
    """
    ## Gather the domains first, from the work email and then the alt work email.
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if not emailAddress:
            continue
        allDomainsList.append(f"{emailAddress.split('@')[-1]}")
        ## After talking to Eli, we decided that all of these amazon.com.uk or amazon.it are related domains,
        ## so we should feel confident that we can add amazon.com to the list of possible domains.
        if "@AMAZON." in emailAddress:
            allDomainsList.append("AMAZON.COM")
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Next gather the candidate full names, in priority order: override, preferred, then first + last.
    candidateNamesList = []

    if personMatch.full_name_overide:
        candidateNamesList.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')
        fullPreferredName = fullPreferredName.replace(',,', ',')

        ## Only "LAST, FIRST" (exactly one comma) can be parsed. Anything else, including a name
        ## with extra commas that used to raise ValueError on unpacking, is reported and skipped.
        nameParts = [part.strip() for part in fullPreferredName.split(',')]
        if len(nameParts) == 2:
            preferedLastName, preferedFirstName = nameParts
            ## Keep only the first token of the first name (drops middle names or initials).
            preferedFirstName = preferedFirstName.split(" ")[0]
            candidateNamesList.append(f"{preferedFirstName} {preferedLastName}")
        else:
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            candidateNamesList.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            candidateNamesList.append(f"{personMatch.last_name}")

    ## Pair every candidate name with every domain (name-major order).
    allPossibleVariationsList = [
        (fullName, domain)
        for fullName in candidateNamesList
        for domain in allDomainsList
    ]

    ## Now return a deduplicated list by using dict to deduplicate.
    return list(dict.fromkeys(allPossibleVariationsList))
76    
77    
def AddToIssuesList(docID,issueMessage):
    """Add a single issue message for docID to the module-level issuesMatrix.

    issuesMatrix maps each docID to the list of issue messages recorded for it,
    in the order they were added.
    """
    ## setdefault creates the list the first time a docID is seen, so there is
    ## no need to build a list of every key just to test membership.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
84 nino.borges 855
85    
86    
87     if __name__ == '__main__':
88 nino.borges 868 cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241215\PrivLogExports\PrivLogExport_20241211_CAAG_Converted.txt"
89     #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
90     #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
91 nino.borges 872 #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241216-0954).xlsx"
92     #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.16(20241219-0157).xlsx"
93     masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.20(20241220-0107).xlsx"
94 nino.borges 868 fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\CAAG-MasterAttorneyList\FullNameOverides.txt"
95 nino.borges 855 outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"
96    
97    
98     nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
99    
100     qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
101    
102 nino.borges 872 ## Create a simple matrix mapping each named tuple value name to its message field name, to control the text used in the issue message
103     ntvnMatrix = {'toValues':'To Field','fromValues':'From Field','ccValues':'CC Field','bccValues':'BCC Field','docAuthor':'Doc Author Field'}
104     ## This next line will change as soon as you start iterating over all of these instead of just doing one at a time. For now I just do each of these one by one.
105     ## TODO: Change this to iterate over a list of all of these instead of just doing them one by one.
106     #currentNtvn = 'toValues'
107     #currentNtvn = 'fromValues'
108     #currentNtvn = 'ccValues'
109     #currentNtvn = 'bccValues'
110     currentNtvn = 'docAuthor'
111 nino.borges 855
112     print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
113     print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")
114    
115     workList = qcP.metadataValuesDict.keys()
116     for docID in workList:
117 nino.borges 872 metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()[currentNtvn]
118     formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()[currentNtvn]
119 nino.borges 855 ## remember to convert all values in formattedFieldValues to uppercase (perhaps eventually do some of the formatted cleaning that eli mentioned.
120     formattedFieldValues = [xVal.upper() for xVal in formattedFieldValues]
121     ## This will change once you start itterating acroll all of the field values names
122     currentMetadataValues = metadataFieldValues
123     for val in currentMetadataValues:
124     ## First try to locate an email address in this val and if found, try to find that in the MAL.
125     results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
126     if results:
127     ## Use some smart deduplication to remove duplicates.
128     results = nv.SmartDedupeSet(results)
129 nino.borges 863 if len(results) > 1:
130     print(f"WARNING: more than one unique email address found in this value: {results}")
131 nino.borges 855 for result in results:
132     ## Try to find a match in the MAL by email. There shouldnt rows with duplicative email addresses.
133     ## TODO:DONE: Update search_by_email to search both workemail and alt email.
134    
135     personMatch = nv.malPeopleList.search_by_email(result.upper())
136     if personMatch:
137     ## Person match found in MAL. Now try to match a value in the formatted field by pulling various values from the MAL.
138     ## For each of these match attempts, try using the correct designation and incorrect designation (* vs no *) and note that.
139     allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
140     matchFlag = False
141 nino.borges 861 if allPossibleVariationsList:
142     for variationPair in allPossibleVariationsList:
143     if personMatch.is_attorney == 'YES':
144 nino.borges 860 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
145     ## This variation was found in the list of formatted values, which is fine, so just remove it.
146     if matchFlag:
147     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
148 nino.borges 861
149 nino.borges 860 formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
150     matchFlag = True
151 nino.borges 861
152    
153 nino.borges 860 elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
154     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
155     if matchFlag:
156     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
157     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
158     matchFlag = True
159 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
160     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential Upgrade")
161 nino.borges 860
162 nino.borges 861
163     elif personMatch.is_attorney == 'NO':
164 nino.borges 860 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
165     if matchFlag:
166     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
167     ## This variation was found in the list of formatted values, which is fine, so just remove it.
168     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
169     matchFlag = True
170     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
171     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
172     if matchFlag:
173     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
174     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
175     matchFlag = True
176 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
177     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential downgrade")
178 nino.borges 861 else:
179     ## This means they are a split role, so additional work will need to be done with the dates.
180 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding issues where this doenst always exist.
181     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
182     ## Second, determin if this document date is between the dates where this person was an attorney
183     wasAttorneyAtThatTime = False
184     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
185     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
186     #print(f"\ndocumentDateValue is {documentDateValue}")
187     personWasAttorneyDates = personMatch.dates_as_counsel
188     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
189     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
190     if wasAttorneyStartDate.count("/") < 2:
191     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
192     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
193    
194     if wasAttorneyEndDate == "CURRENT":
195     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
196     elif wasAttorneyEndDate == "PRESENT":
197     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
198     if wasAttorneyEndDate.count("/") < 2:
199     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
200     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
201     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
202 nino.borges 861
203 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
204     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
205     wasAttorneyAtThatTime = True
206    
207     ## if wasAttorneyAtThatTime:
208     ## print("Person WAS attorney at this doc date.")
209     ## else:
210     ## print("Person WAS NOT attorney at this doc date.")
211    
212     ## Person's role at the time of the document has been determined, so now do the same checks as above.
213     if wasAttorneyAtThatTime:
214     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
215     ## This variation was found in the list of formatted values, which is fine, so just remove it.
216     if matchFlag:
217     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
218     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
219     matchFlag = True
220     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
221     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
222     if matchFlag:
223     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
224     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
225     matchFlag = True
226 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
227     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential Upgrade")
228 nino.borges 868
229     else:
230     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
231     if matchFlag:
232     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
233     ## This variation was found in the list of formatted values, which is fine, so just remove it.
234     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
235     matchFlag = True
236     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
237     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
238     if matchFlag:
239     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
240     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
241     matchFlag = True
242 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
243     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential downgrade")
244 nino.borges 861 else:
245 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
246     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
247 nino.borges 861 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY found in meta but MISSING FROM FORMATTED
248     if matchFlag:
249     pass
250     else:
251 nino.borges 860 if personMatch.is_attorney == 'YES':
252 nino.borges 872 AddToIssuesList(docID,f"{val} in Metadata {ntvnMatrix[currentNtvn]} did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")
253 nino.borges 855
254     else:
255 nino.borges 863 ## Person match, using email, not found in MAL.
256     ## Try extracting a name from this metadata value and try matching the MAL using that.
257 nino.borges 855 val = val.upper()
258 nino.borges 863 origVal = val
259 nino.borges 867
260 nino.borges 855 ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
261     if "(LEGAL)" in val:
262     ## Attempt to only remove the email parenthetical, including the now empty paren.
263     val = val.replace(result.upper(),"")
264     val = val.replace("()",'')
265     #val = val.replace(")","")
266     else:
267     ## Remove all parenthicals, including any character in that paren, from value.
268     val = re.sub(r"\([^)]*\)","",val)
269 nino.borges 867 val = val.replace(result.upper(),"")
270 nino.borges 855 val = val.strip()
271     ## with the email address and the paren stripped out of the val, only move forward if anything still exists.
272     if val:
273     ## if there is a comma, parse to last name, first name
274     if "," in val:
275 nino.borges 868 if val.count(",") < 2:
276     lastName, firstName = val.split(",")
277     else:
278     ## This is here to catch some malformatted values in the metadata. this will never match.
279     lastName, firstName = val.split(",", maxsplit = 1)
280     print(f"WARNING: Malformed metadata value found: {val}")
281 nino.borges 855 lastName = lastName.strip()
282     firstName = firstName.strip()
283     elif " " in val:
284     ## For now, try just splitting by the first space and take everything after as the first name.
285     firstName, lastName = val.split(" ",1)
286     ## With the name now parse, try searching for all values that match on the last name.
287 nino.borges 864
288 nino.borges 855 personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
289     if personMatchList:
290     possiblePeopleMatchesMatrix = {}
291     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
292     for personMatch in personMatchList:
293     if personMatch.first_name == firstName:
294     ## This is a personMatch that matches the first and last name
295     possiblePeopleMatchesMatrix[personMatch._id] = 1
296     if possiblePeopleMatchesMatrix.keys():
297     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
298     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
299 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
300     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
301    
302 nino.borges 855 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
303 nino.borges 865
304 nino.borges 855 matchFlag = False
305 nino.borges 861 if allPossibleVariationsList:
306     for variationPair in allPossibleVariationsList:
307     if personMatch.is_attorney == 'YES':
308 nino.borges 868 #if personMatch.last_name == "MANEK":
309     #print(variationPair)
310 nino.borges 861 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
311     ## This variation was found in the list of formatted values, which is fine, so just remove it.
312     if matchFlag:
313     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
314     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
315     matchFlag = True
316     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
317     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
318     if matchFlag:
319     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
320     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
321     matchFlag = True
322 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
323     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
324 nino.borges 866 ## else:
325     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
326     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
327     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
328 nino.borges 861
329 nino.borges 862 elif personMatch.is_attorney == 'NO':
330 nino.borges 861 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
331     if matchFlag:
332     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
333     ## This variation was found in the list of formatted values, which is fine, so just remove it.
334     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
335     matchFlag = True
336     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
337     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
338     if matchFlag:
339     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
340     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
341     matchFlag = True
342 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
343     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
344 nino.borges 862 else:
345     ## This means they are a split role, so additional work will need to be done with the dates.
346 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding issues where this doenst always exist.
347     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
348     ## Second, determin if this document date is between the dates where this person was an attorney
349     wasAttorneyAtThatTime = False
350     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
351     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
352     #print(f"\ndocumentDateValue is {documentDateValue}")
353     personWasAttorneyDates = personMatch.dates_as_counsel
354     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
355     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
356     if wasAttorneyStartDate.count("/") < 2:
357     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
358     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
359    
360     if wasAttorneyEndDate == "CURRENT":
361     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
362     elif wasAttorneyEndDate == "PRESENT":
363     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
364     if wasAttorneyEndDate.count("/") < 2:
365     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
366     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
367     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
368 nino.borges 862
369 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
370     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
371     wasAttorneyAtThatTime = True
372    
373     ## if wasAttorneyAtThatTime:
374     ## print("Person WAS attorney at this doc date.")
375     ## else:
376     ## print("Person WAS NOT attorney at this doc date.")
377    
378     ## Person's role at the time of the document has been determined, so now do the same checks as above.
379     if wasAttorneyAtThatTime:
380     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
381     ## This variation was found in the list of formatted values, which is fine, so just remove it.
382     if matchFlag:
383     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
384     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
385     matchFlag = True
386     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
387     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
388     if matchFlag:
389     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
390     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
391     matchFlag = True
392 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
393     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
394 nino.borges 868
395     else:
396     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
397     if matchFlag:
398     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
399     ## This variation was found in the list of formatted values, which is fine, so just remove it.
400     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
401     matchFlag = True
402     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
403     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
404     if matchFlag:
405     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
406     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
407     matchFlag = True
408 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
409     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
410 nino.borges 862 else:
411 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
412     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
413 nino.borges 866 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
414     if matchFlag:
415     pass
416     else:
417     if personMatch.is_attorney == 'YES':
418 nino.borges 872 AddToIssuesList(docID,f"{origVal} in Metadata {ntvnMatrix[currentNtvn]} and did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
419 nino.borges 863 else:
420     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
421     ## TODO: Add support here for more than one first name last name match in MAL.
422 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
423     ## Dont try to compare these against the formatted because that could lead to eliminating possitive or false positives.
424 nino.borges 855
425     else:
426     ## TODO: Need to ask Eli if I dont a match by checking first and last name for a match if it's needed to flag these.
427 nino.borges 857 #AddToIssuesList(docID,f"first name: {firstName} - last name: {lastName} is an email in metadata that I couldnt match in MAL")
428 nino.borges 855 pass
429 nino.borges 857
430 nino.borges 855 else:
431     ## No email address could be extracted from this val. Try extracting a name from this metadata value and try matching the MAL using that.
432 nino.borges 863 #AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
433     val = val.upper()
434     origVal = val
435     ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
436     if "(LEGAL)" in val:
437     pass
438     else:
439     ## Remove all parenthicals, including any character in that paren, from value.
440     val = re.sub(r"\([^)]*\)","",val)
441    
442     val = val.strip()
443     ## with the paren information stripped out of the val, only move forward if anything still exists.
444     if val:
445     ## if there is a comma, parse to last name, first name
446     if "," in val:
447 nino.borges 868 if val.count(",") < 2:
448     lastName, firstName = val.split(",")
449     else:
450     ## This is here to catch some malformatted values in the metadata. this will never match.
451     lastName, firstName = val.split(",", maxsplit = 1)
452     print(f"WARNING: Malformed metadata value found: {val}")
453 nino.borges 863 lastName = lastName.strip()
454     firstName = firstName.strip()
455     elif " " in val:
456     ## For now, try just splitting by the first space and take everything after as the first name.
457     firstName, lastName = val.split(" ",1)
458     ## With the name now parse, try searching for all values that match on the last name.
459    
460     personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
461     if personMatchList:
462     possiblePeopleMatchesMatrix = {}
463     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
464     for personMatch in personMatchList:
465     if personMatch.first_name == firstName:
466     ## This is a personMatch that matches the first and last name
467     possiblePeopleMatchesMatrix[personMatch._id] = 1
468     if possiblePeopleMatchesMatrix.keys():
469     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
470     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
471 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
472     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
473    
474 nino.borges 863 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
475     matchFlag = False
476     if allPossibleVariationsList:
477     for variationPair in allPossibleVariationsList:
478     if personMatch.is_attorney == 'YES':
479     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
480     ## This variation was found in the list of formatted values, which is fine, so just remove it.
481     if matchFlag:
482     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
483     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
484     matchFlag = True
485     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
486     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
487     if matchFlag:
488     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
489     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
490     matchFlag = True
491 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to iterate over all field groups.
492     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
493 nino.borges 866 ## else:
494     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
495     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
496     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
497 nino.borges 857
498 nino.borges 863 elif personMatch.is_attorney == 'NO':
499     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
500     if matchFlag:
501     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
502     ## This variation was found in the list of formatted values, which is fine, so just remove it.
503     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
504     matchFlag = True
505     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
506     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
507     if matchFlag:
508     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
509     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
510     matchFlag = True
511 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to iterate over all field groups.
512     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
513 nino.borges 863 else:
514     ## This means they are a split role, so additional work will need to be done with the dates.
515 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding issues where this doesn't always exist.
516     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
517     ## Second, determine if this document date is between the dates where this person was an attorney
518     wasAttorneyAtThatTime = False
519     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
520     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
521     #print(f"\ndocumentDateValue is {documentDateValue}")
522     personWasAttorneyDates = personMatch.dates_as_counsel
523     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
524     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
525     if wasAttorneyStartDate.count("/") < 2:
526     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
527     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
528    
529     if wasAttorneyEndDate == "CURRENT":
530     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
531     elif wasAttorneyEndDate == "PRESENT":
532     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
533     if wasAttorneyEndDate.count("/") < 2:
534     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
535     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
536     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
537 nino.borges 863
538 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
539     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
540     wasAttorneyAtThatTime = True
541    
542     ## if wasAttorneyAtThatTime:
543     ## print("Person WAS attorney at this doc date.")
544     ## else:
545     ## print("Person WAS NOT attorney at this doc date.")
546    
547     ## Person's role at the time of the document has been determined, so now do the same checks as above.
548     if wasAttorneyAtThatTime:
549     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
550     ## This variation was found in the list of formatted values, which is fine, so just remove it.
551     if matchFlag:
552     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
553     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
554     matchFlag = True
555     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
556     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
557     if matchFlag:
558     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
559     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
560     matchFlag = True
561 nino.borges 872 ## TODO:DONE change the hard coded "To Field" here once you change to iterate over all field groups.
562     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
563 nino.borges 868
564     else:
565     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
566     if matchFlag:
567     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
568     ## This variation was found in the list of formatted values, which is fine, so just remove it.
569     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
570     matchFlag = True
571     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
572     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
573     if matchFlag:
574     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
575     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
576     matchFlag = True
577 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to iterate over all field groups.
578     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
579 nino.borges 863 else:
580 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
581     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
582 nino.borges 866 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
583     if matchFlag:
584     pass
585     else:
586     if personMatch.is_attorney == 'YES':
587 nino.borges 872 AddToIssuesList(docID,f"{origVal} in Metadata {ntvnMatrix[currentNtvn]} did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
588 nino.borges 863 else:
589     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
590     ## TODO: Add support here for more than one first name last name match in MAL.
591 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
592     ## Don't try to compare these against the formatted because that could lead to eliminating positives or false positives.
593 nino.borges 863
594    
595    
596 nino.borges 855
597     ## Since you iterated over the metadata values but didn't iterate over the formatted values, check for any remaining formatted values that exist in the list
598     if formattedFieldValues:
599     for val in formattedFieldValues:
600     ## TODO: Confirm with Eli but we should only report these remaining values if they have a *
601     ## From Eli: the highest risk is the * values because these are the potential overdesignations, so yes, but in a perfect world we would check both.
602     if "*" in val:
603 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to iterate over all field groups.
604     AddToIssuesList(docID,f"{val} in Formatted {ntvnMatrix[currentNtvn]} is an attorney but couldnt be matched to any value in metadata field.")
605 nino.borges 855
606    
607 nino.borges 857
608 nino.borges 855 ## Now just unpack and write the issues, per DocID, to the output file separated by semicolon.
609     outputFile = open(outputFileName,'w')
610     for docID in list(issuesMatrix.keys()):
611     outputFile.write(f"{docID}|{';'.join(issuesMatrix[docID])}\n")
612     outputFile.close()