ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 947
Committed: Wed Nov 5 18:12:54 2025 UTC (4 months, 3 weeks ago) by nino.borges
Content type: text/x-python
File size: 57354 byte(s)
Log Message:
Fixing some conflicts

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC but it will perform a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file but for now i'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
19 nino.borges 872 version = '0.14.0'
20 nino.borges 855
21     issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Build every (full name, email domain) pair that could appear in the formatted field.

    personMatch is the result of a person match. The attributes read here are
    work_email_address, alt_work_email_address, full_name_overide,
    full_name_preferred, first_name and last_name.

    Returns a deduplicated list of tuple pairs (fullname, parenthetical), in
    first-seen order. The list is empty when the person has no email address,
    because every pair needs a domain for the parenthetical.
    """
    ## Collect the domain of the work and alt work email addresses.
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if not emailAddress:
            continue
        allDomainsList.append(emailAddress.split('@')[-1])
        ## After talking to Eli, we decided that all of these amazon.com.uk or amazon.it are related domains,
        ## so we should feel confident that we can add amazon.com to the list of possible domains.
        if "@AMAZON." in emailAddress:
            allDomainsList.append("AMAZON.COM")
    ## dict.fromkeys deduplicates while keeping the first-seen order.
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Collect the candidate names in priority order: override, preferred, then first/last.
    candidateNamesList = []

    if personMatch.full_name_overide:
        candidateNamesList.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')
        fullPreferredName = fullPreferredName.replace(',,', ',')

        if "," in fullPreferredName:
            ## People are adding multiple commas to also separate the middle name (and sometimes a suffix).
            ## Only the first comma separates last name from first name, so split there and drop every
            ## remaining comma. Unpacking a plain split(',') would raise ValueError on 3 or more commas.
            preferedLastName, _, afterLastName = fullPreferredName.partition(',')
            afterLastName = afterLastName.replace(',', '')
            preferedLastName = preferedLastName.strip()
            ## Only the first token is kept as the first name; middle names are dropped.
            preferedFirstName = afterLastName.strip().split(" ")[0]
            candidateNamesList.append(f"{preferedFirstName} {preferedLastName}")
        else:
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            candidateNamesList.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            candidateNamesList.append(f"{personMatch.last_name}")

    ## Pair every name with every domain, then return a deduplicated list by using dict to deduplicate.
    allPossibleVariationsList = [
        (fullName, domain)
        for fullName in candidateNamesList
        for domain in allDomainsList
    ]
    return list(dict.fromkeys(allPossibleVariationsList))
81    
82    
def AddToIssuesList(docID,issueMessage):
    """This function will add a single issue to the issues matrix.

    issuesMatrix is the module-level dict mapping each docID to the list of
    issue messages recorded for it. The list is created the first time a
    docID is seen, and messages are kept in the order they were added.
    """
    ## setdefault does the lookup-or-create in one constant-time step, instead of
    ## copying every key into a new list on each call just to test membership.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
89 nino.borges 855
90    
91    
92     if __name__ == '__main__':
93 nino.borges 947 cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241215\PrivLogExports\PrivLogExport_20241211_CAAG_Converted.txt"
94 nino.borges 868 #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
95     #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
96 nino.borges 872 #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241216-0954).xlsx"
97     #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.16(20241219-0157).xlsx"
98 nino.borges 947 masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2025.10.30 (20251030-1252).xlsx"
99 nino.borges 868 fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\CAAG-MasterAttorneyList\FullNameOverides.txt"
100 nino.borges 855 outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"
101    
102    
103     nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
104    
105     qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
106    
107 nino.borges 872 ## Create a simple matrix mapping each named tuple value name to its message field name, to control the text used in the issue message
108     ntvnMatrix = {'toValues':'To Field','fromValues':'From Field','ccValues':'CC Field','bccValues':'BCC Field','docAuthor':'Doc Author Field'}
109     ## This next line will change as soon as you start iterating over all of these instead of just doing one at a time. For now I just do each of these one by one.
110     ## TODO: Change this to iterate over a list of all of these instead of just doing them one by one.
111 nino.borges 947 #currentNtvn = 'toValues'
112 nino.borges 872 #currentNtvn = 'fromValues'
113     #currentNtvn = 'ccValues'
114     #currentNtvn = 'bccValues'
115 nino.borges 947 currentNtvn = 'docAuthor'
116 nino.borges 855
117     print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
118     print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")
119    
120     workList = qcP.metadataValuesDict.keys()
121     for docID in workList:
122 nino.borges 872 metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()[currentNtvn]
123     formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()[currentNtvn]
124 nino.borges 855 ## remember to convert all values in formattedFieldValues to uppercase (perhaps eventually do some of the formatted cleaning that eli mentioned.
125     formattedFieldValues = [xVal.upper() for xVal in formattedFieldValues]
126     ## This will change once you start iterating across all of the field value names
127     currentMetadataValues = metadataFieldValues
128     for val in currentMetadataValues:
129     ## First try to locate an email address in this val and if found, try to find that in the MAL.
130     results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
131     if results:
132     ## Use some smart deduplication to remove duplicates.
133     results = nv.SmartDedupeSet(results)
134 nino.borges 863 if len(results) > 1:
135     print(f"WARNING: more than one unique email address found in this value: {results}")
136 nino.borges 855 for result in results:
137     ## Try to find a match in the MAL by email. There shouldnt rows with duplicative email addresses.
138     ## TODO:DONE: Update search_by_email to search both workemail and alt email.
139    
140     personMatch = nv.malPeopleList.search_by_email(result.upper())
141     if personMatch:
142     ## Person match found in MAL. Now try to match a value in the formatted field by pulling various values from the MAL.
143     ## For each of these match attempts, try using the correct designation and incorrect designation (* vs no *) and note that.
144     allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
145     matchFlag = False
146 nino.borges 861 if allPossibleVariationsList:
147     for variationPair in allPossibleVariationsList:
148     if personMatch.is_attorney == 'YES':
149 nino.borges 860 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
150     ## This variation was found in the list of formatted values, which is fine, so just remove it.
151     if matchFlag:
152     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
153 nino.borges 861
154 nino.borges 860 formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
155     matchFlag = True
156 nino.borges 861
157    
158 nino.borges 860 elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
159     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
160     if matchFlag:
161     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
162     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
163     matchFlag = True
164 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
165     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential Upgrade")
166 nino.borges 860
167 nino.borges 861
168     elif personMatch.is_attorney == 'NO':
169 nino.borges 860 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
170     if matchFlag:
171     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
172     ## This variation was found in the list of formatted values, which is fine, so just remove it.
173     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
174     matchFlag = True
175     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
176     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
177     if matchFlag:
178     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
179     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
180     matchFlag = True
181 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
182     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential downgrade")
183 nino.borges 861 else:
184     ## This means they are a split role, so additional work will need to be done with the dates.
185 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding issues where this doenst always exist.
186     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
187     ## Second, determin if this document date is between the dates where this person was an attorney
188     wasAttorneyAtThatTime = False
189     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
190     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
191     #print(f"\ndocumentDateValue is {documentDateValue}")
192     personWasAttorneyDates = personMatch.dates_as_counsel
193 nino.borges 921 #print(personMatch.work_email_address)
194 nino.borges 868 for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
195     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
196     if wasAttorneyStartDate.count("/") < 2:
197     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
198     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
199    
200     if wasAttorneyEndDate == "CURRENT":
201     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
202     elif wasAttorneyEndDate == "PRESENT":
203     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
204     if wasAttorneyEndDate.count("/") < 2:
205     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
206     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
207     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
208 nino.borges 861
209 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
210     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
211     wasAttorneyAtThatTime = True
212    
213     ## if wasAttorneyAtThatTime:
214     ## print("Person WAS attorney at this doc date.")
215     ## else:
216     ## print("Person WAS NOT attorney at this doc date.")
217    
218     ## Person's role at the time of the document has been determined, so now do the same checks as above.
219     if wasAttorneyAtThatTime:
220     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
221     ## This variation was found in the list of formatted values, which is fine, so just remove it.
222     if matchFlag:
223     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
224     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
225     matchFlag = True
226     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
227     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
228     if matchFlag:
229     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
230     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
231     matchFlag = True
232 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
233     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential Upgrade")
234 nino.borges 868
235     else:
236     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
237     if matchFlag:
238     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
239     ## This variation was found in the list of formatted values, which is fine, so just remove it.
240     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
241     matchFlag = True
242     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
243     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
244     if matchFlag:
245     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
246     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
247     matchFlag = True
248 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
249     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential downgrade")
250 nino.borges 861 else:
251 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
252     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
253 nino.borges 861 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY found in meta but MISSING FROM FORMATTED
254     if matchFlag:
255     pass
256     else:
257 nino.borges 860 if personMatch.is_attorney == 'YES':
258 nino.borges 872 AddToIssuesList(docID,f"{val} in Metadata {ntvnMatrix[currentNtvn]} did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")
259 nino.borges 855
260     else:
261 nino.borges 863 ## Person match, using email, not found in MAL.
262     ## Try extracting a name from this metadata value and try matching the MAL using that.
263 nino.borges 855 val = val.upper()
264 nino.borges 863 origVal = val
265 nino.borges 867
266 nino.borges 855 ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
267     if "(LEGAL)" in val:
268     ## Attempt to only remove the email parenthetical, including the now empty paren.
269     val = val.replace(result.upper(),"")
270     val = val.replace("()",'')
271     #val = val.replace(")","")
272     else:
273     ## Remove all parenthicals, including any character in that paren, from value.
274     val = re.sub(r"\([^)]*\)","",val)
275 nino.borges 867 val = val.replace(result.upper(),"")
276 nino.borges 855 val = val.strip()
277     ## with the email address and the paren stripped out of the val, only move forward if anything still exists.
278     if val:
279     ## if there is a comma, parse to last name, first name
280     if "," in val:
281 nino.borges 868 if val.count(",") < 2:
282     lastName, firstName = val.split(",")
283     else:
284     ## This is here to catch some malformatted values in the metadata. this will never match.
285     lastName, firstName = val.split(",", maxsplit = 1)
286     print(f"WARNING: Malformed metadata value found: {val}")
287 nino.borges 855 lastName = lastName.strip()
288     firstName = firstName.strip()
289     elif " " in val:
290     ## For now, try just splitting by the first space and take everything after as the first name.
291     firstName, lastName = val.split(" ",1)
292     ## With the name now parse, try searching for all values that match on the last name.
293 nino.borges 864
294 nino.borges 855 personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
295     if personMatchList:
296     possiblePeopleMatchesMatrix = {}
297     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
298     for personMatch in personMatchList:
299     if personMatch.first_name == firstName:
300     ## This is a personMatch that matches the first and last name
301     possiblePeopleMatchesMatrix[personMatch._id] = 1
302     if possiblePeopleMatchesMatrix.keys():
303     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
304     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
305 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
306     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
307    
308 nino.borges 855 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
309 nino.borges 865
310 nino.borges 855 matchFlag = False
311 nino.borges 861 if allPossibleVariationsList:
312     for variationPair in allPossibleVariationsList:
313     if personMatch.is_attorney == 'YES':
314 nino.borges 868 #if personMatch.last_name == "MANEK":
315     #print(variationPair)
316 nino.borges 861 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
317     ## This variation was found in the list of formatted values, which is fine, so just remove it.
318     if matchFlag:
319     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
320     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
321     matchFlag = True
322     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
323     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
324     if matchFlag:
325     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
326     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
327     matchFlag = True
328 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
329     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
330 nino.borges 866 ## else:
331     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
332     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
333     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
334 nino.borges 861
335 nino.borges 862 elif personMatch.is_attorney == 'NO':
336 nino.borges 861 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
337     if matchFlag:
338     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
339     ## This variation was found in the list of formatted values, which is fine, so just remove it.
340     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
341     matchFlag = True
342     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
343     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
344     if matchFlag:
345     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
346     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
347     matchFlag = True
348 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
349     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
350 nino.borges 862 else:
351     ## This means they are a split role, so additional work will need to be done with the dates.
352 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding issues where this doenst always exist.
353     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
354     ## Second, determin if this document date is between the dates where this person was an attorney
355     wasAttorneyAtThatTime = False
356     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
357     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
358     #print(f"\ndocumentDateValue is {documentDateValue}")
359     personWasAttorneyDates = personMatch.dates_as_counsel
360     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
361     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
362     if wasAttorneyStartDate.count("/") < 2:
363     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
364     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
365    
366     if wasAttorneyEndDate == "CURRENT":
367     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
368     elif wasAttorneyEndDate == "PRESENT":
369     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
370     if wasAttorneyEndDate.count("/") < 2:
371     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
372     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
373     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
374 nino.borges 862
375 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
376     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
377     wasAttorneyAtThatTime = True
378    
379     ## if wasAttorneyAtThatTime:
380     ## print("Person WAS attorney at this doc date.")
381     ## else:
382     ## print("Person WAS NOT attorney at this doc date.")
383    
384     ## Person's role at the time of the document has been determined, so now do the same checks as above.
385     if wasAttorneyAtThatTime:
386     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
387     ## This variation was found in the list of formatted values, which is fine, so just remove it.
388     if matchFlag:
389     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
390     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
391     matchFlag = True
392     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
393     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
394     if matchFlag:
395     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
396     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
397     matchFlag = True
398 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
399     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
400 nino.borges 868
401     else:
402     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
403     if matchFlag:
404     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
405     ## This variation was found in the list of formatted values, which is fine, so just remove it.
406     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
407     matchFlag = True
408     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
409     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
410     if matchFlag:
411     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
412     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
413     matchFlag = True
414 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
415     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
416 nino.borges 862 else:
417 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
418     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
419 nino.borges 866 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
420     if matchFlag:
421     pass
422     else:
423     if personMatch.is_attorney == 'YES':
424 nino.borges 872 AddToIssuesList(docID,f"{origVal} in Metadata {ntvnMatrix[currentNtvn]} and did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
425 nino.borges 863 else:
426     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
427     ## TODO: Add support here for more than one first name last name match in MAL.
428 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
429     ## Dont try to compare these against the formatted because that could lead to eliminating possitive or false positives.
430 nino.borges 855
431     else:
432     ## TODO: Need to ask Eli if I dont a match by checking first and last name for a match if it's needed to flag these.
433 nino.borges 857 #AddToIssuesList(docID,f"first name: {firstName} - last name: {lastName} is an email in metadata that I couldnt match in MAL")
434 nino.borges 855 pass
435 nino.borges 857
436 nino.borges 855 else:
437     ## No email address could be extracted from this val. Try extracting a name from this metadata value and try matching the MAL using that.
438 nino.borges 863 #AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
439     val = val.upper()
440     origVal = val
441     ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
442     if "(LEGAL)" in val:
443     pass
444     else:
445     ## Remove all parenthicals, including any character in that paren, from value.
446     val = re.sub(r"\([^)]*\)","",val)
447    
448     val = val.strip()
449     ## with the paren information stripped out of the val, only move forward if anything still exists.
450     if val:
451     ## if there is a comma, parse to last name, first name
452     if "," in val:
453 nino.borges 868 if val.count(",") < 2:
454     lastName, firstName = val.split(",")
455     else:
456     ## This is here to catch some malformatted values in the metadata. this will never match.
457     lastName, firstName = val.split(",", maxsplit = 1)
458     print(f"WARNING: Malformed metadata value found: {val}")
459 nino.borges 863 lastName = lastName.strip()
460     firstName = firstName.strip()
461     elif " " in val:
462     ## For now, try just splitting by the first space and take everything after as the first name.
463     firstName, lastName = val.split(" ",1)
464     ## With the name now parse, try searching for all values that match on the last name.
465    
466     personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
467     if personMatchList:
468     possiblePeopleMatchesMatrix = {}
469     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
470     for personMatch in personMatchList:
471     if personMatch.first_name == firstName:
472     ## This is a personMatch that matches the first and last name
473     possiblePeopleMatchesMatrix[personMatch._id] = 1
474     if possiblePeopleMatchesMatrix.keys():
475     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
476     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
477 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
478     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
479    
480 nino.borges 863 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
481     matchFlag = False
482     if allPossibleVariationsList:
483     for variationPair in allPossibleVariationsList:
484     if personMatch.is_attorney == 'YES':
485     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
486     ## This variation was found in the list of formatted values, which is fine, so just remove it.
487     if matchFlag:
488     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
489     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
490     matchFlag = True
491     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
492     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
493     if matchFlag:
494     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
495     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
496     matchFlag = True
497 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
498     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
499 nino.borges 866 ## else:
500     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
501     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
502     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
503 nino.borges 857
504 nino.borges 863 elif personMatch.is_attorney == 'NO':
505     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
506     if matchFlag:
507     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
508     ## This variation was found in the list of formatted values, which is fine, so just remove it.
509     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
510     matchFlag = True
511     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
512     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
513     if matchFlag:
514     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
515     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
516     matchFlag = True
517 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
518     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
519 nino.borges 863 else:
520     ## This means they are a split role, so additional work will need to be done with the dates.
521 nino.borges 868     ## First, check to see that there is a date value from the log file. Finding issues where this doesn't always exist.
522     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
523     ## Second, determine if this document date is between the dates where this person was an attorney
524     wasAttorneyAtThatTime = False
525     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
526     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
527     #print(f"\ndocumentDateValue is {documentDateValue}")
528     personWasAttorneyDates = personMatch.dates_as_counsel
529     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
530     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
531     if wasAttorneyStartDate.count("/") < 2:
532     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
533     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
534    
535     if wasAttorneyEndDate == "CURRENT":
536     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
537     elif wasAttorneyEndDate == "PRESENT":
538     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
539     if wasAttorneyEndDate.count("/") < 2:
540     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
541     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
542     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
543 nino.borges 863
544 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
545     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
546     wasAttorneyAtThatTime = True
547    
548     ## if wasAttorneyAtThatTime:
549     ## print("Person WAS attorney at this doc date.")
550     ## else:
551     ## print("Person WAS NOT attorney at this doc date.")
552    
553     ## Person's role at the time of the document has been determined, so now do the same checks as above.
554     if wasAttorneyAtThatTime:
555     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
556     ## This variation was found in the list of formatted values, which is fine, so just remove it.
557     if matchFlag:
558     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
559     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
560     matchFlag = True
561     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
562     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
563     if matchFlag:
564     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
565     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
566     matchFlag = True
567 nino.borges 872 ## TODO:DONE change the hard coded "To Field" here once you change to itterate over all field groups.
568     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
569 nino.borges 868
570     else:
571     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
572     if matchFlag:
573     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
574     ## This variation was found in the list of formatted values, which is fine, so just remove it.
575     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
576     matchFlag = True
577     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
578     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
579     if matchFlag:
580     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
581     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
582     matchFlag = True
583 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
584     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
585 nino.borges 863 else:
586 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
587     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
588 nino.borges 866 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
589     if matchFlag:
590     pass
591     else:
592     if personMatch.is_attorney == 'YES':
593 nino.borges 872 AddToIssuesList(docID,f"{origVal} in Metadata {ntvnMatrix[currentNtvn]} did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
594 nino.borges 863 else:
595     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
596     ## TODO: Add support here for more than one first name last name match in MAL.
597 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
598     ## Don't try to compare these against the formatted values because that could lead to eliminating positives or false positives.
599 nino.borges 863
600    
601    
602 nino.borges 855
603     ## Since you iterated over the metadata values but didn't iterate over the formatted values, check for any remaining formatted values that exist in the list
604     if formattedFieldValues:
605     for val in formattedFieldValues:
606     ## TODO: Confirm with Eli but we should only report these remaining values if they have a *
607     ## From Eliu: the Highest risk is the * values because these are the potential overdesignations so yes but in a perfect world we would check both.
608     if "*" in val:
609 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
610     AddToIssuesList(docID,f"{val} in Formatted {ntvnMatrix[currentNtvn]} is an attorney but couldnt be matched to any value in metadata field.")
611 nino.borges 855
612    
613 nino.borges 857
## Now unpack and write the issues to the output file, one line per DocID:
## the DocID, a pipe, then that document's issues joined by semicolons.
## The "with" block guarantees the file handle is closed (and anything buffered is flushed)
## even if an unexpected error interrupts the loop; before, the file was only closed when
## the loop ran to completion.
with open(outputFileName,'w') as outputFile:
    for docID, docIssues in issuesMatrix.items():
        try:
            outputFile.write(f"{docID}|{';'.join(docIssues)}\n")
        except UnicodeEncodeError:
            ## The output file's encoding cannot represent this line, so it is left out
            ## of the file and the raw issues are echoed to the console instead.
            print(str(docIssues))