ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 921
Committed: Thu Aug 7 20:26:06 2025 UTC (7 months, 2 weeks ago) by nino.borges
Content type: text/x-python
File size: 57709 byte(s)
Log Message:
multiple comma support

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC but it will perform a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file but for now i'd
10     like to keep both.
11    
12     """
13    
14 nino.borges 858 import os, re, datetime, calendar
15 nino.borges 855 from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
19 nino.borges 872 version = '0.14.0'
20 nino.borges 855
21     issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Build every (full name, email domain) pair that may exist in the formatted field for a person.

    personMatch is the result of a person match. The attributes read here are
    work_email_address, alt_work_email_address, full_name_overide, full_name_preferred,
    last_name and first_name.

    Returns a deduplicated list of tuple pairs (fullname, parenthetical), in the order the pairs
    were first generated. Every pair needs a domain for the parenthetical, so the list is empty
    when the person has neither a work email nor an alt work email.
    """
    ## Gather the domain of each email address on the record.
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if not emailAddress:
            continue
        allDomainsList.append(emailAddress.split('@')[-1])
        ## After talking to Eli, we decided that all of these amazon.com.uk or amazon.it are related domains,
        ## so we should feel confident that we can add amazon.com to the list of possible domains.
        if "@AMAZON." in emailAddress:
            allDomainsList.append("AMAZON.COM")
    ## dict.fromkeys deduplicates while keeping first-seen order.
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ## Gather the candidate full names: override first, then preferred, then first + last.
    candidateNamesList = []

    if personMatch.full_name_overide:
        candidateNamesList.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')
        fullPreferredName = fullPreferredName.replace(',,', ',')

        if "," in fullPreferredName:
            ## The preferred name is "LAST, FIRST ...". People also add extra commas to separate the
            ## middle name (or a suffix), so split on the FIRST comma only and drop any commas left in
            ## the given-name part. Unpacking a plain split(',') raised ValueError whenever more than
            ## one comma survived.
            preferedLastName, _, preferedFirstName = fullPreferredName.partition(",")
            preferedLastName = preferedLastName.strip()
            preferedFirstName = preferedFirstName.replace(",", "").strip()
            ## Only the first word of the given-name part is used.
            preferedFirstName = preferedFirstName.split(" ")[0]
            candidateNamesList.append(f"{preferedFirstName} {preferedLastName}")
        else:
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            candidateNamesList.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            candidateNamesList.append(f"{personMatch.last_name}")

    ## Pair every candidate name with every domain.
    allPossibleVariationsList = [
        (fullName, domain)
        for fullName in candidateNamesList
        for domain in allDomainsList
    ]

    ## Now return a deduplicated list by using dict to deduplicate.
    return list(dict.fromkeys(allPossibleVariationsList))
81    
82    
def AddToIssuesList(docID,issueMessage):
    """Add a single issue to the module-level issues matrix.

    docID is the key in issuesMatrix and issueMessage is appended to that document's list of
    messages. The list is created the first time an issue is recorded for a document.
    """
    ## setdefault does the lookup-or-create in one step, instead of building a list of every key on each call.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
89 nino.borges 855
90    
91    
92     if __name__ == '__main__':
93 nino.borges 921 cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20250228-FTC-Retail\export_20250228_215605_Converted.txt"
94     #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241215\PrivLogExports\PrivLogExport_20241211_CAAG_Converted.txt"
95 nino.borges 868 #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
96     #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
97 nino.borges 872 #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241216-0954).xlsx"
98     #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.16(20241219-0157).xlsx"
99 nino.borges 921 #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.24(20241230-1109).xlsx"
100     masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2025.07.03(20250703-1027).xlsx"
101 nino.borges 868 fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\CAAG-MasterAttorneyList\FullNameOverides.txt"
102 nino.borges 855 outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"
103    
104    
105     nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
106    
107     qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
108    
109 nino.borges 872 ## Creat a simple named tuple value name to message field name matrix, to control the text used in the issue message
110     ntvnMatrix = {'toValues':'To Field','fromValues':'From Field','ccValues':'CC Field','bccValues':'BCC Field','docAuthor':'Doc Author Field'}
111     ## This next line will change as soon as you start itterating over all of these instead of just doing one at a time. For now I just do each of these one by one.
112     ## TODO: Change this to itterate over a list of all of these instead of just doing them one by one.
113 nino.borges 921 currentNtvn = 'toValues'
114 nino.borges 872 #currentNtvn = 'fromValues'
115     #currentNtvn = 'ccValues'
116     #currentNtvn = 'bccValues'
117 nino.borges 921 #currentNtvn = 'docAuthor'
118 nino.borges 855
119     print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
120     print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")
121    
122     workList = qcP.metadataValuesDict.keys()
123     for docID in workList:
124 nino.borges 872 metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()[currentNtvn]
125     formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()[currentNtvn]
126 nino.borges 855 ## remember to convert all values in formattedFieldValues to uppercase (perhaps eventually do some of the formatted cleaning that eli mentioned.
127     formattedFieldValues = [xVal.upper() for xVal in formattedFieldValues]
128     ## This will change once you start itterating acroll all of the field values names
129     currentMetadataValues = metadataFieldValues
130     for val in currentMetadataValues:
131     ## First try to locate an email address in this val and if found, try to find that in the MAL.
132     results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
133     if results:
134     ## Use some smart deduplication to remove duplicates.
135     results = nv.SmartDedupeSet(results)
136 nino.borges 863 if len(results) > 1:
137     print(f"WARNING: more than one unique email address found in this value: {results}")
138 nino.borges 855 for result in results:
139     ## Try to find a match in the MAL by email. There shouldnt rows with duplicative email addresses.
140     ## TODO:DONE: Update search_by_email to search both workemail and alt email.
141    
142     personMatch = nv.malPeopleList.search_by_email(result.upper())
143     if personMatch:
144     ## Person match found in MAL. Now try to match a value in the formatted field by pulling various values from the MAL.
145     ## For each of these match attempts, try using the correct designation and incorrect designation (* vs no *) and note that.
146     allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
147     matchFlag = False
148 nino.borges 861 if allPossibleVariationsList:
149     for variationPair in allPossibleVariationsList:
150     if personMatch.is_attorney == 'YES':
151 nino.borges 860 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
152     ## This variation was found in the list of formatted values, which is fine, so just remove it.
153     if matchFlag:
154     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
155 nino.borges 861
156 nino.borges 860 formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
157     matchFlag = True
158 nino.borges 861
159    
160 nino.borges 860 elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
161     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
162     if matchFlag:
163     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
164     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
165     matchFlag = True
166 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
167     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential Upgrade")
168 nino.borges 860
169 nino.borges 861
170     elif personMatch.is_attorney == 'NO':
171 nino.borges 860 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
172     if matchFlag:
173     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
174     ## This variation was found in the list of formatted values, which is fine, so just remove it.
175     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
176     matchFlag = True
177     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
178     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
179     if matchFlag:
180     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
181     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
182     matchFlag = True
183 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
184     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential downgrade")
185 nino.borges 861 else:
186     ## This means they are a split role, so additional work will need to be done with the dates.
187 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding issues where this doenst always exist.
188     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
189     ## Second, determin if this document date is between the dates where this person was an attorney
190     wasAttorneyAtThatTime = False
191     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
192     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
193     #print(f"\ndocumentDateValue is {documentDateValue}")
194     personWasAttorneyDates = personMatch.dates_as_counsel
195 nino.borges 921 #print(personMatch.work_email_address)
196 nino.borges 868 for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
197     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
198     if wasAttorneyStartDate.count("/") < 2:
199     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
200     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
201    
202     if wasAttorneyEndDate == "CURRENT":
203     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
204     elif wasAttorneyEndDate == "PRESENT":
205     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
206     if wasAttorneyEndDate.count("/") < 2:
207     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
208     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
209     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
210 nino.borges 861
211 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
212     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
213     wasAttorneyAtThatTime = True
214    
215     ## if wasAttorneyAtThatTime:
216     ## print("Person WAS attorney at this doc date.")
217     ## else:
218     ## print("Person WAS NOT attorney at this doc date.")
219    
220     ## Person's role at the time of the document has been determined, so now do the same checks as above.
221     if wasAttorneyAtThatTime:
222     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
223     ## This variation was found in the list of formatted values, which is fine, so just remove it.
224     if matchFlag:
225     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
226     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
227     matchFlag = True
228     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
229     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
230     if matchFlag:
231     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
232     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
233     matchFlag = True
234 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
235     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential Upgrade")
236 nino.borges 868
237     else:
238     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
239     if matchFlag:
240     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
241     ## This variation was found in the list of formatted values, which is fine, so just remove it.
242     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
243     matchFlag = True
244     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
245     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
246     if matchFlag:
247     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
248     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
249     matchFlag = True
250 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
251     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is High Confidence Potential downgrade")
252 nino.borges 861 else:
253 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
254     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
255 nino.borges 861 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY found in meta but MISSING FROM FORMATTED
256     if matchFlag:
257     pass
258     else:
259 nino.borges 860 if personMatch.is_attorney == 'YES':
260 nino.borges 872 AddToIssuesList(docID,f"{val} in Metadata {ntvnMatrix[currentNtvn]} did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")
261 nino.borges 855
262     else:
263 nino.borges 863 ## Person match, using email, not found in MAL.
264     ## Try extracting a name from this metadata value and try matching the MAL using that.
265 nino.borges 855 val = val.upper()
266 nino.borges 863 origVal = val
267 nino.borges 867
268 nino.borges 855 ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
269     if "(LEGAL)" in val:
270     ## Attempt to only remove the email parenthetical, including the now empty paren.
271     val = val.replace(result.upper(),"")
272     val = val.replace("()",'')
273     #val = val.replace(")","")
274     else:
275     ## Remove all parenthicals, including any character in that paren, from value.
276     val = re.sub(r"\([^)]*\)","",val)
277 nino.borges 867 val = val.replace(result.upper(),"")
278 nino.borges 855 val = val.strip()
279     ## with the email address and the paren stripped out of the val, only move forward if anything still exists.
280     if val:
281     ## if there is a comma, parse to last name, first name
282     if "," in val:
283 nino.borges 868 if val.count(",") < 2:
284     lastName, firstName = val.split(",")
285     else:
286     ## This is here to catch some malformatted values in the metadata. this will never match.
287     lastName, firstName = val.split(",", maxsplit = 1)
288     print(f"WARNING: Malformed metadata value found: {val}")
289 nino.borges 855 lastName = lastName.strip()
290     firstName = firstName.strip()
291     elif " " in val:
292     ## For now, try just splitting by the first space and take everything after as the first name.
293     firstName, lastName = val.split(" ",1)
294     ## With the name now parse, try searching for all values that match on the last name.
295 nino.borges 864
296 nino.borges 855 personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
297     if personMatchList:
298     possiblePeopleMatchesMatrix = {}
299     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
300     for personMatch in personMatchList:
301     if personMatch.first_name == firstName:
302     ## This is a personMatch that matches the first and last name
303     possiblePeopleMatchesMatrix[personMatch._id] = 1
304     if possiblePeopleMatchesMatrix.keys():
305     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
306     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
307 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
308     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
309    
310 nino.borges 855 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
311 nino.borges 865
312 nino.borges 855 matchFlag = False
313 nino.borges 861 if allPossibleVariationsList:
314     for variationPair in allPossibleVariationsList:
315     if personMatch.is_attorney == 'YES':
316 nino.borges 868 #if personMatch.last_name == "MANEK":
317     #print(variationPair)
318 nino.borges 861 if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
319     ## This variation was found in the list of formatted values, which is fine, so just remove it.
320     if matchFlag:
321     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
322     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
323     matchFlag = True
324     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
325     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
326     if matchFlag:
327     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
328     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
329     matchFlag = True
330 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
331     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
332 nino.borges 866 ## else:
333     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
334     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
335     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
336 nino.borges 861
337 nino.borges 862 elif personMatch.is_attorney == 'NO':
338 nino.borges 861 if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
339     if matchFlag:
340     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
341     ## This variation was found in the list of formatted values, which is fine, so just remove it.
342     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
343     matchFlag = True
344     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
345     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
346     if matchFlag:
347     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
348     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
349     matchFlag = True
350 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
351     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
352 nino.borges 862 else:
353     ## This means they are a split role, so additional work will need to be done with the dates.
354 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding issues where this doenst always exist.
355     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
356     ## Second, determin if this document date is between the dates where this person was an attorney
357     wasAttorneyAtThatTime = False
358     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
359     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
360     #print(f"\ndocumentDateValue is {documentDateValue}")
361     personWasAttorneyDates = personMatch.dates_as_counsel
362     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
363     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
364     if wasAttorneyStartDate.count("/") < 2:
365     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
366     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
367    
368     if wasAttorneyEndDate == "CURRENT":
369     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
370     elif wasAttorneyEndDate == "PRESENT":
371     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
372     if wasAttorneyEndDate.count("/") < 2:
373     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
374     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
375     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
376 nino.borges 862
377 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
378     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
379     wasAttorneyAtThatTime = True
380    
381     ## if wasAttorneyAtThatTime:
382     ## print("Person WAS attorney at this doc date.")
383     ## else:
384     ## print("Person WAS NOT attorney at this doc date.")
385    
386     ## Person's role at the time of the document has been determined, so now do the same checks as above.
387     if wasAttorneyAtThatTime:
388     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
389     ## This variation was found in the list of formatted values, which is fine, so just remove it.
390     if matchFlag:
391     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
392     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
393     matchFlag = True
394     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
395     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
396     if matchFlag:
397     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
398     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
399     matchFlag = True
400 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
401     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
402 nino.borges 868
403     else:
404     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
405     if matchFlag:
406     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
407     ## This variation was found in the list of formatted values, which is fine, so just remove it.
408     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
409     matchFlag = True
410     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
411     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
412     if matchFlag:
413     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
414     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
415     matchFlag = True
416 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
417     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
418 nino.borges 862 else:
419 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
420     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
421 nino.borges 866 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
422     if matchFlag:
423     pass
424     else:
425     if personMatch.is_attorney == 'YES':
426 nino.borges 872 AddToIssuesList(docID,f"{origVal} in Metadata {ntvnMatrix[currentNtvn]} and did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
427 nino.borges 863 else:
428     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
429     ## TODO: Add support here for more than one first name last name match in MAL.
430 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
431     ## Dont try to compare these against the formatted because that could lead to eliminating possitive or false positives.
432 nino.borges 855
433     else:
434     ## TODO: Need to ask Eli if I dont a match by checking first and last name for a match if it's needed to flag these.
435 nino.borges 857 #AddToIssuesList(docID,f"first name: {firstName} - last name: {lastName} is an email in metadata that I couldnt match in MAL")
436 nino.borges 855 pass
437 nino.borges 857
438 nino.borges 855 else:
439     ## No email address could be extracted from this val. Try extracting a name from this metadata value and try matching the MAL using that.
440 nino.borges 863 #AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
441     val = val.upper()
442     origVal = val
443     ## First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
444     if "(LEGAL)" in val:
445     pass
446     else:
447     ## Remove all parenthicals, including any character in that paren, from value.
448     val = re.sub(r"\([^)]*\)","",val)
449    
450     val = val.strip()
451     ## with the paren information stripped out of the val, only move forward if anything still exists.
452     if val:
453     ## if there is a comma, parse to last name, first name
454     if "," in val:
455 nino.borges 868 if val.count(",") < 2:
456     lastName, firstName = val.split(",")
457     else:
458     ## This is here to catch some malformatted values in the metadata. this will never match.
459     lastName, firstName = val.split(",", maxsplit = 1)
460     print(f"WARNING: Malformed metadata value found: {val}")
461 nino.borges 863 lastName = lastName.strip()
462     firstName = firstName.strip()
463     elif " " in val:
464     ## For now, try just splitting by the first space and take everything after as the first name.
465     firstName, lastName = val.split(" ",1)
466     ## With the name now parse, try searching for all values that match on the last name.
467    
468     personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
469     if personMatchList:
470     possiblePeopleMatchesMatrix = {}
471     ## For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
472     for personMatch in personMatchList:
473     if personMatch.first_name == firstName:
474     ## This is a personMatch that matches the first and last name
475     possiblePeopleMatchesMatrix[personMatch._id] = 1
476     if possiblePeopleMatchesMatrix.keys():
477     ## If the list of possible matches is just 1, we are okay doing a simple match attempt. if more than 1, we need to test for conflicting designations in the list of possible matches.
478     if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
479 nino.borges 864 ## I can grab the single matching value here because I've confirmed there is just 1. if you do something similar for where there are more, change this next line.
480     personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
481    
482 nino.borges 863 allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
483     matchFlag = False
484     if allPossibleVariationsList:
485     for variationPair in allPossibleVariationsList:
486     if personMatch.is_attorney == 'YES':
487     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
488     ## This variation was found in the list of formatted values, which is fine, so just remove it.
489     if matchFlag:
490     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
491     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
492     matchFlag = True
493     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
494     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
495     if matchFlag:
496     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
497     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
498     matchFlag = True
499 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
500     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
501 nino.borges 866 ## else:
502     ## ## This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
503     ## #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
504     ## AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
505 nino.borges 857
506 nino.borges 863 elif personMatch.is_attorney == 'NO':
507     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
508     if matchFlag:
509     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
510     ## This variation was found in the list of formatted values, which is fine, so just remove it.
511     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
512     matchFlag = True
513     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
514     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
515     if matchFlag:
516     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
517     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
518     matchFlag = True
519 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
520     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
521 nino.borges 863 else:
522     ## This means they are a split role, so additional work will need to be done with the dates.
523 nino.borges 868 ## First, check to see that there is a date value from the log file. Finding cases where this doesn't always exist.
524     if qcP.additionalValuesDict[docID]._asdict()['dateValue']:
525     ## Second, determine if this document date is between the dates where this person was an attorney.
526     wasAttorneyAtThatTime = False
527     documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
528     documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
529     #print(f"\ndocumentDateValue is {documentDateValue}")
530     personWasAttorneyDates = personMatch.dates_as_counsel
531     for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
532     #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
533     if wasAttorneyStartDate.count("/") < 2:
534     wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
535     wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
536    
537     if wasAttorneyEndDate == "CURRENT":
538     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
539     elif wasAttorneyEndDate == "PRESENT":
540     wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
541     if wasAttorneyEndDate.count("/") < 2:
542     missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
543     wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
544     wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
545 nino.borges 863
546 nino.borges 868 #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
547     if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
548     wasAttorneyAtThatTime = True
549    
550     ## if wasAttorneyAtThatTime:
551     ## print("Person WAS attorney at this doc date.")
552     ## else:
553     ## print("Person WAS NOT attorney at this doc date.")
554    
555     ## Person's role at the time of the document has been determined, so now do the same checks as above.
556     if wasAttorneyAtThatTime:
557     if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
558     ## This variation was found in the list of formatted values, which is fine, so just remove it.
559     if matchFlag:
560     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
561     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
562     matchFlag = True
563     elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
564     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
565     if matchFlag:
566     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
567     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
568     matchFlag = True
569 nino.borges 872 ## TODO:DONE change the hard coded "To Field" here once you change to itterate over all field groups.
570     AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential Upgrade")
571 nino.borges 868
572     else:
573     if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
574     if matchFlag:
575     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
576     ## This variation was found in the list of formatted values, which is fine, so just remove it.
577     formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
578     matchFlag = True
579     elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
580     ## This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
581     if matchFlag:
582     print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
583     formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
584     matchFlag = True
585 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
586     AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in {ntvnMatrix[currentNtvn]} is Low Confidence Potential downgrade")
587 nino.borges 863 else:
588 nino.borges 868 ## This means we have a split role person but NO value in the privlog date field.
589     print(f"WARNING: {personMatch.first_name} {personMatch.last_name} is in the MAL as a split role however no date in priv log for {docID}!")
590 nino.borges 866 ## Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY Name found in meta but MISSING FROM FORMATTED
591     if matchFlag:
592     pass
593     else:
594     if personMatch.is_attorney == 'YES':
595 nino.borges 872 AddToIssuesList(docID,f"{origVal} in Metadata {ntvnMatrix[currentNtvn]} did not directly match value in formatted however this is a LOW Confidence Potential Attorney")
596 nino.borges 863 else:
597     print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
598     ## TODO: Add support here for more than one first name last name match in MAL.
599 nino.borges 864 ## ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
600     ## Don't try to compare these against the formatted values because that could lead to eliminating positives or false positives.
601 nino.borges 863
602    
603    
604 nino.borges 855
605     ## Since you iterated over the metadata values but didn't iterate over the formatted values, check for any remaining formatted values that still exist in the list.
606     if formattedFieldValues:
607     for val in formattedFieldValues:
608     ## TODO: Confirm with Eli but we should only report these remaining values if they have a *
609     ## From Eliu: the Highest risk is the * values because these are the potential overdesignations so yes but in a perfect world we would check both.
610     if "*" in val:
611 nino.borges 872 ## TODO:DONE: change the hard coded "To Field" here once you change to itterate over all field groups.
612     AddToIssuesList(docID,f"{val} in Formatted {ntvnMatrix[currentNtvn]} is an attorney but couldnt be matched to any value in metadata field.")
613 nino.borges 855
614    
615 nino.borges 857
616 nino.borges 855 ## Now just unpack and write the issues, per DocID, to the output file separated by semicolon.
617     outputFile = open(outputFileName,'w')
618     for docID in list(issuesMatrix.keys()):
619 nino.borges 921 try:
620     outputFile.write(f"{docID}|{';'.join(issuesMatrix[docID])}\n")
621     except UnicodeEncodeError:
622     print(str(issuesMatrix[docID]))
623 nino.borges 855 outputFile.close()