ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py
Revision: 856
Committed: Fri Dec 13 16:28:48 2024 UTC (15 months, 1 week ago) by nino.borges
Content type: text/x-python
File size: 22451 byte(s)
Log Message:
Consolidated all of the additions to the issuesMatrix into a function. Commented out all the individual adds and tested using the function. Results were identical except for a period in the text that was a typo anyway.

File Contents

# User Rev Content
1 nino.borges 855 """
2    
3     Amazon_PerformDeepNamesNormQC
4    
5     Created by:
6     Emanuel Borges
7     12.11.2024
8    
9     This program is similar to Amazon_PerformNamesNormQC, but it performs a deeper level of names norm QC. I may eventually replace Amazon_PerformNamesNormQC with this file, but for now I'd
10     like to keep both.
11    
12     """
13    
14     import os, re
15     from uuid import UUID
16     import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
17     import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC
18    
## Version string of this script.
version = "0.3.0"

## Module-level accumulator of QC findings: maps a docID to the list of issue
## messages recorded for that document (filled in through AddToIssuesList and
## written to the output file at the end of the run).
issuesMatrix = {}
22    
def GatherAllPossibleVariations(personMatch):
    """Build every (fullname, parenthetical) pair that may appear in the formatted field for a person.

    personMatch is the result of a person match. The attributes read here are
    work_email_address, alt_work_email_address, full_name_overide,
    full_name_preferred, first_name and last_name; any of them may be empty.

    The parenthetical is the domain part (text after the last '@') of each
    email address. Candidate names are produced, in this order, from:
      1. full_name_overide, used verbatim
      2. full_name_preferred ("LAST, FIRST ..."), rewritten as "FIRST LAST"
      3. first_name + last_name (or last_name alone when there is no first name)

    Returns a deduplicated list of tuple pairs (fullname, parenthetical), in
    first-seen order. The list is empty when the person has no email address.
    """
    ## Unique email domains, keeping first-seen order (dict keys deduplicate).
    emailAddresses = (personMatch.work_email_address, personMatch.alt_work_email_address)
    allDomainsList = list(dict.fromkeys(
        emailAddress.split('@')[-1] for emailAddress in emailAddresses if emailAddress))

    candidateNames = []

    if personMatch.full_name_overide:
        candidateNames.append(personMatch.full_name_overide)

    if personMatch.full_name_preferred:
        ## Remove some information that is just never in the formatted field.
        fullPreferredName = personMatch.full_name_preferred
        for neverFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverFormatted, '')
        if "," in fullPreferredName:
            ## Split on the FIRST comma only. A plain split(',') unpacked into two
            ## names raises ValueError as soon as the value holds a second comma
            ## (for example a parenthetical that is not in the removal list above).
            preferedLastName, _, preferedFirstName = fullPreferredName.partition(',')
            ## Only the first word after the comma is used as the first name.
            preferedFirstName = preferedFirstName.strip().split(" ")[0]
            candidateNames.append(f"{preferedFirstName} {preferedLastName.strip()}")
        else:
            print(f"ERROR in this name {fullPreferredName}")

    if personMatch.last_name:
        if personMatch.first_name:
            candidateNames.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            candidateNames.append(f"{personMatch.last_name}")

    ## Pair every candidate name with every domain, then deduplicate via dict keys.
    allPossibleVariationsList = [
        (fullName, domain) for fullName in candidateNames for domain in allDomainsList]
    return list(dict.fromkeys(allPossibleVariationsList))
69    
70    
def AddToIssuesList(docID, issueMessage, targetMatrix=None):
    """Add a single issue to the issues matrix.

    docID is the key the issue is filed under and issueMessage is the text to
    record. Messages for the same docID are kept in the order they were added.

    targetMatrix is optional: when omitted (the normal case) the module-level
    issuesMatrix is updated; pass a dict to collect issues somewhere else.
    Returns nothing; the matrix is updated in place.
    """
    if targetMatrix is None:
        targetMatrix = issuesMatrix
    ## setdefault creates the per-document list on first use, without building a
    ## list of every key just to test membership.
    targetMatrix.setdefault(docID, []).append(issueMessage)
77 nino.borges 855
78    
79    
def FindDesignationIssues(variationPairs, isAttorney, formattedFieldValues,
                          confidenceLabel, sourceLabel, fieldLabel="To Field"):
    """Match one person's name variations against the formatted values and report wrong designations.

    variationPairs        list of (fullname, parenthetical) tuples for the person.
    isAttorney            True when the person should carry the attorney marker (*).
    formattedFieldValues  list of formatted values; every matched value is REMOVED
                          from this list in place (first occurrence only).
    confidenceLabel       text placed in the issue message, e.g. "High" or "Low".
    sourceLabel           text placed in the duplicate-match warning, e.g. "EMAIL" or "name".
    fieldLabel            name of the field reported in the issue message.

    For each variation the correctly designated form ("NAME* (DOMAIN)" for an
    attorney, "NAME (DOMAIN)" otherwise) is tried first; if only the opposite
    form is present it is still removed, but an issue message is produced.
    Returns the list of issue messages, in the order found.
    """
    foundIssues = []
    matchFlag = False
    for fullName, parenthetical in variationPairs:
        starredForm = f"{fullName}* ({parenthetical})"
        plainForm = f"{fullName} ({parenthetical})"
        if isAttorney:
            expectedForm, wrongForm, direction = starredForm, plainForm, "Upgrade"
        else:
            ## TODO: will need to split this out to include split role soon.
            expectedForm, wrongForm, direction = plainForm, starredForm, "downgrade"

        if expectedForm in formattedFieldValues:
            ## This variation was found in the list of formatted values, which is fine, so just remove it.
            foundForm, issueMessage = expectedForm, None
        elif wrongForm in formattedFieldValues:
            ## Found, however it's a bad match, so remove it but also report it.
            foundForm = wrongForm
            issueMessage = f"{wrongForm} in {fieldLabel} is {confidenceLabel} Confidence Potential {direction}"
        else:
            continue

        if matchFlag:
            print(f"WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE {sourceLabel} VALUE??")
        formattedFieldValues.remove(foundForm)
        matchFlag = True
        if issueMessage is not None:
            foundIssues.append(issueMessage)
    return foundIssues


def StripEmailParentheticals(val, emailAddress):
    """Return val uppercased with the email parenthetical(s) removed and the ends stripped.

    When the value contains "(LEGAL)" only the given email address and the then
    empty "()" are removed, so the "(LEGAL)" marker survives. Otherwise every
    parenthetical, including its contents, is removed.
    """
    cleanedVal = val.upper()
    if "(LEGAL)" in cleanedVal:
        ## Attempt to only remove the email parenthetical, including the now empty paren.
        cleanedVal = cleanedVal.replace(emailAddress.upper(), "")
        cleanedVal = cleanedVal.replace("()", '')
    else:
        ## Remove all parentheticals, including any character in that paren.
        cleanedVal = re.sub(r"\([^)]*\)", "", cleanedVal)
    return cleanedVal.strip()


def SplitPersonName(cleanedVal):
    """Split a cleaned metadata value into (firstName, lastName); return None when it cannot be split.

    "LAST, FIRST" is split on the first comma, so extra commas can no longer
    raise ValueError. Without a comma the value is split on the first space and
    everything after it is taken as the last name. A value with neither a comma
    nor a space (including the empty string) returns None.
    """
    if "," in cleanedVal:
        lastName, _, firstName = cleanedVal.partition(",")
        return firstName.strip(), lastName.strip()
    if " " in cleanedVal:
        ## For now, just split by the first space.
        firstName, lastName = cleanedVal.split(" ", 1)
        return firstName.strip(), lastName.strip()
    return None


if __name__ == '__main__':
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
    masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.12(20241212-1151).xlsx"
    fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
    outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"

    nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
    qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)

    print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
    print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")

    workList = qcP.metadataValuesDict.keys()
    for docID in workList:
        ## NOTE(review): this assumes every docID in the metadata dictionary also exists in the
        ## formatted dictionary; a missing docID raises KeyError here - confirm that is intended.
        ## This will change once you start iterating across all of the field values names.
        metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
        ## Formatted values are compared in uppercase (perhaps eventually do some of the formatted cleaning that Eli mentioned).
        formattedFieldValues = [xVal.upper() for xVal in qcP.formattedValuesDict[docID]._asdict()['toValues']]

        for val in metadataFieldValues:
            ## First try to locate an email address in this val and if found, try to find that in the MAL.
            results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
            if not results:
                ## No email address could be extracted from this val.
                AddToIssuesList(docID, f"{val} is a value in metadata that I couldnt extract an email address from")
                continue

            ## Use some smart deduplication to remove duplicates.
            results = nv.SmartDedupeSet(results)
            ## Working copy of the value for the name fallback. It is cleaned progressively, one
            ## unmatched email at a time, without touching the loop variable itself.
            nameVal = val
            for result in results:
                ## Try to find a match in the MAL by email. There shouldnt be rows with duplicative email addresses.
                personMatch = nv.malPeopleList.search_by_email(result.upper())
                if personMatch:
                    ## Person match found in MAL. Try the correct and the incorrect designation (* vs no *)
                    ## of every name variation against the formatted field.
                    ## TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                    for issueMessage in FindDesignationIssues(GatherAllPossibleVariations(personMatch),
                                                              personMatch.is_attorney == 'YES',
                                                              formattedFieldValues, "High", "EMAIL"):
                        AddToIssuesList(docID, issueMessage)
                    continue

                ## Person match, using email, not found in MAL. Try extracting a name from this
                ## metadata value and try matching the MAL using that.
                nameVal = StripEmailParentheticals(nameVal, result)
                parsedName = SplitPersonName(nameVal)
                if parsedName is None:
                    ## Nothing left, or a single token that cannot be split into first/last name.
                    ## Skip it rather than reuse a name parsed from an earlier value.
                    continue
                firstName, lastName = parsedName

                ## With the name now parsed, search for all people that match on the last name.
                personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name', lastName)
                if not personMatchList:
                    continue

                ## Keep the person OBJECTS whose first name also matches, keyed by _id, so the
                ## variations below are built from the person that actually matched and not
                ## from whichever person happened to be last in personMatchList.
                possiblePeopleMatchesMatrix = {}
                for candidate in personMatchList:
                    if candidate.first_name == firstName:
                        possiblePeopleMatchesMatrix[candidate._id] = candidate

                if len(possiblePeopleMatchesMatrix) != 1:
                    ## Zero matches: TODO: Need to ask Eli whether a name that cannot be matched on
                    ## first and last name needs to be flagged.
                    ## More than one match: we would need to test for conflicting designations in the
                    ## list of possible matches; that is not implemented yet, so nothing is reported.
                    continue

                (matchedPerson,) = possiblePeopleMatchesMatrix.values()
                ## TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                for issueMessage in FindDesignationIssues(GatherAllPossibleVariations(matchedPerson),
                                                          matchedPerson.is_attorney == 'YES',
                                                          formattedFieldValues, "Low", "name"):
                    AddToIssuesList(docID, issueMessage)

        ## Since you iterated over the metadata values but didnt iterate over the formatted values,
        ## check for any remaining formatted values that exist in the list.
        for val in formattedFieldValues:
            ## From Eli: the highest risk is the * values because these are the potential
            ## overdesignations, so report those; in a perfect world we would check both.
            if "*" in val:
                ## TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                AddToIssuesList(docID, f"{val} in To Field is an attorney but couldnt be matched to any metadata value.")

    ## Now just unpack and write the issues, per DocID, to the output file separated by semicolon.
    ## The with-block closes the file even if a write fails part way through.
    with open(outputFileName, 'w') as outputFile:
        for docID, docIssues in issuesMatrix.items():
            outputFile.write(f"{docID}|{';'.join(docIssues)}\n")