# Active_prgs/Redgrave/Amazon_NamesNormQC.py

"""

Amazon-NamesNormQC

Created by:
Emanuel Borges
11.21.2024

This program will assist with the process of performing Names Normalization QC on the Amazon privilege logs.

"""

import os, uuid, pickle, re
import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
from dataclasses import dataclass, field
from typing import List, Optional
from collections import namedtuple
from win32com.client import Dispatch


@dataclass
class Person:
    """A single person record, as ingested from the Master Attorney List (MAL).

    Every field is optional.  Any field holding a string is converted to upper
    case when the instance is created, so callers comparing against these
    values must upper-case their own search keys first.
    """
    first_name: Optional[str] = None
    last_name: Optional[str] = None
    work_email_address: Optional[str] = None
    alt_work_email_address: Optional[str] = None
    # Generated per instance; never supplied by the spreadsheet ingest.
    _id: uuid.UUID = field(default_factory=uuid.uuid4)
    is_attorney: Optional[str] = None
    split_role_date_range: Optional[str] = None
    sidley_validated: Optional[str] = None
    category: Optional[str] = None
    organization: Optional[str] = None
    job_title: Optional[str] = None
    business_title: Optional[str] = None
    full_name_preferred: Optional[str] = None
    login: Optional[str] = None
    department_fine: Optional[str] = None
    addressed_during_caag: Optional[str] = None

    def __post_init__(self):
        """Convert all string fields to uppercase.

        Only ``str`` values are touched.  ``None``, the ``_id`` UUID and any
        other non-string value (for example a number or a date handed back by
        a spreadsheet cell) are left exactly as given instead of raising
        ``AttributeError`` on a missing ``.upper()``.
        """
        # Snapshot the items so attributes can be reassigned while looping.
        for attributeName, value in list(vars(self).items()):
            if isinstance(value, str):
                setattr(self, attributeName, value.upper())

@dataclass
class PeopleList:
    """An ordered collection of Person records with a lookup by work email."""
    people: List[Person] = field(default_factory=list)

    def add_person(self, person: Person):
        """Append ``person`` to the end of the collection."""
        self.people.append(person)

    def search_by_email(self, emailAddress:str) -> Optional[Person]:
        """Return the first person whose work email equals ``emailAddress``.

        The comparison is an exact string match against ``work_email_address``
        only; the alternate work email is not consulted.  Returns ``None``
        when nobody matches.
        """
        matches = (entry for entry in self.people
                   if entry.work_email_address == emailAddress)
        return next(matches, None)

    def list_people(self):
        """Print every stored person, one per line, in insertion order."""
        for entry in self.people:
            print(entry)


class NamesVerification(object):
    """A class for automating the process of performing QC on the names within the Amazon privilege logs.

    Constructing an instance populates ``self.malPeopleList`` with the contents
    of the Master Attorney List (MAL), either from a pickle cache stored next
    to the spreadsheet or by reading the spreadsheet through Excel.
    """
    version = '0.2.0'


    def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName, forceNewPklFile = False, Encoding = 'UTF8'):
        """Initializes the data structures.

        cleanedDatExportFileName -- full path to the cleaned DAT export.  It is
            accepted for interface compatibility but not read by this class.
        masterAttorneyListFileName -- full path to the MAL spreadsheet.  The
            pickle cache uses the same path with the extension replaced by
            ``.pkl``.
        forceNewPklFile -- when true, always re-read the spreadsheet and
            overwrite the cache; otherwise the cache is used if it exists.
        Encoding -- accepted for interface compatibility; not used here.

        The MAL gets saved to a pkl file for performance reasons.
        """
        pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"

        print("Initializing data structures...")
        if not forceNewPklFile and os.path.exists(pklFileName):
            print("Loading MAL structure from pickle file...")
            self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
            print("MAL structure loaded.")
            return

        if not forceNewPklFile:
            print("Pickle file doesnt exist.")
        self.__BuildMalAndBackup(masterAttorneyListFileName, pklFileName)


    def __BuildMalAndBackup(self, masterAttorneyListFileName, pklFileName):
        """Pseudo-private method which will read the MAL spreadsheet into a fresh people list and then write the pickle cache."""
        print("Creating MAL structure...")
        self.malPeopleList = PeopleList()
        self.__IngestMALSpreadsheet(masterAttorneyListFileName)
        print("MAL structure created.")
        print("Creating pickle backup...")
        self.__SaveMalToPkl(pklFileName)
        print("Pickle backup created.")


    def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
        """Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass.

        Row 1 of each tab is read as the header row and used to locate columns
        by name.  Only the "Attorneys" tab currently produces Person records;
        the "Downgrades" tab is opened and its header read, but no rows are
        ingested.  A header missing from the "Attorneys" tab raises KeyError.
        The workbook is closed even if ingestion fails part-way through.
        """
        ## There doesn't seem to be a consistent value in the "row" column in the MAL, so setting these parameters here to avoid gap issues.
        ## NOTE: the row/column bounds are hard-coded and must be revised whenever the spreadsheet changes size.
        excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":10909, "beginColNumber":2, "endColNumber":16},
                                  {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":565, "beginColNumber":2, "endColNumber":15}]

        ## Spreadsheet header text -> Person field name.
        spreadsheetFileMappingMatrix = {"First Name":"first_name", "Last Name":"last_name", "Work Email":"work_email_address", "Alt Work Email":"alt_work_email_address", "Is Attorney": "is_attorney",
                                  "Split Role -  Attorney Capacity Date Range":"split_role_date_range", "Sidley Validated?":"sidley_validated", "Category": "category", "Organization":"organization", "Job Title":"job_title",
                                  "Business Title":"business_title", "Full Name (Preferred)":"full_name_preferred", "Login":"login", "Department (Fine)":"department_fine", "Addressed during CAAG":"addressed_during_caag"}

        xlApp = Dispatch('Excel.Application')
        xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
        try:
            for excelTab in excelTabParametersList:
                sht = xlBook.Worksheets(excelTab['tabName'])
                print(f"Ingesting sheet {excelTab['tabName']}.")
                excelFieldPositionMatrix = {}
                for col in range(excelTab['beginColNumber'], excelTab['endColNumber'] + 1):
                    excelFieldPositionMatrix[sht.Cells(1, col).Value] = col

                ##  TODO: Refactor the excelTabParametersList later. Didn't realize columns were not consistent.
                if excelTab['tabName'] != 'Attorneys':
                    continue

                ## Resolve each Person field to its column number once, rather than once per row.
                columnByFieldName = {fieldName: excelFieldPositionMatrix[headerText]
                                     for headerText, fieldName in spreadsheetFileMappingMatrix.items()}
                for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] + 1):
                    cellValues = {fieldName: sht.Cells(row, col).Value
                                  for fieldName, col in columnByFieldName.items()}
                    self.malPeopleList.add_person(Person(**cellValues))
        finally:
            ## Always release the workbook so a failed run does not leave it open.
            xlBook.Close()

    def __SaveMalToPkl(self, pklFileName):
        """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
        with open(pklFileName, 'wb') as outputFile:
            pickle.dump(self.malPeopleList, outputFile)

    def __LoadMalFromPkl(self, pklFileName):
        """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons.

        Returns whatever object the file contains.
        """
        ## NOTE(security): unpickling can execute arbitrary code; only load cache files written by this program.
        with open(pklFileName, 'rb') as contents:
            return pickle.load(contents)
    

if __name__ == '__main__':
    ## Script entry point: compare the attorney names implied by each document's
    ## metadata "to" values (resolved through the MAL by email address) against
    ## the attorney names present in the formatted "to" values, and log every
    ## difference to a text file.

    #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
    masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List  2024.11.06(7045550.3).xlsx"
    #masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"

    nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName)
    qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
    workList = qcP.metadataValuesDict.keys()

    ## The context manager guarantees the log is flushed and closed even when a
    ## document raises part-way through the run.
    with open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w') as outputFile:
        for docID in workList:
            metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
            formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']

            ## Only formatted values carrying the "*" marker are kept for the comparison.
            formattedAttorneyValues = {formattedValue.upper()
                                       for formattedValue in formattedFieldValues
                                       if "*" in formattedValue}

            if not metadataFieldValues:
                continue

            matchedMetadataValues = set()
            for nameItem in metadataFieldValues:
                ## First test to see if there is a valid email address.
                resultSet = set(re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem))
                if not resultSet:
                    outputFile.write(f"{docID} contains a non-email item {nameItem}\n")
                    continue
                if len(resultSet) > 1:
                    print("ERROR multiple email unique email addresses in one item.")
                    continue

                ## MAL values are stored upper-cased, so upper-case the search key too.
                personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
                if not personMatch:
                    continue

                if personMatch.full_name_preferred:
                    ##  Going to need to do a bit of replacing to remove some information that is just never in the formatted.
                    fullPreferredName = personMatch.full_name_preferred
                    for neverFormattedText in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
                        fullPreferredName = fullPreferredName.replace(neverFormattedText, '')
                    ## NOTE(review): this unpacking raises ValueError unless the name contains exactly one comma -- confirm the MAL guarantees "LAST, FIRST".
                    preferedLastName, preferedFirstName = fullPreferredName.split(',')
                    preferedLastName = preferedLastName.strip()
                    ## Keep only the first word of the given name (drops middle names and initials).
                    preferedFirstName = preferedFirstName.strip().split(" ")[0]
                    fullName = f"{preferedFirstName} {preferedLastName}"
                else:
                    fullName = f"{personMatch.first_name} {personMatch.last_name}"

                ## Attorneys are marked with a trailing "*" after the name.
                attorneyMarker = "*" if personMatch.is_attorney == 'YES' else ""
                emailDomain = personMatch.work_email_address.split('@')[-1]
                matchedMetadataValues.add(f"{fullName}{attorneyMarker} ({emailDomain})")

            ## NOTE(review): matchedMetadataValues also holds non-attorney entries (no "*"), while
            ## formattedAttorneyValues holds only "*" entries, so every matched non-attorney is
            ## reported as missing from the formatted field -- confirm that this is intended.
            missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
            missingFromMeta = formattedAttorneyValues - matchedMetadataValues

            ## Sorted so the log content is identical from run to run.
            for missingItem in sorted(missingFromFormatted):
                outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
            for missingItem in sorted(missingFromMeta):
                outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
            ## Blank line between documents that reported at least one difference.
            if missingFromFormatted or missingFromMeta:
                outputFile.write("\n")


##    people_list = PeopleList()
##    people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "fooBar@gmail.com", altWorkEmailAddress = ""))
##    people_list.add_person(Person(firstName = "Gary", lastName = "Cooper", workEmailAddress = "", altWorkEmailAddress = "spam.eggs@hotmail.com"))
##    people_list.add_person(Person(firstName = "", lastName = "", workEmailAddress = "noname@gmail.com", altWorkEmailAddress = ""))
##    people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "eggs@outlook.com", altWorkEmailAddress = ""))
##    print("\nAll People:")
##    people_list.list_people()
##    print("\nSearching...")
##    result = people_list.search_by_email('fooBar@gmail.com')
##    print(result if result else "email not found.")
Revision:	835
Committed:	Wed Nov 27 16:08:34 2024 UTC (15 months, 4 weeks ago) by nino.borges
Content type:	text/x-python
File size:	16353 byte(s)
Log Message:	Added support to do the actual compares and output the issue values to a log file, further refining the logic. Also added support for manually adjusting some of the values where the MAL is inconsistent.
#	Content
1	"""
2
3	Amazon-NamesNormQC
4
5	Created by:
6	Emanuel Borges
7	11.21.2024
8
9	This program will assist with the process of performing Names Normalization QC on the Amazon privilege logs.
10
11	"""
12
13	import os, uuid, pickle, re
14	import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
15	from dataclasses import dataclass, field
16	from typing import List, Optional
17	from collections import namedtuple
18	from win32com.client import Dispatch
19
20
21	@dataclass
22	class Person:
23	first_name: Optional[str] = None
24	last_name: Optional[str] = None
25	work_email_address: Optional[str] = None
26	alt_work_email_address: Optional[str] = None
27	_id: uuid.UUID = field(default_factory=uuid.uuid4)
28	is_attorney: Optional[str] = None
29	split_role_date_range: Optional[str] = None
30	sidley_validated: Optional[str] = None
31	category: Optional[str] = None
32	organization: Optional[str] = None
33	job_title: Optional[str] = None
34	business_title: Optional[str] = None
35	full_name_preferred: Optional[str] = None
36	login: Optional[str] = None
37	department_fine: Optional[str] = None
38	addressed_during_caag: Optional[str] = None
39
40	def __post_init__(self):
41	"""Convert all string fields to uppercase."""
42	if self.first_name:
43	self.first_name = self.first_name.upper()
44	if self.last_name:
45	self.last_name = self.last_name.upper()
46	if self.work_email_address:
47	self.work_email_address = self.work_email_address.upper()
48	if self.alt_work_email_address:
49	self.alt_work_email_address = self.alt_work_email_address.upper()
50	if self.is_attorney:
51	self.is_attorney = self.is_attorney.upper()
52	if self.split_role_date_range:
53	self.split_role_date_range = self.split_role_date_range.upper()
54	if self.sidley_validated:
55	self.sidley_validated = self.sidley_validated.upper()
56	if self.category:
57	self.category = self.category.upper()
58	if self.organization:
59	self.organization = self.organization.upper()
60	if self.job_title:
61	self.job_title = self.job_title.upper()
62	if self.business_title:
63	self.business_title = self.business_title.upper()
64	if self.full_name_preferred:
65	self.full_name_preferred = self.full_name_preferred.upper()
66	if self.login:
67	self.login = self.login.upper()
68	if self.department_fine:
69	self.department_fine = self.department_fine.upper()
70	if self.addressed_during_caag:
71	self.addressed_during_caag = self.addressed_during_caag.upper()
72
73	@dataclass
74	class PeopleList:
75	people: List[Person] = field(default_factory=list)
76
77	def add_person(self, person: Person):
78	self.people.append(person)
79	#print(f"Added person: {person}")
80
81
82	def search_by_email(self, emailAddress:str) -> Optional[Person]:
83	for person in self.people:
84	if person.work_email_address == emailAddress:
85	return person
86	return None
87
88	def list_people(self):
89	for person in self.people:
90	print(person)
91
92
93	class NamesVerification(object):
94	"""A class for automating the process of performing QC on the names within the Amazon privilege logs."""
95	version = '0.2.0'
96
97
98	def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName, forceNewPklFile = False, Encoding = 'UTF8'):
99	"""Initializes the data structures. cleanedDatExportFileName should be the full path to the file.
100	Assumes the first row of the data file is the header and first column is DocID.
101	Assumes the MAL is a spreadsheet (for now).
102	MAL gets saved to a pkl file for performance reasons. pkl will be used unless forceNewPklFile is set to true"""
103	pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"
104
105	print("Initializing data structures...")
106	if forceNewPklFile:
107	print("Creating MAL structure...")
108	self.malPeopleList = PeopleList()
109	self.__IngestMALSpreadsheet(masterAttorneyListFileName)
110	print("MAL structure created.")
111	print("Creating pickle backup...")
112	self.__SaveMalToPkl(pklFileName)
113	print("Pickle backup created.")
114	else:
115	if os.path.exists(pklFileName):
116	print("Loading MAL structure from pickle file...")
117	self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
118	print("MAL structure loaded.")
119	else:
120	print("Pickle file doesnt exist.")
121	print("Creating MAL structure...")
122	self.malPeopleList = PeopleList()
123	self.__IngestMALSpreadsheet(masterAttorneyListFileName)
124	print("MAL structure created.")
125	print("Creating pickle backup...")
126	self.__SaveMalToPkl(pklFileName)
127	print("Pickle backup created.")
128
129	## self.malPeopleList = PeopleList()
130	##
131	## print("Creating MAL structure...")
132	## self.__IngestMALSpreadsheet(masterAttorneyListFileName)
133	## print("MAL structure created.")
134	## print("Creating pickle backup...")
135
136
137
138
139
140	def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
141	"""Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass."""
142	## There doenst seem to be a consistent value in the "row" column in the MAL, so setting these parameters here to avoid gap issues.
143
144	excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":10909, "beginColNumber":2, "endColNumber":16},
145	{"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":565, "beginColNumber":2, "endColNumber":15}]
146
147	# excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":16},
148	# {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":15}]
149
150	spreadsheetFileMappingMatrix = {"First Name":"first_name", "Last Name":"last_name", "Work Email":"work_email_address", "Alt Work Email":"alt_work_email_address", "Is Attorney": "is_attorney",
151	"Split Role - Attorney Capacity Date Range":"split_role_date_range", "Sidley Validated?":"sidley_validated", "Category": "category", "Organization":"organization", "Job Title":"job_title",
152	"Business Title":"business_title", "Full Name (Preferred)":"full_name_preferred", "Login":"login", "Department (Fine)":"department_fine", "Addressed during CAAG":"addressed_during_caag"}
153
154	xlApp = Dispatch('Excel.Application')
155	xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
156
157	for excelTab in excelTabParametersList:
158	sht = xlBook.Worksheets(excelTab['tabName'])
159	print(f"Ingesting sheet {excelTab['tabName']}.")
160	excelFieldPositionMatrix = {}
161	for col in range (excelTab['beginColNumber'], excelTab['endColNumber'] +1):
162	excelFieldPositionMatrix[sht.Cells(1,col).Value] = col
163	for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] +1):
164	#print(row)
165	## TODO: Refactor the excelTabParametersList later. Didnt realize columns were not consistent.
166	if excelTab['tabName'] == 'Attorneys':
167	self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
168	split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role - Attorney Capacity Date Range']).Value,
169	sidley_validated = sht.Cells(row,excelFieldPositionMatrix['Sidley Validated?']).Value,
170	category = sht.Cells(row,excelFieldPositionMatrix['Category']).Value,
171	organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
172	last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
173	first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
174	work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
175	alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
176	job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
177	business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
178	full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
179	login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
180	department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
181	addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))
182
183
184	xlBook.Close()
185
186	def __SaveMalToPkl(self, pklFileName):
187	"""Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
188	outputFile = open(pklFileName,'wb')
189	pickle.dump(self.malPeopleList,outputFile)
190	outputFile.close()
191
192	def __LoadMalFromPkl(self, pklFileName):
193	"""Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
194	contents = open(pklFileName, 'rb')
195	obj = pickle.load(contents)
196	contents.close()
197	return obj
198
199
200	if __name__ == '__main__':
201	#cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
202	cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
203	#cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
204	masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.11.06(7045550.3).xlsx"
205	#masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"
206
207
208
209	nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName)
210	#nv.malPeopleList.list_people()
211
212	qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
213	#print(nv.malPeopleList.search_by_email('joyshine@amazon.com'.upper()))
214	workList = qcP.metadataValuesDict.keys()
215	outputFile = open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w')
216	for docID in workList:
217	metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
218	formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
219	formattedAttorneyValues = set()
220	for formattedValue in formattedFieldValues:
221	if "*" in formattedValue:
222	formattedAttorneyValues.add(formattedValue.upper())
223
224	if metadataFieldValues:
225	matchedMetadataValues = set()
226	for nameItem in metadataFieldValues:
227	## First test to see if there is a valid email address.
228	resultSet = set()
229	results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
230	if results:
231	for result in results:
232	resultSet.add(result)
233	if len(resultSet) >1:
234	print("ERROR multiple email unique email addresses in one item.")
235	else:
236	personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
237	if personMatch:
238	if personMatch.full_name_preferred:
239	## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
240	fullPreferredName = personMatch.full_name_preferred
241	fullPreferredName = fullPreferredName.replace('(LEGAL)','')
242	fullPreferredName = fullPreferredName.replace('(SHE, HER)','')
243	fullPreferredName = fullPreferredName.replace('(SHE HER)','')
244	preferedLastName, preferedFirstName = fullPreferredName.split(',')
245	preferedLastName = preferedLastName.strip()
246	preferedFirstName = preferedFirstName.strip()
247	preferedFirstName = preferedFirstName.split(" ")[0]
248	fullName = f"{preferedFirstName} {preferedLastName}"
249	else:
250	fullName = f"{personMatch.first_name} {personMatch.last_name}"
251	if personMatch.is_attorney == 'YES':
252	#outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name}* ({personMatch.work_email_address.split('@')[-1]})\n")
253	matchedMetadataValues.add(f"{fullName}* ({personMatch.work_email_address.split('@')[-1]})")
254	else:
255	#outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name} ({personMatch.work_email_address.split('@')[-1]})\n")
256	matchedMetadataValues.add(f"{fullName} ({personMatch.work_email_address.split('@')[-1]})")
257	else:
258	outputFile.write(f"{docID} contains a non-email item {nameItem}\n")
259	missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
260	missingFromMeta = formattedAttorneyValues - matchedMetadataValues
261	if missingFromFormatted:
262	for missingItem in missingFromFormatted:
263	outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
264	if missingFromMeta:
265	for missingItem in missingFromMeta:
266	outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
267	if missingFromFormatted:
268	outputFile.write("\n")
269	elif missingFromMeta:
270	outputFile.write("\n")
271	outputFile.close()
272
273
274
275
276	## people_list = PeopleList()
277	## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "fooBar@gmail.com", altWorkEmailAddress = ""))
278	## people_list.add_person(Person(firstName = "Gary", lastName = "Cooper", workEmailAddress = "", altWorkEmailAddress = "spam.eggs@hotmail.com"))
279	## people_list.add_person(Person(firstName = "", lastName = "", workEmailAddress = "noname@gmail.com", altWorkEmailAddress = ""))
280	## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "eggs@outlook.com", altWorkEmailAddress = ""))
281	## print("\nAll People:")
282	## people_list.list_people()
283	## print("\nSearching...")
284	## result = people_list.search_by_email('fooBar@gmail.com')
285	## print(result if result else "email not found.")