Active_prgs/Redgrave/Amazon_NamesNormQC.py

"""

Amazon-NamesNormQC

Created by:
Emanuel Borges
11.21.2024

This program will assist with the process of performing Names Normalization QC on the Amazon privilege logs.

"""

import os, uuid, pickle, re
import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
from dataclasses import dataclass, field
from typing import List, Optional
from collections import namedtuple
from win32com.client import Dispatch


@dataclass
class Person:
    first_name: Optional[str] = None
    last_name: Optional[str] = None
    work_email_address: Optional[str] = None
    alt_work_email_address: Optional[str] = None
    _id: uuid.UUID = field(default_factory=uuid.uuid4)
    is_attorney: Optional[str] = None
    split_role_date_range: Optional[str] = None
    sidley_validated: Optional[str] = None
    category: Optional[str] = None
    organization: Optional[str] = None
    job_title: Optional[str] = None
    business_title: Optional[str] = None
    full_name_preferred: Optional[str] = None
    login: Optional[str] = None
    department_fine: Optional[str] = None
    addressed_during_caag: Optional[str] = None
    #last_updated: Optional[str] = None
    full_name_overide: Optional[str] = None

    def __post_init__(self):
        """Convert all string fields to uppercase."""
        if self.first_name:
            self.first_name = self.first_name.strip().upper()
        if self.last_name:
            self.last_name = self.last_name.strip().upper()
        if self.work_email_address:
            self.work_email_address = self.work_email_address.strip().upper()
        if self.alt_work_email_address:
            self.alt_work_email_address = self.alt_work_email_address.strip().upper()
        if self.is_attorney:
            self.is_attorney = self.is_attorney.strip().upper()
        if self.split_role_date_range:
            self.split_role_date_range = self.split_role_date_range.strip().upper()
        if self.sidley_validated:
            self.sidley_validated = self.sidley_validated.strip().upper()
        if self.category:
            self.category = self.category.strip().upper()
        if self.organization:
            self.organization = self.organization.strip().upper()
        if self.job_title:
            self.job_title = self.job_title.strip().upper()
        if self.business_title:
            self.business_title = self.business_title.strip().upper()
        if self.full_name_preferred:
            self.full_name_preferred = self.full_name_preferred.strip().upper()
        if self.login:
            self.login = self.login.strip().upper()
        if self.department_fine:
            self.department_fine = self.department_fine.strip().upper()
        if self.addressed_during_caag:
            self.addressed_during_caag = self.addressed_during_caag.strip().upper()
        #if self.last_updated:
        #    self.last_updated = self.last_updated.strip().upper()

@dataclass
class PeopleList:
    people: List[Person] = field(default_factory=list)

    def add_person(self, person: Person):
        self.people.append(person)
        #print(f"Added person: {person}")


    def search_by_email(self, emailAddress:str) -> Optional[Person]:
        for person in self.people:
            if person.work_email_address == emailAddress:
                return person
        return None
        
    def list_people(self):
        for person in self.people:
            print(person)

    def update_full_Name_overide(self, emailAddress:str, fullNameOverideValue) -> Optional[Person]:
        valueUpdated = False
        for person in self.people:
            if person.work_email_address == emailAddress.upper():
                person.full_name_overide = fullNameOverideValue.upper()
                valueUpdated = True
                ## Give a quik warning as you add the override value into the database if the last name differs.
                if "," in fullNameOverideValue:
                    lastName = fullNameOverideValue.split(",")[0]
                else:
                    lastName = fullNameOverideValue.split(" ")[-1]
                if lastName.upper() == person.last_name:
                    pass
                else:
                    print(f"WARNING: Overide last name value {lastName.upper()} does not match {person.last_name}.")
        if valueUpdated == False:
            print(f"WARNING: No email address match for {emailAddress} found.")


class NamesVerification(object):
    """A class for automating the process of performing QC on the names within the Amazon privilege logs."""
    version = '0.6.0'


    def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName,fullNameOveridesFileName, forceNewPklFile = False, Encoding = 'UTF8'):
        """Initializes the data structures. cleanedDatExportFileName should be the full path to the file.
        Assumes the first row of the data file is the header and first column is DocID.
        Assumes the MAL is a spreadsheet (for now).
        MAL gets saved to a pkl file for performance reasons.  pkl will be used unless forceNewPklFile is set to true"""
        pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"

        print("Initializing data structures...")
        if forceNewPklFile:
            print("Creating MAL structure...")
            self.malPeopleList = PeopleList()
            self.__IngestMALSpreadsheet(masterAttorneyListFileName)
            print("MAL structure created.")
            print("Loading full name overide values...")
            self.__LoadFullNameOverideValues(fullNameOveridesFileName)
            print("Full name overide values loaded.")
            print("Creating pickle backup...")
            self.__SaveMalToPkl(pklFileName)
            print("Pickle backup created.")
        else:
            if os.path.exists(pklFileName):
                print("Loading MAL structure from pickle file...")
                self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
                print("MAL structure loaded.")
            else:
                print("Pickle file doesnt exist.")
                print("Creating MAL structure...")
                self.malPeopleList = PeopleList()
                self.__IngestMALSpreadsheet(masterAttorneyListFileName)
                print("MAL structure created.")
                print("Loading full name overide values...")
                self.__LoadFullNameOverideValues(fullNameOveridesFileName)
                print("Full name overide values loaded.")
                print("Creating pickle backup...")
                self.__SaveMalToPkl(pklFileName)
                print("Pickle backup created.")
                
##        self.malPeopleList = PeopleList()
##        
##        print("Creating MAL structure...")
##        self.__IngestMALSpreadsheet(masterAttorneyListFileName)
##        print("MAL structure created.")
##        print("Creating pickle backup...")


    def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
        """Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass."""
        ## There doenst seem to be a consistent value in the "row" column in the MAL, so setting these parameters here to avoid gap issues.

        excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":10919, "beginColNumber":2, "endColNumber":17},
                               {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":572, "beginColNumber":2, "endColNumber":16}]

 #       excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":16},
 #                              {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":15}]

        spreadsheetFileMappingMatrix = {"First Name":"first_name", "Last Name":"last_name", "Work Email":"work_email_address", "Alt Work Email":"alt_work_email_address", "Is Attorney": "is_attorney",
                                  "Split Role -  Attorney Capacity Date Range":"split_role_date_range", " Validated by OC??":"sidley_validated", "Category": "category", "Organization":"organization", "Job Title":"job_title",
                                  "Business Title":"business_title", "Full Name (Preferred)":"full_name_preferred", "Login":"login", "Department (Fine)":"department_fine", "Addressed during CAAG":"addressed_during_caag",
                                        "Last Updated":"last_updated"}

        xlApp = Dispatch('Excel.Application')
        xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
        
        for excelTab in excelTabParametersList:
            sht = xlBook.Worksheets(excelTab['tabName'])
            print(f"Ingesting sheet {excelTab['tabName']}.")
            excelFieldPositionMatrix = {}
            for col in range (excelTab['beginColNumber'], excelTab['endColNumber'] +1):
                excelFieldPositionMatrix[sht.Cells(1,col).Value] = col
            for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] +1):
                #print(row)
                ##  TODO: Refactor the excelTabParametersList later. Didnt realize columns were not consistent.
                if excelTab['tabName'] == 'Attorneys':
                    self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
                                                         split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role -  Attorney Capacity Date Range']).Value,
                                                         sidley_validated =  sht.Cells(row,excelFieldPositionMatrix[' Validated by OC?']).Value,
                                                         category = sht.Cells(row,excelFieldPositionMatrix['Category']).Value,
                                                         organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
                                                         last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
                                                         first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
                                                         work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
                                                         alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
                                                         job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
                                                         business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
                                                         full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
                                                         login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
                                                         department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
                                                         addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Comments']).Value))
                                                         #addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))
                                                         #last_updated = sht.Cells(row,excelFieldPositionMatrix['Last Updated']).Value ))
                
                else:
                    self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
                                                         split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role -  Attorney Capacity Date Range']).Value,
                                                         sidley_validated =  sht.Cells(row,excelFieldPositionMatrix['Validated by OC?']).Value,
                                                         organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
                                                         last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
                                                         first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
                                                         work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
                                                         alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
                                                         job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
                                                         business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
                                                         full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
                                                         login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
                                                         department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
                                                         addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))                                         

        xlBook.Close()

    def __SaveMalToPkl(self, pklFileName):
        """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
        outputFile = open(pklFileName,'wb')
        pickle.dump(self.malPeopleList,outputFile)
        outputFile.close()

    def __LoadMalFromPkl(self, pklFileName):
        """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
        contents = open(pklFileName, 'rb')
        obj = pickle.load(contents)
        contents.close()
        return obj

    def __LoadFullNameOverideValues(self, fullNameOveridesFileName):
        """Pseudo-private method which will update the MAL people list object with the full name overide values."""
        contents = open(fullNameOveridesFileName).readlines()
        for line in contents:
            line = line.replace("\n","")
            emailAddress,fullNameOverideValue = line.split("|")
            
            self.malPeopleList.update_full_Name_overide(emailAddress, fullNameOverideValue)

    def SmartDedupeSet(self, currentSet):
        """Pseudo-private method that attempts to do some additional deduplication of the values in a set by lowering all values and deduplicating.  Returns a lowered deduplicated set."""
        newSet = set()
        for val in currentSet:
            newSet.add(val.lower())
        return newSet

if __name__ == '__main__':
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\VEAS\VEAS_Log_Data_Export_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\TEST-PLOG.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
    #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List  2024.11.06(7045550.3).xlsx"
    masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List  2024.12.1(7045413.15).xlsx"
    #masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"
    #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides.txt"
    #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides - Copy.txt"
    fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
    

    nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
    #nv.malPeopleList.list_people()

    qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
    print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
    #print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
    workList = qcP.metadataValuesDict.keys()
    outputFile = open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w')
    for docID in workList:
        #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
        #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
        #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['ccValues']
        #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['ccValues']
        #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['bccValues']
        #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['bccValues']
        metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
        formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
        formattedAttorneyValues = set()
        for formattedValue in formattedFieldValues:
            if "*" in formattedValue:
                formattedAttorneyValues.add(formattedValue.upper())
        
        if metadataFieldValues:
            matchedMetadataValues = set()
            for nameItem in metadataFieldValues:
                ## First test to see if there is a valid email address.
                resultSet = set()
                results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
                if results:
                    for result in results:
                        resultSet.add(result)
                    if len(resultSet) >1:
                        resultSet = nv.SmartDedupeSet(resultSet)
                    if len(resultSet) >1:
                        print("ERROR multiple email **unique** email addresses in one item.")
                        print(resultSet)
                        print("\n")
                    else:
                        personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
                        if personMatch:
                            if personMatch.full_name_overide:
                                fullName = personMatch.full_name_overide
                            elif personMatch.full_name_preferred:
                                #print(personMatch.full_name_preferred)
                                ##  Going to need to do a bit of replacing to remove some information that is just never in the formatted.
                                fullPreferredName = personMatch.full_name_preferred
                                fullPreferredName = fullPreferredName.replace('(LEGAL)','')
                                fullPreferredName = fullPreferredName.replace('(SHE, HER)','')
                                fullPreferredName = fullPreferredName.replace('(SHE HER)','')
                                preferedLastName, preferedFirstName = fullPreferredName.split(',')
                                preferedLastName = preferedLastName.strip()
                                preferedFirstName = preferedFirstName.strip()
                                preferedFirstName = preferedFirstName.split(" ")[0]
                                fullName = f"{preferedFirstName} {preferedLastName}"
                                #fullName = f"{preferedLastName}, {preferedFirstName}"
                            else:
                                fullName = f"{personMatch.first_name} {personMatch.last_name}"
                                #fullName = f"{personMatch.last_name}, {personMatch.first_name}"
                            if personMatch.is_attorney == 'YES':
                                #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name}* ({personMatch.work_email_address.split('@')[-1]})\n")
                                matchedMetadataValues.add(f"{fullName}* ({personMatch.work_email_address.split('@')[-1]})")
                            else:
                                #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name} ({personMatch.work_email_address.split('@')[-1]})\n")
                                matchedMetadataValues.add(f"{fullName} ({personMatch.work_email_address.split('@')[-1]})")
                else:
                    outputFile.write(f"{docID} contains a non-email item {nameItem}\n\n")
            missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
            missingFromMeta = formattedAttorneyValues - matchedMetadataValues
            if missingFromFormatted:
                for missingItem in missingFromFormatted:
                    outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
            if missingFromMeta:
                for missingItem in missingFromMeta:
                    outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
            if missingFromFormatted:
                outputFile.write("\n")
            elif missingFromMeta:
                outputFile.write("\n")
    outputFile.close()


##    people_list = PeopleList()
##    people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "fooBar@gmail.com", altWorkEmailAddress = ""))
##    people_list.add_person(Person(firstName = "Gary", lastName = "Cooper", workEmailAddress = "", altWorkEmailAddress = "spam.eggs@hotmail.com"))
##    people_list.add_person(Person(firstName = "", lastName = "", workEmailAddress = "noname@gmail.com", altWorkEmailAddress = ""))
##    people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "eggs@outlook.com", altWorkEmailAddress = ""))
##    print("\nAll People:")
##    people_list.list_people()
##    print("\nSearching...")
##    result = people_list.search_by_email('fooBar@gmail.com')
##    print(result if result else "email not found.")
Revision:	846
Committed:	Tue Dec 10 17:22:55 2024 UTC (15 months, 2 weeks ago) by nino.borges
Content type:	text/x-python
File size:	23675 byte(s)
Log Message:	They changed a column name to 'comments', so updated dataclass. also added some warning messages when the consilio override ingestion cant find a matching email address in the MAL. I'm seeing many of these.
#	Content
1	"""
2
3	Amazon-NamesNormQC
4
5	Created by:
6	Emanuel Borges
7	11.21.2024
8
9	This program will assist with the process of performing Names Normalization QC on the Amazon privilege logs.
10
11	"""
12
13	import os, uuid, pickle, re
14	import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
15	from dataclasses import dataclass, field
16	from typing import List, Optional
17	from collections import namedtuple
18	from win32com.client import Dispatch
19
20
21	@dataclass
22	class Person:
23	first_name: Optional[str] = None
24	last_name: Optional[str] = None
25	work_email_address: Optional[str] = None
26	alt_work_email_address: Optional[str] = None
27	_id: uuid.UUID = field(default_factory=uuid.uuid4)
28	is_attorney: Optional[str] = None
29	split_role_date_range: Optional[str] = None
30	sidley_validated: Optional[str] = None
31	category: Optional[str] = None
32	organization: Optional[str] = None
33	job_title: Optional[str] = None
34	business_title: Optional[str] = None
35	full_name_preferred: Optional[str] = None
36	login: Optional[str] = None
37	department_fine: Optional[str] = None
38	addressed_during_caag: Optional[str] = None
39	#last_updated: Optional[str] = None
40	full_name_overide: Optional[str] = None
41
42	def __post_init__(self):
43	"""Convert all string fields to uppercase."""
44	if self.first_name:
45	self.first_name = self.first_name.strip().upper()
46	if self.last_name:
47	self.last_name = self.last_name.strip().upper()
48	if self.work_email_address:
49	self.work_email_address = self.work_email_address.strip().upper()
50	if self.alt_work_email_address:
51	self.alt_work_email_address = self.alt_work_email_address.strip().upper()
52	if self.is_attorney:
53	self.is_attorney = self.is_attorney.strip().upper()
54	if self.split_role_date_range:
55	self.split_role_date_range = self.split_role_date_range.strip().upper()
56	if self.sidley_validated:
57	self.sidley_validated = self.sidley_validated.strip().upper()
58	if self.category:
59	self.category = self.category.strip().upper()
60	if self.organization:
61	self.organization = self.organization.strip().upper()
62	if self.job_title:
63	self.job_title = self.job_title.strip().upper()
64	if self.business_title:
65	self.business_title = self.business_title.strip().upper()
66	if self.full_name_preferred:
67	self.full_name_preferred = self.full_name_preferred.strip().upper()
68	if self.login:
69	self.login = self.login.strip().upper()
70	if self.department_fine:
71	self.department_fine = self.department_fine.strip().upper()
72	if self.addressed_during_caag:
73	self.addressed_during_caag = self.addressed_during_caag.strip().upper()
74	#if self.last_updated:
75	# self.last_updated = self.last_updated.strip().upper()
76
77	@dataclass
78	class PeopleList:
79	people: List[Person] = field(default_factory=list)
80
81	def add_person(self, person: Person):
82	self.people.append(person)
83	#print(f"Added person: {person}")
84
85
86	def search_by_email(self, emailAddress:str) -> Optional[Person]:
87	for person in self.people:
88	if person.work_email_address == emailAddress:
89	return person
90	return None
91
92	def list_people(self):
93	for person in self.people:
94	print(person)
95
96	def update_full_Name_overide(self, emailAddress:str, fullNameOverideValue) -> Optional[Person]:
97	valueUpdated = False
98	for person in self.people:
99	if person.work_email_address == emailAddress.upper():
100	person.full_name_overide = fullNameOverideValue.upper()
101	valueUpdated = True
102	## Give a quik warning as you add the override value into the database if the last name differs.
103	if "," in fullNameOverideValue:
104	lastName = fullNameOverideValue.split(",")[0]
105	else:
106	lastName = fullNameOverideValue.split(" ")[-1]
107	if lastName.upper() == person.last_name:
108	pass
109	else:
110	print(f"WARNING: Overide last name value {lastName.upper()} does not match {person.last_name}.")
111	if valueUpdated == False:
112	print(f"WARNING: No email address match for {emailAddress} found.")
113
114
115	class NamesVerification(object):
116	"""A class for automating the process of performing QC on the names within the Amazon privilege logs."""
117	version = '0.6.0'
118
119
120	def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName,fullNameOveridesFileName, forceNewPklFile = False, Encoding = 'UTF8'):
121	"""Initializes the data structures. cleanedDatExportFileName should be the full path to the file.
122	Assumes the first row of the data file is the header and first column is DocID.
123	Assumes the MAL is a spreadsheet (for now).
124	MAL gets saved to a pkl file for performance reasons. pkl will be used unless forceNewPklFile is set to true"""
125	pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"
126
127	print("Initializing data structures...")
128	if forceNewPklFile:
129	print("Creating MAL structure...")
130	self.malPeopleList = PeopleList()
131	self.__IngestMALSpreadsheet(masterAttorneyListFileName)
132	print("MAL structure created.")
133	print("Loading full name overide values...")
134	self.__LoadFullNameOverideValues(fullNameOveridesFileName)
135	print("Full name overide values loaded.")
136	print("Creating pickle backup...")
137	self.__SaveMalToPkl(pklFileName)
138	print("Pickle backup created.")
139	else:
140	if os.path.exists(pklFileName):
141	print("Loading MAL structure from pickle file...")
142	self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
143	print("MAL structure loaded.")
144	else:
145	print("Pickle file doesnt exist.")
146	print("Creating MAL structure...")
147	self.malPeopleList = PeopleList()
148	self.__IngestMALSpreadsheet(masterAttorneyListFileName)
149	print("MAL structure created.")
150	print("Loading full name overide values...")
151	self.__LoadFullNameOverideValues(fullNameOveridesFileName)
152	print("Full name overide values loaded.")
153	print("Creating pickle backup...")
154	self.__SaveMalToPkl(pklFileName)
155	print("Pickle backup created.")
156
157	## self.malPeopleList = PeopleList()
158	##
159	## print("Creating MAL structure...")
160	## self.__IngestMALSpreadsheet(masterAttorneyListFileName)
161	## print("MAL structure created.")
162	## print("Creating pickle backup...")
163
164
165
166
167
168	def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
169	"""Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass."""
170	## There doenst seem to be a consistent value in the "row" column in the MAL, so setting these parameters here to avoid gap issues.
171
172	excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":10919, "beginColNumber":2, "endColNumber":17},
173	{"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":572, "beginColNumber":2, "endColNumber":16}]
174
175	# excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":16},
176	# {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":30, "beginColNumber":2, "endColNumber":15}]
177
178	spreadsheetFileMappingMatrix = {"First Name":"first_name", "Last Name":"last_name", "Work Email":"work_email_address", "Alt Work Email":"alt_work_email_address", "Is Attorney": "is_attorney",
179	"Split Role - Attorney Capacity Date Range":"split_role_date_range", " Validated by OC??":"sidley_validated", "Category": "category", "Organization":"organization", "Job Title":"job_title",
180	"Business Title":"business_title", "Full Name (Preferred)":"full_name_preferred", "Login":"login", "Department (Fine)":"department_fine", "Addressed during CAAG":"addressed_during_caag",
181	"Last Updated":"last_updated"}
182
183	xlApp = Dispatch('Excel.Application')
184	xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
185
186	for excelTab in excelTabParametersList:
187	sht = xlBook.Worksheets(excelTab['tabName'])
188	print(f"Ingesting sheet {excelTab['tabName']}.")
189	excelFieldPositionMatrix = {}
190	for col in range (excelTab['beginColNumber'], excelTab['endColNumber'] +1):
191	excelFieldPositionMatrix[sht.Cells(1,col).Value] = col
192	for row in range(excelTab['beginRowNumber'], excelTab['endRowNumber'] +1):
193	#print(row)
194	## TODO: Refactor the excelTabParametersList later. Didnt realize columns were not consistent.
195	if excelTab['tabName'] == 'Attorneys':
196	self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
197	split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role - Attorney Capacity Date Range']).Value,
198	sidley_validated = sht.Cells(row,excelFieldPositionMatrix[' Validated by OC?']).Value,
199	category = sht.Cells(row,excelFieldPositionMatrix['Category']).Value,
200	organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
201	last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
202	first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
203	work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
204	alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
205	job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
206	business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
207	full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
208	login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
209	department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
210	addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Comments']).Value))
211	#addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))
212	#last_updated = sht.Cells(row,excelFieldPositionMatrix['Last Updated']).Value ))
213
214	else:
215	self.malPeopleList.add_person(Person(is_attorney = sht.Cells(row,excelFieldPositionMatrix['Is Attorney']).Value,
216	split_role_date_range = sht.Cells(row,excelFieldPositionMatrix['Split Role - Attorney Capacity Date Range']).Value,
217	sidley_validated = sht.Cells(row,excelFieldPositionMatrix['Validated by OC?']).Value,
218	organization = sht.Cells(row,excelFieldPositionMatrix['Organization']).Value,
219	last_name = sht.Cells(row,excelFieldPositionMatrix['Last Name']).Value,
220	first_name = sht.Cells(row,excelFieldPositionMatrix['First Name']).Value,
221	work_email_address = sht.Cells(row,excelFieldPositionMatrix['Work Email']).Value,
222	alt_work_email_address = sht.Cells(row,excelFieldPositionMatrix['Alt Work Email']).Value,
223	job_title = sht.Cells(row,excelFieldPositionMatrix['Job Title']).Value,
224	business_title = sht.Cells(row,excelFieldPositionMatrix['Business Title']).Value,
225	full_name_preferred = sht.Cells(row,excelFieldPositionMatrix['Full Name (Preferred)']).Value,
226	login = sht.Cells(row,excelFieldPositionMatrix['Login']).Value,
227	department_fine = sht.Cells(row,excelFieldPositionMatrix['Department (Fine)']).Value,
228	addressed_during_caag = sht.Cells(row,excelFieldPositionMatrix['Addressed during CAAG']).Value))
229
230	xlBook.Close()
231
232	def __SaveMalToPkl(self, pklFileName):
233	"""Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
234	outputFile = open(pklFileName,'wb')
235	pickle.dump(self.malPeopleList,outputFile)
236	outputFile.close()
237
238	def __LoadMalFromPkl(self, pklFileName):
239	"""Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
240	contents = open(pklFileName, 'rb')
241	obj = pickle.load(contents)
242	contents.close()
243	return obj
244
245	def __LoadFullNameOverideValues(self, fullNameOveridesFileName):
246	"""Pseudo-private method which will update the MAL people list object with the full name overide values."""
247	contents = open(fullNameOveridesFileName).readlines()
248	for line in contents:
249	line = line.replace("\n","")
250	emailAddress,fullNameOverideValue = line.split("\|")
251
252	self.malPeopleList.update_full_Name_overide(emailAddress, fullNameOverideValue)
253
254	def SmartDedupeSet(self, currentSet):
255	"""Pseudo-private method that attempts to do some additional deduplication of the values in a set by lowering all values and deduplicating. Returns a lowered deduplicated set."""
256	newSet = set()
257	for val in currentSet:
258	newSet.add(val.lower())
259	return newSet
260
261	if __name__ == '__main__':
262	cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\VEAS\VEAS_Log_Data_Export_Converted.txt"
263	#cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
264	#cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\TEST-PLOG.txt"
265	#cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
266	#cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
267	#cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
268	#masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.11.06(7045550.3).xlsx"
269	masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List 2024.12.1(7045413.15).xlsx"
270	#masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"
271	#fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides.txt"
272	#fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides - Copy.txt"
273	fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
274
275
276
277	nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
278	#nv.malPeopleList.list_people()
279
280	qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
281	print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
282	#print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
283	workList = qcP.metadataValuesDict.keys()
284	outputFile = open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w')
285	for docID in workList:
286	#metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
287	#formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
288	#metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['ccValues']
289	#formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['ccValues']
290	#metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['bccValues']
291	#formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['bccValues']
292	metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
293	formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
294	formattedAttorneyValues = set()
295	for formattedValue in formattedFieldValues:
296	if "*" in formattedValue:
297	formattedAttorneyValues.add(formattedValue.upper())
298
299	if metadataFieldValues:
300	matchedMetadataValues = set()
301	for nameItem in metadataFieldValues:
302	## First test to see if there is a valid email address.
303	resultSet = set()
304	results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
305	if results:
306	for result in results:
307	resultSet.add(result)
308	if len(resultSet) >1:
309	resultSet = nv.SmartDedupeSet(resultSet)
310	if len(resultSet) >1:
311	print("ERROR multiple email unique email addresses in one item.")
312	print(resultSet)
313	print("\n")
314	else:
315	personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
316	if personMatch:
317	if personMatch.full_name_overide:
318	fullName = personMatch.full_name_overide
319	elif personMatch.full_name_preferred:
320	#print(personMatch.full_name_preferred)
321	## Going to need to do a bit of replacing to remove some information that is just never in the formatted.
322	fullPreferredName = personMatch.full_name_preferred
323	fullPreferredName = fullPreferredName.replace('(LEGAL)','')
324	fullPreferredName = fullPreferredName.replace('(SHE, HER)','')
325	fullPreferredName = fullPreferredName.replace('(SHE HER)','')
326	preferedLastName, preferedFirstName = fullPreferredName.split(',')
327	preferedLastName = preferedLastName.strip()
328	preferedFirstName = preferedFirstName.strip()
329	preferedFirstName = preferedFirstName.split(" ")[0]
330	fullName = f"{preferedFirstName} {preferedLastName}"
331	#fullName = f"{preferedLastName}, {preferedFirstName}"
332	else:
333	fullName = f"{personMatch.first_name} {personMatch.last_name}"
334	#fullName = f"{personMatch.last_name}, {personMatch.first_name}"
335	if personMatch.is_attorney == 'YES':
336	#outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name}* ({personMatch.work_email_address.split('@')[-1]})\n")
337	matchedMetadataValues.add(f"{fullName}* ({personMatch.work_email_address.split('@')[-1]})")
338	else:
339	#outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name} ({personMatch.work_email_address.split('@')[-1]})\n")
340	matchedMetadataValues.add(f"{fullName} ({personMatch.work_email_address.split('@')[-1]})")
341	else:
342	outputFile.write(f"{docID} contains a non-email item {nameItem}\n\n")
343	missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
344	missingFromMeta = formattedAttorneyValues - matchedMetadataValues
345	if missingFromFormatted:
346	for missingItem in missingFromFormatted:
347	outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
348	if missingFromMeta:
349	for missingItem in missingFromMeta:
350	outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
351	if missingFromFormatted:
352	outputFile.write("\n")
353	elif missingFromMeta:
354	outputFile.write("\n")
355	outputFile.close()
356
357
358
359
360	## people_list = PeopleList()
361	## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "fooBar@gmail.com", altWorkEmailAddress = ""))
362	## people_list.add_person(Person(firstName = "Gary", lastName = "Cooper", workEmailAddress = "", altWorkEmailAddress = "spam.eggs@hotmail.com"))
363	## people_list.add_person(Person(firstName = "", lastName = "", workEmailAddress = "noname@gmail.com", altWorkEmailAddress = ""))
364	## people_list.add_person(Person(firstName = "Sally", lastName = "Smith", workEmailAddress = "eggs@outlook.com", altWorkEmailAddress = ""))
365	## print("\nAll People:")
366	## people_list.list_people()
367	## print("\nSearching...")
368	## result = people_list.search_by_email('fooBar@gmail.com')
369	## print(result if result else "email not found.")