# Active_prgs/Redgrave/Amazon_NamesNormQC.py

"""

Amazon_NamesNormQC

Created by:
Emanuel Borges
11.21.2024

This Library will assist with the process of performing Names Normalization QC on the Amazon privilege logs.

"""

import os, uuid, pickle, re
#import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
from dataclasses import dataclass, field, fields
from typing import List, Tuple, Optional
from collections import namedtuple
from win32com.client import Dispatch


@dataclass
class Person:
    """One individual's record as ingested from the Master Attorney List (MAL).

    Every attribute is optional.  The free-text attributes listed in
    ``__post_init__`` are stripped and upper-cased on construction so that
    later comparisons can be exact string matches.
    """
    first_name: Optional[str] = None
    last_name: Optional[str] = None
    work_email_address: Optional[str] = None
    alt_work_email_address: Optional[str] = None
    # Generated per instance; a fresh uuid4 unless one is supplied explicitly.
    _id: uuid.UUID = field(default_factory=uuid.uuid4)
    is_attorney: Optional[str] = None
    split_role_date_range: Optional[str] = None
    sidley_validated: Optional[str] = None
    category: Optional[str] = None
    organization: Optional[str] = None
    job_title: Optional[str] = None
    business_title: Optional[str] = None
    full_name_preferred: Optional[str] = None
    login: Optional[str] = None
    department_fine: Optional[str] = None
    addressed_during_caag: Optional[str] = None
    full_name_overide: Optional[str] = None
    ## Only gather unique_attorney_row_number from the attorney and split role attorney tabs.  NEVER from downgrades.
    unique_attorney_row_number:Optional[str] = None
    ##  Saved as a list of tuple pairs (startdate,enddate).  None is allowed for now; this may later be
    ##  changed to always hold a list (via default_factory, which avoids the shared-mutable-default problem).
    dates_as_counsel:Optional[List[Tuple[str,str]]] = None

    def __post_init__(self):
        """Strip and upper-case the free-text fields.

        Only non-empty ``str`` values are touched.  Anything else (``None``,
        or a non-string value such as a number read from a spreadsheet cell)
        is left exactly as supplied instead of raising ``AttributeError``.
        """
        ## Same set of fields as before; full_name_overide, unique_attorney_row_number
        ## and dates_as_counsel are intentionally left untouched here.
        for fieldName in ("first_name", "last_name", "work_email_address", "alt_work_email_address",
                          "is_attorney", "split_role_date_range", "sidley_validated", "category",
                          "organization", "job_title", "business_title", "full_name_preferred",
                          "login", "department_fine", "addressed_during_caag"):
            currentValue = getattr(self, fieldName)
            if isinstance(currentValue, str) and currentValue:
                setattr(self, fieldName, currentValue.strip().upper())

@dataclass
class PeopleList:
    """An ordered, in-memory collection of Person records with simple linear lookups."""
    people: List["Person"] = field(default_factory=list)

    def add_person(self, person: "Person"):
        """Append person to the end of the collection."""
        self.people.append(person)


    def search_by_email(self, emailAddress:str) -> Optional["Person"]:
        """Returns the first person whose work or alt work email equals emailAddress.  Assumes emailAddresses are unique.

        The comparison is exact, so the caller supplies the value in the same form it is stored.
        An empty or None emailAddress returns None rather than matching a person with no address."""
        if not emailAddress:
            return None
        for person in self.people:
            if emailAddress in (person.work_email_address, person.alt_work_email_address):
                return person
        return None


    def search_by_unique_attorney_row_number(self,uniqueAttorneyRowNumber:str) -> Optional["Person"]:
        """Returns the first matching uniqueAttorneyRowNumber value.  Assumes uniqueAttorneyRowNumbers are unique.

        A None uniqueAttorneyRowNumber returns None; otherwise it would match the first person that
        simply has no row number recorded."""
        if uniqueAttorneyRowNumber is None:
            return None
        for person in self.people:
            if person.unique_attorney_row_number == uniqueAttorneyRowNumber:
                return person
        return None

    def search_by_id(self, idNumber):
        """Returns the first matching idNumber value.  Must be in format UUID('7414f78c-8289-4c9f-bd49-a5aaac35545f')."""
        for person in self.people:
            if person._id == idNumber:
                return person
        return None

    def return_list_of_matching_values(self,fieldName, value:str):
        """Returns a full list of items where value is found in fieldName"""
        return [person for person in self.people if getattr(person, fieldName) == value]

    def list_people(self):
        """Print every person in the collection, one per line."""
        for person in self.people:
            print(person)

    def update_full_Name_overide(self, emailAddress:str, fullNameOverideValue) -> Optional["Person"]:
        """Set full_name_overide on every person whose work email matches emailAddress.

        Both arguments are stripped and upper-cased before use.  A warning is printed when the last
        name taken from the override (text before the first comma, otherwise the last space-separated
        word) differs from the person's last_name, and when no person matches at all.

        Returns the first person updated, or None when nothing matched."""
        normalizedEmail = emailAddress.strip().upper()
        normalizedOverride = fullNameOverideValue.strip().upper()
        ## Work out the override's last name once, up front; it does not depend on the person.
        if "," in normalizedOverride:
            overrideLastName = normalizedOverride.split(",")[0].strip()
        else:
            overrideLastName = normalizedOverride.split(" ")[-1]

        firstUpdatedPerson = None
        for person in self.people:
            if person.work_email_address != normalizedEmail:
                continue
            person.full_name_overide = normalizedOverride
            if firstUpdatedPerson is None:
                firstUpdatedPerson = person
            ## Give a quick warning as the override value goes in if the last name differs.
            if overrideLastName != person.last_name:
                print(f"WARNING: Overide last name value {overrideLastName} does not match {person.last_name}.")
        if firstUpdatedPerson is None:
            print(f"WARNING: No email address match for {emailAddress} found.")
        return firstUpdatedPerson


class NamesVerification(object):
    """A class for automating the process of performing QC on the names within the Amazon privilege logs.

    On construction the Master Attorney List (MAL) is loaded into self.malPeopleList, either from a
    pickle cache sitting next to the spreadsheet or by reading the spreadsheet through Excel.  The
    Run* methods print integrity findings about that list to stdout."""
    version = '0.10.0'


    def __init__(self, cleanedDatExportFileName, masterAttorneyListFileName,fullNameOveridesFileName, forceNewPklFile = False, Encoding = 'UTF8'):
        """Initializes the data structures.

        cleanedDatExportFileName: full path to the data export.  Accepted for interface
            compatibility; nothing in this class reads it.
        masterAttorneyListFileName: full path to the MAL spreadsheet.  The pickle cache uses the
            same path with the extension replaced by .pkl.
        fullNameOveridesFileName: pipe-delimited text file of "email|full name override" lines.
            Only read when the MAL is (re)built from the spreadsheet.
        forceNewPklFile: when true the spreadsheet is always re-read and the pickle rewritten.
        Encoding: accepted for interface compatibility; nothing in this class reads it.

        MAL gets saved to a pkl file for performance reasons.  pkl will be used unless forceNewPklFile is set to true"""
        pklFileName = os.path.splitext(masterAttorneyListFileName)[0] + ".pkl"

        print("Initializing data structures...")
        if not forceNewPklFile and os.path.exists(pklFileName):
            print("Loading MAL structure from pickle file...")
            self.malPeopleList = self.__LoadMalFromPkl(pklFileName)
            print("MAL structure loaded.")
            return

        ##  From here on the MAL is rebuilt from the spreadsheet, either because the caller
        ##  forced it or because no cache exists yet.
        if not forceNewPklFile:
            print("Pickle file doesnt exist.")
        print("Creating MAL structure...")
        self.malPeopleList = PeopleList()
        self.__IngestMALSpreadsheet(masterAttorneyListFileName)
        print("MAL structure created.")
        print("Loading full name overide values...")
        self.__LoadFullNameOverideValues(fullNameOveridesFileName)
        print("Full name overide values loaded.")
        print("Creating pickle backup...")
        self.__SaveMalToPkl(pklFileName)
        print("Pickle backup created.")


    def __IngestMALSpreadsheet(self, masterAttorneyListFileName):
        """Pseudo-private method which will open an Excel spreadsheet and ingest the values into the peoplelist dataclass.

        The workbook is closed even when ingestion raises part-way through."""
        ## There doenst seem to be a consistent value in the "row" column in the MAL, so setting these parameters here to avoid gap issues.

        ##  excelTabParametersList should always be an ordered list because now order matters:
        ##  the split role tab attaches dates to people created by the Attorneys tab.
        excelTabParametersList = [{"tabName":"Attorneys", "beginRowNumber":2, "endRowNumber":10923, "beginColNumber":1, "endColNumber":17},
                                  {"tabName":"Downgrades", "beginRowNumber":2, "endRowNumber":726, "beginColNumber":1, "endColNumber":16},
                                  {"tabName":"Split Role Attorneys", "beginRowNumber":2, "endRowNumber":21, "beginColNumber":1, "endColNumber":10}]

        ##  Person attribute -> spreadsheet header text, per tab.  Headers are not consistent
        ##  between tabs (note the leading space on the Attorneys "Validated by OC?" header).
        attorneyColumnMap = {"is_attorney": 'Is Attorney',
                             "split_role_date_range": 'Split Role - Dates as Counsel',
                             "sidley_validated": ' Validated by OC?',
                             "category": 'Category',
                             "organization": 'Organization',
                             "last_name": 'Last Name',
                             "first_name": 'First Name',
                             "work_email_address": 'Work Email',
                             "alt_work_email_address": 'Alt Work Email',
                             "job_title": 'Job Title',
                             "business_title": 'Business Title',
                             "full_name_preferred": 'Full Name (Preferred)',
                             "login": 'Login',
                             "department_fine": 'Department (Fine)',
                             "unique_attorney_row_number": 'Row',
                             "addressed_during_caag": 'Comments'}
        ## Make sure to NOT grab the unique attorney row number from the Downgrades tab.
        downgradeColumnMap = {"is_attorney": 'Is Attorney',
                              "sidley_validated": 'Validated by OC?',
                              "organization": 'Organization',
                              "last_name": 'Last Name',
                              "first_name": 'First Name',
                              "work_email_address": 'Work Email',
                              "alt_work_email_address": 'Alt Work Email',
                              "job_title": 'Job Title',
                              "business_title": 'Business Title',
                              "full_name_preferred": 'Full Name (Preferred)',
                              "login": 'Login',
                              "department_fine": 'Department (Fine)',
                              "addressed_during_caag": 'Addressed during CAAG'}

        xlApp = Dispatch('Excel.Application')
        xlBook = xlApp.Workbooks.Open(masterAttorneyListFileName)
        try:
            for excelTab in excelTabParametersList:
                sht = xlBook.Worksheets(excelTab['tabName'])
                print(f"Ingesting sheet {excelTab['tabName']}.")
                ##  Row 1 holds the headers; remember which column each header sits in.
                excelFieldPositionMatrix = {}
                for col in range (excelTab['beginColNumber'], excelTab['endColNumber'] +1):
                    excelFieldPositionMatrix[sht.Cells(1,col).Value] = col
                rowNumbers = range(excelTab['beginRowNumber'], excelTab['endRowNumber'] +1)

                if excelTab['tabName'] == 'Attorneys':
                    self.__IngestPersonRows(sht, rowNumbers, excelFieldPositionMatrix, attorneyColumnMap)
                elif excelTab['tabName'] == 'Downgrades':
                    self.__IngestPersonRows(sht, rowNumbers, excelFieldPositionMatrix, downgradeColumnMap)
                elif excelTab['tabName'] == 'Split Role Attorneys':
                    self.__IngestSplitRoleRows(sht, rowNumbers, excelFieldPositionMatrix)
                else:
                    print(f"ERROR UNKNOWN TAB! {excelTab['tabName']}  HAVE NEEDED TAB NAMES CHANGED?")
        finally:
            xlBook.Close()

    def __IngestPersonRows(self, sht, rowNumbers, excelFieldPositionMatrix, columnMap):
        """Pseudo-private helper which adds one Person to the MAL people list for every row in rowNumbers, reading the columns named in columnMap."""
        ##  Resolve header text to column numbers once; a missing header raises KeyError here.
        fieldColumns = {fieldName: excelFieldPositionMatrix[header] for fieldName, header in columnMap.items()}
        for row in rowNumbers:
            cellValues = {fieldName: sht.Cells(row,col).Value for fieldName, col in fieldColumns.items()}
            self.malPeopleList.add_person(Person(**cellValues))

    def __IngestSplitRoleRows(self, sht, rowNumbers, excelFieldPositionMatrix):
        """Pseudo-private helper which attaches the parsed 'Dates as Counsel' ranges to the already ingested person matching each row's 'Attorney Row' value."""
        attorneyRowCol = excelFieldPositionMatrix['Attorney Row']
        for row in rowNumbers:
            unique_attorney_row_number = sht.Cells(row,attorneyRowCol).Value
            if unique_attorney_row_number is None:
                ##  A blank key must never be looked up: it would match any person that simply has no row number.
                print(f"WARNING: Split role row {row} has no Attorney Row value.  Skipping.")
                continue
            matchedPerson = self.malPeopleList.search_by_unique_attorney_row_number(unique_attorney_row_number)
            if not matchedPerson:
                continue
            datesAsCounselValue = sht.Cells(row,excelFieldPositionMatrix['Dates as Counsel']).Value
            if not isinstance(datesAsCounselValue, str):
                print(f"WARNING: Split role row {row} has no text in Dates as Counsel.  Skipping.")
                continue
            matchedPerson.dates_as_counsel = self._ParseDatesAsCounsel(datesAsCounselValue)

    @staticmethod
    def _ParseDatesAsCounsel(datesAsCounselValue):
        """Turns a 'Dates as Counsel' cell value into a list of (startdate,enddate) string tuples.

        Only the first line of the value is used.  Ranges are separated by semicolons and each range
        is 'start-end'.  Non-date words (current, present, etc) are allowed; both sides are stripped
        and forced to uppercase.  Blank ranges (for example after a trailing semicolon) are skipped.
        Raises ValueError when a range does not split into exactly a start and an end."""
        ##  First get rid of any extra data that is on a new line.  Note that they shouldnt be seperating the date ranges by newline.
        firstLine = datesAsCounselValue.split("\n")[0]
        datesAsCounselList = []
        ##  Next split the ranges correctly by semicolon
        for dateRange in firstLine.split(";"):
            if not dateRange.strip():
                continue
            rangeParts = dateRange.split("-")
            if len(rangeParts) != 2:
                raise ValueError(f"Cannot split date range {dateRange!r} into a start and an end.")
            counselStartDate, counselEndDate = (part.upper().strip() for part in rangeParts)
            datesAsCounselList.append((counselStartDate,counselEndDate))
        return datesAsCounselList

    def __SaveMalToPkl(self, pklFileName):
        """Pseudo-private method which will save the current MAL people list object to a pkl file, for performance reasons."""
        with open(pklFileName,'wb') as outputFile:
            pickle.dump(self.malPeopleList,outputFile)

    def __LoadMalFromPkl(self, pklFileName):
        """Pseudo-private method which will load a MAL people list object from a pkl file, for performance reasons."""
        ##  NOTE(review): pickle.load can execute arbitrary code from the file.  Only point this at
        ##  pkl files this tool wrote itself.
        with open(pklFileName, 'rb') as contents:
            return pickle.load(contents)

    def __LoadFullNameOverideValues(self, fullNameOveridesFileName):
        """Pseudo-private method which will update the MAL people list object with the full name overide values.

        Each non-blank line must be 'emailAddress|fullNameOverideValue'.  Blank lines are skipped.
        Raises ValueError, naming the line number, for any other line."""
        with open(fullNameOveridesFileName) as contents:
            for lineNumber, line in enumerate(contents, start=1):
                line = line.replace("\n","")
                if not line.strip():
                    continue
                lineParts = line.split("|")
                if len(lineParts) != 2:
                    raise ValueError(f"Line {lineNumber} of {fullNameOveridesFileName} is not in emailAddress|fullName format: {line!r}")
                emailAddress, fullNameOverideValue = lineParts
                self.malPeopleList.update_full_Name_overide(emailAddress, fullNameOverideValue)

    def SmartDedupeSet(self, currentSet):
        """A method that attempts to do some additional deduplication of the values in a set by lowering all values and deduplicating.  Returns a lowered deduplicated set."""
        return {val.lower() for val in currentSet}

    def RunMalEmailAddressIntegrityCheck(self):
        """This method performs an integrity check on the MAL by analyzing and looking for duplicate email addresses.

        Every work and alt work address is compared against all addresses seen earlier in the list,
        in both columns, and each duplicate is printed."""
        seenWorkAddresses = set()
        seenAltAddresses = set()
        print("Performing MAL email address integrity check...")
        for person in self.malPeopleList.people:
            altAddr = person.alt_work_email_address
            workAddr = person.work_email_address
            ##  The alt address is checked first, so a person whose two addresses are identical is reported too.
            if altAddr is not None:
                altAddr = altAddr.strip()
                if altAddr in seenWorkAddresses:
                    print(f"ISSUE:{altAddr} is a dupe of an workAddr.")
                if altAddr in seenAltAddresses:
                    print(f"ISSUE:{altAddr} is a dupe!")
                else:
                    seenAltAddresses.add(altAddr)
            if workAddr is not None:
                workAddr = workAddr.strip()
                if workAddr in seenAltAddresses:
                    print(f"ISSUE:{workAddr} is a dupe of an altAddr.")
                if workAddr in seenWorkAddresses:
                    print(f"ISSUE:{workAddr} is a dupe!")
                else:
                    seenWorkAddresses.add(workAddr)
        print("\nEmail address integrity check complete.\n\n")

    def RunMalEmailOutsideEmailFieldsIntegrityCheck(self):
        """This method performs an integrity check on the MAL by looking for email addresses that exist in fields other than the email address fields."""
        ##  Right now this looks for the @ symbol.
        ##  Editable list of fields that should be excluded from this test, especially those that should already have email addresses
        fieldsToExcludeList = ['work_email_address', 'alt_work_email_address','_id','dates_as_counsel','unique_attorney_row_number']
        print("Performing MAL email addresses outside of email address fields integrity check...")
        fieldsToSearchList = [f.name for f in fields(Person) if f.name not in fieldsToExcludeList]
        for person in self.malPeopleList.people:
            for fieldName in fieldsToSearchList:
                testValue = getattr(person, fieldName)
                ##  Only text can contain an address; anything else is ignored.
                if isinstance(testValue, str) and "@" in testValue:
                    print(f"ISSUE: The email address {testValue} exists in the non-email field {fieldName} for unique row# {person.unique_attorney_row_number}.")
        print("\nEmail addresss outside of email fields integrity check complete.\n\n")


    def RunRowNumberIntegrityCheck(self):
        """This method performs an integrity check on the MAL by analyzing the hard-coded row numbers of the attorneys, including split role.

        Prints a warning for every attorney without a row number, then prints the set of numbers
        missing between the lowest and highest row number found (or a message that there are none)."""
        print("Performing MAL hard-coded row number integrity check...")
        attorneyPeopleList = self.malPeopleList.return_list_of_matching_values('is_attorney','YES')
        splitRolePeopleList = self.malPeopleList.return_list_of_matching_values('is_attorney','SPLIT ROLE')
        ##  Creating a third list using the newer list joining from pep 448
        fullAttorneyPeopleList = [*attorneyPeopleList,*splitRolePeopleList]
        print(f"Analyzing all {len(fullAttorneyPeopleList)} attorneys items...")
        ##  Gather all attorneys and add hc row number to a list, looking for any that are missing a value.
        ##  The list is created once, before the loop, so that every attorney's number is kept.
        hcRowNumberList = []
        for attorneyPerson in fullAttorneyPeopleList:
            hcRowNumber = attorneyPerson.unique_attorney_row_number
            if hcRowNumber is None:
                print(f"WARNING: Empty hard coded row number for {attorneyPerson.first_name} {attorneyPerson.last_name} in the Attorneys Tab.")
            else:
                hcRowNumberList.append(int(hcRowNumber))
        if not hcRowNumberList:
            print("WARNING: No hard coded row numbers were found for any attorney.  Nothing to analyze.")
            return
        ##  Next export a list of the missing numbers
        hcRowNumberList.sort()
        compareSet = set(range(hcRowNumberList[0], hcRowNumberList[-1]))
        attorneyDiffs = compareSet - set(hcRowNumberList)
        if attorneyDiffs:
            print(attorneyDiffs)
        else:
            print("There are no gaps in the hard coded row numbers in the Attorneys tab.")


## Running this module directly does nothing; it is meant to be imported as a library.
## The commented-out lines that follow are a retained example driver and are never executed.
if __name__ == '__main__':
    pass
##    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\VEAS\VEAS_Log_Data_Export_Converted.txt"
##    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
##    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\TEST-PLOG.txt"
##    #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
##    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
##    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\TEST.txt"
##    #masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List  2024.11.06(7045550.3).xlsx"
##    masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List  2024.12.1(7045413.15).xlsx"
##    #masterAttorneyListFileName = r"C:\Test_Dir\Amazon\TEST-MAL.xlsx"
##    #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides.txt"
##    #fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\FullNameOverides - Copy.txt"
##    fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
##    
##
##
##    nv = NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)
##    #nv.malPeopleList.list_people()
##
##    qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)
##    print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
##    #print(nv.malPeopleList.search_by_email('crespojp@amazon.com'.upper()))
##    workList = qcP.metadataValuesDict.keys()
##    outputFile = open(r"C:\Test_Dir\Amazon\NameNormOutputText.txt",'w')
##    for docID in workList:
##        #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
##        #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
##        #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['ccValues']
##        #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['ccValues']
##        #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['bccValues']
##        #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['bccValues']
##        metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['fromValues']
##        formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['fromValues']
##        formattedAttorneyValues = set()
##        for formattedValue in formattedFieldValues:
##            if "*" in formattedValue:
##                formattedAttorneyValues.add(formattedValue.upper())
##        
##        if metadataFieldValues:
##            matchedMetadataValues = set()
##            for nameItem in metadataFieldValues:
##                ## First test to see if there is a valid email address.
##                resultSet = set()
##                results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, nameItem)
##                if results:
##                    for result in results:
##                        resultSet.add(result)
##                    if len(resultSet) >1:
##                        resultSet = nv.SmartDedupeSet(resultSet)
##                    if len(resultSet) >1:
##                        print("ERROR multiple email **unique** email addresses in one item.")
##                        print(resultSet)
##                        print("\n")
##                    else:
##                        personMatch = nv.malPeopleList.search_by_email(resultSet.pop().upper())
##                        if personMatch:
##                            if personMatch.full_name_overide:
##                                fullName = personMatch.full_name_overide
##                            elif personMatch.full_name_preferred:
##                                #print(personMatch.full_name_preferred)
##                                ##  Going to need to do a bit of replacing to remove some information that is just never in the formatted.
##                                fullPreferredName = personMatch.full_name_preferred
##                                fullPreferredName = fullPreferredName.replace('(LEGAL)','')
##                                fullPreferredName = fullPreferredName.replace('(SHE, HER)','')
##                                fullPreferredName = fullPreferredName.replace('(SHE HER)','')
##                                preferedLastName, preferedFirstName = fullPreferredName.split(',')
##                                preferedLastName = preferedLastName.strip()
##                                preferedFirstName = preferedFirstName.strip()
##                                preferedFirstName = preferedFirstName.split(" ")[0]
##                                fullName = f"{preferedFirstName} {preferedLastName}"
##                                #fullName = f"{preferedLastName}, {preferedFirstName}"
##                            else:
##                                fullName = f"{personMatch.first_name} {personMatch.last_name}"
##                                #fullName = f"{personMatch.last_name}, {personMatch.first_name}"
##                            if personMatch.is_attorney == 'YES':
##                                #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name}* ({personMatch.work_email_address.split('@')[-1]})\n")
##                                matchedMetadataValues.add(f"{fullName}* ({personMatch.work_email_address.split('@')[-1]})")
##                            else:
##                                #outputFile.write(f"{docID} has match {personMatch.first_name} {personMatch.last_name} ({personMatch.work_email_address.split('@')[-1]})\n")
##                                matchedMetadataValues.add(f"{fullName} ({personMatch.work_email_address.split('@')[-1]})")
##                else:
##                    outputFile.write(f"{docID} contains a non-email item {nameItem}\n\n")
##            missingFromFormatted = matchedMetadataValues - formattedAttorneyValues
##            missingFromMeta = formattedAttorneyValues - matchedMetadataValues
##            if missingFromFormatted:
##                for missingItem in missingFromFormatted:
##                    outputFile.write(f"{docID} has {missingItem} missing from the formatted field\n")
##            if missingFromMeta:
##                for missingItem in missingFromMeta:
##                    outputFile.write(f"{docID} has {missingItem} missing from the metadata field\n")
##            if missingFromFormatted:
##                outputFile.write("\n")
##            elif missingFromMeta:
##                outputFile.write("\n")
##    outputFile.close()

# Revision:	877
# Committed:	Fri Jan 10 22:03:53 2025 UTC (14 months, 2 weeks ago) by nino.borges
# Content type:	text/x-python
# File size:	32676 byte(s)
# Log Message:	Adds a new integrity check that will look for email addresses outside of email address fields in the MAL. Also updated the existing email address integrity check to do a better job of finding duplicates across work email and alt email.