Active_prgs/Redgrave/Amazon_PerformDeepNamesNormQC.py

"""

Amazon_PerformDeepNamesNormQC

Created by:
Emanuel Borges
12.11.2024

This program is similar to Amazon_PerformNamesNormQC, but it performs a deeper level of names norm QC. I may just replace Amazon_PerformNamesNormQC with this file, but for now I'd
like to keep both.

"""

import os, re, datetime, calendar
from uuid import UUID
import MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC
import MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC

version = '0.10.0'

issuesMatrix = {}

def GatherAllPossibleVariations(personMatch):
    """Takes a personMatch, which is the results of a person match, and builds every (fullname, parenthetical)
    pair that may exist in the formatted field for that person.

    personMatch must expose these attributes: work_email_address, alt_work_email_address, full_name_overide,
    full_name_preferred, first_name and last_name.  Any of them may be empty/None.

    Candidate names are produced, in this order, from:
      1. full_name_overide, used as-is;
      2. full_name_preferred ("LAST, FIRST ..."), rewritten as "FIRST LAST" using only the first token of the first name;
      3. first_name + last_name (or last_name alone when there is no first name).
    Each candidate name is paired with every known email domain.  If the person has no email address there are
    no domains, so the result is empty.

    returns deduplicated list of tuple pairs (fullname, parenthetical), in first-seen order."""
    ##  Gather the domain of each populated email address (work first, then alt).
    allDomainsList = []
    for emailAddress in (personMatch.work_email_address, personMatch.alt_work_email_address):
        if not emailAddress:
            continue
        allDomainsList.append(emailAddress.split('@')[-1])
        ##  After talking to Eli, we decided that all of these amazon.com.uk or amazon.it are related domains,
        ##  so we should feel confident that we can add amazon.com to the list of possible domains.
        ##  NOTE: this test is case sensitive, so it only fires for upper-cased addresses.
        if "@AMAZON." in emailAddress:
            allDomainsList.append("AMAZON.COM")
    ##  dict.fromkeys deduplicates while keeping the original order.
    allDomainsList = list(dict.fromkeys(allDomainsList))

    ##  Now gather every candidate full name, in priority order.
    candidateNamesList = []
    if personMatch.full_name_overide:
        candidateNamesList.append(personMatch.full_name_overide)
    if personMatch.full_name_preferred:
        ##  Going to need to do a bit of replacing to remove some information that is just never in the formatted.
        fullPreferredName = personMatch.full_name_preferred
        for neverInFormatted in ('(LEGAL)', '(SHE, HER)', '(SHE HER)'):
            fullPreferredName = fullPreferredName.replace(neverInFormatted, '')
        if "," in fullPreferredName:
            ##  Only the first two comma separated parts are used.  Unpacking split(',') straight into two names
            ##  raised ValueError whenever an extra comma survived the clean up above (for example "(HE, HIM)").
            nameParts = fullPreferredName.split(',')
            preferedLastName = nameParts[0].strip()
            ##  Keep just the first token of the first name, which drops middle names and trailing parentheticals.
            preferedFirstName = nameParts[1].strip().split(" ")[0]
            candidateNamesList.append(f"{preferedFirstName} {preferedLastName}")
        else:
            print(f"ERROR in this name {fullPreferredName}")
    if personMatch.last_name:
        if personMatch.first_name:
            candidateNamesList.append(f"{personMatch.first_name} {personMatch.last_name}")
        else:
            candidateNamesList.append(f"{personMatch.last_name}")

    ##  Pair every name with every domain (names outer, domains inner).
    allPossibleVariationsList = [(fullName, domain) for fullName in candidateNamesList for domain in allDomainsList]

    ##  Now return a deduplicated list by using dict to deduplicate.
    return list(dict.fromkeys(allPossibleVariationsList))


def AddToIssuesList(docID,issueMessage):
    """This function will add a single issue to the issues matrix.

    docID is the key in the module-level issuesMatrix dictionary and issueMessage is appended to the list of
    messages stored under that key.  The list is created the first time a docID is seen.  Returns None."""
    ##  setdefault does the lookup-or-create in one hashed step, rather than building and scanning a list of every key.
    issuesMatrix.setdefault(docID, []).append(issueMessage)
    

if __name__ == '__main__':
    ##  Script entry point.  For every document, walk the metadata To values, try to find each person in the
    ##  Master Attorney List (MAL) (first by email address, then by parsed first/last name) and compare what the
    ##  MAL says against the formatted To values.  Anything questionable is recorded with AddToIssuesList.
    ##  In the formatted values a name looks like "NAME (DOMAIN)", or "NAME* (DOMAIN)" when it carries the "*" designation.
    ##  NOTE(review): the input paths below are hard coded to one workstation.  outputFileName is not used anywhere
    ##  in the portion of the script reviewed here - confirm it is used further down.
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
    masterAttorneyListFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Amazon_ Master Attorney List  2024.12.12(20241212-1151).xlsx"
    fullNameOveridesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Consilio\VEAS-MasterAttorneyList\FullNameOverides.txt"
    outputFileName = r"C:\Test_Dir\Amazon\NameNormDeepOutputText.txt"


    ##  nv provides the MAL people list (nv.malPeopleList) and the SmartDedupeSet helper used below.
    nv = MyCode.Active_prgs.Redgrave.Amazon_NamesNormQC.NamesVerification(cleanedDatExportFileName, masterAttorneyListFileName, fullNameOveridesFileName)

    ##  qcP provides the per-document metadata, formatted and additional (date) values, plus the email regex pattern.
    qcP = MyCode.Active_prgs.Redgrave.Amazon_PrivLogQC.QcPrivLog(cleanedDatExportFileName)

    #issuesMatrix = {}

    print(f"\nThere are {len(qcP.formattedValuesDict)} documents in the formatted values dictionary.")
    print(f"There are {len(qcP.metadataValuesDict)} documents in the metadata values dictionary.")

    ##  The metadata dictionary drives the loop; the same docID is then looked up in the formatted dictionary.
    workList = qcP.metadataValuesDict.keys()
    for docID in workList:
        ##  Only the 'toValues' field is compared for now (see the TODOs about the hard coded "To Field").
        metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['toValues']
        formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['toValues']
        #metadataFieldValues = qcP.metadataValuesDict[docID]._asdict()['docAuthor']
        #formattedFieldValues = qcP.formattedValuesDict[docID]._asdict()['docAuthor']
        ##  Remember to convert all values in formattedFieldValues to uppercase (perhaps eventually do some of the formatted cleaning that Eli mentioned).
        ##  This also makes a new list, so the remove() calls below work on a per-document working copy.
        formattedFieldValues = [xVal.upper() for xVal in formattedFieldValues]
        ##  This will change once you start iterating across all of the field values names.
        currentMetadataValues = metadataFieldValues
        for val in currentMetadataValues:
            ##  First try to locate an email address in this val and if found, try to find that in the MAL.
            results = re.findall(qcP.allPossibleEmailAddressesRegExPattern, val)
            if results:
                ##  Use some smart deduplication to remove duplicates.
                results = nv.SmartDedupeSet(results)
                if len(results) > 1:
                    print(f"WARNING: more than one unique email address found in this value: {results}")
                for result in results:
                    ##  Try to find a match in the MAL by email. There shouldn't be rows with duplicative email addresses.
                    ##  TODO:DONE: Update search_by_email to search both workemail and alt email.
                    
                    personMatch = nv.malPeopleList.search_by_email(result.upper())
                    if personMatch:
                        ##  Person match found in MAL.  Now try to match a value in the formatted field by pulling various values from the MAL.
                        ##  For each of these match attempts, try using the correct designation and incorrect designation (* vs no *) and note that.
                        allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
                        ##  matchFlag records whether any variation for this email was found in the formatted values.
                        matchFlag = False
                        if allPossibleVariationsList:
                            for variationPair in allPossibleVariationsList:
                                ##  variationPair is (full name, domain).  is_attorney is 'YES', 'NO', or anything else (treated as a split role).
                                if personMatch.is_attorney == 'YES':
                                    if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                        ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                        if matchFlag:
                                            print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")

                                        formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                        matchFlag = True

                                        
                                    elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                        ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                        if matchFlag:
                                            print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                        formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                        matchFlag = True
                                        ##  TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                                        AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
                                        

                                elif personMatch.is_attorney == 'NO':
                                    ##  Mirror image of the 'YES' case: here the un-starred form is the good match and the starred form is flagged.
                                    if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                        if matchFlag:
                                            print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                        ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                        formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                        matchFlag = True
                                    elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                        ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                        if matchFlag:
                                            print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                        formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                        matchFlag = True
                                        ##  TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                                        AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
                                else:
                                    ##  This means they are a split role, so additional work will need to be done with the dates.
                                    ##  First, determine if this document date is between the dates where this person was an attorney.
                                    ##  NOTE(review): this date work does not depend on variationPair, yet it is repeated for every variation.
                                    wasAttorneyAtThatTime = False
                                    documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
                                    ##  The document date must be in m/d/Y form; strptime raises ValueError otherwise.
                                    documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
                                    #print(f"\ndocumentDateValue is {documentDateValue}")
                                    personWasAttorneyDates = personMatch.dates_as_counsel
                                    for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
                                        #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
                                        ##  A start date with fewer than two slashes (month/year) is given day 1, i.e. the first of that month.
                                        if wasAttorneyStartDate.count("/") < 2:
                                            wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
                                        wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
                                            
                                        ##  Open ended ranges ("CURRENT" / "PRESENT") end today.
                                        if wasAttorneyEndDate == "CURRENT":
                                            wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
                                        elif wasAttorneyEndDate == "PRESENT":
                                            wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
                                        ##  An end date with fewer than two slashes (month/year) is given the last day of that month;
                                        ##  calendar.monthrange(year, month)[1] is the number of days in the month.
                                        if wasAttorneyEndDate.count("/") < 2:
                                            missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
                                            wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
                                        wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
                                        
                                        #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
                                        ##  Both ends of the range are inclusive; any one matching range is enough.
                                        if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
                                            wasAttorneyAtThatTime = True
                                            
    ##                                if wasAttorneyAtThatTime:
    ##                                    print("Person WAS attorney at this doc date.")
    ##                                else:
    ##                                    print("Person WAS NOT attorney at this doc date.")
                                            
                                    ##  Person's role at the time of the document has been determined, so now do the same checks as above.
                                    if wasAttorneyAtThatTime:
                                        if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                            ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                            if matchFlag:
                                                print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                            formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                            matchFlag = True
                                        elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                            ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                            if matchFlag:
                                                print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                            formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                            matchFlag = True
                                            ##  TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                                            AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is High Confidence Potential Upgrade")
                                            
                                    else:
                                        if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                            if matchFlag:
                                                print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                            ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                            formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                            matchFlag = True
                                        elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                            ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                            if matchFlag:
                                                print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                            formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                            matchFlag = True
                                            ##  TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                                            AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is High Confidence Potential downgrade")
                                        
                        ##  Test the matchFlag here (as you are outside of the for loop) and if it's false but the person is an attorney, FLAG AN ATTORNEY found in meta but MISSING FROM FORMATTED
                        ##  Only is_attorney == 'YES' is flagged; a split role person with no formatted match is not reported here.
                        if matchFlag:
                            pass
                        else:
                            if personMatch.is_attorney == 'YES':
                                AddToIssuesList(docID,f"{val} in Metadata To Field and did not directly match value in formatted however this is a HIGH Confidence Potential Attorney")
                                
                    else:
                        ##  Person match, using email, not found in MAL.
                        ##  Try extracting a name from this metadata value and try matching the MAL using that.
                        ##  NOTE(review): val is the loop variable of the enclosing "for val" loop and is overwritten here while still
                        ##  inside "for result in results", so a later result from the same value sees the upper-cased / stripped
                        ##  text (and origVal is taken from it) - confirm this is intended.
                        val = val.upper()
                        origVal = val
                        ##  First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
                        if "(LEGAL)" in val:
                            ##  Attempt to only remove the email parenthetical, including the now empty paren.
                            val = val.replace(result.upper(),"")
                            val = val.replace("()",'')
                            #val = val.replace(")","")
                        else:
                            ##  Remove all parentheticals, including any character in that paren, from value.
                            val = re.sub(r"\([^)]*\)","",val)
                            
                        val = val.strip()
                        ##  with the email address and the paren stripped out of the val, only move forward if anything still exists.
                        if val:
                            ##  NOTE(review): when val has neither a comma nor a space, neither branch below runs, so lastName and
                            ##  firstName are either unbound (NameError) or left over from an earlier value.  A val with more than
                            ##  one comma raises ValueError on the two-name unpack.  Both cases look unintended - confirm.
                            ##  if there is a comma, parse to last name, first name
                            if "," in val:
                                lastName, firstName = val.split(",")
                                lastName = lastName.strip()
                                firstName = firstName.strip()
                            elif " " in val:
                                ##  For now, split on the first space only: the text before it is the first name and everything after it is the last name.
                                firstName, lastName = val.split(" ",1)
                            ##  With the name now parsed, try searching for all values that match on the last name.

                            personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
                            if personMatchList:
                                ##  Keyed by the person's _id; only the keys are used (the value 1 is a placeholder).
                                possiblePeopleMatchesMatrix = {}
                                ##  For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
                                for personMatch in personMatchList:
                                    if personMatch.first_name == firstName:
                                        ##  This is a personMatch that matches the first and last name
                                        possiblePeopleMatchesMatrix[personMatch._id] = 1
                                if possiblePeopleMatchesMatrix.keys():
                                    ##  If the list of possible matches is just 1, we are okay doing a simple match attempt.  if more than 1, we need to test for conflicting designations in the list of possible matches.
                                    if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
                                        ##  I can grab the single matching value here because I've confirmed there is just 1.  if you do something similar for where there are more, change this next line.
                                        personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())
 
                                        allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
                                        
                                        ##  Same comparison as the email path above, but a name-only match is reported as Low Confidence.
                                        matchFlag = False
                                        if allPossibleVariationsList:
                                            for variationPair in allPossibleVariationsList:
                                                if personMatch.is_attorney == 'YES':
                                                    ##  NOTE(review): debug print hard wired to one last name - presumably left over from troubleshooting.
                                                    if personMatch.last_name == "MANEK":
                                                        print(variationPair)
                                                    if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                                        ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                                        if matchFlag:
                                                            print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
                                                        formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                                        matchFlag = True
                                                    elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                                        ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                                        if matchFlag:
                                                            print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
                                                        formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                                        matchFlag = True
                                                        ##  TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                                                        AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
                                                    else:
                                                        ##  This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
                                                        ##  NOTE(review): this else runs once per variation that does not match, so the same origVal message can be
                                                        ##  added several times, and is added even when another variation does match.  The email path instead
                                                        ##  checks matchFlag once after the loop - confirm which behavior is wanted.
                                                        #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
                                                        AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")

                                                elif personMatch.is_attorney == 'NO':
                                                    if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                                        if matchFlag:
                                                            print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                        ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                                        formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                                        matchFlag = True
                                                    elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                                        ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                                        if matchFlag:
                                                            print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                        formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                                        matchFlag = True
                                                        ##  TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                                                        AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
                                                else:
                                                    ##  This means they are a split role, so additional work will need to be done with the dates.
                                                    ##  First, determine if this document date is between the dates where this person was an attorney.
                                                    ##  The date handling below is the same as in the email path above.
                                                    wasAttorneyAtThatTime = False
                                                    documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
                                                    documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
                                                    #print(f"\ndocumentDateValue is {documentDateValue}")
                                                    personWasAttorneyDates = personMatch.dates_as_counsel
                                                    for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
                                                        #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
                                                        ##  A month/year start date is given day 1.
                                                        if wasAttorneyStartDate.count("/") < 2:
                                                            wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
                                                        wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
                                                            
                                                        ##  Open ended ranges ("CURRENT" / "PRESENT") end today.
                                                        if wasAttorneyEndDate == "CURRENT":
                                                            wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
                                                        elif wasAttorneyEndDate == "PRESENT":
                                                            wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
                                                        ##  A month/year end date is given the last day of that month (calendar.monthrange(year, month)[1]).
                                                        if wasAttorneyEndDate.count("/") < 2:
                                                            missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
                                                            wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
                                                        wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
                                                        
                                                        #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
                                                        ##  Both ends of the range are inclusive.
                                                        if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
                                                            wasAttorneyAtThatTime = True
                                                            
                    ##                                if wasAttorneyAtThatTime:
                    ##                                    print("Person WAS attorney at this doc date.")
                    ##                                else:
                    ##                                    print("Person WAS NOT attorney at this doc date.")
                                                            
                                                    ##  Person's role at the time of the document has been determined, so now do the same checks as above.
                                                    if wasAttorneyAtThatTime:
                                                        if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                                            ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                                            if matchFlag:
                                                                print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                            formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                                            matchFlag = True
                                                        elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                                            ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                                            if matchFlag:
                                                                print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                            formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                                            matchFlag = True
                                                            ##  TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                                                            AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
                                                            
                                                    else:
                                                        if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                                            if matchFlag:
                                                                print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                            ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                                            formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                                            matchFlag = True
                                                        elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                                            ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                                            if matchFlag:
                                                                print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                            formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                                            matchFlag = True
                                                            ##  TODO: change the hard coded "To Field" here once you change to iterate over all field groups.
                                                            AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
                                    else:
                                        ##  More than one MAL person shares this first and last name; for now this is only printed, nothing is added to the issues matrix.
                                        print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
                                        ##  TODO: Add support here for more than one first name last name match in MAL.
                                        ##  ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
                                        ##    Don't try to compare these against the formatted because that could lead to eliminating positives or false positives.
                            
                            else:
                                ##  No MAL person has this last name, so nothing is recorded for this value.
                                ##  TODO: Need to ask Eli whether values that find no match on first and last name need to be flagged.
                                #AddToIssuesList(docID,f"first name: {firstName} - last name: {lastName} is an email in metadata that I couldnt match in MAL")
                                pass

            else:
                ##  No email address could be extracted from this val. Try extracting a name from this metadata value and try matching the MAL using that.
                #AddToIssuesList(docID,f"{val} is a value in metadata that I couldnt extract an email address from")
                val = val.upper()
                origVal = val
                ##  First lets try dealing with the extra parentheticals that keep coming up but make sure to handle (LEGAL) differently.
                if "(LEGAL)" in val:
                    pass
                else:
                    ##  Remove all parenthicals, including any character in that paren, from value.
                    val = re.sub(r"\([^)]*\)","",val)
                    
                val = val.strip()
                ##  with the paren information stripped out of the val, only move forward if anything still exists.
                if val:
                    ##  if there is a comma, parse to last name, first name
                    if "," in val:
                        lastName, firstName = val.split(",")
                        lastName = lastName.strip()
                        firstName = firstName.strip()
                    elif " " in val:
                        ##  For now, try just splitting by the first space and take everything after as the first name.
                        firstName, lastName = val.split(" ",1)
                    ##  With the name now parse, try searching for all values that match on the last name.
                    
                    personMatchList = nv.malPeopleList.return_list_of_matching_values('last_name',lastName)
                    if personMatchList:
                        possiblePeopleMatchesMatrix = {}
                        ##  For each personMatch in the list, now attempt to also see if the first name matches and, if so, put that into the possible people list
                        for personMatch in personMatchList:
                            if personMatch.first_name == firstName:
                                ##  This is a personMatch that matches the first and last name
                                possiblePeopleMatchesMatrix[personMatch._id] = 1
                        if possiblePeopleMatchesMatrix.keys():
                            ##  If the list of possible matches is just 1, we are okay doing a simple match attempt.  if more than 1, we need to test for conflicting designations in the list of possible matches.
                            if len(list(possiblePeopleMatchesMatrix.keys())) < 2:
                                ##  I can grab the single matching value here because I've confirmed there is just 1.  if you do something similar for where there are more, change this next line.
                                personMatch = nv.malPeopleList.search_by_id(list(possiblePeopleMatchesMatrix.keys()).pop())

                                allPossibleVariationsList = GatherAllPossibleVariations(personMatch)
                                matchFlag = False
                                if allPossibleVariationsList:
                                    for variationPair in allPossibleVariationsList:
                                        if personMatch.is_attorney == 'YES':
                                            if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                                ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                                if matchFlag:
                                                    print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
                                                formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                                matchFlag = True
                                            elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                                ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                                if matchFlag:
                                                    print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE name VALUE??")
                                                formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                                matchFlag = True
                                                ##  TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
                                                AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
                                            else:
                                                ##  This means it failed to match a value at all in the formatted field HOWEVER, this is an attorney so we should flag this as a low confidence flag.
                                                #AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")
                                                AddToIssuesList(docID,f"{origVal} in metadata To Field did not directly match value in formatted however is a Low Confidence Potential Attorney")

                                        elif personMatch.is_attorney == 'NO':
                                            if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                                if matchFlag:
                                                    print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                                formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                                matchFlag = True
                                            elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                                ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                                if matchFlag:
                                                    print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                                matchFlag = True
                                                ##  TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
                                                AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
                                        else:
                                            ##  This means they are a split role, so additional work will need to be done with the dates.
                                            ##  First, determin if this document date is between the dates where this person was an attorney
                                            wasAttorneyAtThatTime = False
                                            documentDateValue = qcP.additionalValuesDict[docID]._asdict()['dateValue']
                                            documentDateValue = datetime.datetime.strptime(documentDateValue,'%m/%d/%Y').date()
                                            #print(f"\ndocumentDateValue is {documentDateValue}")
                                            personWasAttorneyDates = personMatch.dates_as_counsel
                                            for wasAttorneyStartDate,wasAttorneyEndDate in personWasAttorneyDates:
                                                #print(f"Attorney {wasAttorneyStartDate}-{wasAttorneyEndDate}")
                                                if wasAttorneyStartDate.count("/") < 2:
                                                    wasAttorneyStartDate = wasAttorneyStartDate.replace("/","/1/")
                                                wasAttorneyStartDate = datetime.datetime.strptime(wasAttorneyStartDate,'%m/%d/%Y').date()
                                                    
                                                if wasAttorneyEndDate == "CURRENT":
                                                    wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
                                                elif wasAttorneyEndDate == "PRESENT":
                                                    wasAttorneyEndDate = datetime.datetime.today().strftime('%m/%d/%Y')
                                                if wasAttorneyEndDate.count("/") < 2:
                                                    missingDayValue = calendar.monthrange(int(wasAttorneyEndDate.split("/")[1]),int(wasAttorneyEndDate.split("/")[0]))[1]
                                                    wasAttorneyEndDate = wasAttorneyEndDate.replace("/",f"/{missingDayValue}/")
                                                wasAttorneyEndDate = datetime.datetime.strptime(wasAttorneyEndDate,'%m/%d/%Y').date()
                                                
                                                #print(f"{wasAttorneyStartDate} - {documentDateValue} - {wasAttorneyEndDate}")
                                                if wasAttorneyStartDate <= documentDateValue <= wasAttorneyEndDate:
                                                    wasAttorneyAtThatTime = True
                                                    
            ##                                if wasAttorneyAtThatTime:
            ##                                    print("Person WAS attorney at this doc date.")
            ##                                else:
            ##                                    print("Person WAS NOT attorney at this doc date.")
                                                    
                                            ##  Person's role at the time of the document has been determined, so now do the same checks as above.
                                            if wasAttorneyAtThatTime:
                                                if f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                                    ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                                    if matchFlag:
                                                        print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                    formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                                    matchFlag = True
                                                elif f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                                    ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                                    if matchFlag:
                                                        print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                    formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                                    matchFlag = True
                                                    ##  TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
                                                    AddToIssuesList(docID,f"{variationPair[0]} ({variationPair[1]}) in To Field is Low Confidence Potential Upgrade")
                                                    
                                            else:
                                                if f"{variationPair[0]} ({variationPair[1]})" in formattedFieldValues:
                                                    if matchFlag:
                                                        print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                    ##  This variation was found in the list of formatted values, which is fine, so just remove it.
                                                    formattedFieldValues.remove(f"{variationPair[0]} ({variationPair[1]})")
                                                    matchFlag = True
                                                elif f"{variationPair[0]}* ({variationPair[1]})" in formattedFieldValues:
                                                    ##  This variation was found in the list of formatted values, however it's a bad match, so remove it but also add this to the issuesList.
                                                    if matchFlag:
                                                        print("WARNING: TWO SEPARATE FORMATTED NAME MATCHES FROM A SINGLE EMAIL VALUE??")
                                                    formattedFieldValues.remove(f"{variationPair[0]}* ({variationPair[1]})")
                                                    matchFlag = True
                                                    ##  TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
                                                    AddToIssuesList(docID,f"{variationPair[0]}* ({variationPair[1]}) in To Field is Low Confidence Potential downgrade")
                            else:
                                print(f"WARNING: more than one match in MAL for this first name and last name combo {val}")
                                ##  TODO: Add support here for more than one first name last name match in MAL.
                                ##  ELI said to just test if in the group you have a mix of attorney and non attorney and if so, we should report that as a needs manual check.
                                ##    Dont try to compare these against the formatted because that could lead to eliminating possitive or false positives.


        ##  Since you itterated over the metadata values but didnt itterate over the formatted values, check for any remaining formatted values that exist in the list
        if formattedFieldValues:
            for val in formattedFieldValues:
                ##  TODO: Confirm with Eli but we should only report these remaining values if they have a *
                ##  From Eliu: the Highest risk is the * values because these are the potential overdesignations so yes but in a perfect world we would check both.
                if "*" in val:
                    ##  TODO: change the hard coded "To Field" here once you change to itterate over all field groups.
                    AddToIssuesList(docID,f"{val} in Formatted To Field is an attorney but couldnt be matched to any value in metadata field.")


    ##  Now just unpack and write the issues, per DocID, to the output file separated by semicolon.
    outputFile = open(outputFileName,'w')
    for docID in list(issuesMatrix.keys()):
        outputFile.write(f"{docID}|{';'.join(issuesMatrix[docID])}\n")
    outputFile.close()
Revision:	865
Committed:	Sat Dec 14 00:36:53 2024 UTC (15 months, 1 week ago) by nino.borges
Content type:	text/x-python
File size:	50535 byte(s)
Log Message:	Updated the variations function to always add AMAZON.COM as a possible domain when the email address contains "@amazon." (e.g., amazon.co.uk, amazon.it), because Eli said I can be confident that these are related domains and the same email name will show up under each of them. This should increase my matches.