Active_prgs/Redgrave/Amazon_PrivLogQC.py

"""

Amazon-PrivLogQC

Created by:
Emanuel Borges
11.19.2024

This program will assist with the process of performing QC on the Amazon privilege logs.

"""

import os, re
from collections import namedtuple


class QcPrivLog(object):
    """A class for automating the process of performing QC on the Amazon privilege logs, including names normalization analysis.

    The input is a pipe-delimited text export whose first row is the header and whose first column is the DocID.
    For every document five "metadata" fields (from, to, cc, bcc, docAuthor) are paired with their "formatted"
    counterparts so that the number of values held by each side of a pair can be compared.
    """
    version = '0.6.1'

    ##  One entry per compared field.  The column index tuples handed to __init__ must follow this order.
    RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues docAuthor")

    ##  Column layouts (from, to, cc, bcc, docAuthor) used with earlier exports, kept for reference when switching data files:
    ##    CAAG         metadata (30, 34, 36, 38, 29)   formatted (31, 35, 37, 39, 32)
    ##    VEAS-CAAG    metadata (26, 29, 31, 33, 25)   formatted (27, 30, 32, 34, 28)
    ##    FTC-CID      metadata (6, 7, 8, 9, 10)       formatted (3, 5, 2, 4, 11)
    ##    VEAS_custom  metadata (2, 3, 4, 5, 1)        formatted (6, 8, 9, 10, 6)    <-- current default
    ##    CAAG_custom  metadata (3, 4, 5, 6, 2)        formatted (7, 8, 9, 10, 7)


    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8', metadataColumns = (2, 3, 4, 5, 1), formattedColumns = (6, 8, 9, 10, 6)):
        """Initializes the data structures.

        cleanedDatExportFileName should be the full path to the pipe-delimited file.  Assumes the first row of
        the data file is the header and the first column is DocID.

        fileEncoding is passed straight to open().

        metadataColumns and formattedColumns are the zero-based column indexes of the raw metadata fields and of
        their formatted counterparts, each given in the order from, to, cc, bcc, docAuthor.  The defaults are the
        VEAS_custom layout.

        Raises IndexError when the file is empty or when a row has fewer columns than the layout needs.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
        self.recordValuesFieldList = self.RecordValues._fields

        ##  Read inside a with-block so the file handle is released even if reading fails.
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        if not contents:
            raise IndexError(f"{cleanedDatExportFileName} is empty; expected at least a header row.")

        self.cleanedInputDataFileHeader = contents[0]
        ##  Drop the line ending first, otherwise the last column name would carry a trailing newline.
        self.cleanedInputDataFileHeaderList = self.cleanedInputDataFileHeader.replace("\n","").split("|")
        contents = contents[1:]
        print (f"There are {len(contents)} rows of data in this input file.\n\n")

        print ("The data structure will be made of following field pairs:")
        columnPairs = list(zip(metadataColumns, formattedColumns))
        for pairNumber, (metadataIndex, formattedIndex) in enumerate(columnPairs, start = 1):
            ##  The last pair is followed by two blank lines to set the listing apart from later output.
            trailer = "\n\n" if pairNumber == len(columnPairs) else ""
            print(f"{self.cleanedInputDataFileHeaderList[metadataIndex]} | {self.cleanedInputDataFileHeaderList[formattedIndex]}{trailer}")

        for line in contents:
            line = line.replace("\n","")
            if not line:
                ##  A blank line (for example at the very end of the export) holds no record.
                continue
            fields = line.split("|")
            docID = fields[0]
            self.metadataValuesDict[docID] = self.RecordValues(*[self.__SplitAndClean(fields[i]) for i in metadataColumns])
            self.formattedValuesDict[docID] = self.RecordValues(*[self.__SplitAndClean(fields[i]) for i in formattedColumns])

        print("Data structures created.")


    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace.

        Always returns a list; a blank rawVal gives an empty list so callers can rely on one return type.
        """
        if not rawVal:
            return []
        return [x.strip() for x in rawVal.split(delim)]


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count.

        Addresses are compared case-insensitively.  A value without any email address is always counted.  A value
        with addresses is counted once if at least one of its addresses has not been seen yet in this list.
        """
        ##  This should ONLY be used for deduplication for counting and not for true deduplication because which of the duplicate values survives is arbitrary and sometimes the dropped one has more information.
        seenEmails = set()
        dedupedCount = 0
        for item in valuesList:
            itemEmails = {address.upper() for address in re.findall(self.allPossibleEmailAddressesRegExPattern, item)}
            if not itemEmails:
                dedupedCount += 1
                continue
            unseenEmails = itemEmails - seenEmails
            if unseenEmails:
                ##  Count the value once, no matter how many new addresses it contributes.
                dedupedCount += 1
                seenEmails |= unseenEmails
        return dedupedCount


    def __FieldFullValueDedupe(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field using the FULL VALUE.  This was created because there appears to be duplicate values in the formatted fields.  Returns the deduplicated count."""
        ##  Compare in uppercase because of possible uppercase-lowercase issues. These should all be uppercase but there shouldnt have been dups either...
        return len({item.upper() for item in valuesList})


    def __ClassifyCountMismatch(self, docID, fieldName, metadataFieldValues, formattedFieldValues):
        """Pseudo-private method which compares the value counts of one metadata/formatted field pair.

        Returns "redFlag", "warning" or None (counts are acceptable).  Red flags that come from the size of the
        count difference are also echoed to the console as "docID fieldName".
        """
        metadataCount = len(metadataFieldValues)
        formattedCount = len(formattedFieldValues)
        if metadataCount == formattedCount:
            return None

        if metadataCount == 0:
            ##  Have to account for instances where the meta docAuthor is blank because it's an email and the formatted just has the from value in it.
            if fieldName == 'docAuthor' and self.metadataValuesDict[docID].fromValues:
                return None
            return "redFlag"
        if formattedCount == 0:
            return "redFlag"

        ##  try the count again by deduplicating the metadata field values.  Never on the formatted field values.
        deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
        if deduplicatedFieldCount == formattedCount:
            return None

        distanceBetween = abs(deduplicatedFieldCount - formattedCount)
        if deduplicatedFieldCount > 30:
            ##  Fields with more than 30 values tolerate a difference of up to 10 percent of the deduplicated count.
            allowedDistance = (10 * deduplicatedFieldCount)/100
        else:
            ##  Smaller fields tolerate a difference of at most 2 values.
            allowedDistance = 2
        if distanceBetween > allowedDistance:
            print(docID,fieldName)
            return "redFlag"
        return "warning"


    def __WriteMatrixReport(self, reportPath, docMatrix):
        """Pseudo-private method which writes one "docID | (fieldName, ...)" line per document to reportPath, replacing any existing file."""
        with open(reportPath,'w') as reportFile:
            reportFile.writelines(f"{docID} | {tuple(fieldNames)}\n" for docID, fieldNames in docMatrix.items())


    def PerformValueCountChecks(self, countsOnly = True, outputDir = r"C:\Test_Dir\Amazon"):
        """Performs the inital value count checks between the metadata values and formatted values, looking for red flags and warnings.

        By default only reports the numbers on the console.  Set countsOnly to false to also export three
        reports into outputDir: warnings.txt, redFlags.txt and dupesInFormattedFields.txt.  outputDir must
        already exist.  Each report line holds a docID and the tuple of field names that were flagged for it.
        """
        redFlagDocMatrix = {}
        warningDocMatrix = {}
        duplicatesInFormattedMatrix = {}

        for docID, metadataRecord in self.metadataValuesDict.items():
            formattedRecord = self.formattedValuesDict[docID]
            for fieldName in self.recordValuesFieldList:
                metadataFieldValues = getattr(metadataRecord, fieldName)
                formattedFieldValues = getattr(formattedRecord, fieldName)

                severity = self.__ClassifyCountMismatch(docID, fieldName, metadataFieldValues, formattedFieldValues)
                if severity == "redFlag":
                    redFlagDocMatrix.setdefault(docID, []).append(fieldName)
                elif severity == "warning":
                    warningDocMatrix.setdefault(docID, []).append(fieldName)

                ##  Perform a separate check for duplicates in the formatted field.
                if len(formattedFieldValues) != self.__FieldFullValueDedupe(formattedFieldValues):
                    duplicatesInFormattedMatrix.setdefault(docID, []).append(fieldName)

        print(f"There are a total of {len(redFlagDocMatrix)} red flag documents and {len(warningDocMatrix)} warnings where the matching field value counts that do not match.")
        if not countsOnly:
            self.__WriteMatrixReport(os.path.join(outputDir, "warnings.txt"), warningDocMatrix)
            self.__WriteMatrixReport(os.path.join(outputDir, "redFlags.txt"), redFlagDocMatrix)
            self.__WriteMatrixReport(os.path.join(outputDir, "dupesInFormattedFields.txt"), duplicatesInFormattedMatrix)


if __name__ == '__main__':
    ##  Export to run the QC against.  Swap in one of the alternates below to work on a different data set
    ##  (the column indexes used by QcPrivLog have to match the export that is chosen).
    exportPath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\CAAG\CAAG_Log_Data_Export_Converted.txt"
    #exportPath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
    #exportPath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
    #exportPath = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
    #exportPath = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\TEST.txt"

    ## Code Testing
    checker = QcPrivLog(exportPath)

    ##  Spot check of a single document, e.g.:
    ##    print(checker.metadataValuesDict['H55278-0268-003517'].toValues)
    ##    print(checker.formattedValuesDict['H55278-0268-003517'].toValues)

    ##  Print the counts and also write the three report files.
    checker.PerformValueCountChecks(countsOnly = False)
    

Revision:	839
Committed:	Fri Dec 6 21:44:52 2024 UTC (15 months, 2 weeks ago) by nino.borges
Content type:	text/x-python
File size:	15703 byte(s)
Log Message:	No significant changes, just updated to run on a different dat file, so I had to change the field indexes.
#	User	Rev	Content
1	nino.borges	828	"""
2
3			Amazon-PrivLogQC
4
5			Created by:
6			Emanuel Borges
7			11.19.2024
8
9			This program will assist with the process of performing QC on the Amazon privilege logs.
10
11			"""
12
13			import os, re
14			from collections import namedtuple
15
16
17			class QcPrivLog(object):
18			"""A class for automating the process of performing QC on the Amazon privilege logs, including names normalization analysis"""
19	nino.borges	833	version = '0.6.1'
20	nino.borges	828
21
22			def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
23			"""Initializes the data structures. cleanedDatExportFileName should be the full path to the file. Assumes the first row of the data file is the header and first column is DocID."""
24			print("Initializing data structures...")
25			self.metadataValuesDict = {}
26			self.formattedValuesDict = {}
27	nino.borges	830	self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
28	nino.borges	828
29			contents = open(cleanedDatExportFileName,encoding = fileEncoding).readlines()
30			self.cleanedInputDataFileHeader = contents[0]
31	nino.borges	832	self.cleanedInputDataFileHeaderList = self.cleanedInputDataFileHeader.split("\|")
32	nino.borges	828	contents = contents[1:]
33	nino.borges	832	print (f"There are {len(contents)} rows of data in this input file.\n\n")
34	nino.borges	828
35	nino.borges	839	## print (f"The data structure will be made of following field pairs:")
36			## print(f"{self.cleanedInputDataFileHeaderList[26]} \| {self.cleanedInputDataFileHeaderList[27]}")
37			## print(f"{self.cleanedInputDataFileHeaderList[29]} \| {self.cleanedInputDataFileHeaderList[30]}")
38			## print(f"{self.cleanedInputDataFileHeaderList[31]} \| {self.cleanedInputDataFileHeaderList[32]}")
39			## print(f"{self.cleanedInputDataFileHeaderList[33]} \| {self.cleanedInputDataFileHeaderList[34]}")
40			## print(f"{self.cleanedInputDataFileHeaderList[25]} \| {self.cleanedInputDataFileHeaderList[28]}\n\n")
41
42	nino.borges	832	print (f"The data structure will be made of following field pairs:")
43	nino.borges	839	print(f"{self.cleanedInputDataFileHeaderList[2]} \| {self.cleanedInputDataFileHeaderList[6]}")
44			print(f"{self.cleanedInputDataFileHeaderList[3]} \| {self.cleanedInputDataFileHeaderList[8]}")
45			print(f"{self.cleanedInputDataFileHeaderList[4]} \| {self.cleanedInputDataFileHeaderList[9]}")
46			print(f"{self.cleanedInputDataFileHeaderList[5]} \| {self.cleanedInputDataFileHeaderList[10]}")
47			print(f"{self.cleanedInputDataFileHeaderList[1]} \| {self.cleanedInputDataFileHeaderList[6]}\n\n")
48	nino.borges	832
49	nino.borges	831	RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues docAuthor")
50	nino.borges	828	self.recordValuesFieldList = RecordValues._fields
51
52			for line in contents:
53			line = line.replace("\n","")
54			line = line.split("\|")
55			docID = line[0]
56			## TODO: These are hard coded for now but change to column header lookup asap.
57	nino.borges	839	## CAAG
58	nino.borges	833	#self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[30]),self.__SplitAndClean(line[34]),self.__SplitAndClean(line[36]),self.__SplitAndClean(line[38]) ,self.__SplitAndClean(line[29]))
59			#self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[31]),self.__SplitAndClean(line[35]),self.__SplitAndClean(line[37]),self.__SplitAndClean(line[39]),self.__SplitAndClean(line[32]))
60	nino.borges	839	## VEAS-CAAG
61			#self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[26]),self.__SplitAndClean(line[29]),self.__SplitAndClean(line[31]),self.__SplitAndClean(line[33]) ,self.__SplitAndClean(line[25]))
62			#self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[27]),self.__SplitAndClean(line[30]),self.__SplitAndClean(line[32]),self.__SplitAndClean(line[34]),self.__SplitAndClean(line[28]))
63			## FTC-CID
64			#self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[6]),self.__SplitAndClean(line[7]),self.__SplitAndClean(line[8]),self.__SplitAndClean(line[9]) ,self.__SplitAndClean(line[10]))
65			#self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[3]),self.__SplitAndClean(line[5]),self.__SplitAndClean(line[2]),self.__SplitAndClean(line[4]),self.__SplitAndClean(line[11]))
66			## VEAS_custom
67			self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[2]),self.__SplitAndClean(line[3]),self.__SplitAndClean(line[4]),self.__SplitAndClean(line[5]) ,self.__SplitAndClean(line[1]))
68			self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[6]),self.__SplitAndClean(line[8]),self.__SplitAndClean(line[9]),self.__SplitAndClean(line[10]),self.__SplitAndClean(line[6]))
69			## CAAG_custom
70			#self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[3]),self.__SplitAndClean(line[4]),self.__SplitAndClean(line[5]),self.__SplitAndClean(line[6]) ,self.__SplitAndClean(line[2]))
71			#self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[7]),self.__SplitAndClean(line[8]),self.__SplitAndClean(line[9]),self.__SplitAndClean(line[10]),self.__SplitAndClean(line[7]))
72	nino.borges	828
73			print("Data structures created.")
74
75
76
77			def __SplitAndClean(self, rawVal, delim = ";"):
78			"""Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace"""
79	nino.borges	833	if rawVal:
80			newVal = [x.strip() for x in rawVal.split(delim)]
81			else: newVal = ""
82			return newVal
83	nino.borges	828
84
85			def __FieldDedupeByEmailAddress(self, valuesList):
86			"""Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count."""
87	nino.borges	830	## This should ONLY be used for deduplication for counting and not for true deduplication because it removes a duplicate value at random and sometimes this will be the value with more information.
88			## TODO: update this to be case insensitive.
89			tempEmailList = []
90			newList = []
91			for item in valuesList:
92			result = re.findall(self.allPossibleEmailAddressesRegExPattern, item)
93			if result:
94			for r in result:
95			if r.upper() in tempEmailList:
96			pass
97			else:
98			newList.append(item)
99			tempEmailList.append(r.upper())
100			else:
101			newList.append(item)
102			return len(newList)
103
104	nino.borges	828
105	nino.borges	831	def __FieldFullValueDedupe(self, valuesList):
106			"""Pseudo-private method which will attempt to deduplicate a list of values from a specific field using the FULL VALUE. This was created because there appears to be duplicate values int he formatted fields"""
107			## Going to do this the long way because of possible uppercase-lowercase issues. These should all be uppercase but there shouldnt have been dups either...
108			newSet = set()
109			for item in valuesList:
110			newSet.add(item.upper())
111			return len(newSet)
112	nino.borges	828
113	nino.borges	830
114	nino.borges	828	def PerformValueCountChecks(self, countsOnly = True):
115			"""Performs the inital value count checks between the metadata values and formatted values, looking for red flags and warnings. By default reports numbers. Set countsOnly to false to export reports."""
116			workList = self.metadataValuesDict.keys()
117			#misCount = 0
118			#redFlagDocList = []
119			#warningDocList = []
120			#misList = []
121			redFlagDocSet = set()
122	nino.borges	833	redFlagDocMatrix = {}
123	nino.borges	828	warningDocSet = set()
124	nino.borges	833	warningDocMatrix = {}
125	nino.borges	832	#duplicatesInFormattedSet = set()
126			duplicatesInFormattedMatrix = {}
127	nino.borges	828
128			for docID in workList:
129	nino.borges	829	for fieldName in self.recordValuesFieldList:
130			metadataFieldValues = self.metadataValuesDict[docID]._asdict()[fieldName]
131			formattedFieldValues = self.formattedValuesDict[docID]._asdict()[fieldName]
132	nino.borges	833
133	nino.borges	829	if len(metadataFieldValues) - len(formattedFieldValues) == 0:
134			pass
135	nino.borges	828	else:
136	nino.borges	829	if len(metadataFieldValues) == 0:
137	nino.borges	833	## Have to account for instances where the meta docAuthor is blank because it's an email and the formatted just has the from value in it.
138			if fieldName == 'docAuthor':
139			if self.metadataValuesDict[docID].fromValues:
140			pass
141			else:
142			redFlagDocSet.add(docID)
143			#print(docID)
144			try:
145			redFlagDocMatrix[docID].append(fieldName)
146			except KeyError:
147			redFlagDocMatrix[docID] = [fieldName,]
148			else:
149			redFlagDocSet.add(docID)
150			try:
151			redFlagDocMatrix[docID].append(fieldName)
152			except KeyError:
153			redFlagDocMatrix[docID] = [fieldName,]
154	nino.borges	829	elif len(formattedFieldValues) == 0:
155			redFlagDocSet.add(docID)
156	nino.borges	833	try:
157			redFlagDocMatrix[docID].append(fieldName)
158			except KeyError:
159			redFlagDocMatrix[docID] = [fieldName,]
160	nino.borges	829	else:
161	nino.borges	830	## try the count again by deduplicating the metadata field values. Never on the formatted field values.
162			deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
163			if deduplicatedFieldCount - len(formattedFieldValues) == 0:
164			pass
165			else:
166	nino.borges	833	distanceBetween = abs(deduplicatedFieldCount - len(formattedFieldValues))
167			if deduplicatedFieldCount > 30:
168			if distanceBetween > (10 * deduplicatedFieldCount)/100:
169			print(docID,fieldName)
170			redFlagDocSet.add(docID)
171			try:
172			redFlagDocMatrix[docID].append(fieldName)
173			except KeyError:
174			redFlagDocMatrix[docID] = [fieldName,]
175			else:
176			warningDocSet.add(docID)
177			try:
178			warningDocMatrix[docID].append(fieldName)
179			except KeyError:
180			warningDocMatrix[docID]= [fieldName,]
181			else:
182			if distanceBetween > 2:
183			print(docID,fieldName)
184			redFlagDocSet.add(docID)
185			try:
186			redFlagDocMatrix[docID].append(fieldName)
187			except KeyError:
188			redFlagDocMatrix[docID] = [fieldName,]
189			else:
190			warningDocSet.add(docID)
191			try:
192			warningDocMatrix[docID].append(fieldName)
193			except KeyError:
194			warningDocMatrix[docID]= [fieldName,]
195	nino.borges	831
196			## Perform a separate check for duplicates in the formatted field.
197			if len(formattedFieldValues) == self.__FieldFullValueDedupe(formattedFieldValues):
198			pass
199			else:
200	nino.borges	832	try:
201			duplicatesInFormattedMatrix[docID].append(fieldName)
202			except KeyError:
203			duplicatesInFormattedMatrix[docID] = [fieldName,]
204			#duplicatesInFormattedSet.add(docID)
205	nino.borges	829	## if len(self.metadataValuesDict[docID].toValues) - len(self.formattedValuesDict[docID].toValues) == 0:
206			## pass
207			## else:
208			## if len(self.metadataValuesDict[docID].toValues) == 0:
209			## #redFlagDocList.append(docID)
210			## redFlagDocSet.add(docID)
211			## elif len(self.formattedValuesDict[docID].toValues) == 0:
212			## #redFlagDocList.append(docID)
213			## redFlagDocSet.add(docID)
214			## else:
215			## #misCount +=1
216			## #misList.append(docID)
217			## #warningDocList.append(docID)
218			## warningDocSet.add(docID)
219	nino.borges	828
220	nino.borges	830	print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the matching field value counts that do not match.")
221	nino.borges	828	if countsOnly == False:
222			warningsOutputFile = open(r"C:\Test_Dir\Amazon\warnings.txt",'w')
223			redFladsOutputFile = open(r"C:\Test_Dir\Amazon\redFlags.txt",'w')
224	nino.borges	831	duplicatesInFormattedOutputFile = open(r"C:\Test_Dir\Amazon\dupesInFormattedFields.txt",'w')
225	nino.borges	833	for x in warningDocMatrix:
226			warningsOutputFile.write(f"{x} \| {*warningDocMatrix[x],}\n")
227	nino.borges	828	warningsOutputFile.close()
228	nino.borges	833	for y in redFlagDocMatrix:
229			redFladsOutputFile.write(f"{y} \| {*redFlagDocMatrix[y],}\n")
230	nino.borges	828	redFladsOutputFile.close()
231	nino.borges	832	for z in duplicatesInFormattedMatrix:
232			duplicatesInFormattedOutputFile.write(f"{z} \| {*duplicatesInFormattedMatrix[z],}\n")
233	nino.borges	831	duplicatesInFormattedOutputFile.close()
234	nino.borges	828
235
236			if __name__ == '__main__':
237	nino.borges	839	cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\CAAG\CAAG_Log_Data_Export_Converted.txt"
238			#cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
239			#cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
240	nino.borges	833	#cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
241			#cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\TEST.txt"
242	nino.borges	828
243			## Code Testing
244			qcP = QcPrivLog(cleanedDatExportFileName)
245	nino.borges	833	## print(qcP.metadataValuesDict['H55278-0268-003517'].fromValues)
246			## print(qcP.metadataValuesDict['H55278-0268-003517'].toValues)
247			## print(qcP.metadataValuesDict['H55278-0268-003517'].ccValues)
248			## print(qcP.metadataValuesDict['H55278-0268-003517'].bccValues)
249			## print(qcP.formattedValuesDict['H55278-0268-003517'].fromValues)
250			## print(qcP.formattedValuesDict['H55278-0268-003517'].toValues)
251			## print(qcP.formattedValuesDict['H55278-0268-003517'].ccValues)
252			## print(qcP.formattedValuesDict['H55278-0268-003517'].bccValues)
253	nino.borges	828
254			qcP.PerformValueCountChecks(countsOnly = False)
255
256