# Active_prgs/Redgrave/Amazon_PrivLogQC.py

"""

Amazon-PrivLogQC

Created by:
Emanuel Borges
11.19.2024

This program will assist with the process of performing QC on the Amazon privilege logs.

"""

import os, re
from collections import namedtuple
from MyCode.Tool_Box import FileEncodingLib


class QcPrivLog(object):
    """A class for automating the process of performing QC on the Amazon privilege logs, including names normalization analysis.

    An instance loads a pipe-delimited export (first row is the header, first
    column is the DocID) and stores, per DocID, the metadata values, the
    formatted values and a couple of additional values.
    PerformValueCountChecks then compares the number of values in each
    metadata field against the matching formatted field.
    """
    version = '0.7.1'

    ##  Zero-based column positions for the VEAS-CAAG export layout, listed in
    ##  RecordValues order: fromValues, toValues, ccValues, bccValues, docAuthor.
    ##  TODO: These are hard coded for now but change to column header lookup asap.
    _metadataColumns = (26, 29, 31, 33, 25)
    _formattedColumns = (27, 30, 32, 34, 28)
    _dateColumn = 15
    _legalSourceColumn = 40

    ##  Column positions that were used for other export layouts, kept for reference
    ##  (same order: from, to, cc, bcc, author):
    ##      layout         metadata            formatted
    ##      CAAG           30,34,36,38,29      31,35,37,39,32
    ##      FTC-CID        6,7,8,9,10          3,5,2,4,11
    ##      VEAS_custom    2,3,4,5,1           6,8,9,10,6
    ##      CAAG_custom    3,4,5,6,2           7,8,9,10,7

    ##  Folder the reports are written to when the caller does not supply one.
    _defaultOutputDir = r"C:\Test_Dir\Amazon"

    ##  Labels returned by the count classification helper.
    _redFlag = "redFlag"
    _warning = "warning"


    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
        """Initializes the data structures from the export file.

        cleanedDatExportFileName should be the full path to the file and
        fileEncoding the encoding used to read it.  Assumes the first row of
        the data file is the header and first column is DocID.  Rows that are
        completely blank are ignored.  When a DocID appears more than once the
        last row wins, because the values are stored in dictionaries keyed by
        DocID.

        Raises IndexError when the file is empty or when the header or a data
        row has fewer columns than the hard coded column positions require.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}
        self.additionalValuesDict = {}
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

        ##  Read everything up front inside a with-block so the file handle is always released.
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        self.cleanedInputDataFileHeader = contents[0]
        ##  Drop the line ending before splitting, otherwise the last column name keeps a trailing newline.
        self.cleanedInputDataFileHeaderList = self.cleanedInputDataFileHeader.rstrip("\n").split("|")
        ##  A blank line (typically at the end of the file) has no columns to index, so leave it out.
        dataRows = [row for row in contents[1:] if row.strip()]
        print (f"There are {len(dataRows)} rows of data in this input file.\n\n")

        ##  VEAS-CAAG
        headerList = self.cleanedInputDataFileHeaderList
        print ("The data structure will be made of following field pairs:")
        pairLines = [f"{headerList[metaCol]} | {headerList[fmtCol]}"
                     for metaCol, fmtCol in zip(self._metadataColumns, self._formattedColumns)]
        print("\n".join(pairLines) + "\n\n")
        print(f"{headerList[self._dateColumn]} will be used for the date qualifier.")
        print(f"{headerList[self._legalSourceColumn]} will be used as the Legal Source field.")

        RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues docAuthor")
        self.recordValuesFieldList = RecordValues._fields

        AdditionalValues = namedtuple("AdditionalValues","dateValue legalSourceValues")
        self.additionalValuesFieldList = AdditionalValues._fields

        for row in dataRows:
            fields = row.replace("\n","").split("|")
            docID = fields[0]
            self.metadataValuesDict[docID] = RecordValues(*[self.__SplitAndClean(fields[col]) for col in self._metadataColumns])
            self.formattedValuesDict[docID] = RecordValues(*[self.__SplitAndClean(fields[col]) for col in self._formattedColumns])
            ##  The date is kept as the raw string; only the legal source field is split into values.
            self.additionalValuesDict[docID] = AdditionalValues(fields[self._dateColumn], self.__SplitAndClean(fields[self._legalSourceColumn]))

        print("Data structures created.")


    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace from each piece.

        Returns an empty string (not an empty list) when rawVal is empty; the
        rest of the class only relies on len(), truthiness and iteration, which
        behave the same for both.
        """
        ##  NOTE(review): empty pieces (e.g. from a trailing delimiter) are kept and therefore counted as values - confirm this is intended.
        if not rawVal:
            return ""
        return [piece.strip() for piece in rawVal.split(delim)]


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count.

        Addresses are compared in upper case, so the comparison is case
        insensitive.  A value without any email address is always counted.  A
        value is counted once for every address in it that has not been seen
        before.
        """
        ##  This should ONLY be used for deduplication for counting and not for true deduplication because it keeps whichever duplicate comes first and sometimes the dropped value is the one with more information.
        ##  NOTE(review): a value holding several new addresses adds several to the count - confirm that is the desired way to count such values.
        seenAddresses = set()
        keptCount = 0
        for item in valuesList:
            foundAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, item)
            if not foundAddresses:
                keptCount += 1
                continue
            for address in foundAddresses:
                addressKey = address.upper()
                if addressKey not in seenAddresses:
                    seenAddresses.add(addressKey)
                    keptCount += 1
        return keptCount


    def __FieldFullValueDedupe(self, valuesList):
        """Pseudo-private method which returns the number of distinct values in a field using the FULL VALUE, ignoring case.  This was created because there appears to be duplicate values in the formatted fields."""
        ##  Compare in upper case because of possible uppercase-lowercase issues. These should all be uppercase but there shouldnt have been dups either...
        return len({item.upper() for item in valuesList})


    def __ClassifyCountMismatch(self, docID, fieldName):
        """Pseudo-private method which compares the metadata and formatted value counts for one field of one document.

        Returns self._redFlag, self._warning or None (nothing to report).
        Prints the docID and field name when a red flag comes from the
        distance check that runs after deduplicating the metadata values.
        """
        metadataFieldValues = getattr(self.metadataValuesDict[docID], fieldName)
        formattedFieldValues = getattr(self.formattedValuesDict[docID], fieldName)
        metadataCount = len(metadataFieldValues)
        formattedCount = len(formattedFieldValues)

        if metadataCount == formattedCount:
            return None

        if metadataCount == 0:
            ##  Have to account for instances where the meta docAuthor is blank because it's an email and the formatted just has the from value in it.
            if fieldName == 'docAuthor' and self.metadataValuesDict[docID].fromValues:
                return None
            return self._redFlag

        if formattedCount == 0:
            return self._redFlag

        ##  try the count again by deduplicating the metadata field values.  Never on the formatted field values.
        deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
        if deduplicatedFieldCount == formattedCount:
            return None

        distanceBetween = abs(deduplicatedFieldCount - formattedCount)
        ##  Fields with more than 30 values tolerate a difference of up to 10 percent, smaller fields a difference of up to 2.
        if deduplicatedFieldCount > 30:
            allowedDistance = (10 * deduplicatedFieldCount)/100
        else:
            allowedDistance = 2
        if distanceBetween > allowedDistance:
            print(docID,fieldName)
            return self._redFlag
        return self._warning


    def __ReportPath(self, outputDir, reportName):
        """Pseudo-private method which returns the full path for a report file."""
        if outputDir is None:
            ##  Joined with a literal backslash so the default paths stay exactly what earlier versions wrote to.
            return self._defaultOutputDir + "\\" + reportName
        return os.path.join(outputDir, reportName)


    def PerformValueCountChecks(self, countsOnly = True, outputDir = None):
        """Performs the inital value count checks between the metadata values and formatted values, looking for red flags and warnings.

        By default only prints the number of red flag and warning documents.
        Set countsOnly to false to also export three reports (warnings.txt,
        redFlags.txt and dupesInFormattedFields.txt), one line per document in
        the form "docID | (field names)".  outputDir is the existing folder the
        reports are written to; leave it as None to use the folder earlier
        versions of this program always used.  Returns nothing.
        """
        redFlagDocMatrix = {}
        warningDocMatrix = {}
        duplicatesInFormattedMatrix = {}
        matrixByResult = {self._redFlag: redFlagDocMatrix, self._warning: warningDocMatrix}

        for docID in self.metadataValuesDict:
            for fieldName in self.recordValuesFieldList:
                result = self.__ClassifyCountMismatch(docID, fieldName)
                if result is not None:
                    matrixByResult[result].setdefault(docID, []).append(fieldName)

                ##  Perform a separate check for duplicates in the formatted field.
                formattedFieldValues = getattr(self.formattedValuesDict[docID], fieldName)
                if len(formattedFieldValues) != self.__FieldFullValueDedupe(formattedFieldValues):
                    duplicatesInFormattedMatrix.setdefault(docID, []).append(fieldName)

        ##  Every flagged document has exactly one key in its matrix, so the key counts are the document counts.
        print(f"There are a total of {len(redFlagDocMatrix)} red flag documents and {len(warningDocMatrix)} warnings where the matching field value counts that do not match.")
        if not countsOnly:
            reports = (
                ("warnings.txt", warningDocMatrix),
                ("redFlags.txt", redFlagDocMatrix),
                ("dupesInFormattedFields.txt", duplicatesInFormattedMatrix),
            )
            for reportName, matrix in reports:
                ##  with-block so each report is closed even if a write fails.
                with open(self.__ReportPath(outputDir, reportName),'w') as reportFile:
                    for flaggedDocID, fieldNames in matrix.items():
                        reportFile.write(f"{flaggedDocID} | {tuple(fieldNames)}\n")


if __name__ == '__main__':
    ##  Export file used for this run.  The commented-out assignments below are other inputs, kept so they can be swapped back in.
    inputExportPath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
    #inputExportPath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\CAAG\CAAG_Log_Data_Export_Converted.txt"
    #inputExportPath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
    #inputExportPath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
    #inputExportPath = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
    #inputExportPath = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\TEST.txt"

    ## Code Testing
    privLogChecker = QcPrivLog(inputExportPath)

    ##  Spot checks for a single document; uncomment to print its stored values.
##    print(privLogChecker.metadataValuesDict['H55278-0268-003517'].fromValues)
##    print(privLogChecker.metadataValuesDict['H55278-0268-003517'].toValues)
##    print(privLogChecker.metadataValuesDict['H55278-0268-003517'].ccValues)
##    print(privLogChecker.metadataValuesDict['H55278-0268-003517'].bccValues)
##    print(privLogChecker.formattedValuesDict['H55278-0268-003517'].fromValues)
##    print(privLogChecker.formattedValuesDict['H55278-0268-003517'].toValues)
##    print(privLogChecker.formattedValuesDict['H55278-0268-003517'].ccValues)
##    print(privLogChecker.formattedValuesDict['H55278-0268-003517'].bccValues)

    ##  countsOnly is switched off so the report files are exported in addition to the printed totals.
    privLogChecker.PerformValueCountChecks(countsOnly = False)
    

Revision:	852
Committed:	Thu Dec 12 16:16:23 2024 UTC (15 months, 2 weeks ago) by nino.borges
Content type:	text/x-python
File size:	16462 byte(s)
Log Message:	This version adds support for the additionalValuesDict, adding the date value and the legal sources values.
#	User	Rev	Content
1	nino.borges	828	"""
2
3			Amazon-PrivLogQC
4
5			Created by:
6			Emanuel Borges
7			11.19.2024
8
9			This program will assist with the process of performing QC on the Amazon privilege logs.
10
11			"""
12
13			import os, re
14			from collections import namedtuple
15	nino.borges	851	from MyCode.Tool_Box import FileEncodingLib
16	nino.borges	828
17
18			class QcPrivLog(object):
19			"""A class for automating the process of performing QC on the Amazon privilege logs, including names normalization analysis"""
20	nino.borges	852	version = '0.7.1'
21	nino.borges	828
22
23			def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
24			"""Initializes the data structures. cleanedDatExportFileName should be the full path to the file. Assumes the first row of the data file is the header and first column is DocID."""
25			print("Initializing data structures...")
26			self.metadataValuesDict = {}
27			self.formattedValuesDict = {}
28	nino.borges	852	self.additionalValuesDict = {}
29	nino.borges	830	self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
30	nino.borges	828
31			contents = open(cleanedDatExportFileName,encoding = fileEncoding).readlines()
32			self.cleanedInputDataFileHeader = contents[0]
33	nino.borges	832	self.cleanedInputDataFileHeaderList = self.cleanedInputDataFileHeader.split("\|")
34	nino.borges	828	contents = contents[1:]
35	nino.borges	832	print (f"There are {len(contents)} rows of data in this input file.\n\n")
36	nino.borges	828
37	nino.borges	851
38			## VEAS-CAAG
39	nino.borges	832	print (f"The data structure will be made of following field pairs:")
40	nino.borges	851	print(f"{self.cleanedInputDataFileHeaderList[26]} \| {self.cleanedInputDataFileHeaderList[27]}")
41			print(f"{self.cleanedInputDataFileHeaderList[29]} \| {self.cleanedInputDataFileHeaderList[30]}")
42			print(f"{self.cleanedInputDataFileHeaderList[31]} \| {self.cleanedInputDataFileHeaderList[32]}")
43			print(f"{self.cleanedInputDataFileHeaderList[33]} \| {self.cleanedInputDataFileHeaderList[34]}")
44			print(f"{self.cleanedInputDataFileHeaderList[25]} \| {self.cleanedInputDataFileHeaderList[28]}\n\n")
45	nino.borges	852	print(f"{self.cleanedInputDataFileHeaderList[15]} will be used for the date qualifier.")
46			print(f"{self.cleanedInputDataFileHeaderList[40]} will be used as the Legal Source field.")
47	nino.borges	832
48	nino.borges	851	## VEAS_custom
49			## print (f"The data structure will be made of following field pairs:")
50			## print(f"{self.cleanedInputDataFileHeaderList[2]} \| {self.cleanedInputDataFileHeaderList[6]}")
51			## print(f"{self.cleanedInputDataFileHeaderList[3]} \| {self.cleanedInputDataFileHeaderList[8]}")
52			## print(f"{self.cleanedInputDataFileHeaderList[4]} \| {self.cleanedInputDataFileHeaderList[9]}")
53			## print(f"{self.cleanedInputDataFileHeaderList[5]} \| {self.cleanedInputDataFileHeaderList[10]}")
54			## print(f"{self.cleanedInputDataFileHeaderList[1]} \| {self.cleanedInputDataFileHeaderList[6]}\n\n")
55
56	nino.borges	831	RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues docAuthor")
57	nino.borges	828	self.recordValuesFieldList = RecordValues._fields
58
59	nino.borges	852	AdditionalValues = namedtuple("AdditionalValues","dateValue legalSourceValues")
60			self.additionalValuesFieldList = AdditionalValues._fields
61
62	nino.borges	828	for line in contents:
63			line = line.replace("\n","")
64			line = line.split("\|")
65			docID = line[0]
66			## TODO: These are hard coded for now but change to column header lookup asap.
67	nino.borges	839	## CAAG
68	nino.borges	833	#self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[30]),self.__SplitAndClean(line[34]),self.__SplitAndClean(line[36]),self.__SplitAndClean(line[38]) ,self.__SplitAndClean(line[29]))
69			#self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[31]),self.__SplitAndClean(line[35]),self.__SplitAndClean(line[37]),self.__SplitAndClean(line[39]),self.__SplitAndClean(line[32]))
70	nino.borges	839	## VEAS-CAAG
71	nino.borges	851	self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[26]),self.__SplitAndClean(line[29]),self.__SplitAndClean(line[31]),self.__SplitAndClean(line[33]) ,self.__SplitAndClean(line[25]))
72			self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[27]),self.__SplitAndClean(line[30]),self.__SplitAndClean(line[32]),self.__SplitAndClean(line[34]),self.__SplitAndClean(line[28]))
73	nino.borges	852	self.additionalValuesDict[docID] = AdditionalValues(line[15],self.__SplitAndClean(line[40]))
74	nino.borges	839	## FTC-CID
75			#self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[6]),self.__SplitAndClean(line[7]),self.__SplitAndClean(line[8]),self.__SplitAndClean(line[9]) ,self.__SplitAndClean(line[10]))
76			#self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[3]),self.__SplitAndClean(line[5]),self.__SplitAndClean(line[2]),self.__SplitAndClean(line[4]),self.__SplitAndClean(line[11]))
77			## VEAS_custom
78	nino.borges	851	#self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[2]),self.__SplitAndClean(line[3]),self.__SplitAndClean(line[4]),self.__SplitAndClean(line[5]) ,self.__SplitAndClean(line[1]))
79			#self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[6]),self.__SplitAndClean(line[8]),self.__SplitAndClean(line[9]),self.__SplitAndClean(line[10]),self.__SplitAndClean(line[6]))
80	nino.borges	839	## CAAG_custom
81			#self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[3]),self.__SplitAndClean(line[4]),self.__SplitAndClean(line[5]),self.__SplitAndClean(line[6]) ,self.__SplitAndClean(line[2]))
82			#self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[7]),self.__SplitAndClean(line[8]),self.__SplitAndClean(line[9]),self.__SplitAndClean(line[10]),self.__SplitAndClean(line[7]))
83	nino.borges	828
84			print("Data structures created.")
85
86
87
88			def __SplitAndClean(self, rawVal, delim = ";"):
89			"""Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace"""
90	nino.borges	833	if rawVal:
91			newVal = [x.strip() for x in rawVal.split(delim)]
92			else: newVal = ""
93			return newVal
94	nino.borges	828
95
96			def __FieldDedupeByEmailAddress(self, valuesList):
97			"""Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count."""
98	nino.borges	830	## This should ONLY be used for deduplication for counting and not for true deduplication because it removes a duplicate value at random and sometimes this will be the value with more information.
99			## TODO: update this to be case insensitive.
100			tempEmailList = []
101			newList = []
102			for item in valuesList:
103			result = re.findall(self.allPossibleEmailAddressesRegExPattern, item)
104			if result:
105			for r in result:
106			if r.upper() in tempEmailList:
107			pass
108			else:
109			newList.append(item)
110			tempEmailList.append(r.upper())
111			else:
112			newList.append(item)
113			return len(newList)
114
115	nino.borges	828
116	nino.borges	831	def __FieldFullValueDedupe(self, valuesList):
117			"""Pseudo-private method which will attempt to deduplicate a list of values from a specific field using the FULL VALUE. This was created because there appears to be duplicate values int he formatted fields"""
118			## Going to do this the long way because of possible uppercase-lowercase issues. These should all be uppercase but there shouldnt have been dups either...
119			newSet = set()
120			for item in valuesList:
121			newSet.add(item.upper())
122			return len(newSet)
123	nino.borges	828
124	nino.borges	830
125	nino.borges	828	def PerformValueCountChecks(self, countsOnly = True):
126			"""Performs the inital value count checks between the metadata values and formatted values, looking for red flags and warnings. By default reports numbers. Set countsOnly to false to export reports."""
127			workList = self.metadataValuesDict.keys()
128			#misCount = 0
129			#redFlagDocList = []
130			#warningDocList = []
131			#misList = []
132			redFlagDocSet = set()
133	nino.borges	833	redFlagDocMatrix = {}
134	nino.borges	828	warningDocSet = set()
135	nino.borges	833	warningDocMatrix = {}
136	nino.borges	832	#duplicatesInFormattedSet = set()
137			duplicatesInFormattedMatrix = {}
138	nino.borges	828
139			for docID in workList:
140	nino.borges	829	for fieldName in self.recordValuesFieldList:
141			metadataFieldValues = self.metadataValuesDict[docID]._asdict()[fieldName]
142			formattedFieldValues = self.formattedValuesDict[docID]._asdict()[fieldName]
143	nino.borges	833
144	nino.borges	829	if len(metadataFieldValues) - len(formattedFieldValues) == 0:
145			pass
146	nino.borges	828	else:
147	nino.borges	829	if len(metadataFieldValues) == 0:
148	nino.borges	833	## Have to account for instances where the meta docAuthor is blank because it's an email and the formatted just has the from value in it.
149			if fieldName == 'docAuthor':
150			if self.metadataValuesDict[docID].fromValues:
151			pass
152			else:
153			redFlagDocSet.add(docID)
154			#print(docID)
155			try:
156			redFlagDocMatrix[docID].append(fieldName)
157			except KeyError:
158			redFlagDocMatrix[docID] = [fieldName,]
159			else:
160			redFlagDocSet.add(docID)
161			try:
162			redFlagDocMatrix[docID].append(fieldName)
163			except KeyError:
164			redFlagDocMatrix[docID] = [fieldName,]
165	nino.borges	829	elif len(formattedFieldValues) == 0:
166			redFlagDocSet.add(docID)
167	nino.borges	833	try:
168			redFlagDocMatrix[docID].append(fieldName)
169			except KeyError:
170			redFlagDocMatrix[docID] = [fieldName,]
171	nino.borges	829	else:
172	nino.borges	830	## try the count again by deduplicating the metadata field values. Never on the formatted field values.
173			deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
174			if deduplicatedFieldCount - len(formattedFieldValues) == 0:
175			pass
176			else:
177	nino.borges	833	distanceBetween = abs(deduplicatedFieldCount - len(formattedFieldValues))
178			if deduplicatedFieldCount > 30:
179			if distanceBetween > (10 * deduplicatedFieldCount)/100:
180			print(docID,fieldName)
181			redFlagDocSet.add(docID)
182			try:
183			redFlagDocMatrix[docID].append(fieldName)
184			except KeyError:
185			redFlagDocMatrix[docID] = [fieldName,]
186			else:
187			warningDocSet.add(docID)
188			try:
189			warningDocMatrix[docID].append(fieldName)
190			except KeyError:
191			warningDocMatrix[docID]= [fieldName,]
192			else:
193			if distanceBetween > 2:
194			print(docID,fieldName)
195			redFlagDocSet.add(docID)
196			try:
197			redFlagDocMatrix[docID].append(fieldName)
198			except KeyError:
199			redFlagDocMatrix[docID] = [fieldName,]
200			else:
201			warningDocSet.add(docID)
202			try:
203			warningDocMatrix[docID].append(fieldName)
204			except KeyError:
205			warningDocMatrix[docID]= [fieldName,]
206	nino.borges	831
207			## Perform a separate check for duplicates in the formatted field.
208			if len(formattedFieldValues) == self.__FieldFullValueDedupe(formattedFieldValues):
209			pass
210			else:
211	nino.borges	832	try:
212			duplicatesInFormattedMatrix[docID].append(fieldName)
213			except KeyError:
214			duplicatesInFormattedMatrix[docID] = [fieldName,]
215			#duplicatesInFormattedSet.add(docID)
216	nino.borges	829	## if len(self.metadataValuesDict[docID].toValues) - len(self.formattedValuesDict[docID].toValues) == 0:
217			## pass
218			## else:
219			## if len(self.metadataValuesDict[docID].toValues) == 0:
220			## #redFlagDocList.append(docID)
221			## redFlagDocSet.add(docID)
222			## elif len(self.formattedValuesDict[docID].toValues) == 0:
223			## #redFlagDocList.append(docID)
224			## redFlagDocSet.add(docID)
225			## else:
226			## #misCount +=1
227			## #misList.append(docID)
228			## #warningDocList.append(docID)
229			## warningDocSet.add(docID)
230	nino.borges	828
231	nino.borges	830	print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the matching field value counts that do not match.")
232	nino.borges	828	if countsOnly == False:
233			warningsOutputFile = open(r"C:\Test_Dir\Amazon\warnings.txt",'w')
234			redFladsOutputFile = open(r"C:\Test_Dir\Amazon\redFlags.txt",'w')
235	nino.borges	831	duplicatesInFormattedOutputFile = open(r"C:\Test_Dir\Amazon\dupesInFormattedFields.txt",'w')
236	nino.borges	833	for x in warningDocMatrix:
237			warningsOutputFile.write(f"{x} \| {*warningDocMatrix[x],}\n")
238	nino.borges	828	warningsOutputFile.close()
239	nino.borges	833	for y in redFlagDocMatrix:
240			redFladsOutputFile.write(f"{y} \| {*redFlagDocMatrix[y],}\n")
241	nino.borges	828	redFladsOutputFile.close()
242	nino.borges	832	for z in duplicatesInFormattedMatrix:
243			duplicatesInFormattedOutputFile.write(f"{z} \| {*duplicatesInFormattedMatrix[z],}\n")
244	nino.borges	831	duplicatesInFormattedOutputFile.close()
245	nino.borges	828
246
247			if __name__ == '__main__':
248	nino.borges	851	cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
249			#cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\CAAG\CAAG_Log_Data_Export_Converted.txt"
250	nino.borges	839	#cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
251			#cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
252	nino.borges	833	#cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
253			#cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\TEST.txt"
254	nino.borges	828
255			## Code Testing
256			qcP = QcPrivLog(cleanedDatExportFileName)
257	nino.borges	833	## print(qcP.metadataValuesDict['H55278-0268-003517'].fromValues)
258			## print(qcP.metadataValuesDict['H55278-0268-003517'].toValues)
259			## print(qcP.metadataValuesDict['H55278-0268-003517'].ccValues)
260			## print(qcP.metadataValuesDict['H55278-0268-003517'].bccValues)
261			## print(qcP.formattedValuesDict['H55278-0268-003517'].fromValues)
262			## print(qcP.formattedValuesDict['H55278-0268-003517'].toValues)
263			## print(qcP.formattedValuesDict['H55278-0268-003517'].ccValues)
264			## print(qcP.formattedValuesDict['H55278-0268-003517'].bccValues)
265	nino.borges	828
266			qcP.PerformValueCountChecks(countsOnly = False)
267
268