Active_prgs/Redgrave/Amazon_PrivLogQC.py

"""

Amazon-PrivLogQC

Created by:
Emanuel Borges
11.19.2024

This program will assist with the process of performing QC on the Amazon privilege logs.

"""

import os, re
from collections import namedtuple


class QcPrivLog(object):
    """Automates QC of the Amazon privilege logs, including names normalization analysis.

    The input is a pipe-delimited text export whose first row is the header and
    whose first column is the DocID.  For every document the raw ("metadata")
    and normalized ("formatted") From/To/CC/BCC fields are split on ";" and kept
    side by side so their value counts can be compared.

    Attributes:
        metadataValuesDict: DocID -> RecordValues of raw metadata value lists.
        formattedValuesDict: DocID -> RecordValues of formatted value lists.
        recordValuesFieldList: Field names of RecordValues, in column order.
        cleanedInputDataFileHeader: The header row exactly as read from the file.
        allPossibleEmailAddressesRegExPattern: Regex used to pull email
            addresses out of a single field value.
    """
    version = '0.3.0'

    ## Zero-based column positions of the From/To/CC/BCC fields in the export.
    ## Used when the constructor is not given explicit column indexes.
    defaultMetadataColumnIndexes = (30, 34, 36, 38)
    defaultFormattedColumnIndexes = (31, 35, 37, 39)


    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8', metadataColumnIndexes = None, formattedColumnIndexes = None):
        """Reads the export file and builds the per-document value dictionaries.

        Args:
            cleanedDatExportFileName: Full path to the pipe-delimited export.
                The first row must be the header and the first column the DocID.
            fileEncoding: Text encoding used to read the file.
            metadataColumnIndexes: Optional sequence of four zero-based column
                indexes (from, to, cc, bcc) for the raw metadata fields.
                Defaults to defaultMetadataColumnIndexes.
            formattedColumnIndexes: Optional sequence of four zero-based column
                indexes (from, to, cc, bcc) for the formatted fields.
                Defaults to defaultFormattedColumnIndexes.

        Raises:
            IndexError: If the file is empty or a data row has fewer columns
                than the highest configured column index.
            TypeError: If a column index sequence does not hold exactly four
                indexes.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

        if metadataColumnIndexes is None:
            metadataColumnIndexes = self.defaultMetadataColumnIndexes
        if formattedColumnIndexes is None:
            formattedColumnIndexes = self.defaultFormattedColumnIndexes

        ## Context manager so the file handle is released even if reading fails.
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        self.cleanedInputDataFileHeader = contents[0]
        contents = contents[1:]
        print (f"There are {len(contents)} rows of data in this input file.")

        RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues")
        self.recordValuesFieldList = RecordValues._fields

        for line in contents:
            line = line.replace("\n","")
            ## A blank row (typically a trailing newline at the end of the export) has no DocID or columns.
            if not line.strip():
                continue
            columns = line.split("|")
            docID = columns[0]
            self.metadataValuesDict[docID] = RecordValues(*(self.__SplitAndClean(columns[i]) for i in metadataColumnIndexes))
            self.formattedValuesDict[docID] = RecordValues(*(self.__SplitAndClean(columns[i]) for i in formattedColumnIndexes))

        print("Data structures created.")


    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which splits a raw string on delim into a list of whitespace-stripped values.

        Empty values are dropped, so an empty (or whitespace only) field yields
        an empty list rather than [''].  The value count checks rely on this to
        recognize a field that has no values at all.
        """
        strippedValues = (x.strip() for x in rawVal.split(delim))
        return [x for x in strippedValues if x]


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which returns the count of values left after deduplicating by email address.

        Email addresses are compared case-insensitively.  A value is counted
        once if it contains at least one address not seen in an earlier value,
        and values that contain no email address are always counted.
        """
        ##  This should ONLY be used for deduplication for counting and not for true deduplication,
        ##  because the value that survives is simply the first one seen, which is not always the value with more information.
        seenEmails = set()
        uniqueCount = 0
        for item in valuesList:
            itemEmails = {r.upper() for r in re.findall(self.allPossibleEmailAddressesRegExPattern, item)}
            if itemEmails and itemEmails <= seenEmails:
                ## Every address in this value already appeared earlier in the field, so it is a duplicate.
                continue
            seenEmails |= itemEmails
            uniqueCount += 1
        return uniqueCount


    def PerformValueCountChecks(self, countsOnly = True, warningsOutputPath = r"C:\Test_Dir\Amazon\warnings.txt", redFlagsOutputPath = r"C:\Test_Dir\Amazon\redFlags.txt"):
        """Performs the initial value count checks between the metadata values and formatted values.

        For each document and each field the number of metadata values is
        compared to the number of formatted values:
          * counts match -> no finding.
          * exactly one side is empty -> the document is a red flag.
          * otherwise the metadata values are deduplicated by email address
            (never the formatted values) and, if the counts still differ, the
            document is a warning.
        A document can appear in both sets when different fields trigger
        different findings.  A summary line is always printed.

        Args:
            countsOnly: When true (the default) only the summary is printed.
                Set to false to also write the DocID reports.
            warningsOutputPath: File that receives the warning DocIDs, one per line.
            redFlagsOutputPath: File that receives the red flag DocIDs, one per line.
        """
        redFlagDocSet = set()
        warningDocSet = set()

        for docID, metadataRecord in self.metadataValuesDict.items():
            formattedRecord = self.formattedValuesDict[docID]
            for fieldName in self.recordValuesFieldList:
                metadataFieldValues = getattr(metadataRecord, fieldName)
                formattedFieldValues = getattr(formattedRecord, fieldName)
                if len(metadataFieldValues) == len(formattedFieldValues):
                    continue
                if not metadataFieldValues or not formattedFieldValues:
                    redFlagDocSet.add(docID)
                elif self.__FieldDedupeByEmailAddress(metadataFieldValues) != len(formattedFieldValues):
                    warningDocSet.add(docID)

        print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the matching field value counts that do not match.")
        if not countsOnly:
            ## Sorted so that the reports are stable from run to run.
            with open(warningsOutputPath,'w') as warningsOutputFile:
                warningsOutputFile.writelines(f"{x}\n" for x in sorted(warningDocSet))
            with open(redFlagsOutputPath,'w') as redFlagsOutputFile:
                redFlagsOutputFile.writelines(f"{y}\n" for y in sorted(redFlagDocSet))


if __name__ == '__main__':
    ## Ad hoc test run against a local copy of the privilege log export.
    cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"

    ## Code Testing
    qcP = QcPrivLog(cleanedDatExportFileName)

    ## Dump the parsed values of one sample document: metadata fields first, then formatted fields.
    sampleDocID = 'H55278-0268-003517'
    for valuesDict in (qcP.metadataValuesDict, qcP.formattedValuesDict):
        sampleRecord = valuesDict[sampleDocID]
        for fieldValues in (sampleRecord.fromValues, sampleRecord.toValues, sampleRecord.ccValues, sampleRecord.bccValues):
            print(fieldValues)

    ## Run the count checks and export the warning and red flag reports.
    qcP.PerformValueCountChecks(countsOnly = False)
    

Revision:	830
Committed:	Tue Nov 19 21:44:44 2024 UTC (16 months, 1 week ago) by nino.borges
Content type:	text/x-python
Original Path:	*Python/NinoCode/Active_prgs/Redgrave/Amazon-PrivLogQC.py*
File size:	7111 byte(s)
Log Message:	This version supports deduplicating the metadata fields by unique email address. Values that contain no email address at all stay included, but two values in the same metadata field that share an email address will dupe out. This resulted in a substantial drop in mismatches, so apparently the majority of them occur where the metadata field has duplicates.
#	User	Rev	Content
1	nino.borges	828	"""
2
3			Amazon-PrivLogQC
4
5			Created by:
6			Emanuel Borges
7			11.19.2024
8
9			This program will assist with the process of performing QC on the Amazon privilege logs.
10
11			"""
12
13			import os, re
14			from collections import namedtuple
15
16
17			class QcPrivLog(object):
18			"""A class for automating the process of performing QC on the Amazon privilege logs, including names normalization analysis"""
19	nino.borges	830	version = '0.3.0'
20	nino.borges	828
21
22			def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
23			"""Initializes the data structures. cleanedDatExportFileName should be the full path to the file. Assumes the first row of the data file is the header and first column is DocID."""
24			print("Initializing data structures...")
25			self.metadataValuesDict = {}
26			self.formattedValuesDict = {}
27	nino.borges	830	self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
28	nino.borges	828
29			contents = open(cleanedDatExportFileName,encoding = fileEncoding).readlines()
30			self.cleanedInputDataFileHeader = contents[0]
31			contents = contents[1:]
32			print (f"There are {len(contents)} rows of data in this input file.")
33
34			RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues")
35			self.recordValuesFieldList = RecordValues._fields
36
37			for line in contents:
38			line = line.replace("\n","")
39			line = line.split("|")
40			docID = line[0]
41			## TODO: These are hard coded for now but change to column header lookup asap.
42			self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[30]),self.__SplitAndClean(line[34]),self.__SplitAndClean(line[36]),self.__SplitAndClean(line[38]))
43			self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[31]),self.__SplitAndClean(line[35]),self.__SplitAndClean(line[37]),self.__SplitAndClean(line[39]))
44
45			print("Data structures created.")
46
47
48
49			def __SplitAndClean(self, rawVal, delim = ";"):
50			"""Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace"""
51			return [x.strip() for x in rawVal.split(delim)]
52
53
54			def __FieldDedupeByEmailAddress(self, valuesList):
55			"""Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count."""
56	nino.borges	830	## This should ONLY be used for deduplication for counting and not for true deduplication because it removes a duplicate value at random and sometimes this will be the value with more information.
57			## TODO: update this to be case insensitive.
58			tempEmailList = []
59			newList = []
60			for item in valuesList:
61			result = re.findall(self.allPossibleEmailAddressesRegExPattern, item)
62			if result:
63			for r in result:
64			if r.upper() in tempEmailList:
65			pass
66			else:
67			newList.append(item)
68			tempEmailList.append(r.upper())
69			else:
70			newList.append(item)
71			return len(newList)
72
73	nino.borges	828
74
75	nino.borges	830
76
77	nino.borges	828	def PerformValueCountChecks(self, countsOnly = True):
78			"""Performs the inital value count checks between the metadata values and formatted values, looking for red flags and warnings. By default reports numbers. Set countsOnly to false to export reports."""
79			workList = self.metadataValuesDict.keys()
80			#misCount = 0
81			#redFlagDocList = []
82			#warningDocList = []
83			#misList = []
84			redFlagDocSet = set()
85			warningDocSet = set()
86
87			for docID in workList:
88	nino.borges	829	for fieldName in self.recordValuesFieldList:
89			metadataFieldValues = self.metadataValuesDict[docID]._asdict()[fieldName]
90			formattedFieldValues = self.formattedValuesDict[docID]._asdict()[fieldName]
91			if len(metadataFieldValues) - len(formattedFieldValues) == 0:
92			pass
93	nino.borges	828	else:
94	nino.borges	829	if len(metadataFieldValues) == 0:
95			redFlagDocSet.add(docID)
96			elif len(formattedFieldValues) == 0:
97			redFlagDocSet.add(docID)
98			else:
99	nino.borges	830	## try the count again by deduplicating the metadata field values. Never on the formatted field values.
100			deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
101			if deduplicatedFieldCount - len(formattedFieldValues) == 0:
102			pass
103			else:
104			warningDocSet.add(docID)
105	nino.borges	829
106			## if len(self.metadataValuesDict[docID].toValues) - len(self.formattedValuesDict[docID].toValues) == 0:
107			## pass
108			## else:
109			## if len(self.metadataValuesDict[docID].toValues) == 0:
110			## #redFlagDocList.append(docID)
111			## redFlagDocSet.add(docID)
112			## elif len(self.formattedValuesDict[docID].toValues) == 0:
113			## #redFlagDocList.append(docID)
114			## redFlagDocSet.add(docID)
115			## else:
116			## #misCount +=1
117			## #misList.append(docID)
118			## #warningDocList.append(docID)
119			## warningDocSet.add(docID)
120	nino.borges	828
121	nino.borges	830	print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the matching field value counts that do not match.")
122	nino.borges	828	if countsOnly == False:
123			warningsOutputFile = open(r"C:\Test_Dir\Amazon\warnings.txt",'w')
124			redFladsOutputFile = open(r"C:\Test_Dir\Amazon\redFlags.txt",'w')
125	nino.borges	829	for x in warningDocSet:
126	nino.borges	828	warningsOutputFile.write(f"{x}\n")
127			warningsOutputFile.close()
128	nino.borges	829	for y in redFlagDocSet:
129	nino.borges	828	redFladsOutputFile.write(f"{y}\n")
130			redFladsOutputFile.close()
131
132
133			if __name__ == '__main__':
134			cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
135
136			## Code Testing
137			qcP = QcPrivLog(cleanedDatExportFileName)
138			print(qcP.metadataValuesDict['H55278-0268-003517'].fromValues)
139			print(qcP.metadataValuesDict['H55278-0268-003517'].toValues)
140			print(qcP.metadataValuesDict['H55278-0268-003517'].ccValues)
141			print(qcP.metadataValuesDict['H55278-0268-003517'].bccValues)
142			print(qcP.formattedValuesDict['H55278-0268-003517'].fromValues)
143			print(qcP.formattedValuesDict['H55278-0268-003517'].toValues)
144			print(qcP.formattedValuesDict['H55278-0268-003517'].ccValues)
145			print(qcP.formattedValuesDict['H55278-0268-003517'].bccValues)
146
147			qcP.PerformValueCountChecks(countsOnly = False)
148
149