# Active_prgs/Redgrave/Amazon_PrivLogQC.py

"""

Amazon-PrivLogQC

Created by:
Emanuel Borges
11.19.2024

This program will assist with the process of performing QC on the Amazon privilege logs.

"""

import os, re
from collections import namedtuple


class QcPrivLog(object):
    """A class for automating the process of performing QC on the Amazon privilege logs, including names normalization analysis.

    The data file is read once, when the object is created, into two dictionaries keyed by DocID:
    metadataValuesDict and formattedValuesDict.  Each entry is a RecordValues named tuple
    (fromValues, toValues, ccValues, bccValues) and each of those fields is a list of strings."""
    version = '0.2.0'

    ## TODO: These are hard coded for now but change to column header lookup asap.
    ## 0-based column positions, listed in the same order as the RecordValues fields.
    _metadataColumnIndexes = (30, 34, 36, 38)
    _formattedColumnIndexes = (31, 35, 37, 39)


    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
        """Initializes the data structures. cleanedDatExportFileName should be the full path to the file.  Assumes the first row of the data file is the header and first column is DocID.

        The file is expected to be pipe ("|") delimited.  Completely blank rows are skipped.
        If the same DocID appears more than once, the last row wins.
        Raises IndexError if the file is empty or if a data row has too few columns."""
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}

        ## Use a context manager so the file handle is released even if reading fails.
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        self.cleanedInputDataFileHeader = contents[0]
        contents = contents[1:]
        ## NOTE: this count is the raw number of lines after the header, including any blank ones.
        print (f"There are {len(contents)} rows of data in this input file.")

        RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues")
        self.recordValuesFieldList = RecordValues._fields
        minColumnCount = max(self._metadataColumnIndexes + self._formattedColumnIndexes) + 1

        ## Row numbers start at 2 because row 1 is the header.
        for rowNumber, line in enumerate(contents, start = 2):
            line = line.replace("\n","")
            if not line.strip():
                ## A blank row (typically a trailing newline at the end of the export) has no DocID to record.
                continue
            line = line.split("|")
            if len(line) < minColumnCount:
                raise IndexError(f"Row {rowNumber} of {cleanedDatExportFileName} has {len(line)} columns but at least {minColumnCount} are required.")
            docID = line[0]
            self.metadataValuesDict[docID] = RecordValues(*(self.__SplitAndClean(line[i]) for i in self._metadataColumnIndexes))
            self.formattedValuesDict[docID] = RecordValues(*(self.__SplitAndClean(line[i]) for i in self._formattedColumnIndexes))

        print("Data structures created.")


    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace.

        Empty values are dropped, so a blank field returns an empty list and a trailing or
        doubled delimiter does not add a phantom value.  Without this, "".split(";") would give
        [""], a blank field would count as one value and the zero-count checks could never fire."""
        cleanedValues = (x.strip() for x in rawVal.split(delim))
        return [x for x in cleanedValues if x]


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count.

        NOTE: Not implemented yet.  This is currently a stub and returns None."""
        pass


    def PerformValueCountChecks(self, countsOnly = True, outputDir = r"C:\Test_Dir\Amazon"):
        """Performs the initial value count checks between the metadata values and formatted values, looking for red flags and warnings. By default reports numbers. Set countsOnly to false to export reports.

        Every field in the record (from, to, cc and bcc) is compared.  For each field:
          - equal counts are fine;
          - if the counts differ and one side has no values at all, the document is a red flag;
          - if the counts differ and both sides have values, the document is a warning.
        A document can land in both sets when different fields fail in different ways.

        When countsOnly is false, warnings.txt and redFlags.txt are written to outputDir, one
        DocID per line in sorted order.  outputDir is created if it does not exist.
        Raises KeyError if a DocID is missing from formattedValuesDict."""
        redFlagDocSet = set()
        warningDocSet = set()

        for docID, metadataRecord in self.metadataValuesDict.items():
            formattedRecord = self.formattedValuesDict[docID]
            ## Both records hold their fields in the same order, so they can be walked in lockstep.
            for metadataFieldValues, formattedFieldValues in zip(metadataRecord, formattedRecord):
                metadataCount = len(metadataFieldValues)
                formattedCount = len(formattedFieldValues)
                if metadataCount == formattedCount:
                    continue
                if metadataCount == 0 or formattedCount == 0:
                    redFlagDocSet.add(docID)
                else:
                    warningDocSet.add(docID)

        print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the value counts do not match.")
        if not countsOnly:
            os.makedirs(outputDir, exist_ok = True)
            ## Sorted so that the reports come out the same way on every run.
            with open(os.path.join(outputDir, "warnings.txt"),'w') as warningsOutputFile:
                for docID in sorted(warningDocSet):
                    warningsOutputFile.write(f"{docID}\n")
            with open(os.path.join(outputDir, "redFlags.txt"),'w') as redFlagsOutputFile:
                for docID in sorted(redFlagDocSet):
                    redFlagsOutputFile.write(f"{docID}\n")


if __name__ == '__main__':
    ## Code Testing
    ## Loads the export, prints every field of one hard-coded document from both dictionaries
    ## (metadata first, then formatted), then runs the count checks with report export turned on.
    cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
    sampleDocID = 'H55278-0268-003517'

    qcP = QcPrivLog(cleanedDatExportFileName)
    for valuesDict in (qcP.metadataValuesDict, qcP.formattedValuesDict):
        sampleRecord = valuesDict[sampleDocID]
        for fieldName in ("fromValues", "toValues", "ccValues", "bccValues"):
            print(getattr(sampleRecord, fieldName))

    qcP.PerformValueCountChecks(countsOnly = False)
    

Revision:	829
Committed:	Tue Nov 19 18:24:40 2024 UTC (16 months, 1 week ago) by nino.borges
Content type:	text/x-python
Original Path:	*Python/NinoCode/Active_prgs/Redgrave/Amazon-PrivLogQC.py*
File size:	5841 byte(s)
Log Message:	0.2.0 adds testing all of the email fields listed in the named tuple, instead of just TO.
#	User	Rev	Content
1	nino.borges	828	"""
2
3			Amazon-PrivLogQC
4
5			Created by:
6			Emanuel Borges
7			11.19.2024
8
9			This program will assist with the process of performing QC on the Amazon privilege logs.
10
11			"""
12
13			import os, re
14			from collections import namedtuple
15
16
17			class QcPrivLog(object):
18			"""A class for automating the process of performing QC on the Amazon privilege logs, including names normalization analysis"""
19	nino.borges	829	version = '0.2.0'
20	nino.borges	828
21
22			def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
23			"""Initializes the data structures. cleanedDatExportFileName should be the full path to the file. Assumes the first row of the data file is the header and first column is DocID."""
24			print("Initializing data structures...")
25			self.metadataValuesDict = {}
26			self.formattedValuesDict = {}
27
28			contents = open(cleanedDatExportFileName,encoding = fileEncoding).readlines()
29			self.cleanedInputDataFileHeader = contents[0]
30			contents = contents[1:]
31			print (f"There are {len(contents)} rows of data in this input file.")
32
33			RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues")
34			self.recordValuesFieldList = RecordValues._fields
35
36			for line in contents:
37			line = line.replace("\n","")
38			line = line.split("|")
39			docID = line[0]
40			## TODO: These are hard coded for now but change to column header lookup asap.
41			self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[30]),self.__SplitAndClean(line[34]),self.__SplitAndClean(line[36]),self.__SplitAndClean(line[38]))
42			self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[31]),self.__SplitAndClean(line[35]),self.__SplitAndClean(line[37]),self.__SplitAndClean(line[39]))
43
44			print("Data structures created.")
45
46
47
48			def __SplitAndClean(self, rawVal, delim = ";"):
49			"""Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace"""
50			return [x.strip() for x in rawVal.split(delim)]
51
52
53			def __FieldDedupeByEmailAddress(self, valuesList):
54			"""Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count."""
55			pass
56
57
58			def PerformValueCountChecks(self, countsOnly = True):
59			"""Performs the inital value count checks between the metadata values and formatted values, looking for red flags and warnings. By default reports numbers. Set countsOnly to false to export reports."""
60			workList = self.metadataValuesDict.keys()
61			#misCount = 0
62			#redFlagDocList = []
63			#warningDocList = []
64			#misList = []
65			redFlagDocSet = set()
66			warningDocSet = set()
67
68			for docID in workList:
69	nino.borges	829	for fieldName in self.recordValuesFieldList:
70			metadataFieldValues = self.metadataValuesDict[docID]._asdict()[fieldName]
71			formattedFieldValues = self.formattedValuesDict[docID]._asdict()[fieldName]
72			if len(metadataFieldValues) - len(formattedFieldValues) == 0:
73			pass
74	nino.borges	828	else:
75	nino.borges	829	if len(metadataFieldValues) == 0:
76			redFlagDocSet.add(docID)
77			elif len(formattedFieldValues) == 0:
78			redFlagDocSet.add(docID)
79			else:
80			warningDocSet.add(docID)
81
82			## if len(self.metadataValuesDict[docID].toValues) - len(self.formattedValuesDict[docID].toValues) == 0:
83			## pass
84			## else:
85			## if len(self.metadataValuesDict[docID].toValues) == 0:
86			## #redFlagDocList.append(docID)
87			## redFlagDocSet.add(docID)
88			## elif len(self.formattedValuesDict[docID].toValues) == 0:
89			## #redFlagDocList.append(docID)
90			## redFlagDocSet.add(docID)
91			## else:
92			## #misCount +=1
93			## #misList.append(docID)
94			## #warningDocList.append(docID)
95			## warningDocSet.add(docID)
96	nino.borges	828
97	nino.borges	829	print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the TO value counts that do not match.")
98	nino.borges	828	if countsOnly == False:
99			warningsOutputFile = open(r"C:\Test_Dir\Amazon\warnings.txt",'w')
100			redFladsOutputFile = open(r"C:\Test_Dir\Amazon\redFlags.txt",'w')
101	nino.borges	829	for x in warningDocSet:
102	nino.borges	828	warningsOutputFile.write(f"{x}\n")
103			warningsOutputFile.close()
104	nino.borges	829	for y in redFlagDocSet:
105	nino.borges	828	redFladsOutputFile.write(f"{y}\n")
106			redFladsOutputFile.close()
107
108
109			if __name__ == '__main__':
110			cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
111
112			## Code Testing
113			qcP = QcPrivLog(cleanedDatExportFileName)
114			print(qcP.metadataValuesDict['H55278-0268-003517'].fromValues)
115			print(qcP.metadataValuesDict['H55278-0268-003517'].toValues)
116			print(qcP.metadataValuesDict['H55278-0268-003517'].ccValues)
117			print(qcP.metadataValuesDict['H55278-0268-003517'].bccValues)
118			print(qcP.formattedValuesDict['H55278-0268-003517'].fromValues)
119			print(qcP.formattedValuesDict['H55278-0268-003517'].toValues)
120			print(qcP.formattedValuesDict['H55278-0268-003517'].ccValues)
121			print(qcP.formattedValuesDict['H55278-0268-003517'].bccValues)
122
123			qcP.PerformValueCountChecks(countsOnly = False)
124
125