Active_prgs/Redgrave/Amazon-PrivLogQC.py

"""

Amazon-PrivLogQC

Created by:
Emanuel Borges
11.19.2024

This program will assist with the process of performing QC on the Amazon privilege logs.

"""

import os, re
from collections import namedtuple


class QcPrivLog(object):
    """Automates QC on the Amazon privilege logs, including names normalization analysis.

    For every document the raw metadata fields (from, to, cc, bcc, author) are
    loaded next to their formatted counterparts so the number of values in each
    pair can be compared.
    """
    version = '0.4.0'

    ## TODO: These are hard coded for now but change to column header lookup asap.
    ## Column positions in the pipe delimited export, listed in the same order as
    ## the RecordValues fields: fromValues, toValues, ccValues, bccValues, docAuthor.
    metadataColumnIndexes = (30, 34, 36, 38, 29)
    formattedColumnIndexes = (31, 35, 37, 39, 32)


    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
        """Loads the export file into the metadata and formatted lookup dictionaries.

        cleanedDatExportFileName should be the full path to the pipe delimited
        file.  Assumes the first row of the data file is the header and the
        first column is DocID.  fileEncoding is the text encoding used to read
        the file.

        Raises IndexError if the file is empty or a non-blank row has fewer
        columns than the hard coded column positions require.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

        ## Context manager so the input file handle is always released.
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        self.cleanedInputDataFileHeader = contents[0]
        contents = contents[1:]
        print (f"There are {len(contents)} rows of data in this input file.")

        RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues docAuthor")
        self.recordValuesFieldList = RecordValues._fields

        for line in contents:
            line = line.replace("\n","")
            if not line.strip():
                ## A blank line (usually at the very end of the export) is not a record.
                continue
            line = line.split("|")
            docID = line[0]
            self.metadataValuesDict[docID] = RecordValues(*(self.__SplitAndClean(line[i]) for i in self.metadataColumnIndexes))
            self.formattedValuesDict[docID] = RecordValues(*(self.__SplitAndClean(line[i]) for i in self.formattedColumnIndexes))

        print("Data structures created.")


    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which splits a raw string on delim into a list of whitespace-stripped values.

        Empty entries are dropped, so an empty field yields an empty list (not a
        list holding one empty string) and a trailing delimiter does not add a
        phantom value.  The count checks depend on this to recognise empty fields.
        """
        strippedValues = (x.strip() for x in rawVal.split(delim))
        return [x for x in strippedValues if x]


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which counts the values in a field after deduplicating by email address.

        Addresses are compared case insensitively.  A value is treated as a
        duplicate when it contains at least one email address and every address
        in it was already seen in an earlier value.  Values without any email
        address are always counted.  Returns the deduplicated count.
        """
        ##  This should ONLY be used for deduplication for counting and not for true deduplication
        ##  because the value that gets dropped is sometimes the one with more information.
        seenEmails = set()
        dedupedCount = 0
        for item in valuesList:
            itemEmails = {r.upper() for r in re.findall(self.allPossibleEmailAddressesRegExPattern, item)}
            if itemEmails and itemEmails <= seenEmails:
                continue
            seenEmails |= itemEmails
            ## Count the value once, no matter how many new addresses it holds.
            dedupedCount += 1
        return dedupedCount


    def __FieldFullValueDedupe(self, valuesList):
        """Pseudo-private method which counts the distinct values in a field using the FULL VALUE, ignoring case.

        This was created because there appear to be duplicate values in the formatted fields.
        """
        ##  Uppercase everything first.  The formatted values should already be uppercase
        ##  but there should not have been dups either...
        return len({item.upper() for item in valuesList})


    def __WriteDocIdReport(self, outputFilePath, docIDs):
        """Pseudo-private method which writes the document IDs, sorted, one per line to outputFilePath."""
        with open(outputFilePath, 'w') as outputFile:
            outputFile.writelines(f"{docID}\n" for docID in sorted(docIDs))


    def PerformValueCountChecks(self, countsOnly = True, outputDir = r"C:\Test_Dir\Amazon"):
        """Performs the initial value count checks between the metadata values and formatted values.

        For each field of each document:
          * red flag - the counts differ and one of the two sides is empty.
          * warning  - the counts differ, both sides have values, and the counts
                       still differ after deduplicating the metadata values by
                       email address.  The formatted values are never deduplicated
                       for this comparison.
          * duplicate in formatted - the formatted field holds the same value more
                       than once (case insensitive).

        By default only the red flag and warning totals are printed.  Set
        countsOnly to False to also write warnings.txt, redFlags.txt and
        dupesInFormattedFields.txt into outputDir, which defaults to the
        previously hard coded test directory and must already exist.
        """
        redFlagDocSet = set()
        warningDocSet = set()
        duplicatesInFormattedSet = set()

        for docID, metadataRecord in self.metadataValuesDict.items():
            formattedRecord = self.formattedValuesDict[docID]
            for fieldName in self.recordValuesFieldList:
                metadataFieldValues = getattr(metadataRecord, fieldName)
                formattedFieldValues = getattr(formattedRecord, fieldName)
                if len(metadataFieldValues) != len(formattedFieldValues):
                    if not metadataFieldValues or not formattedFieldValues:
                        redFlagDocSet.add(docID)
                    elif self.__FieldDedupeByEmailAddress(metadataFieldValues) != len(formattedFieldValues):
                        warningDocSet.add(docID)

                ##  Perform a separate check for duplicates in the formatted field.
                if len(formattedFieldValues) != self.__FieldFullValueDedupe(formattedFieldValues):
                    duplicatesInFormattedSet.add(docID)

        print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the matching field value counts that do not match.")
        if not countsOnly:
            self.__WriteDocIdReport(os.path.join(outputDir, "warnings.txt"), warningDocSet)
            self.__WriteDocIdReport(os.path.join(outputDir, "redFlags.txt"), redFlagDocSet)
            self.__WriteDocIdReport(os.path.join(outputDir, "dupesInFormattedFields.txt"), duplicatesInFormattedSet)


if __name__ == '__main__':
    cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"

    ## Code Testing
    qcP = QcPrivLog(cleanedDatExportFileName)

    ## Spot check one known document: show the raw metadata values first,
    ## then the formatted values, in from / to / cc / bcc order.
    sampleDocID = 'H55278-0268-003517'
    for valuesDict in (qcP.metadataValuesDict, qcP.formattedValuesDict):
        sampleRecord = valuesDict[sampleDocID]
        for fieldName in ("fromValues", "toValues", "ccValues", "bccValues"):
            print(getattr(sampleRecord, fieldName))

    ## Run the count checks and export the report files.
    qcP.PerformValueCountChecks(countsOnly = False)
    

Revision:	831
Committed:	Tue Nov 19 22:25:10 2024 UTC (16 months ago) by nino.borges
Content type:	text/x-python
File size:	8357 byte(s)
Log Message:	Added the Author metadata and formatted fields to the check. This increased the unmatched count.
#	Content
1	"""
2
3	Amazon-PrivLogQC
4
5	Created by:
6	Emanuel Borges
7	11.19.2024
8
9	This program will assist with the process of performing QC on the Amazon privilege logs.
10
11	"""
12
13	import os, re
14	from collections import namedtuple
15
16
17	class QcPrivLog(object):
18	"""A class for automating the process of performing QC on the Amazon privilege logs, including names normalization analysis"""
19	version = '0.4.0'
20
21
22	def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
23	"""Initializes the data structures. cleanedDatExportFileName should be the full path to the file. Assumes the first row of the data file is the header and first column is DocID."""
24	print("Initializing data structures...")
25	self.metadataValuesDict = {}
26	self.formattedValuesDict = {}
27	self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
28
29	contents = open(cleanedDatExportFileName,encoding = fileEncoding).readlines()
30	self.cleanedInputDataFileHeader = contents[0]
31	contents = contents[1:]
32	print (f"There are {len(contents)} rows of data in this input file.")
33
34	RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues docAuthor")
35	self.recordValuesFieldList = RecordValues._fields
36
37	for line in contents:
38	line = line.replace("\n","")
39	line = line.split("\|")
40	docID = line[0]
41	## TODO: These are hard coded for now but change to column header lookup asap.
42	self.metadataValuesDict[docID] = RecordValues(self.__SplitAndClean(line[30]),self.__SplitAndClean(line[34]),self.__SplitAndClean(line[36]),self.__SplitAndClean(line[38]) ,self.__SplitAndClean(line[29]))
43	self.formattedValuesDict[docID] = RecordValues(self.__SplitAndClean(line[31]),self.__SplitAndClean(line[35]),self.__SplitAndClean(line[37]),self.__SplitAndClean(line[39]),self.__SplitAndClean(line[32]))
44
45	print("Data structures created.")
46
47
48
49	def __SplitAndClean(self, rawVal, delim = ";"):
50	"""Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace"""
51	return [x.strip() for x in rawVal.split(delim)]
52
53
54	def __FieldDedupeByEmailAddress(self, valuesList):
55	"""Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count."""
56	## This should ONLY be used for deduplication for counting and not for true deduplication because it removes a duplicate value at random and sometimes this will be the value with more information.
57	## TODO: update this to be case insensitive.
58	tempEmailList = []
59	newList = []
60	for item in valuesList:
61	result = re.findall(self.allPossibleEmailAddressesRegExPattern, item)
62	if result:
63	for r in result:
64	if r.upper() in tempEmailList:
65	pass
66	else:
67	newList.append(item)
68	tempEmailList.append(r.upper())
69	else:
70	newList.append(item)
71	return len(newList)
72
73
74	def __FieldFullValueDedupe(self, valuesList):
75	"""Pseudo-private method which will attempt to deduplicate a list of values from a specific field using the FULL VALUE. This was created because there appears to be duplicate values int he formatted fields"""
76	## Going to do this the long way because of possible uppercase-lowercase issues. These should all be uppercase but there shouldnt have been dups either...
77	newSet = set()
78	for item in valuesList:
79	newSet.add(item.upper())
80	return len(newSet)
81
82
83	def PerformValueCountChecks(self, countsOnly = True):
84	"""Performs the inital value count checks between the metadata values and formatted values, looking for red flags and warnings. By default reports numbers. Set countsOnly to false to export reports."""
85	workList = self.metadataValuesDict.keys()
86	#misCount = 0
87	#redFlagDocList = []
88	#warningDocList = []
89	#misList = []
90	redFlagDocSet = set()
91	warningDocSet = set()
92	duplicatesInFormattedSet = set()
93
94	for docID in workList:
95	for fieldName in self.recordValuesFieldList:
96	metadataFieldValues = self.metadataValuesDict[docID]._asdict()[fieldName]
97	formattedFieldValues = self.formattedValuesDict[docID]._asdict()[fieldName]
98	if len(metadataFieldValues) - len(formattedFieldValues) == 0:
99	pass
100	else:
101	if len(metadataFieldValues) == 0:
102	redFlagDocSet.add(docID)
103	elif len(formattedFieldValues) == 0:
104	redFlagDocSet.add(docID)
105	else:
106	## try the count again by deduplicating the metadata field values. Never on the formatted field values.
107	deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
108	if deduplicatedFieldCount - len(formattedFieldValues) == 0:
109	pass
110	else:
111	warningDocSet.add(docID)
112
113	## Perform a separate check for duplicates in the formatted field.
114	if len(formattedFieldValues) == self.__FieldFullValueDedupe(formattedFieldValues):
115	pass
116	else:
117	duplicatesInFormattedSet.add(docID)
118	## if len(self.metadataValuesDict[docID].toValues) - len(self.formattedValuesDict[docID].toValues) == 0:
119	## pass
120	## else:
121	## if len(self.metadataValuesDict[docID].toValues) == 0:
122	## #redFlagDocList.append(docID)
123	## redFlagDocSet.add(docID)
124	## elif len(self.formattedValuesDict[docID].toValues) == 0:
125	## #redFlagDocList.append(docID)
126	## redFlagDocSet.add(docID)
127	## else:
128	## #misCount +=1
129	## #misList.append(docID)
130	## #warningDocList.append(docID)
131	## warningDocSet.add(docID)
132
133	print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the matching field value counts that do not match.")
134	if countsOnly == False:
135	warningsOutputFile = open(r"C:\Test_Dir\Amazon\warnings.txt",'w')
136	redFladsOutputFile = open(r"C:\Test_Dir\Amazon\redFlags.txt",'w')
137	duplicatesInFormattedOutputFile = open(r"C:\Test_Dir\Amazon\dupesInFormattedFields.txt",'w')
138	for x in warningDocSet:
139	warningsOutputFile.write(f"{x}\n")
140	warningsOutputFile.close()
141	for y in redFlagDocSet:
142	redFladsOutputFile.write(f"{y}\n")
143	redFladsOutputFile.close()
144	for z in duplicatesInFormattedSet:
145	duplicatesInFormattedOutputFile.write(f"{z}\n")
146	duplicatesInFormattedOutputFile.close()
147
148
149	if __name__ == '__main__':
150	cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
151
152	## Code Testing
153	qcP = QcPrivLog(cleanedDatExportFileName)
154	print(qcP.metadataValuesDict['H55278-0268-003517'].fromValues)
155	print(qcP.metadataValuesDict['H55278-0268-003517'].toValues)
156	print(qcP.metadataValuesDict['H55278-0268-003517'].ccValues)
157	print(qcP.metadataValuesDict['H55278-0268-003517'].bccValues)
158	print(qcP.formattedValuesDict['H55278-0268-003517'].fromValues)
159	print(qcP.formattedValuesDict['H55278-0268-003517'].toValues)
160	print(qcP.formattedValuesDict['H55278-0268-003517'].ccValues)
161	print(qcP.formattedValuesDict['H55278-0268-003517'].bccValues)
162
163	qcP.PerformValueCountChecks(countsOnly = False)
164
165