ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/ATT-PrivLogQC.py
Revision: 887
Committed: Thu May 22 20:04:49 2025 UTC (10 months ago) by nino.borges
Content type: text/x-python
File size: 12844 byte(s)
Log Message:
This program will assist with the process of performing QC on past and present AT&T privilege logs.

File Contents

# User Rev Content
1 nino.borges 887 """
2    
3     ATT-PrivLogQC
4    
5     Created by:
6     Emanuel Borges
7     03.25.2025
8    
9     This program will assist with the process of performing QC on past and present AT&T privilege logs.
10    
11     """
12    
13     import os, re
14     from collections import namedtuple
15     from MyCode.Tool_Box import FileEncodingLib
16    
17    
class QcPrivLog(object):
    """A class for automating the process of performing QC on the AT&T privilege logs,
    including names normalization analysis.

    Compares, per document, the value counts of metadata recipient/author fields
    against their formatted (privilege-log) counterparts and classifies mismatches
    as red flags or warnings.
    """

    version = '0.1.0'


    def __init__(self, cleanedDatExportFileName, metaFromFieldName, plogFromFieldName,
                 metaToFieldName, plogToFieldName, metaCcFieldName, plogCcFieldName,
                 metaBccFieldName, plogBccFieldName, metaAuthorFieldName,
                 plogAuthorFieldName, fileEncoding = 'UTF8'):
        """Initializes the data structures.

        cleanedDatExportFileName should be the full path to the file. Assumes the
        first row of the data file is the pipe-delimited header and the first column
        is DocID. Each meta*/plog* argument names a metadata column and its
        formatted (privilege-log) counterpart for the From/To/CC/BCC/Author pairs.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}    # DocID -> RecordValues from the metadata columns
        self.formattedValuesDict = {}   # DocID -> RecordValues from the formatted (plog) columns
        self.additionalValuesDict = {}  # reserved; never populated in this class
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

        # Context manager guarantees the input handle is closed (the previous
        # version left the file open for the lifetime of the process).
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        self.cleanedInputDataFileHeader = contents[0].replace("\n","")
        self.cleanedInputDataFileHeaderList = self.cleanedInputDataFileHeader.split("|")
        # Maps a column name to its positional index within each data row.
        self.cleanedInputDataFileHeaderPositionalMatrix = {v: i for i, v in enumerate(self.cleanedInputDataFileHeaderList)}
        contents = contents[1:]
        print(f"There are {len(contents)} rows of data in this input file.\n\n")

        print("The data structure will be made of following field pairs:")
        print(f"{metaFromFieldName} | {plogFromFieldName}")
        print(f"{metaToFieldName} | {plogToFieldName}")
        print(f"{metaCcFieldName} | {plogCcFieldName}")
        print(f"{metaBccFieldName} | {plogBccFieldName}")
        print(f"{metaAuthorFieldName} | {plogAuthorFieldName}\n\n")

        RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues docAuthor")
        self.recordValuesFieldList = RecordValues._fields

        # Column names in RecordValues field order, for each of the two sides.
        metaFieldNames = (metaFromFieldName, metaToFieldName, metaCcFieldName,
                          metaBccFieldName, metaAuthorFieldName)
        plogFieldNames = (plogFromFieldName, plogToFieldName, plogCcFieldName,
                          plogBccFieldName, plogAuthorFieldName)

        for rawLine in contents:
            row = rawLine.replace("\n","").split("|")
            docID = row[0]
            self.metadataValuesDict[docID] = RecordValues(*self.__ExtractFieldValues(row, metaFieldNames))
            self.formattedValuesDict[docID] = RecordValues(*self.__ExtractFieldValues(row, plogFieldNames))

        print("Data structures created.")


    def __ExtractFieldValues(self, row, fieldNames):
        """Pseudo-private helper: pull each named column out of a split data row and
        return the split-and-cleaned value lists in the order the names were given."""
        positions = self.cleanedInputDataFileHeaderPositionalMatrix
        return [self.__SplitAndClean(row[positions[name]]) for name in fieldNames]


    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which will take a raw string and split this into a
        list, removing any leading or trailing whitespace from each entry.

        Returns an empty list for an empty/blank value. (A previous version returned
        "" here; both are falsy with len() == 0, but a list keeps the return type
        consistent for callers.)
        """
        if rawVal:
            return [x.strip() for x in rawVal.split(delim)]
        return []


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which deduplicates a list of field values by the
        email address(es) embedded in each value, returning the deduplicated COUNT.

        This should ONLY be used for deduplication for counting and not for true
        deduplication, because it keeps whichever duplicate appears first and the
        later duplicate sometimes carries more information. Values containing no
        email address are always kept. Email comparison is case-insensitive (both
        sides are uppercased). NOTE: a value containing two previously-unseen
        addresses is intentionally counted once per address, matching the original
        counting behavior.
        """
        seenEmails = []
        keptValues = []
        for item in valuesList:
            foundAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, item)
            if foundAddresses:
                for address in foundAddresses:
                    if address.upper() not in seenEmails:
                        keptValues.append(item)
                        seenEmails.append(address.upper())
            else:
                keptValues.append(item)
        return len(keptValues)


    def __FieldFullValueDedupe(self, valuesList):
        """Pseudo-private method which deduplicates a list of field values using the
        FULL VALUE, case-insensitively, and returns the deduplicated count. Created
        because there appear to be duplicate values in the formatted fields."""
        return len({item.upper() for item in valuesList})


    def PerformValueCountChecks(self, countsOnly = True, outputDir = r"C:\Test_Dir\ATT"):
        """Performs the initial value count checks between the metadata values and the
        formatted values, looking for red flags and warnings.

        By default only reports numbers. Set countsOnly to False to also export the
        warnings / red flags / formatted-field-duplicates reports as text files
        under outputDir (directory must already exist; defaults to the historical
        hard-coded location for backward compatibility).
        """
        workList = self.metadataValuesDict.keys()
        redFlagDocSet = set()
        redFlagDocMatrix = {}             # DocID -> list of offending field labels
        warningDocSet = set()
        warningDocMatrix = {}             # DocID -> list of fields with small mismatches
        duplicatesInFormattedMatrix = {}  # DocID -> fields whose formatted values repeat

        for docID in workList:
            for fieldName in self.recordValuesFieldList:
                metadataFieldValues = self.metadataValuesDict[docID]._asdict()[fieldName]
                formattedFieldValues = self.formattedValuesDict[docID]._asdict()[fieldName]

                if len(metadataFieldValues) != len(formattedFieldValues):
                    if len(metadataFieldValues) == 0:
                        ## Account for instances where the meta docAuthor is blank because
                        ## it's an email and the formatted field just holds the from value.
                        if fieldName == 'docAuthor':
                            if not self.metadataValuesDict[docID].fromValues:
                                redFlagDocSet.add(docID)
                                redFlagDocMatrix.setdefault(docID, []).append(fieldName + "-No_Metadata_Entries-A")
                        else:
                            redFlagDocSet.add(docID)
                            redFlagDocMatrix.setdefault(docID, []).append(fieldName + "-No_Metadata_Entries-B")
                    elif len(formattedFieldValues) == 0:
                        redFlagDocSet.add(docID)
                        redFlagDocMatrix.setdefault(docID, []).append(fieldName + "-No_Formatted_Entries")
                    else:
                        ## Try the count again after deduplicating the metadata values by
                        ## email address. Never deduplicate the formatted values here.
                        deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
                        if deduplicatedFieldCount != len(formattedFieldValues):
                            distanceBetween = abs(deduplicatedFieldCount - len(formattedFieldValues))
                            if deduplicatedFieldCount > 30:
                                ## Large field: red flag when the mismatch exceeds 10%.
                                if distanceBetween > (10 * deduplicatedFieldCount)/100:
                                    redFlagDocSet.add(docID)
                                    redFlagDocMatrix.setdefault(docID, []).append(fieldName)
                                else:
                                    warningDocSet.add(docID)
                                    warningDocMatrix.setdefault(docID, []).append(fieldName)
                            else:
                                ## Small field: red flag when counts differ by more than 2.
                                if distanceBetween > 2:
                                    redFlagDocSet.add(docID)
                                    redFlagDocMatrix.setdefault(docID, []).append(fieldName)
                                else:
                                    warningDocSet.add(docID)
                                    warningDocMatrix.setdefault(docID, []).append(fieldName)

                ## Perform a separate check for duplicates in the formatted field itself.
                if len(formattedFieldValues) != self.__FieldFullValueDedupe(formattedFieldValues):
                    duplicatesInFormattedMatrix.setdefault(docID, []).append(fieldName)

        print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the matching field value counts do not match.")
        if countsOnly == False:
            ## Context managers ensure each report file is flushed and closed even on error.
            with open(os.path.join(outputDir, "warnings.txt"), 'w') as warningsOutputFile:
                for x in warningDocMatrix:
                    warningsOutputFile.write(f"{x} | {*warningDocMatrix[x],}\n")
            with open(os.path.join(outputDir, "redFlags.txt"), 'w') as redFlagsOutputFile:
                for y in redFlagDocMatrix:
                    redFlagsOutputFile.write(f"{y} | {*redFlagDocMatrix[y],}\n")
            with open(os.path.join(outputDir, "dupesInFormattedFields.txt"), 'w') as duplicatesInFormattedOutputFile:
                for z in duplicatesInFormattedMatrix:
                    duplicatesInFormattedOutputFile.write(f"{z} | {*duplicatesInFormattedMatrix[z],}\n")
212    
213    
if __name__ == '__main__':
    # Path to the pipe-delimited, cleaned DAT export to run QC against.
    inputFilePath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\AT&T\Cybersecurity FCA Case\PLOG_Test\Shiny\20250325-Shiny-PLOG-Export-Test_Converted.txt"
    # Pair each metadata column with its formatted (privilege-log) counterpart.
    qcRun = QcPrivLog(inputFilePath,
                      "From", "MA Normalized From::Full Name",
                      "To", "MA Normalized To::Full Name",
                      "CC", "MA Normalized Cc::Full Name",
                      "BCC", "MA Normalized Bcc::Full Name",
                      "Author", "DocAuthor",
                      fileEncoding = 'UTF8')
    # Show how header names map to column positions, then run the full report pass.
    print(qcRun.cleanedInputDataFileHeaderPositionalMatrix)
    qcRun.PerformValueCountChecks(countsOnly = False)