ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PrivLogQC.py
Revision: 829
Committed: Tue Nov 19 18:24:40 2024 UTC (16 months, 1 week ago) by nino.borges
Content type: text/x-python
Original Path: Python/NinoCode/Active_prgs/Redgrave/Amazon-PrivLogQC.py
File size: 5841 byte(s)
Log Message:
0.2.0 adds testing all of the email fields listed in the named tuple, instead of just TO.

File Contents

# User Rev Content
1 nino.borges 828 """
2    
3     Amazon-PrivLogQC
4    
5     Created by:
6     Emanuel Borges
7     11.19.2024
8    
9     This program will assist with the process of performing QC on the Amazon privilege logs.
10    
11     """
12    
13     import os, re
14     from collections import namedtuple
15    
16    
class QcPrivLog(object):
    """Automates QC on the Amazon privilege logs, including names normalization analysis.

    The input file is read once at construction time. For every data row the
    raw metadata values and the formatted values of the FROM, TO, CC and BCC
    fields are split into lists and stored, keyed by DocID, in
    ``metadataValuesDict`` and ``formattedValuesDict``.
    """
    version = '0.2.0'

    # Zero-based column positions of the FROM, TO, CC and BCC values in the
    # pipe-delimited export, in the same order as the RecordValues fields.
    ## TODO: These are hard coded for now but change to column header lookup asap.
    _METADATA_COLUMNS = (30, 34, 36, 38)
    _FORMATTED_COLUMNS = (31, 35, 37, 39)

    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
        """Load the export file and build the per-document value dictionaries.

        cleanedDatExportFileName should be the full path to the file. The first
        row of the data file is treated as the header and the first column as
        the DocID. Columns are separated by "|". Blank lines are skipped.

        Raises IndexError when a data row has fewer columns than the hard coded
        column positions require.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}

        # Context manager so the file handle is released even if reading fails.
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        self.cleanedInputDataFileHeader = contents[0] if contents else ""
        contents = contents[1:]
        print (f"There are {len(contents)} rows of data in this input file.")

        RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues")
        self.recordValuesFieldList = RecordValues._fields

        minColumns = max(self._METADATA_COLUMNS + self._FORMATTED_COLUMNS) + 1
        # The header is line 1 of the file, so the data rows start at line 2.
        for rowNumber, line in enumerate(contents, start = 2):
            line = line.rstrip("\n")
            if not line.strip():
                # A blank (typically trailing) line carries no record.
                continue
            columns = line.split("|")
            if len(columns) < minColumns:
                raise IndexError(
                    f"Line {rowNumber} of {cleanedDatExportFileName} has {len(columns)} "
                    f"columns; at least {minColumns} are required.")
            docID = columns[0]
            self.metadataValuesDict[docID] = RecordValues(
                *(self.__SplitAndClean(columns[i]) for i in self._METADATA_COLUMNS))
            self.formattedValuesDict[docID] = RecordValues(
                *(self.__SplitAndClean(columns[i]) for i in self._FORMATTED_COLUMNS))

        print("Data structures created.")

    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which splits a raw string on delim into a list, removing leading and trailing whitespace.

        Empty entries are dropped, so an empty field yields an empty list and a
        trailing delimiter does not add a phantom value. The value count checks
        rely on this: "".split(";") would otherwise produce [''] (length 1).
        """
        strippedValues = (x.strip() for x in rawVal.split(delim))
        return [x for x in strippedValues if x]

    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count.

        Not implemented yet: this is a placeholder that currently returns None.
        """
        pass

    def PerformValueCountChecks(self, countsOnly = True, outputDir = r"C:\Test_Dir\Amazon"):
        """Performs the initial value count checks between the metadata values and formatted values.

        Every field in recordValuesFieldList is compared per document:
          * equal counts are fine;
          * a different count where one side is empty makes the document a red flag;
          * any other difference makes the document a warning.
        A document can land in both sets when different fields disagree in
        different ways.

        By default only the numbers are printed. Set countsOnly to False to also
        write warnings.txt and redFlags.txt (one DocID per line, sorted) into
        outputDir. The resulting sets are also kept on the instance as
        redFlagDocSet and warningDocSet.
        """
        redFlagDocSet = set()
        warningDocSet = set()

        for docID, metadataRecord in self.metadataValuesDict.items():
            formattedRecord = self.formattedValuesDict[docID]
            for fieldName in self.recordValuesFieldList:
                metadataCount = len(getattr(metadataRecord, fieldName))
                formattedCount = len(getattr(formattedRecord, fieldName))
                if metadataCount == formattedCount:
                    continue
                if metadataCount == 0 or formattedCount == 0:
                    redFlagDocSet.add(docID)
                else:
                    warningDocSet.add(docID)

        self.redFlagDocSet = redFlagDocSet
        self.warningDocSet = warningDocSet

        print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the value counts do not match in at least one of the email fields.")
        if not countsOnly:
            # Sorted so that repeated runs over the same data give identical reports.
            with open(os.path.join(outputDir, "warnings.txt"), 'w') as warningsOutputFile:
                warningsOutputFile.writelines(f"{x}\n" for x in sorted(warningDocSet))
            with open(os.path.join(outputDir, "redFlags.txt"), 'w') as redFlagsOutputFile:
                redFlagsOutputFile.writelines(f"{y}\n" for y in sorted(redFlagDocSet))
107    
108    
if __name__ == '__main__':
    # Ad-hoc code testing: load one export, dump the parsed values of a single
    # sample document, then run the value count checks with report export on.
    cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"

    qcP = QcPrivLog(cleanedDatExportFileName)

    sampleDocID = 'H55278-0268-003517'
    # Metadata values first, then formatted values; iterating a record yields
    # its fields in declaration order (from, to, cc, bcc).
    for valuesDict in (qcP.metadataValuesDict, qcP.formattedValuesDict):
        sampleRecord = valuesDict[sampleDocID]
        for fieldValues in sampleRecord:
            print(fieldValues)

    qcP.PerformValueCountChecks(countsOnly = False)
124    
125