ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PrivLogQC.py
Revision: 830
Committed: Tue Nov 19 21:44:44 2024 UTC (16 months, 1 week ago) by nino.borges
Content type: text/x-python
Original Path: Python/NinoCode/Active_prgs/Redgrave/Amazon-PrivLogQC.py
File size: 7111 byte(s)
Log Message:
This version supports deduplicating the metadata fields by using unique email addresses.  Any values that do not have an email address at all will stay included, but two values in the same metadata field that share the same email address will be counted as duplicates.  This resulted in a substantial drop in mismatches, so apparently the majority of the mismatches occur where the metadata field contains duplicates.

File Contents

# User Rev Content
1 nino.borges 828 """
2    
3     Amazon-PrivLogQC
4    
5     Created by:
6     Emanuel Borges
7     11.19.2024
8    
9     This program will assist with the process of performing QC on the Amazon privilege logs.
10    
11     """
12    
13     import os, re
14     from collections import namedtuple
15    
16    
class QcPrivLog(object):
    """Automates QC of the Amazon privilege logs, including names normalization analysis.

    The input is a pipe-delimited text export whose first row is a header and
    whose first column is the DocID.  For every document the raw "metadata"
    from/to/cc/bcc values and their "formatted" counterparts are loaded, so the
    number of values in each pair of fields can be compared.
    """
    version = '0.3.0'


    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8',
                 metadataColumns = (30, 34, 36, 38), formattedColumns = (31, 35, 37, 39)):
        """Loads the export file into the metadata and formatted value dictionaries.

        cleanedDatExportFileName -- full path to the pipe-delimited export. The
            first row is treated as the header and the first column as the DocID.
        fileEncoding -- text encoding used to read the file.
        metadataColumns -- zero-based column indexes of the raw from, to, cc and
            bcc fields, in that order.
        formattedColumns -- zero-based column indexes of the formatted from, to,
            cc and bcc fields, in that order.

        Raises IndexError if a data row has fewer columns than the indexes require.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

        ## The context manager guarantees the handle is closed even if reading fails.
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        ## An empty file has no header row; fall back to an empty header instead of crashing.
        self.cleanedInputDataFileHeader = contents[0] if contents else ""
        contents = contents[1:]
        print (f"There are {len(contents)} rows of data in this input file.")

        RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues")
        self.recordValuesFieldList = RecordValues._fields

        for line in contents:
            line = line.rstrip("\n")
            if not line.strip():
                ## Blank rows (for example a trailing newline at the end of the file) carry no record.
                continue
            columns = line.split("|")
            docID = columns[0]
            self.metadataValuesDict[docID] = RecordValues(*[self.__SplitAndClean(columns[i]) for i in metadataColumns])
            self.formattedValuesDict[docID] = RecordValues(*[self.__SplitAndClean(columns[i]) for i in formattedColumns])

        print("Data structures created.")


    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which splits a raw string on delim and strips whitespace from each value.

        Empty values are dropped, so a blank field yields an empty list.  Without
        this, a blank field would produce [''] and would be counted as one value,
        which would hide fields that are genuinely empty.
        """
        return [value for value in (part.strip() for part in rawVal.split(delim)) if value]


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which returns the number of values left after deduplicating by email address.

        Email addresses are compared case-insensitively.  A value with no email
        address is always counted.  A value with one or more addresses is counted
        once if at least one of its addresses has not been seen earlier in the list.
        """
        ## This should ONLY be used for counting and not for true deduplication, because the
        ## first value carrying an address wins even when a later duplicate has more information.
        seenEmails = set()
        uniqueCount = 0
        for item in valuesList:
            itemEmails = {address.upper() for address in re.findall(self.allPossibleEmailAddressesRegExPattern, item)}
            if not itemEmails:
                uniqueCount += 1
                continue
            unseenEmails = itemEmails - seenEmails
            if unseenEmails:
                ## Count the value once, no matter how many new addresses it contributes.
                uniqueCount += 1
                seenEmails |= unseenEmails
        return uniqueCount


    def PerformValueCountChecks(self, countsOnly = True, outputDir = r"C:\Test_Dir\Amazon"):
        """Compares the number of metadata values to the number of formatted values in every field.

        A document is a red flag when the counts differ and one side of the field
        is empty.  It is a warning when the counts differ, neither side is empty,
        and the counts still differ after the metadata values are deduplicated by
        email address.  A summary is always printed.

        countsOnly -- when false, also writes warnings.txt and redFlags.txt (one
            DocID per line, sorted) into outputDir.
        outputDir -- existing directory that receives the two report files.
        """
        redFlagDocSet = set()
        warningDocSet = set()

        for docID, metadataRecord in self.metadataValuesDict.items():
            formattedRecord = self.formattedValuesDict[docID]
            for fieldName in self.recordValuesFieldList:
                metadataFieldValues = getattr(metadataRecord, fieldName)
                formattedFieldValues = getattr(formattedRecord, fieldName)
                if len(metadataFieldValues) == len(formattedFieldValues):
                    continue
                if not metadataFieldValues or not formattedFieldValues:
                    redFlagDocSet.add(docID)
                ## Try the count again by deduplicating the metadata field values. Never the formatted field values.
                elif self.__FieldDedupeByEmailAddress(metadataFieldValues) != len(formattedFieldValues):
                    warningDocSet.add(docID)

        print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the matching field value counts that do not match.")
        if not countsOnly:
            ## Sorted output keeps the reports stable from run to run.
            with open(os.path.join(outputDir, "warnings.txt"),'w') as warningsOutputFile:
                warningsOutputFile.writelines(f"{docID}\n" for docID in sorted(warningDocSet))
            with open(os.path.join(outputDir, "redFlags.txt"),'w') as redFlagsOutputFile:
                redFlagsOutputFile.writelines(f"{docID}\n" for docID in sorted(redFlagDocSet))
131    
132    
if __name__ == '__main__':
    # Full path to the cleaned privilege log export that will be checked.
    cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"

    ## Code Testing
    qcP = QcPrivLog(cleanedDatExportFileName)

    # Spot check one known document: print its raw metadata values first and
    # its formatted values second, each in from / to / cc / bcc order.
    sampleDocID = 'H55278-0268-003517'
    for valuesDict in (qcP.metadataValuesDict, qcP.formattedValuesDict):
        sampleRecord = valuesDict[sampleDocID]
        for fieldValues in (sampleRecord.fromValues, sampleRecord.toValues, sampleRecord.ccValues, sampleRecord.bccValues):
            print(fieldValues)

    # Run the count comparison and export the report files as well.
    qcP.PerformValueCountChecks(countsOnly = False)
148    
149