ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon-PrivLogQC.py
Revision: 831
Committed: Tue Nov 19 22:25:10 2024 UTC (16 months ago) by nino.borges
Content type: text/x-python
File size: 8357 byte(s)
Log Message:
Added the Author metadata and formatted fields to the check.  This increased the unmatched count.

File Contents

# Content
1 """
2
3 Amazon-PrivLogQC
4
5 Created by:
6 Emanuel Borges
7 11.19.2024
8
9 This program will assist with the process of performing QC on the Amazon privilege logs.
10
11 """
12
13 import os, re
14 from collections import namedtuple
15
16
class QcPrivLog(object):
    """Automates QC of the Amazon privilege logs, including names normalization analysis.

    The constructor loads a pipe-delimited export into two dictionaries keyed by DocID:
    metadataValuesDict (the raw metadata values) and formattedValuesDict (the formatted
    values). Each entry is a RecordValues namedtuple whose fields are lists of strings.
    """
    version = '0.4.0'

    ## Directory that PerformValueCountChecks writes its reports to when no outputDir is given.
    defaultOutputDir = r"C:\Test_Dir\Amazon"

    ## TODO: These are hard coded for now but change to column header lookup asap.
    ## Column positions, in RecordValues field order: from, to, cc, bcc, author.
    _metadataColumnIndexes = (30, 34, 36, 38, 29)
    _formattedColumnIndexes = (31, 35, 37, 39, 32)


    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
        """Initializes the data structures.

        cleanedDatExportFileName should be the full path to the file and fileEncoding the
        encoding used to read it. Assumes the first row of the data file is the header,
        the first column is DocID and columns are separated by "|". Blank rows are skipped.
        Raises IndexError if the file is empty or a data row has too few columns.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

        ## Use a context manager so the file handle is closed even if reading fails.
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        self.cleanedInputDataFileHeader = contents[0]
        contents = contents[1:]
        print (f"There are {len(contents)} rows of data in this input file.")

        RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues docAuthor")
        self.recordValuesFieldList = RecordValues._fields

        for line in contents:
            line = line.rstrip("\n")
            if not line.strip():
                ## A blank row (for example a trailing empty line) carries no record.
                continue
            columns = line.split("|")
            docID = columns[0]
            self.metadataValuesDict[docID] = RecordValues(
                *[self.__SplitAndClean(columns[i]) for i in self._metadataColumnIndexes])
            self.formattedValuesDict[docID] = RecordValues(
                *[self.__SplitAndClean(columns[i]) for i in self._formattedColumnIndexes])

        print("Data structures created.")


    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which splits a raw string on delim into a list, removing leading and trailing whitespace.

        Empty entries are dropped, so an empty field yields an empty list and trailing or
        doubled delimiters do not inflate the value count.
        """
        stripped = (part.strip() for part in rawVal.split(delim))
        return [value for value in stripped if value]


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which counts the values of a specific field after deduplicating by email address.

        Email addresses are compared case-insensitively. Every address not seen before adds
        one to the count; a value containing no email address always adds one. Returns the
        deduplicated count.
        """
        ## This should ONLY be used for deduplication for counting and not for true deduplication,
        ## because it does not choose which of the duplicate values (possibly the one with more information) survives.
        seenEmails = set()
        dedupedCount = 0
        for item in valuesList:
            emails = re.findall(self.allPossibleEmailAddressesRegExPattern, item)
            if not emails:
                dedupedCount += 1
                continue
            for email in emails:
                key = email.upper()
                if key not in seenEmails:
                    seenEmails.add(key)
                    dedupedCount += 1
        return dedupedCount


    def __FieldFullValueDedupe(self, valuesList):
        """Pseudo-private method which returns the number of distinct values in a field, comparing the FULL VALUE case-insensitively.

        This was created because there appear to be duplicate values in the formatted fields.
        """
        ## These should all be uppercase already, but there should not have been dups either, so normalize the case.
        return len({item.upper() for item in valuesList})


    def PerformValueCountChecks(self, countsOnly = True, outputDir = None):
        """Performs the initial value count checks between the metadata values and formatted values.

        For every document and every field:
          * red flag - the counts differ and one of the two sides is empty;
          * warning  - the counts differ even after deduplicating the metadata values by email address;
          * duplicate - the formatted field contains the same value more than once (ignoring case).

        By default only the red flag and warning totals are printed. Set countsOnly to False
        to also write warnings.txt, redFlags.txt and dupesInFormattedFields.txt (one DocID per
        line, sorted) into outputDir, which defaults to defaultOutputDir.
        """
        if outputDir is None:
            outputDir = self.defaultOutputDir

        redFlagDocSet = set()
        warningDocSet = set()
        duplicatesInFormattedSet = set()

        for docID, metadataRecord in self.metadataValuesDict.items():
            formattedRecord = self.formattedValuesDict[docID]
            for fieldName in self.recordValuesFieldList:
                metadataFieldValues = getattr(metadataRecord, fieldName)
                formattedFieldValues = getattr(formattedRecord, fieldName)

                if len(metadataFieldValues) != len(formattedFieldValues):
                    if not metadataFieldValues or not formattedFieldValues:
                        redFlagDocSet.add(docID)
                    ## try the count again by deduplicating the metadata field values. Never on the formatted field values.
                    elif self.__FieldDedupeByEmailAddress(metadataFieldValues) != len(formattedFieldValues):
                        warningDocSet.add(docID)

                ## Perform a separate check for duplicates in the formatted field.
                if len(formattedFieldValues) != self.__FieldFullValueDedupe(formattedFieldValues):
                    duplicatesInFormattedSet.add(docID)

        print(f"There are a total of {len(redFlagDocSet)} red flag documents and {len(warningDocSet)} warnings where the matching field value counts that do not match.")
        if not countsOnly:
            reports = (
                ("warnings.txt", warningDocSet),
                ("redFlags.txt", redFlagDocSet),
                ("dupesInFormattedFields.txt", duplicatesInFormattedSet),
            )
            for fileName, docSet in reports:
                ## Sorted so the reports are stable from run to run.
                with open(os.path.join(outputDir, fileName), 'w') as outputFile:
                    outputFile.writelines(f"{reportedDocID}\n" for reportedDocID in sorted(docSet))
147
148
if __name__ == '__main__':
    cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"

    ## Code Testing
    qcP = QcPrivLog(cleanedDatExportFileName)

    ## Spot check one document: print the from/to/cc/bcc values, first from the
    ## metadata dictionary and then from the formatted dictionary.
    sampleDocID = 'H55278-0268-003517'
    spotCheckFields = ("fromValues", "toValues", "ccValues", "bccValues")
    for valuesDict in (qcP.metadataValuesDict, qcP.formattedValuesDict):
        for spotCheckField in spotCheckFields:
            print(getattr(valuesDict[sampleDocID], spotCheckField))

    ## Run the count checks and export the report files.
    qcP.PerformValueCountChecks(countsOnly = False)
164
165