ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PrivLogQC.py
Revision: 833
Committed: Tue Nov 26 19:58:13 2024 UTC (16 months ago) by nino.borges
Content type: text/x-python
Original Path: Python/NinoCode/Active_prgs/Redgrave/Amazon-PrivLogQC.py
File size: 13299 byte(s)
Log Message:
Added support for situations where the doc author is blank because it's an email. This is the version I ran on the newer workspace.

File Contents

# User Rev Content
1 nino.borges 828 """
2    
3     Amazon-PrivLogQC
4    
5     Created by:
6     Emanuel Borges
7     11.19.2024
8    
9     This program will assist with the process of performing QC on the Amazon privilege logs.
10    
11     """
12    
13     import os, re
14     from collections import namedtuple
15    
16    
class QcPrivLog(object):
    """Automates QC of the Amazon privilege logs, including names normalization analysis.

    The cleaned, pipe-delimited export is parsed into two parallel dictionaries
    keyed by DocID: ``metadataValuesDict`` (values from the metadata columns) and
    ``formattedValuesDict`` (values from the matching formatted columns). Every
    entry is a namedtuple whose fields are listed in ``recordValuesFieldList``;
    each field holds the list of individual values found in that column.
    """
    version = '0.6.1'

    ## (field name, metadata column index, formatted column index).
    ## Single source of truth for both the header printout and the row parsing.
    ## TODO: These are hard coded for now but change to column header lookup asap.
    fieldColumnIndexes = (
        ("fromValues", 26, 27),
        ("toValues", 29, 30),
        ("ccValues", 31, 32),
        ("bccValues", 33, 34),
        ("docAuthor", 25, 28),
    )

    ## Directory the report files are written to when the caller does not pass outputDir.
    defaultOutputDir = r"C:\Test_Dir\Amazon"


    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
        """Loads the export file and builds the metadata and formatted value dictionaries.

        cleanedDatExportFileName should be the full path to the file. Assumes the
        first row of the data file is the header, the first column is DocID and
        columns are separated by "|". Whitespace-only rows are skipped.

        Raises IndexError if the file is empty or a data row has fewer columns
        than the indexes in fieldColumnIndexes require.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

        ## Read everything up front, but make sure the handle is closed afterwards.
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        self.cleanedInputDataFileHeader = contents[0]
        ## Strip the line ending so the last header name does not carry a newline.
        self.cleanedInputDataFileHeaderList = self.cleanedInputDataFileHeader.rstrip("\r\n").split("|")
        ## A blank (e.g. trailing) line has no columns to index, so drop those rows.
        contents = [line for line in contents[1:] if line.strip()]
        print (f"There are {len(contents)} rows of data in this input file.\n\n")

        print ("The data structure will be made of following field pairs:")
        headerList = self.cleanedInputDataFileHeaderList
        pairLines = [f"{headerList[metaIdx]} | {headerList[fmtIdx]}"
                     for _fieldName, metaIdx, fmtIdx in self.fieldColumnIndexes]
        print("\n".join(pairLines) + "\n\n")

        RecordValues = namedtuple("RecordValues", [fieldName for fieldName, _metaIdx, _fmtIdx in self.fieldColumnIndexes])
        self.recordValuesFieldList = RecordValues._fields

        for line in contents:
            columns = line.replace("\n","").split("|")
            docID = columns[0]
            self.metadataValuesDict[docID] = RecordValues(
                *[self.__SplitAndClean(columns[metaIdx]) for _fieldName, metaIdx, _fmtIdx in self.fieldColumnIndexes])
            self.formattedValuesDict[docID] = RecordValues(
                *[self.__SplitAndClean(columns[fmtIdx]) for _fieldName, _metaIdx, fmtIdx in self.fieldColumnIndexes])

        print("Data structures created.")


    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace.

        Always returns a list; a blank rawVal gives an empty list.
        """
        if not rawVal:
            return []
        return [x.strip() for x in rawVal.split(delim)]


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count.

        Addresses are compared case-insensitively. A value without any address is
        always counted. A value with addresses is counted once, and only if at
        least one of its addresses has not been seen in an earlier value.
        """
        ## This should ONLY be used for deduplication for counting and not for true deduplication because
        ## which of the duplicate values survives is not chosen by how much information it carries.
        seenEmails = set()
        keptCount = 0
        for item in valuesList:
            itemEmails = {r.upper() for r in re.findall(self.allPossibleEmailAddressesRegExPattern, item)}
            if itemEmails and itemEmails <= seenEmails:
                ## Every address in this value already appeared earlier: duplicate.
                continue
            seenEmails |= itemEmails
            ## Count the value exactly once, even when it holds several new addresses.
            keptCount += 1
        return keptCount


    def __FieldFullValueDedupe(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field using the FULL VALUE. This was created because there appears to be duplicate values in the formatted fields. Returns the deduplicated count."""
        ## Uppercase before comparing because of possible uppercase-lowercase issues. These should all be uppercase but there shouldnt have been dups either...
        return len({item.upper() for item in valuesList})


    def __ClassifyValueCounts(self, docID, fieldName, metadataFieldValues, formattedFieldValues):
        """Pseudo-private method which compares the value counts of one field. Returns "redFlag", "warning" or None when the counts are acceptable."""
        metadataCount = len(metadataFieldValues)
        formattedCount = len(formattedFieldValues)
        if metadataCount == formattedCount:
            return None

        if metadataCount == 0:
            ## Have to account for instances where the meta docAuthor is blank because it's an email and the formatted just has the from value in it.
            if fieldName == 'docAuthor' and self.metadataValuesDict[docID].fromValues:
                return None
            return "redFlag"
        if formattedCount == 0:
            return "redFlag"

        ## try the count again by deduplicating the metadata field values. Never on the formatted field values.
        deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
        if deduplicatedFieldCount == formattedCount:
            return None

        distanceBetween = abs(deduplicatedFieldCount - formattedCount)
        if deduplicatedFieldCount > 30:
            ## Large fields tolerate a difference of up to 10 percent of the deduplicated count.
            allowedDistance = (10 * deduplicatedFieldCount)/100
        else:
            ## Small fields tolerate a difference of up to 2 values.
            allowedDistance = 2
        if distanceBetween > allowedDistance:
            print(docID,fieldName)
            return "redFlag"
        return "warning"


    def __WriteMatrixReport(self, outputFilePath, docMatrix):
        """Pseudo-private method which writes one "docID | (field, ...)" line per document in docMatrix to outputFilePath."""
        with open(outputFilePath,'w') as outputFile:
            for docID, fieldNames in docMatrix.items():
                outputFile.write(f"{docID} | {tuple(fieldNames)}\n")


    def PerformValueCountChecks(self, countsOnly = True, outputDir = None):
        """Performs the initial value count checks between the metadata values and formatted values, looking for red flags and warnings.

        By default only reports numbers. Set countsOnly to False to also export the
        reports warnings.txt, redFlags.txt and dupesInFormattedFields.txt into
        outputDir (defaultOutputDir when outputDir is None). Returns None.
        """
        redFlagDocMatrix = {}
        warningDocMatrix = {}
        duplicatesInFormattedMatrix = {}
        matrixByVerdict = {"redFlag": redFlagDocMatrix, "warning": warningDocMatrix}

        for docID, metadataRecord in self.metadataValuesDict.items():
            formattedRecord = self.formattedValuesDict[docID]
            for fieldName in self.recordValuesFieldList:
                metadataFieldValues = getattr(metadataRecord, fieldName)
                formattedFieldValues = getattr(formattedRecord, fieldName)

                verdict = self.__ClassifyValueCounts(docID, fieldName, metadataFieldValues, formattedFieldValues)
                if verdict is not None:
                    matrixByVerdict[verdict].setdefault(docID, []).append(fieldName)

                ## Perform a separate check for duplicates in the formatted field.
                ## NOTE(review): this is run for every field, independent of the count comparison above - confirm.
                if len(formattedFieldValues) != self.__FieldFullValueDedupe(formattedFieldValues):
                    duplicatesInFormattedMatrix.setdefault(docID, []).append(fieldName)

        print(f"There are a total of {len(redFlagDocMatrix)} red flag documents and {len(warningDocMatrix)} warnings where the matching field value counts that do not match.")
        if not countsOnly:
            if outputDir is None:
                outputDir = self.defaultOutputDir
            self.__WriteMatrixReport(os.path.join(outputDir, "warnings.txt"), warningDocMatrix)
            self.__WriteMatrixReport(os.path.join(outputDir, "redFlags.txt"), redFlagDocMatrix)
            self.__WriteMatrixReport(os.path.join(outputDir, "dupesInFormattedFields.txt"), duplicatesInFormattedMatrix)
216 nino.borges 828
217    
if __name__ == '__main__':
    ## Script entry point: load one cleaned export and write the full set of QC reports.
    ## Earlier inputs, kept for reference:
    ##   r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
    ##   r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\TEST.txt"
    exportFilePath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"

    ## Code Testing
    privLogQc = QcPrivLog(exportFilePath)
    ## To spot check a single document, print its parsed values, e.g.
    ##   print(privLogQc.metadataValuesDict['H55278-0268-003517'].fromValues)
    ##   print(privLogQc.formattedValuesDict['H55278-0268-003517'].fromValues)
    ## (toValues, ccValues and bccValues are available the same way.)

    ## countsOnly = False also exports the warning, red flag and duplicate reports.
    privLogQc.PerformValueCountChecks(countsOnly = False)
235    
236