ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PrivLogQC.py
Revision: 839
Committed: Fri Dec 6 21:44:52 2024 UTC (15 months, 2 weeks ago) by nino.borges
Content type: text/x-python
File size: 15703 byte(s)
Log Message:
No significant changes, just updated to run on a different dat file, so I had to change the field indexes.

File Contents

# User Rev Content
1 nino.borges 828 """
2    
3     Amazon-PrivLogQC
4    
5     Created by:
6     Emanuel Borges
7     11.19.2024
8    
9     This program will assist with the process of performing QC on the Amazon privilege logs.
10    
11     """
12    
13     import os, re
14     from collections import namedtuple
15    
16    
class QcPrivLog(object):
    """Automates QC of the Amazon privilege logs, including names normalization analysis.

    The input is a pipe-delimited text export whose first row is the header and
    whose first column is the DocID.  For each document, five "metadata" fields
    are paired with five "formatted" fields (from, to, cc, bcc, author) and the
    number of semicolon-delimited values in each pair is compared.
    """

    version = '0.6.1'

    # Field names of the per-document record, in the order the column tuples use.
    RECORD_FIELDS = "fromValues toValues ccValues bccValues docAuthor"

    # Known input layouts: name -> (metadata column indexes, formatted column indexes).
    # Each tuple is ordered fromValues, toValues, ccValues, bccValues, docAuthor.
    # These were previously kept as commented-out alternatives inside __init__.
    COLUMN_LAYOUTS = {
        "CAAG": ((30, 34, 36, 38, 29), (31, 35, 37, 39, 32)),
        "VEAS-CAAG": ((26, 29, 31, 33, 25), (27, 30, 32, 34, 28)),
        "FTC-CID": ((6, 7, 8, 9, 10), (3, 5, 2, 4, 11)),
        "VEAS_custom": ((2, 3, 4, 5, 1), (6, 8, 9, 10, 6)),
        "CAAG_custom": ((3, 4, 5, 6, 2), (7, 8, 9, 10, 7)),
    }
    # Layout used when the caller does not pass explicit column indexes.
    DEFAULT_COLUMN_LAYOUT = "VEAS_custom"

    # Thresholds used once a count mismatch survives email deduplication.
    LARGE_FIELD_THRESHOLD = 30          # above this many values, tolerance is a percentage
    LARGE_FIELD_TOLERANCE_PERCENT = 10  # allowed difference (percent) for large fields
    SMALL_FIELD_TOLERANCE = 2           # allowed absolute difference for small fields

    def __init__(self, cleanedDatExportFileName, fileEncoding='UTF8',
                 metadataColumns=None, formattedColumns=None):
        """Load the export file and build the metadata / formatted value dictionaries.

        Args:
            cleanedDatExportFileName: Full path to the pipe-delimited export.  The
                first row must be the header and the first column the DocID.
            fileEncoding: Text encoding used to read the file.
            metadataColumns: Optional 5-item sequence of column indexes for the
                metadata from / to / cc / bcc / author fields.  Defaults to the
                "VEAS_custom" layout.
            formattedColumns: Optional 5-item sequence of column indexes for the
                matching formatted fields.  Defaults to the "VEAS_custom" layout.

        Raises:
            ValueError: If a supplied column sequence does not have one index per
                record field.
            IndexError: If the file is empty, or a row (or the header) has fewer
                columns than the configured indexes require.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

        defaultMetadata, defaultFormatted = self.COLUMN_LAYOUTS[self.DEFAULT_COLUMN_LAYOUT]
        metadataColumns = tuple(defaultMetadata if metadataColumns is None else metadataColumns)
        formattedColumns = tuple(defaultFormatted if formattedColumns is None else formattedColumns)

        RecordValues = namedtuple("RecordValues", self.RECORD_FIELDS)
        self.recordValuesFieldList = RecordValues._fields
        expectedCount = len(self.recordValuesFieldList)
        if len(metadataColumns) != expectedCount or len(formattedColumns) != expectedCount:
            raise ValueError(
                f"metadataColumns and formattedColumns must each contain {expectedCount} column indexes."
            )

        # Context manager so the file handle is released even if reading fails.
        with open(cleanedDatExportFileName, encoding=fileEncoding) as inputFile:
            contents = inputFile.readlines()

        self.cleanedInputDataFileHeader = contents[0]
        # Strip the line terminator so the last header name does not carry a "\n".
        self.cleanedInputDataFileHeaderList = self.cleanedInputDataFileHeader.rstrip("\n").split("|")
        # Ignore blank lines (e.g. a trailing empty line) instead of failing on them.
        contents = [row for row in contents[1:] if row.strip()]
        print (f"There are {len(contents)} rows of data in this input file.\n\n")

        print (f"The data structure will be made of following field pairs:")
        header = self.cleanedInputDataFileHeaderList
        pairLines = [f"{header[metaIdx]} | {header[fmtIdx]}"
                     for metaIdx, fmtIdx in zip(metadataColumns, formattedColumns)]
        print("\n".join(pairLines) + "\n\n")

        for row in contents:
            columns = row.rstrip("\n").split("|")
            docID = columns[0]
            self.metadataValuesDict[docID] = RecordValues(
                *(self.__SplitAndClean(columns[idx]) for idx in metadataColumns))
            self.formattedValuesDict[docID] = RecordValues(
                *(self.__SplitAndClean(columns[idx]) for idx in formattedColumns))

        print("Data structures created.")

    def __SplitAndClean(self, rawVal, delim=";"):
        """Pseudo-private method which splits a raw string into a list of stripped values.

        Always returns a list (empty for empty input).  Tokens that are empty after
        stripping are dropped, so a trailing or doubled delimiter does not inflate
        the value count used by the QC comparisons.
        """
        if not rawVal:
            return []
        stripped = (part.strip() for part in rawVal.split(delim))
        return [part for part in stripped if part]

    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which counts the values in a field after deduplicating by email address.

        A value with no email address always counts.  A value with addresses counts
        once, and only if at least one of its addresses has not been seen earlier in
        the list (comparison is case-insensitive).  Returns the deduplicated count.
        """
        ## This should ONLY be used for deduplication for counting and not for true deduplication,
        ## because which of the duplicate values survives is arbitrary and it may be the one with less information.
        seenAddresses = set()
        uniqueCount = 0
        for item in valuesList:
            addresses = {match.upper()
                         for match in re.findall(self.allPossibleEmailAddressesRegExPattern, item)}
            # Count the value once, even when it carries several new addresses.
            if not addresses or not addresses <= seenAddresses:
                uniqueCount += 1
            seenAddresses |= addresses
        return uniqueCount

    def __FieldFullValueDedupe(self, valuesList):
        """Pseudo-private method which returns the number of distinct values, compared case-insensitively on the FULL VALUE.

        This was created because there appear to be duplicate values in the formatted fields.
        """
        return len({item.upper() for item in valuesList})

    @staticmethod
    def __AddFlag(matrix, docID, fieldName):
        """Record fieldName against docID in a {docID: [fieldName, ...]} matrix."""
        matrix.setdefault(docID, []).append(fieldName)

    def __ClassifyCountMismatch(self, docID, fieldName, metadataFieldValues, formattedFieldValues):
        """Compare the value counts of one field pair.

        Returns None when the counts are acceptable, "redFlag" or "warning" otherwise.
        """
        metadataCount = len(metadataFieldValues)
        formattedCount = len(formattedFieldValues)
        if metadataCount == formattedCount:
            return None

        if metadataCount == 0:
            ## Have to account for instances where the meta docAuthor is blank because it's an email
            ## and the formatted just has the from value in it.
            if fieldName == 'docAuthor' and self.metadataValuesDict[docID].fromValues:
                return None
            return "redFlag"
        if formattedCount == 0:
            return "redFlag"

        ## Try the count again by deduplicating the metadata field values. Never on the formatted field values.
        deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
        if deduplicatedFieldCount == formattedCount:
            return None

        distanceBetween = abs(deduplicatedFieldCount - formattedCount)
        if deduplicatedFieldCount > self.LARGE_FIELD_THRESHOLD:
            allowedDistance = (self.LARGE_FIELD_TOLERANCE_PERCENT * deduplicatedFieldCount) / 100
        else:
            allowedDistance = self.SMALL_FIELD_TOLERANCE
        if distanceBetween > allowedDistance:
            print(docID, fieldName)
            return "redFlag"
        return "warning"

    def PerformValueCountChecks(self, countsOnly=True, outputDir=r"C:\Test_Dir\Amazon"):
        """Performs the initial value count checks between the metadata values and formatted values.

        Looks for red flags and warnings, and separately for duplicate values inside
        the formatted fields.  Always prints the red flag / warning document totals.

        Args:
            countsOnly: When true (default) only the totals are printed.  Set to
                False to also write the report files.
            outputDir: Directory that receives warnings.txt, redFlags.txt and
                dupesInFormattedFields.txt when countsOnly is false.  It must
                already exist.
        """
        redFlagDocMatrix = {}
        warningDocMatrix = {}
        duplicatesInFormattedMatrix = {}

        for docID, metadataRecord in self.metadataValuesDict.items():
            formattedRecord = self.formattedValuesDict[docID]
            for fieldName in self.recordValuesFieldList:
                metadataFieldValues = getattr(metadataRecord, fieldName)
                formattedFieldValues = getattr(formattedRecord, fieldName)

                severity = self.__ClassifyCountMismatch(
                    docID, fieldName, metadataFieldValues, formattedFieldValues)
                if severity == "redFlag":
                    self.__AddFlag(redFlagDocMatrix, docID, fieldName)
                elif severity == "warning":
                    self.__AddFlag(warningDocMatrix, docID, fieldName)

                ## Perform a separate check for duplicates in the formatted field.
                if len(formattedFieldValues) != self.__FieldFullValueDedupe(formattedFieldValues):
                    self.__AddFlag(duplicatesInFormattedMatrix, docID, fieldName)

        # Each matrix has exactly one key per flagged document, so its length is the document count.
        print(f"There are a total of {len(redFlagDocMatrix)} red flag documents and {len(warningDocMatrix)} warnings where the matching field value counts that do not match.")

        if not countsOnly:
            reports = (
                ("warnings.txt", warningDocMatrix),
                ("redFlags.txt", redFlagDocMatrix),
                ("dupesInFormattedFields.txt", duplicatesInFormattedMatrix),
            )
            for reportFileName, matrix in reports:
                # Context manager guarantees each report is flushed and closed.
                with open(os.path.join(outputDir, reportFileName), 'w') as reportFile:
                    reportFile.writelines(
                        f"{docID} | {*fieldNames,}\n" for docID, fieldNames in matrix.items())
234 nino.borges 828
235    
if __name__ == '__main__':
    ## Input files used for earlier runs (swap into the constructor call below to re-run them):
    ##   r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
    ##   r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
    ##   r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
    ##   r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\TEST.txt"
    ## NOTE(review): this path points at a CAAG export while the column indexes active in
    ## QcPrivLog.__init__ are the ones labelled "VEAS_custom" -- confirm they match this file.

    ## Code Testing: load the export, then run the count checks and write the report files.
    qcP = QcPrivLog(
        r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\CAAG\CAAG_Log_Data_Export_Converted.txt"
    )

    ## Spot-check a single document by printing its parsed values, e.g.:
    ##   print(qcP.metadataValuesDict['H55278-0268-003517'].fromValues)
    ##   print(qcP.formattedValuesDict['H55278-0268-003517'].fromValues)
    ## (likewise for .toValues, .ccValues and .bccValues)

    qcP.PerformValueCountChecks(countsOnly = False)
255    
256