ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PrivLogQC.py
Revision: 852
Committed: Thu Dec 12 16:16:23 2024 UTC (15 months, 2 weeks ago) by nino.borges
Content type: text/x-python
File size: 16462 byte(s)
Log Message:
This version adds support for the additionalValuesDict, adding the date value and the legal sources values.

File Contents

# User Rev Content
1 nino.borges 828 """
2    
3     Amazon-PrivLogQC
4    
5     Created by:
6     Emanuel Borges
7     11.19.2024
8    
9     This program will assist with the process of performing QC on the Amazon privilege logs.
10    
11     """
12    
13     import os, re
14     from collections import namedtuple
15 nino.borges 851 from MyCode.Tool_Box import FileEncodingLib
16 nino.borges 828
17    
class QcPrivLog(object):
    """Automates QC of the Amazon privilege logs, including names normalization analysis.

    The input is a pipe-delimited ("|") text export whose first row is the header and
    whose first column is the DocID.  For every document three records are kept:

    * ``metadataValuesDict[docID]``   - raw metadata from/to/cc/bcc/author values
    * ``formattedValuesDict[docID]``  - the formatted (normalized) counterparts
    * ``additionalValuesDict[docID]`` - the date value and the legal source values

    The column positions below are 0-based indexes into each row and describe the
    VEAS-CAAG export layout.  Override them (e.g. in a subclass) for other layouts.
    """
    version = '0.7.1'

    ## TODO: These are hard coded for now but change to column header lookup asap.
    # Order matches the RecordValues fields: from, to, cc, bcc, docAuthor.
    metadataColumns = (26, 29, 31, 33, 25)
    formattedColumns = (27, 30, 32, 34, 28)
    dateColumn = 15
    legalSourceColumn = 40
    # Column layouts that were used for other exports (metadata | formatted):
    #   CAAG:        (30, 34, 36, 38, 29) | (31, 35, 37, 39, 32)
    #   FTC-CID:     (6, 7, 8, 9, 10)     | (3, 5, 2, 4, 11)
    #   VEAS_custom: (2, 3, 4, 5, 1)      | (6, 8, 9, 10, 6)
    #   CAAG_custom: (3, 4, 5, 6, 2)      | (7, 8, 9, 10, 7)

    # Count-mismatch thresholds used by PerformValueCountChecks.
    largeFieldThreshold = 30         # fields with more values than this use the percentage rule
    largeFieldTolerancePercent = 10  # allowed distance, as a percent of the deduplicated count
    smallFieldTolerance = 2          # allowed absolute distance for smaller fields

    # Directory the report files are written to when no outputDir is given.
    defaultOutputDir = r"C:\Test_Dir\Amazon"


    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8'):
        """Loads the export and builds the per-document data structures.

        cleanedDatExportFileName should be the full path to the file.  Assumes the first
        row of the data file is the header and the first column is DocID.  Completely
        blank rows are skipped.  A non-blank row with fewer columns than the configured
        layout needs raises IndexError.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}
        self.additionalValuesDict = {}
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

        # Context manager so the input handle is closed even if reading fails.
        with open(cleanedDatExportFileName, encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        self.cleanedInputDataFileHeader = contents[0]
        self.cleanedInputDataFileHeaderList = self.cleanedInputDataFileHeader.split("|")
        contents = contents[1:]
        print (f"There are {len(contents)} rows of data in this input file.\n\n")

        headerList = self.cleanedInputDataFileHeaderList
        print("The data structure will be made of following field pairs:")
        pairLines = [f"{headerList[metaCol]} | {headerList[fmtCol]}"
                     for metaCol, fmtCol in zip(self.metadataColumns, self.formattedColumns)]
        print("\n".join(pairLines) + "\n\n")
        print(f"{headerList[self.dateColumn]} will be used for the date qualifier.")
        print(f"{headerList[self.legalSourceColumn]} will be used as the Legal Source field.")

        RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues docAuthor")
        self.recordValuesFieldList = RecordValues._fields

        AdditionalValues = namedtuple("AdditionalValues","dateValue legalSourceValues")
        self.additionalValuesFieldList = AdditionalValues._fields

        for line in contents:
            # A blank (often trailing) line has no columns to index; skip it instead of crashing.
            if not line.strip():
                continue
            line = line.replace("\n","").split("|")
            docID = line[0]
            self.metadataValuesDict[docID] = RecordValues(
                *[self.__SplitAndClean(line[col]) for col in self.metadataColumns])
            self.formattedValuesDict[docID] = RecordValues(
                *[self.__SplitAndClean(line[col]) for col in self.formattedColumns])
            # The date is kept as the raw string; only the legal sources are split.
            self.additionalValuesDict[docID] = AdditionalValues(
                line[self.dateColumn], self.__SplitAndClean(line[self.legalSourceColumn]))

        print("Data structures created.")


    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace.

        An empty rawVal returns "" (an empty string, not an empty list).  Both have a
        length of 0 and are falsy, which is all the checks in this class rely on; the
        empty string is kept so existing callers see the same value as before.
        """
        if rawVal:
            return [x.strip() for x in rawVal.split(delim)]
        return ""


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count.

        Matching is case insensitive.  A value without any email address can not be
        deduplicated and always counts.  A value counts once when it carries at least
        one address that has not been seen yet, no matter how many addresses it holds.
        """
        ## This should ONLY be used for deduplication for counting and not for true deduplication because it does not
        ## choose which of the duplicate values to keep, and sometimes the dropped one would be the value with more information.
        seenEmails = set()
        uniqueCount = 0
        for item in valuesList:
            itemEmails = {email.upper() for email in re.findall(self.allPossibleEmailAddressesRegExPattern, item)}
            # No address at all, or at least one address not seen before -> a distinct value.
            if not itemEmails or not itemEmails <= seenEmails:
                uniqueCount += 1
            seenEmails |= itemEmails
        return uniqueCount


    def __FieldFullValueDedupe(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field using the FULL VALUE. This was created because there appears to be duplicate values in the formatted fields.  Returns the deduplicated count."""
        ## Compared in uppercase because of possible uppercase-lowercase issues. These should all be uppercase but there shouldnt have been dups either...
        return len({item.upper() for item in valuesList})


    def __ClassifyCountMismatch(self, docID, fieldName, metadataFieldValues, formattedFieldValues):
        """Pseudo-private method which compares the value counts of one field.  Returns 'redFlag', 'warning' or None when the counts are acceptable."""
        metadataCount = len(metadataFieldValues)
        formattedCount = len(formattedFieldValues)
        if metadataCount == formattedCount:
            return None
        if metadataCount == 0:
            ## Have to account for instances where the meta docAuthor is blank because it's an email and the formatted just has the from value in it.
            if fieldName == 'docAuthor' and self.metadataValuesDict[docID].fromValues:
                return None
            return 'redFlag'
        if formattedCount == 0:
            return 'redFlag'

        ## try the count again by deduplicating the metadata field values. Never on the formatted field values.
        deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
        distanceBetween = abs(deduplicatedFieldCount - formattedCount)
        if distanceBetween == 0:
            return None
        if deduplicatedFieldCount > self.largeFieldThreshold:
            allowedDistance = (self.largeFieldTolerancePercent * deduplicatedFieldCount) / 100
        else:
            allowedDistance = self.smallFieldTolerance
        if distanceBetween > allowedDistance:
            print(docID,fieldName)
            return 'redFlag'
        return 'warning'


    def PerformValueCountChecks(self, countsOnly = True, outputDir = None):
        """Performs the initial value count checks between the metadata values and formatted values, looking for red flags and warnings.

        By default only reports the numbers.  Set countsOnly to False to also export the
        reports (warnings.txt, redFlags.txt and dupesInFormattedFields.txt), one
        "docID | (fieldName, ...)" line per document.  outputDir is the directory the
        reports are written to; when omitted, defaultOutputDir is used.  The directory
        must already exist.
        """
        redFlagDocMatrix = {}
        warningDocMatrix = {}
        duplicatesInFormattedMatrix = {}
        matrixBySeverity = {'redFlag': redFlagDocMatrix, 'warning': warningDocMatrix}

        for docID in self.metadataValuesDict:
            metadataRecord = self.metadataValuesDict[docID]
            formattedRecord = self.formattedValuesDict[docID]
            for fieldName in self.recordValuesFieldList:
                metadataFieldValues = getattr(metadataRecord, fieldName)
                formattedFieldValues = getattr(formattedRecord, fieldName)

                severity = self.__ClassifyCountMismatch(docID, fieldName, metadataFieldValues, formattedFieldValues)
                if severity is not None:
                    matrixBySeverity[severity].setdefault(docID, []).append(fieldName)

                ## Perform a separate check for duplicates in the formatted field.
                ## NOTE(review): assumed to run for every field regardless of the count check result - confirm.
                if len(formattedFieldValues) != self.__FieldFullValueDedupe(formattedFieldValues):
                    duplicatesInFormattedMatrix.setdefault(docID, []).append(fieldName)

        # Every flagged document has exactly one matrix entry, so the matrix sizes are the document counts.
        print(f"There are a total of {len(redFlagDocMatrix)} red flag documents and {len(warningDocMatrix)} warnings where the matching field value counts that do not match.")
        if not countsOnly:
            if outputDir is None:
                outputDir = self.defaultOutputDir
            reports = (("warnings.txt", warningDocMatrix),
                       ("redFlags.txt", redFlagDocMatrix),
                       ("dupesInFormattedFields.txt", duplicatesInFormattedMatrix))
            for reportFileName, matrix in reports:
                # Context manager so each report is closed even if a write fails.
                with open(os.path.join(outputDir, reportFileName),'w') as reportFile:
                    reportFile.writelines(f"{flaggedDocID} | {tuple(fieldNames)}\n"
                                          for flaggedDocID, fieldNames in matrix.items())
245 nino.borges 828
246    
if __name__ == '__main__':
    # Export to run the QC against.  The other exports used while developing are
    # kept below (commented out) so they can be swapped in quickly.
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\CAAG\CAAG_Log_Data_Export_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\TEST.txt"

    ## Code Testing
    # Load the export, then run the count checks and write the three report files.
    qcP = QcPrivLog(cleanedDatExportFileName)
    # Spot-check a single document while debugging, e.g.:
    ##    print(qcP.metadataValuesDict['H55278-0268-003517'].fromValues)
    ##    print(qcP.formattedValuesDict['H55278-0268-003517'].fromValues)
    qcP.PerformValueCountChecks(countsOnly = False)
267    
268