ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/Amazon_PrivLogQC.py
Revision: 924
Committed: Thu Aug 7 20:28:36 2025 UTC (7 months, 2 weeks ago) by nino.borges
Content type: text/x-python
File size: 27090 byte(s)
Log Message:
use changes and additional reporting.

File Contents

# User Rev Content
1 nino.borges 828 """
2    
3     Amazon-PrivLogQC
4    
5     Created by:
6     Emanuel Borges
7     11.19.2024
8    
9     This program will assist with the process of performing QC on the Amazon privilege logs.
10    
11     """
12    
13     import os, re
14     from collections import namedtuple
15 nino.borges 851 from MyCode.Tool_Box import FileEncodingLib
16 nino.borges 828
17    
class QcPrivLog(object):
    """A class for automating the process of performing QC on the Amazon privilege logs, including names normalization analysis.

    The input is a pipe-delimited export whose first row is the header and whose first
    column is the DocID.  Each multi-value field is a ";" delimited string.  For every
    document the raw metadata values and the normalized ("formatted") values of the
    from / to / cc / bcc / author fields are loaded side by side so that their value
    counts can be compared.
    """
    version = '0.8.0'

    ## Record shapes used by the three lookup dictionaries.
    _RecordValues = namedtuple("RecordValues","fromValues toValues ccValues bccValues docAuthor")
    _AdditionalValues = namedtuple("AdditionalValues","dateValue legalSourceValues")

    ## Zero-based column positions of the active export layout (FTC-Retail PLOG).
    ## Both 5-tuples are ordered (from, to, cc, bcc, docAuthor) to line up with RecordValues.
    ## TODO: replace the positions with a column header lookup.
    defaultMetadataColumns = (29, 32, 35, 38, 27)
    defaultFormattedColumns = (31, 34, 37, 40, 28)
    defaultDateColumn = 15
    defaultLegalSourceColumn = 47

    ## Positions that were used for earlier exports.  They are transcribed from the retired
    ## code paths and have not been re-verified, so check them against the header print out
    ## before relying on them.
    ##   layout         metadata (from,to,cc,bcc,author) / formatted (from,to,cc,bcc,author) / date / legal source
    ##   CAAG_custom 2  (4, 5, 6, 7, 8)      / (14, 10, 11, 12, 9)  / 16 / 15
    ##   Floyd          (30, 34, 36, 38, 29) / (31, 35, 37, 39, 32) / 15 / 45
    ##                  (the old Floyd header print out listed 23|24, 25|26, 27|28, 29|20, 29|32, date 12, legal 36 instead)
    ##   CAAG           (30, 34, 36, 38, 29) / (31, 35, 37, 39, 32) / 15 / 45
    ##   VEAS-CAAG      (26, 29, 31, 33, 25) / (27, 30, 32, 34, 28) / 15 / 40
    ##   VEAS-CAAG 2    (11, 12, 13, 14, 15) / (5, 6, 7, 8, 4)      / 1  / 9
    ##   FTC-CID        (6, 7, 8, 9, 10)     / (3, 5, 2, 4, 11)     / date and legal source were not loaded
    ##   VEAS_custom    (35, 38, 40, 42, 34) / (36, 61, 62, 63, 37) / 21 / 49
    ##   CAAG_custom    (3, 4, 5, 6, 2)      / (7, 8, 9, 10, 7)     / 21 / 49
    ##   FTC-Retail     (39, 41, 43, 45, 37) / (40, 42, 44, 46, 38) / 27 / 52   (header print out only)
    ##   FTC-CID 2      (22, 26, 28, 30, 21) / (25, 27, 29, 31, 24) / 8  / 20
    ## The Floyd, CAAG and VEAS-CAAG loaders stored the date column as-is; every later loader
    ## (and this class) keeps only the part before the first space to drop a time value.

    ## Folder the reports are written to when PerformValueCountChecks is not given one.
    defaultReportDir = r"C:\Test_Dir\Amazon"


    def __init__(self, cleanedDatExportFileName, fileEncoding = 'UTF8',
                 metadataColumns = None, formattedColumns = None,
                 dateColumn = None, legalSourceColumn = None):
        """Initializes the data structures.

        cleanedDatExportFileName should be the full path to the file.  Assumes the first row
        of the data file is the header and first column is DocID.

        fileEncoding is handed to open().  The four optional column arguments override the
        class level default positions (zero-based); the two column tuples must hold five
        positions each, ordered from, to, cc, bcc, author.

        Populates:
            metadataValuesDict    DocID -> RecordValues of the raw metadata fields
            formattedValuesDict   DocID -> RecordValues of the formatted fields
            additionalValuesDict  DocID -> AdditionalValues (date without time, legal source list)

        Raises IndexError when the file is empty or a data row has fewer columns than the
        layout needs, and ValueError when a column tuple does not hold five positions.
        A DocID that appears more than once keeps the values of its last row.
        """
        print("Initializing data structures...")
        self.metadataValuesDict = {}
        self.formattedValuesDict = {}
        self.additionalValuesDict = {}
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
        self.recordValuesFieldList = self._RecordValues._fields
        self.additionalValuesFieldList = self._AdditionalValues._fields

        metadataColumns = self.defaultMetadataColumns if metadataColumns is None else tuple(metadataColumns)
        formattedColumns = self.defaultFormattedColumns if formattedColumns is None else tuple(formattedColumns)
        dateColumn = self.defaultDateColumn if dateColumn is None else dateColumn
        legalSourceColumn = self.defaultLegalSourceColumn if legalSourceColumn is None else legalSourceColumn
        expectedFieldCount = len(self.recordValuesFieldList)
        if len(metadataColumns) != expectedFieldCount or len(formattedColumns) != expectedFieldCount:
            raise ValueError(f"metadataColumns and formattedColumns must each hold {expectedFieldCount} column positions.")

        ## The context manager closes the file even when reading fails part way.
        with open(cleanedDatExportFileName,encoding = fileEncoding) as inputFile:
            contents = inputFile.readlines()
        self.cleanedInputDataFileHeader = contents[0].replace("\n","")
        self.cleanedInputDataFileHeaderList = self.cleanedInputDataFileHeader.split("|")
        contents = contents[1:]
        print (f"There are {len(contents)} rows of data in this input file.\n\n")

        ## Echo the header names behind the configured positions so a wrong layout is spotted
        ## before the results are trusted.  The same positions drive the loading loop below.
        headerList = self.cleanedInputDataFileHeaderList
        print (f"The data structure will be made of following field pairs:")
        pairLines = [f"{headerList[metaCol]} | {headerList[formattedCol]}"
                     for metaCol, formattedCol in zip(metadataColumns, formattedColumns)]
        print("\n".join(pairLines) + "\n\n")
        print(f"{headerList[dateColumn]} will be used for the date qualifier.")
        print(f"{headerList[legalSourceColumn]} will be used as the Legal Source field.")

        ## Row numbers start at 2 because row 1 of the file is the header.
        for rowNumber, line in enumerate(contents, start = 2):
            rowValues = line.replace("\n","").split("|")
            if rowValues == [""]:
                ## A completely blank line carries no document.
                continue
            docID = rowValues[0]
            try:
                metadataValues = [self.__SplitAndClean(rowValues[col]) for col in metadataColumns]
                formattedValues = [self.__SplitAndClean(rowValues[col]) for col in formattedColumns]
                ## The family date field can carry a time value; keep the date part only.
                dateValue = rowValues[dateColumn].split(" ")[0]
                legalSourceValues = self.__SplitAndClean(rowValues[legalSourceColumn])
            except IndexError:
                raise IndexError(f"Row {rowNumber} (DocID {docID!r}) has {len(rowValues)} columns, which is fewer than the configured layout needs.") from None
            self.metadataValuesDict[docID] = self._RecordValues(*metadataValues)
            self.formattedValuesDict[docID] = self._RecordValues(*formattedValues)
            self.additionalValuesDict[docID] = self._AdditionalValues(dateValue, legalSourceValues)

        print("Data structures created.")



    def __SplitAndClean(self, rawVal, delim = ";"):
        """Pseudo-private method which will take a raw string and split this into a list, removing any leading or trailing whitespace.

        An empty (or None) rawVal gives an empty list.  Empty entries produced by doubled or
        trailing delimiters are kept, so they still count as values.
        """
        if not rawVal:
            return []
        return [piece.strip() for piece in rawVal.split(delim)]


    def __FieldDedupeByEmailAddress(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field by pulling out email addresses as the deduplication criteria. Returns deduplicated count.

        Addresses are compared case-insensitively.  A value without any address is always
        counted.  A value with addresses is counted once, and only when at least one of its
        addresses has not been seen in an earlier value.
        """
        ## This should ONLY be used for deduplication for counting and not for true deduplication because
        ## the value that survives is simply the first one seen and sometimes a later one has more information.
        seenAddresses = set()
        keptCount = 0
        for item in valuesList:
            itemAddresses = {found.upper() for found in re.findall(self.allPossibleEmailAddressesRegExPattern, item)}
            if not itemAddresses or not itemAddresses <= seenAddresses:
                keptCount += 1
            seenAddresses |= itemAddresses
        return keptCount


    def __FieldFullValueDedupe(self, valuesList):
        """Pseudo-private method which will attempt to deduplicate a list of values from a specific field using the FULL VALUE, ignoring case. Returns the deduplicated count. This was created because there appear to be duplicate values in the formatted fields."""
        ## These should all be uppercase already but there shouldnt have been dups either, so normalize the case first.
        return len({item.upper() for item in valuesList})


    def PerformValueCountChecks(self, countsOnly = True, outputDir = None):
        """Performs the initial value count checks between the metadata values and formatted values, looking for red flags and warnings.

        For every document and every field the number of metadata values is compared with
        the number of formatted values:
          * metadata empty, formatted not: red flag "<field>-No_Metadata_Entries-B"; for
            docAuthor it is only a red flag ("...-A") when the metadata from field is empty too.
          * formatted empty, metadata not: red flag "<field>-No_Formatted_Entries".
          * both populated: the metadata values are deduplicated by email address and the
            counts compared again.  With more than 30 deduplicated values a remaining
            difference above 10 percent is a red flag, with 30 or fewer a difference above 2
            is a red flag; any smaller difference is a warning.
        Independently, a formatted field holding the same value twice (ignoring case) is recorded.

        By default only the totals are printed.  Set countsOnly to False to also write
        warnings.txt, redFlags.txt and dupesInFormattedFields.txt (one "DocID | (fields)"
        line per document) into outputDir, which defaults to defaultReportDir and is created
        when missing.
        """
        largeFieldSize = 30      ## above this many values the percentage rule applies
        redFlagPercent = 10      ## allowed difference, in percent, for large fields
        redFlagDistance = 2      ## allowed difference, in values, for the other fields

        ## Each matrix maps a DocID to the list of field labels that were raised for it.
        redFlagDocMatrix = {}
        warningDocMatrix = {}
        duplicatesInFormattedMatrix = {}

        for docID, metadataRecord in self.metadataValuesDict.items():
            formattedRecord = self.formattedValuesDict[docID]
            for fieldName in self.recordValuesFieldList:
                metadataFieldValues = getattr(metadataRecord, fieldName)
                formattedFieldValues = getattr(formattedRecord, fieldName)
                metadataCount = len(metadataFieldValues)
                formattedCount = len(formattedFieldValues)

                if metadataCount != formattedCount:
                    if metadataCount == 0:
                        if fieldName != 'docAuthor':
                            redFlagDocMatrix.setdefault(docID, []).append(fieldName+"-No_Metadata_Entries-B")
                        elif not metadataRecord.fromValues:
                            ## Have to account for instances where the meta docAuthor is blank because it's an
                            ## email and the formatted just has the from value in it; only flag when from is blank too.
                            redFlagDocMatrix.setdefault(docID, []).append(fieldName+"-No_Metadata_Entries-A")
                    elif formattedCount == 0:
                        redFlagDocMatrix.setdefault(docID, []).append(fieldName+"-No_Formatted_Entries")
                    else:
                        ## try the count again by deduplicating the metadata field values. Never on the formatted field values.
                        deduplicatedFieldCount = self.__FieldDedupeByEmailAddress(metadataFieldValues)
                        distanceBetween = abs(deduplicatedFieldCount - formattedCount)
                        if distanceBetween != 0:
                            if deduplicatedFieldCount > largeFieldSize:
                                isRedFlag = distanceBetween > (redFlagPercent * deduplicatedFieldCount)/100
                            else:
                                isRedFlag = distanceBetween > redFlagDistance
                            targetMatrix = redFlagDocMatrix if isRedFlag else warningDocMatrix
                            targetMatrix.setdefault(docID, []).append(fieldName)

                ## Perform a separate check for duplicates in the formatted field.
                if formattedCount != self.__FieldFullValueDedupe(formattedFieldValues):
                    duplicatesInFormattedMatrix.setdefault(docID, []).append(fieldName)

        print(f"There are a total of {len(redFlagDocMatrix)} red flag documents and {len(warningDocMatrix)} warnings where the matching field value counts that do not match.")
        if countsOnly == False:  # noqa: E712 - equality kept on purpose so that None still means "counts only"
            reportDir = self.defaultReportDir if outputDir is None else outputDir
            os.makedirs(reportDir, exist_ok = True)
            reports = (("warnings.txt", warningDocMatrix),
                       ("redFlags.txt", redFlagDocMatrix),
                       ("dupesInFormattedFields.txt", duplicatesInFormattedMatrix))
            for reportName, matrix in reports:
                ## One file at a time, each closed by its context manager even if a write fails.
                with open(os.path.join(reportDir, reportName),'w') as reportFile:
                    for reportedDocID, fieldNames in matrix.items():
                        reportFile.write(f"{reportedDocID} | {tuple(fieldNames)}\n")
347 nino.borges 828
348    
if __name__ == '__main__':
    ## Export that is currently being reviewed.  The column positions QcPrivLog reads must
    ## match the layout of whichever export is selected here.
    cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20250303-FTC-Retail\export_20250304_021504_Converted.txt"
    ## Exports from earlier runs; uncomment one (and comment out the line above) to re-run it.
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20250129\VEAS\VEAS_20250130_020124_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241215\PrivLogExports\PrivLogExport_20241211_CAAG_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Testing_3\PrivLogExport_20241204_VEAS_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\Data Exports\CAAG\CAAG_Log_Data_Export_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241202 - FTC-CID\PLOG All IDs (20241202)\PLOG All IDs (20241202)_Converted_SubSetOnly.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\_PrivLogQCProcess\20241122 - VEAS CAAG 20241206\export_20241122_160117_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\PrivLogExport_20241113_CAAG_Converted.txt"
    #cleanedDatExportFileName = r"C:\Users\eborges\AppData\Local\Programs\Python\Python312\MyCode\JN\_Temp2\20241115_PrivLogWorking_CAAG\TEST.txt"

    ## Code Testing
    ## Loading the export prints the header names behind the selected columns.
    qcP = QcPrivLog(cleanedDatExportFileName)
    ## Spot checks of a single document; uncomment to compare its raw and formatted values.
    ## print(qcP.metadataValuesDict['H55278-0268-003517'].fromValues)
    ## print(qcP.metadataValuesDict['H55278-0268-003517'].toValues)
    ## print(qcP.metadataValuesDict['H55278-0268-003517'].ccValues)
    ## print(qcP.metadataValuesDict['H55278-0268-003517'].bccValues)
    ## print(qcP.formattedValuesDict['H55278-0268-003517'].fromValues)
    ## print(qcP.formattedValuesDict['H55278-0268-003517'].toValues)
    ## print(qcP.formattedValuesDict['H55278-0268-003517'].ccValues)
    ## print(qcP.formattedValuesDict['H55278-0268-003517'].bccValues)

    ## countsOnly = False prints the totals and also writes the report files.
    qcP.PerformValueCountChecks(countsOnly = False)
372    
373