ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
(Generate patch)

Comparing Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py (file contents):
Revision 810 by nino.borges, Fri Jan 5 19:24:53 2024 UTC vs.
Revision 811 by nino.borges, Fri Jan 5 20:25:29 2024 UTC

# Line 14 | Line 14 | To-Do: Method to QC for any parsing erro
14   import csv, os, re
15  
16   class TopSendersAnalyzer(object):
17 <    version = "0.07"
17 >    version = "0.08"
18  
19      def __init__(self):
20          self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
# Line 41 | Line 41 | class TopSendersAnalyzer(object):
41          self.falsePositiveLanIdAddressesSet = set()
42          self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
43  
44 <        ##  Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Matrix of fileName:valuesSet.
45 <        self.senderEmailAddressesMatrix = {}
44 >        ##  Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName|valuesSet.
45 >        ##  Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
46 >        self.senderEmailAddressesAcrossCSVsSet = set()
47          self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
48          
49          #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
# Line 76 | Line 77 | class TopSendersAnalyzer(object):
77                  dateInPath = re.findall(self.dateInPathRegExPattern, root)
78                  allEmailAddressesInCSVSet = set()
79  
79                ##  This is the list of unique Sender Addresses,per CSV, ONLY IF THE EMAIL IS NOT TO A LANID.  Diana asked for this list.
80                ##  Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
81                senderEmailAddressesInCSVSet = set()
80  
81                  ##  This is the full count of recipient addresses found in the CSV, not unique addresses
82                  toFieldAddressesInCSVCount = 0
# Line 121 | Line 119 | class TopSendersAnalyzer(object):
119                                  ntrsNoLanBucketCount +=1
120                                  ## Also since no LAN ID was found in the TO field, add to the unique senders list.
121                                  senderValue = toValue = row['Sender']
122 <                                senderEmailAddressesInCSVSet.add(senderValue.upper())
122 >                                self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
123                          else:
124                              ## No ntrs addresses found at all,
125                              noNtrsDomainBucketCount +=1
126                              ## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
127                              senderValue = toValue = row['Sender']
128 <                            senderEmailAddressesInCSVSet.add(senderValue.upper())
128 >                            self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
129 >                            
130                  scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
131                  print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}")
132                  csv_file.close()
133                  ##  Update the global all email addresses set, if they selected this option
134                  if writeAllPossibleEmailAddressesLogFile:
135                      self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
137
138                ##  Update the global special senders matrix, if they selected this option.
139                if writeSpecialSendersListLogFile:
140                    self.senderEmailAddressesMatrix[fl] = senderEmailAddressesInCSVSet
136                      
137          if writeTrueLanIDLogFile:
138              print("Writing the True LAN ID log file...")
# Line 153 | Line 148 | class TopSendersAnalyzer(object):
148              print("Done.\n")
149          if writeSpecialSendersListLogFile:
150              print("Writing the deduplicated special senders data to log file....")
151 <            
157 <            
151 >            self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
152              print("Done.\n")
153          
154  
# Line 183 | Line 177 | class TopSendersAnalyzer(object):
177          tempList = list(setOfValues)
178          tempList.sort()
179          for i in tempList:
180 <            outFl.write(f"{i}\n")
180 >            try:
181 >                outFl.write(f"{i}\n")
182 >            except:
183 >                print(i)
184          outFl.close()
185  
186      def CalculateScenario(self,rawNumber, averageNumbRecip):
# Line 201 | Line 198 | class TopSendersAnalyzer(object):
198   if __name__ == '__main__':
199  
200      tsa = TopSendersAnalyzer()
201 <    tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True)
201 >    tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True)
202  
203  
204      #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)