ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
(Generate patch)

Comparing Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py (file contents):
Revision 812 by nino.borges, Mon Jan 8 21:44:57 2024 UTC vs.
Revision 813 by nino.borges, Wed Jan 10 19:51:05 2024 UTC

# Line 14 | Line 14 | To-Do: Method to QC for any parsing erro
14   import csv, os, re
15  
16   class TopSendersAnalyzer(object):
17 <    version = "0.09"
17 >    version = "0.10"
18  
19      def __init__(self):
20          self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
# Line 45 | Line 45 | class TopSendersAnalyzer(object):
45          ##  Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
46          self.senderEmailAddressesAcrossCSVsSet = set()
47          self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
48 <        
48 >
49 >
50 >        ##  Top 10 Subject Lines Report across all CSVs, keeping track of the subject line, count and file from where it came
51 >        self.subjectLineTopTenReportSet = set()
52 >        self.subjectLineTopTenReportFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt"
53 >
54          #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
55          self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
56  
# Line 61 | Line 66 | class TopSendersAnalyzer(object):
66          #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
67          self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
68  
69 <    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False):
69 >    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False):
70          """Main Method in this program"""
71          print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
72          for (root,dirs,files) in os.walk(self.startDir):
# Line 82 | Line 87 | class TopSendersAnalyzer(object):
87                  toFieldAddressesInCSVCount = 0
88  
89  
90 <               ##  This set holds the list of unique subject lines, so that I can get a count below.
90 >                ##  This set holds the list of unique subject lines, so that I can get a count below.
91                  uniqueSubjectLineSet = set()
92 +                ##  This is duplicative to the above set but I'm creating a matrix here that tracks the number of times a subject appears to gather a top 10
93 +                subjectLineCountMatrix = {}
94  
95                  with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
96                      csv_reader = csv.DictReader(csv_file)    
# Line 134 | Line 141 | class TopSendersAnalyzer(object):
141                          ##  Gather the subject line, so that I can later get a deduplicated count.
142                          subjectLineValue = row['Subject']
143                          uniqueSubjectLineSet.add(subjectLineValue.upper())
144 +                        ##  Also populate a subject line matrix, tracking the number of times it appears, to gather a top 10
145 +                        try:
146 +                            subjectLineCountMatrix[subjectLineValue.upper()] += 1
147 +                        except:
148 +                            subjectLineCountMatrix[subjectLineValue.upper()] = 1
149  
150  
151                  ##  This is the count of unique subject lines that exist within the CSV.
152 <                uniqueSubjectLineCount = len(uniqueSubjectLineSet)        
141 <                scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
152 >                uniqueSubjectLineCount = len(uniqueSubjectLineSet)
153                  
154 +                scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
155 +
156 +                ##  Gather the top 10 subject lines
157 +                sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True))
158 +                sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys())
159 +                if len(sortedSubjectKeyList) < 10:
160 +                    sNumbMax = len(sortedSubjectKeyList)
161 +                else:
162 +                    sNumbMax = 10
163 +                for sNumb in range(sNumbMax):
164 +                    #print(f"{sortedSubjectKeyList[s]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}")
165 +                    self.subjectLineTopTenReportSet.add(f"{fl}|{sortedSubjectKeyList[sNumb]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}")
166 +
167                  print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
168                  csv_file.close()
169                  ##  Update the global all email addresses set, if they selected this option
# Line 163 | Line 187 | class TopSendersAnalyzer(object):
187              print("Writing the deduplicated special senders data to log file....")
188              self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
189              print("Done.\n")
190 +        if writeTopTenSubjectLinesReportLogFile:
191 +            print("Writing the Top 10 Subject Lines Report to log file....")
192 +            self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName)
193 +            print("Done.\n")
194          
195  
196      def LanIDTrueTest(self, listOfIds):
# Line 211 | Line 239 | class TopSendersAnalyzer(object):
239   if __name__ == '__main__':
240  
241      tsa = TopSendersAnalyzer()
242 <    tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True)
242 >    tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True)
243  
244  
245      #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)