[ViewVC] Diff of: ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py

Comparing Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py (file contents):
Revision 812 by nino.borges, Mon Jan 8 21:44:57 2024 UTC vs.
Revision 813 by nino.borges, Wed Jan 10 19:51:05 2024 UTC

+import csv, os, re
+class TopSendersAnalyzer(object):
-<
+    version = "0.09"
->
+    version = "0.10"
+    def __init__(self):
+        self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
+        ##  Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
+        self.senderEmailAddressesAcrossCSVsSet = set()
+        self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
-<
->
->
->
+        ##  Top 10 Subject Lines Report across all CSVs, keeping track of the subject line, count and file from where it came
->
+        self.subjectLineTopTenReportSet = set()
->
+        self.subjectLineTopTenReportFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt"
->
+        #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
+        self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
+        #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
+        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
-<
+    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False):
->
+    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False):
+        """Main Method in this program"""
+        print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
+        for (root,dirs,files) in os.walk(self.startDir):
+                toFieldAddressesInCSVCount = 0
-<
+               ##  This set holds the list of unique subject lines, so that I can get a count below.
->
+                ##  This set holds the list of unique subject lines, so that I can get a count below.
+                uniqueSubjectLineSet = set()
-+
+                ##  This duplicates the set above, but here I'm building a matrix that tracks the number of times each subject appears, in order to gather a top 10
-+
+                subjectLineCountMatrix = {}
+                with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
+                    csv_reader = csv.DictReader(csv_file)
+                        ##  Gather the subject line, so that I can later get a deduplicated count.
+                        subjectLineValue = row['Subject']
+                        uniqueSubjectLineSet.add(subjectLineValue.upper())
-+
+                        ##  Also populate a subject line matrix, tracking the number of times it appears, to gather a top 10
-+
+                        try:
-+
+                            subjectLineCountMatrix[subjectLineValue.upper()] += 1
-+
+                        except:
-+
+                            subjectLineCountMatrix[subjectLineValue.upper()] = 1
+                ##  This is the count of unique subject lines that exist within the CSV.
-<
+                uniqueSubjectLineCount = len(uniqueSubjectLineSet)
-<
+                scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
->
+                uniqueSubjectLineCount = len(uniqueSubjectLineSet)
-+
+                scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
-+
-+
+                ##  Gather the top 10 subject lines
-+
+                sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True))
-+
+                sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys())
-+
+                if len(sortedSubjectKeyList) < 10:
-+
+                    sNumbMax = len(sortedSubjectKeyList)
-+
+                else:
-+
+                    sNumbMax = 10
-+
+                for sNumb in range(sNumbMax):
-+
+                    #print(f"{sortedSubjectKeyList[s]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}")
-+
+                    self.subjectLineTopTenReportSet.add(f"{fl}|{sortedSubjectKeyList[sNumb]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}")
-+
+                print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
+                csv_file.close()
+                ##  Update the global all email addresses set, if they selected this option
+            print("Writing the deduplicated special senders data to log file....")
+            self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
+            print("Done.\n")
-+
+        if writeTopTenSubjectLinesReportLogFile:
-+
+            print("Writing the Top 10 Subject Lines Report to log file....")
-+
+            self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName)
-+
+            print("Done.\n")
+    def LanIDTrueTest(self, listOfIds):
+if __name__ == '__main__':
+    tsa = TopSendersAnalyzer()
-<
+    tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True)
->
+    tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True)
+    #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")

Diff Legend

-–  Removed lines
-+  Added lines
-<  Changed lines (old)
->  Changed lines (new)

Comparing Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py (file contents): Revision 812 by nino.borges, Mon Jan 8 21:44:57 2024 UTC vs. Revision 813 by nino.borges, Wed Jan 10 19:51:05 2024 UTC

Diff Legend

Comparing Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py (file contents):
Revision 812 by nino.borges, Mon Jan 8 21:44:57 2024 UTC vs.
Revision 813 by nino.borges, Wed Jan 10 19:51:05 2024 UTC