| 14 |
|
import csv, os, re |
| 15 |
|
|
| 16 |
|
class TopSendersAnalyzer(object): |
| 17 |
< |
version = "0.09" |
| 17 |
> |
version = "0.10" |
| 18 |
|
|
| 19 |
|
def __init__(self): |
| 20 |
|
self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req" |
| 45 |
|
## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address. |
| 46 |
|
self.senderEmailAddressesAcrossCSVsSet = set() |
| 47 |
|
self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt" |
| 48 |
< |
|
| 48 |
> |
|
| 49 |
> |
|
| 50 |
> |
## Top 10 Subject Lines Report across all CSVs, keeping track of the subject line, its count, and the file it came from |
| 51 |
> |
self.subjectLineTopTenReportSet = set() |
| 52 |
> |
self.subjectLineTopTenReportFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt" |
| 53 |
> |
|
| 54 |
|
#self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM' |
| 55 |
|
self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM' |
| 56 |
|
|
| 66 |
|
#self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+" |
| 67 |
|
self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+" |
| 68 |
|
|
| 69 |
< |
def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False): |
| 69 |
> |
def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False): |
| 70 |
|
"""Main Method in this program""" |
| 71 |
|
print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %") |
| 72 |
|
for (root,dirs,files) in os.walk(self.startDir): |
| 87 |
|
toFieldAddressesInCSVCount = 0 |
| 88 |
|
|
| 89 |
|
|
| 90 |
< |
## This set holds the list of unique subject lines, so that I can get a count below. |
| 90 |
> |
## This set holds the list of unique subject lines, so that I can get a count below. |
| 91 |
|
uniqueSubjectLineSet = set() |
| 92 |
+ |
## This duplicates the set above, but I'm creating a matrix here that tracks the number of times each subject appears, in order to gather a top 10 |
| 93 |
+ |
subjectLineCountMatrix = {} |
| 94 |
|
|
| 95 |
|
with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file: |
| 96 |
|
csv_reader = csv.DictReader(csv_file) |
| 141 |
|
## Gather the subject line, so that I can later get a deduplicated count. |
| 142 |
|
subjectLineValue = row['Subject'] |
| 143 |
|
uniqueSubjectLineSet.add(subjectLineValue.upper()) |
| 144 |
+ |
## Also populate the subject line matrix, tracking the number of times each subject line appears, in order to gather a top 10 |
| 145 |
+ |
try: |
| 146 |
+ |
subjectLineCountMatrix[subjectLineValue.upper()] += 1 |
| 147 |
+ |
except: |
| 148 |
+ |
subjectLineCountMatrix[subjectLineValue.upper()] = 1 |
| 149 |
|
|
| 150 |
|
|
| 151 |
|
## This is the count of unique subject lines that exist within the CSV. |
| 152 |
< |
uniqueSubjectLineCount = len(uniqueSubjectLineSet) |
| 141 |
< |
scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount) |
| 152 |
> |
uniqueSubjectLineCount = len(uniqueSubjectLineSet) |
| 153 |
|
|
| 154 |
+ |
scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount) |
| 155 |
+ |
|
| 156 |
+ |
## Gather the top 10 subject lines |
| 157 |
+ |
sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True)) |
| 158 |
+ |
sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys()) |
| 159 |
+ |
if len(sortedSubjectKeyList) < 10: |
| 160 |
+ |
sNumbMax = len(sortedSubjectKeyList) |
| 161 |
+ |
else: |
| 162 |
+ |
sNumbMax = 10 |
| 163 |
+ |
for sNumb in range(sNumbMax): |
| 164 |
+ |
#print(f"{sortedSubjectKeyList[s]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}") |
| 165 |
+ |
self.subjectLineTopTenReportSet.add(f"{fl}|{sortedSubjectKeyList[sNumb]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}") |
| 166 |
+ |
|
| 167 |
|
print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}") |
| 168 |
|
csv_file.close() |
| 169 |
|
## Update the global all email addresses set, if they selected this option |
| 187 |
|
print("Writing the deduplicated special senders data to log file....") |
| 188 |
|
self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName) |
| 189 |
|
print("Done.\n") |
| 190 |
+ |
if writeTopTenSubjectLinesReportLogFile: |
| 191 |
+ |
print("Writing the Top 10 Subject Lines Report to log file....") |
| 192 |
+ |
self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName) |
| 193 |
+ |
print("Done.\n") |
| 194 |
|
|
| 195 |
|
|
| 196 |
|
def LanIDTrueTest(self, listOfIds): |
| 239 |
|
if __name__ == '__main__': |
| 240 |
|
|
| 241 |
|
tsa = TopSendersAnalyzer() |
| 242 |
< |
tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True) |
| 242 |
> |
tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True) |
| 243 |
|
|
| 244 |
|
|
| 245 |
|
#print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.") |