| 1 |
nino.borges |
797 |
"""
|
| 2 |
|
|
|
| 3 |
|
|
NTRS-TopSenderAnalysis
|
| 4 |
|
|
|
| 5 |
|
|
Created by:
|
| 6 |
|
|
Emanuel Borges
|
| 7 |
|
|
09.20.2023
|
| 8 |
|
|
|
| 9 |
|
|
Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
|
| 10 |
nino.borges |
798 |
To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
|
| 11 |
nino.borges |
797 |
|
| 12 |
|
|
"""
|
| 13 |
|
|
|
| 14 |
nino.borges |
798 |
import csv, os, re
|
| 15 |
nino.borges |
797 |
|
| 16 |
nino.borges |
801 |
class TopSendersAnalyzer(object):
|
| 17 |
nino.borges |
814 |
version = "0.11"
|
| 18 |
nino.borges |
801 |
|
| 19 |
|
|
def __init__(self):
|
| 20 |
nino.borges |
809 |
self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
|
| 21 |
|
|
#self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
|
| 22 |
nino.borges |
801 |
#self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
|
| 23 |
|
|
|
| 24 |
nino.borges |
803 |
|
| 25 |
nino.borges |
809 |
## Matrix containing the scenarios to scenario descriptions
|
| 26 |
|
|
self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}
|
| 27 |
|
|
|
| 28 |
nino.borges |
803 |
## All possible email addresses across all CSV files
|
| 29 |
|
|
self.allPossibleEmailAddressesSet = set()
|
| 30 |
|
|
self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
|
| 31 |
|
|
|
| 32 |
nino.borges |
801 |
## All email addresses with an @NTRS.COM domain. Currently unsupported.
|
| 33 |
|
|
#self.allToNtrsAddressesSet = set()
|
| 34 |
|
|
#self.allToNtrsAddressesOutputFileName = r""
|
| 35 |
|
|
|
| 36 |
|
|
## All true NTRS LAN ID matches, per specification provided to me.
|
| 37 |
|
|
self.trueLanIdAddressesSet = set()
|
| 38 |
|
|
self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
|
| 39 |
|
|
|
| 40 |
|
|
## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
|
| 41 |
|
|
self.falsePositiveLanIdAddressesSet = set()
|
| 42 |
|
|
self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
|
| 43 |
nino.borges |
810 |
|
| 44 |
nino.borges |
811 |
## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName|valuesSet.
|
| 45 |
|
|
## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
|
| 46 |
|
|
self.senderEmailAddressesAcrossCSVsSet = set()
|
| 47 |
nino.borges |
810 |
self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
|
| 48 |
nino.borges |
813 |
|
| 49 |
|
|
|
| 50 |
|
|
## Top 10 Subject Lines Report across all CSVs, keepng track of the subject line, count and file from where it cam
|
| 51 |
|
|
self.subjectLineTopTenReportSet = set()
|
| 52 |
|
|
self.subjectLineTopTenReportFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt"
|
| 53 |
|
|
|
| 54 |
nino.borges |
803 |
#self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
|
| 55 |
|
|
self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
|
| 56 |
nino.borges |
800 |
|
| 57 |
nino.borges |
803 |
|
| 58 |
|
|
## Simple match to pull out the date as recorded in the path
|
| 59 |
|
|
self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
|
| 60 |
|
|
|
| 61 |
|
|
|
| 62 |
|
|
## Match for pulling out all email addresses, regardless of domain.
|
| 63 |
|
|
#self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
|
| 64 |
|
|
#self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
|
| 65 |
|
|
#self.allPossibleEmailAddressesRegExPattern = r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
|
| 66 |
|
|
#self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
|
| 67 |
|
|
self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
|
| 68 |
|
|
|
| 69 |
nino.borges |
813 |
def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False):
|
| 70 |
nino.borges |
801 |
"""Main Method in this program"""
|
| 71 |
nino.borges |
814 |
print("FileName|Folder Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
|
| 72 |
nino.borges |
801 |
for (root,dirs,files) in os.walk(self.startDir):
|
| 73 |
|
|
for fl in files:
|
| 74 |
|
|
fileRowCount = 0
|
| 75 |
nino.borges |
809 |
noAddressesInToFieldCount = 0
|
| 76 |
nino.borges |
801 |
eightyCharRowCount = 0
|
| 77 |
|
|
nonEightyCharRowCount = 0
|
| 78 |
|
|
charsOverEighty = False
|
| 79 |
|
|
noNtrsDomainBucketCount = 0
|
| 80 |
|
|
ntrsAndLanBucketCount = 0
|
| 81 |
|
|
ntrsNoLanBucketCount = 0
|
| 82 |
nino.borges |
803 |
dateInPath = re.findall(self.dateInPathRegExPattern, root)
|
| 83 |
|
|
allEmailAddressesInCSVSet = set()
|
| 84 |
nino.borges |
801 |
|
| 85 |
nino.borges |
810 |
|
| 86 |
nino.borges |
809 |
## This is the full count of receipient addresses found in the CSV, not unique addresses
|
| 87 |
|
|
toFieldAddressesInCSVCount = 0
|
| 88 |
|
|
|
| 89 |
nino.borges |
812 |
|
| 90 |
nino.borges |
813 |
## This set holds the list of unique subject lines, so that I can get a count below.
|
| 91 |
nino.borges |
812 |
uniqueSubjectLineSet = set()
|
| 92 |
nino.borges |
813 |
## This is duplicative to the above set but I'm creating a matrix here that tracks the number of times a subject appears to gather a top 10
|
| 93 |
|
|
subjectLineCountMatrix = {}
|
| 94 |
nino.borges |
812 |
|
| 95 |
nino.borges |
801 |
with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
|
| 96 |
|
|
csv_reader = csv.DictReader(csv_file)
|
| 97 |
|
|
for row in csv_reader:
|
| 98 |
|
|
fileRowCount += 1
|
| 99 |
|
|
if len(row['To']) == 80:
|
| 100 |
|
|
eightyCharRowCount +=1
|
| 101 |
|
|
else:
|
| 102 |
|
|
nonEightyCharRowCount +=1
|
| 103 |
|
|
if len(row['To']) > 80:
|
| 104 |
|
|
charsOverEighty = True
|
| 105 |
|
|
toValue = row['To']
|
| 106 |
|
|
toValue = toValue.upper()
|
| 107 |
nino.borges |
803 |
|
| 108 |
|
|
## Match and gather all possible email addresses, adding it to the per CSV set.
|
| 109 |
nino.borges |
809 |
allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
|
| 110 |
nino.borges |
803 |
for eAddress in allEmailAddresses:
|
| 111 |
|
|
allEmailAddressesInCSVSet.add(eAddress)
|
| 112 |
nino.borges |
809 |
|
| 113 |
|
|
## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
|
| 114 |
|
|
if len(allEmailAddresses) == 0:
|
| 115 |
|
|
noAddressesInToFieldCount +=1
|
| 116 |
|
|
else:
|
| 117 |
|
|
toFieldAddressesInCSVCount += len(allEmailAddresses)
|
| 118 |
nino.borges |
803 |
## Perform the main logic tests
|
| 119 |
nino.borges |
804 |
if "NTRS.COM" in toValue:
|
| 120 |
nino.borges |
801 |
## The domain was found. Apply next test.
|
| 121 |
nino.borges |
803 |
ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
|
| 122 |
nino.borges |
801 |
ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
|
| 123 |
|
|
if ntrsLanIDTrueTestResult:
|
| 124 |
|
|
## At least 1 LAN ID was found, using the True Test
|
| 125 |
|
|
ntrsAndLanBucketCount +=1
|
| 126 |
|
|
#for a in ntrsLanAddressesList:
|
| 127 |
|
|
# allToLanAddressesSet.add(a)
|
| 128 |
|
|
else:
|
| 129 |
|
|
## Not 1 true LAN ID was found, using the True Test
|
| 130 |
|
|
ntrsNoLanBucketCount +=1
|
| 131 |
nino.borges |
810 |
## Also since no LAN ID was found in the TO field, add to the unique senders list.
|
| 132 |
nino.borges |
812 |
senderValue = row['Sender']
|
| 133 |
nino.borges |
811 |
self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
|
| 134 |
nino.borges |
801 |
else:
|
| 135 |
|
|
## No ntrs addresses found at all,
|
| 136 |
|
|
noNtrsDomainBucketCount +=1
|
| 137 |
nino.borges |
810 |
## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
|
| 138 |
nino.borges |
812 |
senderValue = row['Sender']
|
| 139 |
nino.borges |
811 |
self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
|
| 140 |
|
|
|
| 141 |
nino.borges |
812 |
## Gather the subject line, so that I can later get a deduplicated count.
|
| 142 |
|
|
subjectLineValue = row['Subject']
|
| 143 |
|
|
uniqueSubjectLineSet.add(subjectLineValue.upper())
|
| 144 |
nino.borges |
813 |
## Also populate a subject line matrix, tracking the number of times it appears, to gather a top 10
|
| 145 |
|
|
try:
|
| 146 |
|
|
subjectLineCountMatrix[subjectLineValue.upper()] += 1
|
| 147 |
|
|
except:
|
| 148 |
|
|
subjectLineCountMatrix[subjectLineValue.upper()] = 1
|
| 149 |
nino.borges |
812 |
|
| 150 |
|
|
|
| 151 |
|
|
## This is the count of unique subject lines that exist within the CSV.
|
| 152 |
nino.borges |
813 |
uniqueSubjectLineCount = len(uniqueSubjectLineSet)
|
| 153 |
|
|
|
| 154 |
nino.borges |
809 |
scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
|
| 155 |
nino.borges |
813 |
|
| 156 |
|
|
## Gather the top 10 subject lines
|
| 157 |
|
|
sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True))
|
| 158 |
|
|
sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys())
|
| 159 |
|
|
if len(sortedSubjectKeyList) < 10:
|
| 160 |
|
|
sNumbMax = len(sortedSubjectKeyList)
|
| 161 |
|
|
else:
|
| 162 |
|
|
sNumbMax = 10
|
| 163 |
|
|
for sNumb in range(sNumbMax):
|
| 164 |
|
|
#print(f"{sortedSubjectKeyList[s]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}")
|
| 165 |
nino.borges |
814 |
self.subjectLineTopTenReportSet.add(f"{fl}|{sortedSubjectKeyList[sNumb]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]/fileRowCount}")
|
| 166 |
nino.borges |
813 |
|
| 167 |
nino.borges |
812 |
print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
|
| 168 |
nino.borges |
801 |
csv_file.close()
|
| 169 |
nino.borges |
803 |
## Update the global all email addresses set, if they selected this option
|
| 170 |
|
|
if writeAllPossibleEmailAddressesLogFile:
|
| 171 |
|
|
self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
|
| 172 |
nino.borges |
812 |
|
| 173 |
|
|
|
| 174 |
nino.borges |
801 |
if writeTrueLanIDLogFile:
|
| 175 |
|
|
print("Writing the True LAN ID log file...")
|
| 176 |
|
|
self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
|
| 177 |
|
|
print("Done.\n")
|
| 178 |
|
|
if writeFalsePositiveLanIDLogFile:
|
| 179 |
|
|
print("Writing the False-Positive LAN ID log file...")
|
| 180 |
|
|
self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
|
| 181 |
|
|
print("Done.\n")
|
| 182 |
nino.borges |
803 |
if writeAllPossibleEmailAddressesLogFile:
|
| 183 |
|
|
print("Writing the All Possible Email Addresses Across All CSV Files log file...")
|
| 184 |
|
|
self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
|
| 185 |
|
|
print("Done.\n")
|
| 186 |
nino.borges |
810 |
if writeSpecialSendersListLogFile:
|
| 187 |
|
|
print("Writing the deduplicated special senders data to log file....")
|
| 188 |
nino.borges |
811 |
self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
|
| 189 |
nino.borges |
810 |
print("Done.\n")
|
| 190 |
nino.borges |
813 |
if writeTopTenSubjectLinesReportLogFile:
|
| 191 |
|
|
print("Writing the Top 10 Subject Lines Report to log file....")
|
| 192 |
nino.borges |
814 |
self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName, doNotSort = True)
|
| 193 |
nino.borges |
813 |
print("Done.\n")
|
| 194 |
nino.borges |
801 |
|
| 195 |
|
|
|
| 196 |
|
|
def LanIDTrueTest(self, listOfIds):
|
| 197 |
|
|
"""A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
|
| 198 |
|
|
lanIDTestResult = False
|
| 199 |
|
|
for lanID in listOfIds:
|
| 200 |
|
|
alphaOnly = [x.lower() for x in lanID if x.isalpha()]
|
| 201 |
|
|
if len(alphaOnly) > 10:
|
| 202 |
|
|
## I'm too big to be a true LAN ID
|
| 203 |
|
|
self.falsePositiveLanIdAddressesSet.add(lanID)
|
| 204 |
|
|
else:
|
| 205 |
|
|
self.trueLanIdAddressesSet.add(lanID)
|
| 206 |
|
|
lanIDTestResult = True
|
| 207 |
|
|
return lanIDTestResult
|
| 208 |
|
|
|
| 209 |
|
|
|
| 210 |
nino.borges |
814 |
def WriteLogFile(self, setOfValues, outputFilePath, doNotSort = False):
|
| 211 |
nino.borges |
801 |
"""Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
|
| 212 |
|
|
fileNameInc = 0
|
| 213 |
|
|
while os.path.isfile(outputFilePath):
|
| 214 |
|
|
fileNameInc +=1
|
| 215 |
|
|
outputFile, extension = os.path.splitext(outputFilePath)
|
| 216 |
|
|
outputFilePath = outputFile + str(fileNameInc) + extension
|
| 217 |
|
|
outFl = open(outputFilePath,'w')
|
| 218 |
|
|
tempList = list(setOfValues)
|
| 219 |
nino.borges |
814 |
if doNotSort:
|
| 220 |
|
|
pass
|
| 221 |
|
|
else:
|
| 222 |
|
|
tempList.sort()
|
| 223 |
nino.borges |
801 |
for i in tempList:
|
| 224 |
nino.borges |
811 |
try:
|
| 225 |
|
|
outFl.write(f"{i}\n")
|
| 226 |
|
|
except:
|
| 227 |
|
|
print(i)
|
| 228 |
nino.borges |
801 |
outFl.close()
|
| 229 |
|
|
|
| 230 |
nino.borges |
809 |
def CalculateScenario(self,rawNumber, averageNumbRecip):
|
| 231 |
|
|
"""This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
|
| 232 |
|
|
scenario = "Uncategorized"
|
| 233 |
|
|
if rawNumber > .76:
|
| 234 |
|
|
scenario = "C"
|
| 235 |
|
|
elif averageNumbRecip < 15 and rawNumber < .019:
|
| 236 |
|
|
scenario = "A or B"
|
| 237 |
|
|
elif rawNumber > .009 and rawNumber < .34:
|
| 238 |
|
|
scenario = "D"
|
| 239 |
|
|
|
| 240 |
|
|
return scenario
|
| 241 |
|
|
|
| 242 |
nino.borges |
797 |
if __name__ == '__main__':
|
| 243 |
|
|
|
| 244 |
nino.borges |
801 |
tsa = TopSendersAnalyzer()
|
| 245 |
nino.borges |
813 |
tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True)
|
| 246 |
nino.borges |
797 |
|
| 247 |
nino.borges |
801 |
|
| 248 |
nino.borges |
800 |
#print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
|
| 249 |
|
|
#outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
|
| 250 |
|
|
#allToLanAddressesList = list(allToLanAddressesSet)
|
| 251 |
|
|
#allToLanAddressesList.sort()
|
| 252 |
|
|
#for i in allToLanAddressesList:
|
| 253 |
|
|
# outputFile.write(f"{i}\n")
|
| 254 |
|
|
#outputFile.close()
|
| 255 |
nino.borges |
797 |
|
| 256 |
|
|
## Initially gathering some very basic information across the CSV files, not using csv lib
|
| 257 |
|
|
# for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
|
| 258 |
|
|
# for fl in files:
|
| 259 |
|
|
# contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
|
| 260 |
|
|
# print(f"{fl}|{len(contents)-1}") |