ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
Revision: 813
Committed: Wed Jan 10 19:51:05 2024 UTC (2 years, 2 months ago) by nino.borges
Content type: text/x-python
File size: 15774 byte(s)
Log Message:
This version was the delivered Analysis_v13 spreadsheet that added the top 10 subject line entries.

File Contents

# Content
1 """
2
3 NTRS-TopSenderAnalysis
4
5 Created by:
6 Emanuel Borges
7 09.20.2023
8
9 Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10 To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11
12 """
13
14 import csv, os, re
15
16 class TopSendersAnalyzer(object):
17 version = "0.10"
18
19 def __init__(self):
20 self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
21 #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
22 #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
23
24
25 ## Matrix containing the scenarios to scenario descriptions
26 self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}
27
28 ## All possible email addresses across all CSV files
29 self.allPossibleEmailAddressesSet = set()
30 self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
31
32 ## All email addresses with an @NTRS.COM domain. Currently unsupported.
33 #self.allToNtrsAddressesSet = set()
34 #self.allToNtrsAddressesOutputFileName = r""
35
36 ## All true NTRS LAN ID matches, per specification provided to me.
37 self.trueLanIdAddressesSet = set()
38 self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
39
40 ## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
41 self.falsePositiveLanIdAddressesSet = set()
42 self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
43
44 ## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName|valuesSet.
45 ## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
46 self.senderEmailAddressesAcrossCSVsSet = set()
47 self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
48
49
50 ## Top 10 Subject Lines Report across all CSVs, keepng track of the subject line, count and file from where it cam
51 self.subjectLineTopTenReportSet = set()
52 self.subjectLineTopTenReportFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt"
53
54 #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
55 self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
56
57
58 ## Simple match to pull out the date as recorded in the path
59 self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
60
61
62 ## Match for pulling out all email addresses, regardless of domain.
63 #self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
64 #self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
65 #self.allPossibleEmailAddressesRegExPattern = r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
66 #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
67 self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
68
69 def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False):
70 """Main Method in this program"""
71 print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
72 for (root,dirs,files) in os.walk(self.startDir):
73 for fl in files:
74 fileRowCount = 0
75 noAddressesInToFieldCount = 0
76 eightyCharRowCount = 0
77 nonEightyCharRowCount = 0
78 charsOverEighty = False
79 noNtrsDomainBucketCount = 0
80 ntrsAndLanBucketCount = 0
81 ntrsNoLanBucketCount = 0
82 dateInPath = re.findall(self.dateInPathRegExPattern, root)
83 allEmailAddressesInCSVSet = set()
84
85
86 ## This is the full count of receipient addresses found in the CSV, not unique addresses
87 toFieldAddressesInCSVCount = 0
88
89
90 ## This set holds the list of unique subject lines, so that I can get a count below.
91 uniqueSubjectLineSet = set()
92 ## This is duplicative to the above set but I'm creating a matrix here that tracks the number of times a subject appears to gather a top 10
93 subjectLineCountMatrix = {}
94
95 with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
96 csv_reader = csv.DictReader(csv_file)
97 for row in csv_reader:
98 fileRowCount += 1
99 if len(row['To']) == 80:
100 eightyCharRowCount +=1
101 else:
102 nonEightyCharRowCount +=1
103 if len(row['To']) > 80:
104 charsOverEighty = True
105 toValue = row['To']
106 toValue = toValue.upper()
107
108 ## Match and gather all possible email addresses, adding it to the per CSV set.
109 allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
110 for eAddress in allEmailAddresses:
111 allEmailAddressesInCSVSet.add(eAddress)
112
113 ## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
114 if len(allEmailAddresses) == 0:
115 noAddressesInToFieldCount +=1
116 else:
117 toFieldAddressesInCSVCount += len(allEmailAddresses)
118 ## Perform the main logic tests
119 if "NTRS.COM" in toValue:
120 ## The domain was found. Apply next test.
121 ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
122 ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
123 if ntrsLanIDTrueTestResult:
124 ## At least 1 LAN ID was found, using the True Test
125 ntrsAndLanBucketCount +=1
126 #for a in ntrsLanAddressesList:
127 # allToLanAddressesSet.add(a)
128 else:
129 ## Not 1 true LAN ID was found, using the True Test
130 ntrsNoLanBucketCount +=1
131 ## Also since no LAN ID was found in the TO field, add to the unique senders list.
132 senderValue = row['Sender']
133 self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
134 else:
135 ## No ntrs addresses found at all,
136 noNtrsDomainBucketCount +=1
137 ## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
138 senderValue = row['Sender']
139 self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
140
141 ## Gather the subject line, so that I can later get a deduplicated count.
142 subjectLineValue = row['Subject']
143 uniqueSubjectLineSet.add(subjectLineValue.upper())
144 ## Also populate a subject line matrix, tracking the number of times it appears, to gather a top 10
145 try:
146 subjectLineCountMatrix[subjectLineValue.upper()] += 1
147 except:
148 subjectLineCountMatrix[subjectLineValue.upper()] = 1
149
150
151 ## This is the count of unique subject lines that exist within the CSV.
152 uniqueSubjectLineCount = len(uniqueSubjectLineSet)
153
154 scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
155
156 ## Gather the top 10 subject lines
157 sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True))
158 sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys())
159 if len(sortedSubjectKeyList) < 10:
160 sNumbMax = len(sortedSubjectKeyList)
161 else:
162 sNumbMax = 10
163 for sNumb in range(sNumbMax):
164 #print(f"{sortedSubjectKeyList[s]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}")
165 self.subjectLineTopTenReportSet.add(f"{fl}|{sortedSubjectKeyList[sNumb]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}")
166
167 print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
168 csv_file.close()
169 ## Update the global all email addresses set, if they selected this option
170 if writeAllPossibleEmailAddressesLogFile:
171 self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
172
173
174 if writeTrueLanIDLogFile:
175 print("Writing the True LAN ID log file...")
176 self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
177 print("Done.\n")
178 if writeFalsePositiveLanIDLogFile:
179 print("Writing the False-Positive LAN ID log file...")
180 self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
181 print("Done.\n")
182 if writeAllPossibleEmailAddressesLogFile:
183 print("Writing the All Possible Email Addresses Across All CSV Files log file...")
184 self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
185 print("Done.\n")
186 if writeSpecialSendersListLogFile:
187 print("Writing the deduplicated special senders data to log file....")
188 self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
189 print("Done.\n")
190 if writeTopTenSubjectLinesReportLogFile:
191 print("Writing the Top 10 Subject Lines Report to log file....")
192 self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName)
193 print("Done.\n")
194
195
196 def LanIDTrueTest(self, listOfIds):
197 """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
198 lanIDTestResult = False
199 for lanID in listOfIds:
200 alphaOnly = [x.lower() for x in lanID if x.isalpha()]
201 if len(alphaOnly) > 10:
202 ## I'm too big to be a true LAN ID
203 self.falsePositiveLanIdAddressesSet.add(lanID)
204 else:
205 self.trueLanIdAddressesSet.add(lanID)
206 lanIDTestResult = True
207 return lanIDTestResult
208
209
210 def WriteLogFile(self, setOfValues, outputFilePath):
211 """Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
212 fileNameInc = 0
213 while os.path.isfile(outputFilePath):
214 fileNameInc +=1
215 outputFile, extension = os.path.splitext(outputFilePath)
216 outputFilePath = outputFile + str(fileNameInc) + extension
217 outFl = open(outputFilePath,'w')
218 tempList = list(setOfValues)
219 tempList.sort()
220 for i in tempList:
221 try:
222 outFl.write(f"{i}\n")
223 except:
224 print(i)
225 outFl.close()
226
227 def CalculateScenario(self,rawNumber, averageNumbRecip):
228 """This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
229 scenario = "Uncategorized"
230 if rawNumber > .76:
231 scenario = "C"
232 elif averageNumbRecip < 15 and rawNumber < .019:
233 scenario = "A or B"
234 elif rawNumber > .009 and rawNumber < .34:
235 scenario = "D"
236
237 return scenario
238
239 if __name__ == '__main__':
240
241 tsa = TopSendersAnalyzer()
242 tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True)
243
244
245 #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
246 #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
247 #allToLanAddressesList = list(allToLanAddressesSet)
248 #allToLanAddressesList.sort()
249 #for i in allToLanAddressesList:
250 # outputFile.write(f"{i}\n")
251 #outputFile.close()
252
253 ## Initially gathering some very basic information across the CSV files, not using csv lib
254 # for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
255 # for fl in files:
256 # contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
257 # print(f"{fl}|{len(contents)-1}")