ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
Revision: 827
Committed: Mon Jul 22 20:13:36 2024 UTC (20 months ago) by nino.borges
Content type: text/x-python
File size: 16153 byte(s)
Log Message:
.12 update because of onedrive.

File Contents

# Content
1 """
2
3 NTRS-TopSenderAnalysis
4
5 Created by:
6 Emanuel Borges
7 09.20.2023
8
9 Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10 To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11
12 """
13
14 import csv, os, re
15
16 class TopSendersAnalyzer(object):
17 version = "0.12"
18
19 def __init__(self):
20 self.startDir = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req\_LocalVersion"
21 #self.startDir = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
22 #self.startDir = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
23
24
25 ## Matrix containing the scenarios to scenario descriptions
26 self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}
27
28 ## All possible email addresses across all CSV files
29 self.allPossibleEmailAddressesSet = set()
30 self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
31
32 ## All email addresses with an @NTRS.COM domain. Currently unsupported.
33 #self.allToNtrsAddressesSet = set()
34 #self.allToNtrsAddressesOutputFileName = r""
35
36 ## All true NTRS LAN ID matches, per specification provided to me.
37 self.trueLanIdAddressesSet = set()
38 self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
39
40 ## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
41 self.falsePositiveLanIdAddressesSet = set()
42 self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
43
44 ## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName|valuesSet.
45 ## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
46 self.senderEmailAddressesAcrossCSVsSet = set()
47 self.senderEmailAddressesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\SendersListSpecial.txt"
48
49
50 ## Top 10 Subject Lines Report across all CSVs, keepng track of the subject line, count and file from where it cam
51 self.subjectLineTopTenReportSet = set()
52 self.subjectLineTopTenReportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt"
53
54 #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
55 self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
56
57
58 ## Simple match to pull out the date as recorded in the path
59 self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
60
61
62 ## Match for pulling out all email addresses, regardless of domain.
63 #self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
64 #self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
65 #self.allPossibleEmailAddressesRegExPattern = r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
66 #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
67 self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
68
69 def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False):
70 """Main Method in this program"""
71 print("FileName|Folder Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
72 for (root,dirs,files) in os.walk(self.startDir):
73 for fl in files:
74 fileRowCount = 0
75 noAddressesInToFieldCount = 0
76 eightyCharRowCount = 0
77 nonEightyCharRowCount = 0
78 charsOverEighty = False
79 noNtrsDomainBucketCount = 0
80 ntrsAndLanBucketCount = 0
81 ntrsNoLanBucketCount = 0
82 dateInPath = re.findall(self.dateInPathRegExPattern, root)
83 allEmailAddressesInCSVSet = set()
84
85
86 ## This is the full count of receipient addresses found in the CSV, not unique addresses
87 toFieldAddressesInCSVCount = 0
88
89
90 ## This set holds the list of unique subject lines, so that I can get a count below.
91 uniqueSubjectLineSet = set()
92 ## This is duplicative to the above set but I'm creating a matrix here that tracks the number of times a subject appears to gather a top 10
93 subjectLineCountMatrix = {}
94
95 with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
96 csv_reader = csv.DictReader(csv_file)
97 for row in csv_reader:
98 fileRowCount += 1
99 if len(row['To']) == 80:
100 eightyCharRowCount +=1
101 else:
102 nonEightyCharRowCount +=1
103 if len(row['To']) > 80:
104 charsOverEighty = True
105 toValue = row['To']
106 toValue = toValue.upper()
107
108 ## Match and gather all possible email addresses, adding it to the per CSV set.
109 allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
110 for eAddress in allEmailAddresses:
111 allEmailAddressesInCSVSet.add(eAddress)
112
113 ## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
114 if len(allEmailAddresses) == 0:
115 noAddressesInToFieldCount +=1
116 else:
117 toFieldAddressesInCSVCount += len(allEmailAddresses)
118 ## Perform the main logic tests
119 if "NTRS.COM" in toValue:
120 ## The domain was found. Apply next test.
121 ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
122 ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
123 if ntrsLanIDTrueTestResult:
124 ## At least 1 LAN ID was found, using the True Test
125 ntrsAndLanBucketCount +=1
126 #for a in ntrsLanAddressesList:
127 # allToLanAddressesSet.add(a)
128 else:
129 ## Not 1 true LAN ID was found, using the True Test
130 ntrsNoLanBucketCount +=1
131 ## Also since no LAN ID was found in the TO field, add to the unique senders list.
132 senderValue = row['Sender']
133 self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
134 else:
135 ## No ntrs addresses found at all,
136 noNtrsDomainBucketCount +=1
137 ## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
138 senderValue = row['Sender']
139 self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
140
141 ## Gather the subject line, so that I can later get a deduplicated count.
142 subjectLineValue = row['Subject']
143 uniqueSubjectLineSet.add(subjectLineValue.upper())
144 ## Also populate a subject line matrix, tracking the number of times it appears, to gather a top 10
145 try:
146 subjectLineCountMatrix[subjectLineValue.upper()] += 1
147 except:
148 subjectLineCountMatrix[subjectLineValue.upper()] = 1
149
150
151 ## This is the count of unique subject lines that exist within the CSV.
152 uniqueSubjectLineCount = len(uniqueSubjectLineSet)
153
154 scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
155
156 ## Gather the top 10 subject lines
157 sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True))
158 sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys())
159 if len(sortedSubjectKeyList) < 10:
160 sNumbMax = len(sortedSubjectKeyList)
161 else:
162 sNumbMax = 10
163 for sNumb in range(sNumbMax):
164 #print(f"{sortedSubjectKeyList[s]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}")
165 self.subjectLineTopTenReportSet.add(f"{fl}|{sortedSubjectKeyList[sNumb]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]/fileRowCount}")
166
167 print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
168 csv_file.close()
169 ## Update the global all email addresses set, if they selected this option
170 if writeAllPossibleEmailAddressesLogFile:
171 self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
172
173
174 if writeTrueLanIDLogFile:
175 print("Writing the True LAN ID log file...")
176 self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
177 print("Done.\n")
178 if writeFalsePositiveLanIDLogFile:
179 print("Writing the False-Positive LAN ID log file...")
180 self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
181 print("Done.\n")
182 if writeAllPossibleEmailAddressesLogFile:
183 print("Writing the All Possible Email Addresses Across All CSV Files log file...")
184 self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
185 print("Done.\n")
186 if writeSpecialSendersListLogFile:
187 print("Writing the deduplicated special senders data to log file....")
188 self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
189 print("Done.\n")
190 if writeTopTenSubjectLinesReportLogFile:
191 print("Writing the Top 10 Subject Lines Report to log file....")
192 self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName, doNotSort = True)
193 print("Done.\n")
194
195
196 def LanIDTrueTest(self, listOfIds):
197 """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
198 lanIDTestResult = False
199 for lanID in listOfIds:
200 alphaOnly = [x.lower() for x in lanID if x.isalpha()]
201 if len(alphaOnly) > 10:
202 ## I'm too big to be a true LAN ID
203 self.falsePositiveLanIdAddressesSet.add(lanID)
204 else:
205 self.trueLanIdAddressesSet.add(lanID)
206 lanIDTestResult = True
207 return lanIDTestResult
208
209
210 def WriteLogFile(self, setOfValues, outputFilePath, doNotSort = False):
211 """Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
212 fileNameInc = 0
213 while os.path.isfile(outputFilePath):
214 fileNameInc +=1
215 outputFile, extension = os.path.splitext(outputFilePath)
216 outputFilePath = outputFile + str(fileNameInc) + extension
217 outFl = open(outputFilePath,'w')
218 tempList = list(setOfValues)
219 if doNotSort:
220 pass
221 else:
222 tempList.sort()
223 for i in tempList:
224 try:
225 outFl.write(f"{i}\n")
226 except:
227 print(i)
228 outFl.close()
229
230 def CalculateScenario(self,rawNumber, averageNumbRecip):
231 """This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
232 scenario = "Uncategorized"
233 if rawNumber > .76:
234 scenario = "C"
235 elif averageNumbRecip < 15 and rawNumber < .019:
236 scenario = "A or B"
237 elif rawNumber > .009 and rawNumber < .34:
238 scenario = "D"
239
240 return scenario
241
242 if __name__ == '__main__':
243
244 tsa = TopSendersAnalyzer()
245 tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True)
246
247
248 #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
249 #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
250 #allToLanAddressesList = list(allToLanAddressesSet)
251 #allToLanAddressesList.sort()
252 #for i in allToLanAddressesList:
253 # outputFile.write(f"{i}\n")
254 #outputFile.close()
255
256 ## Initially gathering some very basic information across the CSV files, not using csv lib
257 # for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
258 # for fl in files:
259 # contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
260 # print(f"{fl}|{len(contents)-1}")