ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
Revision: 812
Committed: Mon Jan 8 21:44:57 2024 UTC (2 years, 2 months ago) by nino.borges
Content type: text/x-python
File size: 13817 byte(s)
Log Message:
This is the working version that has the final senders values code as well as the unique subject line count code but before I added any top 10 subject line coding was added.

File Contents

# Content
1 """
2
3 NTRS-TopSenderAnalysis
4
5 Created by:
6 Emanuel Borges
7 09.20.2023
8
9 Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10 To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11
12 """
13
14 import csv, os, re
15
16 class TopSendersAnalyzer(object):
17 version = "0.09"
18
19 def __init__(self):
20 self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
21 #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
22 #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
23
24
25 ## Matrix containing the scenarios to scenario descriptions
26 self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}
27
28 ## All possible email addresses across all CSV files
29 self.allPossibleEmailAddressesSet = set()
30 self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
31
32 ## All email addresses with an @NTRS.COM domain. Currently unsupported.
33 #self.allToNtrsAddressesSet = set()
34 #self.allToNtrsAddressesOutputFileName = r""
35
36 ## All true NTRS LAN ID matches, per specification provided to me.
37 self.trueLanIdAddressesSet = set()
38 self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
39
40 ## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
41 self.falsePositiveLanIdAddressesSet = set()
42 self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
43
44 ## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName|valuesSet.
45 ## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
46 self.senderEmailAddressesAcrossCSVsSet = set()
47 self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
48
49 #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
50 self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
51
52
53 ## Simple match to pull out the date as recorded in the path
54 self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
55
56
57 ## Match for pulling out all email addresses, regardless of domain.
58 #self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
59 #self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
60 #self.allPossibleEmailAddressesRegExPattern = r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
61 #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
62 self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
63
64 def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False):
65 """Main Method in this program"""
66 print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
67 for (root,dirs,files) in os.walk(self.startDir):
68 for fl in files:
69 fileRowCount = 0
70 noAddressesInToFieldCount = 0
71 eightyCharRowCount = 0
72 nonEightyCharRowCount = 0
73 charsOverEighty = False
74 noNtrsDomainBucketCount = 0
75 ntrsAndLanBucketCount = 0
76 ntrsNoLanBucketCount = 0
77 dateInPath = re.findall(self.dateInPathRegExPattern, root)
78 allEmailAddressesInCSVSet = set()
79
80
81 ## This is the full count of receipient addresses found in the CSV, not unique addresses
82 toFieldAddressesInCSVCount = 0
83
84
85 ## This set holds the list of unique subject lines, so that I can get a count below.
86 uniqueSubjectLineSet = set()
87
88 with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
89 csv_reader = csv.DictReader(csv_file)
90 for row in csv_reader:
91 fileRowCount += 1
92 if len(row['To']) == 80:
93 eightyCharRowCount +=1
94 else:
95 nonEightyCharRowCount +=1
96 if len(row['To']) > 80:
97 charsOverEighty = True
98 toValue = row['To']
99 toValue = toValue.upper()
100
101 ## Match and gather all possible email addresses, adding it to the per CSV set.
102 allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
103 for eAddress in allEmailAddresses:
104 allEmailAddressesInCSVSet.add(eAddress)
105
106 ## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
107 if len(allEmailAddresses) == 0:
108 noAddressesInToFieldCount +=1
109 else:
110 toFieldAddressesInCSVCount += len(allEmailAddresses)
111 ## Perform the main logic tests
112 if "NTRS.COM" in toValue:
113 ## The domain was found. Apply next test.
114 ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
115 ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
116 if ntrsLanIDTrueTestResult:
117 ## At least 1 LAN ID was found, using the True Test
118 ntrsAndLanBucketCount +=1
119 #for a in ntrsLanAddressesList:
120 # allToLanAddressesSet.add(a)
121 else:
122 ## Not 1 true LAN ID was found, using the True Test
123 ntrsNoLanBucketCount +=1
124 ## Also since no LAN ID was found in the TO field, add to the unique senders list.
125 senderValue = row['Sender']
126 self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
127 else:
128 ## No ntrs addresses found at all,
129 noNtrsDomainBucketCount +=1
130 ## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
131 senderValue = row['Sender']
132 self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
133
134 ## Gather the subject line, so that I can later get a deduplicated count.
135 subjectLineValue = row['Subject']
136 uniqueSubjectLineSet.add(subjectLineValue.upper())
137
138
139 ## This is the count of unique subject lines that exist within the CSV.
140 uniqueSubjectLineCount = len(uniqueSubjectLineSet)
141 scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
142
143 print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
144 csv_file.close()
145 ## Update the global all email addresses set, if they selected this option
146 if writeAllPossibleEmailAddressesLogFile:
147 self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
148
149
150 if writeTrueLanIDLogFile:
151 print("Writing the True LAN ID log file...")
152 self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
153 print("Done.\n")
154 if writeFalsePositiveLanIDLogFile:
155 print("Writing the False-Positive LAN ID log file...")
156 self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
157 print("Done.\n")
158 if writeAllPossibleEmailAddressesLogFile:
159 print("Writing the All Possible Email Addresses Across All CSV Files log file...")
160 self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
161 print("Done.\n")
162 if writeSpecialSendersListLogFile:
163 print("Writing the deduplicated special senders data to log file....")
164 self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
165 print("Done.\n")
166
167
168 def LanIDTrueTest(self, listOfIds):
169 """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
170 lanIDTestResult = False
171 for lanID in listOfIds:
172 alphaOnly = [x.lower() for x in lanID if x.isalpha()]
173 if len(alphaOnly) > 10:
174 ## I'm too big to be a true LAN ID
175 self.falsePositiveLanIdAddressesSet.add(lanID)
176 else:
177 self.trueLanIdAddressesSet.add(lanID)
178 lanIDTestResult = True
179 return lanIDTestResult
180
181
182 def WriteLogFile(self, setOfValues, outputFilePath):
183 """Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
184 fileNameInc = 0
185 while os.path.isfile(outputFilePath):
186 fileNameInc +=1
187 outputFile, extension = os.path.splitext(outputFilePath)
188 outputFilePath = outputFile + str(fileNameInc) + extension
189 outFl = open(outputFilePath,'w')
190 tempList = list(setOfValues)
191 tempList.sort()
192 for i in tempList:
193 try:
194 outFl.write(f"{i}\n")
195 except:
196 print(i)
197 outFl.close()
198
199 def CalculateScenario(self,rawNumber, averageNumbRecip):
200 """This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
201 scenario = "Uncategorized"
202 if rawNumber > .76:
203 scenario = "C"
204 elif averageNumbRecip < 15 and rawNumber < .019:
205 scenario = "A or B"
206 elif rawNumber > .009 and rawNumber < .34:
207 scenario = "D"
208
209 return scenario
210
211 if __name__ == '__main__':
212
213 tsa = TopSendersAnalyzer()
214 tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True)
215
216
217 #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
218 #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
219 #allToLanAddressesList = list(allToLanAddressesSet)
220 #allToLanAddressesList.sort()
221 #for i in allToLanAddressesList:
222 # outputFile.write(f"{i}\n")
223 #outputFile.close()
224
225 ## Initially gathering some very basic information across the CSV files, not using csv lib
226 # for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
227 # for fl in files:
228 # contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
229 # print(f"{fl}|{len(contents)-1}")