ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
Revision: 814
Committed: Thu Jan 11 17:03:32 2024 UTC (2 years, 2 months ago) by nino.borges
Content type: text/x-python
File size: 15947 byte(s)
Log Message:
Changed the column name CSV Date to Folder Date and tried adding support for no sorting in the write log method, because the team asked that the Top10Subject lines values be sorted in the spreadsheet by count and not by the subject line value.  These results saved as analysis report v15.

File Contents

# User Rev Content
1 nino.borges 797 """
2    
3     NTRS-TopSenderAnalysis
4    
5     Created by:
6     Emanuel Borges
7     09.20.2023
8    
9     Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10 nino.borges 798 To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11 nino.borges 797
12     """
13    
14 nino.borges 798 import csv, os, re
15 nino.borges 797
16 nino.borges 801 class TopSendersAnalyzer(object):
17 nino.borges 814 version = "0.11"
18 nino.borges 801
19     def __init__(self):
20 nino.borges 809 self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
21     #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
22 nino.borges 801 #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
23    
24 nino.borges 803
25 nino.borges 809 ## Matrix containing the scenarios to scenario descriptions
26     self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}
27    
28 nino.borges 803 ## All possible email addresses across all CSV files
29     self.allPossibleEmailAddressesSet = set()
30     self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
31    
32 nino.borges 801 ## All email addresses with an @NTRS.COM domain. Currently unsupported.
33     #self.allToNtrsAddressesSet = set()
34     #self.allToNtrsAddressesOutputFileName = r""
35    
36     ## All true NTRS LAN ID matches, per specification provided to me.
37     self.trueLanIdAddressesSet = set()
38     self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
39    
40     ## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
41     self.falsePositiveLanIdAddressesSet = set()
42     self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
43 nino.borges 810
44 nino.borges 811 ## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName|valuesSet.
45     ## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
46     self.senderEmailAddressesAcrossCSVsSet = set()
47 nino.borges 810 self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
48 nino.borges 813
49    
50     ## Top 10 Subject Lines Report across all CSVs, keepng track of the subject line, count and file from where it cam
51     self.subjectLineTopTenReportSet = set()
52     self.subjectLineTopTenReportFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt"
53    
54 nino.borges 803 #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
55     self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
56 nino.borges 800
57 nino.borges 803
58     ## Simple match to pull out the date as recorded in the path
59     self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
60    
61    
62     ## Match for pulling out all email addresses, regardless of domain.
63     #self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
64     #self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
65     #self.allPossibleEmailAddressesRegExPattern = r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
66     #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
67     self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
68    
69 nino.borges 813 def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False):
70 nino.borges 801 """Main Method in this program"""
71 nino.borges 814 print("FileName|Folder Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
72 nino.borges 801 for (root,dirs,files) in os.walk(self.startDir):
73     for fl in files:
74     fileRowCount = 0
75 nino.borges 809 noAddressesInToFieldCount = 0
76 nino.borges 801 eightyCharRowCount = 0
77     nonEightyCharRowCount = 0
78     charsOverEighty = False
79     noNtrsDomainBucketCount = 0
80     ntrsAndLanBucketCount = 0
81     ntrsNoLanBucketCount = 0
82 nino.borges 803 dateInPath = re.findall(self.dateInPathRegExPattern, root)
83     allEmailAddressesInCSVSet = set()
84 nino.borges 801
85 nino.borges 810
86 nino.borges 809 ## This is the full count of receipient addresses found in the CSV, not unique addresses
87     toFieldAddressesInCSVCount = 0
88    
89 nino.borges 812
90 nino.borges 813 ## This set holds the list of unique subject lines, so that I can get a count below.
91 nino.borges 812 uniqueSubjectLineSet = set()
92 nino.borges 813 ## This is duplicative to the above set but I'm creating a matrix here that tracks the number of times a subject appears to gather a top 10
93     subjectLineCountMatrix = {}
94 nino.borges 812
95 nino.borges 801 with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
96     csv_reader = csv.DictReader(csv_file)
97     for row in csv_reader:
98     fileRowCount += 1
99     if len(row['To']) == 80:
100     eightyCharRowCount +=1
101     else:
102     nonEightyCharRowCount +=1
103     if len(row['To']) > 80:
104     charsOverEighty = True
105     toValue = row['To']
106     toValue = toValue.upper()
107 nino.borges 803
108     ## Match and gather all possible email addresses, adding it to the per CSV set.
109 nino.borges 809 allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
110 nino.borges 803 for eAddress in allEmailAddresses:
111     allEmailAddressesInCSVSet.add(eAddress)
112 nino.borges 809
113     ## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
114     if len(allEmailAddresses) == 0:
115     noAddressesInToFieldCount +=1
116     else:
117     toFieldAddressesInCSVCount += len(allEmailAddresses)
118 nino.borges 803 ## Perform the main logic tests
119 nino.borges 804 if "NTRS.COM" in toValue:
120 nino.borges 801 ## The domain was found. Apply next test.
121 nino.borges 803 ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
122 nino.borges 801 ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
123     if ntrsLanIDTrueTestResult:
124     ## At least 1 LAN ID was found, using the True Test
125     ntrsAndLanBucketCount +=1
126     #for a in ntrsLanAddressesList:
127     # allToLanAddressesSet.add(a)
128     else:
129     ## Not 1 true LAN ID was found, using the True Test
130     ntrsNoLanBucketCount +=1
131 nino.borges 810 ## Also since no LAN ID was found in the TO field, add to the unique senders list.
132 nino.borges 812 senderValue = row['Sender']
133 nino.borges 811 self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
134 nino.borges 801 else:
135     ## No ntrs addresses found at all,
136     noNtrsDomainBucketCount +=1
137 nino.borges 810 ## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
138 nino.borges 812 senderValue = row['Sender']
139 nino.borges 811 self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
140    
141 nino.borges 812 ## Gather the subject line, so that I can later get a deduplicated count.
142     subjectLineValue = row['Subject']
143     uniqueSubjectLineSet.add(subjectLineValue.upper())
144 nino.borges 813 ## Also populate a subject line matrix, tracking the number of times it appears, to gather a top 10
145     try:
146     subjectLineCountMatrix[subjectLineValue.upper()] += 1
147     except:
148     subjectLineCountMatrix[subjectLineValue.upper()] = 1
149 nino.borges 812
150    
151     ## This is the count of unique subject lines that exist within the CSV.
152 nino.borges 813 uniqueSubjectLineCount = len(uniqueSubjectLineSet)
153    
154 nino.borges 809 scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
155 nino.borges 813
156     ## Gather the top 10 subject lines
157     sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True))
158     sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys())
159     if len(sortedSubjectKeyList) < 10:
160     sNumbMax = len(sortedSubjectKeyList)
161     else:
162     sNumbMax = 10
163     for sNumb in range(sNumbMax):
164     #print(f"{sortedSubjectKeyList[s]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}")
165 nino.borges 814 self.subjectLineTopTenReportSet.add(f"{fl}|{sortedSubjectKeyList[sNumb]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]/fileRowCount}")
166 nino.borges 813
167 nino.borges 812 print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
168 nino.borges 801 csv_file.close()
169 nino.borges 803 ## Update the global all email addresses set, if they selected this option
170     if writeAllPossibleEmailAddressesLogFile:
171     self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
172 nino.borges 812
173    
174 nino.borges 801 if writeTrueLanIDLogFile:
175     print("Writing the True LAN ID log file...")
176     self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
177     print("Done.\n")
178     if writeFalsePositiveLanIDLogFile:
179     print("Writing the False-Positive LAN ID log file...")
180     self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
181     print("Done.\n")
182 nino.borges 803 if writeAllPossibleEmailAddressesLogFile:
183     print("Writing the All Possible Email Addresses Across All CSV Files log file...")
184     self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
185     print("Done.\n")
186 nino.borges 810 if writeSpecialSendersListLogFile:
187     print("Writing the deduplicated special senders data to log file....")
188 nino.borges 811 self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
189 nino.borges 810 print("Done.\n")
190 nino.borges 813 if writeTopTenSubjectLinesReportLogFile:
191     print("Writing the Top 10 Subject Lines Report to log file....")
192 nino.borges 814 self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName, doNotSort = True)
193 nino.borges 813 print("Done.\n")
194 nino.borges 801
195    
196     def LanIDTrueTest(self, listOfIds):
197     """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
198     lanIDTestResult = False
199     for lanID in listOfIds:
200     alphaOnly = [x.lower() for x in lanID if x.isalpha()]
201     if len(alphaOnly) > 10:
202     ## I'm too big to be a true LAN ID
203     self.falsePositiveLanIdAddressesSet.add(lanID)
204     else:
205     self.trueLanIdAddressesSet.add(lanID)
206     lanIDTestResult = True
207     return lanIDTestResult
208    
209    
210 nino.borges 814 def WriteLogFile(self, setOfValues, outputFilePath, doNotSort = False):
211 nino.borges 801 """Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
212     fileNameInc = 0
213     while os.path.isfile(outputFilePath):
214     fileNameInc +=1
215     outputFile, extension = os.path.splitext(outputFilePath)
216     outputFilePath = outputFile + str(fileNameInc) + extension
217     outFl = open(outputFilePath,'w')
218     tempList = list(setOfValues)
219 nino.borges 814 if doNotSort:
220     pass
221     else:
222     tempList.sort()
223 nino.borges 801 for i in tempList:
224 nino.borges 811 try:
225     outFl.write(f"{i}\n")
226     except:
227     print(i)
228 nino.borges 801 outFl.close()
229    
230 nino.borges 809 def CalculateScenario(self,rawNumber, averageNumbRecip):
231     """This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
232     scenario = "Uncategorized"
233     if rawNumber > .76:
234     scenario = "C"
235     elif averageNumbRecip < 15 and rawNumber < .019:
236     scenario = "A or B"
237     elif rawNumber > .009 and rawNumber < .34:
238     scenario = "D"
239    
240     return scenario
241    
242 nino.borges 797 if __name__ == '__main__':
243    
244 nino.borges 801 tsa = TopSendersAnalyzer()
245 nino.borges 813 tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True)
246 nino.borges 797
247 nino.borges 801
248 nino.borges 800 #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
249     #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
250     #allToLanAddressesList = list(allToLanAddressesSet)
251     #allToLanAddressesList.sort()
252     #for i in allToLanAddressesList:
253     # outputFile.write(f"{i}\n")
254     #outputFile.close()
255 nino.borges 797
256     ## Initially gathering some very basic information across the CSV files, not using csv lib
257     # for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
258     # for fl in files:
259     # contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
260     # print(f"{fl}|{len(contents)-1}")