ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
Revision: 812
Committed: Mon Jan 8 21:44:57 2024 UTC (2 years, 2 months ago) by nino.borges
Content type: text/x-python
File size: 13817 byte(s)
Log Message:
This is the working version that has the final senders values code as well as the unique subject line count code but before I added any top 10 subject line coding was added.

File Contents

# User Rev Content
1 nino.borges 797 """
2    
3     NTRS-TopSenderAnalysis
4    
5     Created by:
6     Emanuel Borges
7     09.20.2023
8    
9     Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10 nino.borges 798 To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11 nino.borges 797
12     """
13    
14 nino.borges 798 import csv, os, re
15 nino.borges 797
16 nino.borges 801 class TopSendersAnalyzer(object):
17 nino.borges 812 version = "0.09"
18 nino.borges 801
19     def __init__(self):
20 nino.borges 809 self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
21     #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
22 nino.borges 801 #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
23    
24 nino.borges 803
25 nino.borges 809 ## Matrix containing the scenarios to scenario descriptions
26     self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}
27    
28 nino.borges 803 ## All possible email addresses across all CSV files
29     self.allPossibleEmailAddressesSet = set()
30     self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
31    
32 nino.borges 801 ## All email addresses with an @NTRS.COM domain. Currently unsupported.
33     #self.allToNtrsAddressesSet = set()
34     #self.allToNtrsAddressesOutputFileName = r""
35    
36     ## All true NTRS LAN ID matches, per specification provided to me.
37     self.trueLanIdAddressesSet = set()
38     self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
39    
40     ## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
41     self.falsePositiveLanIdAddressesSet = set()
42     self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
43 nino.borges 810
44 nino.borges 811 ## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName|valuesSet.
45     ## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
46     self.senderEmailAddressesAcrossCSVsSet = set()
47 nino.borges 810 self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
48 nino.borges 800
49 nino.borges 803 #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
50     self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
51 nino.borges 800
52 nino.borges 803
53     ## Simple match to pull out the date as recorded in the path
54     self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
55    
56    
57     ## Match for pulling out all email addresses, regardless of domain.
58     #self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
59     #self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
60     #self.allPossibleEmailAddressesRegExPattern = r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
61     #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
62     self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
63    
64 nino.borges 810 def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False):
65 nino.borges 801 """Main Method in this program"""
66 nino.borges 812 print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
67 nino.borges 801 for (root,dirs,files) in os.walk(self.startDir):
68     for fl in files:
69     fileRowCount = 0
70 nino.borges 809 noAddressesInToFieldCount = 0
71 nino.borges 801 eightyCharRowCount = 0
72     nonEightyCharRowCount = 0
73     charsOverEighty = False
74     noNtrsDomainBucketCount = 0
75     ntrsAndLanBucketCount = 0
76     ntrsNoLanBucketCount = 0
77 nino.borges 803 dateInPath = re.findall(self.dateInPathRegExPattern, root)
78     allEmailAddressesInCSVSet = set()
79 nino.borges 801
80 nino.borges 810
81 nino.borges 809 ## This is the full count of receipient addresses found in the CSV, not unique addresses
82     toFieldAddressesInCSVCount = 0
83    
84 nino.borges 812
85     ## This set holds the list of unique subject lines, so that I can get a count below.
86     uniqueSubjectLineSet = set()
87    
88 nino.borges 801 with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
89     csv_reader = csv.DictReader(csv_file)
90     for row in csv_reader:
91     fileRowCount += 1
92     if len(row['To']) == 80:
93     eightyCharRowCount +=1
94     else:
95     nonEightyCharRowCount +=1
96     if len(row['To']) > 80:
97     charsOverEighty = True
98     toValue = row['To']
99     toValue = toValue.upper()
100 nino.borges 803
101     ## Match and gather all possible email addresses, adding it to the per CSV set.
102 nino.borges 809 allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
103 nino.borges 803 for eAddress in allEmailAddresses:
104     allEmailAddressesInCSVSet.add(eAddress)
105 nino.borges 809
106     ## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
107     if len(allEmailAddresses) == 0:
108     noAddressesInToFieldCount +=1
109     else:
110     toFieldAddressesInCSVCount += len(allEmailAddresses)
111 nino.borges 803 ## Perform the main logic tests
112 nino.borges 804 if "NTRS.COM" in toValue:
113 nino.borges 801 ## The domain was found. Apply next test.
114 nino.borges 803 ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
115 nino.borges 801 ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
116     if ntrsLanIDTrueTestResult:
117     ## At least 1 LAN ID was found, using the True Test
118     ntrsAndLanBucketCount +=1
119     #for a in ntrsLanAddressesList:
120     # allToLanAddressesSet.add(a)
121     else:
122     ## Not 1 true LAN ID was found, using the True Test
123     ntrsNoLanBucketCount +=1
124 nino.borges 810 ## Also since no LAN ID was found in the TO field, add to the unique senders list.
125 nino.borges 812 senderValue = row['Sender']
126 nino.borges 811 self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
127 nino.borges 801 else:
128     ## No ntrs addresses found at all,
129     noNtrsDomainBucketCount +=1
130 nino.borges 810 ## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
131 nino.borges 812 senderValue = row['Sender']
132 nino.borges 811 self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
133    
134 nino.borges 812 ## Gather the subject line, so that I can later get a deduplicated count.
135     subjectLineValue = row['Subject']
136     uniqueSubjectLineSet.add(subjectLineValue.upper())
137    
138    
139     ## This is the count of unique subject lines that exist within the CSV.
140     uniqueSubjectLineCount = len(uniqueSubjectLineSet)
141 nino.borges 809 scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
142 nino.borges 812
143     print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
144 nino.borges 801 csv_file.close()
145 nino.borges 803 ## Update the global all email addresses set, if they selected this option
146     if writeAllPossibleEmailAddressesLogFile:
147     self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
148 nino.borges 812
149    
150 nino.borges 801 if writeTrueLanIDLogFile:
151     print("Writing the True LAN ID log file...")
152     self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
153     print("Done.\n")
154     if writeFalsePositiveLanIDLogFile:
155     print("Writing the False-Positive LAN ID log file...")
156     self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
157     print("Done.\n")
158 nino.borges 803 if writeAllPossibleEmailAddressesLogFile:
159     print("Writing the All Possible Email Addresses Across All CSV Files log file...")
160     self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
161     print("Done.\n")
162 nino.borges 810 if writeSpecialSendersListLogFile:
163     print("Writing the deduplicated special senders data to log file....")
164 nino.borges 811 self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
165 nino.borges 810 print("Done.\n")
166 nino.borges 801
167    
168     def LanIDTrueTest(self, listOfIds):
169     """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
170     lanIDTestResult = False
171     for lanID in listOfIds:
172     alphaOnly = [x.lower() for x in lanID if x.isalpha()]
173     if len(alphaOnly) > 10:
174     ## I'm too big to be a true LAN ID
175     self.falsePositiveLanIdAddressesSet.add(lanID)
176     else:
177     self.trueLanIdAddressesSet.add(lanID)
178     lanIDTestResult = True
179     return lanIDTestResult
180    
181    
182     def WriteLogFile(self, setOfValues, outputFilePath):
183     """Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
184     fileNameInc = 0
185     while os.path.isfile(outputFilePath):
186     fileNameInc +=1
187     outputFile, extension = os.path.splitext(outputFilePath)
188     outputFilePath = outputFile + str(fileNameInc) + extension
189     outFl = open(outputFilePath,'w')
190     tempList = list(setOfValues)
191     tempList.sort()
192     for i in tempList:
193 nino.borges 811 try:
194     outFl.write(f"{i}\n")
195     except:
196     print(i)
197 nino.borges 801 outFl.close()
198    
199 nino.borges 809 def CalculateScenario(self,rawNumber, averageNumbRecip):
200     """This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
201     scenario = "Uncategorized"
202     if rawNumber > .76:
203     scenario = "C"
204     elif averageNumbRecip < 15 and rawNumber < .019:
205     scenario = "A or B"
206     elif rawNumber > .009 and rawNumber < .34:
207     scenario = "D"
208    
209     return scenario
210    
211 nino.borges 797 if __name__ == '__main__':
212    
213 nino.borges 801 tsa = TopSendersAnalyzer()
214 nino.borges 811 tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True)
215 nino.borges 797
216 nino.borges 801
217 nino.borges 800 #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
218     #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
219     #allToLanAddressesList = list(allToLanAddressesSet)
220     #allToLanAddressesList.sort()
221     #for i in allToLanAddressesList:
222     # outputFile.write(f"{i}\n")
223     #outputFile.close()
224 nino.borges 797
225     ## Initially gathering some very basic information across the CSV files, not using csv lib
226     # for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
227     # for fl in files:
228     # contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
229     # print(f"{fl}|{len(contents)-1}")