ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
Revision: 811
Committed: Fri Jan 5 20:25:29 2024 UTC (2 years, 2 months ago) by nino.borges
Content type: text/x-python
File size: 13153 byte(s)
Log Message:
This version has the working senderEmailAddressesAcrossCSVsSet and is before I'm adding code for the unique subject line count.

File Contents

# User Rev Content
1 nino.borges 797 """
2    
3     NTRS-TopSenderAnalysis
4    
5     Created by:
6     Emanuel Borges
7     09.20.2023
8    
9     Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10 nino.borges 798 To-Do: Add a method to QC for parsing errors; every CSV should have the same number of fields.
11 nino.borges 797
12     """
13    
14 nino.borges 798 import csv, os, re
15 nino.borges 797
class TopSendersAnalyzer(object):
    """Walk a directory tree of CSV exports and report on the recipients in each file.

    Every file found under ``startDir`` is read with ``csv.DictReader`` and must
    provide ``To`` and ``Sender`` columns.  For each file one pipe-delimited
    report line is printed to stdout; across all files the analyzer also
    accumulates several sets (LAN ID matches, false positives, all email
    addresses, "special" senders) that can optionally be written to log files.
    """
    version = "0.08"

    def __init__(self, startDir=None, csvEncoding="ANSI"):
        """Set up paths, regular expressions and the cross-file accumulators.

        Args:
            startDir: Root directory that is walked for CSV files.  Defaults to
                the original hard-coded project directory.
            csvEncoding: Encoding used to open every CSV file.  The default,
                "ANSI", is an alias of the "mbcs" codec, which Python only
                provides on Windows; pass another codec name on other platforms.
        """
        if startDir is None:
            startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
        self.startDir = startDir
        self.csvEncoding = csvEncoding

        ## Matrix containing the scenarios to scenario descriptions
        self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}

        ## All possible email addresses across all CSV files
        self.allPossibleEmailAddressesSet = set()
        self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"

        ## NOTE: collecting every address with an @NTRS.COM domain is currently unsupported.

        ## All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"

        ## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"

        ## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID.
        ## Stored as "fileName|SENDER VALUE" strings.
        ## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
        self.senderEmailAddressesAcrossCSVsSet = set()
        self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"

        ## Loose LAN ID candidate pattern; candidates are then vetted by LanIDTrueTest.
        ## (An earlier, stricter version was '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'.)
        self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'

        ## Simple match to pull out the date as recorded in the path
        self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'

        ## Match for pulling out all email addresses, regardless of domain.
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

    def AnalyzeTopSenders(self, writeTrueLanIDLogFile=False, writeFalsePositiveLanIDLogFile=False,
                          writeAllPossibleEmailAddressesLogFile=False, writeSpecialSendersListLogFile=False):
        """Main Method in this program.

        Prints a header row, then one report row per file found under
        ``self.startDir``, and finally writes whichever log files were requested.

        Args:
            writeTrueLanIDLogFile: Write ``trueLanIdAddressesSet`` to its log file.
            writeFalsePositiveLanIDLogFile: Write ``falsePositiveLanIdAddressesSet``
                to its log file.
            writeAllPossibleEmailAddressesLogFile: Accumulate every email address
                seen across all files and write them to their log file.
            writeSpecialSendersListLogFile: Write ``senderEmailAddressesAcrossCSVsSet``
                to its log file.

        Raises:
            KeyError: If a file has no ``To`` column, or no ``Sender`` column
                when a sender has to be recorded.
            LookupError: If ``self.csvEncoding`` is not a codec known to Python
                on this platform.
        """
        print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description")
        for (root, _dirs, files) in os.walk(self.startDir):
            for fl in files:
                allEmailAddressesInCSVSet = self._AnalyzeCsvFile(root, fl)
                ## Update the global all email addresses set, if they selected this option
                if writeAllPossibleEmailAddressesLogFile:
                    self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)

        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeAllPossibleEmailAddressesLogFile:
            print("Writing the All Possible Email Addresses Across All CSV Files log file...")
            self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
            print("Done.\n")
        if writeSpecialSendersListLogFile:
            print("Writing the deduplicated special senders data to log file....")
            self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
            print("Done.\n")

    def _AnalyzeCsvFile(self, root, fl):
        """Analyze one CSV file, print its report row and return its unique addresses.

        Args:
            root: Directory containing the file (also searched for the date).
            fl: File name inside ``root``.

        Returns:
            The set of upper-cased email addresses found in the file's ``To`` column.
        """
        fileRowCount = 0
        noAddressesInToFieldCount = 0
        eightyCharRowCount = 0
        nonEightyCharRowCount = 0
        charsOverEighty = False
        noNtrsDomainBucketCount = 0
        ntrsAndLanBucketCount = 0
        ntrsNoLanBucketCount = 0
        ## Kept as the findall() list so the report column looks the same as before.
        dateInPath = re.findall(self.dateInPathRegExPattern, root)
        allEmailAddressesInCSVSet = set()

        ## This is the full count of recipient addresses found in the CSV, not unique addresses
        toFieldAddressesInCSVCount = 0

        ## Compile once per file rather than looking the patterns up for every row.
        emailPattern = re.compile(self.allPossibleEmailAddressesRegExPattern)
        lanIdPattern = re.compile(self.lanIdRegExPattern)

        ## newline='' is what the csv module documents for file objects, so that
        ## line breaks embedded in quoted fields reach the reader untranslated.
        with open(os.path.join(root, fl), mode='r', encoding=self.csvEncoding, newline='') as csv_file:
            for row in csv.DictReader(csv_file):
                fileRowCount += 1
                ## DictReader fills columns missing from a short row with None.
                rawToValue = row['To'] or ""
                if len(rawToValue) == 80:
                    eightyCharRowCount += 1
                else:
                    nonEightyCharRowCount += 1
                if len(rawToValue) > 80:
                    charsOverEighty = True
                toValue = rawToValue.upper()

                ## Match and gather all possible email addresses, adding it to the per CSV set.
                allEmailAddresses = emailPattern.findall(toValue)
                allEmailAddressesInCSVSet.update(allEmailAddresses)

                ## If there are no email addresses in the TO field at all, increment that count.
                ## Else, add the number to the full count of email addresses for average calculation
                if not allEmailAddresses:
                    noAddressesInToFieldCount += 1
                else:
                    toFieldAddressesInCSVCount += len(allEmailAddresses)

                ## Perform the main logic tests
                if "NTRS.COM" in toValue:
                    ## The domain was found. Apply next test.
                    if self.LanIDTrueTest(lanIdPattern.findall(toValue)):
                        ## At least 1 LAN ID was found, using the True Test
                        ntrsAndLanBucketCount += 1
                        recordSender = False
                    else:
                        ## Not 1 true LAN ID was found, using the True Test
                        ntrsNoLanBucketCount += 1
                        recordSender = True
                else:
                    ## No ntrs addresses found at all,
                    noNtrsDomainBucketCount += 1
                    recordSender = True

                if recordSender:
                    ## No LAN ID in the TO field, so add to the unique senders list.
                    senderValue = row['Sender'] or ""
                    self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")

        lanRatio = self._SafeRatio(ntrsAndLanBucketCount, fileRowCount)
        averageRecipients = self._SafeRatio(toFieldAddressesInCSVCount, fileRowCount)
        noNtrsRatio = self._SafeRatio(noNtrsDomainBucketCount, fileRowCount)
        ntrsNoLanRatio = self._SafeRatio(ntrsNoLanBucketCount, fileRowCount)
        if fileRowCount:
            scenario = self.CalculateScenario(lanRatio, averageRecipients)
        else:
            ## A header-only file has nothing to categorize (and used to raise ZeroDivisionError).
            scenario = "Uncategorized"
        print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{averageRecipients}|{noNtrsDomainBucketCount}|{noNtrsRatio}|{ntrsAndLanBucketCount}|{lanRatio}|{ntrsNoLanBucketCount}|{ntrsNoLanRatio}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}")
        return allEmailAddressesInCSVSet

    @staticmethod
    def _SafeRatio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def LanIDTrueTest(self, listOfIds):
        """A need for a more complicated LAN ID test was needed.

        Each candidate is filed into ``trueLanIdAddressesSet`` or
        ``falsePositiveLanIdAddressesSet`` as a side effect.

        Args:
            listOfIds: Candidate strings as matched by ``lanIdRegExPattern``.

        Returns:
            True if at least 1 true LAN ID is found in the list, else False.
        """
        lanIDTestResult = False
        for lanID in listOfIds:
            ## The count covers the whole match, i.e. it normally includes the 7
            ## letters of "NTRS.COM", which leaves room for 3 letters in the ID itself.
            ## NOTE(review): confirm that this is the limit the specification intends.
            alphaCount = sum(1 for character in lanID if character.isalpha())
            if alphaCount > 10:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult

    def WriteLogFile(self, setOfValues, outputFilePath):
        """Takes a Set containing values, sorts these, and then writes them to the given outputFilePath.

        An existing file is never overwritten: a counter is inserted before the
        extension (name1.txt, name2.txt, ...) until an unused name is found.

        Args:
            setOfValues: Iterable of values to write, one per line.
            outputFilePath: Preferred output path.

        Returns:
            The path that was actually written.
        """
        ## Split once, outside the loop, so the counter replaces the previous
        ## attempt instead of being appended to it (name1, name12, name123, ...).
        outputFile, extension = os.path.splitext(outputFilePath)
        fileNameInc = 0
        while os.path.isfile(outputFilePath):
            fileNameInc += 1
            outputFilePath = f"{outputFile}{fileNameInc}{extension}"
        with open(outputFilePath, 'w') as outFl:
            for value in sorted(setOfValues):
                try:
                    outFl.write(f"{value}\n")
                except UnicodeEncodeError:
                    ## Best effort: a value the output encoding cannot represent
                    ## is echoed to stdout instead of aborting the whole log.
                    print(value)
        return outputFilePath

    def CalculateScenario(self, rawNumber, averageNumbRecip):
        """Return the scenario code for one CSV file.

        Args:
            rawNumber: Share of rows with at least one true LAN ID, as a decimal
                fraction (0.5 means 50%).
            averageNumbRecip: Average number of recipient addresses per row.

        Returns:
            "C", "A or B", "D" or "Uncategorized"; the first matching rule wins.
        """
        ## NOTE(review): these cut-offs differ slightly from the wording in
        ## scenarioDescriptionMatrix (>.76 vs ">75%", <.34 vs "<33%") - confirm.
        if rawNumber > .76:
            return "C"
        if averageNumbRecip < 15 and rawNumber < .019:
            return "A or B"
        if rawNumber > .009 and rawNumber < .34:
            return "D"
        return "Uncategorized"
197    
if __name__ == '__main__':
    # Script entry point: run the analysis with every optional log file enabled.
    logFileOptions = {
        "writeTrueLanIDLogFile": True,
        "writeFalsePositiveLanIDLogFile": True,
        "writeAllPossibleEmailAddressesLogFile": True,
        "writeSpecialSendersListLogFile": True,
    }
    analyzer = TopSendersAnalyzer()
    analyzer.AnalyzeTopSenders(**logFileOptions)
202 nino.borges 797
203 nino.borges 801
204 nino.borges 800 #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
205     #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
206     #allToLanAddressesList = list(allToLanAddressesSet)
207     #allToLanAddressesList.sort()
208     #for i in allToLanAddressesList:
209     # outputFile.write(f"{i}\n")
210     #outputFile.close()
211 nino.borges 797
212     ## Initially gathering some very basic information across the CSV files, not using csv lib
213     # for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
214     # for fl in files:
215     # contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
216     # print(f"{fl}|{len(contents)-1}")