ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
Revision: 810
Committed: Fri Jan 5 19:24:53 2024 UTC (2 years, 2 months ago) by nino.borges
Content type: text/x-python
File size: 13325 byte(s)
Log Message:
When adding the special senders list (senders of messages where there is no NTRS domain in the TO, or where there is one but it's not a LAN ID), I initially started by wanting to have a separate list per CSV and then a global list.  However, Diana only needs a single large list, but one that references what file each value came from and only deduplicates on the file level.  I'm going to change the code for .8 to only have one set, that still only deduplicates on the file level, instead of two.  Saving this to version control in the event she asks me to have both.

File Contents

# Content
1 """
2
3 NTRS-TopSenderAnalysis
4
5 Created by:
6 Emanuel Borges
7 09.20.2023
8
9 Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10 To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.
11
12 """
13
14 import csv, os, re
15
class TopSendersAnalyzer(object):
    """Walks a directory tree of CSV exports and prints a pipe-delimited report, one line per file.

    Each CSV is expected to have a 'To' and a 'Sender' column.  For every file the
    report gives row counts, TO-field length statistics, email address counts and
    the share of rows falling into three buckets (no NTRS domain, NTRS with at
    least one true LAN ID, NTRS without a true LAN ID), plus a scenario code.
    Optional log files with the collected address sets can be written afterwards.
    """
    version = "0.07"

    def __init__(self):
        self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"

        ## Encoding used to read the input CSVs.  'ANSI' is an alias of the Windows-only
        ## 'mbcs' codec, so override this attribute when running on another platform.
        self.csvEncoding = 'ANSI'

        ## Matrix containing the scenarios to scenario descriptions
        self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}

        ## All possible email addresses across all CSV files
        self.allPossibleEmailAddressesSet = set()
        self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"

        ## All email addresses with an @NTRS.COM domain.  Currently unsupported.
        #self.allToNtrsAddressesSet = set()
        #self.allToNtrsAddressesOutputFileName = r""

        ## All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"

        ## False positive NTRS LAN ID matches, per specification provided to me.  Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"

        ## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID.  Matrix of fileName:valuesSet.
        self.senderEmailAddressesMatrix = {}
        self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"

        #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
        self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'

        ## Simple match to pull out the date as recorded in the path
        self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'

        ## Match for pulling out all email addresses, regardless of domain.
        ## (Earlier, stricter candidate patterns can be found in the version control history.)
        #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False):
        """Main Method in this program.

        Walks self.startDir, analyzes every file found as a CSV and prints one
        pipe-delimited report line per file (after a header line).

        writeTrueLanIDLogFile -- write the true LAN ID addresses to their log file.
        writeFalsePositiveLanIDLogFile -- write the false-positive LAN ID addresses to their log file.
        writeAllPossibleEmailAddressesLogFile -- collect and write every TO address found across all CSVs.
        writeSpecialSendersListLogFile -- collect and write the per-file sender values of rows
            that were not addressed to a true LAN ID.

        A KeyError is raised if a CSV has no 'To' or 'Sender' column.
        """
        print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description")
        for (root, dirs, files) in os.walk(self.startDir):
            ## The date only depends on the folder, so look it up once per folder.
            dateInPath = re.findall(self.dateInPathRegExPattern, root)
            for fl in files:
                stats = self._AnalyzeCsvFile(os.path.join(root, fl))
                fileRowCount = stats["fileRowCount"]

                ## A header-only CSV has zero rows; report zeros instead of dividing by zero.
                averageRecipients = self._SafeRatio(stats["toFieldAddressesInCSVCount"], fileRowCount)
                noNtrsDomainRatio = self._SafeRatio(stats["noNtrsDomainBucketCount"], fileRowCount)
                ntrsAndLanRatio = self._SafeRatio(stats["ntrsAndLanBucketCount"], fileRowCount)
                ntrsNoLanRatio = self._SafeRatio(stats["ntrsNoLanBucketCount"], fileRowCount)
                if fileRowCount:
                    scenario = self.CalculateScenario(ntrsAndLanRatio, averageRecipients)
                else:
                    ## Nothing to categorize in an empty file.
                    scenario = "Uncategorized"

                print(f"{fl}|{dateInPath}|{fileRowCount}|{stats['eightyCharRowCount']}|{stats['noAddressesInToFieldCount']}|{stats['nonEightyCharRowCount']}|{stats['charsOverEighty']}|{len(stats['allEmailAddressesInCSVSet'])}|{averageRecipients}|{stats['noNtrsDomainBucketCount']}|{noNtrsDomainRatio}|{stats['ntrsAndLanBucketCount']}|{ntrsAndLanRatio}|{stats['ntrsNoLanBucketCount']}|{ntrsNoLanRatio}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}")

                ## Update the global all email addresses set, if they selected this option
                if writeAllPossibleEmailAddressesLogFile:
                    self.allPossibleEmailAddressesSet.update(stats["allEmailAddressesInCSVSet"])

                ## Update the global special senders matrix, if they selected this option.
                ## Merge rather than assign so that a same-named file in another folder
                ## does not silently replace the senders gathered earlier.
                if writeSpecialSendersListLogFile:
                    self.senderEmailAddressesMatrix.setdefault(fl, set()).update(stats["senderEmailAddressesInCSVSet"])

        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeAllPossibleEmailAddressesLogFile:
            print("Writing the All Possible Email Addresses Across All CSV Files log file...")
            self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
            print("Done.\n")
        if writeSpecialSendersListLogFile:
            print("Writing the deduplicated special senders data to log file....")
            self._WriteSpecialSendersLogFile(self.senderEmailAddressesMatrix, self.senderEmailAddressesFileName)
            print("Done.\n")

    def _AnalyzeCsvFile(self, csvPath):
        """Reads a single CSV and returns a dict with its counters and collected sets.

        Side effect: every LAN ID candidate found is recorded in the true or
        false-positive LAN ID set through LanIDTrueTest.
        """
        findAllAddresses = re.compile(self.allPossibleEmailAddressesRegExPattern).findall
        findLanIdCandidates = re.compile(self.lanIdRegExPattern).findall

        fileRowCount = 0
        noAddressesInToFieldCount = 0
        eightyCharRowCount = 0
        nonEightyCharRowCount = 0
        charsOverEighty = False
        noNtrsDomainBucketCount = 0
        ntrsAndLanBucketCount = 0
        ntrsNoLanBucketCount = 0
        allEmailAddressesInCSVSet = set()

        ## This is the list of unique Sender Addresses, per CSV, ONLY IF THE EMAIL IS NOT TO A LANID.  Diana asked for this list.
        ## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
        senderEmailAddressesInCSVSet = set()

        ## This is the full count of recipient addresses found in the CSV, not unique addresses
        toFieldAddressesInCSVCount = 0

        with open(csvPath, mode='r', encoding=self.csvEncoding) as csv_file:
            for row in csv.DictReader(csv_file):
                fileRowCount += 1

                ## DictReader fills the columns missing from a short row with None;
                ## treat that as an empty value instead of crashing on len()/upper().
                rawToValue = row['To'] or ''
                if len(rawToValue) == 80:
                    eightyCharRowCount += 1
                else:
                    nonEightyCharRowCount += 1
                    if len(rawToValue) > 80:
                        charsOverEighty = True
                toValue = rawToValue.upper()

                ## Match and gather all possible email addresses, adding it to the per CSV set.
                allEmailAddresses = findAllAddresses(toValue)
                allEmailAddressesInCSVSet.update(allEmailAddresses)

                ## If there are no email addresses in the TO field at all, increment that count.  Else, add the number to the full count of email addresses for average calculation
                if not allEmailAddresses:
                    noAddressesInToFieldCount += 1
                else:
                    toFieldAddressesInCSVCount += len(allEmailAddresses)

                ## Perform the main logic tests
                if "NTRS.COM" in toValue:
                    ## The domain was found.  Apply next test.
                    if self.LanIDTrueTest(findLanIdCandidates(toValue)):
                        ## At least 1 LAN ID was found, using the True Test
                        ntrsAndLanBucketCount += 1
                        continue
                    ## Not 1 true LAN ID was found, using the True Test
                    ntrsNoLanBucketCount += 1
                else:
                    ## No ntrs addresses found at all
                    noNtrsDomainBucketCount += 1

                ## Only rows NOT addressed to a true LAN ID reach this point: add to the unique senders list.
                senderEmailAddressesInCSVSet.add((row['Sender'] or '').upper())

        return {
            "fileRowCount": fileRowCount,
            "noAddressesInToFieldCount": noAddressesInToFieldCount,
            "eightyCharRowCount": eightyCharRowCount,
            "nonEightyCharRowCount": nonEightyCharRowCount,
            "charsOverEighty": charsOverEighty,
            "noNtrsDomainBucketCount": noNtrsDomainBucketCount,
            "ntrsAndLanBucketCount": ntrsAndLanBucketCount,
            "ntrsNoLanBucketCount": ntrsNoLanBucketCount,
            "allEmailAddressesInCSVSet": allEmailAddressesInCSVSet,
            "senderEmailAddressesInCSVSet": senderEmailAddressesInCSVSet,
            "toFieldAddressesInCSVCount": toFieldAddressesInCSVCount,
        }

    @staticmethod
    def _SafeRatio(numerator, denominator):
        """Returns numerator/denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    @staticmethod
    def _NextAvailablePath(outputFilePath):
        """Returns outputFilePath, or the first 'name<N>.ext' variant (N = 1, 2, 3...) that does not exist yet."""
        if not os.path.isfile(outputFilePath):
            return outputFilePath
        ## Split once, from the requested name, so the counter replaces the previous
        ## suffix instead of piling up (name1, name12, name123...).
        baseName, extension = os.path.splitext(outputFilePath)
        fileNameInc = 1
        while os.path.isfile(f"{baseName}{fileNameInc}{extension}"):
            fileNameInc += 1
        return f"{baseName}{fileNameInc}{extension}"

    def _WriteSpecialSendersLogFile(self, sendersMatrix, outputFilePath):
        """Writes the fileName:sendersSet matrix as sorted 'fileName|sender' lines and returns the path written.

        NOTE(review): the line layout is an assumption (pipe-delimited, like the
        printed report); confirm it with the requester before relying on it.
        """
        outputFilePath = self._NextAvailablePath(outputFilePath)
        with open(outputFilePath, 'w') as outFl:
            for fileName in sorted(sendersMatrix):
                outFl.writelines(f"{fileName}|{sender}\n" for sender in sorted(sendersMatrix[fileName]))
        return outputFilePath

    def LanIDTrueTest(self, listOfIds):
        """A need for a more complicated LAN ID test was needed.  Returns True if at least 1 true LAN ID is found in the list.

        Every entry is also filed into the true or the false-positive LAN ID set.
        The letter count covers the whole matched string, domain part included.
        With the default pattern the domain part normally accounts for 7 letters,
        so an ID with more than 3 leading letters is classed as a false positive.
        NOTE(review): confirm that counting the domain letters is intended.
        """
        lanIDTestResult = False
        for lanID in listOfIds:
            alphaCount = sum(1 for character in lanID if character.isalpha())
            if alphaCount > 10:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult

    def WriteLogFile(self, setOfValues, outputFilePath):
        """Takes a Set containing values, sorts these, and then writes them, one per line, to the given outputFilePath.

        An existing file is never overwritten: if outputFilePath is taken, a
        counter is inserted before the extension (name1.ext, name2.ext, ...).
        Returns the path that was actually written.
        """
        outputFilePath = self._NextAvailablePath(outputFilePath)
        with open(outputFilePath, 'w') as outFl:
            outFl.writelines(f"{i}\n" for i in sorted(setOfValues))
        return outputFilePath

    def CalculateScenario(self, rawNumber, averageNumbRecip):
        """This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code.

        rawNumber -- fraction (0-1) of rows addressed to at least one true LAN ID.
        averageNumbRecip -- average number of TO addresses per row.

        The tests are applied in order; the first one that matches wins.
        """
        if rawNumber > .76:
            return "C"
        if averageNumbRecip < 15 and rawNumber < .019:
            return "A or B"
        if .009 < rawNumber < .34:
            return "D"
        return "Uncategorized"
200
if __name__ == '__main__':
    # Script entry point: run the analysis and write the three address log
    # files (true LAN IDs, false-positive LAN IDs, all email addresses).
    logFileOptions = {
        "writeTrueLanIDLogFile": True,
        "writeFalsePositiveLanIDLogFile": True,
        "writeAllPossibleEmailAddressesLogFile": True,
    }
    analyzer = TopSendersAnalyzer()
    analyzer.AnalyzeTopSenders(**logFileOptions)
205
206
207 #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
208 #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
209 #allToLanAddressesList = list(allToLanAddressesSet)
210 #allToLanAddressesList.sort()
211 #for i in allToLanAddressesList:
212 # outputFile.write(f"{i}\n")
213 #outputFile.close()
214
215 ## Initially gathering some very basic information across the CSV files, not using csv lib
216 # for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
217 # for fl in files:
218 # contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
219 # print(f"{fl}|{len(contents)-1}")