ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
Revision: 809
Committed: Thu Jan 4 21:03:18 2024 UTC (2 years, 2 months ago) by nino.borges
Content type: text/x-python
File size: 11628 byte(s)
Log Message:
This was the final version that was used, which included the scenario calculations, and was sent to Tom and Diana.  I then changed the start path to the second topsender request and re-ran this.  This is why it points to the second top sender request.

File Contents

# Content
1 """
2
3 NTRS-TopSenderAnalysis
4
5 Created by:
6 Emanuel Borges
7 09.20.2023
8
9 Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10 To-Do: Add a method to QC for parsing errors; every CSV should have the same number of fields.
11
12 """
13
14 import csv, os, re
15
class TopSendersAnalyzer(object):
    """Walk a directory tree of CSV exports and report on each file's 'To' column.

    For every file found under ``startDir`` one pipe-delimited line is printed
    with row counts, e-mail address counts, the share of rows falling in each
    NTRS / LAN ID bucket and a scenario code.  Optionally, the address sets
    collected along the way are written to sorted log files.
    """
    version = "0.06"

    ## Header of the pipe-delimited report; one column per value printed for each file.
    _REPORT_HEADER = "FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description"

    def __init__(self):
        """Set the start directory, output paths, result sets and regex patterns."""
        self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"

        ## Encoding used to read every CSV.  'ANSI' is an alias of Python's
        ## Windows-only 'mbcs' codec; override this attribute on other platforms.
        self.csvEncoding = 'ANSI'

        ## Matrix containing the scenarios to scenario descriptions
        self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}

        ## All possible email addresses across all CSV files
        self.allPossibleEmailAddressesSet = set()
        self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"

        ## All email addresses with an @NTRS.COM domain.  Currently unsupported.
        #self.allToNtrsAddressesSet = set()
        #self.allToNtrsAddressesOutputFileName = r""

        ## All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"

        ## False positive NTRS LAN ID matches, per specification provided to me.  Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"

        ## Candidate LAN ID: 1-15 letters, 1-3 digits, then the NTRS domain.
        ## The dot is escaped so that only a literal "NTRS.COM" matches.
        #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
        self.lanIdRegExPattern = r'[A-Za-z]{1,15}[0-9]{1,3}@NTRS\.COM'

        ## Simple match to pull out the date as recorded in the path
        self.dateInPathRegExPattern = r'2023-[0-9]{2}-[0-9]{2}'

        ## Match for pulling out all email addresses, regardless of domain.
        ## (Several stricter candidate patterns preceded this one; this simple form is the one in use.)
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False):
        """Main Method in this program.

        Prints the report header, then one report line per file found under
        ``self.startDir``.  Each flag, when True, writes the matching address
        set to its configured log file once all files have been processed.
        """
        print(self._REPORT_HEADER)
        for (root, dirs, files) in os.walk(self.startDir):
            ## The date comes from the directory path, so it is the same for every file in it.
            dateInPath = re.findall(self.dateInPathRegExPattern, root)
            for fl in files:
                stats = self._AnalyzeCsvFile(os.path.join(root, fl))
                print(self._FormatReportRow(fl, dateInPath, stats))
                ## Update the global all email addresses set, if they selected this option
                if writeAllPossibleEmailAddressesLogFile:
                    self.allPossibleEmailAddressesSet.update(stats["allEmailAddressesInCSVSet"])
        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeAllPossibleEmailAddressesLogFile:
            print("Writing the All Possible Email Addresses Across All CSV Files log file...")
            self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
            print("Done.\n")

    def _AnalyzeCsvFile(self, csvPath):
        """Read one CSV and return a dict of the counters gathered from its 'To' column."""
        ## Compile once per file rather than once per row.
        emailPattern = re.compile(self.allPossibleEmailAddressesRegExPattern)
        lanIdPattern = re.compile(self.lanIdRegExPattern)
        stats = {
            "fileRowCount": 0,
            "noAddressesInToFieldCount": 0,
            "eightyCharRowCount": 0,
            "nonEightyCharRowCount": 0,
            "charsOverEighty": False,
            "noNtrsDomainBucketCount": 0,
            "ntrsAndLanBucketCount": 0,
            "ntrsNoLanBucketCount": 0,
            "allEmailAddressesInCSVSet": set(),
            ## This is the full count of recipient addresses found in the CSV, not unique addresses
            "toFieldAddressesInCSVCount": 0,
        }

        ## newline='' lets the csv module handle line endings inside quoted fields itself.
        with open(csvPath, mode='r', encoding=self.csvEncoding, newline='') as csv_file:
            for row in csv.DictReader(csv_file):
                stats["fileRowCount"] += 1
                toLength = len(row['To'])
                if toLength == 80:
                    stats["eightyCharRowCount"] += 1
                else:
                    stats["nonEightyCharRowCount"] += 1
                    if toLength > 80:
                        stats["charsOverEighty"] = True
                ## All matching below is done on the upper-cased value.
                toValue = row['To'].upper()

                ## Match and gather all possible email addresses, adding it to the per CSV set.
                allEmailAddresses = emailPattern.findall(toValue)
                stats["allEmailAddressesInCSVSet"].update(allEmailAddresses)

                ## If there are no email addresses in the TO field at all, increment that count.
                ## Else, add the number to the full count of email addresses for average calculation
                if allEmailAddresses:
                    stats["toFieldAddressesInCSVCount"] += len(allEmailAddresses)
                else:
                    stats["noAddressesInToFieldCount"] += 1

                ## Perform the main logic tests
                if "NTRS.COM" in toValue:
                    ## The domain was found.  Apply next test.
                    if self.LanIDTrueTest(lanIdPattern.findall(toValue)):
                        ## At least 1 LAN ID was found, using the True Test
                        stats["ntrsAndLanBucketCount"] += 1
                    else:
                        ## Not 1 true LAN ID was found, using the True Test
                        stats["ntrsNoLanBucketCount"] += 1
                else:
                    ## No ntrs addresses found at all,
                    stats["noNtrsDomainBucketCount"] += 1
        return stats

    @staticmethod
    def _SafeRatio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def _FormatReportRow(self, fl, dateInPath, stats):
        """Build the pipe-delimited report line for one file from its counters."""
        rowCount = stats["fileRowCount"]
        averageRecipients = self._SafeRatio(stats["toFieldAddressesInCSVCount"], rowCount)
        noNtrsRatio = self._SafeRatio(stats["noNtrsDomainBucketCount"], rowCount)
        ntrsAndLanRatio = self._SafeRatio(stats["ntrsAndLanBucketCount"], rowCount)
        ntrsNoLanRatio = self._SafeRatio(stats["ntrsNoLanBucketCount"], rowCount)
        if rowCount:
            scenario = self.CalculateScenario(ntrsAndLanRatio, averageRecipients)
        else:
            ## A file with no data rows has nothing to classify; report it instead of
            ## dividing by zero.
            scenario = "Uncategorized"
        return (
            f"{fl}|{dateInPath}|{rowCount}|{stats['eightyCharRowCount']}"
            f"|{stats['noAddressesInToFieldCount']}|{stats['nonEightyCharRowCount']}"
            f"|{stats['charsOverEighty']}|{len(stats['allEmailAddressesInCSVSet'])}"
            f"|{averageRecipients}|{stats['noNtrsDomainBucketCount']}|{noNtrsRatio}"
            f"|{stats['ntrsAndLanBucketCount']}|{ntrsAndLanRatio}"
            f"|{stats['ntrsNoLanBucketCount']}|{ntrsNoLanRatio}"
            f"|{scenario}|{self.scenarioDescriptionMatrix[scenario]}"
        )

    def LanIDTrueTest(self, listOfIds):
        """Classify each candidate LAN ID and return True if at least 1 true LAN ID is in the list.

        Every candidate is added to either ``trueLanIdAddressesSet`` or
        ``falsePositiveLanIdAddressesSet``, so the whole list is always walked.
        """
        lanIDTestResult = False
        for lanID in listOfIds:
            ## NOTE(review): letters are counted over the whole matched string, so the
            ## letters of the "@NTRS.COM" part count toward the limit of 10 as well --
            ## confirm this is the intended specification.
            alphaCount = sum(1 for ch in lanID if ch.isalpha())
            if alphaCount > 10:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult

    def WriteLogFile(self, setOfValues, outputFilePath):
        """Sort the values in the given set and write them, one per line, to outputFilePath.

        An existing file is never overwritten: a counter is inserted before the
        extension (name.txt, name1.txt, name2.txt, ...) until a free name is found.
        """
        ## Split once, so the counter replaces the previous suffix rather than piling
        ## up on it (name1, name12, name123, ...).
        baseName, extension = os.path.splitext(outputFilePath)
        fileNameInc = 0
        while os.path.isfile(outputFilePath):
            fileNameInc += 1
            outputFilePath = baseName + str(fileNameInc) + extension
        with open(outputFilePath, 'w') as outFl:
            outFl.writelines(f"{i}\n" for i in sorted(setOfValues))

    def CalculateScenario(self, rawNumber, averageNumbRecip):
        """Return the scenario code for a file.

        rawNumber is the fraction (0-1) of rows with at least one true LAN ID and
        averageNumbRecip is the average number of recipients per row.  The tests
        run in order, so "A or B" wins over "D" where their ranges overlap.
        """
        ## NOTE(review): the cut-offs (.76, .34) differ slightly from the 75% / 33%
        ## quoted in the scenario descriptions -- confirm that this is deliberate.
        scenario = "Uncategorized"
        if rawNumber > .76:
            scenario = "C"
        elif averageNumbRecip < 15 and rawNumber < .019:
            scenario = "A or B"
        elif rawNumber > .009 and rawNumber < .34:
            scenario = "D"

        return scenario
176
if __name__ == '__main__':
    ## Script entry point: run the analysis from the configured start directory
    ## and write all three address log files.
    analyzer = TopSendersAnalyzer()
    analyzer.AnalyzeTopSenders(
        writeTrueLanIDLogFile=True,
        writeFalsePositiveLanIDLogFile=True,
        writeAllPossibleEmailAddressesLogFile=True,
    )
181
182
183 #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
184 #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
185 #allToLanAddressesList = list(allToLanAddressesSet)
186 #allToLanAddressesList.sort()
187 #for i in allToLanAddressesList:
188 # outputFile.write(f"{i}\n")
189 #outputFile.close()
190
191 ## Initially gathering some very basic information across the CSV files, not using csv lib
192 # for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
193 # for fl in files:
194 # contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
195 # print(f"{fl}|{len(contents)-1}")