| 14 |
|
import csv, os, re |
| 15 |
|
|
| 16 |
|
class TopSendersAnalyzer(object): |
| 17 |
< |
version = "0.07" |
| 17 |
> |
version = "0.08" |
| 18 |
|
|
| 19 |
|
def __init__(self): |
| 20 |
|
self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req" |
| 41 |
|
self.falsePositiveLanIdAddressesSet = set() |
| 42 |
|
self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt" |
| 43 |
|
|
| 44 |
< |
## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Matrix of fileName:valuesSet. |
| 45 |
< |
self.senderEmailAddressesMatrix = {} |
| 44 |
> |
## Sender values, deduplicated within each CSV only, for rows where the To field either has no NTRS-domain address or has one that is not a LAN ID. Set of "fileName|SENDER" strings (sender value upper-cased). |
| 45 |
> |
## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address. |
| 46 |
> |
self.senderEmailAddressesAcrossCSVsSet = set() |
| 47 |
|
self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt" |
| 48 |
|
|
| 49 |
|
#self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM' |
| 77 |
|
dateInPath = re.findall(self.dateInPathRegExPattern, root) |
| 78 |
|
allEmailAddressesInCSVSet = set() |
| 79 |
|
|
| 79 |
– |
## This is the list of unique Sender Addresses,per CSV, ONLY IF THE EMAIL IS NOT TO A LANID. Diana asked for this list. |
| 80 |
– |
## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address. |
| 81 |
– |
senderEmailAddressesInCSVSet = set() |
| 80 |
|
|
| 81 |
|
## This is the full count of recipient addresses found in the CSV, not unique addresses |
| 82 |
|
toFieldAddressesInCSVCount = 0 |
| 119 |
|
ntrsNoLanBucketCount +=1 |
| 120 |
|
## Also since no LAN ID was found in the TO field, add to the unique senders list. |
| 121 |
|
senderValue = toValue = row['Sender'] |
| 122 |
< |
senderEmailAddressesInCSVSet.add(senderValue.upper()) |
| 122 |
> |
self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}") |
| 123 |
|
else: |
| 124 |
|
## No ntrs addresses found at all, |
| 125 |
|
noNtrsDomainBucketCount +=1 |
| 126 |
|
## Also since no NTRS address was found in the TO field at all, add to the unique senders list. |
| 127 |
|
senderValue = toValue = row['Sender'] |
| 128 |
< |
senderEmailAddressesInCSVSet.add(senderValue.upper()) |
| 128 |
> |
self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}") |
| 129 |
> |
|
| 130 |
|
scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount) |
| 131 |
|
print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}") |
| 132 |
|
csv_file.close() |
| 133 |
|
## Update the global all email addresses set, if they selected this option |
| 134 |
|
if writeAllPossibleEmailAddressesLogFile: |
| 135 |
|
self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet) |
| 137 |
– |
|
| 138 |
– |
## Update the global special senders matrix, if they selected this option. |
| 139 |
– |
if writeSpecialSendersListLogFile: |
| 140 |
– |
self.senderEmailAddressesMatrix[fl] = senderEmailAddressesInCSVSet |
| 136 |
|
|
| 137 |
|
if writeTrueLanIDLogFile: |
| 138 |
|
print("Writing the True LAN ID log file...") |
| 148 |
|
print("Done.\n") |
| 149 |
|
if writeSpecialSendersListLogFile: |
| 150 |
|
print("Writing the deduplicated special senders data to log file....") |
| 151 |
< |
|
| 157 |
< |
|
| 151 |
> |
self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName) |
| 152 |
|
print("Done.\n") |
| 153 |
|
|
| 154 |
|
|
| 177 |
|
tempList = list(setOfValues) |
| 178 |
|
tempList.sort() |
| 179 |
|
for i in tempList: |
| 180 |
< |
outFl.write(f"{i}\n") |
| 180 |
> |
try: |
| 181 |
> |
outFl.write(f"{i}\n") |
| 182 |
> |
except: |
| 183 |
> |
print(i) |
| 184 |
|
outFl.close() |
| 185 |
|
|
| 186 |
|
def CalculateScenario(self,rawNumber, averageNumbRecip): |
| 198 |
|
if __name__ == '__main__': |
| 199 |
|
|
| 200 |
|
tsa = TopSendersAnalyzer() |
| 201 |
< |
tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True) |
| 201 |
> |
tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True) |
| 202 |
|
|
| 203 |
|
|
| 204 |
|
#print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.") |