| 14 |
|
import csv, os, re |
| 15 |
|
|
| 16 |
|
class TopSendersAnalyzer(object): |
| 17 |
< |
version = "0.06" |
| 17 |
> |
version = "0.07" |
| 18 |
|
|
| 19 |
|
def __init__(self): |
| 20 |
|
self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req" |
| 40 |
|
## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis) |
| 41 |
|
self.falsePositiveLanIdAddressesSet = set() |
| 42 |
|
self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt" |
| 43 |
+ |
|
| 44 |
+ |
## Sender values, deduplicated within each CSV only, for rows where the TO field either has no NTRS domain address or has NTRS addresses but none of them is a LAN ID. Matrix of fileName:valuesSet. |
| 45 |
+ |
self.senderEmailAddressesMatrix = {} |
| 46 |
+ |
self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt" |
| 47 |
|
|
| 48 |
|
#self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM' |
| 49 |
|
self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM' |
| 60 |
|
#self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+" |
| 61 |
|
self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+" |
| 62 |
|
|
| 63 |
< |
def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False): |
| 63 |
> |
def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False): |
| 64 |
|
"""Main Method in this program""" |
| 65 |
|
print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description") |
| 66 |
|
for (root,dirs,files) in os.walk(self.startDir): |
| 76 |
|
dateInPath = re.findall(self.dateInPathRegExPattern, root) |
| 77 |
|
allEmailAddressesInCSVSet = set() |
| 78 |
|
|
| 79 |
+ |
## This is the list of unique Sender Addresses, per CSV, ONLY IF THE EMAIL IS NOT TO A LAN ID. Diana asked for this list. |
| 80 |
+ |
## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address. |
| 81 |
+ |
senderEmailAddressesInCSVSet = set() |
| 82 |
+ |
|
| 83 |
|
## This is the full count of recipient addresses found in the CSV, not unique addresses |
| 84 |
|
toFieldAddressesInCSVCount = 0 |
| 85 |
|
|
| 119 |
|
else: |
| 120 |
|
## Not a single true LAN ID was found, using the True Test |
| 121 |
|
ntrsNoLanBucketCount +=1 |
| 122 |
+ |
## Also since no LAN ID was found in the TO field, add to the unique senders list. |
| 123 |
+ |
senderValue = toValue = row['Sender'] |
| 124 |
+ |
senderEmailAddressesInCSVSet.add(senderValue.upper()) |
| 125 |
|
else: |
| 126 |
|
## No NTRS addresses found at all. |
| 127 |
|
noNtrsDomainBucketCount +=1 |
| 128 |
+ |
## Also since no NTRS address was found in the TO field at all, add to the unique senders list. |
| 129 |
+ |
senderValue = toValue = row['Sender'] |
| 130 |
+ |
senderEmailAddressesInCSVSet.add(senderValue.upper()) |
| 131 |
|
scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount) |
| 132 |
|
print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}") |
| 133 |
|
csv_file.close() |
| 134 |
|
## Update the global all email addresses set, if they selected this option |
| 135 |
|
if writeAllPossibleEmailAddressesLogFile: |
| 136 |
|
self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet) |
| 137 |
+ |
|
| 138 |
+ |
## Update the global special senders matrix, if they selected this option. |
| 139 |
+ |
if writeSpecialSendersListLogFile: |
| 140 |
+ |
self.senderEmailAddressesMatrix[fl] = senderEmailAddressesInCSVSet |
| 141 |
+ |
|
| 142 |
|
if writeTrueLanIDLogFile: |
| 143 |
|
print("Writing the True LAN ID log file...") |
| 144 |
|
self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName) |
| 151 |
|
print("Writing the All Possible Email Addresses Across All CSV Files log file...") |
| 152 |
|
self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName) |
| 153 |
|
print("Done.\n") |
| 154 |
+ |
if writeSpecialSendersListLogFile: |
| 155 |
+ |
print("Writing the deduplicated special senders data to log file....") |
| 156 |
+ |
|
| 157 |
+ |
|
| 158 |
+ |
print("Done.\n") |
| 159 |
|
|
| 160 |
|
|
| 161 |
|
def LanIDTrueTest(self, listOfIds): |