Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py

"""

NTRS-TopSenderAnalysis

Created by:
Emanuel Borges
09.20.2023

Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
To-Do: Method to QC for any parsing errors.  There should be the same number of fields across all of the CSVs.; 

"""

import csv, os, re

class TopSendersAnalyzer(object):
    version = "0.12"

    def __init__(self):
        self.startDir = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req\_LocalVersion"
        #self.startDir = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"


        ##  Matrix containing the scenarios to scenario descriptions
        self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}

        ##  All possible email addresses across all CSV files
        self.allPossibleEmailAddressesSet = set()
        self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"

        ##  All email addresses with an @NTRS.COM domain. Currently unsupported.
        #self.allToNtrsAddressesSet = set()
        #self.allToNtrsAddressesOutputFileName = r""

        ##  All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"

        ##  False positive NTRS LAN ID matches, per specification provided to me.  Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"

        ##  Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName|valuesSet.
        ##  Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
        self.senderEmailAddressesAcrossCSVsSet = set()
        self.senderEmailAddressesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\SendersListSpecial.txt"


        ##  Top 10 Subject Lines Report across all CSVs, keepng track of the subject line, count and file from where it cam
        self.subjectLineTopTenReportSet = set()
        self.subjectLineTopTenReportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt"

        #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
        self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'


        ##  Simple match to pull out the date as recorded in the path
        self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'


        ##  Match for pulling out all email addresses, regardless of domain.
        #self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
        #self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
        #self.allPossibleEmailAddressesRegExPattern = r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
        #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False):
        """Main Method in this program"""
        print("FileName|Folder Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
        for (root,dirs,files) in os.walk(self.startDir):
            for fl in files:
                fileRowCount = 0
                noAddressesInToFieldCount = 0
                eightyCharRowCount = 0
                nonEightyCharRowCount = 0
                charsOverEighty = False
                noNtrsDomainBucketCount = 0
                ntrsAndLanBucketCount = 0
                ntrsNoLanBucketCount = 0
                dateInPath = re.findall(self.dateInPathRegExPattern, root)
                allEmailAddressesInCSVSet = set()


                ##  This is the full count of receipient addresses found in the CSV, not unique addresses
                toFieldAddressesInCSVCount = 0


                ##  This set holds the list of unique subject lines, so that I can get a count below.
                uniqueSubjectLineSet = set()
                ##  This is duplicative to the above set but I'm creating a matrix here that tracks the number of times a subject appears to gather a top 10
                subjectLineCountMatrix = {}

                with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
                    csv_reader = csv.DictReader(csv_file)    
                    for row in csv_reader:
                        fileRowCount += 1
                        if len(row['To']) == 80:
                            eightyCharRowCount +=1
                        else:
                            nonEightyCharRowCount +=1
                        if len(row['To']) > 80:
                           charsOverEighty = True 
                        toValue = row['To']
                        toValue = toValue.upper()
                        
                        ##  Match and gather all possible email addresses, adding it to the per CSV set.                        
                        allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)    
                        for eAddress in allEmailAddresses:
                            allEmailAddressesInCSVSet.add(eAddress)

                        ## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
                        if len(allEmailAddresses) == 0:
                            noAddressesInToFieldCount +=1
                        else:
                            toFieldAddressesInCSVCount += len(allEmailAddresses)
                        ##  Perform the main logic tests
                        if "NTRS.COM" in toValue:
                            ## The domain was found. Apply next test.
                            ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
                            ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
                            if ntrsLanIDTrueTestResult:
                                ## At least 1 LAN ID was found, using the True Test
                                ntrsAndLanBucketCount +=1
                                #for a in ntrsLanAddressesList:
                                #    allToLanAddressesSet.add(a)
                            else:
                                ## Not 1 true LAN ID was found, using the True Test
                                ntrsNoLanBucketCount +=1
                                ## Also since no LAN ID was found in the TO field, add to the unique senders list.
                                senderValue = row['Sender']
                                self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
                        else:
                            ## No ntrs addresses found at all, 
                            noNtrsDomainBucketCount +=1
                            ## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
                            senderValue = row['Sender']
                            self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
                            
                        ##  Gather the subject line, so that I can later get a deduplicated count.
                        subjectLineValue = row['Subject']
                        uniqueSubjectLineSet.add(subjectLineValue.upper())
                        ##  Also populate a subject line matrix, tracking the number of times it appears, to gather a top 10
                        try:
                            subjectLineCountMatrix[subjectLineValue.upper()] += 1
                        except:
                            subjectLineCountMatrix[subjectLineValue.upper()] = 1


                ##  This is the count of unique subject lines that exist within the CSV.
                uniqueSubjectLineCount = len(uniqueSubjectLineSet)
                
                scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)

                ##  Gather the top 10 subject lines
                sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True))
                sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys())
                if len(sortedSubjectKeyList) < 10:
                    sNumbMax = len(sortedSubjectKeyList)
                else:
                    sNumbMax = 10
                for sNumb in range(sNumbMax):
                    #print(f"{sortedSubjectKeyList[s]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}")
                    self.subjectLineTopTenReportSet.add(f"{fl}|{sortedSubjectKeyList[sNumb]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]/fileRowCount}")

                print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
                csv_file.close()
                ##  Update the global all email addresses set, if they selected this option
                if writeAllPossibleEmailAddressesLogFile:
                    self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)

                
        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeAllPossibleEmailAddressesLogFile:
            print("Writing the All Possible Email Addresses Across All CSV Files log file...")
            self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
            print("Done.\n")
        if writeSpecialSendersListLogFile:
            print("Writing the deduplicated special senders data to log file....")
            self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
            print("Done.\n")
        if writeTopTenSubjectLinesReportLogFile:
            print("Writing the Top 10 Subject Lines Report to log file....")
            self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName, doNotSort = True)
            print("Done.\n")
        

    def LanIDTrueTest(self, listOfIds):
        """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
        lanIDTestResult = False
        for lanID in listOfIds:
            alphaOnly = [x.lower() for x in lanID if x.isalpha()]
            if len(alphaOnly) > 10:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult


    def WriteLogFile(self, setOfValues, outputFilePath, doNotSort = False):
        """Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
        fileNameInc = 0
        while os.path.isfile(outputFilePath):
            fileNameInc +=1
            outputFile, extension = os.path.splitext(outputFilePath)
            outputFilePath = outputFile + str(fileNameInc) + extension
        outFl = open(outputFilePath,'w')
        tempList = list(setOfValues)
        if doNotSort:
            pass
        else:
            tempList.sort()
        for i in tempList:
            try:
                outFl.write(f"{i}\n")
            except:
                print(i) 
        outFl.close()

    def CalculateScenario(self,rawNumber, averageNumbRecip):
        """This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
        scenario = "Uncategorized"
        if rawNumber > .76:
            scenario = "C"
        elif averageNumbRecip < 15 and rawNumber < .019:
            scenario = "A or B"
        elif  rawNumber > .009 and rawNumber < .34:
            scenario = "D"

        return scenario

if __name__ == '__main__':

    tsa = TopSendersAnalyzer()
    tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True)


    #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
    #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
    #allToLanAddressesList = list(allToLanAddressesSet)
    #allToLanAddressesList.sort()
    #for i in allToLanAddressesList:
    #    outputFile.write(f"{i}\n")
    #outputFile.close()

    ## Initially gathering some very basic information across the CSV files, not using csv lib
#    for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
#        for fl in files:
#            contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
#            print(f"{fl}|{len(contents)-1}")
Revision:	827
Committed:	Mon Jul 22 20:13:36 2024 UTC (20 months ago) by nino.borges
Content type:	text/x-python
File size:	16153 byte(s)
Log Message:	.12 update because of onedrive.
#	Content
1	"""
2
3	NTRS-TopSenderAnalysis
4
5	Created by:
6	Emanuel Borges
7	09.20.2023
8
9	Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10	To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11
12	"""
13
14	import csv, os, re
15
16	class TopSendersAnalyzer(object):
17	version = "0.12"
18
19	def __init__(self):
20	self.startDir = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req\_LocalVersion"
21	#self.startDir = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
22	#self.startDir = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
23
24
25	## Matrix containing the scenarios to scenario descriptions
26	self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}
27
28	## All possible email addresses across all CSV files
29	self.allPossibleEmailAddressesSet = set()
30	self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
31
32	## All email addresses with an @NTRS.COM domain. Currently unsupported.
33	#self.allToNtrsAddressesSet = set()
34	#self.allToNtrsAddressesOutputFileName = r""
35
36	## All true NTRS LAN ID matches, per specification provided to me.
37	self.trueLanIdAddressesSet = set()
38	self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
39
40	## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
41	self.falsePositiveLanIdAddressesSet = set()
42	self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
43
44	## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName\|valuesSet.
45	## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
46	self.senderEmailAddressesAcrossCSVsSet = set()
47	self.senderEmailAddressesFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\SendersListSpecial.txt"
48
49
50	## Top 10 Subject Lines Report across all CSVs, keepng track of the subject line, count and file from where it cam
51	self.subjectLineTopTenReportSet = set()
52	self.subjectLineTopTenReportFileName = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt"
53
54	#self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
55	self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
56
57
58	## Simple match to pull out the date as recorded in the path
59	self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
60
61
62	## Match for pulling out all email addresses, regardless of domain.
63	#self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z\|a-z]{2,})+'
64	#self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'+/=?^_`{\|}~-]+(?:\.[a-z0-9!#$%&'+/=^_`{\|}~-]+)\|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]\|\\[\x01-\x09\x0b\x0c\x0e-\x7f])")@(?:(?:[a-z0-9](?:[a-z0-9-][a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-][a-z0-9])?\|\[(?:(?:(2(5[0-5]\|[0-4][0-9])\|1[0-9][0-9]\|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]\|[0-4][0-9])\|1[0-9][0-9]\|[1-9]?[0-9])\|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]\|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
65	#self.allPossibleEmailAddressesRegExPattern = r"([-!#-'+/-9=?A-Z^-~]+(\.[-!#-'+/-9=?A-Z^-~]+)\|\"([]!#-[^-~ \t]\|(\\[\t -~]))+\")@([-!#-'+/-9=?A-Z^-~]+(\.[-!#-'+/-9=?A-Z^-~]+)\|\[[\t -Z^-~]*])"
66	#self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
67	self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
68
69	def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False):
70	"""Main Method in this program"""
71	print("FileName\|Folder Date\|Count no Header\|80 Char TO Row Count\|# Rows with NO EMAIL ADDRESSES\|non-80 Char TO Row Count\|TO over 80 Char?\|Unique Email Addresses Count\|Average # of Recipients\|# Rows No NTRS Domains\|% No NTRS Domains\|# Rows NTRS and LAN IDs\|% NTRS and LAN IDs\|# Rows NTRS No LAN IDs\|% NTRS No LAN IDs\|Scenario\|Scenario Short Description\|Unique Subject Line Count\|Unique Subject Line %")
72	for (root,dirs,files) in os.walk(self.startDir):
73	for fl in files:
74	fileRowCount = 0
75	noAddressesInToFieldCount = 0
76	eightyCharRowCount = 0
77	nonEightyCharRowCount = 0
78	charsOverEighty = False
79	noNtrsDomainBucketCount = 0
80	ntrsAndLanBucketCount = 0
81	ntrsNoLanBucketCount = 0
82	dateInPath = re.findall(self.dateInPathRegExPattern, root)
83	allEmailAddressesInCSVSet = set()
84
85
86	## This is the full count of receipient addresses found in the CSV, not unique addresses
87	toFieldAddressesInCSVCount = 0
88
89
90	## This set holds the list of unique subject lines, so that I can get a count below.
91	uniqueSubjectLineSet = set()
92	## This is duplicative to the above set but I'm creating a matrix here that tracks the number of times a subject appears to gather a top 10
93	subjectLineCountMatrix = {}
94
95	with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
96	csv_reader = csv.DictReader(csv_file)
97	for row in csv_reader:
98	fileRowCount += 1
99	if len(row['To']) == 80:
100	eightyCharRowCount +=1
101	else:
102	nonEightyCharRowCount +=1
103	if len(row['To']) > 80:
104	charsOverEighty = True
105	toValue = row['To']
106	toValue = toValue.upper()
107
108	## Match and gather all possible email addresses, adding it to the per CSV set.
109	allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
110	for eAddress in allEmailAddresses:
111	allEmailAddressesInCSVSet.add(eAddress)
112
113	## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
114	if len(allEmailAddresses) == 0:
115	noAddressesInToFieldCount +=1
116	else:
117	toFieldAddressesInCSVCount += len(allEmailAddresses)
118	## Perform the main logic tests
119	if "NTRS.COM" in toValue:
120	## The domain was found. Apply next test.
121	ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
122	ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
123	if ntrsLanIDTrueTestResult:
124	## At least 1 LAN ID was found, using the True Test
125	ntrsAndLanBucketCount +=1
126	#for a in ntrsLanAddressesList:
127	# allToLanAddressesSet.add(a)
128	else:
129	## Not 1 true LAN ID was found, using the True Test
130	ntrsNoLanBucketCount +=1
131	## Also since no LAN ID was found in the TO field, add to the unique senders list.
132	senderValue = row['Sender']
133	self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}\|{senderValue.upper()}")
134	else:
135	## No ntrs addresses found at all,
136	noNtrsDomainBucketCount +=1
137	## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
138	senderValue = row['Sender']
139	self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}\|{senderValue.upper()}")
140
141	## Gather the subject line, so that I can later get a deduplicated count.
142	subjectLineValue = row['Subject']
143	uniqueSubjectLineSet.add(subjectLineValue.upper())
144	## Also populate a subject line matrix, tracking the number of times it appears, to gather a top 10
145	try:
146	subjectLineCountMatrix[subjectLineValue.upper()] += 1
147	except:
148	subjectLineCountMatrix[subjectLineValue.upper()] = 1
149
150
151	## This is the count of unique subject lines that exist within the CSV.
152	uniqueSubjectLineCount = len(uniqueSubjectLineSet)
153
154	scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
155
156	## Gather the top 10 subject lines
157	sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True))
158	sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys())
159	if len(sortedSubjectKeyList) < 10:
160	sNumbMax = len(sortedSubjectKeyList)
161	else:
162	sNumbMax = 10
163	for sNumb in range(sNumbMax):
164	#print(f"{sortedSubjectKeyList[s]}\|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}")
165	self.subjectLineTopTenReportSet.add(f"{fl}\|{sortedSubjectKeyList[sNumb]}\|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}\|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]/fileRowCount}")
166
167	print(f"{fl}\|{dateInPath}\|{fileRowCount}\|{eightyCharRowCount}\|{noAddressesInToFieldCount}\|{nonEightyCharRowCount}\|{charsOverEighty}\|{len(allEmailAddressesInCSVSet)}\|{toFieldAddressesInCSVCount/fileRowCount}\|{noNtrsDomainBucketCount}\|{noNtrsDomainBucketCount/fileRowCount}\|{ntrsAndLanBucketCount}\|{ntrsAndLanBucketCount/fileRowCount}\|{ntrsNoLanBucketCount}\|{ntrsNoLanBucketCount/fileRowCount}\|{scenario}\|{self.scenarioDescriptionMatrix[scenario]}\|{uniqueSubjectLineCount}\|{uniqueSubjectLineCount/fileRowCount}")
168	csv_file.close()
169	## Update the global all email addresses set, if they selected this option
170	if writeAllPossibleEmailAddressesLogFile:
171	self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
172
173
174	if writeTrueLanIDLogFile:
175	print("Writing the True LAN ID log file...")
176	self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
177	print("Done.\n")
178	if writeFalsePositiveLanIDLogFile:
179	print("Writing the False-Positive LAN ID log file...")
180	self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
181	print("Done.\n")
182	if writeAllPossibleEmailAddressesLogFile:
183	print("Writing the All Possible Email Addresses Across All CSV Files log file...")
184	self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
185	print("Done.\n")
186	if writeSpecialSendersListLogFile:
187	print("Writing the deduplicated special senders data to log file....")
188	self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
189	print("Done.\n")
190	if writeTopTenSubjectLinesReportLogFile:
191	print("Writing the Top 10 Subject Lines Report to log file....")
192	self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName, doNotSort = True)
193	print("Done.\n")
194
195
196	def LanIDTrueTest(self, listOfIds):
197	"""A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
198	lanIDTestResult = False
199	for lanID in listOfIds:
200	alphaOnly = [x.lower() for x in lanID if x.isalpha()]
201	if len(alphaOnly) > 10:
202	## I'm too big to be a true LAN ID
203	self.falsePositiveLanIdAddressesSet.add(lanID)
204	else:
205	self.trueLanIdAddressesSet.add(lanID)
206	lanIDTestResult = True
207	return lanIDTestResult
208
209
210	def WriteLogFile(self, setOfValues, outputFilePath, doNotSort = False):
211	"""Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
212	fileNameInc = 0
213	while os.path.isfile(outputFilePath):
214	fileNameInc +=1
215	outputFile, extension = os.path.splitext(outputFilePath)
216	outputFilePath = outputFile + str(fileNameInc) + extension
217	outFl = open(outputFilePath,'w')
218	tempList = list(setOfValues)
219	if doNotSort:
220	pass
221	else:
222	tempList.sort()
223	for i in tempList:
224	try:
225	outFl.write(f"{i}\n")
226	except:
227	print(i)
228	outFl.close()
229
230	def CalculateScenario(self,rawNumber, averageNumbRecip):
231	"""This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
232	scenario = "Uncategorized"
233	if rawNumber > .76:
234	scenario = "C"
235	elif averageNumbRecip < 15 and rawNumber < .019:
236	scenario = "A or B"
237	elif rawNumber > .009 and rawNumber < .34:
238	scenario = "D"
239
240	return scenario
241
242	if __name__ == '__main__':
243
244	tsa = TopSendersAnalyzer()
245	tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True)
246
247
248	#print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
249	#outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
250	#allToLanAddressesList = list(allToLanAddressesSet)
251	#allToLanAddressesList.sort()
252	#for i in allToLanAddressesList:
253	# outputFile.write(f"{i}\n")
254	#outputFile.close()
255
256	## Initially gathering some very basic information across the CSV files, not using csv lib
257	# for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
258	# for fl in files:
259	# contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
260	# print(f"{fl}\|{len(contents)-1}")