Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py

"""

NTRS-TopSenderAnalysis

Created by:
Emanuel Borges
09.20.2023

Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
To-Do: Method to QC for any parsing errors.  There should be the same number of fields across all of the CSVs.; 

"""

import csv, os, re

class TopSendersAnalyzer(object):
    version = "0.11"

    def __init__(self):
        self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"


        ##  Matrix containing the scenarios to scenario descriptions
        self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}

        ##  All possible email addresses across all CSV files
        self.allPossibleEmailAddressesSet = set()
        self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"

        ##  All email addresses with an @NTRS.COM domain. Currently unsupported.
        #self.allToNtrsAddressesSet = set()
        #self.allToNtrsAddressesOutputFileName = r""

        ##  All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"

        ##  False positive NTRS LAN ID matches, per specification provided to me.  Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"

        ##  Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName|valuesSet.
        ##  Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
        self.senderEmailAddressesAcrossCSVsSet = set()
        self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"


        ##  Top 10 Subject Lines Report across all CSVs, keepng track of the subject line, count and file from where it cam
        self.subjectLineTopTenReportSet = set()
        self.subjectLineTopTenReportFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt"

        #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
        self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'


        ##  Simple match to pull out the date as recorded in the path
        self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'


        ##  Match for pulling out all email addresses, regardless of domain.
        #self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
        #self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
        #self.allPossibleEmailAddressesRegExPattern = r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
        #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False):
        """Main Method in this program"""
        print("FileName|Folder Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
        for (root,dirs,files) in os.walk(self.startDir):
            for fl in files:
                fileRowCount = 0
                noAddressesInToFieldCount = 0
                eightyCharRowCount = 0
                nonEightyCharRowCount = 0
                charsOverEighty = False
                noNtrsDomainBucketCount = 0
                ntrsAndLanBucketCount = 0
                ntrsNoLanBucketCount = 0
                dateInPath = re.findall(self.dateInPathRegExPattern, root)
                allEmailAddressesInCSVSet = set()


                ##  This is the full count of receipient addresses found in the CSV, not unique addresses
                toFieldAddressesInCSVCount = 0


                ##  This set holds the list of unique subject lines, so that I can get a count below.
                uniqueSubjectLineSet = set()
                ##  This is duplicative to the above set but I'm creating a matrix here that tracks the number of times a subject appears to gather a top 10
                subjectLineCountMatrix = {}

                with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
                    csv_reader = csv.DictReader(csv_file)    
                    for row in csv_reader:
                        fileRowCount += 1
                        if len(row['To']) == 80:
                            eightyCharRowCount +=1
                        else:
                            nonEightyCharRowCount +=1
                        if len(row['To']) > 80:
                           charsOverEighty = True 
                        toValue = row['To']
                        toValue = toValue.upper()
                        
                        ##  Match and gather all possible email addresses, adding it to the per CSV set.                        
                        allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)    
                        for eAddress in allEmailAddresses:
                            allEmailAddressesInCSVSet.add(eAddress)

                        ## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
                        if len(allEmailAddresses) == 0:
                            noAddressesInToFieldCount +=1
                        else:
                            toFieldAddressesInCSVCount += len(allEmailAddresses)
                        ##  Perform the main logic tests
                        if "NTRS.COM" in toValue:
                            ## The domain was found. Apply next test.
                            ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
                            ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
                            if ntrsLanIDTrueTestResult:
                                ## At least 1 LAN ID was found, using the True Test
                                ntrsAndLanBucketCount +=1
                                #for a in ntrsLanAddressesList:
                                #    allToLanAddressesSet.add(a)
                            else:
                                ## Not 1 true LAN ID was found, using the True Test
                                ntrsNoLanBucketCount +=1
                                ## Also since no LAN ID was found in the TO field, add to the unique senders list.
                                senderValue = row['Sender']
                                self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
                        else:
                            ## No ntrs addresses found at all, 
                            noNtrsDomainBucketCount +=1
                            ## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
                            senderValue = row['Sender']
                            self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
                            
                        ##  Gather the subject line, so that I can later get a deduplicated count.
                        subjectLineValue = row['Subject']
                        uniqueSubjectLineSet.add(subjectLineValue.upper())
                        ##  Also populate a subject line matrix, tracking the number of times it appears, to gather a top 10
                        try:
                            subjectLineCountMatrix[subjectLineValue.upper()] += 1
                        except:
                            subjectLineCountMatrix[subjectLineValue.upper()] = 1


                ##  This is the count of unique subject lines that exist within the CSV.
                uniqueSubjectLineCount = len(uniqueSubjectLineSet)
                
                scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)

                ##  Gather the top 10 subject lines
                sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True))
                sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys())
                if len(sortedSubjectKeyList) < 10:
                    sNumbMax = len(sortedSubjectKeyList)
                else:
                    sNumbMax = 10
                for sNumb in range(sNumbMax):
                    #print(f"{sortedSubjectKeyList[s]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}")
                    self.subjectLineTopTenReportSet.add(f"{fl}|{sortedSubjectKeyList[sNumb]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]/fileRowCount}")

                print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
                csv_file.close()
                ##  Update the global all email addresses set, if they selected this option
                if writeAllPossibleEmailAddressesLogFile:
                    self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)

                
        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeAllPossibleEmailAddressesLogFile:
            print("Writing the All Possible Email Addresses Across All CSV Files log file...")
            self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
            print("Done.\n")
        if writeSpecialSendersListLogFile:
            print("Writing the deduplicated special senders data to log file....")
            self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
            print("Done.\n")
        if writeTopTenSubjectLinesReportLogFile:
            print("Writing the Top 10 Subject Lines Report to log file....")
            self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName, doNotSort = True)
            print("Done.\n")
        

    def LanIDTrueTest(self, listOfIds):
        """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
        lanIDTestResult = False
        for lanID in listOfIds:
            alphaOnly = [x.lower() for x in lanID if x.isalpha()]
            if len(alphaOnly) > 10:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult


    def WriteLogFile(self, setOfValues, outputFilePath, doNotSort = False):
        """Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
        fileNameInc = 0
        while os.path.isfile(outputFilePath):
            fileNameInc +=1
            outputFile, extension = os.path.splitext(outputFilePath)
            outputFilePath = outputFile + str(fileNameInc) + extension
        outFl = open(outputFilePath,'w')
        tempList = list(setOfValues)
        if doNotSort:
            pass
        else:
            tempList.sort()
        for i in tempList:
            try:
                outFl.write(f"{i}\n")
            except:
                print(i) 
        outFl.close()

    def CalculateScenario(self,rawNumber, averageNumbRecip):
        """This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
        scenario = "Uncategorized"
        if rawNumber > .76:
            scenario = "C"
        elif averageNumbRecip < 15 and rawNumber < .019:
            scenario = "A or B"
        elif  rawNumber > .009 and rawNumber < .34:
            scenario = "D"

        return scenario

if __name__ == '__main__':

    tsa = TopSendersAnalyzer()
    tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True)


    #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
    #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
    #allToLanAddressesList = list(allToLanAddressesSet)
    #allToLanAddressesList.sort()
    #for i in allToLanAddressesList:
    #    outputFile.write(f"{i}\n")
    #outputFile.close()

    ## Initially gathering some very basic information across the CSV files, not using csv lib
#    for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
#        for fl in files:
#            contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
#            print(f"{fl}|{len(contents)-1}")
Revision:	814
Committed:	Thu Jan 11 17:03:32 2024 UTC (2 years, 2 months ago) by nino.borges
Content type:	text/x-python
File size:	15947 byte(s)
Log Message:	Changed the column name CSV Date to Folder Date and tried adding support for no sorting in the write log method, because the team asked that the Top10Subject lines values be sorted in the spreadsheet by count and not by the subject line value. These results saved as analysis report v15.
#	User	Rev	Content
1	nino.borges	797	"""
2
3			NTRS-TopSenderAnalysis
4
5			Created by:
6			Emanuel Borges
7			09.20.2023
8
9			Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10	nino.borges	798	To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11	nino.borges	797
12			"""
13
14	nino.borges	798	import csv, os, re
15	nino.borges	797
16	nino.borges	801	class TopSendersAnalyzer(object):
17	nino.borges	814	version = "0.11"
18	nino.borges	801
19			def __init__(self):
20	nino.borges	809	self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
21			#self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
22	nino.borges	801	#self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
23
24	nino.borges	803
25	nino.borges	809	## Matrix containing the scenarios to scenario descriptions
26			self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}
27
28	nino.borges	803	## All possible email addresses across all CSV files
29			self.allPossibleEmailAddressesSet = set()
30			self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
31
32	nino.borges	801	## All email addresses with an @NTRS.COM domain. Currently unsupported.
33			#self.allToNtrsAddressesSet = set()
34			#self.allToNtrsAddressesOutputFileName = r""
35
36			## All true NTRS LAN ID matches, per specification provided to me.
37			self.trueLanIdAddressesSet = set()
38			self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
39
40			## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
41			self.falsePositiveLanIdAddressesSet = set()
42			self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
43	nino.borges	810
44	nino.borges	811	## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName\|valuesSet.
45			## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
46			self.senderEmailAddressesAcrossCSVsSet = set()
47	nino.borges	810	self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
48	nino.borges	813
49
50			## Top 10 Subject Lines Report across all CSVs, keepng track of the subject line, count and file from where it cam
51			self.subjectLineTopTenReportSet = set()
52			self.subjectLineTopTenReportFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Top-10-SubjectLinesReport.txt"
53
54	nino.borges	803	#self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
55			self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
56	nino.borges	800
57	nino.borges	803
58			## Simple match to pull out the date as recorded in the path
59			self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
60
61
62			## Match for pulling out all email addresses, regardless of domain.
63			#self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z\|a-z]{2,})+'
64			#self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'+/=?^_`{\|}~-]+(?:\.[a-z0-9!#$%&'+/=^_`{\|}~-]+)\|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]\|\\[\x01-\x09\x0b\x0c\x0e-\x7f])")@(?:(?:[a-z0-9](?:[a-z0-9-][a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-][a-z0-9])?\|\[(?:(?:(2(5[0-5]\|[0-4][0-9])\|1[0-9][0-9]\|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]\|[0-4][0-9])\|1[0-9][0-9]\|[1-9]?[0-9])\|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]\|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
65			#self.allPossibleEmailAddressesRegExPattern = r"([-!#-'+/-9=?A-Z^-~]+(\.[-!#-'+/-9=?A-Z^-~]+)\|\"([]!#-[^-~ \t]\|(\\[\t -~]))+\")@([-!#-'+/-9=?A-Z^-~]+(\.[-!#-'+/-9=?A-Z^-~]+)\|\[[\t -Z^-~]*])"
66			#self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
67			self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
68
69	nino.borges	813	def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False, writeTopTenSubjectLinesReportLogFile = False):
70	nino.borges	801	"""Main Method in this program"""
71	nino.borges	814	print("FileName\|Folder Date\|Count no Header\|80 Char TO Row Count\|# Rows with NO EMAIL ADDRESSES\|non-80 Char TO Row Count\|TO over 80 Char?\|Unique Email Addresses Count\|Average # of Recipients\|# Rows No NTRS Domains\|% No NTRS Domains\|# Rows NTRS and LAN IDs\|% NTRS and LAN IDs\|# Rows NTRS No LAN IDs\|% NTRS No LAN IDs\|Scenario\|Scenario Short Description\|Unique Subject Line Count\|Unique Subject Line %")
72	nino.borges	801	for (root,dirs,files) in os.walk(self.startDir):
73			for fl in files:
74			fileRowCount = 0
75	nino.borges	809	noAddressesInToFieldCount = 0
76	nino.borges	801	eightyCharRowCount = 0
77			nonEightyCharRowCount = 0
78			charsOverEighty = False
79			noNtrsDomainBucketCount = 0
80			ntrsAndLanBucketCount = 0
81			ntrsNoLanBucketCount = 0
82	nino.borges	803	dateInPath = re.findall(self.dateInPathRegExPattern, root)
83			allEmailAddressesInCSVSet = set()
84	nino.borges	801
85	nino.borges	810
86	nino.borges	809	## This is the full count of receipient addresses found in the CSV, not unique addresses
87			toFieldAddressesInCSVCount = 0
88
89	nino.borges	812
90	nino.borges	813	## This set holds the list of unique subject lines, so that I can get a count below.
91	nino.borges	812	uniqueSubjectLineSet = set()
92	nino.borges	813	## This is duplicative to the above set but I'm creating a matrix here that tracks the number of times a subject appears to gather a top 10
93			subjectLineCountMatrix = {}
94	nino.borges	812
95	nino.borges	801	with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
96			csv_reader = csv.DictReader(csv_file)
97			for row in csv_reader:
98			fileRowCount += 1
99			if len(row['To']) == 80:
100			eightyCharRowCount +=1
101			else:
102			nonEightyCharRowCount +=1
103			if len(row['To']) > 80:
104			charsOverEighty = True
105			toValue = row['To']
106			toValue = toValue.upper()
107	nino.borges	803
108			## Match and gather all possible email addresses, adding it to the per CSV set.
109	nino.borges	809	allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
110	nino.borges	803	for eAddress in allEmailAddresses:
111			allEmailAddressesInCSVSet.add(eAddress)
112	nino.borges	809
113			## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
114			if len(allEmailAddresses) == 0:
115			noAddressesInToFieldCount +=1
116			else:
117			toFieldAddressesInCSVCount += len(allEmailAddresses)
118	nino.borges	803	## Perform the main logic tests
119	nino.borges	804	if "NTRS.COM" in toValue:
120	nino.borges	801	## The domain was found. Apply next test.
121	nino.borges	803	ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
122	nino.borges	801	ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
123			if ntrsLanIDTrueTestResult:
124			## At least 1 LAN ID was found, using the True Test
125			ntrsAndLanBucketCount +=1
126			#for a in ntrsLanAddressesList:
127			# allToLanAddressesSet.add(a)
128			else:
129			## Not 1 true LAN ID was found, using the True Test
130			ntrsNoLanBucketCount +=1
131	nino.borges	810	## Also since no LAN ID was found in the TO field, add to the unique senders list.
132	nino.borges	812	senderValue = row['Sender']
133	nino.borges	811	self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}\|{senderValue.upper()}")
134	nino.borges	801	else:
135			## No ntrs addresses found at all,
136			noNtrsDomainBucketCount +=1
137	nino.borges	810	## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
138	nino.borges	812	senderValue = row['Sender']
139	nino.borges	811	self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}\|{senderValue.upper()}")
140
141	nino.borges	812	## Gather the subject line, so that I can later get a deduplicated count.
142			subjectLineValue = row['Subject']
143			uniqueSubjectLineSet.add(subjectLineValue.upper())
144	nino.borges	813	## Also populate a subject line matrix, tracking the number of times it appears, to gather a top 10
145			try:
146			subjectLineCountMatrix[subjectLineValue.upper()] += 1
147			except:
148			subjectLineCountMatrix[subjectLineValue.upper()] = 1
149	nino.borges	812
150
151			## This is the count of unique subject lines that exist within the CSV.
152	nino.borges	813	uniqueSubjectLineCount = len(uniqueSubjectLineSet)
153
154	nino.borges	809	scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
155	nino.borges	813
156			## Gather the top 10 subject lines
157			sortedSubjectLineCountMatrix = dict(sorted(subjectLineCountMatrix.items(), key=lambda item: item[1], reverse=True))
158			sortedSubjectKeyList = list(sortedSubjectLineCountMatrix.keys())
159			if len(sortedSubjectKeyList) < 10:
160			sNumbMax = len(sortedSubjectKeyList)
161			else:
162			sNumbMax = 10
163			for sNumb in range(sNumbMax):
164			#print(f"{sortedSubjectKeyList[s]}\|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[s]]}")
165	nino.borges	814	self.subjectLineTopTenReportSet.add(f"{fl}\|{sortedSubjectKeyList[sNumb]}\|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]}\|{sortedSubjectLineCountMatrix[sortedSubjectKeyList[sNumb]]/fileRowCount}")
166	nino.borges	813
167	nino.borges	812	print(f"{fl}\|{dateInPath}\|{fileRowCount}\|{eightyCharRowCount}\|{noAddressesInToFieldCount}\|{nonEightyCharRowCount}\|{charsOverEighty}\|{len(allEmailAddressesInCSVSet)}\|{toFieldAddressesInCSVCount/fileRowCount}\|{noNtrsDomainBucketCount}\|{noNtrsDomainBucketCount/fileRowCount}\|{ntrsAndLanBucketCount}\|{ntrsAndLanBucketCount/fileRowCount}\|{ntrsNoLanBucketCount}\|{ntrsNoLanBucketCount/fileRowCount}\|{scenario}\|{self.scenarioDescriptionMatrix[scenario]}\|{uniqueSubjectLineCount}\|{uniqueSubjectLineCount/fileRowCount}")
168	nino.borges	801	csv_file.close()
169	nino.borges	803	## Update the global all email addresses set, if they selected this option
170			if writeAllPossibleEmailAddressesLogFile:
171			self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
172	nino.borges	812
173
174	nino.borges	801	if writeTrueLanIDLogFile:
175			print("Writing the True LAN ID log file...")
176			self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
177			print("Done.\n")
178			if writeFalsePositiveLanIDLogFile:
179			print("Writing the False-Positive LAN ID log file...")
180			self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
181			print("Done.\n")
182	nino.borges	803	if writeAllPossibleEmailAddressesLogFile:
183			print("Writing the All Possible Email Addresses Across All CSV Files log file...")
184			self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
185			print("Done.\n")
186	nino.borges	810	if writeSpecialSendersListLogFile:
187			print("Writing the deduplicated special senders data to log file....")
188	nino.borges	811	self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
189	nino.borges	810	print("Done.\n")
190	nino.borges	813	if writeTopTenSubjectLinesReportLogFile:
191			print("Writing the Top 10 Subject Lines Report to log file....")
192	nino.borges	814	self.WriteLogFile(self.subjectLineTopTenReportSet, self.subjectLineTopTenReportFileName, doNotSort = True)
193	nino.borges	813	print("Done.\n")
194	nino.borges	801
195
196			def LanIDTrueTest(self, listOfIds):
197			"""A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
198			lanIDTestResult = False
199			for lanID in listOfIds:
200			alphaOnly = [x.lower() for x in lanID if x.isalpha()]
201			if len(alphaOnly) > 10:
202			## I'm too big to be a true LAN ID
203			self.falsePositiveLanIdAddressesSet.add(lanID)
204			else:
205			self.trueLanIdAddressesSet.add(lanID)
206			lanIDTestResult = True
207			return lanIDTestResult
208
209
210	nino.borges	814	def WriteLogFile(self, setOfValues, outputFilePath, doNotSort = False):
211	nino.borges	801	"""Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
212			fileNameInc = 0
213			while os.path.isfile(outputFilePath):
214			fileNameInc +=1
215			outputFile, extension = os.path.splitext(outputFilePath)
216			outputFilePath = outputFile + str(fileNameInc) + extension
217			outFl = open(outputFilePath,'w')
218			tempList = list(setOfValues)
219	nino.borges	814	if doNotSort:
220			pass
221			else:
222			tempList.sort()
223	nino.borges	801	for i in tempList:
224	nino.borges	811	try:
225			outFl.write(f"{i}\n")
226			except:
227			print(i)
228	nino.borges	801	outFl.close()
229
230	nino.borges	809	def CalculateScenario(self,rawNumber, averageNumbRecip):
231			"""This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
232			scenario = "Uncategorized"
233			if rawNumber > .76:
234			scenario = "C"
235			elif averageNumbRecip < 15 and rawNumber < .019:
236			scenario = "A or B"
237			elif rawNumber > .009 and rawNumber < .34:
238			scenario = "D"
239
240			return scenario
241
242	nino.borges	797	if __name__ == '__main__':
243
244	nino.borges	801	tsa = TopSendersAnalyzer()
245	nino.borges	813	tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True, writeTopTenSubjectLinesReportLogFile = True)
246	nino.borges	797
247	nino.borges	801
248	nino.borges	800	#print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
249			#outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
250			#allToLanAddressesList = list(allToLanAddressesSet)
251			#allToLanAddressesList.sort()
252			#for i in allToLanAddressesList:
253			# outputFile.write(f"{i}\n")
254			#outputFile.close()
255	nino.borges	797
256			## Initially gathering some very basic information across the CSV files, not using csv lib
257			# for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
258			# for fl in files:
259			# contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
260			# print(f"{fl}\|{len(contents)-1}")