Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py

"""

NTRS-TopSenderAnalysis

Created by:
Emanuel Borges
09.20.2023

Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
To-Do: Method to QC for any parsing errors.  There should be the same number of fields across all of the CSVs.; 

"""

import csv, os, re

class TopSendersAnalyzer(object):
    version = "0.09"

    def __init__(self):
        self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"


        ##  Matrix containing the scenarios to scenario descriptions
        self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}

        ##  All possible email addresses across all CSV files
        self.allPossibleEmailAddressesSet = set()
        self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"

        ##  All email addresses with an @NTRS.COM domain. Currently unsupported.
        #self.allToNtrsAddressesSet = set()
        #self.allToNtrsAddressesOutputFileName = r""

        ##  All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"

        ##  False positive NTRS LAN ID matches, per specification provided to me.  Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"

        ##  Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName|valuesSet.
        ##  Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
        self.senderEmailAddressesAcrossCSVsSet = set()
        self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
        
        #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
        self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'


        ##  Simple match to pull out the date as recorded in the path
        self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'


        ##  Match for pulling out all email addresses, regardless of domain.
        #self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
        #self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
        #self.allPossibleEmailAddressesRegExPattern = r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
        #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False):
        """Main Method in this program"""
        print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description|Unique Subject Line Count|Unique Subject Line %")
        for (root,dirs,files) in os.walk(self.startDir):
            for fl in files:
                fileRowCount = 0
                noAddressesInToFieldCount = 0
                eightyCharRowCount = 0
                nonEightyCharRowCount = 0
                charsOverEighty = False
                noNtrsDomainBucketCount = 0
                ntrsAndLanBucketCount = 0
                ntrsNoLanBucketCount = 0
                dateInPath = re.findall(self.dateInPathRegExPattern, root)
                allEmailAddressesInCSVSet = set()


                ##  This is the full count of receipient addresses found in the CSV, not unique addresses
                toFieldAddressesInCSVCount = 0


               ##  This set holds the list of unique subject lines, so that I can get a count below.
                uniqueSubjectLineSet = set()

                with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
                    csv_reader = csv.DictReader(csv_file)    
                    for row in csv_reader:
                        fileRowCount += 1
                        if len(row['To']) == 80:
                            eightyCharRowCount +=1
                        else:
                            nonEightyCharRowCount +=1
                        if len(row['To']) > 80:
                           charsOverEighty = True 
                        toValue = row['To']
                        toValue = toValue.upper()
                        
                        ##  Match and gather all possible email addresses, adding it to the per CSV set.                        
                        allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)    
                        for eAddress in allEmailAddresses:
                            allEmailAddressesInCSVSet.add(eAddress)

                        ## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
                        if len(allEmailAddresses) == 0:
                            noAddressesInToFieldCount +=1
                        else:
                            toFieldAddressesInCSVCount += len(allEmailAddresses)
                        ##  Perform the main logic tests
                        if "NTRS.COM" in toValue:
                            ## The domain was found. Apply next test.
                            ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
                            ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
                            if ntrsLanIDTrueTestResult:
                                ## At least 1 LAN ID was found, using the True Test
                                ntrsAndLanBucketCount +=1
                                #for a in ntrsLanAddressesList:
                                #    allToLanAddressesSet.add(a)
                            else:
                                ## Not 1 true LAN ID was found, using the True Test
                                ntrsNoLanBucketCount +=1
                                ## Also since no LAN ID was found in the TO field, add to the unique senders list.
                                senderValue = row['Sender']
                                self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
                        else:
                            ## No ntrs addresses found at all, 
                            noNtrsDomainBucketCount +=1
                            ## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
                            senderValue = row['Sender']
                            self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}|{senderValue.upper()}")
                            
                        ##  Gather the subject line, so that I can later get a deduplicated count.
                        subjectLineValue = row['Subject']
                        uniqueSubjectLineSet.add(subjectLineValue.upper())


                ##  This is the count of unique subject lines that exist within the CSV.
                uniqueSubjectLineCount = len(uniqueSubjectLineSet)        
                scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
                
                print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}|{uniqueSubjectLineCount}|{uniqueSubjectLineCount/fileRowCount}")
                csv_file.close()
                ##  Update the global all email addresses set, if they selected this option
                if writeAllPossibleEmailAddressesLogFile:
                    self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)

                
        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeAllPossibleEmailAddressesLogFile:
            print("Writing the All Possible Email Addresses Across All CSV Files log file...")
            self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
            print("Done.\n")
        if writeSpecialSendersListLogFile:
            print("Writing the deduplicated special senders data to log file....")
            self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
            print("Done.\n")
        

    def LanIDTrueTest(self, listOfIds):
        """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
        lanIDTestResult = False
        for lanID in listOfIds:
            alphaOnly = [x.lower() for x in lanID if x.isalpha()]
            if len(alphaOnly) > 10:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult


    def WriteLogFile(self, setOfValues, outputFilePath):
        """Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
        fileNameInc = 0
        while os.path.isfile(outputFilePath):
            fileNameInc +=1
            outputFile, extension = os.path.splitext(outputFilePath)
            outputFilePath = outputFile + str(fileNameInc) + extension
        outFl = open(outputFilePath,'w')
        tempList = list(setOfValues)
        tempList.sort()
        for i in tempList:
            try:
                outFl.write(f"{i}\n")
            except:
                print(i) 
        outFl.close()

    def CalculateScenario(self,rawNumber, averageNumbRecip):
        """This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
        scenario = "Uncategorized"
        if rawNumber > .76:
            scenario = "C"
        elif averageNumbRecip < 15 and rawNumber < .019:
            scenario = "A or B"
        elif  rawNumber > .009 and rawNumber < .34:
            scenario = "D"

        return scenario

if __name__ == '__main__':

    tsa = TopSendersAnalyzer()
    tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True)


    #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
    #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
    #allToLanAddressesList = list(allToLanAddressesSet)
    #allToLanAddressesList.sort()
    #for i in allToLanAddressesList:
    #    outputFile.write(f"{i}\n")
    #outputFile.close()

    ## Initially gathering some very basic information across the CSV files, not using csv lib
#    for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
#        for fl in files:
#            contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
#            print(f"{fl}|{len(contents)-1}")
Revision:	812
Committed:	Mon Jan 8 21:44:57 2024 UTC (2 years, 2 months ago) by nino.borges
Content type:	text/x-python
File size:	13817 byte(s)
Log Message:	This is the working version that has the final senders values code as well as the unique subject line count code but before I added any top 10 subject line coding was added.
#	Content
1	"""
2
3	NTRS-TopSenderAnalysis
4
5	Created by:
6	Emanuel Borges
7	09.20.2023
8
9	Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10	To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11
12	"""
13
14	import csv, os, re
15
16	class TopSendersAnalyzer(object):
17	version = "0.09"
18
19	def __init__(self):
20	self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
21	#self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
22	#self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
23
24
25	## Matrix containing the scenarios to scenario descriptions
26	self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}
27
28	## All possible email addresses across all CSV files
29	self.allPossibleEmailAddressesSet = set()
30	self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
31
32	## All email addresses with an @NTRS.COM domain. Currently unsupported.
33	#self.allToNtrsAddressesSet = set()
34	#self.allToNtrsAddressesOutputFileName = r""
35
36	## All true NTRS LAN ID matches, per specification provided to me.
37	self.trueLanIdAddressesSet = set()
38	self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
39
40	## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
41	self.falsePositiveLanIdAddressesSet = set()
42	self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
43
44	## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Set of fileName\|valuesSet.
45	## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
46	self.senderEmailAddressesAcrossCSVsSet = set()
47	self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
48
49	#self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
50	self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
51
52
53	## Simple match to pull out the date as recorded in the path
54	self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
55
56
57	## Match for pulling out all email addresses, regardless of domain.
58	#self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z\|a-z]{2,})+'
59	#self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'+/=?^_`{\|}~-]+(?:\.[a-z0-9!#$%&'+/=^_`{\|}~-]+)\|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]\|\\[\x01-\x09\x0b\x0c\x0e-\x7f])")@(?:(?:[a-z0-9](?:[a-z0-9-][a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-][a-z0-9])?\|\[(?:(?:(2(5[0-5]\|[0-4][0-9])\|1[0-9][0-9]\|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]\|[0-4][0-9])\|1[0-9][0-9]\|[1-9]?[0-9])\|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]\|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
60	#self.allPossibleEmailAddressesRegExPattern = r"([-!#-'+/-9=?A-Z^-~]+(\.[-!#-'+/-9=?A-Z^-~]+)\|\"([]!#-[^-~ \t]\|(\\[\t -~]))+\")@([-!#-'+/-9=?A-Z^-~]+(\.[-!#-'+/-9=?A-Z^-~]+)\|\[[\t -Z^-~]*])"
61	#self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
62	self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
63
64	def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False):
65	"""Main Method in this program"""
66	print("FileName\|CSV Date\|Count no Header\|80 Char TO Row Count\|# Rows with NO EMAIL ADDRESSES\|non-80 Char TO Row Count\|TO over 80 Char?\|Unique Email Addresses Count\|Average # of Recipients\|# Rows No NTRS Domains\|% No NTRS Domains\|# Rows NTRS and LAN IDs\|% NTRS and LAN IDs\|# Rows NTRS No LAN IDs\|% NTRS No LAN IDs\|Scenario\|Scenario Short Description\|Unique Subject Line Count\|Unique Subject Line %")
67	for (root,dirs,files) in os.walk(self.startDir):
68	for fl in files:
69	fileRowCount = 0
70	noAddressesInToFieldCount = 0
71	eightyCharRowCount = 0
72	nonEightyCharRowCount = 0
73	charsOverEighty = False
74	noNtrsDomainBucketCount = 0
75	ntrsAndLanBucketCount = 0
76	ntrsNoLanBucketCount = 0
77	dateInPath = re.findall(self.dateInPathRegExPattern, root)
78	allEmailAddressesInCSVSet = set()
79
80
81	## This is the full count of receipient addresses found in the CSV, not unique addresses
82	toFieldAddressesInCSVCount = 0
83
84
85	## This set holds the list of unique subject lines, so that I can get a count below.
86	uniqueSubjectLineSet = set()
87
88	with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
89	csv_reader = csv.DictReader(csv_file)
90	for row in csv_reader:
91	fileRowCount += 1
92	if len(row['To']) == 80:
93	eightyCharRowCount +=1
94	else:
95	nonEightyCharRowCount +=1
96	if len(row['To']) > 80:
97	charsOverEighty = True
98	toValue = row['To']
99	toValue = toValue.upper()
100
101	## Match and gather all possible email addresses, adding it to the per CSV set.
102	allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
103	for eAddress in allEmailAddresses:
104	allEmailAddressesInCSVSet.add(eAddress)
105
106	## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
107	if len(allEmailAddresses) == 0:
108	noAddressesInToFieldCount +=1
109	else:
110	toFieldAddressesInCSVCount += len(allEmailAddresses)
111	## Perform the main logic tests
112	if "NTRS.COM" in toValue:
113	## The domain was found. Apply next test.
114	ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
115	ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
116	if ntrsLanIDTrueTestResult:
117	## At least 1 LAN ID was found, using the True Test
118	ntrsAndLanBucketCount +=1
119	#for a in ntrsLanAddressesList:
120	# allToLanAddressesSet.add(a)
121	else:
122	## Not 1 true LAN ID was found, using the True Test
123	ntrsNoLanBucketCount +=1
124	## Also since no LAN ID was found in the TO field, add to the unique senders list.
125	senderValue = row['Sender']
126	self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}\|{senderValue.upper()}")
127	else:
128	## No ntrs addresses found at all,
129	noNtrsDomainBucketCount +=1
130	## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
131	senderValue = row['Sender']
132	self.senderEmailAddressesAcrossCSVsSet.add(f"{fl}\|{senderValue.upper()}")
133
134	## Gather the subject line, so that I can later get a deduplicated count.
135	subjectLineValue = row['Subject']
136	uniqueSubjectLineSet.add(subjectLineValue.upper())
137
138
139	## This is the count of unique subject lines that exist within the CSV.
140	uniqueSubjectLineCount = len(uniqueSubjectLineSet)
141	scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
142
143	print(f"{fl}\|{dateInPath}\|{fileRowCount}\|{eightyCharRowCount}\|{noAddressesInToFieldCount}\|{nonEightyCharRowCount}\|{charsOverEighty}\|{len(allEmailAddressesInCSVSet)}\|{toFieldAddressesInCSVCount/fileRowCount}\|{noNtrsDomainBucketCount}\|{noNtrsDomainBucketCount/fileRowCount}\|{ntrsAndLanBucketCount}\|{ntrsAndLanBucketCount/fileRowCount}\|{ntrsNoLanBucketCount}\|{ntrsNoLanBucketCount/fileRowCount}\|{scenario}\|{self.scenarioDescriptionMatrix[scenario]}\|{uniqueSubjectLineCount}\|{uniqueSubjectLineCount/fileRowCount}")
144	csv_file.close()
145	## Update the global all email addresses set, if they selected this option
146	if writeAllPossibleEmailAddressesLogFile:
147	self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
148
149
150	if writeTrueLanIDLogFile:
151	print("Writing the True LAN ID log file...")
152	self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
153	print("Done.\n")
154	if writeFalsePositiveLanIDLogFile:
155	print("Writing the False-Positive LAN ID log file...")
156	self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
157	print("Done.\n")
158	if writeAllPossibleEmailAddressesLogFile:
159	print("Writing the All Possible Email Addresses Across All CSV Files log file...")
160	self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
161	print("Done.\n")
162	if writeSpecialSendersListLogFile:
163	print("Writing the deduplicated special senders data to log file....")
164	self.WriteLogFile(self.senderEmailAddressesAcrossCSVsSet, self.senderEmailAddressesFileName)
165	print("Done.\n")
166
167
168	def LanIDTrueTest(self, listOfIds):
169	"""A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
170	lanIDTestResult = False
171	for lanID in listOfIds:
172	alphaOnly = [x.lower() for x in lanID if x.isalpha()]
173	if len(alphaOnly) > 10:
174	## I'm too big to be a true LAN ID
175	self.falsePositiveLanIdAddressesSet.add(lanID)
176	else:
177	self.trueLanIdAddressesSet.add(lanID)
178	lanIDTestResult = True
179	return lanIDTestResult
180
181
182	def WriteLogFile(self, setOfValues, outputFilePath):
183	"""Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
184	fileNameInc = 0
185	while os.path.isfile(outputFilePath):
186	fileNameInc +=1
187	outputFile, extension = os.path.splitext(outputFilePath)
188	outputFilePath = outputFile + str(fileNameInc) + extension
189	outFl = open(outputFilePath,'w')
190	tempList = list(setOfValues)
191	tempList.sort()
192	for i in tempList:
193	try:
194	outFl.write(f"{i}\n")
195	except:
196	print(i)
197	outFl.close()
198
199	def CalculateScenario(self,rawNumber, averageNumbRecip):
200	"""This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
201	scenario = "Uncategorized"
202	if rawNumber > .76:
203	scenario = "C"
204	elif averageNumbRecip < 15 and rawNumber < .019:
205	scenario = "A or B"
206	elif rawNumber > .009 and rawNumber < .34:
207	scenario = "D"
208
209	return scenario
210
211	if __name__ == '__main__':
212
213	tsa = TopSendersAnalyzer()
214	tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True, writeSpecialSendersListLogFile = True)
215
216
217	#print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
218	#outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
219	#allToLanAddressesList = list(allToLanAddressesSet)
220	#allToLanAddressesList.sort()
221	#for i in allToLanAddressesList:
222	# outputFile.write(f"{i}\n")
223	#outputFile.close()
224
225	## Initially gathering some very basic information across the CSV files, not using csv lib
226	# for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
227	# for fl in files:
228	# contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
229	# print(f"{fl}\|{len(contents)-1}")