Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py

"""

NTRS-TopSenderAnalysis

Created by:
Emanuel Borges
09.20.2023

Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
To-Do: Method to QC for any parsing errors.  There should be the same number of fields across all of the CSVs.; 

"""

import csv, os, re

class TopSendersAnalyzer(object):
    version = "0.07"

    def __init__(self):
        self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"


        ##  Matrix containing the scenarios to scenario descriptions
        self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}

        ##  All possible email addresses across all CSV files
        self.allPossibleEmailAddressesSet = set()
        self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"

        ##  All email addresses with an @NTRS.COM domain. Currently unsupported.
        #self.allToNtrsAddressesSet = set()
        #self.allToNtrsAddressesOutputFileName = r""

        ##  All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"

        ##  False positive NTRS LAN ID matches, per specification provided to me.  Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"

        ##  Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Matrix of fileName:valuesSet.
        self.senderEmailAddressesMatrix = {}
        self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
        
        #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
        self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'


        ##  Simple match to pull out the date as recorded in the path
        self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'


        ##  Match for pulling out all email addresses, regardless of domain.
        #self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
        #self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
        #self.allPossibleEmailAddressesRegExPattern = r"([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])"
        #self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False):
        """Main Method in this program"""
        print("FileName|CSV Date|Count no Header|80 Char TO Row Count|# Rows with NO EMAIL ADDRESSES|non-80 Char TO Row Count|TO over 80 Char?|Unique Email Addresses Count|Average # of Recipients|# Rows No NTRS Domains|% No NTRS Domains|# Rows NTRS and LAN IDs|% NTRS and LAN IDs|# Rows NTRS No LAN IDs|% NTRS No LAN IDs|Scenario|Scenario Short Description")
        for (root,dirs,files) in os.walk(self.startDir):
            for fl in files:
                fileRowCount = 0
                noAddressesInToFieldCount = 0
                eightyCharRowCount = 0
                nonEightyCharRowCount = 0
                charsOverEighty = False
                noNtrsDomainBucketCount = 0
                ntrsAndLanBucketCount = 0
                ntrsNoLanBucketCount = 0
                dateInPath = re.findall(self.dateInPathRegExPattern, root)
                allEmailAddressesInCSVSet = set()

                ##  This is the list of unique Sender Addresses,per CSV, ONLY IF THE EMAIL IS NOT TO A LANID.  Diana asked for this list.
                ##  Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
                senderEmailAddressesInCSVSet = set()

                ##  This is the full count of receipient addresses found in the CSV, not unique addresses
                toFieldAddressesInCSVCount = 0

                with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
                    csv_reader = csv.DictReader(csv_file)    
                    for row in csv_reader:
                        fileRowCount += 1
                        if len(row['To']) == 80:
                            eightyCharRowCount +=1
                        else:
                            nonEightyCharRowCount +=1
                        if len(row['To']) > 80:
                           charsOverEighty = True 
                        toValue = row['To']
                        toValue = toValue.upper()
                        
                        ##  Match and gather all possible email addresses, adding it to the per CSV set.                        
                        allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)    
                        for eAddress in allEmailAddresses:
                            allEmailAddressesInCSVSet.add(eAddress)

                        ## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
                        if len(allEmailAddresses) == 0:
                            noAddressesInToFieldCount +=1
                        else:
                            toFieldAddressesInCSVCount += len(allEmailAddresses)
                        ##  Perform the main logic tests
                        if "NTRS.COM" in toValue:
                            ## The domain was found. Apply next test.
                            ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
                            ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
                            if ntrsLanIDTrueTestResult:
                                ## At least 1 LAN ID was found, using the True Test
                                ntrsAndLanBucketCount +=1
                                #for a in ntrsLanAddressesList:
                                #    allToLanAddressesSet.add(a)
                            else:
                                ## Not 1 true LAN ID was found, using the True Test
                                ntrsNoLanBucketCount +=1
                                ## Also since no LAN ID was found in the TO field, add to the unique senders list.
                                senderValue = toValue = row['Sender']
                                senderEmailAddressesInCSVSet.add(senderValue.upper())
                        else:
                            ## No ntrs addresses found at all, 
                            noNtrsDomainBucketCount +=1
                            ## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
                            senderValue = toValue = row['Sender']
                            senderEmailAddressesInCSVSet.add(senderValue.upper())
                scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
                print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{noAddressesInToFieldCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{toFieldAddressesInCSVCount/fileRowCount}|{noNtrsDomainBucketCount}|{noNtrsDomainBucketCount/fileRowCount}|{ntrsAndLanBucketCount}|{ntrsAndLanBucketCount/fileRowCount}|{ntrsNoLanBucketCount}|{ntrsNoLanBucketCount/fileRowCount}|{scenario}|{self.scenarioDescriptionMatrix[scenario]}")
                csv_file.close()
                ##  Update the global all email addresses set, if they selected this option
                if writeAllPossibleEmailAddressesLogFile:
                    self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)

                ##  Update the global special senders matrix, if they selected this option.
                if writeSpecialSendersListLogFile:
                    self.senderEmailAddressesMatrix[fl] = senderEmailAddressesInCSVSet
                    
        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeAllPossibleEmailAddressesLogFile:
            print("Writing the All Possible Email Addresses Across All CSV Files log file...")
            self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
            print("Done.\n")
        if writeSpecialSendersListLogFile:
            print("Writing the deduplicated special senders data to log file....")
            
            
            print("Done.\n")
        

    def LanIDTrueTest(self, listOfIds):
        """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
        lanIDTestResult = False
        for lanID in listOfIds:
            alphaOnly = [x.lower() for x in lanID if x.isalpha()]
            if len(alphaOnly) > 10:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult


    def WriteLogFile(self, setOfValues, outputFilePath):
        """Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
        fileNameInc = 0
        while os.path.isfile(outputFilePath):
            fileNameInc +=1
            outputFile, extension = os.path.splitext(outputFilePath)
            outputFilePath = outputFile + str(fileNameInc) + extension
        outFl = open(outputFilePath,'w')
        tempList = list(setOfValues)
        tempList.sort()
        for i in tempList:
            outFl.write(f"{i}\n")
        outFl.close()

    def CalculateScenario(self,rawNumber, averageNumbRecip):
        """This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
        scenario = "Uncategorized"
        if rawNumber > .76:
            scenario = "C"
        elif averageNumbRecip < 15 and rawNumber < .019:
            scenario = "A or B"
        elif  rawNumber > .009 and rawNumber < .34:
            scenario = "D"

        return scenario

if __name__ == '__main__':
    ##  Script entry point: run the analysis against the default start directory and write
    ##  the True LAN ID, False-Positive LAN ID and All Email Addresses log files.
    ##  The special senders list is not requested here.
    requestedLogFiles = {
        "writeTrueLanIDLogFile": True,
        "writeFalsePositiveLanIDLogFile": True,
        "writeAllPossibleEmailAddressesLogFile": True,
    }
    tsa = TopSendersAnalyzer()
    tsa.AnalyzeTopSenders(**requestedLogFiles)


    #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
    #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
    #allToLanAddressesList = list(allToLanAddressesSet)
    #allToLanAddressesList.sort()
    #for i in allToLanAddressesList:
    #    outputFile.write(f"{i}\n")
    #outputFile.close()

    ## Initially gathering some very basic information across the CSV files, not using csv lib
#    for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
#        for fl in files:
#            contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
#            print(f"{fl}|{len(contents)-1}")
Revision:	810
Committed:	Fri Jan 5 19:24:53 2024 UTC (2 years, 2 months ago) by nino.borges
Content type:	text/x-python
File size:	13325 byte(s)
Log Message:	When adding the special senders list (rows where there is no NTRS domain in the TO field, or where there is one but it is not a LAN ID), I initially started by wanting separate per-CSV lists and then a global list. However, Diana only needs a single large list, one that references which file each value came from and only deduplicates at the file level. I'm going to change the code for .8 to have only one set, which still only deduplicates at the file level, instead of two. Saving this to version control in the event she asks me to have both.
#	Content
1	"""
2
3	NTRS-TopSenderAnalysis
4
5	Created by:
6	Emanuel Borges
7	09.20.2023
8
9	Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10	To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11
12	"""
13
14	import csv, os, re
15
16	class TopSendersAnalyzer(object):
17	version = "0.07"
18
19	def __init__(self):
20	self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20231227 - FileNetTopSenderAnalysis-Req"
21	#self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
22	#self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
23
24
25	## Matrix containing the scenarios to scenario descriptions
26	self.scenarioDescriptionMatrix = {"A or B":"Sends to few recipients; evaluate subjects re: single or multi-purpose", "C":">75% of messages are to LAN IDs", "D":"<33% of messages are to LAN IDs","Uncategorized":"Remainders"}
27
28	## All possible email addresses across all CSV files
29	self.allPossibleEmailAddressesSet = set()
30	self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
31
32	## All email addresses with an @NTRS.COM domain. Currently unsupported.
33	#self.allToNtrsAddressesSet = set()
34	#self.allToNtrsAddressesOutputFileName = r""
35
36	## All true NTRS LAN ID matches, per specification provided to me.
37	self.trueLanIdAddressesSet = set()
38	self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
39
40	## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
41	self.falsePositiveLanIdAddressesSet = set()
42	self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
43
44	## Sender values, deduplicated within each CSV only, where there is either no NTRS domain in the to or there is but it's not a lanID. Matrix of fileName:valuesSet.
45	self.senderEmailAddressesMatrix = {}
46	self.senderEmailAddressesFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\SendersListSpecial.txt"
47
48	#self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
49	self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
50
51
52	## Simple match to pull out the date as recorded in the path
53	self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
54
55
56	## Match for pulling out all email addresses, regardless of domain.
57	#self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z\|a-z]{2,})+'
58	#self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'+/=?^_`{\|}~-]+(?:\.[a-z0-9!#$%&'+/=^_`{\|}~-]+)\|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]\|\\[\x01-\x09\x0b\x0c\x0e-\x7f])")@(?:(?:[a-z0-9](?:[a-z0-9-][a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-][a-z0-9])?\|\[(?:(?:(2(5[0-5]\|[0-4][0-9])\|1[0-9][0-9]\|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]\|[0-4][0-9])\|1[0-9][0-9]\|[1-9]?[0-9])\|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]\|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
59	#self.allPossibleEmailAddressesRegExPattern = r"([-!#-'+/-9=?A-Z^-~]+(\.[-!#-'+/-9=?A-Z^-~]+)\|\"([]!#-[^-~ \t]\|(\\[\t -~]))+\")@([-!#-'+/-9=?A-Z^-~]+(\.[-!#-'+/-9=?A-Z^-~]+)\|\[[\t -Z^-~]*])"
60	#self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
61	self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
62
63	def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False, writeSpecialSendersListLogFile = False):
64	"""Main Method in this program"""
65	print("FileName\|CSV Date\|Count no Header\|80 Char TO Row Count\|# Rows with NO EMAIL ADDRESSES\|non-80 Char TO Row Count\|TO over 80 Char?\|Unique Email Addresses Count\|Average # of Recipients\|# Rows No NTRS Domains\|% No NTRS Domains\|# Rows NTRS and LAN IDs\|% NTRS and LAN IDs\|# Rows NTRS No LAN IDs\|% NTRS No LAN IDs\|Scenario\|Scenario Short Description")
66	for (root,dirs,files) in os.walk(self.startDir):
67	for fl in files:
68	fileRowCount = 0
69	noAddressesInToFieldCount = 0
70	eightyCharRowCount = 0
71	nonEightyCharRowCount = 0
72	charsOverEighty = False
73	noNtrsDomainBucketCount = 0
74	ntrsAndLanBucketCount = 0
75	ntrsNoLanBucketCount = 0
76	dateInPath = re.findall(self.dateInPathRegExPattern, root)
77	allEmailAddressesInCSVSet = set()
78
79	## This is the list of unique Sender Addresses,per CSV, ONLY IF THE EMAIL IS NOT TO A LANID. Diana asked for this list.
80	## Since there is only a single value in sender, I'm grabbing the entire value and not just the email address.
81	senderEmailAddressesInCSVSet = set()
82
83	## This is the full count of receipient addresses found in the CSV, not unique addresses
84	toFieldAddressesInCSVCount = 0
85
86	with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
87	csv_reader = csv.DictReader(csv_file)
88	for row in csv_reader:
89	fileRowCount += 1
90	if len(row['To']) == 80:
91	eightyCharRowCount +=1
92	else:
93	nonEightyCharRowCount +=1
94	if len(row['To']) > 80:
95	charsOverEighty = True
96	toValue = row['To']
97	toValue = toValue.upper()
98
99	## Match and gather all possible email addresses, adding it to the per CSV set.
100	allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
101	for eAddress in allEmailAddresses:
102	allEmailAddressesInCSVSet.add(eAddress)
103
104	## If there are no email addresses in the TO field at all, increment that count. Else, add the number to the full count of email addresses for average calculation
105	if len(allEmailAddresses) == 0:
106	noAddressesInToFieldCount +=1
107	else:
108	toFieldAddressesInCSVCount += len(allEmailAddresses)
109	## Perform the main logic tests
110	if "NTRS.COM" in toValue:
111	## The domain was found. Apply next test.
112	ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
113	ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
114	if ntrsLanIDTrueTestResult:
115	## At least 1 LAN ID was found, using the True Test
116	ntrsAndLanBucketCount +=1
117	#for a in ntrsLanAddressesList:
118	# allToLanAddressesSet.add(a)
119	else:
120	## Not 1 true LAN ID was found, using the True Test
121	ntrsNoLanBucketCount +=1
122	## Also since no LAN ID was found in the TO field, add to the unique senders list.
123	senderValue = toValue = row['Sender']
124	senderEmailAddressesInCSVSet.add(senderValue.upper())
125	else:
126	## No ntrs addresses found at all,
127	noNtrsDomainBucketCount +=1
128	## Also since no NTRS address was found in the TO field at all, add to the unique senders list.
129	senderValue = toValue = row['Sender']
130	senderEmailAddressesInCSVSet.add(senderValue.upper())
131	scenario = self.CalculateScenario(ntrsAndLanBucketCount/fileRowCount, toFieldAddressesInCSVCount/fileRowCount)
132	print(f"{fl}\|{dateInPath}\|{fileRowCount}\|{eightyCharRowCount}\|{noAddressesInToFieldCount}\|{nonEightyCharRowCount}\|{charsOverEighty}\|{len(allEmailAddressesInCSVSet)}\|{toFieldAddressesInCSVCount/fileRowCount}\|{noNtrsDomainBucketCount}\|{noNtrsDomainBucketCount/fileRowCount}\|{ntrsAndLanBucketCount}\|{ntrsAndLanBucketCount/fileRowCount}\|{ntrsNoLanBucketCount}\|{ntrsNoLanBucketCount/fileRowCount}\|{scenario}\|{self.scenarioDescriptionMatrix[scenario]}")
133	csv_file.close()
134	## Update the global all email addresses set, if they selected this option
135	if writeAllPossibleEmailAddressesLogFile:
136	self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
137
138	## Update the global special senders matrix, if they selected this option.
139	if writeSpecialSendersListLogFile:
140	self.senderEmailAddressesMatrix[fl] = senderEmailAddressesInCSVSet
141
142	if writeTrueLanIDLogFile:
143	print("Writing the True LAN ID log file...")
144	self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
145	print("Done.\n")
146	if writeFalsePositiveLanIDLogFile:
147	print("Writing the False-Positive LAN ID log file...")
148	self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
149	print("Done.\n")
150	if writeAllPossibleEmailAddressesLogFile:
151	print("Writing the All Possible Email Addresses Across All CSV Files log file...")
152	self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
153	print("Done.\n")
154	if writeSpecialSendersListLogFile:
155	print("Writing the deduplicated special senders data to log file....")
156
157
158	print("Done.\n")
159
160
161	def LanIDTrueTest(self, listOfIds):
162	"""A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
163	lanIDTestResult = False
164	for lanID in listOfIds:
165	alphaOnly = [x.lower() for x in lanID if x.isalpha()]
166	if len(alphaOnly) > 10:
167	## I'm too big to be a true LAN ID
168	self.falsePositiveLanIdAddressesSet.add(lanID)
169	else:
170	self.trueLanIdAddressesSet.add(lanID)
171	lanIDTestResult = True
172	return lanIDTestResult
173
174
175	def WriteLogFile(self, setOfValues, outputFilePath):
176	"""Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
177	fileNameInc = 0
178	while os.path.isfile(outputFilePath):
179	fileNameInc +=1
180	outputFile, extension = os.path.splitext(outputFilePath)
181	outputFilePath = outputFile + str(fileNameInc) + extension
182	outFl = open(outputFilePath,'w')
183	tempList = list(setOfValues)
184	tempList.sort()
185	for i in tempList:
186	outFl.write(f"{i}\n")
187	outFl.close()
188
189	def CalculateScenario(self,rawNumber, averageNumbRecip):
190	"""This method takes the raw number, which should be a decimal calculation of the percent, and returns the scenario code"""
191	scenario = "Uncategorized"
192	if rawNumber > .76:
193	scenario = "C"
194	elif averageNumbRecip < 15 and rawNumber < .019:
195	scenario = "A or B"
196	elif rawNumber > .009 and rawNumber < .34:
197	scenario = "D"
198
199	return scenario
200
201	if __name__ == '__main__':
202
203	tsa = TopSendersAnalyzer()
204	tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True)
205
206
207	#print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
208	#outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
209	#allToLanAddressesList = list(allToLanAddressesSet)
210	#allToLanAddressesList.sort()
211	#for i in allToLanAddressesList:
212	# outputFile.write(f"{i}\n")
213	#outputFile.close()
214
215	## Initially gathering some very basic information across the CSV files, not using csv lib
216	# for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
217	# for fl in files:
218	# contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
219	# print(f"{fl}\|{len(contents)-1}")