# Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py

"""

NTRS-TopSenderAnalysis

Created by:
Emanuel Borges
09.20.2023

Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
To-Do: Add a method to QC for parsing errors; every CSV should have the same number of fields.

"""

import csv, os, re

class TopSendersAnalyzer(object):
    """Walks a directory tree of CSV files and reports on the contents of each file's 'To' column.

    For every file found under ``startDir`` one pipe-delimited summary line is printed.  Along the
    way the class collects three sets of (upper-cased) addresses which can optionally be written
    out as sorted log files: true LAN ID addresses, false-positive LAN ID addresses and every
    email address matched by the general email pattern.
    """
    version = "0.05"

    def __init__(self):
        """Sets the hard-coded input/output locations and the regular expression patterns."""
        self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"

        ##  Encoding used to read every CSV file.  Kept as an attribute so it can be overridden.
        ##  NOTE(review): 'ANSI' is presumably only resolvable on Windows (alias of mbcs) - confirm.
        self.csvEncoding = 'ANSI'

        ##  All possible email addresses across all CSV files
        self.allPossibleEmailAddressesSet = set()
        self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"

        ##  All email addresses with an @NTRS.COM domain. Currently unsupported.
        #self.allToNtrsAddressesSet = set()
        #self.allToNtrsAddressesOutputFileName = r""

        ##  All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"

        ##  False positive NTRS LAN ID matches, per specification provided to me.  Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"

        ##  Letters, then 1-3 digits, then the NTRS domain.  The dot is escaped so that it only
        ##  matches a literal '.' and not any character (e.g. "@NTRSXCOM").
        #self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
        self.lanIdRegExPattern = r'[A-Za-z]{1,15}[0-9]{1,3}@NTRS\.COM'

        ##  Simple match to pull out the date as recorded in the path
        self.dateInPathRegExPattern = r'2023-[0-9]{2}-[0-9]{2}'

        ##  Match for pulling out all email addresses, regardless of domain.
        self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"

    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False):
        """Main Method in this program.

        Walks ``self.startDir``, reads every file as a CSV with a 'To' column and prints one
        pipe-delimited line per file:

            file name | dates found in the directory path | row count | rows where 'To' is exactly
            80 characters | rows where it is not | whether any 'To' exceeds 80 characters | number
            of unique email addresses | rows without "NTRS.COM" | rows with "NTRS.COM" and at least
            one true LAN ID | rows with "NTRS.COM" but no true LAN ID

        Each ``write...LogFile`` flag, when True, writes the matching address set to its configured
        output file once all files have been processed.  The all-addresses set is only accumulated
        when ``writeAllPossibleEmailAddressesLogFile`` is True.

        Raises KeyError if a file has rows but no 'To' column.
        """
        ##  Compile once per run rather than looking the patterns up for every row.
        dateInPathRegEx = re.compile(self.dateInPathRegExPattern)
        allEmailAddressesRegEx = re.compile(self.allPossibleEmailAddressesRegExPattern)
        lanIdRegEx = re.compile(self.lanIdRegExPattern)

        for (root, dirs, files) in os.walk(self.startDir):
            ##  The date only depends on the directory, so it is worked out once per directory.
            dateInPath = dateInPathRegEx.findall(root)
            for fl in files:
                fileRowCount = 0
                eightyCharRowCount = 0
                nonEightyCharRowCount = 0
                charsOverEighty = False
                noNtrsDomainBucketCount = 0
                ntrsAndLanBucketCount = 0
                ntrsNoLanBucketCount = 0
                allEmailAddressesInCSVSet = set()

                ##  newline='' is what the csv module asks for; without it newlines embedded in
                ##  quoted fields get translated and the length checks below can be thrown off.
                with open(os.path.join(root, fl), mode='r', encoding=self.csvEncoding, newline='') as csv_file:
                    for row in csv.DictReader(csv_file):
                        fileRowCount += 1

                        ##  A row with fewer fields than the header yields None here; treat it as empty.
                        rawToValue = row['To'] or ''
                        toLength = len(rawToValue)
                        if toLength == 80:
                            eightyCharRowCount += 1
                        else:
                            nonEightyCharRowCount += 1
                        if toLength > 80:
                            charsOverEighty = True
                        toValue = rawToValue.upper()

                        ##  Match and gather all possible email addresses, adding it to the per CSV set.
                        allEmailAddressesInCSVSet.update(allEmailAddressesRegEx.findall(toValue))

                        ##  Perform the main logic tests
                        if "NTRS.COM" not in toValue:
                            ## No ntrs addresses found at all,
                            noNtrsDomainBucketCount += 1
                        elif self.LanIDTrueTest(lanIdRegEx.findall(toValue)):
                            ## At least 1 LAN ID was found, using the True Test
                            ntrsAndLanBucketCount += 1
                        else:
                            ## Not 1 true LAN ID was found, using the True Test
                            ntrsNoLanBucketCount += 1

                print(f"{fl}|{dateInPath}|{fileRowCount}|{eightyCharRowCount}|{nonEightyCharRowCount}|{charsOverEighty}|{len(allEmailAddressesInCSVSet)}|{noNtrsDomainBucketCount}|{ntrsAndLanBucketCount}|{ntrsNoLanBucketCount}")

                ##  Update the global all email addresses set, if they selected this option
                if writeAllPossibleEmailAddressesLogFile:
                    self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)

        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeAllPossibleEmailAddressesLogFile:
            print("Writing the All Possible Email Addresses Across All CSV Files log file...")
            self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
            print("Done.\n")

    def LanIDTrueTest(self, listOfIds):
        """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list.

        Every entry is also recorded: entries with more than 10 alphabetic characters go to
        ``falsePositiveLanIdAddressesSet``, all others go to ``trueLanIdAddressesSet``.  An empty
        list returns False.

        NOTE(review): the letter count covers the whole matched string, domain included, so for
        "...@NTRS.COM" (7 letters in the domain) the limit works out to 3 letters before the
        digits - confirm this is what the specification intends.
        """
        lanIDTestResult = False
        for lanID in listOfIds:
            alphaCount = sum(1 for character in lanID if character.isalpha())
            if alphaCount > 10:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult

    def WriteLogFile(self, setOfValues, outputFilePath):
        """Takes a Set containing values, sorts these, and then writes them (one per line) to the given outputFilePath.

        An existing file is never overwritten: if ``outputFilePath`` already exists, an
        incrementing number is inserted before the extension (name1.txt, name2.txt, ...) until an
        unused name is found.
        """
        ##  Split the original path once so the counter replaces, rather than piles onto, the
        ##  previous suffix (otherwise name1.txt would be followed by name12.txt, name123.txt...).
        baseName, extension = os.path.splitext(outputFilePath)
        fileNameInc = 0
        while os.path.isfile(outputFilePath):
            fileNameInc += 1
            outputFilePath = f"{baseName}{fileNameInc}{extension}"

        ##  The context manager makes sure the file is closed even if a write fails.
        with open(outputFilePath, 'w') as outFl:
            outFl.writelines(f"{i}\n" for i in sorted(setOfValues))

if __name__ == '__main__':
    ## Script entry point: run the analysis and request all three log files.
    analyzer = TopSendersAnalyzer()
    analyzer.AnalyzeTopSenders(
        writeTrueLanIDLogFile=True,
        writeFalsePositiveLanIDLogFile=True,
        writeAllPossibleEmailAddressesLogFile=True,
    )


    #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
    #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
    #allToLanAddressesList = list(allToLanAddressesSet)
    #allToLanAddressesList.sort()
    #for i in allToLanAddressesList:
    #    outputFile.write(f"{i}\n")
    #outputFile.close()

    ## Initially gathering some very basic information across the CSV files, not using csv lib
#    for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
#        for fl in files:
#            contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
#            print(f"{fl}|{len(contents)-1}")
Revision:	804
Committed:	Thu Oct 12 16:03:05 2023 UTC (2 years, 5 months ago) by nino.borges
Content type:	text/x-python
File size:	9396 byte(s)
Log Message:	Per Diana, @EXNTRS.COM is a valid NTRS domain, so changing the NTRS Domain Criteria to now be NTRS.COM, which simplifies that line a bit and will capture anything with that in it.
#	Content
1	"""
2
3	NTRS-TopSenderAnalysis
4
5	Created by:
6	Emanuel Borges
7	09.20.2023
8
9	Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10	To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11
12	"""
13
14	import csv, os, re
15
16	class TopSendersAnalyzer(object):
17	version = "0.05"
18
19	def __init__(self):
20	self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
21	#self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
22
23
24	## All possible email addresses across all CSV files
25	self.allPossibleEmailAddressesSet = set()
26	self.allPossibleEmailAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_AllEmailAddresses_All-CSVs.txt"
27
28	## All email addresses with an @NTRS.COM domain. Currently unsupported.
29	#self.allToNtrsAddressesSet = set()
30	#self.allToNtrsAddressesOutputFileName = r""
31
32	## All true NTRS LAN ID matches, per specification provided to me.
33	self.trueLanIdAddressesSet = set()
34	self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
35
36	## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
37	self.falsePositiveLanIdAddressesSet = set()
38	self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
39
40	#self.lanIdRegExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
41	self.lanIdRegExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
42
43
44	## Simple match to pull out the date as recorded in the path
45	self.dateInPathRegExPattern = '2023-[0-9]{2}-[0-9]{2}'
46
47
48	## Match for pulling out all email addresses, regardless of domain.
49	#self.allPossibleEmailAddressesRegExPattern = '([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z\|a-z]{2,})+'
50	#self.allPossibleEmailAddressesRegExPattern = """(?:[a-z0-9!#$%&'+/=?^_`{\|}~-]+(?:\.[a-z0-9!#$%&'+/=^_`{\|}~-]+)\|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]\|\\[\x01-\x09\x0b\x0c\x0e-\x7f])")@(?:(?:[a-z0-9](?:[a-z0-9-][a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-][a-z0-9])?\|\[(?:(?:(2(5[0-5]\|[0-4][0-9])\|1[0-9][0-9]\|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]\|[0-4][0-9])\|1[0-9][0-9]\|[1-9]?[0-9])\|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]\|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
51	#self.allPossibleEmailAddressesRegExPattern = r"([-!#-'+/-9=?A-Z^-~]+(\.[-!#-'+/-9=?A-Z^-~]+)\|\"([]!#-[^-~ \t]\|(\\[\t -~]))+\")@([-!#-'+/-9=?A-Z^-~]+(\.[-!#-'+/-9=?A-Z^-~]+)\|\[[\t -Z^-~]*])"
52	#self.allPossibleEmailAddressesRegExPattern = r"[\w\.-]+@[\w\.-]+\.\w+"
53	self.allPossibleEmailAddressesRegExPattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
54
55	def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False, writeAllPossibleEmailAddressesLogFile = False):
56	"""Main Method in this program"""
57	for (root,dirs,files) in os.walk(self.startDir):
58	for fl in files:
59	fileRowCount = 0
60	eightyCharRowCount = 0
61	nonEightyCharRowCount = 0
62	charsOverEighty = False
63	noNtrsDomainBucketCount = 0
64	ntrsAndLanBucketCount = 0
65	ntrsNoLanBucketCount = 0
66	dateInPath = re.findall(self.dateInPathRegExPattern, root)
67	allEmailAddressesInCSVSet = set()
68
69	with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
70	csv_reader = csv.DictReader(csv_file)
71	for row in csv_reader:
72	fileRowCount += 1
73	if len(row['To']) == 80:
74	eightyCharRowCount +=1
75	else:
76	nonEightyCharRowCount +=1
77	if len(row['To']) > 80:
78	charsOverEighty = True
79	toValue = row['To']
80	toValue = toValue.upper()
81
82	## Match and gather all possible email addresses, adding it to the per CSV set.
83	allEmailAddresses = re.findall(self.allPossibleEmailAddressesRegExPattern, toValue)
84	for eAddress in allEmailAddresses:
85	allEmailAddressesInCSVSet.add(eAddress)
86
87	## Perform the main logic tests
88	if "NTRS.COM" in toValue:
89	## The domain was found. Apply next test.
90	ntrsLanAddressesList = re.findall(self.lanIdRegExPattern, toValue)
91	ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
92	if ntrsLanIDTrueTestResult:
93	## At least 1 LAN ID was found, using the True Test
94	ntrsAndLanBucketCount +=1
95	#for a in ntrsLanAddressesList:
96	# allToLanAddressesSet.add(a)
97	else:
98	## Not 1 true LAN ID was found, using the True Test
99	ntrsNoLanBucketCount +=1
100	else:
101	## No ntrs addresses found at all,
102	noNtrsDomainBucketCount +=1
103	print(f"{fl}\|{dateInPath}\|{fileRowCount}\|{eightyCharRowCount}\|{nonEightyCharRowCount}\|{charsOverEighty}\|{len(allEmailAddressesInCSVSet)}\|{noNtrsDomainBucketCount}\|{ntrsAndLanBucketCount}\|{ntrsNoLanBucketCount}")
104	csv_file.close()
105	## Update the global all email addresses set, if they selected this option
106	if writeAllPossibleEmailAddressesLogFile:
107	self.allPossibleEmailAddressesSet.update(allEmailAddressesInCSVSet)
108	if writeTrueLanIDLogFile:
109	print("Writing the True LAN ID log file...")
110	self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
111	print("Done.\n")
112	if writeFalsePositiveLanIDLogFile:
113	print("Writing the False-Positive LAN ID log file...")
114	self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
115	print("Done.\n")
116	if writeAllPossibleEmailAddressesLogFile:
117	print("Writing the All Possible Email Addresses Across All CSV Files log file...")
118	self.WriteLogFile(self.allPossibleEmailAddressesSet, self.allPossibleEmailAddressesOutputFileName)
119	print("Done.\n")
120
121
122	def LanIDTrueTest(self, listOfIds):
123	"""A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
124	lanIDTestResult = False
125	for lanID in listOfIds:
126	alphaOnly = [x.lower() for x in lanID if x.isalpha()]
127	if len(alphaOnly) > 10:
128	## I'm too big to be a true LAN ID
129	self.falsePositiveLanIdAddressesSet.add(lanID)
130	else:
131	self.trueLanIdAddressesSet.add(lanID)
132	lanIDTestResult = True
133	return lanIDTestResult
134
135
136	def WriteLogFile(self, setOfValues, outputFilePath):
137	"""Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
138	fileNameInc = 0
139	while os.path.isfile(outputFilePath):
140	fileNameInc +=1
141	outputFile, extension = os.path.splitext(outputFilePath)
142	outputFilePath = outputFile + str(fileNameInc) + extension
143	outFl = open(outputFilePath,'w')
144	tempList = list(setOfValues)
145	tempList.sort()
146	for i in tempList:
147	outFl.write(f"{i}\n")
148	outFl.close()
149
150	if __name__ == '__main__':
151
152	tsa = TopSendersAnalyzer()
153	tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True, writeAllPossibleEmailAddressesLogFile = True)
154
155
156	#print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
157	#outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
158	#allToLanAddressesList = list(allToLanAddressesSet)
159	#allToLanAddressesList.sort()
160	#for i in allToLanAddressesList:
161	# outputFile.write(f"{i}\n")
162	#outputFile.close()
163
164	## Initially gathering some very basic information across the CSV files, not using csv lib
165	# for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
166	# for fl in files:
167	# contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
168	# print(f"{fl}\|{len(contents)-1}")