# Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py

"""

NTRS-TopSenderAnalysis

Created by:
Emanuel Borges
09.20.2023

Very simple program that reads multiple CSV files and exports a report based on LAN ID and other general information.
To-Do: Add a method to QC for parsing errors. There should be the same number of fields across all of the CSVs.

"""

import csv, os, re

class TopSendersAnalyzer(object):
    """Scan a directory tree of CSV files and report NTRS LAN ID statistics.

    Every file found under ``startDir`` is read with ``csv.DictReader`` and its
    ``To`` column is inspected. One pipe-delimited summary line is printed per
    file, and every address matched by ``regExPattern`` is collected into
    either the "true" or the "false positive" LAN ID set. Both sets can be
    written to sorted text files once the scan is complete.
    """

    version = "0.03"

    def __init__(self, startDir=None, trueLanIdAddressesOutputFileName=None,
                 falsePositiveLanIdAddressesOutputFileName=None, encoding=None):
        """Set up the scan location, output paths and the LAN ID pattern.

        Args:
            startDir: Directory that is walked recursively. Defaults to the
                original hard-coded case folder.
            trueLanIdAddressesOutputFileName: Path of the log file for true
                LAN ID matches. Defaults to the original hard-coded path.
            falsePositiveLanIdAddressesOutputFileName: Path of the log file for
                false-positive matches. Defaults to the original hard-coded path.
            encoding: Text encoding used to read the CSV files. When omitted,
                "ANSI" is used on Windows and "cp1252" everywhere else.
        """
        if startDir is None:
            startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
        self.startDir = startDir

        if encoding is None:
            # "ANSI" is an alias of the "mbcs" codec, which only exists on
            # Windows; cp1252 is used elsewhere so the lookup does not fail.
            encoding = "ANSI" if os.name == "nt" else "cp1252"
        self.encoding = encoding

        ##  Collecting every @NTRS.COM address (not only LAN IDs) is currently unsupported.

        ##  All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        if trueLanIdAddressesOutputFileName is None:
            trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
        self.trueLanIdAddressesOutputFileName = trueLanIdAddressesOutputFileName

        ##  False positive NTRS LAN ID matches, per specification provided to me.  Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        if falsePositiveLanIdAddressesOutputFileName is None:
            falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
        self.falsePositiveLanIdAddressesOutputFileName = falsePositiveLanIdAddressesOutputFileName

        ##  1-15 letters, then 1-3 digits, then the literal domain. The dot is
        ##  escaped so that it only matches a real "." and not any character.
        ##  An earlier, stricter form used exactly 2 letters and 2-3 digits.
        self.regExPattern = r'[A-Za-z]{1,15}[0-9]{1,3}@NTRS\.COM'

    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False):
        """Scan every file under ``startDir`` and print one summary line per file.

        Each printed line is pipe-delimited and holds, in order: file name,
        row count, rows whose ``To`` value is exactly 80 characters, rows whose
        ``To`` value is not 80 characters, whether any ``To`` value exceeded 80
        characters, rows without ``@NTRS.COM``, rows with at least one true
        LAN ID, and rows with ``@NTRS.COM`` but no true LAN ID.

        Args:
            writeTrueLanIDLogFile: When true, write the collected true LAN ID
                addresses to ``trueLanIdAddressesOutputFileName`` afterwards.
            writeFalsePositiveLanIDLogFile: When true, write the collected
                false-positive addresses to
                ``falsePositiveLanIdAddressesOutputFileName`` afterwards.

        Raises:
            KeyError: If a file has data rows but no ``To`` column.
        """
        for root, _dirs, files in os.walk(self.startDir):
            for fl in files:
                fileStats = self._AnalyzeFile(os.path.join(root, fl))
                print("|".join(str(value) for value in (fl,) + fileStats))
        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")

    def _AnalyzeFile(self, filePath):
        """Read one CSV file and return its counters in report-column order."""
        fileRowCount = 0
        eightyCharRowCount = 0
        nonEightyCharRowCount = 0
        charsOverEighty = False
        noNtrsDomainBucketCount = 0
        ntrsAndLanBucketCount = 0
        ntrsNoLanBucketCount = 0

        # newline='' leaves line-ending handling to the csv module, as its
        # documentation requires for files passed to a reader.
        with open(filePath, mode='r', encoding=self.encoding, newline='') as csv_file:
            for row in csv.DictReader(csv_file):
                fileRowCount += 1
                # A row with fewer fields than the header yields None here;
                # count it as an empty value instead of aborting the whole run.
                toValue = row['To'] or ''
                # Measure before upper-casing, which can change the length.
                toLength = len(toValue)
                if toLength == 80:
                    eightyCharRowCount += 1
                else:
                    nonEightyCharRowCount += 1
                    if toLength > 80:
                        charsOverEighty = True
                toValue = toValue.upper()
                if "@NTRS.COM" not in toValue:
                    ## No ntrs addresses found at all.
                    noNtrsDomainBucketCount += 1
                    continue
                ## The domain was found. Apply next test.
                ntrsLanAddressesList = re.findall(self.regExPattern, toValue)
                if self.LanIDTrueTest(ntrsLanAddressesList):
                    ## At least 1 LAN ID was found, using the True Test
                    ntrsAndLanBucketCount += 1
                else:
                    ## Not 1 true LAN ID was found, using the True Test
                    ntrsNoLanBucketCount += 1

        return (fileRowCount, eightyCharRowCount, nonEightyCharRowCount, charsOverEighty,
                noNtrsDomainBucketCount, ntrsAndLanBucketCount, ntrsNoLanBucketCount)

    def LanIDTrueTest(self, listOfIds):
        """Classify candidate addresses and report whether any is a true LAN ID.

        Each entry with more than 10 alphabetic characters is added to the
        false-positive set; every other entry is added to the true LAN ID set.

        The letters are counted over the whole string that is passed in. For a
        regex match such as ``AB12@NTRS.COM`` that includes the seven letters
        of ``NTRS`` and ``COM``, so at most three letters may precede the digits.
        NOTE(review): confirm that this effective three-letter limit is the
        intended specification.

        Args:
            listOfIds: Iterable of candidate address strings.

        Returns:
            True if at least one entry was classified as a true LAN ID,
            otherwise False (including for an empty input).
        """
        lanIDTestResult = False
        for lanID in listOfIds:
            alphaCount = sum(1 for ch in lanID if ch.isalpha())
            if alphaCount > 10:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult

    def WriteLogFile(self, setOfValues, outputFilePath):
        """Write the values, sorted, one per line, without overwriting a file.

        If ``outputFilePath`` already exists, a counter is inserted before the
        extension (``name1.ext``, ``name2.ext``, ...) until an unused name is
        found.

        Args:
            setOfValues: Collection of values to write.
            outputFilePath: Preferred path of the output file.
        """
        # Split once so the counter is appended to the original base name
        # rather than to a name that already carries an earlier counter.
        baseName, extension = os.path.splitext(outputFilePath)
        fileNameInc = 0
        while os.path.isfile(outputFilePath):
            fileNameInc += 1
            outputFilePath = f"{baseName}{fileNameInc}{extension}"
        with open(outputFilePath, 'w') as outFl:
            outFl.writelines(f"{value}\n" for value in sorted(setOfValues))

if __name__ == '__main__':
    # Script entry point: scan the default start directory, print the per-file
    # report, then write both the true and the false-positive LAN ID log files.
    analyzer = TopSendersAnalyzer()
    analyzer.AnalyzeTopSenders(
        writeTrueLanIDLogFile=True,
        writeFalsePositiveLanIDLogFile=True,
    )


    #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
    #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
    #allToLanAddressesList = list(allToLanAddressesSet)
    #allToLanAddressesList.sort()
    #for i in allToLanAddressesList:
    #    outputFile.write(f"{i}\n")
    #outputFile.close()

    ## Initially gathering some very basic information across the CSV files, not using csv lib
#    for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
#        for fl in files:
#            contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
#            print(f"{fl}|{len(contents)-1}")
Revision:	801
Committed:	Thu Sep 28 13:37:57 2023 UTC (2 years, 5 months ago) by nino.borges
Content type:	text/x-python
File size:	6629 byte(s)
Log Message:	Converted to a class for better reusability and commenting.
#	Content
1	"""
2
3	NTRS-TopSenderAnalysis
4
5	Created by:
6	Emanuel Borges
7	09.20.2023
8
9	Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10	To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11
12	"""
13
14	import csv, os, re
15
16	class TopSendersAnalyzer(object):
17	version = "0.03"
18
19	def __init__(self):
20	self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
21	#self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
22
23	## All email addresses with an @NTRS.COM domain. Currently unsupported.
24	#self.allToNtrsAddressesSet = set()
25	#self.allToNtrsAddressesOutputFileName = r""
26
27	## All true NTRS LAN ID matches, per specification provided to me.
28	self.trueLanIdAddressesSet = set()
29	self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"
30
31	## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
32	self.falsePositiveLanIdAddressesSet = set()
33	self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"
34
35	#self.regExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
36	self.regExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
37
38
39	def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False):
40	"""Main Method in this program"""
41	for (root,dirs,files) in os.walk(self.startDir):
42	for fl in files:
43	fileRowCount = 0
44	eightyCharRowCount = 0
45	nonEightyCharRowCount = 0
46	charsOverEighty = False
47	noNtrsDomainBucketCount = 0
48	ntrsAndLanBucketCount = 0
49	ntrsNoLanBucketCount = 0
50
51	with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
52	csv_reader = csv.DictReader(csv_file)
53	for row in csv_reader:
54	fileRowCount += 1
55	if len(row['To']) == 80:
56	eightyCharRowCount +=1
57	else:
58	nonEightyCharRowCount +=1
59	if len(row['To']) > 80:
60	charsOverEighty = True
61	toValue = row['To']
62	toValue = toValue.upper()
63	if "@NTRS.COM" in toValue:
64	## The domain was found. Apply next test.
65	ntrsLanAddressesList = re.findall(self.regExPattern, toValue)
66	ntrsLanIDTrueTestResult = self.LanIDTrueTest(ntrsLanAddressesList)
67	if ntrsLanIDTrueTestResult:
68	## At least 1 LAN ID was found, using the True Test
69	ntrsAndLanBucketCount +=1
70	#for a in ntrsLanAddressesList:
71	# allToLanAddressesSet.add(a)
72	else:
73	## Not 1 true LAN ID was found, using the True Test
74	ntrsNoLanBucketCount +=1
75	else:
76	## No ntrs addresses found at all,
77	noNtrsDomainBucketCount +=1
78	print(f"{fl}\|{fileRowCount}\|{eightyCharRowCount}\|{nonEightyCharRowCount}\|{charsOverEighty}\|{noNtrsDomainBucketCount}\|{ntrsAndLanBucketCount}\|{ntrsNoLanBucketCount}")
79	csv_file.close()
80	if writeTrueLanIDLogFile:
81	print("Writing the True LAN ID log file...")
82	self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
83	print("Done.\n")
84	if writeFalsePositiveLanIDLogFile:
85	print("Writing the False-Positive LAN ID log file...")
86	self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
87	print("Done.\n")
88
89
90	def LanIDTrueTest(self, listOfIds):
91	"""A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
92	lanIDTestResult = False
93	for lanID in listOfIds:
94	alphaOnly = [x.lower() for x in lanID if x.isalpha()]
95	if len(alphaOnly) > 10:
96	## I'm too big to be a true LAN ID
97	self.falsePositiveLanIdAddressesSet.add(lanID)
98	else:
99	self.trueLanIdAddressesSet.add(lanID)
100	lanIDTestResult = True
101	return lanIDTestResult
102
103
104	def WriteLogFile(self, setOfValues, outputFilePath):
105	"""Takes a Set containing values, sorts these, and then writes them to the given outputFilePath)"""
106	fileNameInc = 0
107	while os.path.isfile(outputFilePath):
108	fileNameInc +=1
109	outputFile, extension = os.path.splitext(outputFilePath)
110	outputFilePath = outputFile + str(fileNameInc) + extension
111	outFl = open(outputFilePath,'w')
112	tempList = list(setOfValues)
113	tempList.sort()
114	for i in tempList:
115	outFl.write(f"{i}\n")
116	outFl.close()
117
118	if __name__ == '__main__':
119
120	tsa = TopSendersAnalyzer()
121	tsa.AnalyzeTopSenders(writeTrueLanIDLogFile = True, writeFalsePositiveLanIDLogFile = True)
122
123
124	#print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
125	#outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
126	#allToLanAddressesList = list(allToLanAddressesSet)
127	#allToLanAddressesList.sort()
128	#for i in allToLanAddressesList:
129	# outputFile.write(f"{i}\n")
130	#outputFile.close()
131
132	## Initially gathering some very basic information across the CSV files, not using csv lib
133	# for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
134	# for fl in files:
135	# contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
136	# print(f"{fl}\|{len(contents)-1}")