Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py

"""

NTRS-TopSenderAnalysis

Created by:
Emanuel Borges
09.20.2023

Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
To-Do: Add a method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.

"""

import csv, os, re

def LanIDTrueTest(listOfIds):
    """Report whether the list holds at least one plausible LAN ID.

    Each entry is judged by how many alphabetic characters it contains.
    The count covers the whole string exactly as passed in, so letters in
    a domain suffix (if the caller includes one) are counted too.

    An entry with more than 10 letters is considered too long to be a
    real LAN ID; it is printed to stdout and otherwise ignored. Any entry
    with 10 letters or fewer makes the overall result True.

    Returns False for an empty list or when every entry is too long.
    """
    foundTrueLanID = False
    for candidate in listOfIds:
        letterCount = sum(1 for ch in candidate if ch.isalpha())
        if letterCount <= 10:
            foundTrueLanID = True
            continue
        # Too many letters to be a true LAN ID: echo it for manual review.
        print(candidate)
    return foundTrueLanID
        

if __name__ == '__main__':
    # Walks startDir, parses every file found as a CSV with a 'To' column,
    # and prints one pipe-delimited summary line per file:
    #   file name | row count | rows with 80-char 'To' | rows with any other
    #   'To' length | whether any 'To' exceeded 80 chars | rows without
    #   @NTRS.COM | rows with a true LAN ID | rows with @NTRS.COM but no
    #   true LAN ID
    startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
    #startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-07-11 FileNet Messages Delete Project - Top sender analysis\documents\ERM_Notifications"
    # Only referenced by the commented-out address export code.
    allToLanAddressesSet = set()
    #regExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
    # Raw string with the dot escaped so only a literal "@NTRS.COM" matches
    # (an unescaped '.' would accept any character in that position).
    regExPattern = r'[A-Za-z]{1,15}[0-9]{1,3}@NTRS\.COM'
    # Compiled once here instead of being re-resolved for every row.
    lanAddressRegex = re.compile(regExPattern)

    for (root, dirs, files) in os.walk(startDir):
        for fl in files:
            fileRowCount = 0
            eightyCharRowCount = 0
            nonEightyCharRowCount = 0
            charsOverEighty = False
            noNtrsDomainBucketCount = 0
            ntrsAndLanBucketCount = 0
            ntrsNoLanBucketCount = 0

            # newline='' lets the csv module handle line endings itself, so
            # quoted fields containing line breaks are parsed correctly.
            # NOTE(review): 'ANSI' appears to be a Windows-only codec alias -
            # confirm before running this on another platform.
            with open(os.path.join(root, fl), mode='r', encoding='ANSI', newline='') as csv_file:
                csv_reader = csv.DictReader(csv_file)
                for row in csv_reader:
                    fileRowCount += 1
                    # NOTE(review): a file without a 'To' header raises
                    # KeyError here, and a row shorter than the header gives
                    # None for 'To' (TypeError on len) - both are left as hard
                    # failures so parsing problems are not silently hidden.
                    toLength = len(row['To'])
                    if toLength == 80:
                        eightyCharRowCount += 1
                    else:
                        nonEightyCharRowCount += 1
                    if toLength > 80:
                        charsOverEighty = True
                    # Upper-case so the domain and regex checks are
                    # effectively case-insensitive.
                    toValue = row['To'].upper()
                    if "@NTRS.COM" in toValue:
                        ## The domain was found. Apply next test.
                        ntrsLanAddressesList = lanAddressRegex.findall(toValue)
                        ntrsLanIDTrueTestResult = LanIDTrueTest(ntrsLanAddressesList)
                        if ntrsLanIDTrueTestResult:
                            ## At least 1 LAN ID was found, using the True Test
                            ntrsAndLanBucketCount += 1
                            #for a in ntrsLanAddressesList:
                            #    allToLanAddressesSet.add(a)
                        else:
                            ## Not 1 true LAN ID was found, using the True Test
                            ntrsNoLanBucketCount += 1
                    else:
                        ## No ntrs addresses found at all,
                        noNtrsDomainBucketCount += 1
            # The with-block has already closed the file; no explicit close needed.
            print(f"{fl}|{fileRowCount}|{eightyCharRowCount}|{nonEightyCharRowCount}|{charsOverEighty}|{noNtrsDomainBucketCount}|{ntrsAndLanBucketCount}|{ntrsNoLanBucketCount}")
    #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
    #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
    #allToLanAddressesList = list(allToLanAddressesSet)
    #allToLanAddressesList.sort()
    #for i in allToLanAddressesList:
    #    outputFile.write(f"{i}\n")
    #outputFile.close()

    ## Initially gathering some very basic information across the CSV files, not using csv lib
#    for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
#        for fl in files:
#            contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
#            print(f"{fl}|{len(contents)-1}")
Revision:	800
Committed:	Mon Sep 25 18:17:43 2023 UTC (2 years, 6 months ago) by nino.borges
Content type:	text/x-python
File size:	4079 byte(s)
Log Message:	This version with a working TrueTest method which goes one step further than the RegEx.
#	Content
1	"""
2
3	NTRS-TopSenderAnalysis
4
5	Created by:
6	Emanuel Borges
7	09.20.2023
8
9	Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10	To-Do: Method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.;
11
12	"""
13
14	import csv, os, re
15
16	def LanIDTrueTest(listOfIds):
17	"""a need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list."""
18	lanIDTestResult = False
19	for lanID in listOfIds:
20	alphaOnly = [x.lower() for x in lanID if x.isalpha()]
21	if len(alphaOnly) > 10:
22	## I'm too big to be a true LAN ID
23	print(lanID)
24	else:
25	lanIDTestResult = True
26	return lanIDTestResult
27
28
29	if __name__ == '__main__':
30	startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
31	#startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-07-11 FileNet Messages Delete Project - Top sender analysis\documents\ERM_Notifications"
32	allToLanAddressesSet = set()
33	#regExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
34	regExPattern = '[A-Za-z]{1,15}[0-9]{1,3}@NTRS.COM'
35
36
37
38	for (root,dirs,files) in os.walk(startDir):
39	for fl in files:
40	fileRowCount = 0
41	eightyCharRowCount = 0
42	nonEightyCharRowCount = 0
43	charsOverEighty = False
44	noNtrsDomainBucketCount = 0
45	ntrsAndLanBucketCount = 0
46	ntrsNoLanBucketCount = 0
47
48	with open(os.path.join(root,fl),mode='r',encoding='ANSI') as csv_file:
49	csv_reader = csv.DictReader(csv_file)
50	for row in csv_reader:
51	fileRowCount += 1
52	if len(row['To']) == 80:
53	eightyCharRowCount +=1
54	else:
55	nonEightyCharRowCount +=1
56	if len(row['To']) > 80:
57	charsOverEighty = True
58	toValue = row['To']
59	toValue = toValue.upper()
60	if "@NTRS.COM" in toValue:
61	## The domain was found. Apply next test.
62	ntrsLanAddressesList = re.findall(regExPattern, toValue)
63	ntrsLanIDTrueTestResult = LanIDTrueTest(ntrsLanAddressesList)
64	if ntrsLanIDTrueTestResult:
65	## At least 1 LAN ID was found, using the True Test
66	ntrsAndLanBucketCount +=1
67	#for a in ntrsLanAddressesList:
68	# allToLanAddressesSet.add(a)
69	else:
70	## Not 1 true LAN ID was found, using the True Test
71	ntrsNoLanBucketCount +=1
72	else:
73	## No ntrs addresses found at all,
74	noNtrsDomainBucketCount +=1
75	print(f"{fl}\|{fileRowCount}\|{eightyCharRowCount}\|{nonEightyCharRowCount}\|{charsOverEighty}\|{noNtrsDomainBucketCount}\|{ntrsAndLanBucketCount}\|{ntrsNoLanBucketCount}")
76	csv_file.close()
77	#print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
78	#outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
79	#allToLanAddressesList = list(allToLanAddressesSet)
80	#allToLanAddressesList.sort()
81	#for i in allToLanAddressesList:
82	# outputFile.write(f"{i}\n")
83	#outputFile.close()
84
85	## Initially gathering some very basic information across the CSV files, not using csv lib
86	# for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
87	# for fl in files:
88	# contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
89	# print(f"{fl}\|{len(contents)-1}")