ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
Revision: 801
Committed: Thu Sep 28 13:37:57 2023 UTC (2 years, 5 months ago) by nino.borges
Content type: text/x-python
File size: 6629 byte(s)
Log Message:
Converted to a class for better reusability and commenting.

File Contents

# Content
1 """
2
3 NTRS-TopSenderAnalysis
4
5 Created by:
6 Emanuel Borges
7 09.20.2023
8
9 Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10 To-Do: Add a method to QC for parsing errors; every CSV should have the same number of fields.
11
12 """
13
14 import csv, os, re
15
class TopSendersAnalyzer(object):
    """Walk a directory tree of CSV exports and bucket every row by its 'To' field.

    For each file found under ``startDir`` one pipe-delimited summary line is
    printed, with these fields in order:

        file name
        total row count
        rows whose 'To' value is exactly 80 characters long
        rows whose 'To' value is any other length
        True if at least one 'To' value is longer than 80 characters
        rows whose 'To' value does not contain @NTRS.COM
        rows with @NTRS.COM and at least one true LAN ID
        rows with @NTRS.COM but no true LAN ID

    Every address matched by ``regExPattern`` is collected in either
    ``trueLanIdAddressesSet`` or ``falsePositiveLanIdAddressesSet``; both sets
    can be written to log files at the end of a run.
    """
    version = "0.03"

    def __init__(self, startDir=None, csvEncoding='ANSI'):
        """Set up paths, result sets and the LAN ID pattern.

        Args:
            startDir: Root directory that is walked for CSV files. Defaults to
                the case folder the script was originally written for.
            csvEncoding: Encoding used to read the CSV files. The default,
                'ANSI', is kept from the original script; that codec name
                only resolves on Windows builds of Python, so pass an explicit
                encoding (for example 'cp1252') when running elsewhere.
        """
        if startDir is None:
            startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
        self.startDir = startDir
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"
        self.csvEncoding = csvEncoding

        ## All email addresses with an @NTRS.COM domain.  Currently unsupported.
        #self.allToNtrsAddressesSet = set()
        #self.allToNtrsAddressesOutputFileName = r""

        ## All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"

        ## False positive NTRS LAN ID matches, per specification provided to me.  Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"

        #self.regExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
        ## The dot is escaped so that only a literal "@NTRS.COM" matches
        ## (an unescaped dot would also accept e.g. "@NTRSXCOM").
        self.regExPattern = r'[A-Za-z]{1,15}[0-9]{1,3}@NTRS\.COM'

    def AnalyzeTopSenders(self, writeTrueLanIDLogFile=False, writeFalsePositiveLanIDLogFile=False):
        """Main method: analyze every file under ``startDir`` and print one summary line per file.

        Args:
            writeTrueLanIDLogFile: When True, write the collected true LAN ID
                addresses to ``trueLanIdAddressesOutputFileName`` after the walk.
            writeFalsePositiveLanIDLogFile: When True, write the collected
                false-positive addresses to
                ``falsePositiveLanIdAddressesOutputFileName`` after the walk.

        Raises:
            KeyError: If a file has no 'To' column in its header row.
        """
        ## Compile once; the same pattern is applied to every row of every file.
        lanIdRegEx = re.compile(self.regExPattern)
        for (root, dirs, files) in os.walk(self.startDir):
            for fl in files:
                (fileRowCount, eightyCharRowCount, nonEightyCharRowCount, charsOverEighty,
                 noNtrsDomainBucketCount, ntrsAndLanBucketCount, ntrsNoLanBucketCount) = \
                    self._AnalyzeFile(os.path.join(root, fl), lanIdRegEx)
                print(f"{fl}|{fileRowCount}|{eightyCharRowCount}|{nonEightyCharRowCount}|{charsOverEighty}|{noNtrsDomainBucketCount}|{ntrsAndLanBucketCount}|{ntrsNoLanBucketCount}")
        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")

    def _AnalyzeFile(self, filePath, lanIdRegEx):
        """Read one CSV file and return its seven statistics as a tuple, in report order."""
        fileRowCount = 0
        eightyCharRowCount = 0
        nonEightyCharRowCount = 0
        charsOverEighty = False
        noNtrsDomainBucketCount = 0
        ntrsAndLanBucketCount = 0
        ntrsNoLanBucketCount = 0

        ## newline='' lets the csv module do its own line-ending handling, as
        ## its documentation requires.  The with-block closes the file.
        with open(filePath, mode='r', encoding=self.csvEncoding, newline='') as csv_file:
            for row in csv.DictReader(csv_file):
                fileRowCount += 1
                ## DictReader fills fields missing from a short row with None;
                ## treat that as an empty 'To' value instead of crashing on len().
                toValue = row['To'] or ''
                toLength = len(toValue)
                if toLength == 80:
                    eightyCharRowCount += 1
                else:
                    nonEightyCharRowCount += 1
                    if toLength > 80:
                        charsOverEighty = True
                toValue = toValue.upper()
                if "@NTRS.COM" not in toValue:
                    ## No ntrs addresses found at all.
                    noNtrsDomainBucketCount += 1
                elif self.LanIDTrueTest(lanIdRegEx.findall(toValue)):
                    ## At least 1 LAN ID was found, using the True Test.
                    ntrsAndLanBucketCount += 1
                else:
                    ## Not 1 true LAN ID was found, using the True Test.
                    ntrsNoLanBucketCount += 1

        return (fileRowCount, eightyCharRowCount, nonEightyCharRowCount, charsOverEighty,
                noNtrsDomainBucketCount, ntrsAndLanBucketCount, ntrsNoLanBucketCount)

    def LanIDTrueTest(self, listOfIds):
        """Classify each candidate address and report whether any is a true LAN ID.

        A candidate with more than 10 alphabetic characters is added to
        ``falsePositiveLanIdAddressesSet``; any other candidate is added to
        ``trueLanIdAddressesSet``.

        Args:
            listOfIds: Iterable of candidate address strings.

        Returns:
            True if at least one candidate was classified as a true LAN ID,
            otherwise False (including for an empty list).
        """
        lanIDTestResult = False
        for lanID in listOfIds:
            ## Letters are counted over the whole string that was passed in.
            ## NOTE(review): for matches of regExPattern this includes the 7
            ## letters of "NTRS.COM", so the effective limit on the part before
            ## the "@" is 3 letters -- confirm this matches the specification.
            alphaCount = sum(1 for ch in lanID if ch.isalpha())
            if alphaCount > 10:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult

    def WriteLogFile(self, setOfValues, outputFilePath):
        """Write the values, sorted, one per line, without overwriting an existing file.

        If ``outputFilePath`` already exists, an increasing number is inserted
        before the extension (name1.txt, name2.txt, ...) until an unused name
        is found.

        Args:
            setOfValues: Iterable of sortable values to write.
            outputFilePath: Preferred output path.

        Returns:
            The path that was actually written.
        """
        ## Split once, up front: splitting the already-suffixed path on every
        ## pass would pile the counters up (name1, name12, name123, ...).
        baseName, extension = os.path.splitext(outputFilePath)
        fileNameInc = 0
        while os.path.isfile(outputFilePath):
            fileNameInc += 1
            outputFilePath = f"{baseName}{fileNameInc}{extension}"
        with open(outputFilePath, 'w') as outFl:
            outFl.writelines(f"{value}\n" for value in sorted(setOfValues))
        return outputFilePath
117
if __name__ == '__main__':
    # Script entry point: run the analysis with the default settings and
    # write both the true and the false-positive LAN ID log files.
    TopSendersAnalyzer().AnalyzeTopSenders(
        writeTrueLanIDLogFile=True,
        writeFalsePositiveLanIDLogFile=True,
    )
122
123
124 #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
125 #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
126 #allToLanAddressesList = list(allToLanAddressesSet)
127 #allToLanAddressesList.sort()
128 #for i in allToLanAddressesList:
129 # outputFile.write(f"{i}\n")
130 #outputFile.close()
131
132 ## Initially gathering some very basic information across the CSV files, not using csv lib
133 # for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
134 # for fl in files:
135 # contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
136 # print(f"{fl}|{len(contents)-1}")