ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/NTRS-TopSenderAnalysis.py
Revision: 801
Committed: Thu Sep 28 13:37:57 2023 UTC (2 years, 5 months ago) by nino.borges
Content type: text/x-python
File size: 6629 byte(s)
Log Message:
Converted to a class for better reusability and commenting.

File Contents

# User Rev Content
1 nino.borges 797 """
2    
3     NTRS-TopSenderAnalysis
4    
5     Created by:
6     Emanuel Borges
7     09.20.2023
8    
9     Very simple program that will read multiple CSV files and export a report based on LanID and other general information.
10 nino.borges 798 To-Do: Add a method to QC for any parsing errors. There should be the same number of fields across all of the CSVs.
11 nino.borges 797
12     """
13    
14 nino.borges 798 import csv, os, re
15 nino.borges 797
class TopSendersAnalyzer(object):
    """Walks a directory tree of CSV files and buckets every row by its 'To' column.

    For each file found under ``startDir`` one pipe-delimited report line is printed:

        file name | row count | rows whose 'To' is exactly 80 chars | rows whose 'To' is not 80 chars |
        any 'To' longer than 80 chars | rows without @NTRS.COM | rows with a true LAN ID | rows with
        @NTRS.COM but no true LAN ID

    Every address matched by ``regExPattern`` is also collected into either
    ``trueLanIdAddressesSet`` or ``falsePositiveLanIdAddressesSet`` (see LanIDTrueTest),
    and those sets can optionally be written out as sorted text files.
    """
    version = "0.03"

    ## Largest number of alphabetic characters a matched address may contain and still be a true LAN ID.
    ## NOTE(review): the count is taken over the WHOLE matched address, so the 7 letters of "NTRS" + "COM"
    ## are included and the local part is effectively limited to 3 letters. Confirm this is the intended spec.
    maxLanIdAlphaChars = 10

    def __init__(self):
        self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req"
        #self.startDir = r"C:\Users\eborges\Documents\Cases\Northern Trust\20230919 - FileNetTopSenderAnalysis-Req\_LocalVersion\2023-08-14 FileNet Messages Delete Project - Top sender analysis\8_14_2023\xact_report_Dec_2014_CSV"

        ## All email addresses with an @NTRS.COM domain. Currently unsupported.
        #self.allToNtrsAddressesSet = set()
        #self.allToNtrsAddressesOutputFileName = r""

        ## All true NTRS LAN ID matches, per specification provided to me.
        self.trueLanIdAddressesSet = set()
        self.trueLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_TRUE_LanAddresses.txt"

        ## False positive NTRS LAN ID matches, per specification provided to me. Close but just outside of specification. (for analysis)
        self.falsePositiveLanIdAddressesSet = set()
        self.falsePositiveLanIdAddressesOutputFileName = r"C:\Users\eborges\Documents\Cases\Northern Trust\Extracted_FALSE-POSTIVE_LanAddresses.txt"

        #self.regExPattern = '[A-Za-z]{2}[0-9]{2,3}@NTRS.COM'
        ## Raw string with the dot escaped, so that only a literal "@NTRS.COM" matches (an unescaped
        ## dot would also accept things like "@NTRSXCOM").
        self.regExPattern = r'[A-Za-z]{1,15}[0-9]{1,3}@NTRS\.COM'

        ## Encoding used to read the CSV files. "ANSI" is only registered as a codec on Windows,
        ## so fall back to cp1252 on other platforms instead of failing on open().
        self.csvEncoding = self._ResolveCsvEncoding("ANSI", "cp1252")


    @staticmethod
    def _ResolveCsvEncoding(preferred, fallback):
        """Returns preferred if Python knows that codec on this platform, otherwise fallback."""
        import codecs
        try:
            codecs.lookup(preferred)
        except LookupError:
            return fallback
        return preferred


    def AnalyzeTopSenders(self, writeTrueLanIDLogFile = False, writeFalsePositiveLanIDLogFile = False):
        """Main Method in this program.

        Walks self.startDir, prints one pipe-delimited report line per file and fills the
        true / false-positive LAN ID sets as a side effect.

        writeTrueLanIDLogFile -- when True, writes trueLanIdAddressesSet to trueLanIdAddressesOutputFileName.
        writeFalsePositiveLanIDLogFile -- when True, writes falsePositiveLanIdAddressesSet to
                                          falsePositiveLanIdAddressesOutputFileName.

        Raises KeyError if a file has no 'To' column in its header row.
        """
        ## Compile once; the same pattern is applied to every row of every file.
        lanIdPattern = re.compile(self.regExPattern)

        for (root,dirs,files) in os.walk(self.startDir):
            for fl in files:
                (fileRowCount, eightyCharRowCount, nonEightyCharRowCount, charsOverEighty,
                 noNtrsDomainBucketCount, ntrsAndLanBucketCount, ntrsNoLanBucketCount) = self._AnalyzeFile(os.path.join(root,fl), lanIdPattern)
                print(f"{fl}|{fileRowCount}|{eightyCharRowCount}|{nonEightyCharRowCount}|{charsOverEighty}|{noNtrsDomainBucketCount}|{ntrsAndLanBucketCount}|{ntrsNoLanBucketCount}")

        if writeTrueLanIDLogFile:
            print("Writing the True LAN ID log file...")
            self.WriteLogFile(self.trueLanIdAddressesSet, self.trueLanIdAddressesOutputFileName)
            print("Done.\n")
        if writeFalsePositiveLanIDLogFile:
            print("Writing the False-Positive LAN ID log file...")
            self.WriteLogFile(self.falsePositiveLanIdAddressesSet, self.falsePositiveLanIdAddressesOutputFileName)
            print("Done.\n")


    def _AnalyzeFile(self, csvPath, lanIdPattern):
        """Reads one CSV file and returns its counters as a tuple, in report-column order:
        (rows, 80-char rows, non-80-char rows, any row over 80 chars, no-NTRS rows, NTRS+LAN rows, NTRS-no-LAN rows)."""
        fileRowCount = 0
        eightyCharRowCount = 0
        nonEightyCharRowCount = 0
        charsOverEighty = False
        noNtrsDomainBucketCount = 0
        ntrsAndLanBucketCount = 0
        ntrsNoLanBucketCount = 0

        ## newline='' is what the csv module asks for, so it can handle line endings inside quoted fields itself.
        with open(csvPath, mode='r', encoding=self.csvEncoding, newline='') as csv_file:
            for row in csv.DictReader(csv_file):
                fileRowCount += 1
                ## DictReader fills fields missing from a short row with None; treat that as an empty To value.
                rawTo = row['To'] or ""
                toLength = len(rawTo)
                if toLength == 80:
                    eightyCharRowCount += 1
                else:
                    nonEightyCharRowCount += 1
                    if toLength > 80:
                        charsOverEighty = True

                toValue = rawTo.upper()
                if "@NTRS.COM" not in toValue:
                    ## No ntrs addresses found at all,
                    noNtrsDomainBucketCount += 1
                elif self.LanIDTrueTest(lanIdPattern.findall(toValue)):
                    ## At least 1 LAN ID was found, using the True Test
                    ntrsAndLanBucketCount += 1
                else:
                    ## Not 1 true LAN ID was found, using the True Test
                    ntrsNoLanBucketCount += 1

        return (fileRowCount, eightyCharRowCount, nonEightyCharRowCount, charsOverEighty,
                noNtrsDomainBucketCount, ntrsAndLanBucketCount, ntrsNoLanBucketCount)


    def LanIDTrueTest(self, listOfIds):
        """A need for a more complicated LAN ID test was needed. Returns True if at least 1 true LAN ID is found in the list.

        Every entry of listOfIds is added to trueLanIdAddressesSet or falsePositiveLanIdAddressesSet
        as a side effect, so the whole list is always processed (no early exit).
        """
        lanIDTestResult = False
        for lanID in listOfIds:
            ## Counts the letters of the full matched address, domain letters included.
            alphaCount = sum(1 for ch in lanID if ch.isalpha())
            if alphaCount > self.maxLanIdAlphaChars:
                ## I'm too big to be a true LAN ID
                self.falsePositiveLanIdAddressesSet.add(lanID)
            else:
                self.trueLanIdAddressesSet.add(lanID)
                lanIDTestResult = True
        return lanIDTestResult


    def WriteLogFile(self, setOfValues, outputFilePath):
        """Takes a Set containing values, sorts these, and then writes them (one per line) to the given outputFilePath.

        An existing file is never overwritten: if outputFilePath exists, the first free name of the
        form <name>1<ext>, <name>2<ext>, ... is used instead. Returns the path that was actually written.
        """
        ## Split once, so the counter replaces the previous suffix rather than being appended to it
        ## (which would yield name1, name12, name123, ...).
        baseName, extension = os.path.splitext(outputFilePath)
        targetPath = outputFilePath
        fileNameInc = 0
        while os.path.isfile(targetPath):
            fileNameInc += 1
            targetPath = f"{baseName}{fileNameInc}{extension}"

        with open(targetPath,'w') as outFl:
            outFl.writelines(f"{i}\n" for i in sorted(setOfValues))
        return targetPath
117    
if __name__ == '__main__':
    ## Script entry point: run the analysis over the default start directory
    ## and write out both the true and the false-positive LAN ID log files.
    analyzer = TopSendersAnalyzer()
    analyzer.AnalyzeTopSenders(
        writeTrueLanIDLogFile = True,
        writeFalsePositiveLanIDLogFile = True,
    )
122 nino.borges 797
123 nino.borges 801
124 nino.borges 800 #print(f"There are {len(allToLanAddressesSet)} unique LAN ID addresses.")
125     #outputFile = open(r"C:\Users\eborges\Documents\Cases\Northern Trust\ExtractedLanAddresses.txt",'w')
126     #allToLanAddressesList = list(allToLanAddressesSet)
127     #allToLanAddressesList.sort()
128     #for i in allToLanAddressesList:
129     # outputFile.write(f"{i}\n")
130     #outputFile.close()
131 nino.borges 797
132     ## Initially gathering some very basic information across the CSV files, not using csv lib
133     # for (root,dirs,files) in os.walk(r"C:\Users\eborges\Documents\Cases\Northern Trust"):
134     # for fl in files:
135     # contents = open(os.path.join(root,fl), encoding='ANSI').readlines()
136     # print(f"{fl}|{len(contents)-1}")