ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Evidox/MultiFieldDomainAndEmailExtractor.py
Revision: 746
Committed: Thu Apr 15 20:11:16 2021 UTC (4 years, 11 months ago) by nino.borges
Content type: text/x-python
File size: 4053 byte(s)
Log Message:
Updated to be compatible with python3, which made the bak files.

File Contents

# User Rev Content
1 nino.borges 634 """
2    
3     MultiFieldDomainAndEmailExtractor
4    
5     Created by
6     Emanuel Borges
7     03.23.2018
8    
9     This program works a lot like SenderDomainExtractor but supports multiple email fields. Will extract a list of unique domains and unique email addresses.
10    
11     Will skip any that are internal and do not have a @ symbol, just FYI.
12    
13     Doesn't support Unicode, so convert to ASCII first.
14    
15     WARNING: removes the header from the dat, so make sure you have one.
16    
17    
18     """
19    
20     import re
21    
def CreateRawList(datFileName, delim, quoteChar, fieldNumberList):
    """Build the sorted list of unique raw e-mail values found in a DAT file.

    The first line of the file is treated as a header and skipped.  Every
    remaining line is split on ``delim``; for each index in
    ``fieldNumberList`` the field is split on ";" and each non-empty piece,
    with ``quoteChar`` removed and lower-cased, is collected.

    Args:
        datFileName: Path of the delimited text file to read.
        delim: Field separator string.
        quoteChar: Text-qualifier string removed from every value.
        fieldNumberList: Zero-based indexes of the fields holding addresses.

    Returns:
        A sorted list of the unique, lower-cased raw values.
    """
    uniqueValues = set()
    print("creating the raw list...")
    # The context manager closes the file even if parsing fails part-way.
    with open(datFileName) as datFile:
        next(datFile, None)  # drop the header row (no-op on an empty file)
        for lineNumber, line in enumerate(datFile, 1):
            # Progress marker, emitted on every 5001st data line.
            if lineNumber % 5001 == 0:
                print("Heartbeat")
            fields = line.replace("\n", "").split(delim)
            # Lines without any delimiter (e.g. blank lines) carry no fields.
            if len(fields) <= 1:
                continue
            for fieldNumber in fieldNumberList:
                try:
                    fieldValue = fields[fieldNumber]
                except IndexError:
                    # Short row: the field is simply absent, which should
                    # not abort the whole extraction.
                    continue
                for rawAddress in fieldValue.split(";"):
                    rawAddress = rawAddress.replace(quoteChar, "")
                    if rawAddress:
                        uniqueValues.add(rawAddress.lower())
    print("Raw matrix made.")
    rawList = sorted(uniqueValues)
    print("There are %d unique raw email addresses." % len(rawList))
    return rawList
52    
# Local part: optional (so a bare "@domain" value still yields its domain),
# must start with a word character, and may contain ".", "%", "+" and "-".
# Domain (captured): dot-separated labels made of word characters with
# interior hyphens.  Every repetition is separator-delimited, so a failed
# match cannot trigger exponential backtracking.
_EMAIL_PATTERN = re.compile(
    r"(?:\w[\w.%+-]*)?@((?:\w+(?:-\w+)*\.)*\w+(?:-\w+)*)"
)


def ProcessRawList(rawList):
    """Extract unique e-mail addresses and unique domains from raw values.

    Values without an "@" are ignored.  For the others, the first
    address-like substring is taken, lower-cased and recorded together with
    its domain (stored with a leading "@").

    Args:
        rawList: Iterable of raw address strings (display names, angle
            brackets and similar decoration are tolerated).

    Returns:
        A tuple ``(emailAddressesList, domainList)``; both are sorted lists
        of unique lower-cased strings.
    """
    uniqueAddresses = set()
    uniqueDomains = set()
    print("processing list...")
    for rawValue in rawList:
        # Cheap pre-filter: internal names without "@" never match anyway.
        if '@' not in rawValue:
            continue
        match = _EMAIL_PATTERN.search(rawValue)
        if match is None:
            continue
        uniqueAddresses.add(match.group().lower())
        uniqueDomains.add("@%s" % match.group(1).lower())

    emailAddressesList = sorted(uniqueAddresses)
    domainList = sorted(uniqueDomains)
    print("Done.")
    print('There are %d unique "real" email addresses.' % len(emailAddressesList))
    print('There are %d unique email domains.' % len(domainList))
    return emailAddressesList, domainList
75    
76    
77    
78    
if __name__ == '__main__':
    # Job-specific input and output locations; edit these per case.
    inputDatFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\test2Cleaned.dat"
    rawEmailsOutputFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\UniqueRawEmailAddresses.dat"
    domainOutputFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\UniqueDomains.txt"
    emailsOutputFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\UniqueRealEmailAddresses.dat"
    ## A list of the positions of the email fields
    fieldNumbersList = [1, 2, 3, 4]

    # Field separator and text qualifier of the input file.
    # NOTE(review): these look like the usual load-file DAT defaults
    # (0x14 and 0xFE) -- confirm against the actual export settings.
    delim = '\x14'
    quoteChar = '\xfe'

    rawList = CreateRawList(inputDatFileName, delim, quoteChar, fieldNumbersList)
    emailAddressesList, domainsList = ProcessRawList(rawList)

    print("Unpacking...")
    # One value per line; each file is closed by its ``with`` block even if
    # a write fails, and is only created once its data is ready to be written.
    for outputFileName, values in (
        (domainOutputFileName, domainsList),
        (emailsOutputFileName, emailAddressesList),
        (rawEmailsOutputFileName, rawList),
    ):
        with open(outputFileName, 'w') as outputFile:
            outputFile.writelines("%s\n" % value for value in values)
    print("Unpack Finished.")