ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Evidox/MultiFieldDomainAndEmailExtractor.py
Revision: 634
Committed: Wed Mar 28 14:08:28 2018 UTC (8 years ago) by nino.borges
Content type: text/x-python
File size: 4025 byte(s)
Log Message:
Initial version

File Contents

# User Rev Content
1 nino.borges 634 """
2    
3     MultiFieldDomainAndEmailExtractor
4    
5     Created by
6     Emanuel Borges
7     03.23.2018
8    
9     This program works a lot like SenderDomainExtractor but supports multiple email fields. Will extract a list of unique domains and unique email addresses.
10    
11     Will skip any that are internal and do not have a @ symbol, just FYI.
12    
13     Doesn't support Unicode, so convert to ASCII first.
14    
15     WARNING: removes the header from the dat, so make sure you have one.
16    
17    
18     """
19    
20     import re
21    
def CreateRawList(datFileName, delim, quoteChar, fieldNumberList):
    """Build the sorted list of unique raw email values found in a DAT file.

    The first line of the file is treated as a header and skipped. Every
    remaining line is split on ``delim``; each field whose position appears
    in ``fieldNumberList`` is split on ";" into individual values. Each value
    has ``quoteChar`` removed, is lower-cased and is de-duplicated.

    Lines that contain only a single field are ignored. A requested field
    position that a given line does not have is skipped for that line
    instead of aborting the whole run.

    Args:
        datFileName: Path of the delimited text file to read.
        delim: Field separator string.
        quoteChar: Text-qualifier string removed from every value.
        fieldNumberList: Zero-based positions of the fields to read.

    Returns:
        A sorted list of the unique, lower-cased, non-empty values.
    """
    heartbeatCounter = 0
    uniqueValues = set()
    # print() with one parenthesised argument behaves the same on
    # Python 2 and Python 3.
    print("creating the raw list...")
    # The context manager guarantees the handle is closed even if a line
    # fails to parse; iterating the file avoids loading it all into memory.
    with open(datFileName) as datFile:
        next(datFile, None)  # discard the header line (no-op on an empty file)
        for line in datFile:
            # Progress indicator: fires once more than 5000 lines have been
            # seen since the last one, then starts counting again.
            heartbeatCounter += 1
            if heartbeatCounter > 5000:
                print("Heartbeat")
                heartbeatCounter = 0
            fields = line.rstrip("\r\n").split(delim)
            if len(fields) <= 1:
                continue
            for numb in fieldNumberList:
                try:
                    fieldValue = fields[numb]
                except IndexError:
                    # Short row: this field is simply absent on this line.
                    continue
                if not fieldValue:
                    continue
                for emailAddress in fieldValue.split(";"):
                    emailAddress = emailAddress.replace(quoteChar, "")
                    if emailAddress:
                        uniqueValues.add(emailAddress.lower())
    print("Raw matrix made.")
    rawList = sorted(uniqueValues)
    print("There are %d unique raw email addresses." % len(rawList))
    return rawList
52    
# One address: a non-empty local part of word characters, ".", "+" or "-",
# an "@", then dot-separated domain labels that may contain hyphens.
# Group 1 captures the domain. No character class overlaps the separator
# that follows it, so the pattern cannot backtrack exponentially.
_EMAIL_PATTERN = re.compile(r"[\w.+-]+@([\w-]+(?:\.[\w-]+)*)")


def ProcessRawList(rawList):
    """Extract the unique email addresses and domains from raw values.

    Each raw value containing an "@" is searched for the first substring
    that looks like an email address; surrounding text such as display
    names or angle brackets is ignored. Values without a match (including
    values whose "@" has no local part in front of it) contribute nothing.

    Args:
        rawList: Iterable of raw address strings.

    Returns:
        A tuple ``(emailAddressesList, domainList)`` of two sorted lists of
        unique, lower-cased strings. Each domain entry keeps its leading
        "@", e.g. "@example.com".
    """
    uniqueAddresses = set()
    uniqueDomains = set()
    print("processing list...")
    for rawValue in rawList:
        if '@' not in rawValue:
            continue
        match = _EMAIL_PATTERN.search(rawValue)
        if match is None:
            continue
        uniqueAddresses.add(match.group().lower())
        uniqueDomains.add(("@%s" % match.group(1)).lower())

    emailAddressesList = sorted(uniqueAddresses)
    domainList = sorted(uniqueDomains)
    print("Done.")
    print('There are %d unique "real" email addresses.' % len(emailAddressesList))
    print('There are %d unique email domains.' % len(domainList))
    return emailAddressesList, domainList
75    
76    
77    
78    
if __name__ == '__main__':
    # Job settings: the input DAT and the three report files to produce.
    inputDatFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\test2Cleaned.dat"
    rawEmailsOutputFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\UniqueRawEmailAddresses.dat"
    domainOutputFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\UniqueDomains.txt"
    emailsOutputFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\UniqueRealEmailAddresses.dat"
    ## A list of the positions of the email fields
    fieldNumbersList = [1, 2, 3, 4]

    # Field separator and text qualifier used to parse the DAT.
    delim = '\x14'
    quoteChar = '\xfe'

    rawList = CreateRawList(inputDatFileName, delim, quoteChar, fieldNumbersList)
    emailAddressesList, domainsList = ProcessRawList(rawList)

    print("Unpacking...")
    # Write one value per line to each report. Each file is opened in its
    # own ``with`` block so it is closed even if a write fails part-way.
    reports = (
        (domainOutputFileName, domainsList),
        (emailsOutputFileName, emailAddressesList),
        (rawEmailsOutputFileName, rawList),
    )
    for outputFileName, values in reports:
        with open(outputFileName, 'w') as outputFile:
            outputFile.writelines("%s\n" % value for value in values)
    print("Unpack Finished.")
108     print "Unpack Finished."