ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Evidox/MultiFieldDomainAndEmailExtractor.py
Revision: 634
Committed: Wed Mar 28 14:08:28 2018 UTC (8 years ago) by nino.borges
Content type: text/x-python
File size: 4025 byte(s)
Log Message:
Initial version

File Contents

# Content
1 """
2
3 MultiFieldDomainAndEmailExtractor
4
5 Created by
6 Emanuel Borges
7 03.23.2018
8
9 This program works a lot like SenderDomainExtractor but supports multiple email fields. Will extract a list of unique domains and unique email addresses.
10
11 Will skip any that are internal and do not have an @ symbol, just FYI.
12
13 Doesn't support Unicode, so convert to ASCII first.
14
15 WARNING: the first line of the dat is skipped as the header, so make sure the file has one.
16
17
18 """
19
20 import re
21
def CreateRawList(datFileName, delim, quoteChar, fieldNumberList):
    """Build the sorted list of unique raw email values found in a DAT file.

    The first line of the file is treated as a header and skipped.  Every
    other line is split on ``delim``; each field position listed in
    ``fieldNumberList`` is split on ";" into individual values, ``quoteChar``
    is removed from each value, and the lower-cased, non-empty results are
    de-duplicated.

    Parameters:
        datFileName     -- path of the delimited text file to read.
        delim           -- field separator string.
        quoteChar       -- text-qualifier string removed from every value.
        fieldNumberList -- zero-based positions of the email fields.

    Returns a sorted list of the unique, lower-cased raw values.  No "@"
    filtering happens here, so entries without an "@" are included.

    Lines that split into a single field (for example blank lines) are
    ignored, and a requested field that a short line does not have is
    skipped instead of raising IndexError.
    """
    linesSinceHeartbeat = 0
    uniqueValues = set()
    print("creating the raw list...")
    # The with-block guarantees the handle is closed, even if parsing fails.
    with open(datFileName) as datFile:
        next(datFile, None)  # skip the header line (no-op on an empty file)
        for line in datFile:
            # Progress indicator for large files: one message per ~5000 lines.
            linesSinceHeartbeat += 1
            if linesSinceHeartbeat > 5000:
                print("Heartbeat")
                linesSinceHeartbeat = 0
            # Strip the line terminator, including a stray carriage return,
            # so it cannot end up inside the last field's value.
            fields = line.rstrip("\r\n").split(delim)
            if len(fields) <= 1:
                continue
            for numb in fieldNumberList:
                try:
                    fieldValue = fields[numb]
                except IndexError:
                    # Ragged row: this line has fewer fields than requested.
                    continue
                if not fieldValue:
                    continue
                for emailAddress in fieldValue.split(";"):
                    emailAddress = emailAddress.replace(quoteChar, "")
                    if emailAddress:
                        uniqueValues.add(emailAddress.lower())
    print("Raw matrix made.")
    rawList = sorted(uniqueValues)
    print("There are %d unique raw email addresses." % len(rawList))
    return rawList
52
def ProcessRawList(rawList):
    """Extract unique email addresses and unique domains from raw values.

    Each value in ``rawList`` that contains an "@" is searched for the first
    address-like substring; values without an "@" are ignored.  Only the
    first match in each value is used.

    The pattern accepts:
        local part -- letters, digits, "_", ".", "|", "+" and "-"; it may be
                      empty, so a bare "@domain" value still contributes its
                      domain.
        domain     -- dot-separated labels of letters, digits, "_" and "-".

    Parameters:
        rawList -- iterable of raw strings to scan.

    Returns a tuple ``(emailAddressesList, domainList)``: two sorted lists of
    unique, lower-cased strings.  Each domain entry is prefixed with "@".
    """
    # Compiled once, outside the loop.  A single character class per part
    # keeps matching linear (no nested quantifiers) and lets hyphenated names
    # and one-character local parts match in full.
    emailPattern = re.compile(
        r"[\w.|+\-]*@(?P<domain>[\w\-]+(?:\.[\w\-]+)*)")
    uniqueEmails = set()
    uniqueDomains = set()
    print("processing list...")
    for rawValue in rawList:
        if '@' not in rawValue:
            continue
        match = emailPattern.search(rawValue)
        if match is None:
            # An "@" with nothing usable after it, e.g. a trailing "@".
            continue
        uniqueEmails.add(match.group().lower())
        uniqueDomains.add("@%s" % match.group("domain").lower())

    emailAddressesList = sorted(uniqueEmails)
    domainList = sorted(uniqueDomains)
    print("Done.")
    print('There are %d unique "real" email addresses.' % len(emailAddressesList))
    print('There are %d unique email domains.' % len(domainList))
    return emailAddressesList, domainList
75
76
77
78
if __name__ == '__main__':
    # Script entry point: read one DAT file and write three sorted,
    # one-value-per-line lists (unique domains, unique "real" email
    # addresses, unique raw values) next to it.
    inputDatFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\test2Cleaned.dat"
    rawEmailsOutputFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\UniqueRawEmailAddresses.dat"
    domainOutputFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\UniqueDomains.txt"
    emailsOutputFileName = r"L:\__People\Emanuel\MyCases\Murphy & King\McLaughlin Northrup - McDonald Fracassa\File_Cabinet\UniqueRealEmailAddresses.dat"
    ## A list of the positions of the email fields
    fieldNumbersList = [1,2,3,4]

    # Field separator (0x14) and text qualifier (0xFE) used by the input file.
    delim = '\x14'
    quoteChar = '\xfe'

    rawList = CreateRawList(inputDatFileName, delim, quoteChar, fieldNumbersList)
    emailAddressesList, domainsList = ProcessRawList(rawList)

    print("Unpacking...")
    outputs = [
        (domainOutputFileName, domainsList),
        (emailsOutputFileName, emailAddressesList),
        (rawEmailsOutputFileName, rawList),
    ]
    for outputFileName, values in outputs:
        # Each file is closed as soon as it is written, even if a write fails.
        with open(outputFileName, 'w') as outputFile:
            outputFile.writelines("%s\n" % value for value in values)
    print("Unpack Finished.")