ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Evidox/SenderDomainExtractor.py
Revision: 633
Committed: Wed Mar 28 13:54:39 2018 UTC (8 years ago) by nino.borges
Content type: text/x-python
File size: 1668 byte(s)
Log Message:
A folder for my Evidox programs

File Contents

# User Rev Content
"""
Sender domain extractor.

Reads a delimited export whose second field is the sender of each record,
pulls an e-mail address out of every distinct sender value, and writes two
sorted, de-duplicated, lower-cased lists: the full addresses and their
"@domain" parts.

All print calls use the single-argument parenthesised form so the script
runs unchanged on both Python 2 and Python 3.
"""
import re


INPUT_PATH = r"C:\Test-PY\GDWN-MBTA-SD_export-clean.dat"
DOMAINS_OUTPUT_PATH = r"C:\Test-PY\GDWN-MBTA-SenderDomains.txt"
EMAILS_OUTPUT_PATH = r"C:\Test-PY\GDWN-MBTA-SenderEmailAddresses.txt"

# NOTE(review): the delimiter literal shows up as an empty string in the
# listing this file was recovered from, and str.split() rejects an empty
# separator with ValueError. Presumably a non-printable character was lost;
# 0x14 (DC4) is assumed here because the input is a .dat export -- confirm
# against the real data before relying on it.
DELIM = '\x14'

# Number of input lines between progress messages.
HEARTBEAT_EVERY = 5000

# Local part: any run (possibly empty, so a bare "@domain" still yields its
# domain) of word characters, '.', '+' and '-'.
# Domain: dot-separated labels of word characters and '-'.
# Single character classes are used instead of a nested quantified group so
# that a failed match cannot trigger exponential backtracking.
EMAIL_PATTERN = re.compile(r'[\w.+-]*@[\w-]+(?:\.[\w-]+)*')


def collect_senders(lines, delim=DELIM):
    """Return the set of distinct, non-empty sender values found in *lines*.

    Each line is stripped of its line terminator and split on *delim*; the
    sender is the second field. Lines with fewer than two fields, or with an
    empty sender field, are ignored. A "Heartbeat" progress message is
    printed every HEARTBEAT_EVERY lines.

    Raises ValueError if *delim* is an empty string (from str.split).
    """
    senders = set()
    for line_number, line in enumerate(lines, 1):
        if line_number % HEARTBEAT_EVERY == 0:
            print("Heartbeat")
        fields = line.rstrip("\r\n").split(delim)
        if len(fields) > 1 and fields[1]:
            senders.add(fields[1])
    return senders


def extract_email(sender):
    """Return the first e-mail-like token in *sender*, lower-cased, or None."""
    # Cheap substring test first: most rejects never reach the regex engine.
    if '@' not in sender:
        return None
    match = EMAIL_PATTERN.search(sender)
    if match is None:
        return None
    return match.group().lower()


def extract_addresses(senders):
    """Return ``(emails, domains)`` as two sorted lists of unique strings.

    Every sender is printed as it is processed. Senders without a
    recognisable address contribute nothing. Domains keep their leading "@".
    """
    emails = set()
    domains = set()
    for sender in sorted(senders):
        print(sender)
        email = extract_email(sender)
        if email is None:
            continue
        emails.add(email)
        # The pattern admits exactly one '@', so index 1 is the whole domain.
        domains.add("@%s" % email.split("@", 1)[1])
    return sorted(emails), sorted(domains)


def write_lines(path, items):
    """Write each item of *items* to *path*, one per line."""
    with open(path, 'w') as out:
        out.writelines("%s\n" % item for item in items)


def main(input_path=INPUT_PATH,
         domains_path=DOMAINS_OUTPUT_PATH,
         emails_path=EMAILS_OUTPUT_PATH):
    """Run the extraction from *input_path* into the two output files."""
    print("creating the raw list...")
    # The input is streamed line by line and closed before any output file
    # is opened, so a missing or unreadable input cannot leave truncated
    # outputs behind.
    with open(input_path) as source:
        senders = collect_senders(source, DELIM)
    print("raw matrix made.")

    print("processing list...")
    emails, domains = extract_addresses(senders)
    print("done.")

    print("Unpacking...")
    write_lines(domains_path, domains)
    write_lines(emails_path, emails)


if __name__ == '__main__':
    main()
58