ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Evidox/SenderDomainExtractor.py
Revision: 746
Committed: Thu Apr 15 20:11:16 2021 UTC (4 years, 11 months ago) by nino.borges
Content type: text/x-python
File size: 1693 byte(s)
Log Message:
Updated to be compatible with python3, which made the bak files.

File Contents

# User Rev Content
1 nino.borges 633 """
2    
3     sender domain extractor
4    
5     """
6     import re
7    
8    
# Field separator of the input .dat file.
# NOTE(review): the script previously assigned an empty string here, which
# makes str.split() raise "ValueError: empty separator" on the first line.
# The literal was presumably a non-printable character that got lost; "\x14"
# is the usual Concordance-style DAT field separator -- confirm against the
# export settings before relying on it.
DELIM = "\x14"

# One e-mail address: dot-separated local part, "@", dot-separated host.
# Compared with the earlier pattern this one
#   * is a raw string (no invalid "\w" escape warnings),
#   * keeps hyphens, so "@my-company.com" is no longer cut down to "@my",
#   * keeps one-character local parts ("a@b.com" used to become "@b.com"),
#   * has no nested, overlapping quantifiers that could backtrack badly.
EMAIL_RE = re.compile(r"[\w+-]+(?:\.[\w+-]+)*@(?:[\w-]+\.)*[\w-]+")

# A progress message is printed each time this many input lines were read.
HEARTBEAT_EVERY = 5001


def collect_senders(lines, delim=DELIM):
    """Return the set of distinct, non-empty sender values found in *lines*.

    Each line is stripped of its newline and split on *delim*; the sender is
    the second field.  Lines with fewer than two fields, or with an empty
    sender field, are skipped.

    Raises ValueError if *delim* is an empty string (str.split rejects it).
    """
    senders = set()
    for line_no, line in enumerate(lines, start=1):
        if line_no % HEARTBEAT_EVERY == 0:
            print("Heartbeat")
        fields = line.replace("\n", "").split(delim)
        if len(fields) > 1 and fields[1]:
            senders.add(fields[1])
    return senders


def extract_email(sender):
    """Return the first e-mail address in *sender*, lower-cased, or None."""
    match = EMAIL_RE.search(sender)
    if match is None:
        return None
    return match.group().lower()


def extract_addresses_and_domains(senders):
    """Return ``(emails, domains)`` as two sorted lists of unique strings.

    Every sender is echoed to stdout (in sorted order) as it is processed.
    Domains are reported with a leading "@", e.g. "@example.com".
    """
    emails = set()
    domains = set()
    for sender in sorted(senders):
        print(sender)
        email = extract_email(sender)
        if email:
            emails.add(email)
            # The pattern guarantees exactly one "@" in the match.
            domains.add("@%s" % email.split("@")[1])
    return sorted(emails), sorted(domains)


if __name__ == '__main__':
    print("creating the raw list...")
    # Stream the input instead of loading it whole, and make sure the handle
    # is closed even if parsing fails.
    with open(r"C:\Test-PY\GDWN-MBTA-SD_export-clean.dat") as source:
        senders = collect_senders(source)
    print("raw matrix made.")

    print("processing list...")
    emailsList, domainsList = extract_addresses_and_domains(senders)
    print("done.")

    print("Unpacking...")
    # The output files are opened only once the results exist, so a failure
    # while reading the input no longer truncates earlier output.
    with open(r"C:\Test-PY\GDWN-MBTA-SenderDomains.txt", 'w') as domainsOutputFile:
        domainsOutputFile.writelines("%s\n" % d for d in domainsList)
    with open(r"C:\Test-PY\GDWN-MBTA-SenderEmailAddresses.txt", 'w') as emailsOutputFile:
        emailsOutputFile.writelines("%s\n" % e for e in emailsList)
58