ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/ns_dev/Python/NinoCode/Active_prgs/Redgrave/vCardSearchAndDownload.py
Revision: 875
Committed: Tue Dec 31 03:27:12 2024 UTC (14 months, 3 weeks ago) by nino.borges
Content type: text/x-python
File size: 4550 byte(s)
Log Message:
I started getting "forbidden" errors when trying to download some vCards from other sites, so I added support for urllib, which does a somewhat better job of sending header information.  I also broke out a few variables so that I can better support other element texts, etc.  It now works great for Hogan Lovells too.

File Contents

# User Rev Content
1 nino.borges 873 """
2    
3     vCardSearchAndDownload
4    
5     Created by:
6     Emanuel Borges
7     12.30.2024
8    
9     This program uses chromedriver to first search for a person on a public company people search and then, if found, download the vCard for that person.
10    
11     """
12    
13     import os
14     import time
15     from selenium import webdriver
16     from selenium.webdriver.common.by import By
17     from selenium.webdriver.common.keys import Keys
18     from urllib.parse import urljoin
19     import requests
20 nino.borges 875 import urllib
21 nino.borges 873
22    
# Page that search_and_download_vcards() loads before typing into the search
# box.  Exactly one BASE_URL is active at a time; the commented-out values are
# alternative sites that can be swapped in.
#BASE_URL = "https://perkinscoie.com/people-search"
#BASE_URL = "https://www.klgates.com/people"
BASE_URL = "https://www.hoganlovells.com/en/our-people"

# Directory (relative to the current working directory) that receives the
# downloaded .vcf files.  It is created when the module is loaded.
VCARD_DIR = "vcards"
os.makedirs(VCARD_DIR, exist_ok = True)


# Shared Chrome WebDriver session.  It is started as soon as the module is
# loaded and is used by search_and_download_vcards().
driver = webdriver.Chrome()

# Version string of this script.
version = '0.3.0'
34 nino.borges 873
35    
def search_and_download_vcards(person_name, element_id_name, quotedSearch = False, protectedDownload = False):
    """Search BASE_URL for a person and download every vCard linked in the results.

    The shared module-level ``driver`` loads ``BASE_URL``, types the search
    text into the input whose HTML ``id`` is ``element_id_name``, submits it
    with RETURN and then collects every ``<a>`` whose ``href`` contains
    ``vcard``.

    Args:
        person_name: Text to search for.  It is also passed to the download
            function, which uses it to name the saved file.
        element_id_name: HTML ``id`` of the search input on the page.
        quotedSearch: When true, the search text is wrapped in double quotes.
        protectedDownload: When true, links are fetched with
            ``download_vcard_protected`` (sends a browser user-agent header);
            otherwise with ``download_vcard``.

    Returns:
        None.  Progress and errors are reported with ``print``; exceptions are
        caught here so that a failure for one person does not stop a batch.
    """
    try:
        # Resolve both options once instead of re-testing the flags per link.
        search_text = f'"{person_name}"' if quotedSearch else person_name
        download = download_vcard_protected if protectedDownload else download_vcard

        driver.get(BASE_URL)

        # Fixed pause to give the page time to finish loading.
        time.sleep(3)

        search_box = driver.find_element(By.ID, element_id_name)
        search_box.clear()
        search_box.send_keys(search_text)
        search_box.send_keys(Keys.RETURN)

        # Fixed pause to give the search results time to appear.
        time.sleep(3)

        results = driver.find_elements(By.CSS_SELECTOR, "a[href*='vcard']")

        if not results:
            print(f"No vCards found for '{person_name}'.")
            return

        # Read all hrefs up front and drop repeats (order preserved): a result
        # page may link to the same vCard more than once, and fetching it
        # again would only rewrite the same file.
        hrefs = (link.get_attribute("href") for link in results)
        vcard_urls = list(dict.fromkeys(href for href in hrefs if href))

        for vcard_url in vcard_urls:
            download(vcard_url, person_name)

    except Exception as e:
        # Deliberately broad: this is the per-person boundary of a batch run.
        print(f"Error during search or download: {e}")
    # NOTE: the driver is intentionally not quit here; it is shared across
    # calls and is shut down by the script's entry point.
76 nino.borges 873
77    
78    
def download_vcard(vcard_url, person_name):
    """Download ``vcard_url`` with ``requests`` and save it as a .vcf file.

    The file is written to ``VCARD_DIR/<person_name>.vcf``.  The name comes
    from the search term rather than from the URL, so several downloads for
    the same ``person_name`` overwrite one another.

    Args:
        vcard_url: Absolute URL of the vCard to fetch.
        person_name: Search term used as the base name of the saved file.

    Returns:
        None.  Success and failure are both reported with ``print``; no
        exception propagates to the caller.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server,
        # which would hang an entire batch run.
        response = requests.get(vcard_url, timeout=30)
        response.raise_for_status()

        #filename = os.path.basename(vcard_url)
        filename = f"{person_name}.vcf"
        filepath = os.path.join(VCARD_DIR, filename)

        with open(filepath, "wb") as file:
            file.write(response.content)
        print(f"Downloaded vCard: {filepath}")
    except Exception as e:
        # Best-effort by design: report the failure and let the caller go on.
        print(f"Error downloading vCard from {vcard_url}: {e}")
94    
95    
def download_vcard_protected(vcard_url, person_name):
    """Download ``vcard_url`` with ``urllib`` while sending a browser user-agent.

    Intended for sites that refuse the plain ``requests`` download.  The file
    is written to ``VCARD_DIR/<person_name>.vcf``, so several downloads for
    the same ``person_name`` overwrite one another.

    Args:
        vcard_url: Absolute URL of the vCard to fetch.
        person_name: Search term used as the base name of the saved file.

    Returns:
        None.  Success and failure are both reported with ``print``; no
        exception propagates to the caller.
    """
    # Imported explicitly: a bare ``import urllib`` does not by itself load
    # the ``urllib.request`` submodule, so relying on it only works when some
    # other import happens to have loaded it already.
    import urllib.request

    try:
        req = urllib.request.Request(vcard_url)
        req.add_header('user-agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')

        # The context manager closes the HTTP response; the timeout keeps a
        # stalled server from hanging the whole batch.
        with urllib.request.urlopen(req, timeout=30) as response:
            payload = response.read()

        filename = f"{person_name}.vcf"
        filepath = os.path.join(VCARD_DIR, filename)

        with open(filepath, "wb") as file:
            file.write(payload)

        print(f"Downloaded vCard: {filepath}")
    except Exception as e:
        # Best-effort by design: report the failure and let the caller go on.
        print(f"Error downloading vCard from {vcard_url}: {e}")
112    
if __name__ == '__main__':
    # Batch driver: read a pipe-delimited "<row number>|<email address>" file
    # and run one search/download per distinct email address.
    #inputFilePath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\MAL-NamesToFind\KLGATES.txt"
    #inputFilePath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\MAL-NamesToFind\KLGATES_SUB3.txt"
    #inputFilePath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\MAL-NamesToFind\TEST.txt"
    inputFilePath = r"C:\Users\eborges\OneDrive - Redgrave LLP\Documents\Cases\Amazon\MAL-NamesToFind\HOGANLOVELLS_SUB1.txt"

    # HTML id of the search input on the currently selected BASE_URL page.
    element_id_name = "name3"

    ## This can be a list of any value to search on but for now I need it to be unique, which is why I'm searching only on email addresses
    listOfEmailAddresses = set()

    # Single-shot alternatives to the batch loop, kept for manual testing:
    #person_to_search = "Gaia Bacchi"
    #person_to_search = "tim.weston@klgates.com"
    #person_to_search = "BPeters@perkinscoie.com"
    #person_to_search = "Katie.McMullan@hoganlovells.com"
    #search_and_download_vcards(person_to_search, element_id_name, protectedDownload = True)

    try:
        # Read everything up front so the file is closed before the
        # long-running browser work starts.
        with open(inputFilePath) as inputFile:
            contents = inputFile.readlines()

        for line in contents:
            line = line.rstrip("\n")
            if not line.strip():
                # Blank lines (e.g. a trailing newline at end of file).
                continue

            fields = line.split("|")
            if len(fields) != 2:
                print(f"Skipping malformed line: {line!r}")
                continue

            # The row number is part of the file format but is not used here.
            uniqueRowNumb, emailAddr = fields
            person_to_search = emailAddr.strip()

            # Each address is searched once; a repeat would only re-download
            # and overwrite the same <address>.vcf file.
            if not person_to_search or person_to_search in listOfEmailAddresses:
                continue
            listOfEmailAddresses.add(person_to_search)

            search_and_download_vcards(person_to_search, element_id_name, protectedDownload = True)
    finally:
        # Always shut the browser down, even if reading the input file fails.
        driver.quit()