-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlinkedin_scraper.py
More file actions
105 lines (99 loc) · 4.53 KB
/
linkedin_scraper.py
File metadata and controls
105 lines (99 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# https://gist.github.com/Diegiwg/51c22fa7ec9d92ed9b5d1f537b9e1107?permalink_comment_id=5418613
import re
from seleniumbase import Driver
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import traceback
def add_value(data, key, value):
    """Set ``data[key] = value``, accumulating repeated keys into a list.

    The first insertion stores the bare value; each subsequent insertion
    for the same key promotes the entry to a list and appends to it.

    Args:
        data: mutable mapping, updated in place.
        key: dictionary key to store under.
        value: value to store or append.

    Returns:
        The same ``data`` mapping (returned for call-chaining convenience).
    """
    if key not in data:
        data[key] = value
    elif isinstance(data[key], list):
        # Already accumulating: rebuild with the new value appended
        # (non-mutating concat, matching the original behavior).
        data[key] = data[key] + [value]
    else:
        # Second value for this key: promote the scalar entry to a list.
        data[key] = [data[key], value]
    return data
def get_description(id_):
    """Fetch the plain-text description of one LinkedIn job posting.

    Loads the guest job-posting endpoint for ``id_`` and extracts the
    ``show-more-less-html__markup`` div, retrying a bounded number of
    times when the page renders without it (slow load / anti-bot page).

    Args:
        id_: LinkedIn job posting id (the digits from the entity URN).

    Returns:
        Tuple ``(id_, description)`` with the stripped description text.

    Raises:
        RuntimeError: if the description block never appears after the
            retry budget is exhausted.
    """
    linkedin_job_link = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{id_}"
    driver = Driver(headless=True)
    try:
        markup = None
        # Bounded retry: the previous version looped forever when LinkedIn
        # kept serving a page without the description div.
        for _ in range(10):
            driver.get(linkedin_job_link)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            markup = soup.select_one('div[class^="show-more-less-html__markup"]')
            if markup is not None:
                break
        if markup is None:
            raise RuntimeError(f"description not found for job {id_}")
        return id_, markup.text.strip()
    finally:
        # Always release the browser, even on failure (the original leaked
        # the driver when an exception escaped before quit()).
        driver.quit()
def linkedin_scraper(url):
    """Scrape every result page of a LinkedIn guest job-search URL.

    Pages through ``url`` via the ``&start=`` offset, collecting each job's
    spreadsheet HYPERLINK formula, company name and location, then fetches
    all job descriptions in parallel threads via get_description().

    Args:
        url: LinkedIn jobs-guest search URL; must already contain a query
            string, since ``&start=...`` is appended directly.

    Returns:
        Dict mapping job id -> [HYPERLINK formula, company, location,
        description].  On any failure the dict instead contains a single
        ``'ERROR in linkedin script'`` entry with the exception message.
    """
    index = 0  # result offset appended as &start=; advanced once per job scraped
    my_dict = {}
    # uc=True: undetected-chromedriver mode, to reduce bot detection.
    driver = Driver(uc=True, headless=True)
    try:
        while True:
            LINKEDIN = url + f"&start={index}"
            driver.get(LINKEDIN)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            # Reload while LinkedIn serves its "focus-page" interstitial
            # (anti-bot / auth wall) instead of the results list.  There are
            # two of these while loops because, depending on page load
            # timing, one of them is skipped (need a better way later).
            while driver.is_element_visible('div[class="focus-page"]'):
                driver.get(LINKEDIN)
                driver.sleep(2)
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')
            # NOTE(review): `recursive` is not a documented argument of
            # BeautifulSoup's select() (it belongs to find_all) -- confirm
            # this call behaves as intended.
            list_job = soup.select('li', recursive=False)
            # Empty page: either we ran past the last page (stop) or the
            # interstitial appeared late (retry until it clears).
            if not list_job:
                break_flag = True
                driver.sleep(2)
                while driver.is_element_visible('div[class="focus-page"]'):
                    break_flag = False
                    driver.get(LINKEDIN)
                    driver.sleep(2)
                    html = driver.page_source
                    soup = BeautifulSoup(html, 'html.parser')
                    list_job = soup.select('li', recursive=False)
                if break_flag:
                    # Page really was empty: no more results, stop paging.
                    break
            for job in list_job:
                # Job cards appear as either <div class="base-card"> or
                # <a class="base-card"> depending on the markup variant.
                base_card = job.select_one('div.base-card')
                if not base_card:
                    base_card = job.select_one('a.base-card')
                name_path = job.find('h3', attrs={'class': 'base-search-card__title'})
                name = name_path.text.strip()
                # Strip double quotes so the title can sit inside the quoted
                # HYPERLINK() formula built below.
                name = name.replace('"', '')
                # data-entity-urn looks like "urn:li:jobPosting:<digits>";
                # keep only the digits as the job id.
                data_entity_urn = base_card.get('data-entity-urn')
                job_id = re.sub(r'\D+', '', data_entity_urn)
                if job_id in my_dict:
                    # Overlapping pages re-serve jobs; log the duplicate
                    # (the update below overwrites the previous entry).
                    print(index)
                    print(job_id)
                    print(my_dict[job_id])
                    print("already scraped")
                # Spreadsheet-ready =HYPERLINK() formula is the first value
                # stored for this job id.
                my_dict.update({job_id: f'=HYPERLINK("https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}","{name}")'})
                location = job.find('span', attrs={'class': 'job-search-card__location'}).text.strip()
                company_name = job.find('h4', attrs={'class': 'base-search-card__subtitle'}).text.strip()
                my_dict = add_value(my_dict, job_id, company_name)
                my_dict = add_value(my_dict, job_id, location)
                # assumes &start= advances by one result per scraped job --
                # TODO confirm against LinkedIn's guest-API pagination.
                index += 1
        driver.quit()
        if not my_dict:
            raise Exception("No jobs found")
        args = [id_ for id_ in my_dict.keys()]
        # Parallelized description scraping using multi-threading; pattern
        # from https://github.com/seleniumbase/SeleniumBase/discussions/3647
        with ThreadPoolExecutor() as executor:
            result = list(executor.map(get_description, args))
        for id_, description in result:
            my_dict = add_value(my_dict, id_, description)
    except Exception as e:
        # Best-effort contract: on any failure, replace all partial results
        # with a single error entry and log the full traceback.
        my_dict.clear()
        my_dict.update({'ERROR in linkedin script': str(e)})
        print(traceback.format_exc())
    finally:
        # NOTE(review): returning from finally also swallows SystemExit /
        # KeyboardInterrupt raised inside the try block.
        return my_dict