-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlinkedin_scraper.py
More file actions
105 lines (99 loc) · 4.53 KB
/
linkedin_scraper.py
File metadata and controls
105 lines (99 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# https://gist.github.com/Diegiwg/51c22fa7ec9d92ed9b5d1f537b9e1107?permalink_comment_id=5418613
import re
from seleniumbase import Driver
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import traceback
def add_value(data, key, value):
    """Set ``data[key] = value``, accumulating repeated keys into a list.

    The first insertion stores the bare value; each subsequent insertion
    for the same key promotes the entry to a list and appends to it.

    Args:
        data: mutable mapping, updated in place.
        key: dictionary key to store under.
        value: value to store or append.

    Returns:
        The same ``data`` mapping (returned for call-chaining convenience).
    """
    if key not in data:
        data[key] = value
    elif isinstance(data[key], list):
        # Already accumulating: rebuild with the new value appended
        # (non-mutating concat, matching the original behavior).
        data[key] = data[key] + [value]
    else:
        # Second value for this key: promote the scalar entry to a list.
        data[key] = [data[key], value]
    return data
def get_description(id_):
    """Fetch the plain-text description of one LinkedIn job posting.

    Loads the guest job-posting endpoint for ``id_`` and extracts the
    ``show-more-less-html__markup`` div, retrying a bounded number of
    times when the page renders without it (slow load / anti-bot page).

    Args:
        id_: LinkedIn job posting id (the digits from the entity URN).

    Returns:
        Tuple ``(id_, description)`` with the stripped description text.

    Raises:
        RuntimeError: if the description block never appears after the
            retry budget is exhausted.
    """
    linkedin_job_link = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{id_}"
    driver = Driver(headless=True)
    try:
        markup = None
        # Bounded retry: the previous version looped forever when LinkedIn
        # kept serving a page without the description div.
        for _ in range(10):
            driver.get(linkedin_job_link)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            markup = soup.select_one('div[class^="show-more-less-html__markup"]')
            if markup is not None:
                break
        if markup is None:
            raise RuntimeError(f"description not found for job {id_}")
        return id_, markup.text.strip()
    finally:
        # Always release the browser, even on failure (the original leaked
        # the driver when an exception escaped before quit()).
        driver.quit()
def linkedin_scraper(url):
    """Scrape every result page of a LinkedIn guest job-search URL.

    Pages through ``url`` via the ``&start=`` offset, collecting each job's
    spreadsheet HYPERLINK formula, company name and location, then fetches
    all job descriptions in parallel threads via get_description().

    Args:
        url: LinkedIn jobs-guest search URL; must already contain a query
            string, since ``&start=...`` is appended directly.

    Returns:
        Dict mapping job id -> [HYPERLINK formula, company, location,
        description].  On any failure the dict instead contains a single
        ``'ERROR in linkedin script'`` entry with the exception message.
    """
    index = 0  # result offset appended as &start=; advanced once per job scraped
    my_dict = {}
    # uc=True: undetected-chromedriver mode, to reduce bot detection.
    driver = Driver(uc=True, headless=True)
    try:
        while True:
            LINKEDIN = url + f"&start={index}"
            driver.get(LINKEDIN)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            # Reload while LinkedIn serves its "focus-page" interstitial
            # (anti-bot / auth wall) instead of the results list.  There are
            # two of these while loops because, depending on page load
            # timing, one of them is skipped (need a better way later).
            while driver.is_element_visible('div[class="focus-page"]'):
                driver.get(LINKEDIN)
                driver.sleep(2)
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')
            # NOTE(review): `recursive` is not a documented argument of
            # BeautifulSoup's select() (it belongs to find_all) -- confirm
            # this call behaves as intended.
            list_job = soup.select('li', recursive=False)
            # Empty page: either we ran past the last page (stop) or the
            # interstitial appeared late (retry until it clears).
            if not list_job:
                break_flag = True
                driver.sleep(2)
                while driver.is_element_visible('div[class="focus-page"]'):
                    break_flag = False
                    driver.get(LINKEDIN)
                    driver.sleep(2)
                    html = driver.page_source
                    soup = BeautifulSoup(html, 'html.parser')
                    list_job = soup.select('li', recursive=False)
                if break_flag:
                    # Page really was empty: no more results, stop paging.
                    break
            for job in list_job:
                # Job cards appear as either <div class="base-card"> or
                # <a class="base-card"> depending on the markup variant.
                base_card = job.select_one('div.base-card')
                if not base_card:
                    base_card = job.select_one('a.base-card')
                name_path = job.find('h3', attrs={'class': 'base-search-card__title'})
                name = name_path.text.strip()
                # Strip double quotes so the title can sit inside the quoted
                # HYPERLINK() formula built below.
                name = name.replace('"', '')
                # data-entity-urn looks like "urn:li:jobPosting:<digits>";
                # keep only the digits as the job id.
                data_entity_urn = base_card.get('data-entity-urn')
                job_id = re.sub(r'\D+', '', data_entity_urn)
                if job_id in my_dict:
                    # Overlapping pages re-serve jobs; log the duplicate
                    # (the update below overwrites the previous entry).
                    print(index)
                    print(job_id)
                    print(my_dict[job_id])
                    print("already scraped")
                # Spreadsheet-ready =HYPERLINK() formula is the first value
                # stored for this job id.
                my_dict.update({job_id: f'=HYPERLINK("https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}","{name}")'})
                location = job.find('span', attrs={'class': 'job-search-card__location'}).text.strip()
                company_name = job.find('h4', attrs={'class': 'base-search-card__subtitle'}).text.strip()
                my_dict = add_value(my_dict, job_id, company_name)
                my_dict = add_value(my_dict, job_id, location)
                # assumes &start= advances by one result per scraped job --
                # TODO confirm against LinkedIn's guest-API pagination.
                index += 1
        driver.quit()
        if not my_dict:
            raise Exception("No jobs found")
        args = [id_ for id_ in my_dict.keys()]
        # Parallelized description scraping using multi-threading; pattern
        # from https://github.com/seleniumbase/SeleniumBase/discussions/3647
        with ThreadPoolExecutor() as executor:
            result = list(executor.map(get_description, args))
        for id_, description in result:
            my_dict = add_value(my_dict, id_, description)
    except Exception as e:
        # Best-effort contract: on any failure, replace all partial results
        # with a single error entry and log the full traceback.
        my_dict.clear()
        my_dict.update({'ERROR in linkedin script': str(e)})
        print(traceback.format_exc())
    finally:
        # NOTE(review): returning from finally also swallows SystemExit /
        # KeyboardInterrupt raised inside the try block.
        return my_dict