Commit
Salary parse (Bunsly#163)
cullenwatson authored and isonupandit11 committed Jun 20, 2024
1 parent 79e6a61 commit b51f411
Showing 6 changed files with 120 additions and 45 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.55"
version = "1.1.57"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/Bunsly/JobSpy"
36 changes: 31 additions & 5 deletions src/jobspy/__init__.py
@@ -5,7 +5,7 @@
from concurrent.futures import ThreadPoolExecutor, as_completed

from .jobs import JobType, Location
from .scrapers.utils import logger, set_logger_level
from .scrapers.utils import logger, set_logger_level, extract_salary
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
@@ -135,6 +135,21 @@ def worker(site):
site_value, scraped_data = future.result()
site_to_jobs_dict[site_value] = scraped_data

def convert_to_annual(job_data: dict):
if job_data["interval"] == "hourly":
job_data["min_amount"] *= 2080
job_data["max_amount"] *= 2080
if job_data["interval"] == "monthly":
job_data["min_amount"] *= 12
job_data["max_amount"] *= 12
if job_data["interval"] == "weekly":
job_data["min_amount"] *= 52
job_data["max_amount"] *= 52
if job_data["interval"] == "daily":
job_data["min_amount"] *= 260
job_data["max_amount"] *= 260
job_data["interval"] = "yearly"

jobs_dfs: list[pd.DataFrame] = []

for site, job_response in site_to_jobs_dict.items():
@@ -363,11 +378,22 @@ def worker(site):
job_data["min_amount"] = compensation_obj.get("min_amount")
job_data["max_amount"] = compensation_obj.get("max_amount")
job_data["currency"] = compensation_obj.get("currency", "USD")
if (
job_data["interval"]
and job_data["interval"] != "yearly"
and job_data["min_amount"]
and job_data["max_amount"]
):
convert_to_annual(job_data)

else:
job_data["interval"] = None
job_data["min_amount"] = None
job_data["max_amount"] = None
job_data["currency"] = None
if country_enum == Country.USA:
(
job_data["interval"],
job_data["min_amount"],
job_data["max_amount"],
job_data["currency"],
) = extract_salary(job_data["description"])

job_df = pd.DataFrame([job_data])
jobs_dfs.append(job_df)
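For illustration, a minimal standalone sketch of the annualization helper added above, using the same multipliers (2080 working hours, 260 working days, 52 weeks, and 12 months per year); the sample row and its values are hypothetical:

def convert_to_annual(job_data: dict):
    # Same multipliers as the helper added in scrape_jobs above.
    factors = {"hourly": 2080, "daily": 260, "weekly": 52, "monthly": 12}
    factor = factors.get(job_data["interval"])
    if factor:
        job_data["min_amount"] *= factor
        job_data["max_amount"] *= factor
    job_data["interval"] = "yearly"

# Hypothetical row: $50-$60/hour becomes a $104,000-$124,800 yearly range.
job = {"interval": "hourly", "min_amount": 50, "max_amount": 60}
convert_to_annual(job)
print(job)  # {'interval': 'yearly', 'min_amount': 104000, 'max_amount': 124800}

Per the guard added above, scrape_jobs only runs the conversion when an interval is present, is not already yearly, and both bounds are set; for USA rows with no structured compensation at all, extract_salary (added in utils.py below) is tried against the description instead.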
10 changes: 5 additions & 5 deletions src/jobspy/scrapers/glassdoor/__init__.py
@@ -69,7 +69,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
if location_type is None:
logger.error("Glassdoor: location not parsed")
return JobResponse(jobs=[])
all_jobs: list[JobPost] = []
job_list: list[JobPost] = []
cursor = None

range_start = 1 + (scraper_input.offset // self.jobs_per_page)
@@ -81,14 +81,14 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
)
all_jobs.extend(jobs)
if not jobs or len(all_jobs) >= scraper_input.results_wanted:
all_jobs = all_jobs[: scraper_input.results_wanted]
job_list.extend(jobs)
if not jobs or len(job_list) >= scraper_input.results_wanted:
job_list = job_list[: scraper_input.results_wanted]
break
except Exception as e:
logger.error(f"Glassdoor: {str(e)}")
break
return JobResponse(jobs=all_jobs)
return JobResponse(jobs=job_list)

def _fetch_jobs_page(
self,
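The Glassdoor change renames the accumulator from all_jobs to job_list for consistency with the other scrapers. The surrounding loop follows a paginate, accumulate, truncate pattern; a generic sketch under assumed names (fetch_page, the default page size, and the stopping logic beyond range_start are illustrative, not the scraper's exact code):

def collect_jobs(fetch_page, results_wanted: int, jobs_per_page: int = 30, offset: int = 0) -> list:
    # Paginate, accumulate, truncate: stop on an empty page or once enough
    # jobs have been gathered, trimming any overshoot from the final page.
    job_list, cursor = [], None
    page = 1 + (offset // jobs_per_page)  # same start-page math as range_start above
    while True:
        jobs, cursor = fetch_page(page, cursor)
        job_list.extend(jobs)
        if not jobs or len(job_list) >= results_wanted:
            return job_list[:results_wanted]
        page += 1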
4 changes: 2 additions & 2 deletions src/jobspy/scrapers/indeed/__init__.py
@@ -306,8 +306,8 @@ def _get_compensation(job: dict) -> Compensation | None:
max_range = comp["range"].get("max")
return Compensation(
interval=interval,
min_amount=round(min_range, 2) if min_range is not None else None,
max_amount=round(max_range, 2) if max_range is not None else None,
min_amount=int(min_range) if min_range is not None else None,
max_amount=int(max_range) if max_range is not None else None,
currency=job["compensation"]["currencyCode"],
)

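The Indeed change swaps round(value, 2) for int(value), so compensation bounds come back as whole-dollar integers instead of floats with cents; a quick illustration with a made-up payload value:

min_range = 52500.75        # hypothetical value from the compensation range
print(round(min_range, 2))  # 52500.75 -> float with cents (old behavior)
print(int(min_range))       # 52500    -> whole-dollar int, truncated toward zero (new behavior)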
55 changes: 23 additions & 32 deletions src/jobspy/scrapers/linkedin/__init__.py
@@ -69,7 +69,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
self.scraper_input = scraper_input
job_list: list[JobPost] = []
seen_urls = set()
seen_ids = set()
page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
request_count = 0
seconds_old = (
@@ -133,25 +133,24 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
return JobResponse(jobs=job_list)

for job_card in job_cards:
job_url = None
href_tag = job_card.find("a", class_="base-card__full-link")
if href_tag and "href" in href_tag.attrs:
href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1]
job_url = f"{self.base_url}/jobs/view/{job_id}"

if job_url in seen_urls:
continue
seen_urls.add(job_url)
try:
fetch_desc = scraper_input.linkedin_fetch_description
job_post = self._process_job(job_card, job_url, fetch_desc)
if job_post:
job_list.append(job_post)
if not continue_search():
break
except Exception as e:
raise LinkedInException(str(e))

if job_id in seen_ids:
continue
seen_ids.add(job_id)

try:
fetch_desc = scraper_input.linkedin_fetch_description
job_post = self._process_job(job_card, job_id, fetch_desc)
if job_post:
job_list.append(job_post)
if not continue_search():
break
except Exception as e:
raise LinkedInException(str(e))

if continue_search():
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
@@ -161,7 +160,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
return JobResponse(jobs=job_list)

def _process_job(
self, job_card: Tag, job_url: str, full_descr: bool
self, job_card: Tag, job_id: str, full_descr: bool
) -> Optional[JobPost]:
salary_tag = job_card.find("span", class_="job-search-card__salary-info")

@@ -208,16 +207,16 @@ def _process_job(
date_posted = None
job_details = {}
if full_descr:
job_details = self._get_job_details(job_url)
job_details = self._get_job_details(job_id)

return JobPost(
id=self._get_id(job_url),
id=job_id,
title=title,
company_name=company,
company_url=company_url,
location=location,
date_posted=date_posted,
job_url=job_url,
job_url=f"{self.base_url}/jobs/view/{job_id}",
compensation=compensation,
job_type=job_details.get("job_type"),
description=job_details.get("description"),
@@ -227,24 +226,16 @@ def _process_job(
job_function=job_details.get("job_function"),
)

def _get_id(self, url: str):
"""
Extracts the job id from the job url
:param url:
:return: str
"""
if not url:
return None
return url.split("/")[-1]

def _get_job_details(self, job_page_url: str) -> dict:
def _get_job_details(self, job_id: str) -> dict:
"""
Retrieves job description and other job details by going to the job page url
:param job_page_url:
:return: dict
"""
try:
response = self.session.get(job_page_url, timeout=5)
response = self.session.get(
f"{self.base_url}/jobs-guest/jobs/api/jobPosting/{job_id}", timeout=5
)
response.raise_for_status()
except:
return {}
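The LinkedIn refactor keys deduplication on the job id instead of the rebuilt URL, and fetches details from the guest jobPosting endpoint rather than the job page itself. A small sketch of the id handling, assuming base_url is https://www.linkedin.com and using a hypothetical card href:

seen_ids: set = set()
base_url = "https://www.linkedin.com"

# Hypothetical href from a job card's "base-card__full-link" anchor.
href = f"{base_url}/jobs/view/data-engineer-at-acme-3871234567?refId=abc"
job_id = href.split("?")[0].split("-")[-1]  # -> "3871234567"

if job_id not in seen_ids:
    seen_ids.add(job_id)
    # The public job URL is rebuilt from the id, as in _process_job above ...
    job_url = f"{base_url}/jobs/view/{job_id}"
    # ... while the description fetch now hits the guest API endpoint directly.
    details_url = f"{base_url}/jobs-guest/jobs/api/jobPosting/{job_id}"
    print(job_url, details_url)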
58 changes: 58 additions & 0 deletions src/jobspy/scrapers/utils.py
@@ -178,3 +178,61 @@ def currency_parser(cur_str):
num = float(cur_str)

return np.round(num, 2)


def remove_attributes(tag):
for attr in list(tag.attrs):
del tag[attr]
return tag


def extract_salary(
salary_str,
lower_limit=1000,
upper_limit=700000,
hourly_threshold=350,
monthly_threshold=30000,
):
if not salary_str:
return None, None, None, None

min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)"

def to_int(s):
return int(float(s.replace(",", "")))

def convert_hourly_to_annual(hourly_wage):
return hourly_wage * 2080

def convert_monthly_to_annual(monthly_wage):
return monthly_wage * 12

match = re.search(min_max_pattern, salary_str)

if match:
min_salary = to_int(match.group(1))
max_salary = to_int(match.group(3))
# Handle 'k' suffix for min and max salaries independently
if "k" in match.group(2).lower() or "k" in match.group(4).lower():
min_salary *= 1000
max_salary *= 1000

# Convert to annual if less than the hourly threshold
        if min_salary < hourly_threshold:
            min_salary = convert_hourly_to_annual(min_salary)
            if max_salary < hourly_threshold:
                max_salary = convert_hourly_to_annual(max_salary)

        elif min_salary < monthly_threshold:
            min_salary = convert_monthly_to_annual(min_salary)
            if max_salary < monthly_threshold:
                max_salary = convert_monthly_to_annual(max_salary)

# Ensure salary range is within specified limits
if (
lower_limit <= min_salary <= upper_limit
and lower_limit <= max_salary <= upper_limit
and min_salary < max_salary
):
return "yearly", min_salary, max_salary, "USD"
return None, None, None, None

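A few examples of how the new extract_salary classifies matches under its default thresholds (the description strings are made up, and the package is assumed to be installed as python-jobspy):

from jobspy.scrapers.utils import extract_salary

# Bounds under the $350 hourly threshold are read as hourly rates (x2080).
print(extract_salary("Pay: $55 - $65 per hour"))
# ('yearly', 114400, 135200, 'USD')

# Bounds under the $30,000 monthly threshold are read as monthly pay (x12).
print(extract_salary("Comp: $4,500 - $5,500 per month"))
# ('yearly', 54000, 66000, 'USD')

# A 'k' suffix scales both bounds by 1000; already annual, so no conversion.
print(extract_salary("Base salary of $90k - $120k plus equity"))
# ('yearly', 90000, 120000, 'USD')

# No dollar range, or a range outside the 1,000 to 700,000 sanity limits: all None.
print(extract_salary("Competitive compensation"))
# (None, None, None, None)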