diff --git a/.gitignore b/.gitignore
index ae0ffe8..2f72680 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,9 @@
 .ipynb_checkpoints/
 __pycache__/
+archive/
 .env
 test.py
-src/test.csv
+test.csv
 *.ipynb
-*.log
-*.zip
diff --git a/alembic.ini b/alembic.ini
new file mode 100644
index 0000000..e8b6a7d
--- /dev/null
+++ b/alembic.ini
@@ -0,0 +1,110 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+script_location = alembic
+
+# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
+# Uncomment the line below if you want the files to be prepended with date and time
+# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
+# for all available tokens
+# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
+
+# sys.path path, will be prepended to sys.path if present.
+# defaults to the current working directory.
+prepend_sys_path = .
+
+# timezone to use when rendering the date within the migration file
+# as well as the filename.
+# If specified, requires the python-dateutil library that can be
+# installed by adding `alembic[tz]` to the pip requirements
+# string value is passed to dateutil.tz.gettz()
+# leave blank for localtime
+# timezone =
+
+# max length of characters to apply to the
+# "slug" field
+# truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; This defaults
+# to alembic/versions.  When using multiple version
+# directories, initial revisions must be specified with --version-path.
+# The path separator used here should be the separator specified by "version_path_separator" below.
+# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
+
+# version path separator; As mentioned above, this is the character used to split
+# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
+# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
+# Valid values for version_path_separator are:
+#
+# version_path_separator = :
+# version_path_separator = ;
+# version_path_separator = space
+version_path_separator = os  # Use os.pathsep. Default configuration used for new projects.
+
+# set to 'true' to search source files recursively
+# in each "version_locations" directory
+# new in Alembic version 1.10
+# recursive_version_locations = false
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+sqlalchemy.url = sqlite:///database/jobs.db
+
+
+[post_write_hooks]
+# post_write_hooks defines scripts or Python functions that are run
+# on newly generated revision scripts.  See the documentation for further
+# detail and examples
+
+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/alembic/README b/alembic/README
new file mode 100644
index 0000000..98e4f9c
--- /dev/null
+++ b/alembic/README
@@ -0,0 +1 @@
+Generic single-database configuration.
\ No newline at end of file
diff --git a/alembic/env.py b/alembic/env.py
new file mode 100644
index 0000000..56f2324
--- /dev/null
+++ b/alembic/env.py
@@ -0,0 +1,78 @@
+from logging.config import fileConfig
+
+from sqlalchemy import engine_from_config
+from sqlalchemy import pool
+
+from alembic import context
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+# add your model's MetaData object here
+# for 'autogenerate' support
+from database.models import Base
+target_metadata = Base.metadata
+# target_metadata = None
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode.
+
+    This configures the context with just a URL
+    and not an Engine, though an Engine is acceptable
+    here as well.  By skipping the Engine creation
+    we don't even need a DBAPI to be available.
+
+    Calls to context.execute() here emit the given string to the
+    script output.
+
+    """
+    url = config.get_main_option("sqlalchemy.url")
+    context.configure(
+        url=url,
+        target_metadata=target_metadata,
+        literal_binds=True,
+        dialect_opts={"paramstyle": "named"},
+    )
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online() -> None:
+    """Run migrations in 'online' mode.
+
+    In this scenario we need to create an Engine
+    and associate a connection with the context.
+
+    """
+    connectable = engine_from_config(
+        config.get_section(config.config_ini_section, {}),
+        prefix="sqlalchemy.",
+        poolclass=pool.NullPool,
+    )
+
+    with connectable.connect() as connection:
+        context.configure(
+            connection=connection, target_metadata=target_metadata
+        )
+
+        with context.begin_transaction():
+            context.run_migrations()
+
+
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    run_migrations_online()
diff --git a/alembic/script.py.mako b/alembic/script.py.mako
new file mode 100644
index 0000000..55df286
--- /dev/null
+++ b/alembic/script.py.mako
@@ -0,0 +1,24 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision = ${repr(up_revision)}
+down_revision = ${repr(down_revision)}
+branch_labels = ${repr(branch_labels)}
+depends_on = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+    ${downgrades if downgrades else "pass"}
diff --git a/alembic/versions/80a4b5684683_create_jobs_table.py b/alembic/versions/80a4b5684683_create_jobs_table.py
new file mode 100644
index 0000000..b48218b
--- /dev/null
+++ b/alembic/versions/80a4b5684683_create_jobs_table.py
@@ -0,0 +1,40 @@
+"""create jobs table
+
+Revision ID: 80a4b5684683
+Revises:
+Create Date: 2023-04-15 05:42:29.021389
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '80a4b5684683'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('jobs',
+    sa.Column('id', sa.Integer(), nullable=False),
+    sa.Column('role', sa.String(), nullable=False),
+    sa.Column('experience_level', sa.String(), nullable=True),
+    sa.Column('location', sa.String(), nullable=False),
+    sa.Column('url', sa.String(), nullable=False),
+    sa.Column('company', sa.String(), nullable=False),
+    sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.text("(datetime('now'))"), nullable=False),
+    sa.Column('qualify_for', sa.Boolean(), server_default=sa.text('0'), nullable=False),
+    sa.Column('sent', sa.Boolean(), server_default=sa.text('0'), nullable=False),
+    sa.PrimaryKeyConstraint('id'),
+    sa.UniqueConstraint('url')
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table('jobs')
+    # ### end Alembic commands ###
diff --git a/alembic/versions/ae98339276a4_rename_sent_column.py b/alembic/versions/ae98339276a4_rename_sent_column.py
new file mode 100644
index 0000000..0ff2bb2
--- /dev/null
+++ b/alembic/versions/ae98339276a4_rename_sent_column.py
@@ -0,0 +1,30 @@
+"""rename sent column
+
+Revision ID: ae98339276a4
+Revises: 80a4b5684683
+Create Date: 2023-04-15 05:44:29.353001
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'ae98339276a4'
+down_revision = '80a4b5684683'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('jobs', sa.Column('is_sent', sa.Boolean(), server_default=sa.text('0'), nullable=False))
+    op.drop_column('jobs', 'sent')
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('jobs', sa.Column('sent', sa.BOOLEAN(), server_default=sa.text('0'), nullable=False))
+    op.drop_column('jobs', 'is_sent')
+    # ### end Alembic commands ###
diff --git a/chromedriver.exe b/chromedriver.exe
deleted file mode 100644
index 4a64091..0000000
Binary files a/chromedriver.exe and /dev/null differ
diff --git a/database/jobs.db b/database/jobs.db
index e69de29..24899a3 100644
Binary files a/database/jobs.db and b/database/jobs.db differ
diff --git a/database/models.py b/database/models.py
new file mode 100644
index 0000000..5142a09
--- /dev/null
+++ b/database/models.py
@@ -0,0 +1,31 @@
+from sqlalchemy import TIMESTAMP, Boolean, Column, String, Integer, text
+from sqlalchemy.sql import expression
+from sqlalchemy.ext.declarative import declarative_base
+
+
+Base = declarative_base()
+
+# class Company(Base):
+#     __tablename__ = 'companies'
+
+#     id = Column(Integer, primary_key=True)
+#     name = Column(Text, nullable=False)
+
+class Job(Base):
+
+    __tablename__ = 'jobs'
+
+    id = Column(Integer, primary_key=True, nullable=False)
+    role = Column(String, nullable=False)
+    experience_level = Column(String, nullable=True)
+    location = Column(String, nullable=False)
+    url = Column(String, nullable=False, unique=True)
+    company = Column(String, nullable=False)
+    created_at = Column(TIMESTAMP(timezone=True), nullable=False, server_default=text("datetime('now')"))
+    qualify_for = Column(Boolean, nullable=False, server_default=expression.false())
+    is_sent = Column(Boolean, nullable=False, server_default=expression.false())
+
+
+
+
+
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000..7607659
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,17 @@
+COMPANIES = ('spotify',)
+
+
+
+KEYWORDS_MAP = {
+    'spotify': ('Associate', 'Engineer'),
+    'zalando': ('Engineer',),
+    'hellofresh': ('Junior', 'Engineer'),
+    }
+
+
+
+URLS_MAP = {
+    'spotify': "https://www.lifeatspotify.com/jobs",
+    'zalando': "https://jobs.zalando.com/en/jobs",
+    'hellofresh': "https://careers.hellofresh.com/global/en/search-results"
+    }
\ No newline at end of file
diff --git a/src/helper_functions.py b/src/helper_functions.py
deleted file mode 100644
index 0f556cb..0000000
--- a/src/helper_functions.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import os
-import pandas as pd
-
-import sqlite3
-
-from browsers.spotify_browser import SpotifyBrowser
-from browsers.zalando_browser import ZalandoBrowser
-from browsers.hellofresh_browser import HellofreshBrowser
-
-
-def scrape_all_jobs(company: str, url):
-    browser_class_name = company.capitalize() + 'Browser'
-    browser = eval(browser_class_name)(url)
-
-    browser.load_all_jobs()
-    job_info = browser.scrape_all_jobs()
-
-    browser.close_browser()
-    return job_info
-
-def connect_to_database():
-    try:
-        conn = sqlite3.connect('database/jobs.db')
-        print('Database connection iniialized')
-    except sqlite3.Error as error:
-        conn = None
-        print('Error occurred: ', error)
-    return conn
-
-def query_database(conn, query=None):
-    try:
-        cursor = conn.cursor()
-        cursor.execute(query)
-        result = cursor.fetchall()
-    except sqlite3.Error as error:
-        print('Error occurred: ', error)
-        result = None
-    finally:
-        if conn:
-            conn.close()
-            print('Database connection closed')
-    return result
-
-def export_jobs_to_file(job_dict, path_to_file):
-    job_df = pd.DataFrame(job_dict)
-
-    print(f'exporting data to {path_to_file} ...')
-    job_df.to_csv(path_to_file, index=False)
-    return job_df
-
-
-def filter_jobs(df, path_to_file, *args, type):
-    filtered_df = df
-
-    if type=='zalando':
-        keywords = 'Apprenticeship|Graduate|Entry|Intern|Trainee'
-        filtered_df = filtered_df[filtered_df.exp_level.str.contains(keywords)]
-    for keyword in args:
-        filtered_df = filtered_df[filtered_df.role.str.contains(keyword)]
-
-    filtered_df.to_csv(path_to_file, index=False)
-    return
-
-
-def get_file_path(file_name):
-    dir = 'output_files/'
-    if not os.path.exists(dir):
-        os.mkdir(dir)
-
-    path_to_file = os.path.join(dir, file_name)
-    return path_to_file
diff --git a/src/run_telegram_bot.py b/src/run_telegram_bot.py
index 27c155f..8e1cd9a 100644
--- a/src/run_telegram_bot.py
+++ b/src/run_telegram_bot.py
@@ -1,88 +1,24 @@
-import pandas as pd
-import requests
 import time

+from utils.database_helper_functions import connect_to_database
+from utils.telegram_helper_functions import get_latest_jobs, send_job_notification

-companies = ['spotify', 'zalando']
-
-def fetch_dataframe(file_name):
-    file_path = f'output_files/{file_name}'
-
-    while True:
-        try:
-            df = pd.read_csv(file_path, usecols=['role', 'location', 'url'])
-            break
-
-        except FileNotFoundError:
-            job_header = ['role', 'location', 'url']
-            pd.DataFrame(columns=job_header).to_csv(file_path, index=False)
-            continue
-
-    return df
-
-def update_cache(cache_file, roles, locations, urls):
-    new_job_info = {}
-
-    new_job_info['role'] = roles
-    new_job_info['location'] = locations
-    new_job_info['url'] = urls
-
-    new_df = pd.DataFrame(new_job_info)
-    new_df.to_csv(f'output_files/{cache_file}', index=False, mode='a', header=False)
-    return
-
-def generate_messages(df, cache_df, cache_file):
-
-    cache_indx = cache_df.set_index(['role', 'location']).index
-
-    latest_jobs = []
-
-    new_roles = []
-    new_urls = []
-    new_locations = []
-
-    for row in range(len(df)):
-        role = df.loc[row, 'role']
-        location = df.loc[row, 'location']
-
-        if (role, location) not in cache_indx:
-            url = df.loc[row, 'url']
-            job = f'Role - {role}\nLocation - {location}\nURL - {url}'
-            job = job.replace('&', '%26')
-
-            latest_jobs.append(job)
-
-            new_roles.append(role)
-            new_locations.append(location)
-            new_urls.append(url)
-
-    if len(new_roles)!=0 and len(new_locations)!=0 and len(new_urls)!=0:
-        update_cache(cache_file=cache_file, roles=new_roles, locations=new_locations, urls=new_urls)
-
-    return latest_jobs
-
-
-def send_message(message):
-    # print(message)
-
-    url_prefix = 'https://api.telegram.org/bot5550807059:AAEFaAQ53OyWQpz23dsVWDwkKpRx-xz36T4/sendMessage'
-    url_query = f'?chat_id=-776374127&text={message}'
-
-    message_url = url_prefix + url_query
-    requests.get(message_url)


 if __name__ == '__main__':
-    for company in companies:
-        found_jobs = f'filtered_{company}-jobs.csv'
-        cached_jobs = f'cached_{company}-jobs.csv'
+    conn = connect_to_database()

-        new_jobs_df = fetch_dataframe(found_jobs)
-        cached_df = fetch_dataframe(cached_jobs)
+    try:
+        latest_jobs = get_latest_jobs(conn)
+
+        for job in latest_jobs:
+            send_job_notification(conn=conn, job=job)
+            time.sleep(5)

-        latest_jobs = generate_messages(new_jobs_df, cached_df, cached_jobs)
+    except Exception as e:
+        raise e

-        for job in latest_jobs:
-            send_message(message=job)
-            time.sleep(5)
\ No newline at end of file
+    finally:
+        conn.close()
+        print('Database connection closed')
\ No newline at end of file
diff --git a/src/scrape_jobs.py b/src/scrape_jobs.py
index ced313a..a595e9c 100644
--- a/src/scrape_jobs.py
+++ b/src/scrape_jobs.py
@@ -1,44 +1,29 @@
 import chromedriver_autoinstaller

-from helper_functions import *
+from utils.scraper_helper_functions import scrape_webpage
+from utils.database_helper_functions import connect_to_database
+from config import *

-companies = ('spotify', 'zalando',)
-
-keywords_dict = {
-    'spotify': ('Associate', 'Engineer'),
-    'zalando': ('Engineer'),
-    'hellofresh': ('Junior', 'Engineer'),
-    }
-
-urls_dict = {
-    'spotify': "https://www.lifeatspotify.com/jobs",
-    'zalando': "https://jobs.zalando.com/en/jobs",
-    'hellofresh': "https://careers.hellofresh.com/global/en/search-results"
-    }
-
-def scrape_webpage(company, url, file_name, keywords):
-    job_dict = scrape_all_jobs(company, url)
-
-    path_to_file = get_file_path(file_name)
-    job_df = export_jobs_to_file(job_dict, path_to_file)
-    print('export completed... 100%')
-
-    filtered_path = get_file_path('filtered_' + file_name)
-    filter_jobs(job_df, filtered_path, *keywords, type=company)


 if __name__ == '__main__':
     chromedriver_autoinstaller.install()
     conn = connect_to_database()
-    conn.close()
-
-    for company in companies:
-        keywords = keywords_dict[company]
-        url = urls_dict[company]
-        file_name = f'{company}-jobs.csv'
-
-        print(f'scraping {company} job site ...')
-        scrape_webpage(company, url, file_name, keywords)
-
-        print('scrape job completed... 100%')
\ No newline at end of file
+
+    try:
+        for company in COMPANIES:
+            keywords = KEYWORDS_MAP[company]
+            url = URLS_MAP[company]
+            file_name = f'{company}-jobs.csv'
+
+            print(f'scraping {company} job site ...')
+            scrape_webpage(company, url, conn, keywords)
+            print('scrape job completed... 100%')
+
+    except Exception as e:
+        print(e)
+
+    finally:
+        conn.close()
+        print('Database connection closed')
\ No newline at end of file
diff --git a/src/browser_base_class.py b/src/utils/browsers/browser_base_class.py
similarity index 96%
rename from src/browser_base_class.py
rename to src/utils/browsers/browser_base_class.py
index 6e46d9d..f7e961c 100644
--- a/src/browser_base_class.py
+++ b/src/utils/browsers/browser_base_class.py
@@ -1,25 +1,25 @@
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-
-import time
-
-
-class Browser():
-    OPTIONS = Options()
-    OPTIONS.add_argument('--no-sandbox')
-    OPTIONS.add_argument('--window-size=1420,1080')
-    OPTIONS.add_argument('--headless')
-    OPTIONS.add_argument('--disable-gpu')
-
-    # DRIVER_PATH = './chromedriver.exe'
-
-    def __init__(self, url) -> None:
-        self.url = url
-        self.browser = webdriver.Chrome(options=self.OPTIONS)
-        self.browser.get(self.url)
-        time.sleep(5)
-        print('webpage opened in background')
-
-    def close_browser(self):
-        self.browser.close()
-
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+import time
+
+
+class Browser():
+    OPTIONS = Options()
+    OPTIONS.add_argument('--no-sandbox')
+    OPTIONS.add_argument('--window-size=1420,1080')
+    OPTIONS.add_argument('--headless')
+    OPTIONS.add_argument('--disable-gpu')
+
+    # DRIVER_PATH = './chromedriver.exe'
+
+    def __init__(self, url) -> None:
+        self.url = url
+        self.browser = webdriver.Chrome(options=self.OPTIONS)
+        self.browser.get(self.url)
+        time.sleep(5)
+        print('webpage opened in background')
+
+    def close_browser(self):
+        self.browser.close()
+
diff --git a/src/browsers/hellofresh_browser.py b/src/utils/browsers/hellofresh_browser.py
similarity index 97%
rename from src/browsers/hellofresh_browser.py
rename to src/utils/browsers/hellofresh_browser.py
index 4b7f0ba..e296cbf 100644
--- a/src/browsers/hellofresh_browser.py
+++ b/src/utils/browsers/hellofresh_browser.py
@@ -1,4 +1,4 @@
-from browser_base_class import Browser
+from browsers.browser_base_class import Browser

 from selenium.webdriver.support import expected_conditions
 from selenium.webdriver.common.by import By
diff --git a/src/browsers/spotify_browser.py b/src/utils/browsers/spotify_browser.py
similarity index 95%
rename from src/browsers/spotify_browser.py
rename to src/utils/browsers/spotify_browser.py
index 62d950a..31f3f09 100644
--- a/src/browsers/spotify_browser.py
+++ b/src/utils/browsers/spotify_browser.py
@@ -1,66 +1,67 @@
-from browser_base_class import Browser
-
-from selenium.webdriver.support import expected_conditions
-from selenium.webdriver.common.by import By
-from selenium.common.exceptions import WebDriverException
-from selenium.webdriver.support.ui import WebDriverWait
-
-import time
-
-
-class SpotifyBrowser(Browser):
-
-    def __init__(self, url) -> None:
-        super().__init__(url=url)
-
-
-    def load_all_jobs(self):
-        WebDriverWait(self.browser, 20).until(expected_conditions.presence_of_element_located((By.XPATH, '//button')))
-        all_buttons = {
-            button.text:button
-            for button in self.browser.find_elements(By.XPATH, '//button')
-        }
-
-        print('loading all jobs...')
-        while True:
-            try:
-                load_more_button = all_buttons['Load more jobs']
-
-                self.browser.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
-                load_more_button.click()
-
-                WebDriverWait(self.browser, 20).until(expected_conditions.presence_of_element_located((By.XPATH, '//button')))
-                all_buttons = {
-                    button.text:button
-                    for button in self.browser.find_elements(By.XPATH, '//button')
-                }
-
-            except KeyError:
-                print('all jobs loaded...')
-                break
-            except WebDriverException as error:
-                print('page crashed')
-                print('Error: ', error)
-                time.sleep(3)
-
-
-    def scrape_all_jobs(self):
-        job_info = {}
-        roles = []
-        urls = []
-        locations = []
-
-        job_item = self.browser.find_elements(By.XPATH, '//div[@class="mb-xxxs mb-mobile-xxs entry_cols__3vENU entry_header__2Rw2O"]/a')
-        for job in job_item:
-            roles.append(job.text)
-            urls.append(job.get_attribute('href'))
-
-        job_locations = self.browser.find_elements(By.XPATH, '//div[@class="mb-xxxs mb-mobile-xxs entry_cols__3vENU entry_header__2Rw2O"]/p')
-        for location in job_locations:
-            locations.append(location.text)
-
-        job_info['role'] = roles
-        job_info['location'] = locations
-        job_info['url'] = urls
-
-        return (job_info)
+from .browser_base_class import Browser
+
+from selenium.webdriver.support import expected_conditions
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import WebDriverException
+from selenium.webdriver.support.ui import WebDriverWait
+
+import time
+
+
+class SpotifyBrowser(Browser):
+
+    def __init__(self, url) -> None:
+        super().__init__(url=url)
+
+
+    def load_all_jobs(self):
+        WebDriverWait(self.browser, 20).until(expected_conditions.presence_of_element_located((By.XPATH, '//button')))
+        all_buttons = {
+            button.text:button
+            for button in self.browser.find_elements(By.XPATH, '//button')
+        }
+
+        print('loading all jobs...')
+        while True:
+            try:
+                load_more_button = all_buttons['Load more jobs']
+
+                self.browser.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
+                load_more_button.click()
+
+                WebDriverWait(self.browser, 20).until(expected_conditions.presence_of_element_located((By.XPATH, '//button')))
+                all_buttons = {
+                    button.text:button
+                    for button in self.browser.find_elements(By.XPATH, '//button')
+                }
+
+            except KeyError:
+                print('all jobs loaded...')
+                break
+            except WebDriverException as error:
+                print('page crashed')
+                print('Error: ', error)
+                time.sleep(3)
+
+
+    def scrape_all_jobs(self):
+        job_info = {}
+        roles = []
+        urls = []
+        locations = []
+
+        job_item = self.browser.find_elements(By.XPATH, '//div[@class="mb-xxxs mb-mobile-xxs entry_cols__3vENU entry_header__2Rw2O"]/a')
+        for job in job_item:
+            roles.append(job.text)
+            urls.append(job.get_attribute('href'))
+
+        job_locations = self.browser.find_elements(By.XPATH, '//div[@class="mb-xxxs mb-mobile-xxs entry_cols__3vENU entry_header__2Rw2O"]/p')
+        for location in job_locations:
+            locations.append(location.text)
+
+        job_info['role'] = roles
+        job_info['location'] = locations
+        job_info['url'] = urls
+        job_info['company'] = ['spotify'] * len(urls)
+
+        return (job_info)
diff --git a/src/browsers/wayfair_browser.py b/src/utils/browsers/wayfair_browser.py
similarity index 100%
rename from src/browsers/wayfair_browser.py
rename to src/utils/browsers/wayfair_browser.py
diff --git a/src/browsers/zalando_browser.py b/src/utils/browsers/zalando_browser.py
similarity index 95%
rename from src/browsers/zalando_browser.py
rename to src/utils/browsers/zalando_browser.py
index bcdfc42..bc67700 100644
--- a/src/browsers/zalando_browser.py
+++ b/src/utils/browsers/zalando_browser.py
@@ -1,74 +1,75 @@
-from browser_base_class import Browser
-
-from selenium.webdriver.support import expected_conditions
-from selenium.webdriver.common.by import By
-from selenium.common.exceptions import WebDriverException
-from selenium.webdriver.support.ui import WebDriverWait
-
-import time
-
-
-class ZalandoBrowser(Browser):
-
-    def __init__(self, url) -> None:
-        super().__init__(url=url)
-        self.max_page = 1
-
-
-    def load_all_jobs(self):
-        max_page = max(set(
-            int(num.text)
-            for num in self.browser.find_elements(By.XPATH, "//li[@class='pagination__button']")
-        ))
-        self.max_page = max_page
-
-    def scrape_all_jobs(self):
-
-        job_info = {}
-        roles = []
-        urls = []
-        locations = []
-        exps = []
-
-        curr_page = 1
-        while curr_page <= self.max_page:
-            job_titles = self.browser.find_elements(By.XPATH, "//div[@class='card--job-result__title-container']/span[@class='card--job-result__title']")
-            for job in job_titles:
-                roles.append(job.text)
-
-            job_locations = self.browser.find_elements(By.XPATH, '//div[@class="card--job-result__locations-container"]')
-            for location in job_locations:
-                locations.append(location.text)
-
-            links = [link_item.get_attribute('href') for link_item in self.browser.find_elements(By.XPATH, "//li[@class='card-outer']/a")]
-            for link in links:
-                urls.append(link)
-
-                while True:
-                    try:
-                        self.browser.get(link)
-                        time.sleep(2)
-
-                        experience_level = [elem.text.split('\n')[1] for elem in self.browser.find_elements(By.XPATH, '//div[h2[contains(text(),"Experience Level")]]')]
-                        exps.extend(experience_level)
-
-                        break
-                    except IndexError:
-                        continue
-
-            if curr_page==self.max_page:
-                break
-
-            curr_page+=1
-            next_page = f"https://jobs.zalando.com/en/jobs?page={curr_page}"
-            self.browser.get(next_page)
-            time.sleep(5)
-
-        job_info['role'] = roles
-        job_info['exp_level'] = exps
-        job_info['location'] = locations
-        job_info['url'] = urls
-
-        return job_info
-
-
+from browsers.browser_base_class import Browser
+
+from selenium.webdriver.support import expected_conditions
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import WebDriverException
+from selenium.webdriver.support.ui import WebDriverWait
+
+import time
+
+
+class ZalandoBrowser(Browser):
+
+    def __init__(self, url) -> None:
+        super().__init__(url=url)
+        self.max_page = 1
+
+
+    def load_all_jobs(self):
+        max_page = max(set(
+            int(num.text)
+            for num in self.browser.find_elements(By.XPATH, "//li[@class='pagination__button']")
+        ))
+        self.max_page = max_page
+
+    def scrape_all_jobs(self):
+
+        job_info = {}
+        roles = []
+        urls = []
+        locations = []
+        exps = []
+
+        curr_page = 1
+        while curr_page <= self.max_page:
+            job_titles = self.browser.find_elements(By.XPATH, "//div[@class='card--job-result__title-container']/span[@class='card--job-result__title']")
+            for job in job_titles:
+                roles.append(job.text)
+
+            job_locations = self.browser.find_elements(By.XPATH, '//div[@class="card--job-result__locations-container"]')
+            for location in job_locations:
+                locations.append(location.text)
+
+            links = [link_item.get_attribute('href') for link_item in self.browser.find_elements(By.XPATH, "//li[@class='card-outer']/a")]
+            for link in links:
+                urls.append(link)
+
+                while True:
+                    try:
+                        self.browser.get(link)
+                        time.sleep(2)
+
+                        experience_level = [elem.text.split('\n')[1] for elem in self.browser.find_elements(By.XPATH, '//div[h2[contains(text(),"Experience Level")]]')]
+                        exps.extend(experience_level)
+
+                        break
+                    except IndexError:
+                        continue
+
+            if curr_page==self.max_page:
+                break
+
+            curr_page+=1
+            next_page = f"https://jobs.zalando.com/en/jobs?page={curr_page}"
+            self.browser.get(next_page)
+            time.sleep(5)
+
+        job_info['role'] = roles
+        job_info['exp_level'] = exps
+        job_info['location'] = locations
+        job_info['url'] = urls
+        job_info['company'] = ['zalando'] * len(urls)
+
+        return job_info
+
+
diff --git a/src/utils/database_helper_functions.py b/src/utils/database_helper_functions.py
new file mode 100644
index 0000000..55f7fe6
--- /dev/null
+++ b/src/utils/database_helper_functions.py
@@ -0,0 +1,38 @@
+import sqlite3
+
+from sqlalchemy import create_engine
+
+
+
+def get_database():
+    conn = create_engine('sqlite:///database/jobs.db')
+
+def connect_to_database():
+    try:
+        conn = sqlite3.connect('database/jobs.db')
+        print('Database connection initialized')
+
+    except sqlite3.Error as error:
+        conn = None
+        print('Error occurred: ', error)
+
+    return conn
+
+
+def query_database(conn, type, query=None):
+    try:
+        cursor = conn.cursor()
+        cursor.execute(query)
+
+        if type in ('insert', 'update'):
+            conn.commit()
+
+        else:
+            result = cursor.fetchall()
+            return result
+
+    except Exception as error:
+        if conn:
+            conn.close()
+            print('Database connection closed')
+        raise error
diff --git a/src/utils/scraper_helper_functions.py b/src/utils/scraper_helper_functions.py
new file mode 100644
index 0000000..49b6381
--- /dev/null
+++ b/src/utils/scraper_helper_functions.py
@@ -0,0 +1,88 @@
+from utils.browsers.spotify_browser import SpotifyBrowser
+from utils.browsers.zalando_browser import ZalandoBrowser
+from utils.browsers.hellofresh_browser import HellofreshBrowser
+from utils.database_helper_functions import query_database
+
+
+
+def update_qualified_jobs(conn, keywords, company):
+
+    keyword_clause = ' AND '.join([f"role LIKE '%{keyword}%'" for keyword in keywords])
+    main_query = f'''
+        UPDATE
+            jobs
+        SET
+            qualify_for = True
+        WHERE
+            {keyword_clause}
+            AND company = '{company}'
+    '''
+
+    if company=='zalando':
+        exp_keywords = ('Apprenticeship', 'Graduate', 'Entry', 'Intern', 'Trainee')
+        exp_keyword_clause = ' OR '.join([f"experience_level LIKE '%{exp_keyword}%'" for exp_keyword in exp_keywords])
+
+        modify_query = f'''
+            {main_query}
+            AND ({exp_keyword_clause})
+        '''
+
+        print (modify_query)
+        query_database(conn=conn, type="update", query=modify_query)
+
+    else:
+        modify_query = main_query
+        print (modify_query)
+        query_database(conn=conn, type="update", query=modify_query)
+
+    return
+
+
+def insert_jobs_to_db(job_dict, conn):
+
+    columns = ", ".join(job_dict.keys())
+    values = [
+        f'''({', '.join(
+            (f'"{str(job_dict[key][i])}"'
+            for key in job_dict.keys())
+        )})'''
+
+        for i in range(len(list(job_dict.values())[0]))
+    ]
+
+    insert_query = f'''
+        INSERT OR IGNORE INTO
+            jobs ({columns})
+        VALUES
+            {', '.join(values)}
+    '''
+
+
+    # job_df = pd.DataFrame(job_dict)
+    print(f'inserting jobs to database ...')
+    query_database(conn=conn, type="insert", query=insert_query)
+    # job_df.to_sql('jobs', con=db_engine, if_exists='append', index=False)
+    print('insertion completed... 100%')
+
+    return
+
+
+def scrape_all_jobs(company: str, url) -> dict:
+
+    browser_class_name = company.capitalize() + 'Browser'
+    browser = eval(browser_class_name)(url)
+
+    browser.load_all_jobs()
+    job_info = browser.scrape_all_jobs()
+    browser.close_browser()
+
+    return job_info
+
+
+def scrape_webpage(company, url, conn, keywords):
+
+    job_dict = scrape_all_jobs(company, url)
+    insert_jobs_to_db(job_dict=job_dict, conn=conn)
+    update_qualified_jobs(conn=conn, keywords=keywords, company=company)
+
+    return
\ No newline at end of file
diff --git a/src/utils/telegram_helper_functions.py b/src/utils/telegram_helper_functions.py
new file mode 100644
index 0000000..d70f326
--- /dev/null
+++ b/src/utils/telegram_helper_functions.py
@@ -0,0 +1,60 @@
+import requests
+
+from utils.database_helper_functions import query_database
+
+
+
+def update_job_delivery(conn, url):
+    modify_query = f'''
+        UPDATE
+            jobs
+        SET
+            is_sent = True
+        WHERE
+            url = '{url}'
+    '''
+
+    query_database(conn=conn, type="update", query=modify_query)
+    return
+
+
+def send_job_notification(conn, job):
+    # print(message)
+    company:str = job[0]
+    role = job[1]
+    location = job[2]
+    url = job[3]
+
+    message = f'''
+        {company.capitalize()}\n
+        =====================\n\n
+
+        Role - {role}\n
+        Location - {location}\n
+        URL - {url}
+    '''.replace('&', '%26')
+
+    url_prefix = 'https://api.telegram.org/bot5550807059:AAEFaAQ53OyWQpz23dsVWDwkKpRx-xz36T4/sendMessage'
+    url_query = f'?chat_id=-776374127&text={message}'
+
+    message_url = url_prefix + url_query
+    requests.get(message_url)
+    update_job_delivery(conn=conn, url=url)
+
+    return
+
+
+def get_latest_jobs(conn):
+    select_query = f'''
+        SELECT
+            company, role, location, url
+        FROM
+            jobs
+        WHERE
+            qualify_for = True
+            AND is_sent = False
+    '''
+
+    latest_jobs = query_database(conn=conn, type="select", query=select_query)
+
+    return latest_jobs