Skip to content

Commit

Permalink
dockerize
Browse files Browse the repository at this point in the history
  • Loading branch information
topefolorunso committed Aug 10, 2022
1 parent e0fb3da commit beccb4e
Show file tree
Hide file tree
Showing 11 changed files with 423 additions and 577 deletions.
26 changes: 26 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
// https://github.com/microsoft/vscode-dev-containers/tree/v0.238.0/containers/docker-existing-dockerfile
{
"name": "Existing Dockerfile",

// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",

// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerFile": "../Dockerfile"

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Uncomment the next line to run commands after the container is created - for example installing curl.
// "postCreateCommand": "apt-get update && apt-get install -y curl",

// Uncomment when using a ptrace-based debugger like C++, Go, and Rust
// "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ],

// Uncomment to use the Docker CLI from inside the container. See https://aka.ms/vscode-remote/samples/docker-from-docker.
// "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ],

// Uncomment to connect as a non-root user if you've added one. See https://aka.ms/vscode-remote/containers/non-root.
// "remoteUser": "vscode"
}
32 changes: 32 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
FROM python:3.9

# ---- Google Chrome ----
# Add Google's signing key so apt trusts the Chrome repository.
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
# Register the stable Chrome repository.
RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list'
# Install Chrome, unzip (needed to unpack chromedriver) and cron in a single
# layer, then drop the apt lists to keep the image small.
RUN apt-get -y update && \
    apt-get install -y --no-install-recommends google-chrome-stable unzip cron && \
    rm -rf /var/lib/apt/lists/*

# ---- Chrome Driver ----
# Download the driver matching the latest released version, place it on PATH,
# and remove the archive in the same layer.
RUN wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ && \
    rm /tmp/chromedriver.zip

# Display port for the browser (headless setups conventionally use :99).
ENV DISPLAY=:99

COPY src src
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt
# Initial scrape at build time so the image ships with a first data set.
RUN python src/scrape_jobs.py
# Register the cron entries; invoke via sh in case the script lost its
# executable bit during COPY (bare `RUN src/schedule_cron.sh` would fail then).
RUN sh src/schedule_cron.sh

# BUG FIX: without a foreground process the container exits immediately after
# start and the cron entries registered above never fire. Run cron as PID 1.
CMD ["cron", "-f"]
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
# spotify-jobs


docker build -t scrape_jobs:v01 .

docker run --name scrape_jobs -it scrape_jobs:v01

docker exec -t -i scrape_jobs bash
Binary file added chromedriver.exe
Binary file not shown.
Binary file added chromedriver_win32.zip
Binary file not shown.
2 changes: 1 addition & 1 deletion output_files/filtered_spotify-jobs.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
role,location,url
"Associate Analytics Engineer, Direct Guaranteed, Ads R&D",New York or Remote Americas,https://www.lifeatspotify.com/jobs/associate-analytics-engineer-direct-guaranteed-ads-rd
"Associate Analytics Engineer, Direct Guaranteed, Ads R&D",,https://www.lifeatspotify.com/jobs/associate-analytics-engineer-direct-guaranteed-ads-rd
877 changes: 324 additions & 553 deletions output_files/spotify-jobs.csv

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
pandas
requests
selenium
4 changes: 1 addition & 3 deletions src/schedule_cron.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#!/bin/sh
# Register this container's scheduled jobs. The pipe REPLACES the current
# user's crontab wholesale (no `crontab -l` merge) — intended for a fresh
# container where no prior entries exist.
#
# 08:00, 12:00 and 18:00 daily: scrape jobs; 15 minutes later each time:
# run the telegram bot to report results. Paths are absolute (/src) because
# cron jobs do not inherit the build-time working directory.
(echo "0 8,12,18 * * * python /src/scrape_jobs.py" && echo "15 8,12,18 * * * python /src/run_telegram_bot.py") | crontab -
48 changes: 30 additions & 18 deletions src/scrape_functions.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
from doctest import FAIL_FAST
from textwrap import indent
import os
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException

import time


class Browser():
def __init__(self, url) -> None:
self.url = url

options = Options()
options.headless = True
options.add_argument('--no-sandbox')
options.add_argument('--window-size=1420,1080')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
self.browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', options=options)

self.browser = webdriver.Firefox(options=options)
self.browser.get(self.url)

time.sleep(5)
Expand All @@ -29,14 +33,22 @@ def load_all_jobs(self):
# n = 1

print('loading all jobs...')
while 'Load more jobs' in [button.text for button in all_buttons]:
# print(n)
# n+=1
for button in all_buttons:
if button.text == 'Load more jobs':
button.click()
all_buttons = self.browser.find_elements(By.XPATH, '//button')
print('all jobs loaded...')
while True:
try:
while 'Load more jobs' in [button.text for button in all_buttons]:
# print(n)
# n+=1
for button in all_buttons:
if button.text == 'Load more jobs':
button.click()
all_buttons = self.browser.find_elements(By.XPATH, '//button')
print('all jobs loaded...')
break
except WebDriverException as error:
print('page crashed')
print('Error: ', error)
time.sleep(3)


def scrape_all_jobs(self):
job_info = {}
Expand Down Expand Up @@ -72,7 +84,6 @@ def __init__(self, dict=None) -> None:
self.df = pd.DataFrame()

def add_column_to_dataframe(self, column, values):

self.df[column] = values

def export_to_file(self, path_to_file):
Expand Down Expand Up @@ -102,15 +113,16 @@ def filter_jobs(df, path_to_file, *args):
filtered_df = df

for keyword in args:
filtered_df = filtered_df[filtered_df.role.str.contains(keyword)]
filtered_df = filtered_df[filtered_df.role.astype(str).str.contains(keyword)]

filtered_df.to_csv(path_to_file, index=False)
return filtered_df


def get_file_path(file_name):
    """Return the path to *file_name* inside the output directory.

    Creates ``../output_files/`` on first use so callers can write to the
    returned path immediately.
    """
    # Renamed from `dir` (shadowed the builtin); makedirs(exist_ok=True)
    # avoids the race between an existence check and mkdir, and also the
    # crash mkdir would raise if the directory appeared in between.
    out_dir = '../output_files/'
    os.makedirs(out_dir, exist_ok=True)
    return os.path.join(out_dir, file_name)
1 change: 0 additions & 1 deletion src/scrape_jobs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from requests import get
from scrape_functions import *

def scrape_webpage(url, file_name, keywords):
Expand Down

0 comments on commit beccb4e

Please sign in to comment.