Skip to content

Commit

Permalink
dockerize
Browse files Browse the repository at this point in the history
  • Loading branch information
topefolorunso committed Aug 10, 2022
1 parent e0fb3da commit beccb4e
Show file tree
Hide file tree
Showing 11 changed files with 423 additions and 577 deletions.
26 changes: 26 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
// https://github.com/microsoft/vscode-dev-containers/tree/v0.238.0/containers/docker-existing-dockerfile
{
"name": "Existing Dockerfile",

// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",

// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerFile": "../Dockerfile"

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Uncomment the next line to run commands after the container is created - for example installing curl.
// "postCreateCommand": "apt-get update && apt-get install -y curl",

// Uncomment when using a ptrace-based debugger like C++, Go, and Rust
// "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ],

// Uncomment to use the Docker CLI from inside the container. See https://aka.ms/vscode-remote/samples/docker-from-docker.
// "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ],

// Uncomment to connect as a non-root user if you've added one. See https://aka.ms/vscode-remote/containers/non-root.
// "remoteUser": "vscode"
}
32 changes: 32 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
FROM python:3.9

# ---- Google Chrome ----
# Add Google's signing key so apt trusts the Chrome repository.
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
# Register the stable Chrome repository.
RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list'
# Install Chrome, unzip (needed to unpack chromedriver) and cron in a single
# layer, then drop the apt lists to keep the image small.
RUN apt-get -y update && \
    apt-get install -y --no-install-recommends google-chrome-stable unzip cron && \
    rm -rf /var/lib/apt/lists/*

# ---- Chrome Driver ----
# Download the driver matching the latest released version, place it on PATH,
# and remove the archive in the same layer.
RUN wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ && \
    rm /tmp/chromedriver.zip

# Display port for the browser (headless setups conventionally use :99).
ENV DISPLAY=:99

COPY src src
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt
# Initial scrape at build time so the image ships with a first data set.
RUN python src/scrape_jobs.py
# Register the cron entries; invoke via sh in case the script lost its
# executable bit during COPY (bare `RUN src/schedule_cron.sh` would fail then).
RUN sh src/schedule_cron.sh

# BUG FIX: without a foreground process the container exits immediately after
# start and the cron entries registered above never fire. Run cron as PID 1.
CMD ["cron", "-f"]
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
# spotify-jobs


docker build -t scrape_jobs:v01 .

docker run --name scrape_jobs -it scrape_jobs:v01

docker exec -t -i scrape_jobs bash
Binary file added chromedriver.exe
Binary file not shown.
Binary file added chromedriver_win32.zip
Binary file not shown.
2 changes: 1 addition & 1 deletion output_files/filtered_spotify-jobs.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
role,location,url
"Associate Analytics Engineer, Direct Guaranteed, Ads R&D",New York or Remote Americas,https://www.lifeatspotify.com/jobs/associate-analytics-engineer-direct-guaranteed-ads-rd
"Associate Analytics Engineer, Direct Guaranteed, Ads R&D",,https://www.lifeatspotify.com/jobs/associate-analytics-engineer-direct-guaranteed-ads-rd
877 changes: 324 additions & 553 deletions output_files/spotify-jobs.csv

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
pandas
requests
selenium
4 changes: 1 addition & 3 deletions src/schedule_cron.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#!/bin/sh
# Register this container's scheduled jobs. The pipe REPLACES the current
# user's crontab wholesale (no `crontab -l` merge) — intended for a fresh
# container where no prior entries exist.
#
# 08:00, 12:00 and 18:00 daily: scrape jobs; 15 minutes later each time:
# run the telegram bot to report results. Paths are absolute (/src) because
# cron jobs do not inherit the build-time working directory.
(echo "0 8,12,18 * * * python /src/scrape_jobs.py" && echo "15 8,12,18 * * * python /src/run_telegram_bot.py") | crontab -
48 changes: 30 additions & 18 deletions src/scrape_functions.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
from doctest import FAIL_FAST
from textwrap import indent
import os
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException

import time


class Browser():
def __init__(self, url) -> None:
self.url = url

options = Options()
options.headless = True
options.add_argument('--no-sandbox')
options.add_argument('--window-size=1420,1080')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
self.browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', options=options)

self.browser = webdriver.Firefox(options=options)
self.browser.get(self.url)

time.sleep(5)
Expand All @@ -29,14 +33,22 @@ def load_all_jobs(self):
# n = 1

print('loading all jobs...')
while 'Load more jobs' in [button.text for button in all_buttons]:
# print(n)
# n+=1
for button in all_buttons:
if button.text == 'Load more jobs':
button.click()
all_buttons = self.browser.find_elements(By.XPATH, '//button')
print('all jobs loaded...')
while True:
try:
while 'Load more jobs' in [button.text for button in all_buttons]:
# print(n)
# n+=1
for button in all_buttons:
if button.text == 'Load more jobs':
button.click()
all_buttons = self.browser.find_elements(By.XPATH, '//button')
print('all jobs loaded...')
break
except WebDriverException as error:
print('page crashed')
print('Error: ', error)
time.sleep(3)


def scrape_all_jobs(self):
job_info = {}
Expand Down Expand Up @@ -72,7 +84,6 @@ def __init__(self, dict=None) -> None:
self.df = pd.DataFrame()

def add_column_to_dataframe(self, column, values):

self.df[column] = values

def export_to_file(self, path_to_file):
Expand Down Expand Up @@ -102,15 +113,16 @@ def filter_jobs(df, path_to_file, *args):
filtered_df = df

for keyword in args:
filtered_df = filtered_df[filtered_df.role.str.contains(keyword)]
filtered_df = filtered_df[filtered_df.role.astype(str).str.contains(keyword)]

filtered_df.to_csv(path_to_file, index=False)
return filtered_df


def get_file_path(file_name):
    """Return the path to *file_name* inside the output directory.

    Creates ``../output_files/`` on first use so callers can write to the
    returned path immediately.
    """
    # Renamed from `dir` (shadowed the builtin); makedirs(exist_ok=True)
    # avoids the race between an existence check and mkdir, and also the
    # crash mkdir would raise if the directory appeared in between.
    out_dir = '../output_files/'
    os.makedirs(out_dir, exist_ok=True)
    return os.path.join(out_dir, file_name)
1 change: 0 additions & 1 deletion src/scrape_jobs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from requests import get
from scrape_functions import *

def scrape_webpage(url, file_name, keywords):
Expand Down

0 comments on commit beccb4e

Please sign in to comment.