From 8e2149021b46719654fe36a425d34afde1584b7d Mon Sep 17 00:00:00 2001 From: E Date: Fri, 24 May 2024 15:23:55 +0100 Subject: [PATCH] Get pricing and stock from page as well as change image method (#8) * Move compiling to end so we can use caching for the chrome bin * WIP: Start selecting elements to check for * WIP: Get image via xpath instead and add comment * Fix error handling * Add stealth and attempt to debug Hanging when attempting to get element as rod waits for it to appear * WIP * Add to README.md * Use HasX and HasR * Add debug mode * Slightly flesh out logs * Install go manually * Remove get html * Add more explanation to particular error --- .env.example | 1 + Dockerfile | 21 +++++-- README.md | 37 ++++++++++-- go.mod | 1 + go.sum | 3 + scraper/main.go | 146 ++++++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 188 insertions(+), 21 deletions(-) diff --git a/.env.example b/.env.example index cbc065e..71f0104 100644 --- a/.env.example +++ b/.env.example @@ -1 +1,2 @@ CHROME_PATH=/usr/bin/google-chrome-stable +DEBUG=false diff --git a/Dockerfile b/Dockerfile index ca52945..656640f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,20 +4,24 @@ USER root ARG DEBIAN_FRONTEND=noninteractive ARG TZ=Europe/London + ARG CHROME_VERSION=1299153 +ARG GO_VERSION=1.22.3 ARG UID=1000 ARG GID=1000 +ARG DEBUG=false + ENV TZ=$TZ ENV DEBIAN_FRONTEND=$DEBIAN_FRONTEND ENV LANG="C.UTF-8" ENV DEBUG_COLORS=true ENV CHROME_PATH=/task/chrome/chrome +ENV DEBUG=${DEBUG} RUN apt-get update && apt-get install -y RUN apt-get install ca-certificates gnupg -y -RUN apt-get install golang-go -y # Chrome dependencies RUN apt-get install -y software-properties-common xvfb libu2f-udev gconf-service \ @@ -48,11 +52,12 @@ USER worker WORKDIR /task -COPY scraper/main.go go.mod go.sum ./ - -RUN GOARCH=amd64 GOOS=linux go build -ldflags="-s -w" -o app main.go && \ - rm main.go go.mod go.sum +RUN curl -Lo "go.tar.gz" "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" \ + && tar -C 
/task/ -xzf go.tar.gz +ENV PATH="${PATH}:/task/go/bin" +RUN go version +# Get chrome RUN mkdir -p "/task/chrome/" \ && curl -Lo "/task/chrome/chrome-linux.zip" "https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F${CHROME_VERSION}%2Fchrome-linux.zip?alt=media" \ && unzip -q "/task/chrome/chrome-linux.zip" -d "/task/chrome/" && mv /task/chrome/chrome-linux/* /task/chrome/ \ @@ -60,4 +65,10 @@ RUN mkdir -p "/task/chrome/" \ RUN echo CHROME_PATH=${CHROME_PATH} > .env +# Copy source and build scraper +COPY scraper/main.go go.mod go.sum ./ + +RUN GOARCH=amd64 GOOS=linux go build -ldflags="-s -w" -o app main.go && \ + rm main.go go.mod go.sum + ENTRYPOINT [ "task/app" ] diff --git a/README.md b/README.md index 48f4069..fd1ad1c 100644 --- a/README.md +++ b/README.md @@ -2,16 +2,43 @@ This is a serverless service to be ran on an aws lambda to provide functions to scrape websites -# Development +## Development + +### Quick guide - First run `make getDebugTools` - To build and run locally: + - Set `CHROME_PATH` in .env to chrome installation - Run: `make runDebug` in one terminal - - In another terminal run `curl -XPOST "http://localhost:8080/2015-03-31/functions/function/invocations" -d '{"Name": "World"}'` + - In another terminal run `curl -XPOST "http://localhost:8080/2015-03-31/functions/function/invocations" -d 'JSON HERE'` + - Replace with actual json - Wait for the response! + - To run docker image locally, first run: - - Build the docker image: `docker run -p 9000:8080 watcher-local-build:latest` - - Run the image: `make runDebugDocker` - - In another terminal, run `url -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"payload":"hello world!"}'` + + - Run: `make runDebugDocker` + - In another terminal run: `curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d 'JSON HERE'` + - Replace with actual json - Wait for the response! 
+ +### Json + +- url: url of product +- price_xpath: xpath to price element +- image_xpath: xpath to image element +- in_stock_string: the text when a product is in stock +- out_of_stock_string: the text when a product is out of stock +- Empty json: `{"url": "", "price_xpath": "", "image_xpath": "", "in_stock_string": "", "out_of_stock_string": ""}` + +Curl full example: + +``` +curl -XPOST "http://localhost:$PORT/2015-03-31/functions/function/invocations" \ + -d '{"url": "https://www.pokemoncenter.com/en-gb/product/701E11880/bulbasaur-pokemon-soda-pop-plush-5-in", + "price_xpath": "/html/body/div[2]/main/div/div[2]/div[2]/p/span", + "image_xpath": "/html/body/div[2]/main/div/div[2]/div[1]/div/div/div/div[2]/div/div/div[5]/figure/div/div[1]/img", + "in_stock_string": "add to basket", + "out_of_stock_string": "out of stock" + }' > response.json +``` diff --git a/go.mod b/go.mod index 8f0efb3..c4dc012 100644 --- a/go.mod +++ b/go.mod @@ -15,5 +15,6 @@ require ( require ( github.com/go-rod/rod v0.116.0 + github.com/go-rod/stealth v0.4.9 github.com/stretchr/testify v1.8.4 // indirect ) diff --git a/go.sum b/go.sum index 66f08a1..6db2f2d 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,11 @@ github.com/aws/aws-lambda-go v1.47.0 h1:0H8s0vumYx/YKs4sE7YM0ktwL2eWse+kfopsRI1s github.com/aws/aws-lambda-go v1.47.0/go.mod h1:dpMpZgvWx5vuQJfBt0zqBha60q7Dd7RfgJv23DymV8A= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-rod/rod v0.113.0/go.mod h1:aiedSEFg5DwG/fnNbUOTPMTTWX3MRj6vIs/a684Mthw= github.com/go-rod/rod v0.116.0 h1:ypRryjTys3EnqHskJ/TdgodFMvXV0EHvmy4bSkKZgHM= github.com/go-rod/rod v0.116.0/go.mod h1:aiedSEFg5DwG/fnNbUOTPMTTWX3MRj6vIs/a684Mthw= +github.com/go-rod/stealth v0.4.9 h1:X2PmQk4DUF2wzw6GOsWjW/glb8K5ebnftbEvLh7MlZ4= +github.com/go-rod/stealth v0.4.9/go.mod h1:eAzyvw8c0iAd5nJJsSWeh0fQ5z94vCIfdi1hUmYDimc= 
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/scraper/main.go b/scraper/main.go index 211ecac..d4393a5 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -3,40 +3,164 @@ package main import ( "context" "encoding/base64" + "errors" + "fmt" "log" + "net/http" "os" + "strings" "github.com/aws/aws-lambda-go/lambda" "github.com/go-rod/rod" "github.com/go-rod/rod/lib/launcher" - "github.com/go-rod/rod/lib/proto" + "github.com/go-rod/stealth" "github.com/joho/godotenv" ) type MyEvent struct { - URL string `json:"URL"` - XPATH string `json:"XPATH"` + Url string `json:"url"` + PriceXpath string `json:"price_xpath"` + ImageXpath string `json:"image_xpath"` + InStockString string `json:"in_stock_string"` + OutOfStockString string `json:"out_of_stock_string"` } type MyResponse struct { - Content string `json:"content"` - Hash string `json:"hash"` + Price string `json:"price"` + InStock bool `json:"in_stock"` Image string `json:"image"` + HTML string `json:"html"` + Error string `json:"error"` +} + +func getStock(page *rod.Page, inStockString string, outOfStockString string) (*bool, error) { + hasInStockElement, _, inStockErr := page.HasR("button", fmt.Sprintf("/%s/i", inStockString)) + hasOutOfStockElement, _, outOfStockErr := page.HasR("button", fmt.Sprintf("/%s/i", outOfStockString)) + + stockStatus := new(bool) + *stockStatus = false + + if inStockErr != nil { + log.Println(inStockErr) + return stockStatus, errors.New("stock: internal in stock error") + } + + if outOfStockErr != nil { + log.Println(outOfStockErr) + return stockStatus, errors.New("stock: internal out of stock error") + } + + if hasInStockElement && hasOutOfStockElement { + return stockStatus, errors.New("stock: both in and out of stock") + } else if !hasInStockElement && 
!hasOutOfStockElement { + return stockStatus, errors.New("stock: neither in or out of stock, this could be due to being redirected to verify you are not a robot page") + } + + if hasInStockElement && !hasOutOfStockElement { + *stockStatus = true + } else if !hasInStockElement && hasOutOfStockElement { + *stockStatus = false + } else { + return stockStatus, errors.New("stock: uncaught error") + } + + return stockStatus, nil +} + +func getPrice(page *rod.Page, priceXpath string) (string, error) { + hasElement, element, err := page.HasX(priceXpath) + + if err != nil { + log.Println(err) + return "", errors.New("price: internal price error") + } + + if !hasElement { + return "", errors.New("price: cannot fetch element from xpath") + } + + text := element.MustText() + + if text == "" { + return "", errors.New("price: no text content for element") + } + + return text, nil + +} + +func getImageAsBase64(page *rod.Page, imageXpath string) (string, error) { + hasElement, element, err := page.HasX(imageXpath) + + if err != nil { + log.Println(err) + return "", errors.New("image: internal image error") + } + + if !hasElement { + return "", errors.New("image: cannot fetch element from xpath") + } + + image := element.MustResource() + + if image == nil { + return "", errors.New("image: couldn't get image resource") + } + + return fmt.Sprintf("data:image/%s;base64,%s", http.DetectContentType(image), base64.StdEncoding.EncodeToString(image)), nil } func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { + if event.ImageXpath == "" || event.PriceXpath == "" || event.Url == "" || event.InStockString == "" || event.OutOfStockString == "" { + return &MyResponse{}, errors.New("request: doesn't have all json attributes") + } + + log.Println("Started scrape") + CHROME_PATH := os.Getenv("CHROME_PATH") u := launcher.New().Bin(CHROME_PATH).MustLaunch() - page := rod.New().ControlURL(u).MustConnect().MustPage(event.URL) - page.MustWaitStable() - bytes, err := 
page.Screenshot(false, &proto.PageCaptureScreenshot{}) + browser := rod.New().ControlURL(u).MustConnect() + page := stealth.MustPage(browser) + page.MustNavigate(event.Url).MustWaitStable() + + log.Println("Got page") + + DEBUG := strings.ToLower(os.Getenv("DEBUG")) + if DEBUG == "true" { + if err := os.WriteFile("page.html", []byte(page.MustHTML()), 0666); err != nil { + log.Fatal(err) + } + + } + + stockStatus, err := getStock(page, event.InStockString, event.OutOfStockString) + + if err != nil { + log.Println(err) + return &MyResponse{}, err + } + + log.Println("Finished getStock") + + price, err := getPrice(page, event.PriceXpath) if err != nil { - log.Fatal(err) + log.Println(err) + return &MyResponse{}, err } - base64Encoding := "data:image/png;base64," + base64.StdEncoding.EncodeToString(bytes) - return &MyResponse{Content: "Test", Hash: "Test", Image: base64Encoding}, nil + log.Println("Finished price") + + imageAsBase64, err := getImageAsBase64(page, event.ImageXpath) + + if err != nil { + log.Println(err) + return &MyResponse{}, err + } + + log.Println("Finished getImageAsBase64") + + return &MyResponse{HTML: page.MustHTML(), Price: price, Image: imageAsBase64, InStock: *stockStatus}, nil } func main() {