From 62e087137926f4b5691e712da2cf62955aefbf43 Mon Sep 17 00:00:00 2001 From: E Date: Fri, 24 May 2024 15:29:12 +0100 Subject: [PATCH 01/11] Remove unneeded line --- scraper/main.go | 1 - 1 file changed, 1 deletion(-) diff --git a/scraper/main.go b/scraper/main.go index d4393a5..9b21148 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -30,7 +30,6 @@ type MyResponse struct { InStock bool `json:"in_stock"` Image string `json:"image"` HTML string `json:"html"` - Error string `json:"error"` } func getStock(page *rod.Page, inStockString string, outOfStockString string) (*bool, error) { From 3737493187e060d08385086b7198bcca6bb8a5cd Mon Sep 17 00:00:00 2001 From: E Date: Fri, 24 May 2024 15:32:14 +0100 Subject: [PATCH 02/11] Remove another unneeded line and remove go folder from docker image --- Dockerfile | 3 ++- scraper/main.go | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 656640f..52f5ae0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -69,6 +69,7 @@ RUN echo CHROME_PATH=${CHROME_PATH} > .env COPY scraper/main.go go.mod go.sum ./ RUN GOARCH=amd64 GOOS=linux go build -ldflags="-s -w" -o app main.go && \ - rm main.go go.mod go.sum + rm main.go go.mod go.sum && \ + rm -rf /task/go ENTRYPOINT [ "task/app" ] diff --git a/scraper/main.go b/scraper/main.go index 9b21148..57225ff 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -37,7 +37,6 @@ func getStock(page *rod.Page, inStockString string, outOfStockString string) (*b hasOutOfStockElement, _, outOfStockErr := page.HasR("button", fmt.Sprintf("/%s/i", outOfStockString)) stockStatus := new(bool) - *stockStatus = false if inStockErr != nil { log.Println(inStockErr) From 50dedd95609baa93068c6cd8697200e4068a0017 Mon Sep 17 00:00:00 2001 From: E Date: Fri, 24 May 2024 15:37:29 +0100 Subject: [PATCH 03/11] Refactor browser setup into separate function --- scraper/main.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scraper/main.go b/scraper/main.go index 57225ff..fbab8d4 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -108,6 +108,13 @@ func getImageAsBase64(page *rod.Page, imageXpath string) (string, error) { return fmt.Sprintf("data:image/%s;base64,%s", http.DetectContentType(image), base64.StdEncoding.EncodeToString(image)), nil } +func setupBrowser() *rod.Page { + CHROME_PATH := os.Getenv("CHROME_PATH") + u := launcher.New().Bin(CHROME_PATH).MustLaunch() + browser := rod.New().ControlURL(u).MustConnect() + return stealth.MustPage(browser) +} + func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { if event.ImageXpath == "" || event.PriceXpath == "" || event.Url == "" || event.InStockString == "" || event.OutOfStockString == "" { return &MyResponse{}, errors.New("request: doesn't have all json attributes") @@ -115,10 +122,7 @@ func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { log.Println("Started scrape") - CHROME_PATH := os.Getenv("CHROME_PATH") - u := launcher.New().Bin(CHROME_PATH).MustLaunch() - browser := rod.New().ControlURL(u).MustConnect() - page := stealth.MustPage(browser) + page := setupBrowser() page.MustNavigate(event.Url).MustWaitStable() log.Println("Got page") From 38ae444dfe6aca5557776e00de0c9217a0ca55c8 Mon Sep 17 00:00:00 2001 From: E Date: Fri, 24 May 2024 18:08:07 +0100 Subject: [PATCH 04/11] WIP --- .gitignore | 3 +++ Dockerfile | 2 ++ Makefile | 19 +++++++++++++------ scraper/main.go | 31 +++++++++++++++++++++++++++---- 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index f84fd5a..cdbf9a0 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,6 @@ bin .aws-lambda-rie .env + +page.html +page-docker.html diff --git a/Dockerfile b/Dockerfile index 52f5ae0..6368937 100644 --- a/Dockerfile +++ b/Dockerfile @@ -64,6 +64,8 @@ RUN mkdir -p "/task/chrome/" \ && rm -rf /task/chrome/chrome-linux "/task/chrome/chrome-linux.zip" RUN echo CHROME_PATH=${CHROME_PATH} > .env +RUN echo DEBUG=${DEBUG} >> .env +RUN if [ "$DEBUG" = "true" ] ; then touch /task/page.html; fi # Copy source and build scraper COPY scraper/main.go go.mod go.sum ./ diff --git a/Makefile b/Makefile index 1670ec1..9e017ca 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,15 @@ -.PHONY: clean install build buildDocker runDebug runDebugDocker getDebugTools deploy +.PHONY: clean cleanServerless installDebugTools installServerless build buildDocker buildDebugDocker runDebug runDebugDocker deploy clean: - rm -rf ./node_modules + rm -rf ./bin -install: +cleanServerless: + rm ./node_modules + +installServerless: npm i -getDebugTools: +installDebugTools: wget https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie mkdir -p .aws-lambda-rie mv aws-lambda-rie .aws-lambda-rie/aws-lambda-rie @@ -24,8 +27,12 @@ runDebug: debug buildDocker: docker build -t watcher-local-build . -runDebugDocker: buildDocker - docker run --platform linux/amd64 -v ./.aws-lambda-rie:/aws-lambda -p 9000:8080 --entrypoint /aws-lambda/aws-lambda-rie watcher-local-build /task/app +buildDebugDocker: + docker build --build-arg="DEBUG=true" -t watcher-local-build . + +runDebugDocker: buildDebugDocker + touch page-docker.html + docker run --platform linux/amd64 -v ./.aws-lambda-rie:/aws-lambda -v ./page-docker.html:/task/page.html -p 9000:8080 --entrypoint /aws-lambda/aws-lambda-rie watcher-local-build /task/app deploy: install npx sls deploy --verbose diff --git a/scraper/main.go b/scraper/main.go index fbab8d4..54d9d9d 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -12,6 +12,7 @@ import ( "github.com/aws/aws-lambda-go/lambda" "github.com/go-rod/rod" + "github.com/go-rod/rod/lib/devices" "github.com/go-rod/rod/lib/launcher" "github.com/go-rod/stealth" "github.com/joho/godotenv" @@ -51,7 +52,7 @@ func getStock(page *rod.Page, inStockString string, outOfStockString string) (*b if hasInStockElement && hasOutOfStockElement { return stockStatus, errors.New("stock: both in and out of stock") } else if !hasInStockElement && !hasOutOfStockElement { - return stockStatus, errors.New("stock: neither in or out of stock, this could be due to being redirected to verify you are not a robot page") + return stockStatus, errors.New("stock: neither in or out of stock, this could be due to being redirected to their 'verify you are not a robot' page") } if hasInStockElement && !hasOutOfStockElement { @@ -110,9 +111,29 @@ func getImageAsBase64(page *rod.Page, imageXpath string) (string, error) { func setupBrowser() *rod.Page { CHROME_PATH := os.Getenv("CHROME_PATH") - u := launcher.New().Bin(CHROME_PATH).MustLaunch() - browser := rod.New().ControlURL(u).MustConnect() - return stealth.MustPage(browser) + browserArgs := launcher.New(). + UserDataDir("/tmp/profile"). + Leakless(true). + Devtools(false). + Headless(true). + NoSandbox(true). + Set("--no-zygote"). + Set("--disable-dev-shm-usage"). + Set("--disable-setuid-sandbox"). + Set("--disable-dev-shm-usage"). + Set("--disable-gpu"). + Set("--no-zygote"). + Set("--single-process") + + wsURL := browserArgs.Bin(CHROME_PATH).MustLaunch() + browser := rod.New().ControlURL(wsURL).MustConnect().DefaultDevice(devices.IPhoneX) + page := stealth.MustPage(browser) + page.MustSetExtraHeaders( + "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language", "en-GB,en;q=0.5", + "Accept-Encoding", "gzip, deflate, br, zstd", + ) + return page } func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { @@ -162,6 +183,8 @@ func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { log.Println("Finished getImageAsBase64") + page.MustClose() + return &MyResponse{HTML: page.MustHTML(), Price: price, Image: imageAsBase64, InStock: *stockStatus}, nil } From 6090e0ed49c9ea85571bfefcdeac8c6c1b084f3e Mon Sep 17 00:00:00 2001 From: E Date: Fri, 24 May 2024 19:03:21 +0100 Subject: [PATCH 05/11] Remove breaking line --- scraper/main.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/scraper/main.go b/scraper/main.go index 54d9d9d..b1ffc48 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -183,8 +183,6 @@ func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { log.Println("Finished getImageAsBase64") - page.MustClose() - return &MyResponse{HTML: page.MustHTML(), Price: price, Image: imageAsBase64, InStock: *stockStatus}, nil } From 995cfc1f29e7c2cdaed7193109e41e1dad4c9992 Mon Sep 17 00:00:00 2001 From: E Date: Wed, 5 Jun 2024 15:33:49 +0100 Subject: [PATCH 06/11] WIP: attempt to remove headers and set browser settings --- Makefile | 2 +- scraper/main.go | 29 +++++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 9e017ca..2eef130 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,7 @@ buildDebugDocker: runDebugDocker: buildDebugDocker touch page-docker.html - docker run --platform linux/amd64 -v ./.aws-lambda-rie:/aws-lambda -v ./page-docker.html:/task/page.html -p 9000:8080 --entrypoint /aws-lambda/aws-lambda-rie watcher-local-build /task/app + docker run --platform linux/amd64 -v ./.aws-lambda-rie:/aws-lambda -v ./page-docker.html:/task/page.html -p 0.0.0.0:9000:8080 --entrypoint /aws-lambda/aws-lambda-rie watcher-local-build /task/app deploy: install npx sls deploy --verbose diff --git a/scraper/main.go b/scraper/main.go index b1ffc48..2c6c8fb 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -9,6 +9,7 @@ import ( "net/http" "os" "strings" + "time" "github.com/aws/aws-lambda-go/lambda" "github.com/go-rod/rod" @@ -109,7 +110,7 @@ func getImageAsBase64(page *rod.Page, imageXpath string) (string, error) { return fmt.Sprintf("data:image/%s;base64,%s", http.DetectContentType(image), base64.StdEncoding.EncodeToString(image)), nil } -func setupBrowser() *rod.Page { +func setupBrowser() (*rod.Page, *rod.HijackRouter) { CHROME_PATH := os.Getenv("CHROME_PATH") browserArgs := launcher.New(). UserDataDir("/tmp/profile"). @@ -127,13 +128,28 @@ func setupBrowser() *rod.Page { wsURL := browserArgs.Bin(CHROME_PATH).MustLaunch() browser := rod.New().ControlURL(wsURL).MustConnect().DefaultDevice(devices.IPhoneX) + page := stealth.MustPage(browser) page.MustSetExtraHeaders( "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Language", "en-GB,en;q=0.5", "Accept-Encoding", "gzip, deflate, br, zstd", ) - return page + + router := page.HijackRequests() + + router.MustAdd("https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending?sort=dont-sort", func(ctx *rod.Hijack) { + r := ctx.Request + r.Req().Header.Del("DEVICE-MEMORY") + r.Req().Header.Del("DPR") + r.Req().Header.Del("SEC-CH-PREFERS-COLOR-SCHEME") + r.Req().Header.Del("SEC-CH-PREFERS-REDUCED-MOTION") + ctx.MustLoadResponse() + }) + + go router.Run() + + return page, router } func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { @@ -143,8 +159,13 @@ func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { log.Println("Started scrape") - page := setupBrowser() - page.MustNavigate(event.Url).MustWaitStable() + page, router := setupBrowser() + + log.Println(router) + + log.Println("Set up web browser") + + page.MustNavigate(event.Url).WaitStable(time.Duration(15)) log.Println("Got page") From da039c2d124b7ac448beeb01fa1b06fb3bc0c1a3 Mon Sep 17 00:00:00 2001 From: E Date: Wed, 5 Jun 2024 16:21:10 +0100 Subject: [PATCH 07/11] Attempt to switch to desktop headers and handle error --- scraper/main.go | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/scraper/main.go b/scraper/main.go index 2c6c8fb..47d62f2 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -13,7 +13,6 @@ import ( "github.com/aws/aws-lambda-go/lambda" "github.com/go-rod/rod" - "github.com/go-rod/rod/lib/devices" "github.com/go-rod/rod/lib/launcher" "github.com/go-rod/stealth" "github.com/joho/godotenv" @@ -127,23 +126,31 @@ func setupBrowser() (*rod.Page, *rod.HijackRouter) { Set("--single-process") wsURL := browserArgs.Bin(CHROME_PATH).MustLaunch() - browser := rod.New().ControlURL(wsURL).MustConnect().DefaultDevice(devices.IPhoneX) + browser := rod.New().ControlURL(wsURL).MustConnect() page := stealth.MustPage(browser) page.MustSetExtraHeaders( - "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language", "en-GB,en;q=0.5", - "Accept-Encoding", "gzip, deflate, br, zstd", + "DNT", "1", + "SEC-FETCH-DEST", "document", + "SEC-FETCH-MODE", "navigate", + "SEC-FETCH-SITE", "same-origin", + "SEC-FETCH-USER", "?1", + "SEC-GPC", "1", + "PRIORITY", "u=1", + // "Accept-Encoding", "gzip, deflate, br, zstd", ) router := page.HijackRequests() router.MustAdd("https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending?sort=dont-sort", func(ctx *rod.Hijack) { r := ctx.Request - r.Req().Header.Del("DEVICE-MEMORY") + r.Req().Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8") + r.Req().Header.Set("Accept-Language", "en-GB,en;q=0.5") r.Req().Header.Del("DPR") + r.Req().Header.Del("DEVICE-MEMORY") r.Req().Header.Del("SEC-CH-PREFERS-COLOR-SCHEME") r.Req().Header.Del("SEC-CH-PREFERS-REDUCED-MOTION") + ctx.MustLoadResponse() }) @@ -165,7 +172,12 @@ func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { log.Println("Set up web browser") - page.MustNavigate(event.Url).WaitStable(time.Duration(15)) + err := page.MustNavigate(event.Url).WaitStable(time.Duration(15)) + + if err != nil { + log.Println(err) + return &MyResponse{}, err + } log.Println("Got page") From 2a537f60cee21229b884e01c46fcc44cd664bd53 Mon Sep 17 00:00:00 2001 From: E Date: Wed, 5 Jun 2024 16:30:30 +0100 Subject: [PATCH 08/11] Add debug option to Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2eef130..716bf2d 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ debug: env GOARCH=amd64 GOOS=linux go build -v -gcflags='all=-N -l' -ldflags="-s -w" -o bin/scraperDebug scraper/main.go runDebug: debug - ./.aws-lambda-rie/aws-lambda-rie ./bin/scraperDebug + DEBUG=true ./.aws-lambda-rie/aws-lambda-rie ./bin/scraperDebug buildDocker: docker build -t watcher-local-build . From 12f49437ad16ff4961b977b3c4937aa945aa879d Mon Sep 17 00:00:00 2001 From: E Date: Wed, 5 Jun 2024 16:30:49 +0100 Subject: [PATCH 09/11] Add start maximised option to browser --- scraper/main.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scraper/main.go b/scraper/main.go index 47d62f2..dbb87c1 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -123,7 +123,8 @@ func setupBrowser() (*rod.Page, *rod.HijackRouter) { Set("--disable-dev-shm-usage"). Set("--disable-gpu"). Set("--no-zygote"). - Set("--single-process") + Set("--single-process"). + Set("--start-maximized") wsURL := browserArgs.Bin(CHROME_PATH).MustLaunch() browser := rod.New().ControlURL(wsURL).MustConnect() From 5394cce13967d1eecb13873b9f05d245072a4719 Mon Sep 17 00:00:00 2001 From: E Date: Thu, 6 Jun 2024 11:14:09 +0100 Subject: [PATCH 10/11] Add comments about the http headers library issue --- scraper/main.go | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/scraper/main.go b/scraper/main.go index dbb87c1..e207122 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -109,7 +109,7 @@ func getImageAsBase64(page *rod.Page, imageXpath string) (string, error) { return fmt.Sprintf("data:image/%s;base64,%s", http.DetectContentType(image), base64.StdEncoding.EncodeToString(image)), nil } -func setupBrowser() (*rod.Page, *rod.HijackRouter) { +func setupBrowser() *rod.Page { CHROME_PATH := os.Getenv("CHROME_PATH") browserArgs := launcher.New(). UserDataDir("/tmp/profile"). @@ -130,6 +130,20 @@ func setupBrowser() (*rod.Page, *rod.HijackRouter) { browser := rod.New().ControlURL(wsURL).MustConnect() page := stealth.MustPage(browser) + + setupPageHeaders(page) + + return page +} + +func setupPageHeaders(page *rod.Page) { + // Currently this is seemingly the best we can do with the headers + // The ordering of them is most likely triggering Incapsula + // However the http library header object doesn't allow ordering (as of yet) + // Issues: + // https://github.com/golang/go/issues/24375 + // https://github.com/golang/go/issues/5465 + page.MustSetExtraHeaders( "DNT", "1", "SEC-FETCH-DEST", "document", @@ -139,11 +153,14 @@ func setupBrowser() (*rod.Page, *rod.HijackRouter) { "SEC-GPC", "1", "PRIORITY", "u=1", // "Accept-Encoding", "gzip, deflate, br, zstd", + // The above is commented out, as when hijacking seemingly only gzip works ) router := page.HijackRequests() - router.MustAdd("https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending?sort=dont-sort", func(ctx *rod.Hijack) { + // Currently the only way to remove headers is by hijacking the request + // Setting the Accept* headers gets overridden when set in `MustSetExtraHeaders` so it's done here instead + router.MustAdd("*", func(ctx *rod.Hijack) { r := ctx.Request r.Req().Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8") r.Req().Header.Set("Accept-Language", "en-GB,en;q=0.5") @@ -156,8 +173,6 @@ func setupBrowser() (*rod.Page, *rod.HijackRouter) { }) go router.Run() - - return page, router } func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { @@ -167,9 +182,7 @@ func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { log.Println("Started scrape") - page, router := setupBrowser() - - log.Println(router) + page := setupBrowser() log.Println("Set up web browser") @@ -182,8 +195,7 @@ func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { log.Println("Got page") - DEBUG := strings.ToLower(os.Getenv("DEBUG")) - if DEBUG == "true" { + if strings.ToLower(os.Getenv("DEBUG")) == "true" { if err := os.WriteFile("page.html", []byte(page.MustHTML()), 0666); err != nil { log.Fatal(err) } From 1ca609d8dee13acbb811c1309379c2d8b44ce9ec Mon Sep 17 00:00:00 2001 From: E Date: Thu, 6 Jun 2024 11:27:54 +0100 Subject: [PATCH 11/11] Fix time not waiting actual duration --- scraper/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper/main.go b/scraper/main.go index e207122..3b1a4b8 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -186,7 +186,7 @@ func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) { log.Println("Set up web browser") - err := page.MustNavigate(event.Url).WaitStable(time.Duration(15)) + err := page.MustNavigate(event.Url).WaitStable(time.Duration(5 * time.Second)) if err != nil { log.Println(err)