Attempt to bypass robots page #15

Merged · 11 commits · Jun 7, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -28,3 +28,6 @@ bin

.aws-lambda-rie
.env

page.html
page-docker.html
5 changes: 4 additions & 1 deletion Dockerfile
@@ -64,11 +64,14 @@ RUN mkdir -p "/task/chrome/" \
&& rm -rf /task/chrome/chrome-linux "/task/chrome/chrome-linux.zip"

RUN echo CHROME_PATH=${CHROME_PATH} > .env
RUN echo DEBUG=${DEBUG} >> .env
RUN if [ "$DEBUG" = "true" ] ; then touch /task/page.html; fi

# Copy source and build scraper
COPY scraper/main.go go.mod go.sum ./

RUN GOARCH=amd64 GOOS=linux go build -ldflags="-s -w" -o app main.go && \
rm main.go go.mod go.sum
rm main.go go.mod go.sum && \
rm -rf /task/go

ENTRYPOINT [ "task/app" ]
21 changes: 14 additions & 7 deletions Makefile
@@ -1,12 +1,15 @@
.PHONY: clean install build buildDocker runDebug runDebugDocker getDebugTools deploy
.PHONY: clean cleanServerless installDebugTools installServerless build buildDocker buildDebugDocker runDebug runDebugDocker deploy

clean:
rm -rf ./node_modules
rm -rf ./bin

install:
cleanServerless:
rm ./node_modules

installServerless:
npm i

getDebugTools:
installDebugTools:
wget https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie
mkdir -p .aws-lambda-rie
mv aws-lambda-rie .aws-lambda-rie/aws-lambda-rie
@@ -19,13 +22,17 @@ debug:
env GOARCH=amd64 GOOS=linux go build -v -gcflags='all=-N -l' -ldflags="-s -w" -o bin/scraperDebug scraper/main.go

runDebug: debug
./.aws-lambda-rie/aws-lambda-rie ./bin/scraperDebug
DEBUG=true ./.aws-lambda-rie/aws-lambda-rie ./bin/scraperDebug

buildDocker:
docker build -t watcher-local-build .

runDebugDocker: buildDocker
docker run --platform linux/amd64 -v ./.aws-lambda-rie:/aws-lambda -p 9000:8080 --entrypoint /aws-lambda/aws-lambda-rie watcher-local-build /task/app
buildDebugDocker:
docker build --build-arg="DEBUG=true" -t watcher-local-build .

runDebugDocker: buildDebugDocker
touch page-docker.html
docker run --platform linux/amd64 -v ./.aws-lambda-rie:/aws-lambda -v ./page-docker.html:/task/page.html -p 0.0.0.0:9000:8080 --entrypoint /aws-lambda/aws-lambda-rie watcher-local-build /task/app

deploy: install
npx sls deploy --verbose
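
Not part of this diff, but as a usage sketch: once `make runDebugDocker` is up, the runtime interface emulator listens on the mapped port 9000 and the function can be invoked through the standard Lambda RIE endpoint. The event field names below are assumptions (they should mirror the JSON tags on `MyEvent` in `scraper/main.go`), and example.com stands in for a real product URL.

package main

import (
	"bytes"
	"fmt"
	"io"
	"log"
	"net/http"
)

func main() {
	// Standard Lambda RIE invoke endpoint; port 9000 is mapped by the runDebugDocker target.
	url := "http://localhost:9000/2015-03-31/functions/function/invocations"

	// Hypothetical event body; the field names are assumed to follow MyEvent's JSON tags.
	event := []byte(`{
		"url": "https://example.com/product",
		"image_xpath": "//img[@id='product-image']",
		"price_xpath": "//span[@class='price']",
		"in_stock_string": "Add to basket",
		"out_of_stock_string": "Out of stock"
	}`)

	resp, err := http.Post(url, "application/json", bytes.NewReader(event))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(body))
}
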
89 changes: 79 additions & 10 deletions scraper/main.go
@@ -9,6 +9,7 @@ import (
"net/http"
"os"
"strings"
"time"

"github.com/aws/aws-lambda-go/lambda"
"github.com/go-rod/rod"
@@ -30,15 +31,13 @@ type MyResponse struct {
InStock bool `json:"in_stock"`
Image string `json:"image"`
HTML string `json:"html"`
Error string `json:"error"`
}

func getStock(page *rod.Page, inStockString string, outOfStockString string) (*bool, error) {
hasInStockElement, _, inStockErr := page.HasR("button", fmt.Sprintf("/%s/i", inStockString))
hasOutOfStockElement, _, outOfStockErr := page.HasR("button", fmt.Sprintf("/%s/i", outOfStockString))

stockStatus := new(bool)
*stockStatus = false

if inStockErr != nil {
log.Println(inStockErr)
@@ -53,7 +52,7 @@ func getStock(page *rod.Page, inStockString string, outOfStockString string) (*b
if hasInStockElement && hasOutOfStockElement {
return stockStatus, errors.New("stock: both in and out of stock")
} else if !hasInStockElement && !hasOutOfStockElement {
return stockStatus, errors.New("stock: neither in or out of stock, this could be due to being redirected to verify you are not a robot page")
return stockStatus, errors.New("stock: neither in or out of stock, this could be due to being redirected to their 'verify you are not a robot' page")
}

if hasInStockElement && !hasOutOfStockElement {
@@ -110,23 +109,93 @@ func getImageAsBase64(page *rod.Page, imageXpath string) (string, error) {
return fmt.Sprintf("data:image/%s;base64,%s", http.DetectContentType(image), base64.StdEncoding.EncodeToString(image)), nil
}

func setupBrowser() *rod.Page {
CHROME_PATH := os.Getenv("CHROME_PATH")
browserArgs := launcher.New().
UserDataDir("/tmp/profile").
Leakless(true).
Devtools(false).
Headless(true).
NoSandbox(true).
Set("--no-zygote").
Set("--disable-dev-shm-usage").
Set("--disable-setuid-sandbox").
Set("--disable-dev-shm-usage").
Set("--disable-gpu").
Set("--no-zygote").
Set("--single-process").
Set("--start-maximized")

wsURL := browserArgs.Bin(CHROME_PATH).MustLaunch()
browser := rod.New().ControlURL(wsURL).MustConnect()

page := stealth.MustPage(browser)

setupPageHeaders(page)

return page
}

func setupPageHeaders(page *rod.Page) {
// Currently this is seemingly the best we can do with the headers.
// Their ordering is most likely what triggers Incapsula,
// but the Go http library's Header type doesn't allow controlling header order (as of yet).
// Issues:
// https://github.com/golang/go/issues/24375
// https://github.com/golang/go/issues/5465

page.MustSetExtraHeaders(
"DNT", "1",
"SEC-FETCH-DEST", "document",
"SEC-FETCH-MODE", "navigate",
"SEC-FETCH-SITE", "same-origin",
"SEC-FETCH-USER", "?1",
"SEC-GPC", "1",
"PRIORITY", "u=1",
// "Accept-Encoding", "gzip, deflate, br, zstd",
// The above is commented out, as when hijacking seemingly only gzip works
)

router := page.HijackRequests()

// Currently the only way to remove headers is by hijacking the request.
// The Accept* headers get overridden when set via `MustSetExtraHeaders`, so they are set here instead.
router.MustAdd("*", func(ctx *rod.Hijack) {
r := ctx.Request
r.Req().Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
r.Req().Header.Set("Accept-Language", "en-GB,en;q=0.5")
r.Req().Header.Del("DPR")
r.Req().Header.Del("DEVICE-MEMORY")
r.Req().Header.Del("SEC-CH-PREFERS-COLOR-SCHEME")
r.Req().Header.Del("SEC-CH-PREFERS-REDUCED-MOTION")

ctx.MustLoadResponse()
})

go router.Run()
}

func scrape(ctx context.Context, event *MyEvent) (*MyResponse, error) {
if event.ImageXpath == "" || event.PriceXpath == "" || event.Url == "" || event.InStockString == "" || event.OutOfStockString == "" {
return &MyResponse{}, errors.New("request: doesn't have all json attributes")
}

log.Println("Started scrape")

CHROME_PATH := os.Getenv("CHROME_PATH")
u := launcher.New().Bin(CHROME_PATH).MustLaunch()
browser := rod.New().ControlURL(u).MustConnect()
page := stealth.MustPage(browser)
page.MustNavigate(event.Url).MustWaitStable()
page := setupBrowser()

log.Println("Set up web browser")

err := page.MustNavigate(event.Url).WaitStable(5 * time.Second)

if err != nil {
log.Println(err)
return &MyResponse{}, err
}

log.Println("Got page")

DEBUG := strings.ToLower(os.Getenv("DEBUG"))
if DEBUG == "true" {
if strings.ToLower(os.Getenv("DEBUG")) == "true" {
if err := os.WriteFile("page.html", []byte(page.MustHTML()), 0666); err != nil {
log.Fatal(err)
}
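
As an aside on the comment in setupPageHeaders: Go's net/http represents headers as a plain map, so the order in which headers are set is not preserved and the on-wire order cannot currently be controlled (the golang/go issues linked above track this). A minimal illustration, separate from this diff:

package main

import (
	"fmt"
	"net/http"
)

func main() {
	// http.Header is a map[string][]string, so it carries no notion of insertion order.
	h := http.Header{}
	h.Set("DNT", "1")
	h.Set("Sec-Fetch-Dest", "document")
	h.Set("Sec-Fetch-Mode", "navigate")
	h.Set("Sec-GPC", "1")

	// Iterating the map yields keys in an unspecified order that can differ between runs;
	// the order actually sent on the wire is decided when the request is serialized.
	for key, values := range h {
		fmt.Println(key, values)
	}
}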