Skip to content

Commit

Permalink
Merge pull request #84 from Gallaecio/fix-max-pages
Browse files Browse the repository at this point in the history
Fix max_pages
  • Loading branch information
kmike authored Nov 20, 2024
2 parents 99a42f5 + 728a3a9 commit bb21a2a
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 6 deletions.
16 changes: 14 additions & 2 deletions tests/test_serp.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,9 @@ def test_search_queries():

def test_pagination():
crawler = get_crawler()
spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
spider = GoogleSearchSpider.from_crawler(
crawler, search_queries="foo bar", max_pages=3
)

def run_parse_serp(total_results, page=1):
url = "https://www.google.com/search?q=foo+bar"
Expand Down Expand Up @@ -388,6 +390,14 @@ def run_parse_serp(total_results, page=1):
assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20"
assert requests[0].cb_kwargs["page_number"] == 3

# Do not go over max_pages
items, requests = run_parse_serp(
total_results=31,
page=3,
)
assert len(items) == 1
assert len(requests) == 0


def test_get_serp_request():
crawler = get_crawler()
Expand All @@ -404,7 +414,9 @@ def test_get_serp_request():

def test_parse_serp():
crawler = get_crawler()
spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
spider = GoogleSearchSpider.from_crawler(
crawler, search_queries="foo bar", max_pages=43
)
url = "https://www.google.com/search?q=foo+bar"
response = ZyteAPITextResponse.from_api_response(
api_response={
Expand Down
9 changes: 5 additions & 4 deletions zyte_spider_templates/spiders/serp.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,10 @@ def start_requests(self) -> Iterable[Request]:
def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
    """Parse one Google SERP response into a :class:`Serp` item and,
    when more results exist, a request for the next results page.

    :param response: Zyte API response whose ``raw_api_response`` carries
        the extracted ``"serp"`` payload.
    :param page_number: 1-based index of the page this response covers.
    :yields: at most one follow-up :class:`Request` (next page), then the
        :class:`Serp` item for the current page.
    """
    serp = Serp.from_dict(response.raw_api_response["serp"])

    # Only paginate while below the user-configured page cap; without
    # this guard the spider would keep requesting pages past max_pages
    # (the bug fixed by this change).
    if page_number < self.args.max_pages:
        # Google's "start" parameter is a 0-based result offset, so the
        # offset of page N+1 is N * results-per-page.
        next_start = page_number * self._results_per_page
        # Follow up only if this page had results AND the reported total
        # indicates more results exist beyond the next offset.
        if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
            next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
            yield self.get_serp_request(next_url, page_number=page_number + 1)

    yield serp

0 comments on commit bb21a2a

Please sign in to comment.