diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index d623779..a7862f1 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -52,5 +52,10 @@ Parameter mixins
 
 .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
 
+.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam
+   :exclude-members: model_computed_fields
+
+.. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType
+
 .. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
    :exclude-members: model_computed_fields
diff --git a/setup.py b/setup.py
index cf48be0..869b1f5 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
         "scrapy-spider-metadata>=0.2.0",
         "scrapy-zyte-api[provider]>=0.23.0",
         "web-poet>=0.17.1",
-        "zyte-common-items>=0.25.0",
+        "zyte-common-items>=0.26.2",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
diff --git a/tests/test_serp.py b/tests/test_serp.py
index 699fee5..0b56a2b 100644
--- a/tests/test_serp.py
+++ b/tests/test_serp.py
@@ -4,8 +4,13 @@
 from scrapy_spider_metadata import get_spider_metadata
 from scrapy_zyte_api.responses import ZyteAPITextResponse
 from w3lib.url import add_or_replace_parameter
+from zyte_common_items import Product
 
-from zyte_spider_templates.spiders.serp import GoogleSearchSpider
+from zyte_spider_templates.spiders.serp import (
+    ITEM_TYPE_CLASSES,
+    GoogleSearchSpider,
+    SerpItemType,
+)
 
 from . import get_crawler
 from .utils import assertEqualSpiderMetadata
@@ -259,6 +264,25 @@ def test_metadata():
                     "title": "Max Pages",
                     "type": "integer",
                 },
+                "item_type": {
+                    "anyOf": [{"type": "string"}, {"type": "null"}],
+                    "default": None,
+                    "description": (
+                        "If specified, follow organic search result links, "
+                        "and extract the selected data type from the target "
+                        "pages. Spider output items will be of the specified "
+                        "data type, not search engine results page items."
+                    ),
+                    "enum": [
+                        "article",
+                        "articleList",
+                        "forumThread",
+                        "jobPosting",
+                        "product",
+                        "productList",
+                    ],
+                    "title": "Follow and Extract",
+                },
                 "max_requests": {
                     "anyOf": [{"type": "integer"}, {"type": "null"}],
                     "default": 100,
@@ -457,3 +481,70 @@ def test_parse_serp():
     # The page_number parameter is required.
     with pytest.raises(TypeError):
         spider.parse_serp(response)  # type: ignore[call-arg]
+
+
+def test_item_type():
+    crawler = get_crawler()
+    spider = GoogleSearchSpider.from_crawler(
+        crawler, search_queries="foo bar", max_pages=43, item_type="product"
+    )
+    url = "https://www.google.com/search?q=foo+bar"
+    response = ZyteAPITextResponse.from_api_response(
+        api_response={
+            "serp": {
+                "organicResults": [
+                    {
+                        "description": "…",
+                        "name": "…",
+                        "url": f"https://example.com/{rank}",
+                        "rank": rank,
+                    }
+                    for rank in range(1, 11)
+                ],
+                "metadata": {
+                    "dateDownloaded": "2024-10-25T08:59:45Z",
+                    "displayedQuery": "foo bar",
+                    "searchedQuery": "foo bar",
+                    "totalOrganicResults": 99999,
+                },
+                "pageNumber": 1,
+                "url": url,
+            },
+            "url": url,
+        },
+    )
+    items = []
+    requests = []
+    for item_or_request in spider.parse_serp(response, page_number=42):
+        if isinstance(item_or_request, Request):
+            requests.append(item_or_request)
+        else:
+            items.append(item_or_request)
+    assert len(items) == 0
+    assert len(requests) == 11
+
+    assert requests[0].url == add_or_replace_parameter(url, "start", "420")
+    assert requests[0].cb_kwargs["page_number"] == 43
+
+    for rank in range(1, 11):
+        assert requests[rank].url == f"https://example.com/{rank}"
+        assert requests[rank].callback == spider.parse_result
+        assert requests[rank].meta == {
+            "crawling_logs": {"page_type": "product"},
+            "inject": [Product],
+        }
+
+
+def test_item_type_mappings():
+    # Ensure that all SerpItemType keys and values match.
+    for entry in SerpItemType:
+        assert entry.name == entry.value
+
+    # Ensure that the ITEM_TYPE_CLASSES dict maps all values from the
+    # corresponding enum except for serp.
+    actual_keys = set(ITEM_TYPE_CLASSES)
+    expected_keys = set(entry.value for entry in SerpItemType)
+    assert actual_keys == expected_keys
+
+    # Also ensure that no dict value is repeated.
+    assert len(actual_keys) == len(set(ITEM_TYPE_CLASSES.values()))
diff --git a/tox.ini b/tox.ini
index a79c93b..576d792 100644
--- a/tox.ini
+++ b/tox.ini
@@ -31,7 +31,7 @@ deps =
     scrapy-spider-metadata==0.2.0
     scrapy-zyte-api[provider]==0.23.0
     web-poet==0.17.1
-    zyte-common-items==0.25.0
+    zyte-common-items==0.26.2
 
 [testenv:mypy]
 deps =
diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index ed0d1e7..d69856e 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -1,12 +1,23 @@
+from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional, Union
 
 from pydantic import BaseModel, Field, field_validator
 from scrapy import Request
 from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
 from w3lib.url import add_or_replace_parameter
-from zyte_common_items import Serp
-
+from zyte_common_items import (
+    Article,
+    ArticleList,
+    ForumThread,
+    JobPosting,
+    Product,
+    ProductList,
+    Serp,
+)
+
+from ..documentation import document_enum
 from ..params import MaxRequestsParam
 from ._google_domains import GoogleDomain
 from .base import BaseSpider
@@ -48,6 +59,62 @@ class SerpMaxPagesParam(BaseModel):
     )
 
 
+@document_enum
+class SerpItemType(str, Enum):
+    article: str = "article"
+    """
+    Article data.
+    """
+
+    articleList: str = "articleList"
+    """
+    Article list data.
+    """
+
+    forumThread: str = "forumThread"
+    """
+    Forum thread data.
+    """
+
+    jobPosting: str = "jobPosting"
+    """
+    Job posting data.
+    """
+
+    product: str = "product"
+    """
+    Product data.
+    """
+
+    productList: str = "productList"
+    """
+    Product list data.
+    """
+
+
+ITEM_TYPE_CLASSES = {
+    SerpItemType.article: Article,
+    SerpItemType.articleList: ArticleList,
+    SerpItemType.forumThread: ForumThread,
+    SerpItemType.jobPosting: JobPosting,
+    SerpItemType.product: Product,
+    SerpItemType.productList: ProductList,
+}
+
+
+class SerpItemTypeParam(BaseModel):
+    item_type: Optional[SerpItemType] = Field(
+        title="Follow and Extract",
+        description=(
+            "If specified, follow organic search result links, and extract "
+            "the selected data type from the target pages. Spider output "
+            "items will be of the specified data type, not search engine "
+            "results page items."
+        ),
+        default=None,
+    )
+
+
 class GoogleDomainParam(BaseModel):
     domain: GoogleDomain = Field(
         title="Domain",
@@ -58,6 +125,7 @@ class GoogleDomainParam(BaseModel):
 
 class GoogleSearchSpiderParams(
     MaxRequestsParam,
+    SerpItemTypeParam,
     SerpMaxPagesParam,
     SearchQueriesParam,
     GoogleDomainParam,
@@ -132,4 +200,21 @@ def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
             next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
             yield self.get_serp_request(next_url, page_number=page_number + 1)
 
-        yield serp
+        if self.args.item_type is None:
+            yield serp
+            return
+
+        for result in serp.organicResults:
+            yield response.follow(
+                result.url,
+                callback=self.parse_result,
+                meta={
+                    "crawling_logs": {"page_type": self.args.item_type.value},
+                    "inject": [ITEM_TYPE_CLASSES[self.args.item_type]],
+                },
+            )
+
+    def parse_result(
+        self, response: DummyResponse, dynamic: DynamicDeps
+    ) -> Iterable[Any]:
+        yield next(iter(dynamic.values()))