Google Search: Allow extracting items from SERP results (#78)
Gallaecio authored Nov 22, 2024
1 parent e9bf031 commit 55663e0
Showing 5 changed files with 187 additions and 6 deletions.
5 changes: 5 additions & 0 deletions docs/reference/index.rst

@@ -52,5 +52,10 @@ Parameter mixins
 
 .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
 
+.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam
+   :exclude-members: model_computed_fields
+
+.. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType
+
 .. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
    :exclude-members: model_computed_fields
2 changes: 1 addition & 1 deletion setup.py

@@ -23,7 +23,7 @@
         "scrapy-spider-metadata>=0.2.0",
         "scrapy-zyte-api[provider]>=0.23.0",
         "web-poet>=0.17.1",
-        "zyte-common-items>=0.25.0",
+        "zyte-common-items>=0.26.2",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
93 changes: 92 additions & 1 deletion tests/test_serp.py

@@ -4,8 +4,13 @@
 from scrapy_spider_metadata import get_spider_metadata
 from scrapy_zyte_api.responses import ZyteAPITextResponse
 from w3lib.url import add_or_replace_parameter
+from zyte_common_items import Product
 
-from zyte_spider_templates.spiders.serp import GoogleSearchSpider
+from zyte_spider_templates.spiders.serp import (
+    ITEM_TYPE_CLASSES,
+    GoogleSearchSpider,
+    SerpItemType,
+)
 
 from . import get_crawler
 from .utils import assertEqualSpiderMetadata
@@ -259,6 +264,25 @@ def test_metadata():
                     "title": "Max Pages",
                     "type": "integer",
                 },
+                "item_type": {
+                    "anyOf": [{"type": "string"}, {"type": "null"}],
+                    "default": None,
+                    "description": (
+                        "If specified, follow organic search result links, "
+                        "and extract the selected data type from the target "
+                        "pages. Spider output items will be of the specified "
+                        "data type, not search engine results page items."
+                    ),
+                    "enum": [
+                        "article",
+                        "articleList",
+                        "forumThread",
+                        "jobPosting",
+                        "product",
+                        "productList",
+                    ],
+                    "title": "Follow and Extract",
+                },
                 "max_requests": {
                     "anyOf": [{"type": "integer"}, {"type": "null"}],
                     "default": 100,
@@ -457,3 +481,70 @@ def test_parse_serp():
     # The page_number parameter is required.
     with pytest.raises(TypeError):
         spider.parse_serp(response)  # type: ignore[call-arg]
+
+
+def test_item_type():
+    crawler = get_crawler()
+    spider = GoogleSearchSpider.from_crawler(
+        crawler, search_queries="foo bar", max_pages=43, item_type="product"
+    )
+    url = "https://www.google.com/search?q=foo+bar"
+    response = ZyteAPITextResponse.from_api_response(
+        api_response={
+            "serp": {
+                "organicResults": [
+                    {
+                        "description": "…",
+                        "name": "…",
+                        "url": f"https://example.com/{rank}",
+                        "rank": rank,
+                    }
+                    for rank in range(1, 11)
+                ],
+                "metadata": {
+                    "dateDownloaded": "2024-10-25T08:59:45Z",
+                    "displayedQuery": "foo bar",
+                    "searchedQuery": "foo bar",
+                    "totalOrganicResults": 99999,
+                },
+                "pageNumber": 1,
+                "url": url,
+            },
+            "url": url,
+        },
+    )
+    items = []
+    requests = []
+    for item_or_request in spider.parse_serp(response, page_number=42):
+        if isinstance(item_or_request, Request):
+            requests.append(item_or_request)
+        else:
+            items.append(item_or_request)
+    assert len(items) == 0
+    assert len(requests) == 11
+
+    assert requests[0].url == add_or_replace_parameter(url, "start", "420")
+    assert requests[0].cb_kwargs["page_number"] == 43
+
+    for rank in range(1, 11):
+        assert requests[rank].url == f"https://example.com/{rank}"
+        assert requests[rank].callback == spider.parse_result
+        assert requests[rank].meta == {
+            "crawling_logs": {"page_type": "product"},
+            "inject": [Product],
+        }
+
+
+def test_item_type_mappings():
+    # Ensure that all SerpItemType keys and values match.
+    for entry in SerpItemType:
+        assert entry.name == entry.value
+
+    # Ensure that the ITEM_TYPE_CLASSES dict maps all values from the
+    # corresponding enum except for serp.
+    actual_keys = set(ITEM_TYPE_CLASSES)
+    expected_keys = set(entry.value for entry in SerpItemType)
+    assert actual_keys == expected_keys
+
+    # Also ensure that no dict value is repeated.
+    assert len(actual_keys) == len(set(ITEM_TYPE_CLASSES.values()))
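
Note: test_item_type_mappings compares a set of SerpItemType members (set(ITEM_TYPE_CLASSES)) against a set of plain strings. That only works because SerpItemType subclasses str, so each member hashes and compares equal to its value. A minimal, self-contained sketch of that property, using a hypothetical Color enum rather than anything from this commit:

from enum import Enum


class Color(str, Enum):  # hypothetical, mirrors SerpItemType's str base
    red = "red"


# str-based enum members hash and compare equal to their plain string
# values, so sets of members and sets of strings can be compared directly.
assert Color.red == "red"
assert {Color.red} == {"red"}
assert Color.red.name == Color.red.value
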
2 changes: 1 addition & 1 deletion tox.ini

@@ -31,7 +31,7 @@ deps =
     scrapy-spider-metadata==0.2.0
     scrapy-zyte-api[provider]==0.23.0
     web-poet==0.17.1
-    zyte-common-items==0.25.0
+    zyte-common-items==0.26.2
 
 [testenv:mypy]
 deps =
91 changes: 88 additions & 3 deletions zyte_spider_templates/spiders/serp.py

@@ -1,12 +1,23 @@
+from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional, Union
 
 from pydantic import BaseModel, Field, field_validator
 from scrapy import Request
 from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
 from w3lib.url import add_or_replace_parameter
-from zyte_common_items import Serp
+from zyte_common_items import (
+    Article,
+    ArticleList,
+    ForumThread,
+    JobPosting,
+    Product,
+    ProductList,
+    Serp,
+)
 
+from ..documentation import document_enum
 from ..params import MaxRequestsParam
 from ._google_domains import GoogleDomain
 from .base import BaseSpider
@@ -48,6 +59,62 @@ class SerpMaxPagesParam(BaseModel):
     )
 
 
+@document_enum
+class SerpItemType(str, Enum):
+    article: str = "article"
+    """
+    Article data.
+    """
+
+    articleList: str = "articleList"
+    """
+    Article list data.
+    """
+
+    forumThread: str = "forumThread"
+    """
+    Forum thread data.
+    """
+
+    jobPosting: str = "jobPosting"
+    """
+    Job posting data.
+    """
+
+    product: str = "product"
+    """
+    Product data.
+    """
+
+    productList: str = "productList"
+    """
+    Product list data.
+    """
+
+
+ITEM_TYPE_CLASSES = {
+    SerpItemType.article: Article,
+    SerpItemType.articleList: ArticleList,
+    SerpItemType.forumThread: ForumThread,
+    SerpItemType.jobPosting: JobPosting,
+    SerpItemType.product: Product,
+    SerpItemType.productList: ProductList,
+}
+
+
+class SerpItemTypeParam(BaseModel):
+    item_type: Optional[SerpItemType] = Field(
+        title="Follow and Extract",
+        description=(
+            "If specified, follow organic search result links, and extract "
+            "the selected data type from the target pages. Spider output "
+            "items will be of the specified data type, not search engine "
+            "results page items."
+        ),
+        default=None,
+    )
+
+
 class GoogleDomainParam(BaseModel):
     domain: GoogleDomain = Field(
         title="Domain",
@@ -58,6 +125,7 @@ class GoogleDomainParam(BaseModel):
 
 class GoogleSearchSpiderParams(
     MaxRequestsParam,
+    SerpItemTypeParam,
     SerpMaxPagesParam,
     SearchQueriesParam,
     GoogleDomainParam,
@@ -132,4 +200,21 @@ def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
                 next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
                 yield self.get_serp_request(next_url, page_number=page_number + 1)
 
-        yield serp
+        if self.args.item_type is None:
+            yield serp
+            return
+
+        for result in serp.organicResults:
+            yield response.follow(
+                result.url,
+                callback=self.parse_result,
+                meta={
+                    "crawling_logs": {"page_type": self.args.item_type.value},
+                    "inject": [ITEM_TYPE_CLASSES[self.args.item_type]],
+                },
+            )
+
+    def parse_result(
+        self, response: DummyResponse, dynamic: DynamicDeps
+    ) -> Iterable[Any]:
+        yield next(iter(dynamic.values()))

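For context: parse_result annotates its response as DummyResponse, which tells scrapy-poet that the callback never reads the response body, so no extra download is made; the extracted item arrives through DynamicDeps, injected per the "inject" meta key set above. A minimal usage sketch for the new item_type parameter follows; the add-on setting and API key are assumptions about project setup (any working scrapy-zyte-api configuration would do) and are not part of this commit:

from scrapy.crawler import CrawlerProcess

from zyte_spider_templates.spiders.serp import GoogleSearchSpider

process = CrawlerProcess(
    settings={
        # Assumed setup: scrapy-zyte-api's add-on and a real API key are
        # needed for Google SERP requests and follow-up item extraction.
        "ADDONS": {"scrapy_zyte_api.Addon": 500},
        "ZYTE_API_KEY": "YOUR_API_KEY",  # placeholder
    }
)
process.crawl(
    GoogleSearchSpider,
    search_queries="foo bar",
    max_pages=2,
    item_type="product",  # follow organic results, yield Product items
)
process.start()
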