Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Google Search: add geolocation parameters #80

Merged
merged 9 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ ignore_missing_imports = true

[tool.black]
target-version = ["py38", "py39", "py310", "py311", "py312"]
force-exclude = "template.py"
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ ignore =
# First line should not be the function's "signature"
D402

exclude =
template.py

per-file-ignores =
# F401: Ignore "imported but unused" errors in __init__ files, as those
# imports are there to expose submodule functions so they can be imported
Expand Down
141 changes: 2 additions & 139 deletions tests/test_ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import pytest
import requests
import scrapy
from pydantic import ValidationError
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import get_spider_metadata
from web_poet.page_inputs.browser import BrowserResponse
Expand All @@ -21,43 +20,13 @@
GEOLOCATION_OPTIONS_WITH_CODE,
Geolocation,
)
from zyte_spider_templates.spiders.ecommerce import (
EcommerceCrawlStrategy,
EcommerceSpider,
)
from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

from . import get_crawler
from .test_utils import URL_TO_DOMAIN
from .utils import assertEqualSpiderMetadata


def test_parameters():
with pytest.raises(ValidationError):
EcommerceSpider()

EcommerceSpider(url="https://example.com")
EcommerceSpider(
url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.automatic
)
EcommerceSpider(url="https://example.com", crawl_strategy="automatic")

with pytest.raises(ValidationError):
EcommerceSpider(url="https://example.com", crawl_strategy="unknown")

EcommerceSpider(
url="https://example.com", crawl_strategy="direct_item", search_queries=""
)
EcommerceSpider(
url="https://example.com", crawl_strategy="automatic", search_queries="foo"
)
with pytest.raises(ValidationError):
EcommerceSpider(
url="https://example.com",
crawl_strategy="direct_item",
search_queries="foo",
)


def test_start_requests():
url = "https://example.com"
crawler = get_crawler()
Expand Down Expand Up @@ -305,108 +274,6 @@ def test_parse_search_request_template_probability(probability, yields_items):
assert items if yields_items else not items


def test_arguments():
# Ensure passing no arguments works.
crawler = get_crawler()

# Needed since it's a required argument.
base_kwargs = {"url": "https://example.com"}

EcommerceSpider.from_crawler(crawler, **base_kwargs)

for param, arg, setting, old_setting_value, getter_name, new_setting_value in (
("max_requests", "123", "ZYTE_API_MAX_REQUESTS", None, "getint", 123),
(
"geolocation",
"DE",
"ZYTE_API_AUTOMAP_PARAMS",
None,
"getdict",
{"geolocation": "DE"},
),
(
"geolocation",
"DE",
"ZYTE_API_AUTOMAP_PARAMS",
'{"browserHtml": true}',
"getdict",
{"browserHtml": True, "geolocation": "DE"},
),
(
"geolocation",
"DE",
"ZYTE_API_AUTOMAP_PARAMS",
'{"geolocation": "IE"}',
"getdict",
{"geolocation": "DE"},
),
(
"geolocation",
"DE",
"ZYTE_API_PROVIDER_PARAMS",
None,
"getdict",
{"geolocation": "DE"},
),
(
"geolocation",
"DE",
"ZYTE_API_PROVIDER_PARAMS",
'{"browserHtml": true}',
"getdict",
{"browserHtml": True, "geolocation": "DE"},
),
(
"geolocation",
"DE",
"ZYTE_API_PROVIDER_PARAMS",
'{"geolocation": "IE"}',
"getdict",
{"geolocation": "DE"},
),
(
"extract_from",
"browserHtml",
"ZYTE_API_PROVIDER_PARAMS",
None,
"getdict",
{
"productOptions": {"extractFrom": "browserHtml"},
"productNavigationOptions": {"extractFrom": "browserHtml"},
},
),
(
"extract_from",
"httpResponseBody",
"ZYTE_API_PROVIDER_PARAMS",
{"geolocation": "US"},
"getdict",
{
"productOptions": {"extractFrom": "httpResponseBody"},
"productNavigationOptions": {"extractFrom": "httpResponseBody"},
"geolocation": "US",
},
),
(
"extract_from",
None,
"ZYTE_API_PROVIDER_PARAMS",
{"geolocation": "US"},
"getdict",
{"geolocation": "US"},
),
):
kwargs = {param: arg}
settings = {}
if old_setting_value is not None:
settings[setting] = old_setting_value
crawler = get_crawler(settings=settings)
spider = EcommerceSpider.from_crawler(crawler, **kwargs, **base_kwargs)
getter = getattr(crawler.settings, getter_name)
assert getter(setting) == new_setting_value
assert spider.allowed_domains == ["example.com"] # type: ignore[attr-defined]


def test_metadata():
actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True)
expected_metadata = {
Expand Down Expand Up @@ -550,11 +417,7 @@ def test_metadata():
{"type": "null"},
],
"default": None,
"description": (
"ISO 3166-1 alpha-2 2-character string specified in "
"https://docs.zyte.com/zyte-api/usage/reference.html"
"#operation/extract/request/geolocation."
),
"description": "Country of the IP addresses to use.",
"enumMeta": {
code: {
"title": GEOLOCATION_OPTIONS_WITH_CODE[code],
Expand Down
Loading