From 440159e7f4b648a3d1dee870482b072fbd95f866 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 25 Jun 2026 10:50:29 +0000 Subject: [PATCH] =?UTF-8?q?Harden=20Solar=20API=20client=20against=20429s:?= =?UTF-8?q?=20jitter,=20Retry-After,=20bounded=20backoff=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Route the Google Solar client through the shared call_with_retry with full jitter (de-synchronises the 32 concurrent containers per Google's "avoid synchronised requests" guidance), honouring Retry-After, a 60s max backoff (rides out the 600 QPM per-minute window), and 6 bounded retries. 429/5xx/transport errors are transient; other 4xx propagate immediately; 404-entity-not-found stays BuildingInsightsNotFoundError. On exhaustion a TransientHttpError surfaces so the subtask fails and is re-triggered (no silent degrade). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../solar/google_solar_api_client.py | 57 ++++++++-- .../solar/test_google_solar_api_client.py | 101 ++++++++++++++++-- 2 files changed, 140 insertions(+), 18 deletions(-) diff --git a/infrastructure/solar/google_solar_api_client.py b/infrastructure/solar/google_solar_api_client.py index 28e91866..6dcc2e18 100644 --- a/infrastructure/solar/google_solar_api_client.py +++ b/infrastructure/solar/google_solar_api_client.py @@ -1,8 +1,9 @@ -import time from typing import Any, Literal, Optional import requests +from infrastructure.http_retry import TransientHttpError, call_with_retry + class BuildingInsightsNotFoundError(Exception): pass @@ -10,12 +11,28 @@ class BuildingInsightsNotFoundError(Exception): class GoogleSolarApiClient: base_url: str = "https://solar.googleapis.com/v1" - MAX_RETRIES: int = 5 + # Bounded retries (NOT Google's infinite-loop example). With the 60s max + # backoff below, 6 retries span ~2 of the Solar API's per-minute (600 QPM) + # windows — enough to ride out a transient 429 burst, then give up so a + # stuck request can't blow the container timeout; the subtask fails and is + # re-triggered. + MAX_RETRIES: int = 6 + MAX_BACKOFF_SECONDS: float = 60.0 ENTITY_NOT_FOUND_ERROR: str = "Requested entity was not found." def __init__(self, api_key: str) -> None: self._api_key = api_key + @staticmethod + def _parse_retry_after(response: requests.Response) -> Optional[float]: + header = response.headers.get("Retry-After") + if header is None: + return None + try: + return float(header) + except (TypeError, ValueError): + return None + def get_building_insights( self, longitude: float, @@ -29,22 +46,40 @@ class GoogleSolarApiClient: "requiredQuality": required_quality, "key": self._api_key, } - last_exc: Optional[Exception] = None - for attempt in range(self.MAX_RETRIES): + + def _fetch() -> dict[str, Any]: try: response = requests.get(insights_url, params=params) response.raise_for_status() result: dict[str, Any] = response.json() return result except requests.exceptions.RequestException as e: + response = e.response + if response is None: + # No response = a transport-level failure (connection reset / + # read or connect timeout) — transient, retry. + raise TransientHttpError(f"Solar API transport error: {e}") from e + status = response.status_code if ( - e.response is not None - and e.response.status_code == 404 - and e.response.json()["error"]["message"] == self.ENTITY_NOT_FOUND_ERROR + status == 404 + and response.json().get("error", {}).get("message") + == self.ENTITY_NOT_FOUND_ERROR ): raise BuildingInsightsNotFoundError() from e - last_exc = e - time.sleep(2 ** attempt) + if status == 429 or status >= 500: + # Rate-limit (429) and server errors (5xx) are transient. + # Honour the server's Retry-After when present (jittered + # exponential backoff otherwise — see `call_with_retry`). + raise TransientHttpError( + f"Solar API {status}", + retry_after=self._parse_retry_after(response), + ) from e + # Other 4xx are the caller's fault — non-transient, propagate. + raise - assert last_exc is not None - raise last_exc + return call_with_retry( + _fetch, + max_retries=self.MAX_RETRIES, + max_backoff=self.MAX_BACKOFF_SECONDS, + jitter=True, + ) diff --git a/tests/infrastructure/solar/test_google_solar_api_client.py b/tests/infrastructure/solar/test_google_solar_api_client.py index d4328fc0..9deebc15 100644 --- a/tests/infrastructure/solar/test_google_solar_api_client.py +++ b/tests/infrastructure/solar/test_google_solar_api_client.py @@ -1,15 +1,100 @@ -from typing import Any -from unittest.mock import MagicMock, patch +from typing import Any, Optional +from unittest.mock import MagicMock, call, patch import pytest import requests +from infrastructure.http_retry import TransientHttpError from infrastructure.solar.google_solar_api_client import ( BuildingInsightsNotFoundError, GoogleSolarApiClient, ) +def _http_error( + status_code: int, *, headers: Optional[dict[str, str]] = None +) -> requests.exceptions.HTTPError: + resp = MagicMock() + resp.status_code = status_code + resp.headers = headers or {} + err = requests.exceptions.HTTPError(str(status_code)) + err.response = resp + return err + + +def _raising_response(err: Exception) -> MagicMock: + resp = MagicMock() + resp.raise_for_status.side_effect = err + return resp + + +def test_get_building_insights_retries_429_then_succeeds() -> None: + # Arrange — a rate-limit (429) then a success. The 429 is transient, so the + # client backs off and retries rather than failing. + client = GoogleSolarApiClient(api_key="test-key") + payload: dict[str, Any] = {"solarPotential": {"maxArrayPanelsCount": 20}} + ok = MagicMock() + ok.json.return_value = payload + ok.raise_for_status.return_value = None + + with patch("requests.get", side_effect=[_raising_response(_http_error(429)), ok]) as mock_get: + with patch("infrastructure.http_retry.time.sleep"): + # Act + result = client.get_building_insights(longitude=-0.1278, latitude=51.5074) + + # Assert + assert result == payload + assert mock_get.call_count == 2 + + +def test_get_building_insights_honours_retry_after_on_429() -> None: + # Arrange — a persistent 429 advertising Retry-After: 2s. The client sleeps + # exactly that (not a jittered computed backoff) and gives up bounded. + client = GoogleSolarApiClient(api_key="test-key") + err = _http_error(429, headers={"Retry-After": "2"}) + + with patch("requests.get", return_value=_raising_response(err)) as mock_get: + with patch("infrastructure.http_retry.time.sleep") as sleep: + # Act / Assert — exhaustion raises a TransientHttpError, not the raw HTTPError. + with pytest.raises(TransientHttpError): + client.get_building_insights(longitude=-0.1278, latitude=51.5074) + + # Bounded: MAX_RETRIES retries + the initial attempt; Retry-After honoured. + assert mock_get.call_count == GoogleSolarApiClient.MAX_RETRIES + 1 + assert call(2.0) in sleep.call_args_list + + +def test_get_building_insights_retries_on_500() -> None: + # Arrange — a 5xx is transient too; retry then succeed. + client = GoogleSolarApiClient(api_key="test-key") + payload: dict[str, Any] = {"solarPotential": {}} + ok = MagicMock() + ok.json.return_value = payload + ok.raise_for_status.return_value = None + + with patch("requests.get", side_effect=[_raising_response(_http_error(503)), ok]) as mock_get: + with patch("infrastructure.http_retry.time.sleep"): + # Act + result = client.get_building_insights(longitude=-0.1278, latitude=51.5074) + + # Assert + assert result == payload + assert mock_get.call_count == 2 + + +def test_get_building_insights_does_not_retry_a_400() -> None: + # Arrange — a non-transient client error (e.g. bad request) propagates at once. + client = GoogleSolarApiClient(api_key="test-key") + + with patch("requests.get", return_value=_raising_response(_http_error(400))) as mock_get: + with patch("infrastructure.http_retry.time.sleep"): + # Act / Assert + with pytest.raises(requests.exceptions.HTTPError): + client.get_building_insights(longitude=-0.1278, latitude=51.5074) + + assert mock_get.call_count == 1 + + # --------------------------------------------------------------------------- # Slice 1: Successful response returns parsed JSON # --------------------------------------------------------------------------- @@ -94,17 +179,19 @@ def test_get_building_insights_raises_on_entity_not_found() -> None: # --------------------------------------------------------------------------- -def test_get_building_insights_propagates_exception_after_retry_exhaustion() -> None: - # Arrange +def test_get_building_insights_fails_bounded_after_retry_exhaustion() -> None: + # Arrange — a persistent transport failure (no response). It's transient, so + # the client retries up to the bound, then surfaces a TransientHttpError for + # the caller to fail the subtask on (re-triggerable) — NOT an infinite loop. client = GoogleSolarApiClient(api_key="test-key") error = requests.exceptions.ConnectionError("persistent failure") error.response = None # type: ignore[attr-defined] with patch("requests.get", side_effect=error) as mock_get: - with patch("time.sleep"): + with patch("infrastructure.http_retry.time.sleep"): # Act / Assert - with pytest.raises(requests.exceptions.ConnectionError): + with pytest.raises(TransientHttpError): client.get_building_insights(longitude=-0.1278, latitude=51.5074) - assert mock_get.call_count == GoogleSolarApiClient.MAX_RETRIES + assert mock_get.call_count == GoogleSolarApiClient.MAX_RETRIES + 1