mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Replaces the handler's whole-pipeline Session (one transaction across all three stages, connection pinned during Ingestion's external IO) with a Unit-of-Work per stage (ADR-0012, added here). Each stage runs its batch in one unit and commits once; any property raising aborts the batch and the subtask fails noisily.

- BaselineOrchestrator(unit_of_work, rebaseliner): one unit for the batch, commit once. Raise on a pre-SAP10 property leaves the unit uncommitted.
- IngestionOrchestrator(unit_of_work, epc_fetcher, geospatial_repo, solar_fetcher): fetch/write split — phase 1 fetches the whole batch (EPC / coords / solar) with NO unit open; phase 2 writes in one unit and commits. The connection is never held during external IO. Geospatial S3 repo stays injected (reference data, not transactional).
- Handler: module-scoped engine (pool reused across warm invocations) + a UoW factory; whole-pipeline `with Session` gone. `build_first_run_pipeline` composes on the factory. Source clients still behind the raising seam.
- ADR-0012 records the decision (per-stage boundary, all-or-nothing batch, idempotent re-run, fetch/write split, module-scoped engine).

Modelling stub left untouched (no-op, no DB) per the ADR.

Tests: orchestrators on a shared FakeUnitOfWork (assert persisted batch + exactly-once commit + no-commit-on-raise). New real-DB E2E integration test: real PostgresUnitOfWork, Ingestion writes the EPC → Baseline reads it back through the repo → re-run replaces, not duplicates (1 EPC row, 1 baseline row after two runs).

121 pass in tests/; pyright strict clean; AAA.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
121 lines
4.4 KiB
Python
121 lines
4.4 KiB
Python
from __future__ import annotations
|
|
|
|
import os
|
|
from collections.abc import Callable
|
|
from typing import Any, Optional, Protocol
|
|
|
|
from sqlalchemy import Engine
|
|
from sqlmodel import Session
|
|
|
|
from applications.ara_first_run.ara_first_run_trigger_body import (
|
|
AraFirstRunTriggerBody,
|
|
)
|
|
from domain.baseline.rebaseliner import StubRebaseliner
|
|
from infrastructure.postgres.config import PostgresConfig
|
|
from infrastructure.postgres.engine import make_engine
|
|
from orchestration.baseline_orchestrator import BaselineOrchestrator
|
|
from orchestration.first_run_pipeline import FirstRunPipeline
|
|
from orchestration.ingestion_orchestrator import (
|
|
EpcFetcher,
|
|
IngestionOrchestrator,
|
|
SolarFetcher,
|
|
)
|
|
from orchestration.modelling_orchestrator import ModellingOrchestrator
|
|
from orchestration.task_orchestrator import TaskOrchestrator
|
|
from repositories.geospatial.geospatial_repository import GeospatialRepository
|
|
from repositories.materials.materials_repository import MaterialsRepository
|
|
from repositories.postgres_unit_of_work import PostgresUnitOfWork
|
|
from repositories.scenario.scenario_repository import ScenarioRepository
|
|
from repositories.unit_of_work import UnitOfWork
|
|
from utilities.aws_lambda.subtask_handler import subtask_handler
|
|
|
|
# Held at module scope (ADR-0012): a warm Lambda invocation picks up the
# engine, and with it the connection pool, left by the previous invocation
# instead of building a fresh pool each time.
_engine: Optional[Engine] = None


def _get_engine() -> Engine:
    """Return the shared engine, building it from the environment on first use.

    The first call reads the Postgres settings from a copy of ``os.environ``
    and stores the resulting engine in the module-level ``_engine``; later
    calls in the same process hand back that same object.
    """
    global _engine
    if _engine is not None:
        return _engine
    config = PostgresConfig.from_env(dict(os.environ))
    _engine = make_engine(config)
    return _engine
|
|
|
|
|
|
class _RunsFirstRun(Protocol):
    """Structural type for the object the handler hands a validated trigger to.

    Only ``run`` is required: this is the one part of ``FirstRunPipeline``
    that the handler relies on.
    """

    def run(self, command: AraFirstRunTriggerBody) -> None:
        ...
|
|
|
|
|
|
def dispatch_first_run(body: dict[str, Any], *, pipeline: _RunsFirstRun) -> None:
    """Turn the raw event body into a trigger command and run the pipeline.

    This is all the deciding the handler does. It is a separate, named
    function so it can be exercised without the Lambda runtime. There is no
    business logic here: the body is validated against
    ``AraFirstRunTriggerBody`` and the result is passed to ``pipeline.run``.
    Nothing is caught, so a validation failure or an error raised by the
    pipeline propagates to the caller unchanged.
    """
    pipeline.run(AraFirstRunTriggerBody.model_validate(body))
|
|
|
|
|
|
def build_first_run_pipeline(
    *,
    unit_of_work: Callable[[], UnitOfWork],
    epc_fetcher: EpcFetcher,
    geospatial_repo: GeospatialRepository,
    solar_fetcher: SolarFetcher,
) -> FirstRunPipeline:
    """Wire the three real stages together around a Unit-of-Work factory.

    Ingestion and Baseline both receive the same ``unit_of_work`` factory;
    each stage opens its own unit(s) and commits per batch (ADR-0012), so the
    handler itself never holds a session. The source clients are parameters
    because their configuration is not settled yet — see
    ``_source_clients_from_env``. Modelling is a stub (#1136) and gets no
    unit of work; its Scenario / Materials ports are seams.
    """
    # Stages are built in pipeline order: ingestion, baseline, modelling.
    ingestion_stage = IngestionOrchestrator(
        unit_of_work=unit_of_work,
        epc_fetcher=epc_fetcher,
        geospatial_repo=geospatial_repo,
        solar_fetcher=solar_fetcher,
    )
    baseline_stage = BaselineOrchestrator(
        unit_of_work=unit_of_work,
        rebaseliner=StubRebaseliner(),
    )
    modelling_stage = ModellingOrchestrator(
        scenario_repo=ScenarioRepository(),
        materials_repo=MaterialsRepository(),
    )
    return FirstRunPipeline(
        ingestion=ingestion_stage,
        baseline=baseline_stage,
        modelling=modelling_stage,
    )
|
|
|
|
|
|
def _source_clients_from_env() -> tuple[EpcFetcher, GeospatialRepository, SolarFetcher]:
    """Build the Ingestion source clients: EPC API, geospatial S3, Google Solar.

    TODO(deploy): the client configuration (EPC auth token, Google Solar API
    key, geospatial S3 parquet reader), the env-var names, and the pandas/s3fs
    runtime dependencies are not settled. That wiring is a separate Terraform
    piece, out of scope for #1136.

    Until it lands this function always raises ``NotImplementedError``, so the
    lambda fails loudly instead of half-running.
    """
    reason = (
        "ara_first_run source-client wiring (EPC / Google Solar / geospatial) "
        "is pending the deploy/Terraform piece; see #1136."
    )
    raise NotImplementedError(reason)
|
|
|
|
|
|
@subtask_handler()
def handler(
    body: dict[str, Any], context: Any, task_orchestrator: TaskOrchestrator
) -> None:
    """Assemble the first-run pipeline and dispatch the event body to it.

    A Unit-of-Work factory is built over the module-scoped engine, the
    Ingestion source clients are obtained, and the composed pipeline is handed
    to ``dispatch_first_run`` together with ``body``.

    ``context`` and ``task_orchestrator`` are accepted but not used in this
    body — presumably they are part of the signature ``subtask_handler``
    expects; confirm against the decorator.
    """
    engine = _get_engine()

    def open_session() -> Session:
        # A new Session bound to the shared engine on every call.
        return Session(engine)

    def unit_of_work() -> UnitOfWork:
        # A new unit per call; it is given the session factory, not a session.
        return PostgresUnitOfWork(open_session)

    # Resolved after the engine, matching the original order of side effects.
    epc_fetcher, geospatial_repo, solar_fetcher = _source_clients_from_env()

    dispatch_first_run(
        body,
        pipeline=build_first_run_pipeline(
            unit_of_work=unit_of_work,
            epc_fetcher=epc_fetcher,
            geospatial_repo=geospatial_repo,
            solar_fetcher=solar_fetcher,
        ),
    )
|