Merge branch 'main' of https://github.com/Hestia-Homes/Model into feature/per-cert-mapper-validation

This commit is contained in:
Khalim Conn-Kowlessar 2026-06-02 16:10:41 +00:00
commit 69995edec8
97 changed files with 3494 additions and 351 deletions

View file

@ -27,3 +27,4 @@ pytest-postgresql
# Formatting
black==26.1.0
boto3-stubs
openai

View file

@ -92,6 +92,9 @@ on:
TF_VAR_magicplan_api_key:
required: false
TF_VAR_openai_api_key:
required: false
jobs:
deploy:
runs-on: ubuntu-latest
@ -163,6 +166,7 @@ jobs:
TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }}
TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }}
TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }}
TF_VAR_openai_api_key: ${{ secrets.TF_VAR_openai_api_key }}
run: |
ECR_REPO_URL_VAR=""
if [[ -n "${{ inputs.ecr_repo }}" ]]; then
@ -213,6 +217,7 @@ jobs:
TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }}
TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }}
TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }}
TF_VAR_openai_api_key: ${{ secrets.TF_VAR_openai_api_key }}
run: |
EXTRA_VARS=""
if [[ -n "${{ inputs.ecr_repo }}" ]]; then

32
.github/workflows/ddd_tests.yml vendored Normal file
View file

@ -0,0 +1,32 @@
name: Run DDD tests
on:
pull_request:
branches:
- "**"
# The DDD rewrite (tests/) defines SQLModel table classes that map to the same
# physical tables as the legacy backend models. Both sets share the one global
# SQLModel.metadata, so they cannot be imported into the same pytest process —
# hence this runs as a separate workflow from unit_tests.yml until the legacy
# models are retired. It also covers the Lambda packaging linter
# (tests/test_lambda_packaging.py). Its DB is spawned in-process by
# pytest-postgresql, so no `services: postgres` and no env secrets are required,
# and it runs independently of (and in parallel with) the legacy unit tests.
jobs:
ddd-tests:
name: DDD tests (Docker)
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Build test image
run: docker build -f Dockerfile.test -t model-test .
- name: Run DDD tests
run: |
docker run --rm \
--network host \
model-test pytest -vv tests/

View file

@ -203,6 +203,47 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
# ============================================================
# Build Landlord Description Overrides image and Push
# ============================================================
landlordDescriptionOverrides_image:
needs: [determine_stage, shared_terraform]
uses: ./.github/workflows/_build_image.yml
with:
ecr_repo: landlord_description_overrides-${{ needs.determine_stage.outputs.stage }}
dockerfile_path: applications/landlord_description_overrides/Dockerfile
build_context: .
build_args: |
DEV_DB_HOST=$DEV_DB_HOST
DEV_DB_PORT=$DEV_DB_PORT
DEV_DB_NAME=$DEV_DB_NAME
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }}
DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }}
DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
# ============================================================
# Deploy Landlord Description Overrides Lambda
# ============================================================
landlordDescriptionOverrides_lambda:
needs: [landlordDescriptionOverrides_image, determine_stage]
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: landlordDescriptionOverrides
lambda_path: deployment/terraform/lambda/landlordDescriptionOverrides
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: landlord_description_overrides-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.landlordDescriptionOverrides_image.outputs.image_digest }}
terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
TF_VAR_openai_api_key: ${{ secrets.DEV_OPENAI_API_KEY }}
# ============================================================
# Build Bulk Address2UPRN Combiner image and Push
# ============================================================

View file

@ -43,6 +43,16 @@ jobs:
build_context: .
service_name: postcode-splitter-ddd
# ============================================================
# Landlord Description Overrides
# ============================================================
landlord_description_overrides_smoke_test:
uses: ./.github/workflows/_smoke_test_lambda.yml
with:
dockerfile_path: applications/landlord_description_overrides/Dockerfile
build_context: .
service_name: landlord-description-overrides
# ============================================================
# Bulk Address2UPRN Combiner
# ============================================================

View file

@ -61,14 +61,7 @@ jobs:
-e DB_PORT=5432 \
model-test pytest -vv -m 'not integration'
# The DDD rewrite (tests/) defines SQLModel table classes that map to the
# same physical tables as the legacy backend models. Both sets share the
# one global SQLModel.metadata, so they cannot be imported into the same
# pytest process. It runs as a separate invocation until the legacy
# models are retired. Its DB is spawned in-process by pytest-postgresql,
# so no DB service or env is required.
- name: Run DDD tests
run: |
docker run --rm \
--network host \
model-test pytest -vv tests/
# The DDD rewrite (tests/) runs in its own workflow (ddd_tests.yml): its
# SQLModel table classes map to the same physical tables as the legacy
# backend models and share the one global SQLModel.metadata, so the two
# suites cannot be imported into the same pytest process.

View file

@ -1,7 +1,84 @@
# Ubiquitous Language
This file has been **superseded by [CONTEXT.md](./CONTEXT.md)**.
Domain terminology glossary for this project. Generated and maintained by the `/ubiquitous-language` Claude Code skill.
The project's domain glossary now lives at the repo root in `CONTEXT.md`, maintained by the `/grill-with-docs` skill (which replaced `/ubiquitous-language`).
Invoke `/ubiquitous-language` in any session to extract new terms from the conversation, flag ambiguities, and update this file with canonical definitions.
If you arrived here from a link in `CLAUDE.md` or older docs, follow the link above. This file is kept only to preserve git history and may be removed once internal references are updated.
---
## Energy Performance Certificates
| Term | Definition | Aliases to avoid |
|------|------------|------------------|
| **EPC** | An Energy Performance Certificate — a government-issued document rating a dwelling's energy efficiency from A (best) to G (worst). | "energy certificate", "energy report" |
| **Certificate Number** | The unique identifier assigned to an EPC by the government registry. | "cert number", "EPC ID" |
| **Registration Date** | The date an EPC was lodged with the government register; used to identify the most recent certificate for a property. | "assessment date", "submission date" |
| **EPC Band** | A single letter AG representing a property's current or potential energy efficiency rating. | "energy rating", "EPC grade", "EPC score" |
| **Schema Type** | The versioned RdSAP or SAP schema that describes the structure of a certificate's raw data (e.g. `RdSAP-Schema-21.0.1`). | "schema version", "EPC format" |
| **Domestic Certificate** | An EPC issued for a residential dwelling, as opposed to a commercial one. | "residential EPC", "home EPC" |
## Properties and Addresses
| Term | Definition | Aliases to avoid |
|------|------------|------------------|
| **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" |
| **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" |
| **Unstandardised Address** | A frozen dataclass (`domain.addresses.unstandardised_address.UnstandardisedAddress`) capturing a single address exactly as a customer supplied it, before any standardisation: a free-text `address` line (intentionally NOT normalised), a canonical `postcode` (a `Postcode` value object, sanitised on construction), an optional `org_reference` (the customer's own identifier for the property), and `additional_info` (the full source row — every column of the customer's upload, preserved verbatim). | "user address", "asset list", "raw address", "landlord address", "Hyde address" |
| **Address List** | A nominal `NewType` over `list[UnstandardisedAddress]` (`domain.addresses.unstandardised_address.AddressList`) — a batch of unstandardised addresses, such as one customer's bulk-onboarding upload or a postcode-grouped sub-batch produced for downstream processing. Being nominal, it is constructed explicitly: `AddressList([...])`. It is the raw *input* to ingestion; the standardised *output* is a **Standardised Asset List**. | "asset list", "Hyde address list", "user addresses" |
| **Standardised Asset List (SAL)** | A customer's property portfolio after ingestion has cleaned and standardised it — each property carrying a canonical field set (UPRN, standardised address, postcode, property type, built form, …). It is the standardised *output* of the pipeline whose raw *input* is an **Address List** of **Unstandardised Addresses**; generated by the `SALOrchestrator`. (Legacy implementation: `asset_list.AssetList` via `load_standardised_asset_list`.) | "address list" (that is the raw input), "asset register", "portfolio list" |
| **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" |
## Address Matching
| Term | Definition | Aliases to avoid |
|------|------------|------------------|
| **Lexiscore** | A similarity score in [0, 1] between an unstandardised address and a candidate EPC address; combines token overlap and character-level similarity. | "score", "match score", "similarity" |
| **Lexirank** | Dense rank of candidates sorted by lexiscore descending; rank 1 = best match. | "rank", "position" |
| **UPRN Candidate** | An EPC search result that is a plausible match for a given unstandardised address, before scoring decides the winner. | "match candidate", "result" |
| **Score Threshold** | The minimum lexiscore (currently 0.6) below which no match is returned even if a candidate exists. | "minimum score", "cutoff" |
| **Ambiguous Match** | A matching outcome where two or more candidates share lexirank 1, making it impossible to select a unique winner. | "tie", "draw", "duplicate" |
| **Best Match** | The single UPRN candidate with lexirank 1 that meets or exceeds the score threshold. | "winner", "top result" |
## API and Integration
| Term | Definition | Aliases to avoid |
|------|------------|------------------|
| **EPC Search Result** | A lightweight record returned by the government domestic search endpoint — contains address lines, postcode, UPRN, band, and certificate number but not the full certificate data. | "search row", "EPC row", "result" |
| **EPC Property Data** | The fully mapped domain object produced after fetching and parsing a complete EPC certificate. | "EPC data", "certificate data", "parsed EPC" |
| **Old EPC API** | The retired government API (`epc.opendatacommunities.org`) using HTTP Basic auth; decommissioned May 2026. | "legacy API" |
| **New EPC API** | The replacement government API (`api.get-energy-performance-data.communities.gov.uk`) using Bearer token auth. | "new API", "current API" |
| **Bearer Token** | The auth credential required by the new EPC API; stored in the `EPC_AUTH_TOKEN` environment variable. | "API key", "auth token", "secret" |
## Relationships
- An **EPC** belongs to exactly one **Dwelling** and has one **Certificate Number**.
- A **Dwelling** may have multiple **EPCs** across time; the one with the most recent **Registration Date** is the current one.
- A **UPRN** identifies a **Dwelling** permanently; it does not change when the property changes owner.
- An **EPC Search Result** is a summary; it points to a full **EPC** via its **Certificate Number**.
- An **Address List** is an ordered batch of **Unstandardised Addresses**; a customer's bulk-onboarding upload arrives as one.
- Ingestion turns an **Address List** (raw input) into a **Standardised Asset List** (standardised output) — the **SAL Orchestrator** drives this.
- **Address Matching** uses an **Unstandardised Address** and **Postcode** to find a **UPRN** by scoring **UPRN Candidates** from an EPC search.
- A **Lexirank** of 1 with no **Ambiguous Match** and a **Lexiscore** ≥ the **Score Threshold** produces a **Best Match**.
## Example dialogue
> **Dev:** "We have an unstandardised address and postcode. How do we find the UPRN?"
> **Domain expert:** "Search the **New EPC API** by **Postcode** — you get back a list of **EPC Search Results** for that area. Each one has an address and a **UPRN**. Score each against the **Unstandardised Address** using the **Lexiscore**. If the top **UPRN Candidate** scores above the **Score Threshold** and there's no **Ambiguous Match**, that's your **Best Match**."
> **Dev:** "What if two results share the same address line 1?"
> **Domain expert:** "That's an **Ambiguous Match** — two candidates at **Lexirank** 1. Fall back to scoring on the full address using all address lines joined together. If that still ties, return nothing."
> **Dev:** "Once we have the best match, do we use the UPRN or fetch the full EPC?"
> **Domain expert:** "Depends on what you need. The **EPC Search Result** gives you the **EPC Band** and **Certificate Number**. If you need energy efficiency detail, use the **Certificate Number** to fetch the full **EPC Property Data**."
## Flagged ambiguities
- **"address"** appears in several senses: the **Unstandardised Address** dataclass (one customer-supplied address before standardisation), its free-text `address` field, and the normalised address lines on an **EPC Search Result**. Always qualify: "unstandardised address" vs "EPC address" or "address line 1". Within `domain/addresses/`, the dataclass is **Unstandardised Address**; in upstream ingestion contexts (CSV columns, SQS payloads) "address" may still mean the bare free-text string.
- **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments.
- **"user_inputed_address"** (and `user_address`) in `backend/address2UPRN/` is legacy naming — a misspelled synonym for what is now the **Unstandardised Address**. That address-matching code has not been renamed; new code should use **Unstandardised Address**.
- **"Hyde address list"** — "Hyde" is the name of one customer, not a domain concept. A domain expert may say "the Hyde address list" because Hyde is the customer in front of them, but the generalised term is **Address List** (and **Unstandardised Address** for a single item). A customer's identity is data — it belongs in `org_reference` or `additional_info`, never in a type or module name.
- **"address list"** vs **"asset list"** — opposite ends of the ingestion pipeline; do not conflate them. An **Address List** is the raw *input* (unstandardised addresses as the customer supplied them); a **Standardised Asset List** is the standardised *output*. The historical `AssetList` dataclass (now **Unstandardised Address**) misnamed the input an "asset list" — that mistake is what the rename corrected.
- **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter.

View file

@ -25,6 +25,9 @@ COPY infrastructure/ infrastructure/
COPY orchestration/ orchestration/
COPY repositories/ repositories/
COPY utilities/ utilities/
# domain.property.site_notes -> datatypes.epc.domain.epc_property_data; without
# this the lambda fails at init with "No module named 'datatypes'".
COPY datatypes/ datatypes/
COPY applications/ applications/
# Place the handler at the Lambda task root so the runtime can resolve

View file

@ -0,0 +1,34 @@
FROM public.ecr.aws/lambda/python:3.11
# Postgres host/port/database are baked into the image at build time from
# the deploy workflow's --build-arg values (GitHub Actions DEV_DB_* secrets),
# mirroring backend/postcode_splitter/handler/Dockerfile. They map onto the
# POSTGRES_* names PostgresConfig.from_env reads. Username/password are NOT
# baked in -- Terraform injects those as Lambda env vars from Secrets Manager.
ARG DEV_DB_HOST
ARG DEV_DB_PORT
ARG DEV_DB_NAME
ENV POSTGRES_HOST=${DEV_DB_HOST}
ENV POSTGRES_PORT=${DEV_DB_PORT}
ENV POSTGRES_DATABASE=${DEV_DB_NAME}
WORKDIR /var/task
COPY applications/landlord_description_overrides/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the layered source the handler imports from. The new splitter pulls
# only DDD-shaped packages — no pandas, no legacy backend/.
COPY domain/ domain/
COPY infrastructure/ infrastructure/
COPY orchestration/ orchestration/
COPY repositories/ repositories/
COPY utilities/ utilities/
COPY applications/ applications/
# Place the handler at the Lambda task root so the runtime can resolve
# ``main.handler`` without an extra package prefix.
COPY applications/landlord_description_overrides/handler.py /var/task/main.py
CMD ["main.handler"]

View file

@ -0,0 +1,169 @@
import logging
import os
from typing import Any
import boto3
from applications.landlord_description_overrides.landlord_description_overrides_trigger_body import (
LandlordDescriptionOverridesTriggerBody,
)
from domain.epc.built_form_type import BuiltFormType
from domain.epc.property_type import PropertyType
from domain.epc.roof_type import RoofType
from domain.epc.wall_type import WallType
from domain.epc.wall_type_construction_dates import (
wall_type_construction_date_prompt_hint,
)
from infrastructure.chatgpt.chatgpt import ChatGPT
from infrastructure.chatgpt.chatgpt_column_classifier import ChatGptColumnClassifier
from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import (
LandlordOverridesRepository,
)
from infrastructure.postgres.config import PostgresConfig
from infrastructure.postgres.engine import commit_scope, make_engine, make_session
from infrastructure.postgres.landlord_built_form_type_override_table import (
LandlordBuiltFormTypeOverrideRow,
)
from infrastructure.postgres.landlord_property_type_override_table import (
LandlordPropertyTypeOverrideRow,
)
from infrastructure.postgres.landlord_roof_type_override_table import (
LandlordRoofTypeOverrideRow,
)
from infrastructure.postgres.landlord_wall_type_override_table import (
LandlordWallTypeOverrideRow,
)
from infrastructure.s3.csv_s3_client import CsvS3Client
from infrastructure.s3.s3_uri import parse_s3_uri
from orchestration.classifiable_column import ClassifiableColumn
from orchestration.landlord_description_overrides_orchestrator import (
LandlordDescriptionOverridesOrchestrator,
)
from orchestration.task_orchestrator import TaskOrchestrator
from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import (
UnstandardisedAddressListCsvS3Repository,
)
from utilities.aws_lambda.subtask_handler import subtask_handler
logger = logging.getLogger(__name__)
def _build_columns(
column_mapping: dict[str, str], chat_gpt: ChatGPT, session: Any
) -> list[ClassifiableColumn[Any]]:
"""One ClassifiableColumn per mapped category.
``column_mapping`` is ``{category -> source CSV header}``. One header may
feed several categories -- e.g. ``"Property Type"`` -> property_type and
built_form_type -- which falls out naturally because each is a separate
entry. Unknown categories are skipped.
"""
factories = {
"property_type": lambda src: ClassifiableColumn(
name="property_type",
source_column=src,
classifier=ChatGptColumnClassifier(
chat_gpt, PropertyType, PropertyType.UNKNOWN
),
repo=LandlordOverridesRepository[PropertyType](
session, LandlordPropertyTypeOverrideRow
),
),
"built_form_type": lambda src: ClassifiableColumn(
name="built_form_type",
source_column=src,
classifier=ChatGptColumnClassifier(
chat_gpt, BuiltFormType, BuiltFormType.UNKNOWN
),
repo=LandlordOverridesRepository[BuiltFormType](
session, LandlordBuiltFormTypeOverrideRow
),
),
"wall_type": lambda src: ClassifiableColumn(
name="wall_type",
source_column=src,
classifier=ChatGptColumnClassifier(
chat_gpt,
WallType,
WallType.UNKNOWN,
extra_instructions=wall_type_construction_date_prompt_hint(),
),
repo=LandlordOverridesRepository[WallType](
session, LandlordWallTypeOverrideRow
),
),
"roof_type": lambda src: ClassifiableColumn(
name="roof_type",
source_column=src,
classifier=ChatGptColumnClassifier(
chat_gpt, RoofType, RoofType.UNKNOWN
),
repo=LandlordOverridesRepository[RoofType](
session, LandlordRoofTypeOverrideRow
),
),
}
columns: list[ClassifiableColumn[Any]] = []
for category, source_column in column_mapping.items():
factory = factories.get(category)
if factory is None:
logger.warning("Unknown classifier category %r; skipping.", category)
continue
columns.append(factory(source_column))
return columns
@subtask_handler()
def handler(
body: dict[str, Any], context: Any, task_orchestrator: TaskOrchestrator
) -> dict[str, int]:
trigger = LandlordDescriptionOverridesTriggerBody.model_validate(body)
# The classifier reads a dedicated CSV of the classifier columns (raw
# landlord headers preserved), converted from the upload by the frontend, so
# the S3 bucket comes from the trigger URI rather than a fixed env var.
bucket, _key = parse_s3_uri(trigger.s3_uri)
# boto3.client is overloaded per-service in the installed stubs; cast to Any
# so the strict-mode checker treats it as opaque.
boto3_client: Any = (
boto3.client
) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
boto_s3: Any = boto3_client("s3")
csv_client = CsvS3Client(boto_s3, bucket)
unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository(
csv_client, bucket
)
# Raw rows, not load_batch: the classifier CSV carries the description
# columns but not the canonical address/postcode columns load_batch requires.
rows = csv_client.read_rows(trigger.s3_uri)
engine = make_engine(PostgresConfig.from_env(os.environ))
# The session is built up front (SQLModel sessions are lazy, so no
# connection is checked out yet) and owned by this handler. Classification
# runs first and calls ChatGPT, which is slow; we deliberately keep no
# transaction open across it. Only the persistence below -- inside
# ``commit_scope`` -- holds a connection.
session = make_session(engine)
try:
chat_gpt = ChatGPT()
columns = _build_columns(trigger.column_mapping, chat_gpt, session)
orchestrator = LandlordDescriptionOverridesOrchestrator(
unstandardised_address_repo=unstandardised_address_repo,
columns=columns,
)
classified = orchestrator.classify_from_rows(rows)
with commit_scope(session):
orchestrator.persist(classified, portfolio_id=trigger.portfolio_id)
finally:
session.close()
counts = {name: len(mapping) for name, mapping in classified.items()}
for name, n in counts.items():
logger.info("Classified %d descriptions for column %r.", n, name)
return counts

View file

@ -0,0 +1,19 @@
from uuid import UUID
from pydantic import BaseModel, ConfigDict
class LandlordDescriptionOverridesTriggerBody(BaseModel):
model_config = ConfigDict(extra="allow")
task_id: UUID
sub_task_id: UUID
s3_uri: str
# ``portfolio_id`` is ``bigint`` in the ``landlord_*_overrides`` schema --
# Python ``int`` is unbounded so the Pydantic side stays simple; the
# SQLModel row class pins the storage to ``BigInteger``.
portfolio_id: int
# category -> source CSV header (the classifier subset of the upload
# mapping). Defaulted so a malformed/empty message classifies nothing
# rather than failing validation.
column_mapping: dict[str, str] = {}

View file

@ -0,0 +1,5 @@
POSTGRES_HOST=
POSTGRES_PORT=5432
POSTGRES_USERNAME=
POSTGRES_PASSWORD=
POSTGRES_DATABASE=

View file

@ -0,0 +1,9 @@
services:
landlord_overrides:
build:
context: ../../../
dockerfile: applications/landlord_description_overrides/Dockerfile
ports:
- "9002:8080"
env_file:
- .env.local

View file

@ -0,0 +1,16 @@
#!/usr/bin/env python3
import json
import requests
HOST = "localhost"
PORT = "9002"
LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations"
payload = {"Records": [{"body": json.dumps({})}]}
response = requests.post(LAMBDA_URL, json=payload)
print("Status code:", response.status_code)
print("Response:")
print(response.text)

View file

@ -0,0 +1,12 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")"
if [ ! -f .env.local ]; then
cp .env.local.example .env.local
echo "Created .env.local from the template — fill it in, then re-run." >&2
exit 1
fi
docker compose build --no-cache
docker compose up --force-recreate

View file

@ -0,0 +1,5 @@
boto3
pydantic
sqlmodel
psycopg2-binary
openai==1.93.0

View file

@ -9,11 +9,11 @@ from applications.postcode_splitter.postcode_splitter_trigger_body import (
PostcodeSplitterTriggerBody,
)
from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
from infrastructure.csv_s3_client import CsvS3Client
from infrastructure.s3.csv_s3_client import CsvS3Client
from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator
from orchestration.task_orchestrator import TaskOrchestrator
from repositories.user_address.user_address_csv_s3_repository import (
UserAddressCsvS3Repository,
from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import (
UnstandardisedAddressListCsvS3Repository,
)
from utilities.aws_lambda.subtask_handler import subtask_handler
@ -29,17 +29,19 @@ def handler(
# boto3.client is overloaded per-service in the installed stubs; cast
# to Any so the strict-mode checker treats it as opaque.
boto3_client: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
boto3_client: Any = (
boto3.client
) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
boto_s3: Any = boto3_client("s3")
boto_sqs: Any = boto3_client("sqs")
csv_client = CsvS3Client(boto_s3, bucket)
user_address_repo = UserAddressCsvS3Repository(csv_client, bucket)
unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository(csv_client, bucket)
queue_client = Address2UprnQueueClient(boto_sqs, queue_url)
splitter = PostcodeSplitterOrchestrator(
task_orchestrator=task_orchestrator,
user_address_repo=user_address_repo,
unstandardised_address_repo=unstandardised_address_repo,
queue_client=queue_client,
)

View file

@ -32,6 +32,12 @@ RUN pip install --no-cache-dir -r requirements.txt
COPY utils/ utils/
COPY backend/ backend/
COPY datatypes/ datatypes/
# main.py imports infrastructure.epc_client.epc_client_service (EpcClientService);
# without this the lambda fails at init with "No module named 'infrastructure'".
COPY infrastructure/ infrastructure/
# EpcClientService -> datatypes.epc.domain.mapper -> domain.sap10_calculator;
# without this the lambda fails at init with "No module named 'domain'".
COPY domain/ domain/
# Copy the handler
COPY backend/address2UPRN/main.py .

View file

@ -13,6 +13,7 @@ from backend.app.bulk_uploads.schema import (
CombinedResultsResponse,
CombinerTriggerRequest,
FlagsSummary,
LandlordOverridesTriggerRequest,
PostcodeSplitterTriggerRequest,
)
from backend.app.bulk_uploads.scoring import score_bucket
@ -92,6 +93,26 @@ async def trigger_combiner(req: CombinerTriggerRequest):
}
@router.post("/trigger-landlord-overrides", status_code=202)
async def trigger_landlord_overrides(req: LandlordOverridesTriggerRequest):
settings = get_settings()
try:
sqs = boto3.client("sqs", settings.AWS_DEFAULT_REGION)
response = sqs.send_message(
QueueUrl=settings.LANDLORD_OVERRIDES_SQS_URL,
MessageBody=req.model_dump_json(),
)
except Exception as e:
raise HTTPException(status_code=500, detail=f"SQS error: {e}")
return {
"task_id": req.task_id,
"sub_task_id": req.sub_task_id,
"sqs_message_id": response.get("MessageId"),
}
@router.get("/{task_id}/combined-results", response_model=CombinedResultsResponse)
async def get_combined_results(
task_id: UUID,

View file

@ -14,6 +14,15 @@ class CombinerTriggerRequest(BaseModel):
sub_task_id: str
class LandlordOverridesTriggerRequest(BaseModel):
task_id: str
sub_task_id: str
s3_uri: str
portfolio_id: int
# category -> source CSV header (the classifier subset of the upload mapping)
column_mapping: dict[str, str]
class FlagsSummary(BaseModel):
duplicates: int
missing: int

View file

@ -42,6 +42,7 @@ class Settings(BaseSettings):
MAGICPLAN_SQS_URL: str = "changeme"
POSTCODE_SPLITTER_SQS_URL: str = "changeme"
COMBINER_SQS_URL: str = "changeme"
LANDLORD_OVERRIDES_SQS_URL: str = "changeme"
# Third parties
EPC_AUTH_TOKEN: str = "changeme"

View file

@ -40,6 +40,9 @@ COPY --from=build-image /usr/local/lib/python3.11/site-packages/ /usr/local/lib/
COPY ./backend/ ./backend
COPY ./recommendations/ ./recommendations
COPY ./utils/ ./utils/
# engine.py -> backend.apis.GoogleSolarApi -> infrastructure.solar; without this
# the lambda fails at init with "No module named 'infrastructure'".
COPY ./infrastructure/ ./infrastructure/
COPY ./etl/epc/ ./etl/epc/
COPY ./etl/epc_clean/ ./etl/epc_clean/
COPY ./etl/bill_savings/ ./etl/bill_savings/

View file

@ -13,6 +13,12 @@ RUN pip install --no-cache-dir -r requirements.txt
COPY utils/ utils/
COPY backend/ backend/
COPY datatypes/ datatypes/
# handler -> ecmk_service -> documents_parser.parser -> datatypes.epc.domain.mapper
# -> domain.sap10_calculator, and -> db_writer -> epc_property model
# -> infrastructure.postgres. Without these the lambda fails at init with
# "No module named 'domain'" / "'infrastructure'".
COPY domain/ domain/
COPY infrastructure/ infrastructure/
# Local lambda entrypoint
# ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"]

View file

@ -10,6 +10,12 @@ WORKDIR /var/task
COPY utils/ utils/
COPY backend/ backend/
COPY datatypes/ datatypes/
# handler -> pashub_service -> documents_parser.parser -> datatypes.epc.domain.mapper
# -> domain.sap10_calculator, and -> db_writer -> epc_property model
# -> infrastructure.postgres. Without these the lambda fails at init with
# "No module named 'domain'" / "'infrastructure'".
COPY domain/ domain/
COPY infrastructure/ infrastructure/
COPY backend/pashub_fetcher/handler/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

View file

@ -14,55 +14,11 @@ class TestSearchEpcIntegration:
def epc_auth_token(self):
return os.getenv("EPC_AUTH_TOKEN")
@pytest.mark.parametrize(
"address, postcode, uprn, skip_os, lmk_key, n_old_epcs",
[
# Test case 1: Valid address and postcode, skipping OS
# In this case, the property is an individual flat but the uprn associated to the
# EPC is for the building as a whole, possibly because there was a conversion of sorts
("Garden Flat, 48 Bedminster Parade", "BS3 4HS", 308249, True,
"260907a5431fa073d193cc6bbec51fbf1ba9a61845ab2503f85aa19ce3ed6afd", 1),
# Test case 2: Another valid address and postcode
# In this case, the newest EPC, does not have a uprn associated to it. If we did a search by
# uprn, we would get an old EPC
("Flat 8, Hainton House", "DN32 9AQ", "", True,
"bd1149a20a73397184f07a9955f872424826e70f4870c058d71be887766ee1f8", 2),
# Test case 3: When we make a request to the API for this property, we get back results for
# flats 1, 2 and 3. We have some logic to handle the response so that we get back flat 1
("Flat 1, 1 Tottenham Street, London", "W1T 2AE", 5167411, True,
"3e6414d7f15f4cf7a69dc20c469bcf043d31a49239b183f1bd0c0e1aafa23c93", 0),
],
)
def test_find_property(self, epc_auth_token, address, postcode, uprn, skip_os, lmk_key, n_old_epcs):
"""
Integration test for `find_property`, making actual API calls.
"""
# Provide your actual API keys or tokens here
os_api_key = ""
# Initialize the SearchEpc instance
epc_searcher = SearchEpc(
address1=address,
postcode=postcode,
uprn=uprn,
auth_token=epc_auth_token,
os_api_key=os_api_key,
)
# Execute the method
epc_searcher.find_property(skip_os=skip_os)
# We check that we have the correct epc
assert epc_searcher.newest_epc["lmk-key"] == lmk_key
assert len(epc_searcher.older_epcs) == n_old_epcs
def test_search_housenumber(self):
eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter'
eg1 = "Flat A11, Mortimer House, Grendon Road, Exeter"
res1 = SearchEpc.get_house_number(eg1, None)
assert res1 == "A11"
eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL'
eg2 = "Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL"
res2 = SearchEpc.get_house_number(eg2, None)
assert res2 == "A9"

View file

@ -98,7 +98,9 @@ class MainHeatingDetail:
boiler_flue_type: Optional[int] = None # TODO: make enum?
boiler_ignition_type: Optional[int] = None # TODO: make enum?
central_heating_pump_age: Optional[int] = None
central_heating_pump_age_str: Optional[str] = None # str from site notes e.g. "Unknown", "Pre 2013"
central_heating_pump_age_str: Optional[str] = (
None # str from site notes e.g. "Unknown", "Pre 2013"
)
main_heating_index_number: Optional[int] = None
sap_main_heating_code: Optional[int] = None # TODO: make enum?
main_heating_number: Optional[int] = None
@ -132,7 +134,7 @@ class ShowerOutlets:
@dataclass
class SapHeating:
instantaneous_wwhrs: InstantaneousWwhrs
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
main_heating_details: List[MainHeatingDetail]
has_fixed_air_conditioning: bool
cylinder_size: Optional[Union[int, str]] = (
@ -145,7 +147,9 @@ class SapHeating:
cylinder_insulation_type: Optional[Union[int, str]] = None
cylinder_thermostat: Optional[str] = None
secondary_fuel_type: Optional[int] = None
secondary_heating_type: Optional[Union[int, str]] = None # int from API; str from site notes
secondary_heating_type: Optional[Union[int, str]] = (
None # int from API; str from site notes
)
cylinder_insulation_thickness_mm: Optional[int] = None
# SAP10 hot-water demand inputs from sap_heating.
number_baths: Optional[int] = None
@ -168,7 +172,9 @@ class SapHeating:
class SapVentilation:
ventilation_type: Optional[str] = None
draught_lobby: Optional[bool] = None
pressure_test: Optional[str] = None # str from site notes e.g. "No test"; int in API via mechanical_ventilation
pressure_test: Optional[str] = (
None # str from site notes e.g. "No test"; int in API via mechanical_ventilation
)
open_flues_count: Optional[int] = None
closed_flues_count: Optional[int] = None
boiler_flues_count: Optional[int] = None
@ -476,8 +482,12 @@ class SapBuildingPart:
None # TODO: make enum/mapping?
)
floor_type: Optional[str] = None # str from site notes e.g. "Ground Floor"
floor_construction_type: Optional[str] = None # str from site notes; distinct from floor_construction: int in SapFloorDimension
floor_insulation_type_str: Optional[str] = None # str from site notes e.g. "As Built"
floor_construction_type: Optional[str] = (
None # str from site notes; distinct from floor_construction: int in SapFloorDimension
)
floor_insulation_type_str: Optional[str] = (
None # str from site notes e.g. "As Built"
)
floor_u_value_known: Optional[bool] = None
roof_construction: Optional[int] = None

View file

@ -37,7 +37,7 @@ class SapHeating:
cylinder_size: int
water_heating_code: int
water_heating_fuel: int
instantaneous_wwhrs: InstantaneousWwhrs
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
main_heating_details: List[MainHeatingDetail]
immersion_heating_type: Union[int, str]
cylinder_insulation_type: int

View file

@ -41,7 +41,7 @@ class SapHeating:
cylinder_size: int
water_heating_code: int
water_heating_fuel: int
instantaneous_wwhrs: InstantaneousWwhrs
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
main_heating_details: List[MainHeatingDetail]
immersion_heating_type: Union[int, str]
cylinder_insulation_type: int

View file

@ -41,7 +41,7 @@ class SapHeating:
cylinder_size: int
water_heating_code: int
water_heating_fuel: int
instantaneous_wwhrs: InstantaneousWwhrs
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
main_heating_details: List[MainHeatingDetail]
immersion_heating_type: Union[int, str]
has_fixed_air_conditioning: str
@ -86,6 +86,7 @@ class SapFloorDimension:
@dataclass
class SapRoomInRoof:
"""Room-in-roof details. floor_area is a Measurement object in schema 18.0."""
floor_area: Measurement
insulation: str
roof_room_connected: str

View file

@ -41,7 +41,7 @@ class SapHeating:
cylinder_size: int
water_heating_code: int
water_heating_fuel: int
instantaneous_wwhrs: InstantaneousWwhrs
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
main_heating_details: List[MainHeatingDetail]
immersion_heating_type: Union[int, str]
has_fixed_air_conditioning: str

View file

@ -49,7 +49,7 @@ class SapHeating:
cylinder_size: int
water_heating_code: int
water_heating_fuel: int
instantaneous_wwhrs: InstantaneousWwhrs
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
main_heating_details: List[MainHeatingDetail]
immersion_heating_type: Union[int, str]
has_fixed_air_conditioning: str
@ -103,6 +103,7 @@ class SapFloorDimension:
@dataclass
class SapRoomInRoof:
"""Room-in-roof details. floor_area is a plain number in schema 20.0.0 (not a Measurement object)."""
floor_area: Union[int, float]
insulation: str
roof_room_connected: str

View file

@ -0,0 +1,50 @@
data "terraform_remote_state" "shared" {
backend = "s3"
config = {
bucket = "assessment-model-terraform-state"
key = "env:/${var.stage}/terraform.tfstate"
region = "eu-west-2"
}
}
data "aws_secretsmanager_secret_version" "db_credentials" {
secret_id = "${var.stage}/assessment_model/db_credentials"
}
locals {
db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)
}
module "lambda" {
source = "../../modules/lambda_with_sqs"
name = "landlord-description-overrides"
stage = var.stage
image_uri = local.image_uri
# The classifier calls OpenAI once per distinct description per column, so it
# is latency-bound. 300s leaves headroom under the queue's 1000s visibility
# timeout. batch_size = 1 keeps one upload per invocation, so a single bad
# record cannot redrive its siblings. maximum_concurrency caps fan-out to
# respect OpenAI rate limits.
timeout = 300
batch_size = 1
maximum_concurrency = 5
environment = merge(
{
STAGE = var.stage
LOG_LEVEL = "info"
POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username
POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password
OPENAI_API_KEY = var.openai_api_key
},
)
}
# Attach S3 read policy so the handler can read the original upload CSV.
resource "aws_iam_role_policy_attachment" "landlord_overrides_s3_read" {
role = module.lambda.role_name
policy_arn = data.terraform_remote_state.shared.outputs.landlord_overrides_s3_read_arn
}

View file

@ -0,0 +1,9 @@
output "landlord_description_overrides_queue_url" {
value = module.lambda.queue_url
description = "URL of the Landlord Description Overrides SQS queue (wire into the FastAPI LANDLORD_OVERRIDES_SQS_URL)"
}
output "landlord_description_overrides_queue_arn" {
value = module.lambda.queue_arn
description = "ARN of the Landlord Description Overrides SQS queue"
}

View file

@ -0,0 +1,16 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 5.0"
}
}
backend "s3" {
bucket = "landlord-description-overrides-terraform-state"
key = "terraform.tfstate"
region = "eu-west-2"
}
required_version = ">= 1.2.0"
}

View file

@ -0,0 +1,33 @@
variable "lambda_name" {
type = string
description = "Logical name of the lambda (e.g. landlordDescriptionOverrides)"
}
variable "stage" {
description = "Deployment stage (e.g. dev, prod)"
type = string
}
variable "ecr_repo_url" {
type = string
description = "ECR repository URL (no tag, no digest)"
}
variable "image_digest" {
type = string
description = "Image digest (sha256:...)"
}
variable "openai_api_key" {
type = string
description = "OpenAI API key used by the ChatGPT column classifier"
sensitive = true
}
locals {
image_uri = "${var.ecr_repo_url}@${var.image_digest}"
}
output "resolved_image_uri" {
value = local.image_uri
}

View file

@ -268,11 +268,11 @@ output "retrofit_heat_baseline_predictions_bucket_name" {
// We make this bucket presignable, because we want to generate download links for the frontend
module "retrofit_energy_assessments" {
source = "../modules/s3_presignable_bucket"
bucketname = "retrofit-energy-assessments-${var.stage}"
allowed_origins = var.allowed_origins
environment = var.stage
enable_versioning = true
source = "../modules/s3_presignable_bucket"
bucketname = "retrofit-energy-assessments-${var.stage}"
allowed_origins = var.allowed_origins
environment = var.stage
enable_versioning = true
}
output "retrofit_energy_assessments_bucket_name" {
@ -494,6 +494,42 @@ output "postcode_splitter_s3_read_arn" {
value = module.postcode_splitter_s3_read.policy_arn
}
################################################
# Landlord Description Overrides Lambda
################################################
module "landlord_description_overrides_state_bucket" {
source = "../modules/tf_state_bucket"
bucket_name = "landlord-description-overrides-terraform-state"
}
module "landlord_description_overrides_registry" {
source = "../modules/container_registry"
name = "landlord_description_overrides"
stage = var.stage
}
# S3 policy for the landlord classifier to read the original upload CSV.
module "landlord_overrides_s3_read" {
source = "../modules/s3_iam_policy"
policy_name = "LandlordOverridesReadS3"
# NOTE: aws_iam_policy.description is ForceNew changing it destroys+recreates the
# policy, which deadlocks because the policy is attached to the lambda role in the
# separate landlordDescriptionOverrides stack (DeleteConflict). Keep this string
# byte-for-byte identical to what's in state so the bucket change applies in-place.
policy_description = "Allow landlord description overrides Lambda to read from retrofit-data bucket"
bucket_arns = [
"arn:aws:s3:::retrofit-plan-inputs-${var.stage}",
"arn:aws:s3:::retrofit-data-${var.stage}",
]
actions = ["s3:GetObject", "s3:ListBucket"]
resource_paths = ["/*"]
}
output "landlord_overrides_s3_read_arn" {
value = module.landlord_overrides_s3_read.policy_arn
}
################################################
# Bulk Address2UPRN Combiner Lambda ECR
################################################
@ -729,7 +765,7 @@ module "hubspot_etl_bucket" {
module "hubspot_etl_registry" {
source = "../modules/container_registry"
name = "hubspot-etl"
stage = var.stage
stage = var.stage
}

View file

@ -0,0 +1,79 @@
# ADR-0003: Python writes landlord overrides directly to Postgres
**Status:** Accepted
**Date:** 2026-05-26
**Supersedes (in part):** [assessment-model/docs/adr/0002-landlord-override-vocabulary.md](https://github.com/.../assessment-model/blob/main/docs/adr/0002-landlord-override-vocabulary.md) — specifically the clause beginning *"Writes happen from Next.js …"*.
## Context
ADR-0002 (in the `assessment-model` TS repo) defined the `landlord_property_type_overrides` and `landlord_wall_type_overrides` tables and noted that the Model service would POST classification results to a Next.js route handler, with Next.js performing the upsert. Drizzle remained the schema source of truth.
That extra hop has not been built and is now judged unnecessary for the present scope:
- The classification result is internal — a Lambda computes it, the same Lambda persists it. No third party needs to participate in the write.
- Drizzle remains the schema's source of truth either way: the Python adapter mirrors the schema in a SQLModel row, but the migrations stay with Drizzle. Adding a Next.js route would not change which side owns schema definition.
- The Python lambda already lives next to a Postgres connection in the existing pipeline (`subtask`/`tasks` tables are written from Python today). Adding two more tables to that adapter surface is a small, well-understood change. Routing the same writes through Next.js would mean: lambda → JSON-over-HTTP → Next.js route → Drizzle → Postgres, instead of lambda → SQLAlchemy → Postgres. Three extra moving parts to ship, deploy, monitor, and authenticate for no behavioural gain.
## Decision
The Model service (specifically `applications/landlord_description_overrides/handler.py`) writes directly to `landlord_property_type_overrides` and `landlord_wall_type_overrides` via a SQLAlchemy-backed `LandlordOverrideRepository[E]` adapter. No Next.js route handler is required.
Transaction boundaries live in `infrastructure/postgres/engine.transactional_session` — a context manager that commits on clean exit and rolls back on exception. The application layer (`handler.py`) never calls `.commit()` or `.rollback()` itself; it only opens the context. Orchestration and repository code likewise never commits — keeping transaction semantics confined to one infrastructure helper.
The conflict policy lives in SQL and is identical for every override category. A single generic adapter, `LandlordOverridesRepository[E]`, implements it once; the target table is selected by the SQLModel `…Row` class passed at construction. Each category (property / built-form / wall / roof type) is that same adapter parameterised by its row class:
```sql
INSERT INTO landlord_property_type_overrides (portfolio_id, description, value, source)
VALUES …
ON CONFLICT (portfolio_id, description)
DO UPDATE SET value = EXCLUDED.value,
source = EXCLUDED.source,
updated_at = now()
WHERE landlord_property_type_overrides.source = 'classifier';
```
The `WHERE existing.source = 'classifier'` guard is load-bearing: it lets the classifier refresh its own past output while leaving `source = 'user'` rows untouched. This is the contract ADR-0002's `source` column was added for.
`UNKNOWN` values are persisted, not skipped — consistent with ADR-0002 §5. A future user override can upgrade them.
## Consequences
**Positive.**
- One fewer service to deploy, monitor, and authenticate.
- The classifier and persistence live in the same process — failures surface against a single `sub_task` row, not split across two systems.
- The Postgres adapter mirrors the existing `subtask`/`tasks` repositories, so reviewers have a precedent to compare against.
**Negative.**
- The Python repo now holds two schemas — the schema-source-of-truth Drizzle definition lives in the TS repo, and the Python `SQLModel` row class shadows it. They must stay in lockstep. Mitigations: the TS schema header comment (`landlord_overrides.ts:12`) already names the Python source-of-truth file; a future ADR may add a CI check that diffs the two.
- The boundary that ADR-0002 anticipated for pgEnum validation (a Next.js route validating incoming values before insert) is gone. Pydantic + the Python `Enum` type catch invalid values on the producing side, and Postgres's pgEnum will reject anything that slips through.
## File layout
This ADR also fixes a placement convention for Postgres adapters going forward. The codebase currently has the ChatGPT classifier split cleanly along DDD lines — port in `domain/`, adapter in `infrastructure/chatgpt/` — but the `tasks` Postgres adapter does not follow the same shape: its concrete class lives in `repositories/tasks/`, not `infrastructure/postgres/`.
The convention going forward separates the persistence *behaviour* (grouped by aggregate) from the schema *mirrors* (grouped by technology, since they share pgEnums and engine metadata):
- **Port (protocol / abstract base):** `repositories/<aggregate>/<thing>_repository.py`
- **Postgres repository adapter (concrete):** `infrastructure/<aggregate>/<aggregate>_postgres_repository.py`
- **SQLModel row class (`table=True` schema mirror):** `infrastructure/postgres/<thing>_table.py`
The `LandlordOverridesRepository` adapter follows this convention: the concrete class — the aggregate's "talker" to Postgres — lives at `infrastructure/landlord_overrides/landlord_overrides_postgres_repository.py`, while the per-category `…Row` classes stay in `infrastructure/postgres/`. The `…Row` classes are one-per-table — each mirrors a genuinely distinct Drizzle table and `value` pgEnum, and they share the single `override_source` pgEnum instance, so they belong together in the Postgres technology bucket as schema mirrors, not duplicated logic.
(This refines the placement first sketched in this ADR, which put the adapter in `infrastructure/postgres/` alongside the row classes. The adapter holds no schema — only the write path — so it groups by aggregate; only the `table=True` mirrors stay tech-bucketed.)
**Existing outliers to relocate in a follow-up:**
- `repositories/tasks/task_postgres_repository.py``infrastructure/tasks/task_postgres_repository.py`
- `repositories/tasks/subtask_postgres_repository.py``infrastructure/tasks/subtask_postgres_repository.py`
(Their `task_table.py` / `subtask_table.py` schema mirrors already sit correctly in `infrastructure/postgres/`.) Both moves are mechanical (import-path updates only). They are intentionally out of scope for the present PR.
## Out of scope (deferred to follow-up work)
- Relocating `task_postgres_repository.py` and `subtask_postgres_repository.py` into `infrastructure/tasks/` per the convention above.
- ~~Extracting a shared upsert helper / base class once a third `landlord_*_overrides` column lands — until then the per-category adapters' 95%-identical bodies are kept side-by-side for direct comparison.~~ **Done.** The per-category adapter bodies were byte-identical (varying only in their row class), so they were consolidated into one generic `LandlordOverridesRepository[E]` parameterised by row class rather than waiting for a third column.
- Switching `applications/landlord_description_overrides/handler.py` to acquire its `Session` via a `@subtask_handler()`-style decorator instead of building its own engine.
- A cross-repo PR amending ADR-0002 to point at this ADR.
- A CI check (or codegen) that diffs the Drizzle pgEnum literals against the Python `Enum.value` strings.

View file

@ -2,21 +2,21 @@ from __future__ import annotations
from collections.abc import Iterable, Iterator
from domain.addresses.user_address import UserAddress
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
from domain.postcode import Postcode
def iter_postcode_grouped_batches(
addresses: Iterable[UserAddress],
addresses: Iterable[UnstandardisedAddress],
*,
max_batch_size: int = 500,
) -> Iterator[list[UserAddress]]:
) -> Iterator[AddressList]:
if max_batch_size < 1:
raise ValueError("max_batch_size must be >= 1")
groups = _group_by_postcode_in_order(addresses)
buffer: list[UserAddress] = []
buffer: AddressList = AddressList([])
for group in groups.values():
group_len = len(group)
@ -26,14 +26,14 @@ def iter_postcode_grouped_batches(
if group_len >= max_batch_size:
if buffer:
yield buffer
buffer = []
buffer = AddressList([])
yield group
continue
# Adding this group would overflow: flush buffer before appending.
if len(buffer) + group_len > max_batch_size:
yield buffer
buffer = []
buffer = AddressList([])
buffer.extend(group)
@ -43,9 +43,9 @@ def iter_postcode_grouped_batches(
def _group_by_postcode_in_order(
addresses: Iterable[UserAddress],
) -> dict[Postcode, list[UserAddress]]:
groups: dict[Postcode, list[UserAddress]] = {}
addresses: Iterable[UnstandardisedAddress],
) -> dict[Postcode, AddressList]:
groups: dict[Postcode, AddressList] = {}
for address in addresses:
groups.setdefault(address.postcode, []).append(address)
groups.setdefault(address.postcode, AddressList([])).append(address)
return groups

View file

@ -0,0 +1,21 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import NewType, Optional
from domain.postcode import Postcode
def _empty_source_row() -> dict[str, str]:
return {}
@dataclass(frozen=True)
class StandardisedAddress:
address: str
postcode: Postcode
org_reference: Optional[str] = None
# Standardised Asset List -- the cleaned output counterpart to AddressList.
SAL = NewType("SAL", list[StandardisedAddress])

View file

@ -0,0 +1,24 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional, NewType
from domain.postcode import Postcode
def _empty_source_row() -> dict[str, str]:
return {}
@dataclass(frozen=True)
class UnstandardisedAddress:
address: str
postcode: Postcode
org_reference: Optional[str] = None
additional_info: dict[str, str] = field(
default_factory=_empty_source_row, compare=False
)
# A batch of raw, pre-standardisation addresses as supplied by a landlord.
AddressList = NewType("AddressList", list[UnstandardisedAddress])

View file

@ -1,18 +0,0 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional
from domain.postcode import Postcode
def _empty_source_row() -> dict[str, str]:
return {}
@dataclass(frozen=True)
class UserAddress:
user_address: str
postcode: Postcode
internal_reference: Optional[str] = None
source_row: dict[str, str] = field(default_factory=_empty_source_row, compare=False)

View file

@ -0,0 +1,39 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from enum import Enum
from typing import Generic, TypeVar
E = TypeVar("E", bound=Enum)
class ClassificationError(Exception):
"""Raised when classifying a column's descriptions fails wholesale.
A whole-batch failure (the AI backend is unreachable, or returns a reply
that cannot be parsed) raises this. A single description that merely
cannot be resolved is not an error -- it maps to the enum's UNKNOWN member.
"""
class ColumnClassifier(ABC, Generic[E]):
"""Port: resolves free-text descriptions into a category enum ``E``.
One classifier handles one landlord-CSV column. Implementations decide
*how* the mapping is performed (an LLM, a lookup table, a rules engine);
``LandlordDescriptionOverridesOrchestrator`` depends only on this interface.
"""
@abstractmethod
def classify(self, descriptions: set[str]) -> dict[str, E]:
"""Classify each description into a category enum member.
Every input description appears as a key in the result. A description
that cannot be resolved maps to the enum's UNKNOWN member.
Raises:
ClassificationError: If the classification call fails wholesale
(e.g. the backend is unreachable or returns an unparseable
response).
"""
...

View file

@ -0,0 +1,20 @@
from enum import Enum
class BuiltFormType(Enum):
"""A landlord-supplied built form, as resolved by the landlord-description-overrides context.
Mirrors the EPC built-form values. ``NOT_RECORDED`` is the legitimate
EPC value for properties whose built form the surveyor did not capture;
``UNKNOWN`` is the classifier fallback for landlord values that cannot be
resolved at all.
"""
DETACHED = "Detached"
SEMI_DETACHED = "Semi-Detached"
MID_TERRACE = "Mid-Terrace"
END_TERRACE = "End-Terrace"
ENCLOSED_MID_TERRACE = "Enclosed Mid-Terrace"
ENCLOSED_END_TERRACE = "Enclosed End-Terrace"
NOT_RECORDED = "Not Recorded"
UNKNOWN = "Unknown"

View file

@ -0,0 +1,16 @@
from enum import Enum
class PropertyType(Enum):
"""A landlord-supplied property type, as resolved by the landlord-description-overrides context.
Distinct from the EPC context's ``PropertyType``: a landlord CSV value
may be unresolvable, so this enum carries an explicit ``UNKNOWN`` member.
"""
HOUSE = "House"
BUNGALOW = "Bungalow"
FLAT = "Flat"
MAISONETTE = "Maisonette"
PARK_HOME = "Park home"
UNKNOWN = "Unknown"

70
domain/epc/roof_type.py Normal file
View file

@ -0,0 +1,70 @@
from enum import Enum
class RoofType(Enum):
"""A landlord-supplied roof description, as resolved by the landlord-description-overrides context.
Each member is one full EPC roof-description string, combining shape
(flat, pitched, roof room(s), thatched) with insulation state and, for
pitched roofs, the loft-insulation depth in millimetres. Adjacency
markers like ``(another dwelling above)`` represent a unit whose top
boundary is another dwelling rather than a roof of its own; they are
kept as members because they appear in the same EPC column.
``UNKNOWN`` covers values the classifier cannot resolve -- most
commonly raw ``Average thermal transmittance`` U-value strings that
carry no shape/insulation information.
"""
FLAT_INSULATED = "Flat, insulated"
FLAT_INSULATED_ASSUMED = "Flat, insulated (assumed)"
FLAT_LIMITED_INSULATION = "Flat, limited insulation"
FLAT_LIMITED_INSULATION_ASSUMED = "Flat, limited insulation (assumed)"
FLAT_NO_INSULATION = "Flat, no insulation"
FLAT_NO_INSULATION_ASSUMED = "Flat, no insulation (assumed)"
PITCHED_INSULATED = "Pitched, insulated"
PITCHED_INSULATED_ASSUMED = "Pitched, insulated (assumed)"
PITCHED_INSULATED_AT_RAFTERS = "Pitched, insulated at rafters"
PITCHED_LIMITED_INSULATION = "Pitched, limited insulation"
PITCHED_LIMITED_INSULATION_ASSUMED = "Pitched, limited insulation (assumed)"
PITCHED_NO_INSULATION = "Pitched, no insulation"
PITCHED_NO_INSULATION_ASSUMED = "Pitched, no insulation (assumed)"
PITCHED_UNKNOWN_LOFT_INSULATION = "Pitched, Unknown loft insulation"
PITCHED_LOFT_0MM = "Pitched, 0 mm loft insulation"
PITCHED_LOFT_12MM = "Pitched, 12 mm loft insulation"
PITCHED_LOFT_25MM = "Pitched, 25 mm loft insulation"
PITCHED_LOFT_50MM = "Pitched, 50 mm loft insulation"
PITCHED_LOFT_75MM = "Pitched, 75 mm loft insulation"
PITCHED_LOFT_100MM = "Pitched, 100 mm loft insulation"
PITCHED_LOFT_125MM = "Pitched, 125 mm loft insulation"
PITCHED_LOFT_150MM = "Pitched, 150 mm loft insulation"
PITCHED_LOFT_175MM = "Pitched, 175 mm loft insulation"
PITCHED_LOFT_200MM = "Pitched, 200 mm loft insulation"
PITCHED_LOFT_225MM = "Pitched, 225 mm loft insulation"
PITCHED_LOFT_250MM = "Pitched, 250 mm loft insulation"
PITCHED_LOFT_270MM = "Pitched, 270 mm loft insulation"
PITCHED_LOFT_300MM = "Pitched, 300 mm loft insulation"
PITCHED_LOFT_350MM = "Pitched, 350 mm loft insulation"
PITCHED_LOFT_400MM = "Pitched, 400 mm loft insulation"
PITCHED_LOFT_400_PLUS_MM = "Pitched, 400+ mm loft insulation"
ROOF_ROOM_INSULATED = "Roof room(s), insulated"
ROOF_ROOM_INSULATED_ASSUMED = "Roof room(s), insulated (assumed)"
ROOF_ROOM_LIMITED_INSULATION = "Roof room(s), limited insulation"
ROOF_ROOM_LIMITED_INSULATION_ASSUMED = "Roof room(s), limited insulation (assumed)"
ROOF_ROOM_NO_INSULATION = "Roof room(s), no insulation"
ROOF_ROOM_NO_INSULATION_ASSUMED = "Roof room(s), no insulation (assumed)"
ROOF_ROOM_CEILING_INSULATED = "Roof room(s), ceiling insulated"
ROOF_ROOM_THATCHED = "Roof room(s), thatched"
ROOF_ROOM_THATCHED_WITH_ADDITIONAL_INSULATION = "Roof room(s), thatched with additional insulation"
THATCHED = "Thatched"
THATCHED_WITH_ADDITIONAL_INSULATION = "Thatched, with additional insulation"
ADJACENT_ANOTHER_DWELLING_ABOVE = "(another dwelling above)"
ADJACENT_SAME_DWELLING_ABOVE = "(same dwelling above)"
ADJACENT_OTHER_PREMISES_ABOVE = "(other premises above)"
ADJACENT_ANOTHER_PREMISES_ABOVE = "(another premises above)"
ANOTHER_PREMISES_ABOVE = "Another Premises Above"
UNKNOWN = "Unknown"

117
domain/epc/wall_type.py Normal file
View file

@ -0,0 +1,117 @@
from enum import Enum
class WallType(Enum):
"""A landlord-supplied wall description, as resolved by the landlord-description-overrides context.
Each member is one full EPC wall-description string, combining material
(cavity, solid brick, sandstone, ) with construction/insulation state
(as built, filled cavity, with internal insulation, ). ``UNKNOWN`` covers
values the classifier cannot resolve most commonly raw
``Average thermal transmittance`` U-value strings that carry no material
information.
"""
CAVITY_FILLED = "Cavity wall, filled cavity"
CAVITY_AS_BUILT_INSULATED_ASSUMED = (
"Cavity wall, as built, insulated (assumed)" # 1983 - 1990
)
CAVITY_AS_BUILT_NO_INSULATION_ASSUMED = (
"Cavity wall, as built, no insulation (assumed)" # Pre-1975
)
CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"Cavity wall, as built, partial insulation (assumed)" # 1976 - 1982
)
CAVITY_WITH_INTERNAL_INSULATION = "Cavity wall, with internal insulation"
CAVITY_WITH_EXTERNAL_INSULATION = "Cavity wall, with external insulation"
CAVITY_FILLED_AND_INTERNAL_INSULATION = (
"Cavity wall, filled cavity and internal insulation"
)
CAVITY_FILLED_AND_EXTERNAL_INSULATION = (
"Cavity wall, filled cavity and external insulation"
)
SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED = (
"Solid brick, as built, no insulation (assumed)"
)
SOLID_BRICK_AS_BUILT_INSULATED_ASSUMED = (
"Solid brick, as built, insulated (assumed)"
)
SOLID_BRICK_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"Solid brick, as built, partial insulation (assumed)"
)
SOLID_BRICK_WITH_INTERNAL_INSULATION = "Solid brick, with internal insulation"
SOLID_BRICK_WITH_EXTERNAL_INSULATION = "Solid brick, with external insulation"
TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED = (
"Timber frame, as built, no insulation (assumed)"
)
TIMBER_FRAME_AS_BUILT_INSULATED_ASSUMED = (
"Timber frame, as built, insulated (assumed)"
)
TIMBER_FRAME_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"Timber frame, as built, partial insulation (assumed)"
)
TIMBER_FRAME_WITH_ADDITIONAL_INSULATION = "Timber frame, with additional insulation"
SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED = (
"Sandstone, as built, no insulation (assumed)"
)
SANDSTONE_AS_BUILT_INSULATED_ASSUMED = "Sandstone, as built, insulated (assumed)"
SANDSTONE_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"Sandstone, as built, partial insulation (assumed)"
)
SANDSTONE_WITH_INTERNAL_INSULATION = "Sandstone, with internal insulation"
SANDSTONE_WITH_EXTERNAL_INSULATION = "Sandstone, with external insulation"
GRANITE_OR_WHIN_AS_BUILT_NO_INSULATION_ASSUMED = (
"Granite or whin, as built, no insulation (assumed)"
)
GRANITE_OR_WHIN_AS_BUILT_INSULATED_ASSUMED = (
"Granite or whin, as built, insulated (assumed)"
)
GRANITE_OR_WHIN_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"Granite or whin, as built, partial insulation (assumed)"
)
GRANITE_OR_WHIN_WITH_INTERNAL_INSULATION = (
"Granite or whin, with internal insulation"
)
GRANITE_OR_WHIN_WITH_EXTERNAL_INSULATION = (
"Granite or whin, with external insulation"
)
SYSTEM_BUILT_AS_BUILT_NO_INSULATION_ASSUMED = (
"System built, as built, no insulation (assumed)"
)
SYSTEM_BUILT_AS_BUILT_INSULATED_ASSUMED = (
"System built, as built, insulated (assumed)"
)
SYSTEM_BUILT_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"System built, as built, partial insulation (assumed)"
)
SYSTEM_BUILT_WITH_INTERNAL_INSULATION = "System built, with internal insulation"
SYSTEM_BUILT_WITH_EXTERNAL_INSULATION = "System built, with external insulation"
PARK_HOME_AS_BUILT = "Park home wall, as built"
PARK_HOME_WITH_INTERNAL_INSULATION = "Park home wall, with internal insulation"
PARK_HOME_WITH_EXTERNAL_INSULATION = "Park home wall, with external insulation"
COB_AS_BUILT = "Cob, as built"
COB_WITH_INTERNAL_INSULATION = "Cob, with internal insulation"
COB_WITH_EXTERNAL_INSULATION = "Cob, with external insulation"
CURTAIN_WALL = "Curtain wall"
CURTAIN_WALL_AS_BUILT_NO_INSULATION_ASSUMED = (
"Curtain Wall, as built, no insulation (assumed)"
)
CURTAIN_WALL_AS_BUILT_INSULATED_ASSUMED = (
"Curtain Wall, as built, insulated (assumed)"
)
CURTAIN_WALL_FILLED = "Curtain Wall, filled cavity"
CURTAIN_WALL_WITH_INTERNAL_INSULATION = "Curtain Wall, with internal insulation"
BASEMENT_WALL = "Basement wall"
BASEMENT_WALL_AS_BUILT = "Basement wall, as built"
UNKNOWN = "Unknown"

View file

@ -0,0 +1,72 @@
"""Construction-date metadata for the "assumed" ``WallType`` variants.
The ``(assumed)`` variants of ``WallType`` are what RdSAP picks when a
surveyor has no direct observation and falls back to the typical wall make-up
for a property's build era. The era boundaries reflect UK Building
Regulations milestones for cavity-wall insulation:
* up to 1975 -- no cavity insulation requirement
* 1976-1982 -- partial-fill cavity (early insulation requirement)
* 1983-1990 -- full-fill cavity (insulation required)
Captured here as a structured lookup so:
* the LLM prompt builder can render the ranges as a hint, helping the
classifier resolve era-implying landlord descriptions to the right
``(assumed)`` variant;
* future date-aware paths (a deterministic year-to-variant shortcut, a
date-keyed repo) can read from the same source instead of duplicating
the knowledge.
Only the variants where we have a defensible era boundary appear here; the
remaining ``(assumed)`` members are left out rather than guessed.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Mapping, Optional
from domain.epc.wall_type import WallType
@dataclass(frozen=True)
class YearRange:
"""An inclusive year range. ``None`` on either end means "no bound"."""
start: Optional[int] = None
end: Optional[int] = None
def __str__(self) -> str:
if self.start is None and self.end is not None:
return f"pre-{self.end + 1}"
if self.start is not None and self.end is None:
return f"{self.start}+"
return f"{self.start}-{self.end}"
WALL_TYPE_CONSTRUCTION_YEARS: Mapping[WallType, YearRange] = {
WallType.CAVITY_AS_BUILT_NO_INSULATION_ASSUMED: YearRange(end=1975),
WallType.CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED: YearRange(
start=1976, end=1982
),
WallType.CAVITY_AS_BUILT_INSULATED_ASSUMED: YearRange(start=1983, end=1990),
}
def wall_type_construction_date_prompt_hint() -> str:
"""Render the date metadata as a prompt fragment for the LLM classifier.
The fragment lists each (variant, year range) pair so the model can
prefer the era-matching ``(assumed)`` variant when a landlord
description carries era information (e.g. "1970s semi", "built before
the war").
"""
lines = [
f"- {wall_type.value!r}: typically built {year_range}"
for wall_type, year_range in WALL_TYPE_CONSTRUCTION_YEARS.items()
]
return (
"When the description carries construction-era information, prefer "
"the category whose typical build year matches:\n" + "\n".join(lines)
)

View file

View file

@ -0,0 +1,60 @@
from __future__ import annotations
import os
from typing import Optional
from openai import OpenAI
from openai.types.chat import ChatCompletionMessageParam
from infrastructure.chatgpt.exceptions import ChatGPTClientError
class ChatGPT:
"""Thin wrapper over the OpenAI Chat Completions API.
Sends a single prompt and returns the assistant's reply as plain text.
"""
DEFAULT_MODEL = "gpt-4o-mini"
def __init__(
self,
api_key: Optional[str] = None,
model: Optional[str] = None,
) -> None:
key = api_key or os.environ.get("OPENAI_API_KEY")
if not key:
raise ChatGPTClientError(
"No OpenAI API key provided. "
"Pass api_key or set the OPENAI_API_KEY environment variable."
)
self._client = OpenAI(api_key=key)
self._model = model or self.DEFAULT_MODEL
def generate(
self,
prompt: str,
system_prompt: Optional[str] = None,
) -> str:
"""Send a prompt to the model and return its reply text.
Args:
prompt: The user message to send.
system_prompt: Optional instruction that sets the model's behaviour.
Raises:
OpenAiClientError: If the model returns an empty response.
"""
messages: list[ChatCompletionMessageParam] = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
response = self._client.chat.completions.create(
model=self._model,
messages=messages,
)
content = response.choices[0].message.content
if content is None:
raise ChatGPTClientError("ChatGPT returned an empty response.")
return content

View file

@ -0,0 +1,99 @@
from __future__ import annotations
import json
from enum import Enum
from typing import Any, Optional, TypeVar
from domain.data_transformation.column_classifier import (
ClassificationError,
ColumnClassifier,
)
from infrastructure.chatgpt.chatgpt import ChatGPT
from infrastructure.chatgpt.exceptions import ChatGPTClientError
E = TypeVar("E", bound=Enum)
class ChatGptColumnClassifier(ColumnClassifier[E]):
"""ColumnClassifier backed by ChatGPT, parametrised by a category enum.
The same classification path -- prompt, JSON parsing, UNKNOWN fallback --
serves any category enum; only ``category_enum`` and its ``unknown``
member differ between columns.
"""
def __init__(
self,
chat_gpt: ChatGPT,
category_enum: type[E],
unknown: E,
extra_instructions: Optional[str] = None,
) -> None:
self._chat_gpt = chat_gpt
self._category_enum = category_enum
self._unknown = unknown
# Free-form column-specific guidance appended to the system prompt
# ahead of the JSON-output instruction. Lets each column ship its
# own hints (e.g. wall-type construction-era ranges) without the
# generic classifier knowing what they are.
self._extra_instructions = extra_instructions
def classify(self, descriptions: set[str]) -> dict[str, E]:
if not descriptions:
return {}
try:
reply = self._chat_gpt.generate(
prompt=json.dumps(sorted(descriptions)),
system_prompt=self._system_prompt(),
)
except ChatGPTClientError as error:
raise ClassificationError(
f"ChatGPT classification failed for "
f"{self._category_enum.__name__}."
) from error
try:
raw: dict[str, Any] = json.loads(self._strip_code_fence(reply))
except json.JSONDecodeError as error:
raise ClassificationError(
f"ChatGPT returned a reply that is not valid JSON: {reply!r}"
) from error
return {
description: self._to_category(raw.get(description))
for description in descriptions
}
def _system_prompt(self) -> str:
categories = ", ".join(
member.value
for member in self._category_enum
if member is not self._unknown
)
parts = [
"Classify each free-text description into exactly one category. ",
f"Categories: {categories}. ",
]
if self._extra_instructions:
parts.append(self._extra_instructions + " ")
parts.append(
"Reply with only a JSON object mapping each original description "
"to its category, and nothing else."
)
return "".join(parts)
def _to_category(self, value: Any) -> E:
"""Map a reply value to a category member, defaulting to UNKNOWN."""
try:
return self._category_enum(value)
except ValueError:
return self._unknown
@staticmethod
def _strip_code_fence(reply: str) -> str:
"""Remove a surrounding markdown code fence, if ChatGPT added one."""
text = reply.strip()
if not text.startswith("```"):
return text
lines = text.splitlines()[1:]
if lines and lines[-1].strip() == "```":
lines = lines[:-1]
return "\n".join(lines)

View file

@ -0,0 +1,2 @@
class ChatGPTClientError(Exception):
"""Base for all OpenAI client errors."""

View file

@ -1,33 +0,0 @@
import csv
from io import StringIO
from infrastructure.s3_client import S3Client
from infrastructure.s3_uri import parse_s3_uri
class CsvS3Client(S3Client):
def read_rows(self, s3_uri: str) -> list[dict[str, str]]:
bucket, key = parse_s3_uri(s3_uri)
if bucket != self.bucket:
raise ValueError(
f"s3_uri bucket {bucket!r} does not match client bucket {self.bucket!r}"
)
raw = self.get_object(key)
try:
text = raw.decode("utf-8-sig")
except UnicodeDecodeError:
# Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8.
text = raw.decode("cp1252")
reader = csv.DictReader(StringIO(text))
return [dict(row) for row in reader]
def save_rows(self, rows: list[dict[str, str]], key: str) -> str:
if not rows:
raise ValueError("Cannot save an empty rows list: header is unknown")
buffer = StringIO()
fieldnames = list(rows[0].keys())
writer = csv.DictWriter(buffer, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
return self.put_object(key, buffer.getvalue().encode("utf-8"))

View file

@ -0,0 +1,97 @@
"""One Postgres adapter for every ``landlord_<category>_overrides`` table.
The four override categories (property / built-form / wall / roof type) share
an identical write path: the same ``(portfolio_id, description) -> value`` row
shape and the same source-aware conflict policy (ADR-0003 §Decision fixes one
policy for all of them). The only thing that varies per category is the target
table, which is selected by the SQLModel row class handed to the constructor.
So a single generic adapter serves all four -- callers parameterise it by the
category enum ``E`` and pass the matching ``Row`` class:
LandlordOverridesRepository[PropertyType](session, LandlordPropertyTypeOverrideRow)
This adapter -- the aggregate's "talker" to Postgres -- lives under
``infrastructure/landlord_overrides/``, grouped by aggregate. The ``table=True``
SQLModel row classes it writes through stay in ``infrastructure/postgres/`` as
schema mirrors (they share the ``override_source`` pgEnum and register against
the same engine metadata). See ADR-0003 §File layout.
"""
from __future__ import annotations
from datetime import datetime, timezone
from enum import Enum
from typing import TypeVar, cast
from sqlalchemy import Table
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlmodel import Session, SQLModel
from infrastructure.postgres.landlord_override_enums import OverrideSource
from repositories.landlord_overrides.landlord_override_repository import (
LandlordOverrideRepository,
)
E = TypeVar("E", bound=Enum)
class LandlordOverridesRepository(LandlordOverrideRepository[E]):
"""Writes classifier overrides to the ``landlord_<category>_overrides`` table
backing ``row_type``.
``row_type`` is the SQLModel mirror of the target table (e.g.
``LandlordPropertyTypeOverrideRow``); its category enum must match the type
argument ``E`` the caller binds. The pairing is a convention, not a type
constraint -- the row classes are not generic over their value enum -- so
keep the two in step at the call site.
"""
def __init__(self, session: Session, row_type: type[SQLModel]) -> None:
self._session = session
# SQLModel's class-level ``__table__`` is injected at runtime on
# ``table=True`` classes but isn't exposed by the stubs; pin it to
# ``Table`` via ``getattr`` so the dialect insert helper carries
# through with strict types.
self._table: Table = cast(Table, getattr(row_type, "__table__"))
def upsert_all(
self,
portfolio_id: int,
descriptions_to_values: dict[str, E],
) -> None:
if not descriptions_to_values:
return
now = datetime.now(timezone.utc)
rows = [
{
"portfolio_id": portfolio_id,
"description": description,
"value": value.value,
"source": OverrideSource.CLASSIFIER,
"created_at": now,
"updated_at": now,
}
for description, value in descriptions_to_values.items()
]
stmt = pg_insert(self._table).values(rows)
# The classifier may refresh its own past output, but must never
# overwrite a user correction -- the ``WHERE existing.source =
# 'classifier'`` guard enforces that. See ADR-0003 §Decision.
stmt = stmt.on_conflict_do_update(
index_elements=["portfolio_id", "description"],
set_={
"value": stmt.excluded.value,
"source": stmt.excluded.source,
"updated_at": stmt.excluded.updated_at,
},
where=self._table.c.source == OverrideSource.CLASSIFIER,
)
# SQLModel re-exports SQLAlchemy's ``Session.execute``; one of the
# overload signatures is marked deprecated in stubs, which fires
# here even though our INSERT path is the supported one.
self._session.execute(stmt) # pyright: ignore[reportDeprecated]

View file

View file

@ -0,0 +1,2 @@
class OpenAiClientError(Exception):
"""Base for all OpenAI client errors."""

View file

@ -0,0 +1,60 @@
from __future__ import annotations
import os
from typing import Optional
from openai import OpenAI
from openai.types.chat import ChatCompletionMessageParam
from infrastructure.openai.exceptions import OpenAiClientError
class OpenAiChatClient:
"""Thin wrapper over the OpenAI Chat Completions API.
Sends a single prompt and returns the assistant's reply as plain text.
"""
DEFAULT_MODEL = "gpt-4o-mini"
def __init__(
self,
api_key: Optional[str] = None,
model: Optional[str] = None,
) -> None:
key = api_key or os.environ.get("OPENAI_API_KEY")
if not key:
raise OpenAiClientError(
"No OpenAI API key provided. "
"Pass api_key or set the OPENAI_API_KEY environment variable."
)
self._client = OpenAI(api_key=key)
self._model = model or self.DEFAULT_MODEL
def generate(
self,
prompt: str,
system_prompt: Optional[str] = None,
) -> str:
"""Send a prompt to the model and return its reply text.
Args:
prompt: The user message to send.
system_prompt: Optional instruction that sets the model's behaviour.
Raises:
OpenAiClientError: If the model returns an empty response.
"""
messages: list[ChatCompletionMessageParam] = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
response = self._client.chat.completions.create(
model=self._model,
messages=messages,
)
content = response.choices[0].message.content
if content is None:
raise OpenAiClientError("OpenAI returned an empty response.")
return content

View file

@ -1,3 +1,6 @@
from collections.abc import Iterator
from contextlib import contextmanager
from sqlalchemy.engine import Engine
from sqlmodel import Session, create_engine
@ -16,3 +19,42 @@ def make_engine(config: PostgresConfig) -> Engine:
def make_session(engine: Engine) -> Session:
return Session(engine)
@contextmanager # pyright: ignore[reportDeprecated]
def transactional_session(engine: Engine) -> Iterator[Session]:
"""Yield a session whose lifecycle owns the transaction.
On clean exit the session commits; on any exception it rolls back and
re-raises. Either way the session is closed. Callers in the application
layer can do their work inside the ``with`` block without ever invoking
``.commit()`` / ``.rollback()`` themselves -- transaction semantics stay
in the infrastructure layer.
"""
session = Session(engine)
try:
yield session
session.commit()
except Exception:
session.rollback()
raise
finally:
session.close()
@contextmanager # pyright: ignore[reportDeprecated]
def commit_scope(session: Session) -> Iterator[Session]:
"""Commit a caller-owned session on clean exit; roll back on error.
Like ``transactional_session`` but for a session the caller already holds
and will close itself. Use it to keep slow, non-DB work *outside* the
transaction: build the session, run the slow work, then enter
``commit_scope`` only for the persistence -- so a connection is checked out
(SQLModel sessions are lazy) for the shortest possible window.
"""
try:
yield session
session.commit()
except Exception:
session.rollback()
raise

View file

@ -0,0 +1,69 @@
"""SQLModel mirror of the ``landlord_built_form_type_overrides`` Drizzle table.
The schema source of truth lives in the ``assessment-model`` TS repo
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
this row class only mirrors the columns so the Python lambda can read/write.
See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only
differences are the table name, the ``built_form_type`` pgEnum on ``value``,
and the unique-constraint name.
"""
from datetime import datetime, timezone
from typing import ClassVar
from uuid import UUID, uuid4
from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.built_form_type import BuiltFormType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
class LandlordBuiltFormTypeOverrideRow(SQLModel, table=True):
__tablename__: ClassVar[str] = "landlord_built_form_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
__table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
UniqueConstraint(
"portfolio_id",
"description",
name="landlord_built_form_type_overrides_portfolio_description_unique",
),
)
id: UUID = Field(default_factory=uuid4, primary_key=True)
# bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
# mapping is 32-bit Integer and would overflow once portfolio IDs exceed
# 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
# not declared here -- the ``portfolio`` table is not modelled in Python.
portfolio_id: int = Field(
sa_column=Column(BigInteger, nullable=False, index=True),
)
description: str = Field(nullable=False)
value: BuiltFormType = Field(
sa_column=Column(
SAEnum(
BuiltFormType,
name="built_form_type",
values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
),
nullable=False,
),
)
# Shared SAEnum -- see ``landlord_override_enums`` for why this single
# instance is reused by every ``landlord_*_overrides`` row class.
source: str = Field(
sa_column=Column(override_source_sa_enum, nullable=False),
)
created_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
nullable=False,
)
updated_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
nullable=False,
)

View file

@ -0,0 +1,35 @@
"""Shared pgEnum definitions used by every ``landlord_*_overrides`` row class.
The ``override_source`` pgEnum is referenced by both
``landlord_property_type_overrides`` and ``landlord_wall_type_overrides``
(per the Drizzle schema -- see ``landlord_overrides.ts``). Defining it once
here and reusing the same SQLAlchemy ``Enum`` instance across both row
classes keeps SQLModel's metadata coherent: ``create_all`` emits exactly one
``CREATE TYPE override_source`` statement, not two parallel ones colliding
on the same pgEnum name.
"""
from __future__ import annotations
from sqlalchemy import Enum as SAEnum
class OverrideSource:
"""Mirror of the ``override_source`` pgEnum.
Drizzle defines this as ``('classifier', 'user')`` in
``landlord_overrides.ts``. Modelled here as string constants so callers
don't sprinkle magic strings; the column is constrained by Postgres,
and the only Python-side producer (the classifier path) writes the
literal ``OverrideSource.CLASSIFIER``.
"""
CLASSIFIER = "classifier"
USER = "user"
override_source_sa_enum = SAEnum(
OverrideSource.CLASSIFIER,
OverrideSource.USER,
name="override_source",
)

View file

@ -0,0 +1,67 @@
"""SQLModel mirror of the ``landlord_property_type_overrides`` Drizzle table.
The schema source of truth lives in the ``assessment-model`` TS repo
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
this row class only mirrors the columns so the Python lambda can read/write.
See ADR-0003.
"""
from datetime import datetime, timezone
from typing import ClassVar
from uuid import UUID, uuid4
from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.property_type import PropertyType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
class LandlordPropertyTypeOverrideRow(SQLModel, table=True):
__tablename__: ClassVar[str] = "landlord_property_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
__table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
UniqueConstraint(
"portfolio_id",
"description",
name="landlord_property_type_overrides_portfolio_description_unique",
),
)
id: UUID = Field(default_factory=uuid4, primary_key=True)
# bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
# mapping is 32-bit Integer and would overflow once portfolio IDs exceed
# 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
# not declared here -- the ``portfolio`` table is not modelled in Python.
portfolio_id: int = Field(
sa_column=Column(BigInteger, nullable=False, index=True),
)
description: str = Field(nullable=False)
value: PropertyType = Field(
sa_column=Column(
SAEnum(
PropertyType,
name="property_type",
values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
),
nullable=False,
),
)
# Shared SAEnum -- see ``landlord_override_enums`` for why this single
# instance is reused by every ``landlord_*_overrides`` row class.
source: str = Field(
sa_column=Column(override_source_sa_enum, nullable=False),
)
created_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
nullable=False,
)
updated_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
nullable=False,
)

View file

@ -0,0 +1,69 @@
"""SQLModel mirror of the ``landlord_roof_type_overrides`` Drizzle table.
The schema source of truth lives in the ``assessment-model`` TS repo
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
this row class only mirrors the columns so the Python lambda can read/write.
See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only
differences are the table name, the ``roof_type`` pgEnum on ``value``, and
the unique-constraint name.
"""
from datetime import datetime, timezone
from typing import ClassVar
from uuid import UUID, uuid4
from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.roof_type import RoofType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
class LandlordRoofTypeOverrideRow(SQLModel, table=True):
__tablename__: ClassVar[str] = "landlord_roof_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
__table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
UniqueConstraint(
"portfolio_id",
"description",
name="landlord_roof_type_overrides_portfolio_description_unique",
),
)
id: UUID = Field(default_factory=uuid4, primary_key=True)
# bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
# mapping is 32-bit Integer and would overflow once portfolio IDs exceed
# 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
# not declared here -- the ``portfolio`` table is not modelled in Python.
portfolio_id: int = Field(
sa_column=Column(BigInteger, nullable=False, index=True),
)
description: str = Field(nullable=False)
value: RoofType = Field(
sa_column=Column(
SAEnum(
RoofType,
name="roof_type",
values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
),
nullable=False,
),
)
# Shared SAEnum -- see ``landlord_override_enums`` for why this single
# instance is reused by every ``landlord_*_overrides`` row class.
source: str = Field(
sa_column=Column(override_source_sa_enum, nullable=False),
)
created_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
nullable=False,
)
updated_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
nullable=False,
)

View file

@ -0,0 +1,69 @@
"""SQLModel mirror of the ``landlord_wall_type_overrides`` Drizzle table.
The schema source of truth lives in the ``assessment-model`` TS repo
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
this row class only mirrors the columns so the Python lambda can read/write.
See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only
differences are the table name, the ``wall_type`` pgEnum on ``value``, and
the unique-constraint name.
"""
from datetime import datetime, timezone
from typing import ClassVar
from uuid import UUID, uuid4
from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.wall_type import WallType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
class LandlordWallTypeOverrideRow(SQLModel, table=True):
__tablename__: ClassVar[str] = "landlord_wall_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
__table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
UniqueConstraint(
"portfolio_id",
"description",
name="landlord_wall_type_overrides_portfolio_description_unique",
),
)
id: UUID = Field(default_factory=uuid4, primary_key=True)
# bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
# mapping is 32-bit Integer and would overflow once portfolio IDs exceed
# 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
# not declared here -- the ``portfolio`` table is not modelled in Python.
portfolio_id: int = Field(
sa_column=Column(BigInteger, nullable=False, index=True),
)
description: str = Field(nullable=False)
value: WallType = Field(
sa_column=Column(
SAEnum(
WallType,
name="wall_type",
values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
),
nullable=False,
),
)
# Shared SAEnum -- see ``landlord_override_enums`` for why this single
# instance is reused by every ``landlord_*_overrides`` row class.
source: str = Field(
sa_column=Column(override_source_sa_enum, nullable=False),
)
created_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
nullable=False,
)
updated_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
nullable=False,
)

View file

View file

@ -0,0 +1,62 @@
import csv
from io import StringIO
from infrastructure.s3.s3_client import S3Client
from infrastructure.s3.s3_uri import parse_s3_uri
def _dedupe_fieldnames(fieldnames: list[str]) -> list[str]:
"""Disambiguate repeated CSV headers by appending an index.
The first occurrence keeps its name; each later one becomes
``name_1``, ``name_2`` so duplicate columns survive as distinct
keys instead of collapsing onto one (last-wins) dict entry.
"""
deduped: list[str] = []
counts: dict[str, int] = {}
for name in fieldnames:
if name not in counts:
counts[name] = 0
deduped.append(name)
continue
counts[name] += 1
candidate = f"{name}_{counts[name]}"
while candidate in counts:
counts[name] += 1
candidate = f"{name}_{counts[name]}"
counts[candidate] = 0
deduped.append(candidate)
return deduped
class CsvS3Client(S3Client):
def read_rows(self, s3_uri: str) -> list[dict[str, str]]:
bucket, key = parse_s3_uri(s3_uri)
if bucket != self.bucket:
raise ValueError(
f"s3_uri bucket {bucket!r} does not match client bucket {self.bucket!r}"
)
raw = self.get_object(key)
try:
text = raw.decode("utf-8-sig")
except UnicodeDecodeError:
# Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8.
text = raw.decode("cp1252")
buffer = StringIO(text)
header = next(csv.reader(buffer), None)
if header is None:
return []
fieldnames = _dedupe_fieldnames(header)
reader = csv.DictReader(buffer, fieldnames=fieldnames)
return [dict(row) for row in reader]
def save_rows(self, rows: list[dict[str, str]], key: str) -> str:
if not rows:
raise ValueError("Cannot save an empty rows list: header is unknown")
buffer = StringIO()
fieldnames = list(rows[0].keys())
writer = csv.DictWriter(buffer, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
return self.put_object(key, buffer.getvalue().encode("utf-8"))

View file

@ -0,0 +1,37 @@
from __future__ import annotations
from dataclasses import dataclass
from enum import Enum
from typing import Generic, TypeVar
from domain.data_transformation.column_classifier import ColumnClassifier
from repositories.landlord_overrides.landlord_override_repository import (
LandlordOverrideRepository,
)
E = TypeVar("E", bound=Enum)
@dataclass(frozen=True)
class ClassifiableColumn(Generic[E]):
"""Pairs a column's classifier with the repository that persists its results.
The orchestrator registers one ``ClassifiableColumn`` per
(source column, target enum) pair. Bundling the classifier and the
repository together makes the "this enum lands in this table" invariant
structural -- the handler can no longer wire ``PropertyType``
classifications to a ``WallType`` repo by keying two dicts with the same
string.
``source_column`` is the landlord-CSV header to read from; ``name`` is the
unique key the orchestrator uses to report this classification's results
(and the key the handler logs). Two ``ClassifiableColumn``s may share a
``source_column`` -- e.g. the ``"Property Type"`` CSV column feeds both
``PropertyType`` and ``BuiltFormType`` classifiers off the same free-text
description -- but each must have a unique ``name``.
"""
name: str
source_column: str
classifier: ColumnClassifier[E]
repo: LandlordOverrideRepository[E]

View file

@ -0,0 +1,142 @@
from enum import Enum
from typing import Any
from domain.addresses.unstandardised_address import AddressList
from orchestration.classifiable_column import ClassifiableColumn
from repositories.unstandardised_address.unstandardised_address_list_repository import (
UnstandardisedAddressListRepository,
)
class LandlordDescriptionOverridesOrchestrator:
def __init__(
self,
unstandardised_address_repo: UnstandardisedAddressListRepository,
columns: list[ClassifiableColumn[Any]],
) -> None:
self._unstandardised_address_repo = unstandardised_address_repo
# Each entry is one (source CSV column, target enum) classification.
# Two entries may share ``source_column`` -- e.g. ``"Property Type"``
# feeds both PropertyType and BuiltFormType classifiers -- so the
# registry is a list rather than a dict keyed by header.
self._columns = columns
def get_unstandardised_addresses(
self,
input_s3_uri: str,
) -> AddressList:
return self._unstandardised_address_repo.load_batch(input_s3_uri)
def get_col_to_description_mappings(
self, list_of_unstandardised_address: AddressList
) -> dict[str, set[str]]:
mappings: dict[str, set[str]] = {}
for unstandardised_address in list_of_unstandardised_address:
for key, value in unstandardised_address.additional_info.items():
bucket = mappings.setdefault(key, set())
# A comma-separated value is several descriptions in one cell;
# split it so each is its own entry. Lower-case so case-only
# typos collapse to one variant.
for variant in value.split(","):
variant = variant.strip().lower()
if variant:
bucket.add(variant)
return mappings
def classify_columns(
self, addresses: AddressList
) -> dict[str, dict[str, Enum]]:
"""Classify every registered column's descriptions.
Returns a mapping of ``ClassifiableColumn.name`` to
``{description: category}``. A registered column whose ``source_column``
is absent from the addresses contributes an empty inner mapping.
"""
col_to_desc = self.get_col_to_description_mappings(addresses)
return {
column.name: column.classifier.classify(
col_to_desc.get(column.source_column, set())
)
for column in self._columns
}
def persist(
self, classified: dict[str, dict[str, Enum]], portfolio_id: int
) -> None:
"""Persist already-classified results via each column's repository.
``classified`` is keyed by ``ClassifiableColumn.name`` -- the shape
``classify_columns`` and ``classify_from_rows`` return. Each non-empty
mapping is written through the column's own repo under
``source = 'classifier'``; an empty mapping (a registered column absent
from this batch) skips the DB round-trip.
The orchestrator does not commit -- the caller owns the transaction
boundary, and is expected to open it only around this call so the
slow classification never holds a connection.
"""
for column in self._columns:
mapping = classified.get(column.name)
if not mapping:
continue
column.repo.upsert_all(portfolio_id, mapping)
def classify_and_persist(
self, addresses: AddressList, portfolio_id: int
) -> dict[str, dict[str, Enum]]:
"""Classify every registered column and persist the results.
Returns the same shape as ``classify_columns`` so callers can log
per-column counts.
"""
classified = self.classify_columns(addresses)
self.persist(classified, portfolio_id)
return classified
def classify_from_rows(
self, rows: list[dict[str, str]]
) -> dict[str, dict[str, Enum]]:
"""Classify raw CSV rows without touching the database.
The classification half of ``classify_and_persist_from_rows``, split
out so a caller can run the slow ChatGPT work *before* opening a
transaction and then write the finished results with ``persist`` inside
one short-lived connection.
Unlike the ``AddressList`` path this builds no ``AddressList``, so it
has no canonical address/postcode requirement -- the classifier only
needs the raw description cells. Used when reading the original
landlord upload (raw headers) rather than the address-matching CSV.
"""
col_to_desc = self._descriptions_from_rows(rows)
return {
column.name: column.classifier.classify(
col_to_desc.get(column.source_column, set())
)
for column in self._columns
}
def classify_and_persist_from_rows(
self, rows: list[dict[str, str]], portfolio_id: int
) -> dict[str, dict[str, Enum]]:
"""Classify + persist straight from raw CSV rows in one call.
A convenience composition of ``classify_from_rows`` + ``persist``.
Prefer calling the two separately when classification is slow, so the
transaction opens only around ``persist`` (see the Lambda handler).
"""
classified = self.classify_from_rows(rows)
self.persist(classified, portfolio_id)
return classified
@staticmethod
def _descriptions_from_rows(rows: list[dict[str, str]]) -> dict[str, set[str]]:
mappings: dict[str, set[str]] = {}
for row in rows:
for key, value in row.items():
bucket = mappings.setdefault(key, set())
for variant in (value or "").split(","):
variant = variant.strip().lower()
if variant:
bucket.add(variant)
return mappings

View file

@ -5,19 +5,21 @@ from uuid import UUID
from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
from orchestration.task_orchestrator import TaskOrchestrator
from domain.addresses.postcode_batching import iter_postcode_grouped_batches
from repositories.user_address.user_address_repository import UserAddressRepository
from repositories.unstandardised_address.unstandardised_address_list_repository import (
UnstandardisedAddressListRepository,
)
class PostcodeSplitterOrchestrator:
def __init__(
self,
task_orchestrator: TaskOrchestrator,
user_address_repo: UserAddressRepository,
unstandardised_address_repo: UnstandardisedAddressListRepository,
queue_client: Address2UprnQueueClient,
max_batch_size: int = 500,
) -> None:
self._task_orchestrator = task_orchestrator
self._user_address_repo = user_address_repo
self._unstandardised_address_repo = unstandardised_address_repo
self._queue_client = queue_client
self._max_batch_size = max_batch_size
@ -28,7 +30,7 @@ class PostcodeSplitterOrchestrator:
parent_subtask_id: UUID,
input_s3_uri: str,
) -> list[UUID]:
addresses = self._user_address_repo.load_batch(input_s3_uri)
addresses = self._unstandardised_address_repo.load_batch(input_s3_uri)
path_prefix = (
f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}"
)
@ -37,7 +39,7 @@ class PostcodeSplitterOrchestrator:
for batch in iter_postcode_grouped_batches(
addresses, max_batch_size=self._max_batch_size
):
batch_uri = self._user_address_repo.save_batch(batch, path_prefix)
batch_uri = self._unstandardised_address_repo.save_batch(batch, path_prefix)
child = self._task_orchestrator.create_child_subtask(
parent_task_id,
inputs={

View file

@ -26,5 +26,6 @@ testpaths =
etl/hubspot/tests
etl/spatial/tests
domain/sap10_ml/tests
tests/
markers =
integration: mark a test as an integration test

View file

@ -0,0 +1,40 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from enum import Enum
from typing import Generic, TypeVar
E = TypeVar("E", bound=Enum)
class LandlordOverrideRepository(ABC, Generic[E]):
"""Port: persists landlord (description -> category) overrides for a portfolio.
One repository implementation targets one ``landlord_<category>_overrides``
table. The category enum ``E`` (e.g. ``PropertyType``, ``WallType``) determines
which table the adapter writes to; the orchestrator depends only on this
interface and never names a concrete table.
A single concrete adapter,
``infrastructure/landlord_overrides/landlord_overrides_postgres_repository.LandlordOverridesRepository``,
serves every category (see ADR-0003) -- it is parameterised by the
SQLModel row class for the target table.
"""
@abstractmethod
def upsert_all(
self,
portfolio_id: int,
descriptions_to_values: dict[str, E],
) -> None:
"""Upsert each ``(portfolio_id, description) -> value`` row with ``source='classifier'``.
On conflict with an existing row whose ``source = 'classifier'``, the row
is updated (value, source, updated_at). On conflict with a row whose
``source = 'user'``, the existing row is preserved -- the classifier
never overwrites a user correction. See ADR-0003 §Decision.
An empty ``descriptions_to_values`` mapping is a no-op; callers may
skip this call entirely when they have nothing to write.
"""
...

View file

@ -4,10 +4,12 @@ import uuid
from datetime import datetime, timezone
from typing import Optional
from domain.addresses.user_address import UserAddress
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
from domain.postcode import Postcode
from infrastructure.csv_s3_client import CsvS3Client
from repositories.user_address.user_address_repository import UserAddressRepository
from infrastructure.s3.csv_s3_client import CsvS3Client
from repositories.unstandardised_address.unstandardised_address_list_repository import (
UnstandardisedAddressListRepository,
)
_ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3")
_POSTCODE_COLUMN: str = "postcode"
@ -15,42 +17,45 @@ _INTERNAL_REFERENCE_COLUMN: str = "Internal Reference"
_POSTCODE_CLEAN_COLUMN: str = "postcode_clean"
class UserAddressCsvS3Repository(UserAddressRepository):
class UnstandardisedAddressListCsvS3Repository(UnstandardisedAddressListRepository):
def __init__(self, csv_client: CsvS3Client, bucket: str) -> None:
self._csv_client = csv_client
self._bucket = bucket
def load_batch(self, s3_uri: str) -> list[UserAddress]:
def load_batch(self, s3_uri: str) -> AddressList:
rows = self._csv_client.read_rows(s3_uri)
if rows and _POSTCODE_COLUMN not in rows[0]:
raise ValueError(
f"Input CSV {s3_uri} has no {_POSTCODE_COLUMN!r} column; "
f"columns present: {sorted(rows[0])}"
)
addresses: list[UserAddress] = []
addresses: AddressList = AddressList([])
for row in rows:
parts = [
row[col].strip()
for col in _ADDRESS_COLUMNS
if col in row and row[col].strip()
]
user_address = ", ".join(parts)
unstandardised_address = ", ".join(parts)
postcode = row.get(_POSTCODE_COLUMN, "")
raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip()
internal_reference: Optional[str] = raw_ref or None
addresses.append(
UserAddress(
user_address=user_address,
UnstandardisedAddress(
address=unstandardised_address,
postcode=Postcode(postcode),
internal_reference=internal_reference,
source_row=row,
org_reference=internal_reference,
additional_info=row,
)
)
return addresses
def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str:
def save_batch(self, addresses: AddressList, path_prefix: str) -> str:
rows: list[dict[str, str]] = [
{**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)}
{
**addr.additional_info,
_POSTCODE_CLEAN_COLUMN: str(addr.postcode),
}
for addr in addresses
]

View file

@ -0,0 +1,13 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from domain.addresses.unstandardised_address import AddressList
class UnstandardisedAddressListRepository(ABC):
@abstractmethod
def load_batch(self, s3_uri: str) -> AddressList: ...
@abstractmethod
def save_batch(self, addresses: AddressList, path_prefix: str) -> str: ...

View file

@ -1,13 +0,0 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from domain.addresses.user_address import UserAddress
class UserAddressRepository(ABC):
@abstractmethod
def load_batch(self, s3_uri: str) -> list[UserAddress]: ...
@abstractmethod
def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: ...

View file

@ -10,4 +10,5 @@ fuzzywuzzy
pymupdf
playwright==1.58.0
msal
moto[s3,sqs]
moto[s3,sqs]
openai==1.93.0

View file

@ -1,17 +1,17 @@
import pytest
from domain.addresses.postcode_batching import iter_postcode_grouped_batches
from domain.addresses.user_address import UserAddress
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
from domain.postcode import Postcode
def _addrs(postcode: str, n: int) -> list[UserAddress]:
return [
UserAddress(
user_address=f"{i} {postcode} Street", postcode=Postcode(postcode)
)
for i in range(n)
]
def _addrs(postcode: str, n: int) -> AddressList:
return AddressList(
[
UnstandardisedAddress(address=f"{i} {postcode} Street", postcode=Postcode(postcode))
for i in range(n)
]
)
def test_empty_input_yields_no_batches() -> None:
@ -74,9 +74,7 @@ def test_oversize_group_flushes_existing_buffer_first() -> None:
big = _addrs("BB2 2BB", 7)
tail = _addrs("CC3 3CC", 1)
# act
batches = list(
iter_postcode_grouped_batches(small + big + tail, max_batch_size=5)
)
batches = list(iter_postcode_grouped_batches(small + big + tail, max_batch_size=5))
# assert
assert len(batches) == 3
assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"]

View file

@ -0,0 +1,96 @@
import dataclasses
import pytest
from domain.addresses.unstandardised_address import UnstandardisedAddress
from domain.postcode import Postcode
def test_unstandardised_address_holds_postcode_value_object() -> None:
# act
addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa"))
# assert
assert addr.postcode == Postcode("SW1A1AA")
def test_unstandardised_address_preserves_unstandardised_address_verbatim() -> None:
# The free-text unstandardised_address string is intentionally NOT normalised --
# only the postcode is canonicalised, and that happens inside Postcode.
# act
addr = UnstandardisedAddress(address=" 1 The Street ", postcode=Postcode("SW1A1AA"))
# assert
assert addr.address == " 1 The Street "
def test_unstandardised_address_internal_reference_defaults_to_none() -> None:
# act
addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA"))
# assert
assert addr.org_reference is None
def test_unstandardised_address_internal_reference_accepted() -> None:
# act
addr = UnstandardisedAddress(
address="1 The Street",
postcode=Postcode("SW1A1AA"),
org_reference="cust-42",
)
# assert
assert addr.org_reference == "cust-42"
def test_unstandardised_address_is_frozen() -> None:
# arrange
addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA"))
# act / assert
with pytest.raises(dataclasses.FrozenInstanceError):
addr.postcode = Postcode("OTHER") # type: ignore[misc]
def test_unstandardised_address_equality_uses_canonical_postcode() -> None:
# Postcode sanitises eagerly, so addresses built from different surface
# forms of the same postcode compare equal.
# arrange
a = UnstandardisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa"))
b = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA"))
# act / assert
assert a == b
def test_unstandardised_address_source_row_defaults_to_empty_dict() -> None:
# act
addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA"))
# assert
assert addr.additional_info == {}
def test_unstandardised_address_carries_source_row() -> None:
# arrange
row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"}
# act
addr = UnstandardisedAddress(
address="1 The Street",
postcode=Postcode("SW1A 1AA"),
additional_info=row,
)
# assert
assert addr.additional_info == row
def test_unstandardised_address_equality_ignores_source_row() -> None:
# source_row is excluded from equality (and hashing): identity stays
# defined by the parsed fields.
# arrange
a = UnstandardisedAddress(
address="1 The Street",
postcode=Postcode("SW1A1AA"),
additional_info={"x": "1"},
)
b = UnstandardisedAddress(
address="1 The Street",
postcode=Postcode("SW1A1AA"),
additional_info={"y": "2"},
)
# act / assert
assert a == b

View file

@ -1,98 +0,0 @@
import dataclasses
import pytest
from domain.addresses.user_address import UserAddress
from domain.postcode import Postcode
def test_user_address_holds_postcode_value_object() -> None:
# act
addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
# assert
assert addr.postcode == Postcode("SW1A1AA")
def test_user_address_preserves_user_address_verbatim() -> None:
# The free-text user_address string is intentionally NOT normalised --
# only the postcode is canonicalised, and that happens inside Postcode.
# act
addr = UserAddress(
user_address=" 1 The Street ", postcode=Postcode("SW1A1AA")
)
# assert
assert addr.user_address == " 1 The Street "
def test_user_address_internal_reference_defaults_to_none() -> None:
# act
addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
# assert
assert addr.internal_reference is None
def test_user_address_internal_reference_accepted() -> None:
# act
addr = UserAddress(
user_address="1 The Street",
postcode=Postcode("SW1A1AA"),
internal_reference="cust-42",
)
# assert
assert addr.internal_reference == "cust-42"
def test_user_address_is_frozen() -> None:
# arrange
addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
# act / assert
with pytest.raises(dataclasses.FrozenInstanceError):
addr.postcode = Postcode("OTHER") # type: ignore[misc]
def test_user_address_equality_uses_canonical_postcode() -> None:
# Postcode sanitises eagerly, so addresses built from different surface
# forms of the same postcode compare equal.
# arrange
a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
# act / assert
assert a == b
def test_user_address_source_row_defaults_to_empty_dict() -> None:
# act
addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
# assert
assert addr.source_row == {}
def test_user_address_carries_source_row() -> None:
# arrange
row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"}
# act
addr = UserAddress(
user_address="1 The Street",
postcode=Postcode("SW1A 1AA"),
source_row=row,
)
# assert
assert addr.source_row == row
def test_user_address_equality_ignores_source_row() -> None:
# source_row is excluded from equality (and hashing): identity stays
# defined by the parsed fields.
# arrange
a = UserAddress(
user_address="1 The Street",
postcode=Postcode("SW1A1AA"),
source_row={"x": "1"},
)
b = UserAddress(
user_address="1 The Street",
postcode=Postcode("SW1A1AA"),
source_row={"y": "2"},
)
# act / assert
assert a == b

View file

View file

@ -0,0 +1,185 @@
from __future__ import annotations
from typing import Optional
import pytest
from domain.data_transformation.column_classifier import ClassificationError
from domain.epc.property_type import PropertyType
from domain.epc.wall_type import WallType
from infrastructure.chatgpt.chatgpt import ChatGPT
from infrastructure.chatgpt.chatgpt_column_classifier import (
ChatGptColumnClassifier,
)
from infrastructure.chatgpt.exceptions import ChatGPTClientError
class _FakeChatGPT(ChatGPT):
"""Hand-written ChatGPT stand-in: returns a canned reply, records prompts."""
def __init__(
self,
reply: str = "{}",
error: Optional[Exception] = None,
) -> None:
self.prompts: list[str] = []
self.system_prompts: list[Optional[str]] = []
self._reply = reply
self._error = error
def generate(self, prompt: str, system_prompt: Optional[str] = None) -> str:
self.prompts.append(prompt)
self.system_prompts.append(system_prompt)
if self._error is not None:
raise self._error
return self._reply
def _property_type_classifier(
chat_gpt: ChatGPT,
) -> ChatGptColumnClassifier[PropertyType]:
return ChatGptColumnClassifier(chat_gpt, PropertyType, PropertyType.UNKNOWN)
def test_classifies_description_into_its_category() -> None:
# Arrange
chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}')
classifier = _property_type_classifier(chat_gpt)
# Act
result = classifier.classify({"semi-detached"})
# Assert
assert result == {"semi-detached": PropertyType.HOUSE}
def test_classifies_when_reply_is_wrapped_in_a_markdown_fence() -> None:
# Arrange: ChatGPT wraps the JSON in a ```json ... ``` code fence.
chat_gpt = _FakeChatGPT(reply='```json\n{"semi-detached": "House"}\n```')
classifier = _property_type_classifier(chat_gpt)
# Act
result = classifier.classify({"semi-detached"})
# Assert
assert result == {"semi-detached": PropertyType.HOUSE}
def test_unrecognised_category_maps_to_unknown() -> None:
# Arrange
chat_gpt = _FakeChatGPT(reply='{"garden shed": "Shed"}')
classifier = _property_type_classifier(chat_gpt)
# Act
result = classifier.classify({"garden shed"})
# Assert
assert result == {"garden shed": PropertyType.UNKNOWN}
def test_description_omitted_from_reply_maps_to_unknown() -> None:
# Arrange: the reply classifies one description but not the other.
chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}')
classifier = _property_type_classifier(chat_gpt)
# Act
result = classifier.classify({"semi-detached", "TBC"})
# Assert
assert result == {
"semi-detached": PropertyType.HOUSE,
"TBC": PropertyType.UNKNOWN,
}
def test_chatgpt_failure_raises_classification_error() -> None:
# Arrange
chat_gpt = _FakeChatGPT(error=ChatGPTClientError("backend unreachable"))
classifier = _property_type_classifier(chat_gpt)
# Act / Assert
with pytest.raises(ClassificationError):
classifier.classify({"semi-detached"})
def test_non_json_reply_raises_classification_error_with_the_raw_reply() -> None:
# Arrange
chat_gpt = _FakeChatGPT(reply="sorry, I can't do that")
classifier = _property_type_classifier(chat_gpt)
# Act / Assert: the error surfaces the offending reply for diagnosis.
with pytest.raises(ClassificationError, match="sorry, I can't do that"):
classifier.classify({"semi-detached"})
def test_empty_description_set_returns_empty_without_calling_chatgpt() -> None:
# Arrange
chat_gpt = _FakeChatGPT(reply='{"unused": "House"}')
classifier = _property_type_classifier(chat_gpt)
# Act
result = classifier.classify(set())
# Assert
assert result == {}
assert chat_gpt.prompts == []
def test_classifies_with_a_different_category_enum() -> None:
# Arrange: the same adapter classifies a WallType column.
chat_gpt = _FakeChatGPT(
reply='{"solid brick wall": "Solid brick, as built, no insulation (assumed)"}'
)
classifier = ChatGptColumnClassifier(chat_gpt, WallType, WallType.UNKNOWN)
# Act
result = classifier.classify({"solid brick wall"})
# Assert
assert result == {
"solid brick wall": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED
}
def test_extra_instructions_are_appended_to_the_system_prompt() -> None:
# Arrange: column-specific guidance (e.g. wall-type build-era hints)
# should reach the model verbatim, in the system prompt ahead of the
# JSON-output instruction.
chat_gpt = _FakeChatGPT(reply='{"1970s semi": "House"}')
classifier = ChatGptColumnClassifier(
chat_gpt,
PropertyType,
PropertyType.UNKNOWN,
extra_instructions="If the description carries a build decade, prefer X.",
)
# Act
classifier.classify({"1970s semi"})
# Assert: the hint sits in the system prompt, before the JSON instruction.
system_prompt = chat_gpt.system_prompts[0]
assert system_prompt is not None
assert "If the description carries a build decade, prefer X." in system_prompt
hint_index = system_prompt.index("If the description carries a build decade")
json_index = system_prompt.index("Reply with only a JSON object")
assert hint_index < json_index
def test_omitting_extra_instructions_leaves_the_system_prompt_unchanged() -> None:
# Arrange: a classifier without per-column guidance must still produce
# the original system prompt -- no trailing whitespace, no orphan hint.
chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}')
classifier = ChatGptColumnClassifier(chat_gpt, PropertyType, PropertyType.UNKNOWN)
# Act
classifier.classify({"semi-detached"})
# Assert
system_prompt = chat_gpt.system_prompts[0]
assert system_prompt is not None
assert system_prompt == (
"Classify each free-text description into exactly one category. "
"Categories: House, Bungalow, Flat, Maisonette, Park home. "
"Reply with only a JSON object mapping each original description "
"to its category, and nothing else."
)

View file

@ -3,7 +3,7 @@ from collections.abc import Iterator
import pytest
from moto import mock_aws
from infrastructure.csv_s3_client import CsvS3Client
from infrastructure.s3.csv_s3_client import CsvS3Client
from tests.infrastructure import make_boto_client
BUCKET = "csv-bucket"
@ -49,3 +49,36 @@ def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None:
# act / assert
with pytest.raises(ValueError, match="does not match client bucket"):
csv_client.read_rows("s3://other-bucket/uploads/addresses.csv")
def test_read_rows_indexes_duplicate_column_names(csv_client: CsvS3Client) -> None:
# arrange: the Hyde export has two columns both headed "Walls" — a
# description and a score. Without disambiguation csv.DictReader would
# collapse them onto one key and the description would be lost.
raw = "Address 1,Walls,Roofs,Walls\n1 High St,Cavity: Filled,Pitched 300mm,9.6\n"
uri = csv_client.put_object("uploads/dup.csv", raw.encode("utf-8"))
# act
rows = csv_client.read_rows(uri)
# assert: the first occurrence keeps its name, the second gets an index.
assert rows == [
{
"Address 1": "1 High St",
"Walls": "Cavity: Filled",
"Roofs": "Pitched 300mm",
"Walls_1": "9.6",
}
]
def test_read_rows_indexes_each_repeat_of_a_column(csv_client: CsvS3Client) -> None:
# arrange: three columns sharing one header.
raw = "Walls,Walls,Walls\nfirst,second,third\n"
uri = csv_client.put_object("uploads/triple.csv", raw.encode("utf-8"))
# act
rows = csv_client.read_rows(uri)
# assert
assert rows == [{"Walls": "first", "Walls_1": "second", "Walls_2": "third"}]

View file

@ -3,7 +3,7 @@ from collections.abc import Iterator
import pytest
from moto import mock_aws
from infrastructure.s3_client import S3Client
from infrastructure.s3.s3_client import S3Client
from tests.infrastructure import make_boto_client
BUCKET = "test-bucket"

View file

@ -1,6 +1,6 @@
import pytest
from infrastructure.s3_uri import parse_s3_uri
from infrastructure.s3.s3_uri import parse_s3_uri
def test_parses_simple_s3_uri() -> None:

View file

@ -0,0 +1,469 @@
from __future__ import annotations
from enum import Enum
from typing import Any, Optional
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
from domain.epc.built_form_type import BuiltFormType
from domain.epc.property_type import PropertyType
from domain.epc.wall_type import WallType
from domain.postcode import Postcode
from domain.data_transformation.column_classifier import ColumnClassifier
from orchestration.classifiable_column import ClassifiableColumn
from orchestration.landlord_description_overrides_orchestrator import (
LandlordDescriptionOverridesOrchestrator,
)
from repositories.landlord_overrides.landlord_override_repository import (
LandlordOverrideRepository,
)
from repositories.unstandardised_address.unstandardised_address_list_repository import (
UnstandardisedAddressListRepository,
)
class _StubUnstandardisedAddressRepository(UnstandardisedAddressListRepository):
"""``get_col_to_description_mappings`` never touches the repo."""
def load_batch(self, s3_uri: str) -> AddressList:
raise NotImplementedError()
def save_batch(self, addresses: AddressList, path_prefix: str) -> str:
raise NotImplementedError()
class _StubColumnClassifier(ColumnClassifier[Enum]):
"""Records the descriptions it received; returns a canned mapping."""
def __init__(self, result: dict[str, Enum]) -> None:
self.received: Optional[set[str]] = None
self._result = result
def classify(self, descriptions: set[str]) -> dict[str, Enum]:
self.received = descriptions
return self._result
class _StubLandlordOverrideRepository(LandlordOverrideRepository[Enum]):
"""Records every ``upsert_all`` call so tests can assert routing."""
def __init__(self) -> None:
self.calls: list[tuple[int, dict[str, Enum]]] = []
def upsert_all(
self, portfolio_id: int, descriptions_to_values: dict[str, Enum]
) -> None:
self.calls.append((portfolio_id, dict(descriptions_to_values)))
def _make_unstandardised_address(
landlord_additional_info: dict[str, str],
) -> UnstandardisedAddress:
return UnstandardisedAddress(
address="1 High St",
postcode=Postcode("AA1 1AA"),
additional_info=landlord_additional_info,
)
def _orchestrator(
columns: Optional[list[ClassifiableColumn[Any]]] = None,
) -> LandlordDescriptionOverridesOrchestrator:
return LandlordDescriptionOverridesOrchestrator(
unstandardised_address_repo=_StubUnstandardisedAddressRepository(),
columns=columns or [],
)
def _column(
name: str,
source_column: str,
classifier: ColumnClassifier[Any],
repo: Optional[LandlordOverrideRepository[Any]] = None,
) -> ClassifiableColumn[Any]:
return ClassifiableColumn(
name=name,
source_column=source_column,
classifier=classifier,
repo=repo or _StubLandlordOverrideRepository(),
)
def test_collects_every_value_per_shared_key() -> None:
# arrange: every address carries the same keys, all values distinct.
addresses = AddressList(
[
_make_unstandardised_address({"description": "cosy", "condition": "new"}),
_make_unstandardised_address({"description": "spacious", "condition": "worn"}),
_make_unstandardised_address({"description": "bright", "condition": "fair"}),
]
)
# act
mappings = _orchestrator().get_col_to_description_mappings(addresses)
# assert
assert mappings == {
"description": {"cosy", "spacious", "bright"},
"condition": {"new", "worn", "fair"},
}
def test_repeated_values_collapse_to_one_variant() -> None:
# arrange: two addresses share the same wall description.
addresses = AddressList(
[
_make_unstandardised_address({"description": "cosy"}),
_make_unstandardised_address({"description": "cosy"}),
_make_unstandardised_address({"description": "bright"}),
]
)
# act
mappings = _orchestrator().get_col_to_description_mappings(addresses)
# assert: a set keeps one entry per distinct variant.
assert mappings == {"description": {"cosy", "bright"}}
def test_case_only_variants_collapse_to_one() -> None:
# arrange: the same description typed with inconsistent casing.
addresses = AddressList(
[
_make_unstandardised_address({"description": "Cosy"}),
_make_unstandardised_address({"description": "cosy"}),
_make_unstandardised_address({"description": "COSY"}),
]
)
# act
mappings = _orchestrator().get_col_to_description_mappings(addresses)
# assert: lower-casing folds the casing typos into one variant.
assert mappings == {"description": {"cosy"}}
def test_comma_separated_value_splits_into_individual_entries() -> None:
# arrange: a single cell packs several descriptions, comma-separated.
addresses = AddressList(
[_make_unstandardised_address({"description": "cosy, bright, COSY"})]
)
# act
mappings = _orchestrator().get_col_to_description_mappings(addresses)
# assert: each comma-separated part is its own trimmed, lower-cased entry.
assert mappings == {"description": {"cosy", "bright"}}
def test_empty_address_list_yields_empty_mapping() -> None:
# arrange / act
mappings = _orchestrator().get_col_to_description_mappings(AddressList([]))
# assert
assert mappings == {}
def test_single_address_yields_single_value_per_key() -> None:
# arrange
addresses = AddressList([_make_unstandardised_address({"description": "cosy"})])
# act
mappings = _orchestrator().get_col_to_description_mappings(addresses)
# assert
assert mappings == {"description": {"cosy"}}
def test_classify_columns_classifies_each_registered_column() -> None:
# arrange: addresses carry two classifiable columns.
addresses = AddressList(
[
_make_unstandardised_address(
{"Property Type": "semi-detached", "Walls": "solid brick"}
),
]
)
property_types = _StubColumnClassifier(
result={"semi-detached": PropertyType.HOUSE}
)
wall_types = _StubColumnClassifier(result={"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED})
# act
result = _orchestrator(
[
_column("property_type", "Property Type", property_types),
_column("wall_type", "Walls", wall_types),
]
).classify_columns(addresses)
# assert: each registered column was classified independently, keyed by name.
assert result == {
"property_type": {"semi-detached": PropertyType.HOUSE},
"wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED},
}
def test_classify_columns_yields_empty_mapping_for_an_absent_column() -> None:
# arrange: a classifier is registered for a source column the addresses lack.
addresses = AddressList([_make_unstandardised_address({"Walls": "cavity"})])
property_types = _StubColumnClassifier(result={})
# act
result = _orchestrator(
[_column("property_type", "Property Type", property_types)]
).classify_columns(addresses)
# assert: the absent column classified an empty description set.
assert result == {"property_type": {}}
assert property_types.received == set()
def test_classify_columns_runs_two_classifiers_against_a_shared_source_column() -> None:
# arrange: the "Property Type" landlord column feeds two classifiers --
# PropertyType (what kind of dwelling) and BuiltFormType (how it joins
# to neighbours). Both must run against the same description set; each
# result is keyed by its column's ``name``.
addresses = AddressList(
[_make_unstandardised_address({"Property Type": "semi-detached house"})]
)
property_types = _StubColumnClassifier(
result={"semi-detached house": PropertyType.HOUSE}
)
built_form_types = _StubColumnClassifier(
result={"semi-detached house": BuiltFormType.SEMI_DETACHED}
)
# act
result = _orchestrator(
[
_column("property_type", "Property Type", property_types),
_column("built_form_type", "Property Type", built_form_types),
]
).classify_columns(addresses)
# assert: both classifiers saw the same description set, and the two
# results live under their own ``name`` keys without colliding.
assert property_types.received == {"semi-detached house"}
assert built_form_types.received == {"semi-detached house"}
assert result == {
"property_type": {"semi-detached house": PropertyType.HOUSE},
"built_form_type": {"semi-detached house": BuiltFormType.SEMI_DETACHED},
}
def test_classify_and_persist_writes_each_columns_mapping_to_its_own_repo() -> None:
# arrange: two columns with distinct repos -- the orchestrator must
# route each column's classifications to its own repo, not mix them.
addresses = AddressList(
[
_make_unstandardised_address(
{"Property Type": "semi-detached", "Walls": "solid brick"}
),
]
)
property_type_repo = _StubLandlordOverrideRepository()
wall_type_repo = _StubLandlordOverrideRepository()
columns: list[ClassifiableColumn[Any]] = [
_column(
"property_type",
"Property Type",
_StubColumnClassifier({"semi-detached": PropertyType.HOUSE}),
property_type_repo,
),
_column(
"wall_type",
"Walls",
_StubColumnClassifier({"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}),
wall_type_repo,
),
]
# act
result = _orchestrator(columns).classify_and_persist(addresses, portfolio_id=42)
# assert: each repo received exactly its own column's mapping, under the
# given portfolio_id, and the return value mirrors classify_columns.
assert property_type_repo.calls == [(42, {"semi-detached": PropertyType.HOUSE})]
assert wall_type_repo.calls == [
(42, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED})
]
assert result == {
"property_type": {"semi-detached": PropertyType.HOUSE},
"wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED},
}
def test_classify_and_persist_skips_upsert_for_a_column_absent_from_the_batch() -> None:
# arrange: ``Walls`` is registered but the address has no ``Walls`` column.
# The orchestrator should still classify (yielding an empty mapping) but
# must NOT call ``upsert_all`` -- an empty bulk insert is a noisy no-op.
addresses = AddressList(
[_make_unstandardised_address({"Property Type": "semi-detached"})]
)
property_type_repo = _StubLandlordOverrideRepository()
wall_type_repo = _StubLandlordOverrideRepository()
columns: list[ClassifiableColumn[Any]] = [
_column(
"property_type",
"Property Type",
_StubColumnClassifier({"semi-detached": PropertyType.HOUSE}),
property_type_repo,
),
_column(
"wall_type",
"Walls",
_StubColumnClassifier({}),
wall_type_repo,
),
]
# act
_orchestrator(columns).classify_and_persist(addresses, portfolio_id=7)
# assert: Property Type wrote; Walls did not.
assert property_type_repo.calls == [(7, {"semi-detached": PropertyType.HOUSE})]
assert wall_type_repo.calls == []
def test_classify_from_rows_classifies_each_column_without_persisting() -> None:
# arrange: raw CSV rows (not an AddressList) carry two classifiable columns.
rows = [{"Property Type": "semi-detached", "Walls": "solid brick"}]
property_types = _StubColumnClassifier({"semi-detached": PropertyType.HOUSE})
wall_types = _StubColumnClassifier(
{"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}
)
property_type_repo = _StubLandlordOverrideRepository()
wall_type_repo = _StubLandlordOverrideRepository()
# act
result = _orchestrator(
[
_column("property_type", "Property Type", property_types, property_type_repo),
_column("wall_type", "Walls", wall_types, wall_type_repo),
]
).classify_from_rows(rows)
# assert: each classifier ran against its column's descriptions, keyed by
# name -- and NOT a single repo was touched (classification is DB-free, so
# the slow ChatGPT work can run before any transaction opens).
assert result == {
"property_type": {"semi-detached": PropertyType.HOUSE},
"wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED},
}
assert property_type_repo.calls == []
assert wall_type_repo.calls == []
def test_classify_from_rows_splits_and_normalises_descriptions() -> None:
# arrange: one cell packs several descriptions with inconsistent casing,
# spread across rows. The rows path must fold them exactly like the
# AddressList path: comma-split, trimmed, lower-cased, de-duped.
rows = [
{"Walls": "Solid Brick, cavity"},
{"Walls": "SOLID BRICK"},
]
wall_types = _StubColumnClassifier({})
# act
_orchestrator(
[_column("wall_type", "Walls", wall_types)]
).classify_from_rows(rows)
# assert: the classifier saw one normalised entry per distinct variant.
assert wall_types.received == {"solid brick", "cavity"}
def test_classify_from_rows_yields_empty_mapping_for_an_absent_column() -> None:
# arrange: a column is registered for a header the rows lack.
rows = [{"Walls": "cavity"}]
property_types = _StubColumnClassifier({})
# act
result = _orchestrator(
[_column("property_type", "Property Type", property_types)]
).classify_from_rows(rows)
# assert: the absent column classified an empty description set.
assert result == {"property_type": {}}
assert property_types.received == set()
def test_persist_routes_each_columns_mapping_to_its_own_repo() -> None:
# arrange: a finished ``classified`` mapping (as classify_* would return)
# and two columns with distinct repos.
property_type_repo = _StubLandlordOverrideRepository()
wall_type_repo = _StubLandlordOverrideRepository()
columns: list[ClassifiableColumn[Any]] = [
_column("property_type", "Property Type", _StubColumnClassifier({}), property_type_repo),
_column("wall_type", "Walls", _StubColumnClassifier({}), wall_type_repo),
]
classified: dict[str, dict[str, Enum]] = {
"property_type": {"semi-detached": PropertyType.HOUSE},
"wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED},
}
# act
_orchestrator(columns).persist(classified, portfolio_id=42)
# assert: each repo received exactly its own column's mapping.
assert property_type_repo.calls == [(42, {"semi-detached": PropertyType.HOUSE})]
assert wall_type_repo.calls == [
(42, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED})
]
def test_persist_skips_empty_and_missing_mappings() -> None:
# arrange: ``property_type`` has an empty mapping; ``wall_type`` is absent
# from ``classified`` entirely. Neither should hit the DB -- and the
# missing key must not raise (``persist`` reads with ``.get``).
property_type_repo = _StubLandlordOverrideRepository()
wall_type_repo = _StubLandlordOverrideRepository()
columns: list[ClassifiableColumn[Any]] = [
_column("property_type", "Property Type", _StubColumnClassifier({}), property_type_repo),
_column("wall_type", "Walls", _StubColumnClassifier({}), wall_type_repo),
]
classified: dict[str, dict[str, Enum]] = {"property_type": {}}
# act
_orchestrator(columns).persist(classified, portfolio_id=7)
# assert: no upserts at all.
assert property_type_repo.calls == []
assert wall_type_repo.calls == []
def test_classify_and_persist_from_rows_composes_classify_then_persist() -> None:
# arrange: the one-shot rows path must classify AND route to repos, so the
# convenience composition stays equivalent to calling the two in sequence.
rows = [{"Property Type": "semi-detached", "Walls": "solid brick"}]
property_type_repo = _StubLandlordOverrideRepository()
wall_type_repo = _StubLandlordOverrideRepository()
columns: list[ClassifiableColumn[Any]] = [
_column(
"property_type",
"Property Type",
_StubColumnClassifier({"semi-detached": PropertyType.HOUSE}),
property_type_repo,
),
_column(
"wall_type",
"Walls",
_StubColumnClassifier(
{"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}
),
wall_type_repo,
),
]
# act
result = _orchestrator(columns).classify_and_persist_from_rows(rows, portfolio_id=99)
# assert: same return shape as classify_from_rows, and each repo wrote once.
assert result == {
"property_type": {"semi-detached": PropertyType.HOUSE},
"wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED},
}
assert property_type_repo.calls == [(99, {"semi-detached": PropertyType.HOUSE})]
assert wall_type_repo.calls == [
(99, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED})
]

View file

@ -13,13 +13,13 @@ from sqlalchemy import Engine
from sqlmodel import Session
from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
from infrastructure.csv_s3_client import CsvS3Client
from infrastructure.s3.csv_s3_client import CsvS3Client
from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator
from orchestration.task_orchestrator import TaskOrchestrator
from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository
from repositories.tasks.task_postgres_repository import TaskPostgresRepository
from repositories.user_address.user_address_csv_s3_repository import (
UserAddressCsvS3Repository,
from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import (
UnstandardisedAddressListCsvS3Repository,
)
BUCKET = "splitter-bucket"
@ -27,7 +27,9 @@ REGION = "us-east-1"
def _make_boto_client(service_name: str) -> Any:
factory: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
factory: Any = (
boto3.client
) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
return factory(service_name, region_name=REGION)
@ -62,7 +64,7 @@ class Harness:
csv_client: CsvS3Client
boto_sqs: Any
queue_url: str
repo: UserAddressCsvS3Repository
repo: UnstandardisedAddressListCsvS3Repository
@pytest.fixture
@ -76,7 +78,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]:
queue_url = cast(str, queue["QueueUrl"])
csv_client = CsvS3Client(boto_s3, BUCKET)
repo = UserAddressCsvS3Repository(csv_client, BUCKET)
repo = UnstandardisedAddressListCsvS3Repository(csv_client, BUCKET)
queue_client = Address2UprnQueueClient(boto_sqs, queue_url)
# DB: ephemeral PostgreSQL TaskOrchestrator
@ -89,7 +91,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]:
splitter = PostcodeSplitterOrchestrator(
task_orchestrator=task_orchestrator,
user_address_repo=repo,
unstandardised_address_repo=repo,
queue_client=queue_client,
max_batch_size=3,
)
@ -169,10 +171,8 @@ def test_split_and_dispatch_creates_three_children_for_fixture(
harness: Harness,
) -> None:
# arrange
parent_task, parent_subtask = (
harness.task_orchestrator.create_task_with_subtask(
task_source="manual:postcode-splitter-int"
)
parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask(
task_source="manual:postcode-splitter-int"
)
input_uri = _upload_fixture_csv(harness.csv_client)
@ -197,10 +197,8 @@ def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri(
harness: Harness,
) -> None:
# arrange
parent_task, parent_subtask = (
harness.task_orchestrator.create_task_with_subtask(
task_source="manual:postcode-splitter-int"
)
parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask(
task_source="manual:postcode-splitter-int"
)
input_uri = _upload_fixture_csv(harness.csv_client)
@ -230,10 +228,8 @@ def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids(
harness: Harness,
) -> None:
# arrange
parent_task, parent_subtask = (
harness.task_orchestrator.create_task_with_subtask(
task_source="manual:postcode-splitter-int"
)
parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask(
task_source="manual:postcode-splitter-int"
)
input_uri = _upload_fixture_csv(harness.csv_client)
@ -267,10 +263,8 @@ def test_split_and_dispatch_returns_child_ids_in_dispatch_order(
harness: Harness,
) -> None:
# arrange
parent_task, parent_subtask = (
harness.task_orchestrator.create_task_with_subtask(
task_source="manual:postcode-splitter-int"
)
parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask(
task_source="manual:postcode-splitter-int"
)
input_uri = _upload_fixture_csv(harness.csv_client)

View file

@ -0,0 +1,220 @@
"""Integration tests for the consolidated landlord-overrides upsert adapter.
``LandlordOverridesRepository`` serves every ``landlord_<category>_overrides``
table; the table is selected by the ``Row`` class passed at construction. The
source-aware conflict policy lives entirely in SQL (``INSERT ... ON CONFLICT
... DO UPDATE ... WHERE existing.source = 'classifier'``), so the only way to
verify it correctly distinguishes ``EXCLUDED.source`` from the qualified
``<table>.source`` is against a real Postgres -- the ``db_engine`` fixture in
``tests/conftest.py`` spins one up per test.
Each scenario is parametrised across two distinct categories (property type and
wall type). Running the shared body against two different ``Row`` classes
proves the adapter writes to whichever table it was handed -- not just that one
table happens to work.
"""
from __future__ import annotations
from collections.abc import Iterator
from dataclasses import dataclass
from enum import Enum
from typing import cast
import pytest
from sqlalchemy import Engine, Table
from sqlmodel import Session, SQLModel, select
from domain.epc.property_type import PropertyType
from domain.epc.wall_type import WallType
from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import (
LandlordOverridesRepository,
)
from infrastructure.postgres.landlord_override_enums import OverrideSource
from infrastructure.postgres.landlord_property_type_override_table import (
LandlordPropertyTypeOverrideRow,
)
from infrastructure.postgres.landlord_wall_type_override_table import (
LandlordWallTypeOverrideRow,
)
@dataclass(frozen=True)
class Category:
"""One ``landlord_<category>_overrides`` table plus a few of its enum values.
``first``/``second`` are two distinct classifier values for re-upsert
tests; ``user_correction`` stands in for a value only a user would pick;
``unknown`` is the category's ``UNKNOWN`` member.
"""
id: str
row_type: type[SQLModel]
first: Enum
second: Enum
user_correction: Enum
unknown: Enum
CATEGORIES = [
Category(
id="property_type",
row_type=LandlordPropertyTypeOverrideRow,
first=PropertyType.FLAT,
second=PropertyType.HOUSE,
user_correction=PropertyType.BUNGALOW,
unknown=PropertyType.UNKNOWN,
),
Category(
id="wall_type",
row_type=LandlordWallTypeOverrideRow,
first=WallType.CAVITY_FILLED,
second=WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED,
user_correction=WallType.SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED,
unknown=WallType.UNKNOWN,
),
]
@pytest.fixture
def session(db_engine: Engine) -> Iterator[Session]:
with Session(db_engine) as s:
yield s
@pytest.fixture(params=CATEGORIES, ids=lambda c: c.id)
def category(request: pytest.FixtureRequest) -> Category:
return request.param
def _select_row(
session: Session, row_type: type[SQLModel], portfolio_id: int, description: str
) -> SQLModel:
# The row class is only known as ``type[SQLModel]`` here, so its column
# descriptors aren't typed; filter via the Core ``Table`` columns instead.
table = cast(Table, getattr(row_type, "__table__"))
rows = session.exec(
select(row_type).where(
table.c.portfolio_id == portfolio_id,
table.c.description == description,
)
).all()
assert len(rows) == 1, f"expected exactly one row, got {len(rows)}"
return rows[0]
def test_inserts_a_fresh_row_with_source_classifier(
session: Session, category: Category
) -> None:
# arrange
repo = LandlordOverridesRepository[Enum](session, category.row_type)
# act
repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.first})
session.commit()
# assert
row = _select_row(session, category.row_type, portfolio_id=1, description="cosy")
assert getattr(row, "value") is category.first
assert getattr(row, "source") == OverrideSource.CLASSIFIER
def test_reupsert_overwrites_a_classifier_row(
session: Session, category: Category
) -> None:
# arrange: a stale classifier row exists.
repo = LandlordOverridesRepository[Enum](session, category.row_type)
repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.first})
session.commit()
# act: re-classify with a different value.
repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.second})
session.commit()
# assert: the new classification wins.
row = _select_row(session, category.row_type, portfolio_id=1, description="cosy")
assert getattr(row, "value") is category.second
assert getattr(row, "source") == OverrideSource.CLASSIFIER
def test_reupsert_does_not_overwrite_a_user_row(
session: Session, category: Category
) -> None:
# arrange: a user has corrected the row. The classifier path never produces
# ``source = 'user'``; we install the row directly to mimic the override
# frontend.
user_row = category.row_type(
portfolio_id=1,
description="cosy",
value=category.user_correction,
source=OverrideSource.USER,
)
session.add(user_row)
session.commit()
# act: the classifier re-runs and tries to reclassify the same description.
# Under the source-aware conflict policy, this must be silently skipped --
# user edits beat classifier reruns.
repo = LandlordOverridesRepository[Enum](session, category.row_type)
repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.second})
session.commit()
# assert: the user row is unchanged.
row = _select_row(session, category.row_type, portfolio_id=1, description="cosy")
assert getattr(row, "value") is category.user_correction
assert getattr(row, "source") == OverrideSource.USER
def test_upsert_keeps_other_portfolios_descriptions_independent(
session: Session, category: Category
) -> None:
# arrange / act: the unique key is ``(portfolio_id, description)``, so the
# same description for two different portfolios must coexist as two rows.
repo = LandlordOverridesRepository[Enum](session, category.row_type)
repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.first})
repo.upsert_all(portfolio_id=2, descriptions_to_values={"cosy": category.second})
session.commit()
# assert: both rows survive with their own values.
assert (
getattr(_select_row(session, category.row_type, 1, "cosy"), "value")
is category.first
)
assert (
getattr(_select_row(session, category.row_type, 2, "cosy"), "value")
is category.second
)
def test_upsert_persists_unknown_so_a_user_can_resolve_it_later(
session: Session, category: Category
) -> None:
# arrange / act: a description the classifier couldn't resolve still lands
# -- per ADR-0002 §5 / ADR-0003 §Decision, so a future user override can
# upgrade it to a real value.
repo = LandlordOverridesRepository[Enum](session, category.row_type)
repo.upsert_all(
portfolio_id=1,
descriptions_to_values={"unparseable nonsense": category.unknown},
)
session.commit()
# assert: the row exists with value=UNKNOWN, source=classifier.
row = _select_row(
session, category.row_type, portfolio_id=1, description="unparseable nonsense"
)
assert getattr(row, "value") is category.unknown
assert getattr(row, "source") == OverrideSource.CLASSIFIER
def test_upsert_all_with_empty_mapping_is_a_no_op(
session: Session, category: Category
) -> None:
# arrange / act
repo = LandlordOverridesRepository[Enum](session, category.row_type)
repo.upsert_all(portfolio_id=1, descriptions_to_values={})
session.commit()
# assert: nothing was inserted.
rows = session.exec(select(category.row_type)).all()
assert rows == []

View file

@ -3,11 +3,11 @@ from collections.abc import Iterator
import pytest
from moto import mock_aws
from domain.addresses.user_address import UserAddress
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
from domain.postcode import Postcode
from infrastructure.csv_s3_client import CsvS3Client
from repositories.user_address.user_address_csv_s3_repository import (
UserAddressCsvS3Repository,
from infrastructure.s3.csv_s3_client import CsvS3Client
from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import (
UnstandardisedAddressListCsvS3Repository,
)
from tests.infrastructure import make_boto_client
@ -15,22 +15,22 @@ BUCKET = "user-address-bucket"
@pytest.fixture
def repo() -> Iterator[UserAddressCsvS3Repository]:
def repo() -> Iterator[UnstandardisedAddressListCsvS3Repository]:
with mock_aws():
boto_client = make_boto_client("s3")
boto_client.create_bucket(Bucket=BUCKET)
csv_client = CsvS3Client(boto_client, BUCKET)
yield UserAddressCsvS3Repository(csv_client, BUCKET)
yield UnstandardisedAddressListCsvS3Repository(csv_client, BUCKET)
def _upload_csv(
repo: UserAddressCsvS3Repository, rows: list[dict[str, str]], key: str
repo: UnstandardisedAddressListCsvS3Repository, rows: list[dict[str, str]], key: str
) -> str:
return repo._csv_client.save_rows(rows, key) # pyright: ignore[reportPrivateUsage]
def test_load_batch_parses_address_postcode_and_reference(
repo: UserAddressCsvS3Repository,
repo: UnstandardisedAddressListCsvS3Repository,
) -> None:
# arrange
rows = [
@ -50,13 +50,13 @@ def test_load_batch_parses_address_postcode_and_reference(
# assert
assert len(addresses) == 1
address = addresses[0]
assert address.user_address == "1 High Street, Flat 2, Townville"
assert address.address == "1 High Street, Flat 2, Townville"
assert address.postcode == Postcode("SW1A1AA")
assert address.internal_reference == "REF-001"
assert address.org_reference == "REF-001"
def test_load_batch_uses_only_address_1_when_others_missing(
repo: UserAddressCsvS3Repository,
repo: UnstandardisedAddressListCsvS3Repository,
) -> None:
# arrange
rows = [
@ -75,13 +75,13 @@ def test_load_batch_uses_only_address_1_when_others_missing(
# assert
assert len(addresses) == 1
assert addresses[0].user_address == "10 Cardiff Road"
assert addresses[0].address == "10 Cardiff Road"
assert addresses[0].postcode == Postcode("CF101AA")
assert addresses[0].internal_reference == "REF-002"
assert addresses[0].org_reference == "REF-002"
def test_load_batch_handles_missing_internal_reference(
repo: UserAddressCsvS3Repository,
repo: UnstandardisedAddressListCsvS3Repository,
) -> None:
# arrange
rows = [
@ -100,16 +100,16 @@ def test_load_batch_handles_missing_internal_reference(
# assert
assert len(addresses) == 1
assert addresses[0].user_address == "5 Park Lane"
assert addresses[0].address == "5 Park Lane"
assert addresses[0].postcode == Postcode("M11AA")
assert addresses[0].internal_reference is None
assert addresses[0].org_reference is None
def test_load_batch_captures_full_source_row(
repo: UserAddressCsvS3Repository,
repo: UnstandardisedAddressListCsvS3Repository,
) -> None:
# A raw EPC-export-shaped row: the splitter must preserve every column,
# not just the ones it parses into UserAddress fields.
# not just the ones it parses into UnstandardisedAddress fields.
# arrange
row = {
"Asset Reference": "511",
@ -124,11 +124,11 @@ def test_load_batch_captures_full_source_row(
addresses = repo.load_batch(uri)
# assert
assert addresses[0].source_row == row
assert addresses[0].additional_info == row
def test_load_batch_raises_when_postcode_column_absent(
repo: UserAddressCsvS3Repository,
repo: UnstandardisedAddressListCsvS3Repository,
) -> None:
# arrange
rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}]
@ -140,7 +140,7 @@ def test_load_batch_raises_when_postcode_column_absent(
def test_save_batch_passes_through_all_columns_and_appends_postcode_clean(
repo: UserAddressCsvS3Repository,
repo: UnstandardisedAddressListCsvS3Repository,
) -> None:
# arrange
row = {
@ -154,7 +154,9 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean(
# act
saved_uri = repo.save_batch(addresses, "tasks/passthrough")
saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage]
saved_rows = repo._csv_client.read_rows(
saved_uri
) # pyright: ignore[reportPrivateUsage]
# assert
assert len(saved_rows) == 1
@ -167,16 +169,21 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean(
def test_save_batch_returns_uri_under_path_prefix(
repo: UserAddressCsvS3Repository,
repo: UnstandardisedAddressListCsvS3Repository,
) -> None:
# arrange
addresses = [
UserAddress(
user_address="1 High Street",
postcode=Postcode("SW1A 1AA"),
source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
),
]
addresses = AddressList(
[
UnstandardisedAddress(
address="1 High Street",
postcode=Postcode("SW1A 1AA"),
additional_info={
"Address 1": "1 High Street",
"postcode": "SW1A 1AA",
},
),
]
)
# act
uri = repo.save_batch(addresses, "tasks/abc/batches")
@ -187,7 +194,7 @@ def test_save_batch_returns_uri_under_path_prefix(
def test_save_then_reload_round_trip_preserves_columns(
repo: UserAddressCsvS3Repository,
repo: UnstandardisedAddressListCsvS3Repository,
) -> None:
# arrange
rows = [
@ -207,7 +214,9 @@ def test_save_then_reload_round_trip_preserves_columns(
# act
saved_uri = repo.save_batch(addresses, "tasks/round-trip")
saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage]
saved_rows = repo._csv_client.read_rows(
saved_uri
) # pyright: ignore[reportPrivateUsage]
# assert
# Original columns come back verbatim; postcode_clean is the only addition.
@ -218,16 +227,21 @@ def test_save_then_reload_round_trip_preserves_columns(
def test_save_batch_uses_unique_filename_per_call(
repo: UserAddressCsvS3Repository,
repo: UnstandardisedAddressListCsvS3Repository,
) -> None:
# arrange
addresses = [
UserAddress(
user_address="1 High Street",
postcode=Postcode("SW1A 1AA"),
source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
),
]
addresses = AddressList(
[
UnstandardisedAddress(
address="1 High Street",
postcode=Postcode("SW1A 1AA"),
additional_info={
"Address 1": "1 High Street",
"postcode": "SW1A 1AA",
},
),
]
)
# act
uri_1 = repo.save_batch(addresses, "tasks/uniqueness")

View file

@ -0,0 +1,267 @@
"""Static packaging linter for Lambda container images.
Every Lambda here ships as a Docker image that copies a *subset* of the repo
(``COPY utils/ utils/``, ``COPY backend/ backend/``, ...) and then runs a
handler via ``CMD ["<module>.handler"]``. If the handler's import graph reaches
a top-level package the Dockerfile forgot to ``COPY``, the function dies at cold
start with ``Runtime.ImportModuleError: No module named '<pkg>'`` but only
*inside the image*. In the dev/test tree every package is present, so a plain
``import`` test can't see the gap. This is exactly how ``No module named
'domain'`` reached a deployed address2UPRN.
The RIE smoke tests (.github/workflows/_smoke_test_lambda.yml) catch this too,
but only by building the full image (minutes) and only for hand-listed services.
This test catches the same class of bug in milliseconds, locally, for *every*
handler Dockerfile by statically computing each handler's import-time module
graph and asserting every repo file it reaches is copied into the image.
Scope: import-time (module-level) imports only the ones that run at Lambda
init, which is what ImportModuleError is about. Imports inside function bodies
and under ``if TYPE_CHECKING:`` are deliberately ignored. Third-party / stdlib
imports are out of scope (that's requirements.txt's job, covered by the RIE
smoke test actually installing and importing).
"""
from __future__ import annotations
import ast
import json
import re
from pathlib import Path
from typing import Optional
import pytest
REPO_ROOT = Path(__file__).resolve().parents[1]
# Dockerfiles that are not Lambda handlers (test harness, dev containers).
_SKIP_DOCKERFILES = {"Dockerfile.test", "Dockerfile.test.dockerignore"}
_SKIP_PARTS = {".git", "node_modules", ".devcontainer"}
def _toplevel_names() -> set[str]:
"""Top-level repo packages/modules — the namespace handler imports resolve
against (imports are absolute: ``domain.x``, ``backend.y``)."""
names: set[str] = set()
for p in REPO_ROOT.iterdir():
if p.name.startswith(".") or p.name == "__pycache__":
continue
if p.is_dir():
names.add(p.name)
elif p.suffix == ".py":
names.add(p.stem)
return names
_TOP = _toplevel_names()
def _is_type_checking(test: ast.expr) -> bool:
if isinstance(test, ast.Name):
return test.id == "TYPE_CHECKING"
if isinstance(test, ast.Attribute):
return test.attr == "TYPE_CHECKING"
return False
def _import_time_imports(path: Path) -> list[str]:
"""Absolute module names imported when ``path`` is imported (i.e. at Lambda
init). Descends into module-level if/try/with and class bodies, but not into
function bodies (lazy) or ``if TYPE_CHECKING:`` blocks (never executed)."""
try:
tree = ast.parse(path.read_text(encoding="utf-8"), str(path))
except (SyntaxError, UnicodeDecodeError):
return []
out: list[str] = []
def visit(stmts: list[ast.stmt]) -> None:
for node in stmts:
if isinstance(node, ast.Import):
out.extend(alias.name for alias in node.names)
elif isinstance(node, ast.ImportFrom):
if not node.level and node.module: # absolute imports only
out.append(node.module)
elif isinstance(node, ast.If):
if _is_type_checking(node.test):
continue
visit(node.body)
visit(node.orelse)
elif isinstance(node, ast.Try):
visit(node.body)
visit(node.orelse)
visit(node.finalbody)
for handler in node.handlers:
visit(handler.body)
elif isinstance(node, ast.With):
visit(node.body)
elif isinstance(node, ast.ClassDef):
visit(node.body)
# FunctionDef / AsyncFunctionDef bodies are intentionally skipped.
visit(tree.body)
return out
def _module_to_file(module: str) -> Optional[Path]:
"""Resolve a dotted module to its repo source file (``foo.bar`` ->
``foo/bar.py`` or ``foo/bar/__init__.py``)."""
base = REPO_ROOT.joinpath(*module.split("."))
py = base.with_suffix(".py")
if py.is_file():
return py
init = base / "__init__.py"
if init.is_file():
return init
return None
def _import_closure(start: Path) -> dict[Path, Optional[Path]]:
"""Repo files reachable from ``start`` via import-time imports, mapped to the
first file that imported each (for blame in failure messages)."""
reached: dict[Path, Optional[Path]] = {}
stack: list[tuple[Path, Optional[Path]]] = [(start, None)]
while stack:
path, importer = stack.pop()
if path in reached:
continue
reached[path] = importer
for module in _import_time_imports(path):
if module.split(".")[0] not in _TOP:
continue # stdlib / third-party — not our concern here
target = _module_to_file(module)
if target is not None and target not in reached:
stack.append((target, path))
return reached
def _norm(path_token: str) -> str:
return path_token.lstrip("./").rstrip("/")
def _parse_handler_spec(dockerfile_text: str) -> Optional[str]:
"""The ``<module>.handler`` string from the ``CMD`` line, or None if this
isn't a Lambda handler image."""
match = re.search(r"^CMD\s+(\[.*\]|.+)$", dockerfile_text, re.MULTILINE)
if not match:
return None
raw = match.group(1).strip()
try:
parsed = json.loads(raw)
spec = parsed[0] if isinstance(parsed, list) and parsed else raw
except json.JSONDecodeError:
spec = raw.strip('"')
return spec if isinstance(spec, str) and spec.endswith(".handler") else None
def _parse_copies(dockerfile_text: str) -> list[tuple[list[str], str]]:
"""``COPY`` instructions as (sources, dest), dropping ``--flag`` tokens."""
copies: list[tuple[list[str], str]] = []
for match in re.finditer(r"^COPY\s+(.+)$", dockerfile_text, re.MULTILINE):
tokens = [t for t in match.group(1).split() if not t.startswith("--")]
if len(tokens) < 2:
continue
*sources, dest = tokens
copies.append((sources, dest))
return copies
def _resolve_handler_file(
spec: str, copies: list[tuple[list[str], str]]
) -> Optional[Path]:
"""Map a handler spec to its repo source file.
Handles both in-place layouts (``backend.foo.handler`` -> ``backend/foo/
handler.py``, present via ``COPY backend/ backend/``) and root-placed
handlers (``main.handler`` where a ``COPY <src> /var/task/main.py`` or
``COPY <src>/main.py .`` puts the file at the image root)."""
module_path, _func = spec.rsplit(".", 1)
direct = REPO_ROOT / (module_path.replace(".", "/") + ".py")
if direct.is_file():
return direct
# Root-placed module: find the COPY whose destination basename matches.
wanted = module_path.split("/")[-1] + ".py"
for sources, dest in copies:
dest_norm = _norm(dest)
dest_is_named_file = Path(dest_norm).name == wanted
dest_is_dir = dest_norm in ("", module_path.split("/")[-1]) or dest.endswith("/")
for src in sources:
src_path = REPO_ROOT / _norm(src)
if not src_path.is_file():
continue
if dest_is_named_file or (dest_is_dir and src_path.name == wanted):
return src_path
return None
def _is_copied(rel_path: str, copies: list[tuple[list[str], str]]) -> bool:
"""Whether a repo-relative file path lands in the image via some COPY."""
rel_path = _norm(rel_path)
for sources, _dest in copies:
for src in sources:
src_norm = _norm(src)
if src_norm == "" or src_norm == rel_path or rel_path.startswith(src_norm + "/"):
return True
return False
def _discover_handler_dockerfiles() -> list[Path]:
found: list[Path] = []
for path in REPO_ROOT.rglob("*Dockerfile*"):
if path.name in _SKIP_DOCKERFILES:
continue
if any(part in _SKIP_PARTS for part in path.relative_to(REPO_ROOT).parts):
continue
try:
text = path.read_text(encoding="utf-8")
except OSError:
continue
if _parse_handler_spec(text):
found.append(path)
return sorted(found)
_HANDLER_DOCKERFILES = _discover_handler_dockerfiles()
def test_handler_dockerfiles_discovered() -> None:
"""Guard against the discovery silently finding nothing (e.g. a refactor
that renames Dockerfiles), which would make every check below vacuous."""
assert _HANDLER_DOCKERFILES, "no Lambda handler Dockerfiles found under repo root"
@pytest.mark.parametrize(
"dockerfile",
_HANDLER_DOCKERFILES,
ids=[str(p.relative_to(REPO_ROOT)) for p in _HANDLER_DOCKERFILES],
)
def test_lambda_image_copies_full_import_closure(dockerfile: Path) -> None:
"""Every repo file the handler imports at init must be COPYed into the image."""
text = dockerfile.read_text(encoding="utf-8")
spec = _parse_handler_spec(text)
assert spec is not None # discovery guaranteed this
copies = _parse_copies(text)
handler_file = _resolve_handler_file(spec, copies)
assert handler_file is not None, (
f"{dockerfile.relative_to(REPO_ROOT)}: could not locate the source file "
f"for CMD handler {spec!r}. Update _resolve_handler_file if this is a new "
f"handler layout."
)
missing: list[str] = []
for reached, importer in _import_closure(handler_file).items():
rel = str(reached.relative_to(REPO_ROOT))
if not _is_copied(rel, copies):
blame = (
str(importer.relative_to(REPO_ROOT)) if importer else "(handler entrypoint)"
)
missing.append(f" - {rel}\n imported by {blame}")
assert not missing, (
f"{dockerfile.relative_to(REPO_ROOT)} runs `{spec}` but does not COPY "
f"{len(missing)} file(s) it imports at init. The Lambda will fail at cold "
f"start with Runtime.ImportModuleError. Add the missing top-level "
f"package(s) as `COPY <pkg>/ <pkg>/`:\n" + "\n".join(sorted(missing))
)