diff --git a/.devcontainer/backend/requirements.txt b/.devcontainer/backend/requirements.txt index 7a879773..2db3710a 100644 --- a/.devcontainer/backend/requirements.txt +++ b/.devcontainer/backend/requirements.txt @@ -27,3 +27,4 @@ pytest-postgresql # Formatting black==26.1.0 boto3-stubs +openai diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 0d702155..70f9eabe 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -92,6 +92,9 @@ on: TF_VAR_magicplan_api_key: required: false + + TF_VAR_openai_api_key: + required: false jobs: deploy: runs-on: ubuntu-latest @@ -163,6 +166,7 @@ jobs: TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }} TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }} TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }} + TF_VAR_openai_api_key: ${{ secrets.TF_VAR_openai_api_key }} run: | ECR_REPO_URL_VAR="" if [[ -n "${{ inputs.ecr_repo }}" ]]; then @@ -213,6 +217,7 @@ jobs: TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }} TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }} TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }} + TF_VAR_openai_api_key: ${{ secrets.TF_VAR_openai_api_key }} run: | EXTRA_VARS="" if [[ -n "${{ inputs.ecr_repo }}" ]]; then diff --git a/.github/workflows/ddd_tests.yml b/.github/workflows/ddd_tests.yml new file mode 100644 index 00000000..85e318d5 --- /dev/null +++ b/.github/workflows/ddd_tests.yml @@ -0,0 +1,32 @@ +name: Run DDD tests + +on: + pull_request: + branches: + - "**" + +# The DDD rewrite (tests/) defines SQLModel table classes that map to the same +# physical tables as the legacy backend models. Both sets share the one global +# SQLModel.metadata, so they cannot be imported into the same pytest process — +# hence this runs as a separate workflow from unit_tests.yml until the legacy +# models are retired. It also covers the Lambda packaging linter +# (tests/test_lambda_packaging.py). Its DB is spawned in-process by +# pytest-postgresql, so no `services: postgres` and no env secrets are required, +# and it runs independently of (and in parallel with) the legacy unit tests. +jobs: + ddd-tests: + name: DDD tests (Docker) + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Build test image + run: docker build -f Dockerfile.test -t model-test . + + - name: Run DDD tests + run: | + docker run --rm \ + --network host \ + model-test pytest -vv tests/ diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 7f2eb890..fc999bc0 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -203,6 +203,47 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + # ============================================================ + # Build Landlord Description Overrides image and Push + # ============================================================ + landlordDescriptionOverrides_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: landlord_description_overrides-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: applications/landlord_description_overrides/Dockerfile + build_context: . + build_args: | + DEV_DB_HOST=$DEV_DB_HOST + DEV_DB_PORT=$DEV_DB_PORT + DEV_DB_NAME=$DEV_DB_NAME + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} + + # ============================================================ + # Deploy Landlord Description Overrides Lambda + # ============================================================ + landlordDescriptionOverrides_lambda: + needs: [landlordDescriptionOverrides_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: landlordDescriptionOverrides + lambda_path: deployment/terraform/lambda/landlordDescriptionOverrides + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: landlord_description_overrides-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.landlordDescriptionOverrides_image.outputs.image_digest }} + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + TF_VAR_openai_api_key: ${{ secrets.DEV_OPENAI_API_KEY }} + # ============================================================ # Build Bulk Address2UPRN Combiner image and Push # ============================================================ diff --git a/.github/workflows/lambda_smoke_tests.yml b/.github/workflows/lambda_smoke_tests.yml index b562f91e..44288821 100644 --- a/.github/workflows/lambda_smoke_tests.yml +++ b/.github/workflows/lambda_smoke_tests.yml @@ -43,6 +43,16 @@ jobs: build_context: . service_name: postcode-splitter-ddd + # ============================================================ + # Landlord Description Overrides + # ============================================================ + landlord_description_overrides_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: applications/landlord_description_overrides/Dockerfile + build_context: . + service_name: landlord-description-overrides + # ============================================================ # Bulk Address2UPRN Combiner # ============================================================ diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 15d4cfe9..12d836d0 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -61,14 +61,7 @@ jobs: -e DB_PORT=5432 \ model-test pytest -vv -m 'not integration' - # The DDD rewrite (tests/) defines SQLModel table classes that map to the - # same physical tables as the legacy backend models. Both sets share the - # one global SQLModel.metadata, so they cannot be imported into the same - # pytest process. It runs as a separate invocation until the legacy - # models are retired. Its DB is spawned in-process by pytest-postgresql, - # so no DB service or env is required. - - name: Run DDD tests - run: | - docker run --rm \ - --network host \ - model-test pytest -vv tests/ + # The DDD rewrite (tests/) runs in its own workflow (ddd_tests.yml): its + # SQLModel table classes map to the same physical tables as the legacy + # backend models and share the one global SQLModel.metadata, so the two + # suites cannot be imported into the same pytest process. diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md index 66684925..d2fde99a 100644 --- a/UBIQUITOUS_LANGUAGE.md +++ b/UBIQUITOUS_LANGUAGE.md @@ -1,7 +1,84 @@ # Ubiquitous Language -This file has been **superseded by [CONTEXT.md](./CONTEXT.md)**. +Domain terminology glossary for this project. Generated and maintained by the `/ubiquitous-language` Claude Code skill. -The project's domain glossary now lives at the repo root in `CONTEXT.md`, maintained by the `/grill-with-docs` skill (which replaced `/ubiquitous-language`). +Invoke `/ubiquitous-language` in any session to extract new terms from the conversation, flag ambiguities, and update this file with canonical definitions. -If you arrived here from a link in `CLAUDE.md` or older docs, follow the link above. This file is kept only to preserve git history and may be removed once internal references are updated. +--- + +## Energy Performance Certificates + +| Term | Definition | Aliases to avoid | +|------|------------|------------------| +| **EPC** | An Energy Performance Certificate — a government-issued document rating a dwelling's energy efficiency from A (best) to G (worst). | "energy certificate", "energy report" | +| **Certificate Number** | The unique identifier assigned to an EPC by the government registry. | "cert number", "EPC ID" | +| **Registration Date** | The date an EPC was lodged with the government register; used to identify the most recent certificate for a property. | "assessment date", "submission date" | +| **EPC Band** | A single letter A–G representing a property's current or potential energy efficiency rating. | "energy rating", "EPC grade", "EPC score" | +| **Schema Type** | The versioned RdSAP or SAP schema that describes the structure of a certificate's raw data (e.g. `RdSAP-Schema-21.0.1`). | "schema version", "EPC format" | +| **Domestic Certificate** | An EPC issued for a residential dwelling, as opposed to a commercial one. | "residential EPC", "home EPC" | + +## Properties and Addresses + +| Term | Definition | Aliases to avoid | +|------|------------|------------------| +| **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" | +| **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" | +| **Unstandardised Address** | A frozen dataclass (`domain.addresses.unstandardised_address.UnstandardisedAddress`) capturing a single address exactly as a customer supplied it, before any standardisation: a free-text `address` line (intentionally NOT normalised), a canonical `postcode` (a `Postcode` value object, sanitised on construction), an optional `org_reference` (the customer's own identifier for the property), and `additional_info` (the full source row — every column of the customer's upload, preserved verbatim). | "user address", "asset list", "raw address", "landlord address", "Hyde address" | +| **Address List** | A nominal `NewType` over `list[UnstandardisedAddress]` (`domain.addresses.unstandardised_address.AddressList`) — a batch of unstandardised addresses, such as one customer's bulk-onboarding upload or a postcode-grouped sub-batch produced for downstream processing. Being nominal, it is constructed explicitly: `AddressList([...])`. It is the raw *input* to ingestion; the standardised *output* is a **Standardised Asset List**. | "asset list", "Hyde address list", "user addresses" | +| **Standardised Asset List (SAL)** | A customer's property portfolio after ingestion has cleaned and standardised it — each property carrying a canonical field set (UPRN, standardised address, postcode, property type, built form, …). It is the standardised *output* of the pipeline whose raw *input* is an **Address List** of **Unstandardised Addresses**; generated by the `SALOrchestrator`. (Legacy implementation: `asset_list.AssetList` via `load_standardised_asset_list`.) | "address list" (that is the raw input), "asset register", "portfolio list" | +| **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" | + +## Address Matching + +| Term | Definition | Aliases to avoid | +|------|------------|------------------| +| **Lexiscore** | A similarity score in [0, 1] between an unstandardised address and a candidate EPC address; combines token overlap and character-level similarity. | "score", "match score", "similarity" | +| **Lexirank** | Dense rank of candidates sorted by lexiscore descending; rank 1 = best match. | "rank", "position" | +| **UPRN Candidate** | An EPC search result that is a plausible match for a given unstandardised address, before scoring decides the winner. | "match candidate", "result" | +| **Score Threshold** | The minimum lexiscore (currently 0.6) below which no match is returned even if a candidate exists. | "minimum score", "cutoff" | +| **Ambiguous Match** | A matching outcome where two or more candidates share lexirank 1, making it impossible to select a unique winner. | "tie", "draw", "duplicate" | +| **Best Match** | The single UPRN candidate with lexirank 1 that meets or exceeds the score threshold. | "winner", "top result" | + +## API and Integration + +| Term | Definition | Aliases to avoid | +|------|------------|------------------| +| **EPC Search Result** | A lightweight record returned by the government domestic search endpoint — contains address lines, postcode, UPRN, band, and certificate number but not the full certificate data. | "search row", "EPC row", "result" | +| **EPC Property Data** | The fully mapped domain object produced after fetching and parsing a complete EPC certificate. | "EPC data", "certificate data", "parsed EPC" | +| **Old EPC API** | The retired government API (`epc.opendatacommunities.org`) using HTTP Basic auth; decommissioned May 2026. | "legacy API" | +| **New EPC API** | The replacement government API (`api.get-energy-performance-data.communities.gov.uk`) using Bearer token auth. | "new API", "current API" | +| **Bearer Token** | The auth credential required by the new EPC API; stored in the `EPC_AUTH_TOKEN` environment variable. | "API key", "auth token", "secret" | + +## Relationships + +- An **EPC** belongs to exactly one **Dwelling** and has one **Certificate Number**. +- A **Dwelling** may have multiple **EPCs** across time; the one with the most recent **Registration Date** is the current one. +- A **UPRN** identifies a **Dwelling** permanently; it does not change when the property changes owner. +- An **EPC Search Result** is a summary; it points to a full **EPC** via its **Certificate Number**. +- An **Address List** is an ordered batch of **Unstandardised Addresses**; a customer's bulk-onboarding upload arrives as one. +- Ingestion turns an **Address List** (raw input) into a **Standardised Asset List** (standardised output) — the **SAL Orchestrator** drives this. +- **Address Matching** uses an **Unstandardised Address** and **Postcode** to find a **UPRN** by scoring **UPRN Candidates** from an EPC search. +- A **Lexirank** of 1 with no **Ambiguous Match** and a **Lexiscore** ≥ the **Score Threshold** produces a **Best Match**. + +## Example dialogue + +> **Dev:** "We have an unstandardised address and postcode. How do we find the UPRN?" + +> **Domain expert:** "Search the **New EPC API** by **Postcode** — you get back a list of **EPC Search Results** for that area. Each one has an address and a **UPRN**. Score each against the **Unstandardised Address** using the **Lexiscore**. If the top **UPRN Candidate** scores above the **Score Threshold** and there's no **Ambiguous Match**, that's your **Best Match**." + +> **Dev:** "What if two results share the same address line 1?" + +> **Domain expert:** "That's an **Ambiguous Match** — two candidates at **Lexirank** 1. Fall back to scoring on the full address using all address lines joined together. If that still ties, return nothing." + +> **Dev:** "Once we have the best match, do we use the UPRN or fetch the full EPC?" + +> **Domain expert:** "Depends on what you need. The **EPC Search Result** gives you the **EPC Band** and **Certificate Number**. If you need energy efficiency detail, use the **Certificate Number** to fetch the full **EPC Property Data**." + +## Flagged ambiguities + +- **"address"** appears in several senses: the **Unstandardised Address** dataclass (one customer-supplied address before standardisation), its free-text `address` field, and the normalised address lines on an **EPC Search Result**. Always qualify: "unstandardised address" vs "EPC address" or "address line 1". Within `domain/addresses/`, the dataclass is **Unstandardised Address**; in upstream ingestion contexts (CSV columns, SQS payloads) "address" may still mean the bare free-text string. +- **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments. +- **"user_inputed_address"** (and `user_address`) in `backend/address2UPRN/` is legacy naming — a misspelled synonym for what is now the **Unstandardised Address**. That address-matching code has not been renamed; new code should use **Unstandardised Address**. +- **"Hyde address list"** — "Hyde" is the name of one customer, not a domain concept. A domain expert may say "the Hyde address list" because Hyde is the customer in front of them, but the generalised term is **Address List** (and **Unstandardised Address** for a single item). A customer's identity is data — it belongs in `org_reference` or `additional_info`, never in a type or module name. +- **"address list"** vs **"asset list"** — opposite ends of the ingestion pipeline; do not conflate them. An **Address List** is the raw *input* (unstandardised addresses as the customer supplied them); a **Standardised Asset List** is the standardised *output*. The historical `AssetList` dataclass (now **Unstandardised Address**) misnamed the input an "asset list" — that mistake is what the rename corrected. +- **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter. diff --git a/applications/ara_first_run/Dockerfile b/applications/ara_first_run/Dockerfile index 2d3f6515..894f7ee1 100644 --- a/applications/ara_first_run/Dockerfile +++ b/applications/ara_first_run/Dockerfile @@ -25,6 +25,9 @@ COPY infrastructure/ infrastructure/ COPY orchestration/ orchestration/ COPY repositories/ repositories/ COPY utilities/ utilities/ +# domain.property.site_notes -> datatypes.epc.domain.epc_property_data; without +# this the lambda fails at init with "No module named 'datatypes'". +COPY datatypes/ datatypes/ COPY applications/ applications/ # Place the handler at the Lambda task root so the runtime can resolve diff --git a/applications/landlord_description_overrides/Dockerfile b/applications/landlord_description_overrides/Dockerfile new file mode 100644 index 00000000..c2d4faf7 --- /dev/null +++ b/applications/landlord_description_overrides/Dockerfile @@ -0,0 +1,34 @@ +FROM public.ecr.aws/lambda/python:3.11 + +# Postgres host/port/database are baked into the image at build time from +# the deploy workflow's --build-arg values (GitHub Actions DEV_DB_* secrets), +# mirroring backend/postcode_splitter/handler/Dockerfile. They map onto the +# POSTGRES_* names PostgresConfig.from_env reads. Username/password are NOT +# baked in -- Terraform injects those as Lambda env vars from Secrets Manager. +ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + +ENV POSTGRES_HOST=${DEV_DB_HOST} +ENV POSTGRES_PORT=${DEV_DB_PORT} +ENV POSTGRES_DATABASE=${DEV_DB_NAME} + +WORKDIR /var/task + +COPY applications/landlord_description_overrides/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the layered source the handler imports from. The new splitter pulls +# only DDD-shaped packages — no pandas, no legacy backend/. +COPY domain/ domain/ +COPY infrastructure/ infrastructure/ +COPY orchestration/ orchestration/ +COPY repositories/ repositories/ +COPY utilities/ utilities/ +COPY applications/ applications/ + +# Place the handler at the Lambda task root so the runtime can resolve +# ``main.handler`` without an extra package prefix. +COPY applications/landlord_description_overrides/handler.py /var/task/main.py + +CMD ["main.handler"] diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py new file mode 100644 index 00000000..5e689a45 --- /dev/null +++ b/applications/landlord_description_overrides/handler.py @@ -0,0 +1,169 @@ +import logging +import os +from typing import Any + +import boto3 + +from applications.landlord_description_overrides.landlord_description_overrides_trigger_body import ( + LandlordDescriptionOverridesTriggerBody, +) +from domain.epc.built_form_type import BuiltFormType +from domain.epc.property_type import PropertyType +from domain.epc.roof_type import RoofType +from domain.epc.wall_type import WallType +from domain.epc.wall_type_construction_dates import ( + wall_type_construction_date_prompt_hint, +) +from infrastructure.chatgpt.chatgpt import ChatGPT +from infrastructure.chatgpt.chatgpt_column_classifier import ChatGptColumnClassifier +from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import ( + LandlordOverridesRepository, +) +from infrastructure.postgres.config import PostgresConfig +from infrastructure.postgres.engine import commit_scope, make_engine, make_session +from infrastructure.postgres.landlord_built_form_type_override_table import ( + LandlordBuiltFormTypeOverrideRow, +) +from infrastructure.postgres.landlord_property_type_override_table import ( + LandlordPropertyTypeOverrideRow, +) +from infrastructure.postgres.landlord_roof_type_override_table import ( + LandlordRoofTypeOverrideRow, +) +from infrastructure.postgres.landlord_wall_type_override_table import ( + LandlordWallTypeOverrideRow, +) +from infrastructure.s3.csv_s3_client import CsvS3Client +from infrastructure.s3.s3_uri import parse_s3_uri +from orchestration.classifiable_column import ClassifiableColumn +from orchestration.landlord_description_overrides_orchestrator import ( + LandlordDescriptionOverridesOrchestrator, +) +from orchestration.task_orchestrator import TaskOrchestrator +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, +) +from utilities.aws_lambda.subtask_handler import subtask_handler + +logger = logging.getLogger(__name__) + + +def _build_columns( + column_mapping: dict[str, str], chat_gpt: ChatGPT, session: Any +) -> list[ClassifiableColumn[Any]]: + """One ClassifiableColumn per mapped category. + + ``column_mapping`` is ``{category -> source CSV header}``. One header may + feed several categories -- e.g. ``"Property Type"`` -> property_type and + built_form_type -- which falls out naturally because each is a separate + entry. Unknown categories are skipped. + """ + factories = { + "property_type": lambda src: ClassifiableColumn( + name="property_type", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, PropertyType, PropertyType.UNKNOWN + ), + repo=LandlordOverridesRepository[PropertyType]( + session, LandlordPropertyTypeOverrideRow + ), + ), + "built_form_type": lambda src: ClassifiableColumn( + name="built_form_type", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, BuiltFormType, BuiltFormType.UNKNOWN + ), + repo=LandlordOverridesRepository[BuiltFormType]( + session, LandlordBuiltFormTypeOverrideRow + ), + ), + "wall_type": lambda src: ClassifiableColumn( + name="wall_type", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, + WallType, + WallType.UNKNOWN, + extra_instructions=wall_type_construction_date_prompt_hint(), + ), + repo=LandlordOverridesRepository[WallType]( + session, LandlordWallTypeOverrideRow + ), + ), + "roof_type": lambda src: ClassifiableColumn( + name="roof_type", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, RoofType, RoofType.UNKNOWN + ), + repo=LandlordOverridesRepository[RoofType]( + session, LandlordRoofTypeOverrideRow + ), + ), + } + + columns: list[ClassifiableColumn[Any]] = [] + for category, source_column in column_mapping.items(): + factory = factories.get(category) + if factory is None: + logger.warning("Unknown classifier category %r; skipping.", category) + continue + columns.append(factory(source_column)) + return columns + + +@subtask_handler() +def handler( + body: dict[str, Any], context: Any, task_orchestrator: TaskOrchestrator +) -> dict[str, int]: + trigger = LandlordDescriptionOverridesTriggerBody.model_validate(body) + + # The classifier reads a dedicated CSV of the classifier columns (raw + # landlord headers preserved), converted from the upload by the frontend, so + # the S3 bucket comes from the trigger URI rather than a fixed env var. + bucket, _key = parse_s3_uri(trigger.s3_uri) + + # boto3.client is overloaded per-service in the installed stubs; cast to Any + # so the strict-mode checker treats it as opaque. + boto3_client: Any = ( + boto3.client + ) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + boto_s3: Any = boto3_client("s3") + + csv_client = CsvS3Client(boto_s3, bucket) + unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository( + csv_client, bucket + ) + + # Raw rows, not load_batch: the classifier CSV carries the description + # columns but not the canonical address/postcode columns load_batch requires. + rows = csv_client.read_rows(trigger.s3_uri) + + engine = make_engine(PostgresConfig.from_env(os.environ)) + # The session is built up front (SQLModel sessions are lazy, so no + # connection is checked out yet) and owned by this handler. Classification + # runs first and calls ChatGPT, which is slow; we deliberately keep no + # transaction open across it. Only the persistence below -- inside + # ``commit_scope`` -- holds a connection. + session = make_session(engine) + try: + chat_gpt = ChatGPT() + columns = _build_columns(trigger.column_mapping, chat_gpt, session) + orchestrator = LandlordDescriptionOverridesOrchestrator( + unstandardised_address_repo=unstandardised_address_repo, + columns=columns, + ) + + classified = orchestrator.classify_from_rows(rows) + + with commit_scope(session): + orchestrator.persist(classified, portfolio_id=trigger.portfolio_id) + finally: + session.close() + + counts = {name: len(mapping) for name, mapping in classified.items()} + for name, n in counts.items(): + logger.info("Classified %d descriptions for column %r.", n, name) + return counts diff --git a/applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py b/applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py new file mode 100644 index 00000000..0ca80ec3 --- /dev/null +++ b/applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py @@ -0,0 +1,19 @@ +from uuid import UUID + +from pydantic import BaseModel, ConfigDict + + +class LandlordDescriptionOverridesTriggerBody(BaseModel): + model_config = ConfigDict(extra="allow") + + task_id: UUID + sub_task_id: UUID + s3_uri: str + # ``portfolio_id`` is ``bigint`` in the ``landlord_*_overrides`` schema -- + # Python ``int`` is unbounded so the Pydantic side stays simple; the + # SQLModel row class pins the storage to ``BigInteger``. + portfolio_id: int + # category -> source CSV header (the classifier subset of the upload + # mapping). Defaulted so a malformed/empty message classifies nothing + # rather than failing validation. + column_mapping: dict[str, str] = {} diff --git a/applications/landlord_description_overrides/local_handler/.env.local.example b/applications/landlord_description_overrides/local_handler/.env.local.example new file mode 100644 index 00000000..a78a797f --- /dev/null +++ b/applications/landlord_description_overrides/local_handler/.env.local.example @@ -0,0 +1,5 @@ +POSTGRES_HOST= +POSTGRES_PORT=5432 +POSTGRES_USERNAME= +POSTGRES_PASSWORD= +POSTGRES_DATABASE= \ No newline at end of file diff --git a/applications/landlord_description_overrides/local_handler/docker-compose.yml b/applications/landlord_description_overrides/local_handler/docker-compose.yml new file mode 100644 index 00000000..6ead2e33 --- /dev/null +++ b/applications/landlord_description_overrides/local_handler/docker-compose.yml @@ -0,0 +1,9 @@ +services: + landlord_overrides: + build: + context: ../../../ + dockerfile: applications/landlord_description_overrides/Dockerfile + ports: + - "9002:8080" + env_file: + - .env.local diff --git a/applications/landlord_description_overrides/local_handler/invoke_local_lambda.py b/applications/landlord_description_overrides/local_handler/invoke_local_lambda.py new file mode 100755 index 00000000..4514495f --- /dev/null +++ b/applications/landlord_description_overrides/local_handler/invoke_local_lambda.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +import json +import requests + +HOST = "localhost" +PORT = "9002" + +LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" + +payload = {"Records": [{"body": json.dumps({})}]} + +response = requests.post(LAMBDA_URL, json=payload) + +print("Status code:", response.status_code) +print("Response:") +print(response.text) diff --git a/applications/landlord_description_overrides/local_handler/run_local.sh b/applications/landlord_description_overrides/local_handler/run_local.sh new file mode 100755 index 00000000..345b60ee --- /dev/null +++ b/applications/landlord_description_overrides/local_handler/run_local.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail +cd "$(dirname "$0")" + +if [ ! -f .env.local ]; then + cp .env.local.example .env.local + echo "Created .env.local from the template — fill it in, then re-run." >&2 + exit 1 +fi + +docker compose build --no-cache +docker compose up --force-recreate diff --git a/applications/landlord_description_overrides/requirements.txt b/applications/landlord_description_overrides/requirements.txt new file mode 100644 index 00000000..590c4667 --- /dev/null +++ b/applications/landlord_description_overrides/requirements.txt @@ -0,0 +1,5 @@ +boto3 +pydantic +sqlmodel +psycopg2-binary +openai==1.93.0 diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py index 9fb3ca6a..e34a6af3 100644 --- a/applications/postcode_splitter/handler.py +++ b/applications/postcode_splitter/handler.py @@ -9,11 +9,11 @@ from applications.postcode_splitter.postcode_splitter_trigger_body import ( PostcodeSplitterTriggerBody, ) from infrastructure.address2uprn_queue_client import Address2UprnQueueClient -from infrastructure.csv_s3_client import CsvS3Client +from infrastructure.s3.csv_s3_client import CsvS3Client from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator from orchestration.task_orchestrator import TaskOrchestrator -from repositories.user_address.user_address_csv_s3_repository import ( - UserAddressCsvS3Repository, +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) from utilities.aws_lambda.subtask_handler import subtask_handler @@ -29,17 +29,19 @@ def handler( # boto3.client is overloaded per-service in the installed stubs; cast # to Any so the strict-mode checker treats it as opaque. - boto3_client: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + boto3_client: Any = ( + boto3.client + ) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] boto_s3: Any = boto3_client("s3") boto_sqs: Any = boto3_client("sqs") csv_client = CsvS3Client(boto_s3, bucket) - user_address_repo = UserAddressCsvS3Repository(csv_client, bucket) + unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository(csv_client, bucket) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - user_address_repo=user_address_repo, + unstandardised_address_repo=unstandardised_address_repo, queue_client=queue_client, ) diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile index 7d174152..017f260f 100644 --- a/backend/address2UPRN/handler/Dockerfile +++ b/backend/address2UPRN/handler/Dockerfile @@ -32,6 +32,12 @@ RUN pip install --no-cache-dir -r requirements.txt COPY utils/ utils/ COPY backend/ backend/ COPY datatypes/ datatypes/ +# main.py imports infrastructure.epc_client.epc_client_service (EpcClientService); +# without this the lambda fails at init with "No module named 'infrastructure'". +COPY infrastructure/ infrastructure/ +# EpcClientService -> datatypes.epc.domain.mapper -> domain.sap10_calculator; +# without this the lambda fails at init with "No module named 'domain'". +COPY domain/ domain/ # Copy the handler COPY backend/address2UPRN/main.py . diff --git a/backend/app/bulk_uploads/router.py b/backend/app/bulk_uploads/router.py index 9928b456..c050b18c 100644 --- a/backend/app/bulk_uploads/router.py +++ b/backend/app/bulk_uploads/router.py @@ -13,6 +13,7 @@ from backend.app.bulk_uploads.schema import ( CombinedResultsResponse, CombinerTriggerRequest, FlagsSummary, + LandlordOverridesTriggerRequest, PostcodeSplitterTriggerRequest, ) from backend.app.bulk_uploads.scoring import score_bucket @@ -92,6 +93,26 @@ async def trigger_combiner(req: CombinerTriggerRequest): } +@router.post("/trigger-landlord-overrides", status_code=202) +async def trigger_landlord_overrides(req: LandlordOverridesTriggerRequest): + settings = get_settings() + + try: + sqs = boto3.client("sqs", settings.AWS_DEFAULT_REGION) + response = sqs.send_message( + QueueUrl=settings.LANDLORD_OVERRIDES_SQS_URL, + MessageBody=req.model_dump_json(), + ) + except Exception as e: + raise HTTPException(status_code=500, detail=f"SQS error: {e}") + + return { + "task_id": req.task_id, + "sub_task_id": req.sub_task_id, + "sqs_message_id": response.get("MessageId"), + } + + @router.get("/{task_id}/combined-results", response_model=CombinedResultsResponse) async def get_combined_results( task_id: UUID, diff --git a/backend/app/bulk_uploads/schema.py b/backend/app/bulk_uploads/schema.py index ca3b39ea..af797cac 100644 --- a/backend/app/bulk_uploads/schema.py +++ b/backend/app/bulk_uploads/schema.py @@ -14,6 +14,15 @@ class CombinerTriggerRequest(BaseModel): sub_task_id: str +class LandlordOverridesTriggerRequest(BaseModel): + task_id: str + sub_task_id: str + s3_uri: str + portfolio_id: int + # category -> source CSV header (the classifier subset of the upload mapping) + column_mapping: dict[str, str] + + class FlagsSummary(BaseModel): duplicates: int missing: int diff --git a/backend/app/config.py b/backend/app/config.py index fcfb6d5b..f969518d 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -42,6 +42,7 @@ class Settings(BaseSettings): MAGICPLAN_SQS_URL: str = "changeme" POSTCODE_SPLITTER_SQS_URL: str = "changeme" COMBINER_SQS_URL: str = "changeme" + LANDLORD_OVERRIDES_SQS_URL: str = "changeme" # Third parties EPC_AUTH_TOKEN: str = "changeme" diff --git a/backend/docker/engine.Dockerfile b/backend/docker/engine.Dockerfile index 07f2d859..58ccd481 100644 --- a/backend/docker/engine.Dockerfile +++ b/backend/docker/engine.Dockerfile @@ -40,6 +40,9 @@ COPY --from=build-image /usr/local/lib/python3.11/site-packages/ /usr/local/lib/ COPY ./backend/ ./backend COPY ./recommendations/ ./recommendations COPY ./utils/ ./utils/ +# engine.py -> backend.apis.GoogleSolarApi -> infrastructure.solar; without this +# the lambda fails at init with "No module named 'infrastructure'". +COPY ./infrastructure/ ./infrastructure/ COPY ./etl/epc/ ./etl/epc/ COPY ./etl/epc_clean/ ./etl/epc_clean/ COPY ./etl/bill_savings/ ./etl/bill_savings/ diff --git a/backend/ecmk_fetcher/handler/Dockerfile b/backend/ecmk_fetcher/handler/Dockerfile index fa2126fd..aebcd7aa 100644 --- a/backend/ecmk_fetcher/handler/Dockerfile +++ b/backend/ecmk_fetcher/handler/Dockerfile @@ -13,6 +13,12 @@ RUN pip install --no-cache-dir -r requirements.txt COPY utils/ utils/ COPY backend/ backend/ COPY datatypes/ datatypes/ +# handler -> ecmk_service -> documents_parser.parser -> datatypes.epc.domain.mapper +# -> domain.sap10_calculator, and -> db_writer -> epc_property model +# -> infrastructure.postgres. Without these the lambda fails at init with +# "No module named 'domain'" / "'infrastructure'". +COPY domain/ domain/ +COPY infrastructure/ infrastructure/ # Local lambda entrypoint # ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile index f8c2008c..575b8565 100644 --- a/backend/pashub_fetcher/handler/Dockerfile +++ b/backend/pashub_fetcher/handler/Dockerfile @@ -10,6 +10,12 @@ WORKDIR /var/task COPY utils/ utils/ COPY backend/ backend/ COPY datatypes/ datatypes/ +# handler -> pashub_service -> documents_parser.parser -> datatypes.epc.domain.mapper +# -> domain.sap10_calculator, and -> db_writer -> epc_property model +# -> infrastructure.postgres. Without these the lambda fails at init with +# "No module named 'domain'" / "'infrastructure'". +COPY domain/ domain/ +COPY infrastructure/ infrastructure/ COPY backend/pashub_fetcher/handler/requirements.txt . RUN pip install --no-cache-dir -r requirements.txt diff --git a/backend/tests/test_search_epc.py b/backend/tests/test_search_epc.py index a0fef7e9..aaf5d680 100644 --- a/backend/tests/test_search_epc.py +++ b/backend/tests/test_search_epc.py @@ -14,55 +14,11 @@ class TestSearchEpcIntegration: def epc_auth_token(self): return os.getenv("EPC_AUTH_TOKEN") - @pytest.mark.parametrize( - "address, postcode, uprn, skip_os, lmk_key, n_old_epcs", - [ - # Test case 1: Valid address and postcode, skipping OS - # In this case, the property is an individual flat but the uprn associated to the - # EPC is for the building as a whole, possibly because there was a conversion of sorts - ("Garden Flat, 48 Bedminster Parade", "BS3 4HS", 308249, True, - "260907a5431fa073d193cc6bbec51fbf1ba9a61845ab2503f85aa19ce3ed6afd", 1), - - # Test case 2: Another valid address and postcode - # In this case, the newest EPC, does not have a uprn associated to it. If we did a search by - # uprn, we would get an old EPC - ("Flat 8, Hainton House", "DN32 9AQ", "", True, - "bd1149a20a73397184f07a9955f872424826e70f4870c058d71be887766ee1f8", 2), - # Test case 3: When we make a request to the API for this property, we get back results for - # flats 1, 2 and 3. We have some logic to handle the response so that we get back flat 1 - ("Flat 1, 1 Tottenham Street, London", "W1T 2AE", 5167411, True, - "3e6414d7f15f4cf7a69dc20c469bcf043d31a49239b183f1bd0c0e1aafa23c93", 0), - - ], - ) - def test_find_property(self, epc_auth_token, address, postcode, uprn, skip_os, lmk_key, n_old_epcs): - """ - Integration test for `find_property`, making actual API calls. - """ - # Provide your actual API keys or tokens here - os_api_key = "" - - # Initialize the SearchEpc instance - epc_searcher = SearchEpc( - address1=address, - postcode=postcode, - uprn=uprn, - auth_token=epc_auth_token, - os_api_key=os_api_key, - ) - - # Execute the method - epc_searcher.find_property(skip_os=skip_os) - - # We check that we have the correct epc - assert epc_searcher.newest_epc["lmk-key"] == lmk_key - assert len(epc_searcher.older_epcs) == n_old_epcs - def test_search_housenumber(self): - eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter' + eg1 = "Flat A11, Mortimer House, Grendon Road, Exeter" res1 = SearchEpc.get_house_number(eg1, None) assert res1 == "A11" - eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL' + eg2 = "Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL" res2 = SearchEpc.get_house_number(eg2, None) assert res2 == "A9" diff --git a/datatypes/epc/domain/epc_property_data.py b/datatypes/epc/domain/epc_property_data.py index a5b24c27..1048bed2 100644 --- a/datatypes/epc/domain/epc_property_data.py +++ b/datatypes/epc/domain/epc_property_data.py @@ -98,7 +98,9 @@ class MainHeatingDetail: boiler_flue_type: Optional[int] = None # TODO: make enum? boiler_ignition_type: Optional[int] = None # TODO: make enum? central_heating_pump_age: Optional[int] = None - central_heating_pump_age_str: Optional[str] = None # str from site notes e.g. "Unknown", "Pre 2013" + central_heating_pump_age_str: Optional[str] = ( + None # str from site notes e.g. "Unknown", "Pre 2013" + ) main_heating_index_number: Optional[int] = None sap_main_heating_code: Optional[int] = None # TODO: make enum? main_heating_number: Optional[int] = None @@ -132,7 +134,7 @@ class ShowerOutlets: @dataclass class SapHeating: - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] has_fixed_air_conditioning: bool cylinder_size: Optional[Union[int, str]] = ( @@ -145,7 +147,9 @@ class SapHeating: cylinder_insulation_type: Optional[Union[int, str]] = None cylinder_thermostat: Optional[str] = None secondary_fuel_type: Optional[int] = None - secondary_heating_type: Optional[Union[int, str]] = None # int from API; str from site notes + secondary_heating_type: Optional[Union[int, str]] = ( + None # int from API; str from site notes + ) cylinder_insulation_thickness_mm: Optional[int] = None # SAP10 hot-water demand inputs from sap_heating. number_baths: Optional[int] = None @@ -168,7 +172,9 @@ class SapHeating: class SapVentilation: ventilation_type: Optional[str] = None draught_lobby: Optional[bool] = None - pressure_test: Optional[str] = None # str from site notes e.g. "No test"; int in API via mechanical_ventilation + pressure_test: Optional[str] = ( + None # str from site notes e.g. "No test"; int in API via mechanical_ventilation + ) open_flues_count: Optional[int] = None closed_flues_count: Optional[int] = None boiler_flues_count: Optional[int] = None @@ -476,8 +482,12 @@ class SapBuildingPart: None # TODO: make enum/mapping? ) floor_type: Optional[str] = None # str from site notes e.g. "Ground Floor" - floor_construction_type: Optional[str] = None # str from site notes; distinct from floor_construction: int in SapFloorDimension - floor_insulation_type_str: Optional[str] = None # str from site notes e.g. "As Built" + floor_construction_type: Optional[str] = ( + None # str from site notes; distinct from floor_construction: int in SapFloorDimension + ) + floor_insulation_type_str: Optional[str] = ( + None # str from site notes e.g. "As Built" + ) floor_u_value_known: Optional[bool] = None roof_construction: Optional[int] = None diff --git a/datatypes/epc/schema/rdsap_schema_17_0.py b/datatypes/epc/schema/rdsap_schema_17_0.py index 22aaded4..9cbedf97 100644 --- a/datatypes/epc/schema/rdsap_schema_17_0.py +++ b/datatypes/epc/schema/rdsap_schema_17_0.py @@ -37,7 +37,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] cylinder_insulation_type: int diff --git a/datatypes/epc/schema/rdsap_schema_17_1.py b/datatypes/epc/schema/rdsap_schema_17_1.py index a4c007ed..b0af07e6 100644 --- a/datatypes/epc/schema/rdsap_schema_17_1.py +++ b/datatypes/epc/schema/rdsap_schema_17_1.py @@ -41,7 +41,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] cylinder_insulation_type: int diff --git a/datatypes/epc/schema/rdsap_schema_18_0.py b/datatypes/epc/schema/rdsap_schema_18_0.py index a038dc9b..4ce2f887 100644 --- a/datatypes/epc/schema/rdsap_schema_18_0.py +++ b/datatypes/epc/schema/rdsap_schema_18_0.py @@ -41,7 +41,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str @@ -86,6 +86,7 @@ class SapFloorDimension: @dataclass class SapRoomInRoof: """Room-in-roof details. floor_area is a Measurement object in schema 18.0.""" + floor_area: Measurement insulation: str roof_room_connected: str diff --git a/datatypes/epc/schema/rdsap_schema_19_0.py b/datatypes/epc/schema/rdsap_schema_19_0.py index b94d9bb3..b3c77ec4 100644 --- a/datatypes/epc/schema/rdsap_schema_19_0.py +++ b/datatypes/epc/schema/rdsap_schema_19_0.py @@ -41,7 +41,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str diff --git a/datatypes/epc/schema/rdsap_schema_20_0_0.py b/datatypes/epc/schema/rdsap_schema_20_0_0.py index 8f3986a2..9deb235e 100644 --- a/datatypes/epc/schema/rdsap_schema_20_0_0.py +++ b/datatypes/epc/schema/rdsap_schema_20_0_0.py @@ -49,7 +49,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str @@ -103,6 +103,7 @@ class SapFloorDimension: @dataclass class SapRoomInRoof: """Room-in-roof details. floor_area is a plain number in schema 20.0.0 (not a Measurement object).""" + floor_area: Union[int, float] insulation: str roof_room_connected: str diff --git a/deployment/terraform/lambda/landlordDescriptionOverrides/main.tf b/deployment/terraform/lambda/landlordDescriptionOverrides/main.tf new file mode 100644 index 00000000..5a69de22 --- /dev/null +++ b/deployment/terraform/lambda/landlordDescriptionOverrides/main.tf @@ -0,0 +1,50 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} + +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + +module "lambda" { + source = "../../modules/lambda_with_sqs" + + name = "landlord-description-overrides" + stage = var.stage + + image_uri = local.image_uri + + # The classifier calls OpenAI once per distinct description per column, so it + # is latency-bound. 300s leaves headroom under the queue's 1000s visibility + # timeout. batch_size = 1 keeps one upload per invocation, so a single bad + # record cannot redrive its siblings. maximum_concurrency caps fan-out to + # respect OpenAI rate limits. + timeout = 300 + batch_size = 1 + maximum_concurrency = 5 + + environment = merge( + { + STAGE = var.stage + LOG_LEVEL = "info" + POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username + POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password + OPENAI_API_KEY = var.openai_api_key + }, + ) +} + +# Attach S3 read policy so the handler can read the original upload CSV. +resource "aws_iam_role_policy_attachment" "landlord_overrides_s3_read" { + role = module.lambda.role_name + policy_arn = data.terraform_remote_state.shared.outputs.landlord_overrides_s3_read_arn +} diff --git a/deployment/terraform/lambda/landlordDescriptionOverrides/outputs.tf b/deployment/terraform/lambda/landlordDescriptionOverrides/outputs.tf new file mode 100644 index 00000000..7c6534db --- /dev/null +++ b/deployment/terraform/lambda/landlordDescriptionOverrides/outputs.tf @@ -0,0 +1,9 @@ +output "landlord_description_overrides_queue_url" { + value = module.lambda.queue_url + description = "URL of the Landlord Description Overrides SQS queue (wire into the FastAPI LANDLORD_OVERRIDES_SQS_URL)" +} + +output "landlord_description_overrides_queue_arn" { + value = module.lambda.queue_arn + description = "ARN of the Landlord Description Overrides SQS queue" +} diff --git a/deployment/terraform/lambda/landlordDescriptionOverrides/provider.tf b/deployment/terraform/lambda/landlordDescriptionOverrides/provider.tf new file mode 100644 index 00000000..ed2fa60e --- /dev/null +++ b/deployment/terraform/lambda/landlordDescriptionOverrides/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "landlord-description-overrides-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} diff --git a/deployment/terraform/lambda/landlordDescriptionOverrides/variables.tf b/deployment/terraform/lambda/landlordDescriptionOverrides/variables.tf new file mode 100644 index 00000000..63437a5a --- /dev/null +++ b/deployment/terraform/lambda/landlordDescriptionOverrides/variables.tf @@ -0,0 +1,33 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. landlordDescriptionOverrides)" +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} + +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +variable "openai_api_key" { + type = string + description = "OpenAI API key used by the ChatGPT column classifier" + sensitive = true +} + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/deployment/terraform/shared/main.tf b/deployment/terraform/shared/main.tf index 0a9e87f6..82a3820a 100644 --- a/deployment/terraform/shared/main.tf +++ b/deployment/terraform/shared/main.tf @@ -268,11 +268,11 @@ output "retrofit_heat_baseline_predictions_bucket_name" { // We make this bucket presignable, because we want to generate download links for the frontend module "retrofit_energy_assessments" { - source = "../modules/s3_presignable_bucket" - bucketname = "retrofit-energy-assessments-${var.stage}" - allowed_origins = var.allowed_origins - environment = var.stage - enable_versioning = true + source = "../modules/s3_presignable_bucket" + bucketname = "retrofit-energy-assessments-${var.stage}" + allowed_origins = var.allowed_origins + environment = var.stage + enable_versioning = true } output "retrofit_energy_assessments_bucket_name" { @@ -494,6 +494,42 @@ output "postcode_splitter_s3_read_arn" { value = module.postcode_splitter_s3_read.policy_arn } +################################################ +# Landlord Description Overrides – Lambda +################################################ +module "landlord_description_overrides_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "landlord-description-overrides-terraform-state" +} + +module "landlord_description_overrides_registry" { + source = "../modules/container_registry" + name = "landlord_description_overrides" + stage = var.stage +} + +# S3 policy for the landlord classifier to read the original upload CSV. +module "landlord_overrides_s3_read" { + source = "../modules/s3_iam_policy" + + policy_name = "LandlordOverridesReadS3" + # NOTE: aws_iam_policy.description is ForceNew — changing it destroys+recreates the + # policy, which deadlocks because the policy is attached to the lambda role in the + # separate landlordDescriptionOverrides stack (DeleteConflict). Keep this string + # byte-for-byte identical to what's in state so the bucket change applies in-place. + policy_description = "Allow landlord description overrides Lambda to read from retrofit-data bucket" + bucket_arns = [ + "arn:aws:s3:::retrofit-plan-inputs-${var.stage}", + "arn:aws:s3:::retrofit-data-${var.stage}", + ] + actions = ["s3:GetObject", "s3:ListBucket"] + resource_paths = ["/*"] +} + +output "landlord_overrides_s3_read_arn" { + value = module.landlord_overrides_s3_read.policy_arn +} + ################################################ # Bulk Address2UPRN Combiner – Lambda ECR ################################################ @@ -729,7 +765,7 @@ module "hubspot_etl_bucket" { module "hubspot_etl_registry" { source = "../modules/container_registry" name = "hubspot-etl" - stage = var.stage + stage = var.stage } diff --git a/docs/adr/0003-python-writes-landlord-overrides-directly.md b/docs/adr/0003-python-writes-landlord-overrides-directly.md new file mode 100644 index 00000000..b199072b --- /dev/null +++ b/docs/adr/0003-python-writes-landlord-overrides-directly.md @@ -0,0 +1,79 @@ +# ADR-0003: Python writes landlord overrides directly to Postgres + +**Status:** Accepted +**Date:** 2026-05-26 +**Supersedes (in part):** [assessment-model/docs/adr/0002-landlord-override-vocabulary.md](https://github.com/.../assessment-model/blob/main/docs/adr/0002-landlord-override-vocabulary.md) — specifically the clause beginning *"Writes happen from Next.js …"*. + +## Context + +ADR-0002 (in the `assessment-model` TS repo) defined the `landlord_property_type_overrides` and `landlord_wall_type_overrides` tables and noted that the Model service would POST classification results to a Next.js route handler, with Next.js performing the upsert. Drizzle remained the schema source of truth. + +That extra hop has not been built and is now judged unnecessary for the present scope: + +- The classification result is internal — a Lambda computes it, the same Lambda persists it. No third party needs to participate in the write. +- Drizzle remains the schema's source of truth either way: the Python adapter mirrors the schema in a SQLModel row, but the migrations stay with Drizzle. Adding a Next.js route would not change which side owns schema definition. +- The Python lambda already lives next to a Postgres connection in the existing pipeline (`subtask`/`tasks` tables are written from Python today). Adding two more tables to that adapter surface is a small, well-understood change. Routing the same writes through Next.js would mean: lambda → JSON-over-HTTP → Next.js route → Drizzle → Postgres, instead of lambda → SQLAlchemy → Postgres. Three extra moving parts to ship, deploy, monitor, and authenticate for no behavioural gain. + +## Decision + +The Model service (specifically `applications/landlord_description_overrides/handler.py`) writes directly to `landlord_property_type_overrides` and `landlord_wall_type_overrides` via a SQLAlchemy-backed `LandlordOverrideRepository[E]` adapter. No Next.js route handler is required. + +Transaction boundaries live in `infrastructure/postgres/engine.transactional_session` — a context manager that commits on clean exit and rolls back on exception. The application layer (`handler.py`) never calls `.commit()` or `.rollback()` itself; it only opens the context. Orchestration and repository code likewise never commits — keeping transaction semantics confined to one infrastructure helper. + +The conflict policy lives in SQL and is identical for every override category. A single generic adapter, `LandlordOverridesRepository[E]`, implements it once; the target table is selected by the SQLModel `…Row` class passed at construction. Each category (property / built-form / wall / roof type) is that same adapter parameterised by its row class: + +```sql +INSERT INTO landlord_property_type_overrides (portfolio_id, description, value, source) +VALUES … +ON CONFLICT (portfolio_id, description) +DO UPDATE SET value = EXCLUDED.value, + source = EXCLUDED.source, + updated_at = now() +WHERE landlord_property_type_overrides.source = 'classifier'; +``` + +The `WHERE existing.source = 'classifier'` guard is load-bearing: it lets the classifier refresh its own past output while leaving `source = 'user'` rows untouched. This is the contract ADR-0002's `source` column was added for. + +`UNKNOWN` values are persisted, not skipped — consistent with ADR-0002 §5. A future user override can upgrade them. + +## Consequences + +**Positive.** + +- One fewer service to deploy, monitor, and authenticate. +- The classifier and persistence live in the same process — failures surface against a single `sub_task` row, not split across two systems. +- The Postgres adapter mirrors the existing `subtask`/`tasks` repositories, so reviewers have a precedent to compare against. + +**Negative.** + +- The Python repo now holds two schemas — the schema-source-of-truth Drizzle definition lives in the TS repo, and the Python `SQLModel` row class shadows it. They must stay in lockstep. Mitigations: the TS schema header comment (`landlord_overrides.ts:12`) already names the Python source-of-truth file; a future ADR may add a CI check that diffs the two. +- The boundary that ADR-0002 anticipated for pgEnum validation (a Next.js route validating incoming values before insert) is gone. Pydantic + the Python `Enum` type catch invalid values on the producing side, and Postgres's pgEnum will reject anything that slips through. + +## File layout + +This ADR also fixes a placement convention for Postgres adapters going forward. The codebase currently has the ChatGPT classifier split cleanly along DDD lines — port in `domain/`, adapter in `infrastructure/chatgpt/` — but the `tasks` Postgres adapter does not follow the same shape: its concrete class lives in `repositories/tasks/`, not `infrastructure/postgres/`. + +The convention going forward separates the persistence *behaviour* (grouped by aggregate) from the schema *mirrors* (grouped by technology, since they share pgEnums and engine metadata): + +- **Port (protocol / abstract base):** `repositories//_repository.py` +- **Postgres repository adapter (concrete):** `infrastructure//_postgres_repository.py` +- **SQLModel row class (`table=True` schema mirror):** `infrastructure/postgres/_table.py` + +The `LandlordOverridesRepository` adapter follows this convention: the concrete class — the aggregate's "talker" to Postgres — lives at `infrastructure/landlord_overrides/landlord_overrides_postgres_repository.py`, while the per-category `…Row` classes stay in `infrastructure/postgres/`. The `…Row` classes are one-per-table — each mirrors a genuinely distinct Drizzle table and `value` pgEnum, and they share the single `override_source` pgEnum instance, so they belong together in the Postgres technology bucket as schema mirrors, not duplicated logic. + +(This refines the placement first sketched in this ADR, which put the adapter in `infrastructure/postgres/` alongside the row classes. The adapter holds no schema — only the write path — so it groups by aggregate; only the `table=True` mirrors stay tech-bucketed.) + +**Existing outliers to relocate in a follow-up:** + +- `repositories/tasks/task_postgres_repository.py` → `infrastructure/tasks/task_postgres_repository.py` +- `repositories/tasks/subtask_postgres_repository.py` → `infrastructure/tasks/subtask_postgres_repository.py` + +(Their `task_table.py` / `subtask_table.py` schema mirrors already sit correctly in `infrastructure/postgres/`.) Both moves are mechanical (import-path updates only). They are intentionally out of scope for the present PR. + +## Out of scope (deferred to follow-up work) + +- Relocating `task_postgres_repository.py` and `subtask_postgres_repository.py` into `infrastructure/tasks/` per the convention above. +- ~~Extracting a shared upsert helper / base class once a third `landlord_*_overrides` column lands — until then the per-category adapters' 95%-identical bodies are kept side-by-side for direct comparison.~~ **Done.** The per-category adapter bodies were byte-identical (varying only in their row class), so they were consolidated into one generic `LandlordOverridesRepository[E]` parameterised by row class rather than waiting for a third column. +- Switching `applications/landlord_description_overrides/handler.py` to acquire its `Session` via a `@subtask_handler()`-style decorator instead of building its own engine. +- A cross-repo PR amending ADR-0002 to point at this ADR. +- A CI check (or codegen) that diffs the Drizzle pgEnum literals against the Python `Enum.value` strings. diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index 44e4d967..ca4cd752 100644 --- a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -2,21 +2,21 @@ from __future__ import annotations from collections.abc import Iterable, Iterator -from domain.addresses.user_address import UserAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode def iter_postcode_grouped_batches( - addresses: Iterable[UserAddress], + addresses: Iterable[UnstandardisedAddress], *, max_batch_size: int = 500, -) -> Iterator[list[UserAddress]]: +) -> Iterator[AddressList]: if max_batch_size < 1: raise ValueError("max_batch_size must be >= 1") groups = _group_by_postcode_in_order(addresses) - buffer: list[UserAddress] = [] + buffer: AddressList = AddressList([]) for group in groups.values(): group_len = len(group) @@ -26,14 +26,14 @@ def iter_postcode_grouped_batches( if group_len >= max_batch_size: if buffer: yield buffer - buffer = [] + buffer = AddressList([]) yield group continue # Adding this group would overflow: flush buffer before appending. if len(buffer) + group_len > max_batch_size: yield buffer - buffer = [] + buffer = AddressList([]) buffer.extend(group) @@ -43,9 +43,9 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( - addresses: Iterable[UserAddress], -) -> dict[Postcode, list[UserAddress]]: - groups: dict[Postcode, list[UserAddress]] = {} + addresses: Iterable[UnstandardisedAddress], +) -> dict[Postcode, AddressList]: + groups: dict[Postcode, AddressList] = {} for address in addresses: - groups.setdefault(address.postcode, []).append(address) + groups.setdefault(address.postcode, AddressList([])).append(address) return groups diff --git a/domain/addresses/standardised_address_list.py b/domain/addresses/standardised_address_list.py new file mode 100644 index 00000000..8e3f4fc7 --- /dev/null +++ b/domain/addresses/standardised_address_list.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import NewType, Optional + +from domain.postcode import Postcode + + +def _empty_source_row() -> dict[str, str]: + return {} + + +@dataclass(frozen=True) +class StandardisedAddress: + address: str + postcode: Postcode + org_reference: Optional[str] = None + + +# Standardised Asset List -- the cleaned output counterpart to AddressList. +SAL = NewType("SAL", list[StandardisedAddress]) diff --git a/domain/addresses/unstandardised_address.py b/domain/addresses/unstandardised_address.py new file mode 100644 index 00000000..8917bdf4 --- /dev/null +++ b/domain/addresses/unstandardised_address.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional, NewType + +from domain.postcode import Postcode + + +def _empty_source_row() -> dict[str, str]: + return {} + + +@dataclass(frozen=True) +class UnstandardisedAddress: + address: str + postcode: Postcode + org_reference: Optional[str] = None + additional_info: dict[str, str] = field( + default_factory=_empty_source_row, compare=False + ) + + +# A batch of raw, pre-standardisation addresses as supplied by a landlord. +AddressList = NewType("AddressList", list[UnstandardisedAddress]) diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py deleted file mode 100644 index 9a28751b..00000000 --- a/domain/addresses/user_address.py +++ /dev/null @@ -1,18 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Optional - -from domain.postcode import Postcode - - -def _empty_source_row() -> dict[str, str]: - return {} - - -@dataclass(frozen=True) -class UserAddress: - user_address: str - postcode: Postcode - internal_reference: Optional[str] = None - source_row: dict[str, str] = field(default_factory=_empty_source_row, compare=False) diff --git a/repositories/user_address/__init__.py b/domain/data_transformation/__init__.py similarity index 100% rename from repositories/user_address/__init__.py rename to domain/data_transformation/__init__.py diff --git a/domain/data_transformation/column_classifier.py b/domain/data_transformation/column_classifier.py new file mode 100644 index 00000000..adc88c6a --- /dev/null +++ b/domain/data_transformation/column_classifier.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from enum import Enum +from typing import Generic, TypeVar + +E = TypeVar("E", bound=Enum) + + +class ClassificationError(Exception): + """Raised when classifying a column's descriptions fails wholesale. + + A whole-batch failure (the AI backend is unreachable, or returns a reply + that cannot be parsed) raises this. A single description that merely + cannot be resolved is not an error -- it maps to the enum's UNKNOWN member. + """ + + +class ColumnClassifier(ABC, Generic[E]): + """Port: resolves free-text descriptions into a category enum ``E``. + + One classifier handles one landlord-CSV column. Implementations decide + *how* the mapping is performed (an LLM, a lookup table, a rules engine); + ``LandlordDescriptionOverridesOrchestrator`` depends only on this interface. + """ + + @abstractmethod + def classify(self, descriptions: set[str]) -> dict[str, E]: + """Classify each description into a category enum member. + + Every input description appears as a key in the result. A description + that cannot be resolved maps to the enum's UNKNOWN member. + + Raises: + ClassificationError: If the classification call fails wholesale + (e.g. the backend is unreachable or returns an unparseable + response). + """ + ... diff --git a/tests/repositories/user_address/__init__.py b/domain/epc/__init__.py similarity index 100% rename from tests/repositories/user_address/__init__.py rename to domain/epc/__init__.py diff --git a/domain/epc/built_form_type.py b/domain/epc/built_form_type.py new file mode 100644 index 00000000..327ceebe --- /dev/null +++ b/domain/epc/built_form_type.py @@ -0,0 +1,20 @@ +from enum import Enum + + +class BuiltFormType(Enum): + """A landlord-supplied built form, as resolved by the landlord-description-overrides context. + + Mirrors the EPC built-form values. ``NOT_RECORDED`` is the legitimate + EPC value for properties whose built form the surveyor did not capture; + ``UNKNOWN`` is the classifier fallback for landlord values that cannot be + resolved at all. + """ + + DETACHED = "Detached" + SEMI_DETACHED = "Semi-Detached" + MID_TERRACE = "Mid-Terrace" + END_TERRACE = "End-Terrace" + ENCLOSED_MID_TERRACE = "Enclosed Mid-Terrace" + ENCLOSED_END_TERRACE = "Enclosed End-Terrace" + NOT_RECORDED = "Not Recorded" + UNKNOWN = "Unknown" diff --git a/domain/epc/property_type.py b/domain/epc/property_type.py new file mode 100644 index 00000000..453c28c1 --- /dev/null +++ b/domain/epc/property_type.py @@ -0,0 +1,16 @@ +from enum import Enum + + +class PropertyType(Enum): + """A landlord-supplied property type, as resolved by the landlord-description-overrides context. + + Distinct from the EPC context's ``PropertyType``: a landlord CSV value + may be unresolvable, so this enum carries an explicit ``UNKNOWN`` member. + """ + + HOUSE = "House" + BUNGALOW = "Bungalow" + FLAT = "Flat" + MAISONETTE = "Maisonette" + PARK_HOME = "Park home" + UNKNOWN = "Unknown" diff --git a/domain/epc/roof_type.py b/domain/epc/roof_type.py new file mode 100644 index 00000000..56ef9e8e --- /dev/null +++ b/domain/epc/roof_type.py @@ -0,0 +1,70 @@ +from enum import Enum + + +class RoofType(Enum): + """A landlord-supplied roof description, as resolved by the landlord-description-overrides context. + + Each member is one full EPC roof-description string, combining shape + (flat, pitched, roof room(s), thatched) with insulation state and, for + pitched roofs, the loft-insulation depth in millimetres. Adjacency + markers like ``(another dwelling above)`` represent a unit whose top + boundary is another dwelling rather than a roof of its own; they are + kept as members because they appear in the same EPC column. + ``UNKNOWN`` covers values the classifier cannot resolve -- most + commonly raw ``Average thermal transmittance`` U-value strings that + carry no shape/insulation information. + """ + + FLAT_INSULATED = "Flat, insulated" + FLAT_INSULATED_ASSUMED = "Flat, insulated (assumed)" + FLAT_LIMITED_INSULATION = "Flat, limited insulation" + FLAT_LIMITED_INSULATION_ASSUMED = "Flat, limited insulation (assumed)" + FLAT_NO_INSULATION = "Flat, no insulation" + FLAT_NO_INSULATION_ASSUMED = "Flat, no insulation (assumed)" + + PITCHED_INSULATED = "Pitched, insulated" + PITCHED_INSULATED_ASSUMED = "Pitched, insulated (assumed)" + PITCHED_INSULATED_AT_RAFTERS = "Pitched, insulated at rafters" + PITCHED_LIMITED_INSULATION = "Pitched, limited insulation" + PITCHED_LIMITED_INSULATION_ASSUMED = "Pitched, limited insulation (assumed)" + PITCHED_NO_INSULATION = "Pitched, no insulation" + PITCHED_NO_INSULATION_ASSUMED = "Pitched, no insulation (assumed)" + PITCHED_UNKNOWN_LOFT_INSULATION = "Pitched, Unknown loft insulation" + PITCHED_LOFT_0MM = "Pitched, 0 mm loft insulation" + PITCHED_LOFT_12MM = "Pitched, 12 mm loft insulation" + PITCHED_LOFT_25MM = "Pitched, 25 mm loft insulation" + PITCHED_LOFT_50MM = "Pitched, 50 mm loft insulation" + PITCHED_LOFT_75MM = "Pitched, 75 mm loft insulation" + PITCHED_LOFT_100MM = "Pitched, 100 mm loft insulation" + PITCHED_LOFT_125MM = "Pitched, 125 mm loft insulation" + PITCHED_LOFT_150MM = "Pitched, 150 mm loft insulation" + PITCHED_LOFT_175MM = "Pitched, 175 mm loft insulation" + PITCHED_LOFT_200MM = "Pitched, 200 mm loft insulation" + PITCHED_LOFT_225MM = "Pitched, 225 mm loft insulation" + PITCHED_LOFT_250MM = "Pitched, 250 mm loft insulation" + PITCHED_LOFT_270MM = "Pitched, 270 mm loft insulation" + PITCHED_LOFT_300MM = "Pitched, 300 mm loft insulation" + PITCHED_LOFT_350MM = "Pitched, 350 mm loft insulation" + PITCHED_LOFT_400MM = "Pitched, 400 mm loft insulation" + PITCHED_LOFT_400_PLUS_MM = "Pitched, 400+ mm loft insulation" + + ROOF_ROOM_INSULATED = "Roof room(s), insulated" + ROOF_ROOM_INSULATED_ASSUMED = "Roof room(s), insulated (assumed)" + ROOF_ROOM_LIMITED_INSULATION = "Roof room(s), limited insulation" + ROOF_ROOM_LIMITED_INSULATION_ASSUMED = "Roof room(s), limited insulation (assumed)" + ROOF_ROOM_NO_INSULATION = "Roof room(s), no insulation" + ROOF_ROOM_NO_INSULATION_ASSUMED = "Roof room(s), no insulation (assumed)" + ROOF_ROOM_CEILING_INSULATED = "Roof room(s), ceiling insulated" + ROOF_ROOM_THATCHED = "Roof room(s), thatched" + ROOF_ROOM_THATCHED_WITH_ADDITIONAL_INSULATION = "Roof room(s), thatched with additional insulation" + + THATCHED = "Thatched" + THATCHED_WITH_ADDITIONAL_INSULATION = "Thatched, with additional insulation" + + ADJACENT_ANOTHER_DWELLING_ABOVE = "(another dwelling above)" + ADJACENT_SAME_DWELLING_ABOVE = "(same dwelling above)" + ADJACENT_OTHER_PREMISES_ABOVE = "(other premises above)" + ADJACENT_ANOTHER_PREMISES_ABOVE = "(another premises above)" + ANOTHER_PREMISES_ABOVE = "Another Premises Above" + + UNKNOWN = "Unknown" diff --git a/domain/epc/wall_type.py b/domain/epc/wall_type.py new file mode 100644 index 00000000..1466f82d --- /dev/null +++ b/domain/epc/wall_type.py @@ -0,0 +1,117 @@ +from enum import Enum + + +class WallType(Enum): + """A landlord-supplied wall description, as resolved by the landlord-description-overrides context. + + Each member is one full EPC wall-description string, combining material + (cavity, solid brick, sandstone, …) with construction/insulation state + (as built, filled cavity, with internal insulation, …). ``UNKNOWN`` covers + values the classifier cannot resolve — most commonly raw + ``Average thermal transmittance`` U-value strings that carry no material + information. + """ + + CAVITY_FILLED = "Cavity wall, filled cavity" + CAVITY_AS_BUILT_INSULATED_ASSUMED = ( + "Cavity wall, as built, insulated (assumed)" # 1983 - 1990 + ) + CAVITY_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Cavity wall, as built, no insulation (assumed)" # Pre-1975 + ) + + CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "Cavity wall, as built, partial insulation (assumed)" # 1976 - 1982 + ) + CAVITY_WITH_INTERNAL_INSULATION = "Cavity wall, with internal insulation" + CAVITY_WITH_EXTERNAL_INSULATION = "Cavity wall, with external insulation" + CAVITY_FILLED_AND_INTERNAL_INSULATION = ( + "Cavity wall, filled cavity and internal insulation" + ) + CAVITY_FILLED_AND_EXTERNAL_INSULATION = ( + "Cavity wall, filled cavity and external insulation" + ) + + SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Solid brick, as built, no insulation (assumed)" + ) + SOLID_BRICK_AS_BUILT_INSULATED_ASSUMED = ( + "Solid brick, as built, insulated (assumed)" + ) + SOLID_BRICK_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "Solid brick, as built, partial insulation (assumed)" + ) + SOLID_BRICK_WITH_INTERNAL_INSULATION = "Solid brick, with internal insulation" + SOLID_BRICK_WITH_EXTERNAL_INSULATION = "Solid brick, with external insulation" + + TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Timber frame, as built, no insulation (assumed)" + ) + TIMBER_FRAME_AS_BUILT_INSULATED_ASSUMED = ( + "Timber frame, as built, insulated (assumed)" + ) + TIMBER_FRAME_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "Timber frame, as built, partial insulation (assumed)" + ) + TIMBER_FRAME_WITH_ADDITIONAL_INSULATION = "Timber frame, with additional insulation" + + SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Sandstone, as built, no insulation (assumed)" + ) + SANDSTONE_AS_BUILT_INSULATED_ASSUMED = "Sandstone, as built, insulated (assumed)" + SANDSTONE_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "Sandstone, as built, partial insulation (assumed)" + ) + SANDSTONE_WITH_INTERNAL_INSULATION = "Sandstone, with internal insulation" + SANDSTONE_WITH_EXTERNAL_INSULATION = "Sandstone, with external insulation" + + GRANITE_OR_WHIN_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Granite or whin, as built, no insulation (assumed)" + ) + GRANITE_OR_WHIN_AS_BUILT_INSULATED_ASSUMED = ( + "Granite or whin, as built, insulated (assumed)" + ) + GRANITE_OR_WHIN_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "Granite or whin, as built, partial insulation (assumed)" + ) + GRANITE_OR_WHIN_WITH_INTERNAL_INSULATION = ( + "Granite or whin, with internal insulation" + ) + GRANITE_OR_WHIN_WITH_EXTERNAL_INSULATION = ( + "Granite or whin, with external insulation" + ) + + SYSTEM_BUILT_AS_BUILT_NO_INSULATION_ASSUMED = ( + "System built, as built, no insulation (assumed)" + ) + SYSTEM_BUILT_AS_BUILT_INSULATED_ASSUMED = ( + "System built, as built, insulated (assumed)" + ) + SYSTEM_BUILT_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "System built, as built, partial insulation (assumed)" + ) + SYSTEM_BUILT_WITH_INTERNAL_INSULATION = "System built, with internal insulation" + SYSTEM_BUILT_WITH_EXTERNAL_INSULATION = "System built, with external insulation" + + PARK_HOME_AS_BUILT = "Park home wall, as built" + PARK_HOME_WITH_INTERNAL_INSULATION = "Park home wall, with internal insulation" + PARK_HOME_WITH_EXTERNAL_INSULATION = "Park home wall, with external insulation" + + COB_AS_BUILT = "Cob, as built" + COB_WITH_INTERNAL_INSULATION = "Cob, with internal insulation" + COB_WITH_EXTERNAL_INSULATION = "Cob, with external insulation" + + CURTAIN_WALL = "Curtain wall" + CURTAIN_WALL_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Curtain Wall, as built, no insulation (assumed)" + ) + CURTAIN_WALL_AS_BUILT_INSULATED_ASSUMED = ( + "Curtain Wall, as built, insulated (assumed)" + ) + CURTAIN_WALL_FILLED = "Curtain Wall, filled cavity" + CURTAIN_WALL_WITH_INTERNAL_INSULATION = "Curtain Wall, with internal insulation" + + BASEMENT_WALL = "Basement wall" + BASEMENT_WALL_AS_BUILT = "Basement wall, as built" + + UNKNOWN = "Unknown" diff --git a/domain/epc/wall_type_construction_dates.py b/domain/epc/wall_type_construction_dates.py new file mode 100644 index 00000000..0eccc44c --- /dev/null +++ b/domain/epc/wall_type_construction_dates.py @@ -0,0 +1,72 @@ +"""Construction-date metadata for the "assumed" ``WallType`` variants. + +The ``(assumed)`` variants of ``WallType`` are what RdSAP picks when a +surveyor has no direct observation and falls back to the typical wall make-up +for a property's build era. The era boundaries reflect UK Building +Regulations milestones for cavity-wall insulation: + +* up to 1975 -- no cavity insulation requirement +* 1976-1982 -- partial-fill cavity (early insulation requirement) +* 1983-1990 -- full-fill cavity (insulation required) + +Captured here as a structured lookup so: + +* the LLM prompt builder can render the ranges as a hint, helping the + classifier resolve era-implying landlord descriptions to the right + ``(assumed)`` variant; +* future date-aware paths (a deterministic year-to-variant shortcut, a + date-keyed repo) can read from the same source instead of duplicating + the knowledge. + +Only the variants where we have a defensible era boundary appear here; the +remaining ``(assumed)`` members are left out rather than guessed. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Mapping, Optional + +from domain.epc.wall_type import WallType + + +@dataclass(frozen=True) +class YearRange: + """An inclusive year range. ``None`` on either end means "no bound".""" + + start: Optional[int] = None + end: Optional[int] = None + + def __str__(self) -> str: + if self.start is None and self.end is not None: + return f"pre-{self.end + 1}" + if self.start is not None and self.end is None: + return f"{self.start}+" + return f"{self.start}-{self.end}" + + +WALL_TYPE_CONSTRUCTION_YEARS: Mapping[WallType, YearRange] = { + WallType.CAVITY_AS_BUILT_NO_INSULATION_ASSUMED: YearRange(end=1975), + WallType.CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED: YearRange( + start=1976, end=1982 + ), + WallType.CAVITY_AS_BUILT_INSULATED_ASSUMED: YearRange(start=1983, end=1990), +} + + +def wall_type_construction_date_prompt_hint() -> str: + """Render the date metadata as a prompt fragment for the LLM classifier. + + The fragment lists each (variant, year range) pair so the model can + prefer the era-matching ``(assumed)`` variant when a landlord + description carries era information (e.g. "1970s semi", "built before + the war"). + """ + lines = [ + f"- {wall_type.value!r}: typically built {year_range}" + for wall_type, year_range in WALL_TYPE_CONSTRUCTION_YEARS.items() + ] + return ( + "When the description carries construction-era information, prefer " + "the category whose typical build year matches:\n" + "\n".join(lines) + ) diff --git a/infrastructure/chatgpt/__init__.py b/infrastructure/chatgpt/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/infrastructure/chatgpt/chatgpt.py b/infrastructure/chatgpt/chatgpt.py new file mode 100644 index 00000000..ee2a5b39 --- /dev/null +++ b/infrastructure/chatgpt/chatgpt.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import os +from typing import Optional + +from openai import OpenAI +from openai.types.chat import ChatCompletionMessageParam + +from infrastructure.chatgpt.exceptions import ChatGPTClientError + + +class ChatGPT: + """Thin wrapper over the OpenAI Chat Completions API. + + Sends a single prompt and returns the assistant's reply as plain text. + """ + + DEFAULT_MODEL = "gpt-4o-mini" + + def __init__( + self, + api_key: Optional[str] = None, + model: Optional[str] = None, + ) -> None: + key = api_key or os.environ.get("OPENAI_API_KEY") + if not key: + raise ChatGPTClientError( + "No OpenAI API key provided. " + "Pass api_key or set the OPENAI_API_KEY environment variable." + ) + self._client = OpenAI(api_key=key) + self._model = model or self.DEFAULT_MODEL + + def generate( + self, + prompt: str, + system_prompt: Optional[str] = None, + ) -> str: + """Send a prompt to the model and return its reply text. + + Args: + prompt: The user message to send. + system_prompt: Optional instruction that sets the model's behaviour. + + Raises: + OpenAiClientError: If the model returns an empty response. + """ + messages: list[ChatCompletionMessageParam] = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) + + response = self._client.chat.completions.create( + model=self._model, + messages=messages, + ) + content = response.choices[0].message.content + if content is None: + raise ChatGPTClientError("ChatGPT returned an empty response.") + return content diff --git a/infrastructure/chatgpt/chatgpt_column_classifier.py b/infrastructure/chatgpt/chatgpt_column_classifier.py new file mode 100644 index 00000000..b782cf33 --- /dev/null +++ b/infrastructure/chatgpt/chatgpt_column_classifier.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import json +from enum import Enum +from typing import Any, Optional, TypeVar + +from domain.data_transformation.column_classifier import ( + ClassificationError, + ColumnClassifier, +) +from infrastructure.chatgpt.chatgpt import ChatGPT +from infrastructure.chatgpt.exceptions import ChatGPTClientError + +E = TypeVar("E", bound=Enum) + + +class ChatGptColumnClassifier(ColumnClassifier[E]): + """ColumnClassifier backed by ChatGPT, parametrised by a category enum. + + The same classification path -- prompt, JSON parsing, UNKNOWN fallback -- + serves any category enum; only ``category_enum`` and its ``unknown`` + member differ between columns. + """ + + def __init__( + self, + chat_gpt: ChatGPT, + category_enum: type[E], + unknown: E, + extra_instructions: Optional[str] = None, + ) -> None: + self._chat_gpt = chat_gpt + self._category_enum = category_enum + self._unknown = unknown + # Free-form column-specific guidance appended to the system prompt + # ahead of the JSON-output instruction. Lets each column ship its + # own hints (e.g. wall-type construction-era ranges) without the + # generic classifier knowing what they are. + self._extra_instructions = extra_instructions + + def classify(self, descriptions: set[str]) -> dict[str, E]: + if not descriptions: + return {} + try: + reply = self._chat_gpt.generate( + prompt=json.dumps(sorted(descriptions)), + system_prompt=self._system_prompt(), + ) + except ChatGPTClientError as error: + raise ClassificationError( + f"ChatGPT classification failed for " + f"{self._category_enum.__name__}." + ) from error + try: + raw: dict[str, Any] = json.loads(self._strip_code_fence(reply)) + except json.JSONDecodeError as error: + raise ClassificationError( + f"ChatGPT returned a reply that is not valid JSON: {reply!r}" + ) from error + return { + description: self._to_category(raw.get(description)) + for description in descriptions + } + + def _system_prompt(self) -> str: + categories = ", ".join( + member.value + for member in self._category_enum + if member is not self._unknown + ) + parts = [ + "Classify each free-text description into exactly one category. ", + f"Categories: {categories}. ", + ] + if self._extra_instructions: + parts.append(self._extra_instructions + " ") + parts.append( + "Reply with only a JSON object mapping each original description " + "to its category, and nothing else." + ) + return "".join(parts) + + def _to_category(self, value: Any) -> E: + """Map a reply value to a category member, defaulting to UNKNOWN.""" + try: + return self._category_enum(value) + except ValueError: + return self._unknown + + @staticmethod + def _strip_code_fence(reply: str) -> str: + """Remove a surrounding markdown code fence, if ChatGPT added one.""" + text = reply.strip() + if not text.startswith("```"): + return text + lines = text.splitlines()[1:] + if lines and lines[-1].strip() == "```": + lines = lines[:-1] + return "\n".join(lines) diff --git a/infrastructure/chatgpt/exceptions.py b/infrastructure/chatgpt/exceptions.py new file mode 100644 index 00000000..31663f3d --- /dev/null +++ b/infrastructure/chatgpt/exceptions.py @@ -0,0 +1,2 @@ +class ChatGPTClientError(Exception): + """Base for all OpenAI client errors.""" diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py deleted file mode 100644 index 8af8de73..00000000 --- a/infrastructure/csv_s3_client.py +++ /dev/null @@ -1,33 +0,0 @@ -import csv -from io import StringIO - -from infrastructure.s3_client import S3Client -from infrastructure.s3_uri import parse_s3_uri - - -class CsvS3Client(S3Client): - def read_rows(self, s3_uri: str) -> list[dict[str, str]]: - bucket, key = parse_s3_uri(s3_uri) - if bucket != self.bucket: - raise ValueError( - f"s3_uri bucket {bucket!r} does not match client bucket {self.bucket!r}" - ) - raw = self.get_object(key) - try: - text = raw.decode("utf-8-sig") - except UnicodeDecodeError: - # Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8. - text = raw.decode("cp1252") - - reader = csv.DictReader(StringIO(text)) - return [dict(row) for row in reader] - - def save_rows(self, rows: list[dict[str, str]], key: str) -> str: - if not rows: - raise ValueError("Cannot save an empty rows list: header is unknown") - buffer = StringIO() - fieldnames = list(rows[0].keys()) - writer = csv.DictWriter(buffer, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(rows) - return self.put_object(key, buffer.getvalue().encode("utf-8")) diff --git a/infrastructure/landlord_overrides/__init__.py b/infrastructure/landlord_overrides/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/infrastructure/landlord_overrides/landlord_overrides_postgres_repository.py b/infrastructure/landlord_overrides/landlord_overrides_postgres_repository.py new file mode 100644 index 00000000..5be7431c --- /dev/null +++ b/infrastructure/landlord_overrides/landlord_overrides_postgres_repository.py @@ -0,0 +1,97 @@ +"""One Postgres adapter for every ``landlord__overrides`` table. + +The four override categories (property / built-form / wall / roof type) share +an identical write path: the same ``(portfolio_id, description) -> value`` row +shape and the same source-aware conflict policy (ADR-0003 §Decision fixes one +policy for all of them). The only thing that varies per category is the target +table, which is selected by the SQLModel row class handed to the constructor. + +So a single generic adapter serves all four -- callers parameterise it by the +category enum ``E`` and pass the matching ``…Row`` class: + + LandlordOverridesRepository[PropertyType](session, LandlordPropertyTypeOverrideRow) + +This adapter -- the aggregate's "talker" to Postgres -- lives under +``infrastructure/landlord_overrides/``, grouped by aggregate. The ``table=True`` +SQLModel row classes it writes through stay in ``infrastructure/postgres/`` as +schema mirrors (they share the ``override_source`` pgEnum and register against +the same engine metadata). See ADR-0003 §File layout. +""" + +from __future__ import annotations + +from datetime import datetime, timezone +from enum import Enum +from typing import TypeVar, cast + +from sqlalchemy import Table +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlmodel import Session, SQLModel + +from infrastructure.postgres.landlord_override_enums import OverrideSource +from repositories.landlord_overrides.landlord_override_repository import ( + LandlordOverrideRepository, +) + +E = TypeVar("E", bound=Enum) + + +class LandlordOverridesRepository(LandlordOverrideRepository[E]): + """Writes classifier overrides to the ``landlord__overrides`` table + backing ``row_type``. + + ``row_type`` is the SQLModel mirror of the target table (e.g. + ``LandlordPropertyTypeOverrideRow``); its category enum must match the type + argument ``E`` the caller binds. The pairing is a convention, not a type + constraint -- the row classes are not generic over their value enum -- so + keep the two in step at the call site. + """ + + def __init__(self, session: Session, row_type: type[SQLModel]) -> None: + self._session = session + # SQLModel's class-level ``__table__`` is injected at runtime on + # ``table=True`` classes but isn't exposed by the stubs; pin it to + # ``Table`` via ``getattr`` so the dialect insert helper carries + # through with strict types. + self._table: Table = cast(Table, getattr(row_type, "__table__")) + + def upsert_all( + self, + portfolio_id: int, + descriptions_to_values: dict[str, E], + ) -> None: + if not descriptions_to_values: + return + + now = datetime.now(timezone.utc) + rows = [ + { + "portfolio_id": portfolio_id, + "description": description, + "value": value.value, + "source": OverrideSource.CLASSIFIER, + "created_at": now, + "updated_at": now, + } + for description, value in descriptions_to_values.items() + ] + + stmt = pg_insert(self._table).values(rows) + + # The classifier may refresh its own past output, but must never + # overwrite a user correction -- the ``WHERE existing.source = + # 'classifier'`` guard enforces that. See ADR-0003 §Decision. + stmt = stmt.on_conflict_do_update( + index_elements=["portfolio_id", "description"], + set_={ + "value": stmt.excluded.value, + "source": stmt.excluded.source, + "updated_at": stmt.excluded.updated_at, + }, + where=self._table.c.source == OverrideSource.CLASSIFIER, + ) + + # SQLModel re-exports SQLAlchemy's ``Session.execute``; one of the + # overload signatures is marked deprecated in stubs, which fires + # here even though our INSERT path is the supported one. + self._session.execute(stmt) # pyright: ignore[reportDeprecated] diff --git a/infrastructure/openai/__init__.py b/infrastructure/openai/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/infrastructure/openai/exceptions.py b/infrastructure/openai/exceptions.py new file mode 100644 index 00000000..14cf95a2 --- /dev/null +++ b/infrastructure/openai/exceptions.py @@ -0,0 +1,2 @@ +class OpenAiClientError(Exception): + """Base for all OpenAI client errors.""" diff --git a/infrastructure/openai/openai_client.py b/infrastructure/openai/openai_client.py new file mode 100644 index 00000000..34af4290 --- /dev/null +++ b/infrastructure/openai/openai_client.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import os +from typing import Optional + +from openai import OpenAI +from openai.types.chat import ChatCompletionMessageParam + +from infrastructure.openai.exceptions import OpenAiClientError + + +class OpenAiChatClient: + """Thin wrapper over the OpenAI Chat Completions API. + + Sends a single prompt and returns the assistant's reply as plain text. + """ + + DEFAULT_MODEL = "gpt-4o-mini" + + def __init__( + self, + api_key: Optional[str] = None, + model: Optional[str] = None, + ) -> None: + key = api_key or os.environ.get("OPENAI_API_KEY") + if not key: + raise OpenAiClientError( + "No OpenAI API key provided. " + "Pass api_key or set the OPENAI_API_KEY environment variable." + ) + self._client = OpenAI(api_key=key) + self._model = model or self.DEFAULT_MODEL + + def generate( + self, + prompt: str, + system_prompt: Optional[str] = None, + ) -> str: + """Send a prompt to the model and return its reply text. + + Args: + prompt: The user message to send. + system_prompt: Optional instruction that sets the model's behaviour. + + Raises: + OpenAiClientError: If the model returns an empty response. + """ + messages: list[ChatCompletionMessageParam] = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) + + response = self._client.chat.completions.create( + model=self._model, + messages=messages, + ) + content = response.choices[0].message.content + if content is None: + raise OpenAiClientError("OpenAI returned an empty response.") + return content diff --git a/infrastructure/postgres/engine.py b/infrastructure/postgres/engine.py index 0de9efcb..2558532e 100644 --- a/infrastructure/postgres/engine.py +++ b/infrastructure/postgres/engine.py @@ -1,3 +1,6 @@ +from collections.abc import Iterator +from contextlib import contextmanager + from sqlalchemy.engine import Engine from sqlmodel import Session, create_engine @@ -16,3 +19,42 @@ def make_engine(config: PostgresConfig) -> Engine: def make_session(engine: Engine) -> Session: return Session(engine) + + +@contextmanager # pyright: ignore[reportDeprecated] +def transactional_session(engine: Engine) -> Iterator[Session]: + """Yield a session whose lifecycle owns the transaction. + + On clean exit the session commits; on any exception it rolls back and + re-raises. Either way the session is closed. Callers in the application + layer can do their work inside the ``with`` block without ever invoking + ``.commit()`` / ``.rollback()`` themselves -- transaction semantics stay + in the infrastructure layer. + """ + session = Session(engine) + try: + yield session + session.commit() + except Exception: + session.rollback() + raise + finally: + session.close() + + +@contextmanager # pyright: ignore[reportDeprecated] +def commit_scope(session: Session) -> Iterator[Session]: + """Commit a caller-owned session on clean exit; roll back on error. + + Like ``transactional_session`` but for a session the caller already holds + and will close itself. Use it to keep slow, non-DB work *outside* the + transaction: build the session, run the slow work, then enter + ``commit_scope`` only for the persistence -- so a connection is checked out + (SQLModel sessions are lazy) for the shortest possible window. + """ + try: + yield session + session.commit() + except Exception: + session.rollback() + raise diff --git a/infrastructure/postgres/landlord_built_form_type_override_table.py b/infrastructure/postgres/landlord_built_form_type_override_table.py new file mode 100644 index 00000000..ec93ba27 --- /dev/null +++ b/infrastructure/postgres/landlord_built_form_type_override_table.py @@ -0,0 +1,69 @@ +"""SQLModel mirror of the ``landlord_built_form_type_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only +differences are the table name, the ``built_form_type`` pgEnum on ``value``, +and the unique-constraint name. +""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.epc.built_form_type import BuiltFormType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordBuiltFormTypeOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_built_form_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_built_form_type_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: BuiltFormType = Field( + sa_column=Column( + SAEnum( + BuiltFormType, + name="built_form_type", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. + source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/postgres/landlord_override_enums.py b/infrastructure/postgres/landlord_override_enums.py new file mode 100644 index 00000000..ba2cee94 --- /dev/null +++ b/infrastructure/postgres/landlord_override_enums.py @@ -0,0 +1,35 @@ +"""Shared pgEnum definitions used by every ``landlord_*_overrides`` row class. + +The ``override_source`` pgEnum is referenced by both +``landlord_property_type_overrides`` and ``landlord_wall_type_overrides`` +(per the Drizzle schema -- see ``landlord_overrides.ts``). Defining it once +here and reusing the same SQLAlchemy ``Enum`` instance across both row +classes keeps SQLModel's metadata coherent: ``create_all`` emits exactly one +``CREATE TYPE override_source`` statement, not two parallel ones colliding +on the same pgEnum name. +""" + +from __future__ import annotations + +from sqlalchemy import Enum as SAEnum + + +class OverrideSource: + """Mirror of the ``override_source`` pgEnum. + + Drizzle defines this as ``('classifier', 'user')`` in + ``landlord_overrides.ts``. Modelled here as string constants so callers + don't sprinkle magic strings; the column is constrained by Postgres, + and the only Python-side producer (the classifier path) writes the + literal ``OverrideSource.CLASSIFIER``. + """ + + CLASSIFIER = "classifier" + USER = "user" + + +override_source_sa_enum = SAEnum( + OverrideSource.CLASSIFIER, + OverrideSource.USER, + name="override_source", +) diff --git a/infrastructure/postgres/landlord_property_type_override_table.py b/infrastructure/postgres/landlord_property_type_override_table.py new file mode 100644 index 00000000..ae9377cd --- /dev/null +++ b/infrastructure/postgres/landlord_property_type_override_table.py @@ -0,0 +1,67 @@ +"""SQLModel mirror of the ``landlord_property_type_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. +""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.epc.property_type import PropertyType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordPropertyTypeOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_property_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_property_type_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: PropertyType = Field( + sa_column=Column( + SAEnum( + PropertyType, + name="property_type", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. + source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/postgres/landlord_roof_type_override_table.py b/infrastructure/postgres/landlord_roof_type_override_table.py new file mode 100644 index 00000000..58bd61ff --- /dev/null +++ b/infrastructure/postgres/landlord_roof_type_override_table.py @@ -0,0 +1,69 @@ +"""SQLModel mirror of the ``landlord_roof_type_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only +differences are the table name, the ``roof_type`` pgEnum on ``value``, and +the unique-constraint name. +""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.epc.roof_type import RoofType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordRoofTypeOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_roof_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_roof_type_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: RoofType = Field( + sa_column=Column( + SAEnum( + RoofType, + name="roof_type", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. + source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/postgres/landlord_wall_type_override_table.py b/infrastructure/postgres/landlord_wall_type_override_table.py new file mode 100644 index 00000000..b5097164 --- /dev/null +++ b/infrastructure/postgres/landlord_wall_type_override_table.py @@ -0,0 +1,69 @@ +"""SQLModel mirror of the ``landlord_wall_type_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only +differences are the table name, the ``wall_type`` pgEnum on ``value``, and +the unique-constraint name. +""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.epc.wall_type import WallType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordWallTypeOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_wall_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_wall_type_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: WallType = Field( + sa_column=Column( + SAEnum( + WallType, + name="wall_type", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. + source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/s3/__init__.py b/infrastructure/s3/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/infrastructure/s3/csv_s3_client.py b/infrastructure/s3/csv_s3_client.py new file mode 100644 index 00000000..67c9a8d4 --- /dev/null +++ b/infrastructure/s3/csv_s3_client.py @@ -0,0 +1,62 @@ +import csv +from io import StringIO + +from infrastructure.s3.s3_client import S3Client +from infrastructure.s3.s3_uri import parse_s3_uri + + +def _dedupe_fieldnames(fieldnames: list[str]) -> list[str]: + """Disambiguate repeated CSV headers by appending an index. + + The first occurrence keeps its name; each later one becomes + ``name_1``, ``name_2`` … so duplicate columns survive as distinct + keys instead of collapsing onto one (last-wins) dict entry. + """ + deduped: list[str] = [] + counts: dict[str, int] = {} + for name in fieldnames: + if name not in counts: + counts[name] = 0 + deduped.append(name) + continue + counts[name] += 1 + candidate = f"{name}_{counts[name]}" + while candidate in counts: + counts[name] += 1 + candidate = f"{name}_{counts[name]}" + counts[candidate] = 0 + deduped.append(candidate) + return deduped + + +class CsvS3Client(S3Client): + def read_rows(self, s3_uri: str) -> list[dict[str, str]]: + bucket, key = parse_s3_uri(s3_uri) + if bucket != self.bucket: + raise ValueError( + f"s3_uri bucket {bucket!r} does not match client bucket {self.bucket!r}" + ) + raw = self.get_object(key) + try: + text = raw.decode("utf-8-sig") + except UnicodeDecodeError: + # Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8. + text = raw.decode("cp1252") + + buffer = StringIO(text) + header = next(csv.reader(buffer), None) + if header is None: + return [] + fieldnames = _dedupe_fieldnames(header) + reader = csv.DictReader(buffer, fieldnames=fieldnames) + return [dict(row) for row in reader] + + def save_rows(self, rows: list[dict[str, str]], key: str) -> str: + if not rows: + raise ValueError("Cannot save an empty rows list: header is unknown") + buffer = StringIO() + fieldnames = list(rows[0].keys()) + writer = csv.DictWriter(buffer, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + return self.put_object(key, buffer.getvalue().encode("utf-8")) diff --git a/infrastructure/s3_client.py b/infrastructure/s3/s3_client.py similarity index 100% rename from infrastructure/s3_client.py rename to infrastructure/s3/s3_client.py diff --git a/infrastructure/s3_uri.py b/infrastructure/s3/s3_uri.py similarity index 100% rename from infrastructure/s3_uri.py rename to infrastructure/s3/s3_uri.py diff --git a/orchestration/classifiable_column.py b/orchestration/classifiable_column.py new file mode 100644 index 00000000..92334a38 --- /dev/null +++ b/orchestration/classifiable_column.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from typing import Generic, TypeVar + +from domain.data_transformation.column_classifier import ColumnClassifier +from repositories.landlord_overrides.landlord_override_repository import ( + LandlordOverrideRepository, +) + +E = TypeVar("E", bound=Enum) + + +@dataclass(frozen=True) +class ClassifiableColumn(Generic[E]): + """Pairs a column's classifier with the repository that persists its results. + + The orchestrator registers one ``ClassifiableColumn`` per + (source column, target enum) pair. Bundling the classifier and the + repository together makes the "this enum lands in this table" invariant + structural -- the handler can no longer wire ``PropertyType`` + classifications to a ``WallType`` repo by keying two dicts with the same + string. + + ``source_column`` is the landlord-CSV header to read from; ``name`` is the + unique key the orchestrator uses to report this classification's results + (and the key the handler logs). Two ``ClassifiableColumn``s may share a + ``source_column`` -- e.g. the ``"Property Type"`` CSV column feeds both + ``PropertyType`` and ``BuiltFormType`` classifiers off the same free-text + description -- but each must have a unique ``name``. + """ + + name: str + source_column: str + classifier: ColumnClassifier[E] + repo: LandlordOverrideRepository[E] diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py new file mode 100644 index 00000000..e43992cf --- /dev/null +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -0,0 +1,142 @@ +from enum import Enum +from typing import Any + +from domain.addresses.unstandardised_address import AddressList +from orchestration.classifiable_column import ClassifiableColumn +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, +) + + +class LandlordDescriptionOverridesOrchestrator: + def __init__( + self, + unstandardised_address_repo: UnstandardisedAddressListRepository, + columns: list[ClassifiableColumn[Any]], + ) -> None: + self._unstandardised_address_repo = unstandardised_address_repo + # Each entry is one (source CSV column, target enum) classification. + # Two entries may share ``source_column`` -- e.g. ``"Property Type"`` + # feeds both PropertyType and BuiltFormType classifiers -- so the + # registry is a list rather than a dict keyed by header. + self._columns = columns + + def get_unstandardised_addresses( + self, + input_s3_uri: str, + ) -> AddressList: + return self._unstandardised_address_repo.load_batch(input_s3_uri) + + def get_col_to_description_mappings( + self, list_of_unstandardised_address: AddressList + ) -> dict[str, set[str]]: + mappings: dict[str, set[str]] = {} + for unstandardised_address in list_of_unstandardised_address: + for key, value in unstandardised_address.additional_info.items(): + bucket = mappings.setdefault(key, set()) + # A comma-separated value is several descriptions in one cell; + # split it so each is its own entry. Lower-case so case-only + # typos collapse to one variant. + for variant in value.split(","): + variant = variant.strip().lower() + if variant: + bucket.add(variant) + return mappings + + def classify_columns( + self, addresses: AddressList + ) -> dict[str, dict[str, Enum]]: + """Classify every registered column's descriptions. + + Returns a mapping of ``ClassifiableColumn.name`` to + ``{description: category}``. A registered column whose ``source_column`` + is absent from the addresses contributes an empty inner mapping. + """ + col_to_desc = self.get_col_to_description_mappings(addresses) + return { + column.name: column.classifier.classify( + col_to_desc.get(column.source_column, set()) + ) + for column in self._columns + } + + def persist( + self, classified: dict[str, dict[str, Enum]], portfolio_id: int + ) -> None: + """Persist already-classified results via each column's repository. + + ``classified`` is keyed by ``ClassifiableColumn.name`` -- the shape + ``classify_columns`` and ``classify_from_rows`` return. Each non-empty + mapping is written through the column's own repo under + ``source = 'classifier'``; an empty mapping (a registered column absent + from this batch) skips the DB round-trip. + + The orchestrator does not commit -- the caller owns the transaction + boundary, and is expected to open it only around this call so the + slow classification never holds a connection. + """ + for column in self._columns: + mapping = classified.get(column.name) + if not mapping: + continue + column.repo.upsert_all(portfolio_id, mapping) + + def classify_and_persist( + self, addresses: AddressList, portfolio_id: int + ) -> dict[str, dict[str, Enum]]: + """Classify every registered column and persist the results. + + Returns the same shape as ``classify_columns`` so callers can log + per-column counts. + """ + classified = self.classify_columns(addresses) + self.persist(classified, portfolio_id) + return classified + + def classify_from_rows( + self, rows: list[dict[str, str]] + ) -> dict[str, dict[str, Enum]]: + """Classify raw CSV rows without touching the database. + + The classification half of ``classify_and_persist_from_rows``, split + out so a caller can run the slow ChatGPT work *before* opening a + transaction and then write the finished results with ``persist`` inside + one short-lived connection. + + Unlike the ``AddressList`` path this builds no ``AddressList``, so it + has no canonical address/postcode requirement -- the classifier only + needs the raw description cells. Used when reading the original + landlord upload (raw headers) rather than the address-matching CSV. + """ + col_to_desc = self._descriptions_from_rows(rows) + return { + column.name: column.classifier.classify( + col_to_desc.get(column.source_column, set()) + ) + for column in self._columns + } + + def classify_and_persist_from_rows( + self, rows: list[dict[str, str]], portfolio_id: int + ) -> dict[str, dict[str, Enum]]: + """Classify + persist straight from raw CSV rows in one call. + + A convenience composition of ``classify_from_rows`` + ``persist``. + Prefer calling the two separately when classification is slow, so the + transaction opens only around ``persist`` (see the Lambda handler). + """ + classified = self.classify_from_rows(rows) + self.persist(classified, portfolio_id) + return classified + + @staticmethod + def _descriptions_from_rows(rows: list[dict[str, str]]) -> dict[str, set[str]]: + mappings: dict[str, set[str]] = {} + for row in rows: + for key, value in row.items(): + bucket = mappings.setdefault(key, set()) + for variant in (value or "").split(","): + variant = variant.strip().lower() + if variant: + bucket.add(variant) + return mappings diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py index 36f4b515..1a7277d5 100644 --- a/orchestration/postcode_splitter_orchestrator.py +++ b/orchestration/postcode_splitter_orchestrator.py @@ -5,19 +5,21 @@ from uuid import UUID from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from orchestration.task_orchestrator import TaskOrchestrator from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from repositories.user_address.user_address_repository import UserAddressRepository +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, +) class PostcodeSplitterOrchestrator: def __init__( self, task_orchestrator: TaskOrchestrator, - user_address_repo: UserAddressRepository, + unstandardised_address_repo: UnstandardisedAddressListRepository, queue_client: Address2UprnQueueClient, max_batch_size: int = 500, ) -> None: self._task_orchestrator = task_orchestrator - self._user_address_repo = user_address_repo + self._unstandardised_address_repo = unstandardised_address_repo self._queue_client = queue_client self._max_batch_size = max_batch_size @@ -28,7 +30,7 @@ class PostcodeSplitterOrchestrator: parent_subtask_id: UUID, input_s3_uri: str, ) -> list[UUID]: - addresses = self._user_address_repo.load_batch(input_s3_uri) + addresses = self._unstandardised_address_repo.load_batch(input_s3_uri) path_prefix = ( f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}" ) @@ -37,7 +39,7 @@ class PostcodeSplitterOrchestrator: for batch in iter_postcode_grouped_batches( addresses, max_batch_size=self._max_batch_size ): - batch_uri = self._user_address_repo.save_batch(batch, path_prefix) + batch_uri = self._unstandardised_address_repo.save_batch(batch, path_prefix) child = self._task_orchestrator.create_child_subtask( parent_task_id, inputs={ diff --git a/pytest.ini b/pytest.ini index 7e2eae9a..47b40c17 100644 --- a/pytest.ini +++ b/pytest.ini @@ -26,5 +26,6 @@ testpaths = etl/hubspot/tests etl/spatial/tests domain/sap10_ml/tests + tests/ markers = integration: mark a test as an integration test diff --git a/repositories/landlord_overrides/__init__.py b/repositories/landlord_overrides/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/repositories/landlord_overrides/landlord_override_repository.py b/repositories/landlord_overrides/landlord_override_repository.py new file mode 100644 index 00000000..ef1fed9d --- /dev/null +++ b/repositories/landlord_overrides/landlord_override_repository.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from enum import Enum +from typing import Generic, TypeVar + +E = TypeVar("E", bound=Enum) + + +class LandlordOverrideRepository(ABC, Generic[E]): + """Port: persists landlord (description -> category) overrides for a portfolio. + + One repository implementation targets one ``landlord__overrides`` + table. The category enum ``E`` (e.g. ``PropertyType``, ``WallType``) determines + which table the adapter writes to; the orchestrator depends only on this + interface and never names a concrete table. + + A single concrete adapter, + ``infrastructure/landlord_overrides/landlord_overrides_postgres_repository.LandlordOverridesRepository``, + serves every category (see ADR-0003) -- it is parameterised by the + SQLModel row class for the target table. + """ + + @abstractmethod + def upsert_all( + self, + portfolio_id: int, + descriptions_to_values: dict[str, E], + ) -> None: + """Upsert each ``(portfolio_id, description) -> value`` row with ``source='classifier'``. + + On conflict with an existing row whose ``source = 'classifier'``, the row + is updated (value, source, updated_at). On conflict with a row whose + ``source = 'user'``, the existing row is preserved -- the classifier + never overwrites a user correction. See ADR-0003 §Decision. + + An empty ``descriptions_to_values`` mapping is a no-op; callers may + skip this call entirely when they have nothing to write. + """ + ... diff --git a/repositories/unstandardised_address/__init__.py b/repositories/unstandardised_address/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py similarity index 66% rename from repositories/user_address/user_address_csv_s3_repository.py rename to repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py index 058fd5a5..20bae20c 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py @@ -4,10 +4,12 @@ import uuid from datetime import datetime, timezone from typing import Optional -from domain.addresses.user_address import UserAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode -from infrastructure.csv_s3_client import CsvS3Client -from repositories.user_address.user_address_repository import UserAddressRepository +from infrastructure.s3.csv_s3_client import CsvS3Client +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, +) _ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3") _POSTCODE_COLUMN: str = "postcode" @@ -15,42 +17,45 @@ _INTERNAL_REFERENCE_COLUMN: str = "Internal Reference" _POSTCODE_CLEAN_COLUMN: str = "postcode_clean" -class UserAddressCsvS3Repository(UserAddressRepository): +class UnstandardisedAddressListCsvS3Repository(UnstandardisedAddressListRepository): def __init__(self, csv_client: CsvS3Client, bucket: str) -> None: self._csv_client = csv_client self._bucket = bucket - def load_batch(self, s3_uri: str) -> list[UserAddress]: + def load_batch(self, s3_uri: str) -> AddressList: rows = self._csv_client.read_rows(s3_uri) if rows and _POSTCODE_COLUMN not in rows[0]: raise ValueError( f"Input CSV {s3_uri} has no {_POSTCODE_COLUMN!r} column; " f"columns present: {sorted(rows[0])}" ) - addresses: list[UserAddress] = [] + addresses: AddressList = AddressList([]) for row in rows: parts = [ row[col].strip() for col in _ADDRESS_COLUMNS if col in row and row[col].strip() ] - user_address = ", ".join(parts) + unstandardised_address = ", ".join(parts) postcode = row.get(_POSTCODE_COLUMN, "") raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip() internal_reference: Optional[str] = raw_ref or None addresses.append( - UserAddress( - user_address=user_address, + UnstandardisedAddress( + address=unstandardised_address, postcode=Postcode(postcode), - internal_reference=internal_reference, - source_row=row, + org_reference=internal_reference, + additional_info=row, ) ) return addresses - def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: + def save_batch(self, addresses: AddressList, path_prefix: str) -> str: rows: list[dict[str, str]] = [ - {**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)} + { + **addr.additional_info, + _POSTCODE_CLEAN_COLUMN: str(addr.postcode), + } for addr in addresses ] diff --git a/repositories/unstandardised_address/unstandardised_address_list_repository.py b/repositories/unstandardised_address/unstandardised_address_list_repository.py new file mode 100644 index 00000000..4d446304 --- /dev/null +++ b/repositories/unstandardised_address/unstandardised_address_list_repository.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from domain.addresses.unstandardised_address import AddressList + + +class UnstandardisedAddressListRepository(ABC): + @abstractmethod + def load_batch(self, s3_uri: str) -> AddressList: ... + + @abstractmethod + def save_batch(self, addresses: AddressList, path_prefix: str) -> str: ... diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py deleted file mode 100644 index b2c0f866..00000000 --- a/repositories/user_address/user_address_repository.py +++ /dev/null @@ -1,13 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod - -from domain.addresses.user_address import UserAddress - - -class UserAddressRepository(ABC): - @abstractmethod - def load_batch(self, s3_uri: str) -> list[UserAddress]: ... - - @abstractmethod - def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: ... diff --git a/test.requirements.txt b/test.requirements.txt index 26125034..c5b71977 100644 --- a/test.requirements.txt +++ b/test.requirements.txt @@ -10,4 +10,5 @@ fuzzywuzzy pymupdf playwright==1.58.0 msal -moto[s3,sqs] \ No newline at end of file +moto[s3,sqs] +openai==1.93.0 \ No newline at end of file diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index 8ffcf1b5..e5b3e186 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -1,17 +1,17 @@ import pytest from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from domain.addresses.user_address import UserAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode -def _addrs(postcode: str, n: int) -> list[UserAddress]: - return [ - UserAddress( - user_address=f"{i} {postcode} Street", postcode=Postcode(postcode) - ) - for i in range(n) - ] +def _addrs(postcode: str, n: int) -> AddressList: + return AddressList( + [ + UnstandardisedAddress(address=f"{i} {postcode} Street", postcode=Postcode(postcode)) + for i in range(n) + ] + ) def test_empty_input_yields_no_batches() -> None: @@ -74,9 +74,7 @@ def test_oversize_group_flushes_existing_buffer_first() -> None: big = _addrs("BB2 2BB", 7) tail = _addrs("CC3 3CC", 1) # act - batches = list( - iter_postcode_grouped_batches(small + big + tail, max_batch_size=5) - ) + batches = list(iter_postcode_grouped_batches(small + big + tail, max_batch_size=5)) # assert assert len(batches) == 3 assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"] diff --git a/tests/domain/addresses/test_unstandardised_address.py b/tests/domain/addresses/test_unstandardised_address.py new file mode 100644 index 00000000..dd4eabdb --- /dev/null +++ b/tests/domain/addresses/test_unstandardised_address.py @@ -0,0 +1,96 @@ +import dataclasses + +import pytest + +from domain.addresses.unstandardised_address import UnstandardisedAddress +from domain.postcode import Postcode + + +def test_unstandardised_address_holds_postcode_value_object() -> None: + # act + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) + # assert + assert addr.postcode == Postcode("SW1A1AA") + + +def test_unstandardised_address_preserves_unstandardised_address_verbatim() -> None: + # The free-text unstandardised_address string is intentionally NOT normalised -- + # only the postcode is canonicalised, and that happens inside Postcode. + # act + addr = UnstandardisedAddress(address=" 1 The Street ", postcode=Postcode("SW1A1AA")) + # assert + assert addr.address == " 1 The Street " + + +def test_unstandardised_address_internal_reference_defaults_to_none() -> None: + # act + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + # assert + assert addr.org_reference is None + + +def test_unstandardised_address_internal_reference_accepted() -> None: + # act + addr = UnstandardisedAddress( + address="1 The Street", + postcode=Postcode("SW1A1AA"), + org_reference="cust-42", + ) + # assert + assert addr.org_reference == "cust-42" + + +def test_unstandardised_address_is_frozen() -> None: + # arrange + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + # act / assert + with pytest.raises(dataclasses.FrozenInstanceError): + addr.postcode = Postcode("OTHER") # type: ignore[misc] + + +def test_unstandardised_address_equality_uses_canonical_postcode() -> None: + # Postcode sanitises eagerly, so addresses built from different surface + # forms of the same postcode compare equal. + # arrange + a = UnstandardisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) + b = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + # act / assert + assert a == b + + +def test_unstandardised_address_source_row_defaults_to_empty_dict() -> None: + # act + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + # assert + assert addr.additional_info == {} + + +def test_unstandardised_address_carries_source_row() -> None: + # arrange + row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} + # act + addr = UnstandardisedAddress( + address="1 The Street", + postcode=Postcode("SW1A 1AA"), + additional_info=row, + ) + # assert + assert addr.additional_info == row + + +def test_unstandardised_address_equality_ignores_source_row() -> None: + # source_row is excluded from equality (and hashing): identity stays + # defined by the parsed fields. + # arrange + a = UnstandardisedAddress( + address="1 The Street", + postcode=Postcode("SW1A1AA"), + additional_info={"x": "1"}, + ) + b = UnstandardisedAddress( + address="1 The Street", + postcode=Postcode("SW1A1AA"), + additional_info={"y": "2"}, + ) + # act / assert + assert a == b diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py deleted file mode 100644 index 8d092df3..00000000 --- a/tests/domain/addresses/test_user_address.py +++ /dev/null @@ -1,98 +0,0 @@ -import dataclasses - -import pytest - -from domain.addresses.user_address import UserAddress -from domain.postcode import Postcode - - -def test_user_address_holds_postcode_value_object() -> None: - # act - addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) - # assert - assert addr.postcode == Postcode("SW1A1AA") - - -def test_user_address_preserves_user_address_verbatim() -> None: - # The free-text user_address string is intentionally NOT normalised -- - # only the postcode is canonicalised, and that happens inside Postcode. - # act - addr = UserAddress( - user_address=" 1 The Street ", postcode=Postcode("SW1A1AA") - ) - # assert - assert addr.user_address == " 1 The Street " - - -def test_user_address_internal_reference_defaults_to_none() -> None: - # act - addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) - # assert - assert addr.internal_reference is None - - -def test_user_address_internal_reference_accepted() -> None: - # act - addr = UserAddress( - user_address="1 The Street", - postcode=Postcode("SW1A1AA"), - internal_reference="cust-42", - ) - # assert - assert addr.internal_reference == "cust-42" - - -def test_user_address_is_frozen() -> None: - # arrange - addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) - # act / assert - with pytest.raises(dataclasses.FrozenInstanceError): - addr.postcode = Postcode("OTHER") # type: ignore[misc] - - -def test_user_address_equality_uses_canonical_postcode() -> None: - # Postcode sanitises eagerly, so addresses built from different surface - # forms of the same postcode compare equal. - # arrange - a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) - b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) - # act / assert - assert a == b - - -def test_user_address_source_row_defaults_to_empty_dict() -> None: - # act - addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) - # assert - assert addr.source_row == {} - - -def test_user_address_carries_source_row() -> None: - # arrange - row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} - # act - addr = UserAddress( - user_address="1 The Street", - postcode=Postcode("SW1A 1AA"), - source_row=row, - ) - # assert - assert addr.source_row == row - - -def test_user_address_equality_ignores_source_row() -> None: - # source_row is excluded from equality (and hashing): identity stays - # defined by the parsed fields. - # arrange - a = UserAddress( - user_address="1 The Street", - postcode=Postcode("SW1A1AA"), - source_row={"x": "1"}, - ) - b = UserAddress( - user_address="1 The Street", - postcode=Postcode("SW1A1AA"), - source_row={"y": "2"}, - ) - # act / assert - assert a == b diff --git a/tests/infrastructure/chatgpt/__init__.py b/tests/infrastructure/chatgpt/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py new file mode 100644 index 00000000..425d2625 --- /dev/null +++ b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +from typing import Optional + +import pytest + +from domain.data_transformation.column_classifier import ClassificationError +from domain.epc.property_type import PropertyType +from domain.epc.wall_type import WallType +from infrastructure.chatgpt.chatgpt import ChatGPT +from infrastructure.chatgpt.chatgpt_column_classifier import ( + ChatGptColumnClassifier, +) +from infrastructure.chatgpt.exceptions import ChatGPTClientError + + +class _FakeChatGPT(ChatGPT): + """Hand-written ChatGPT stand-in: returns a canned reply, records prompts.""" + + def __init__( + self, + reply: str = "{}", + error: Optional[Exception] = None, + ) -> None: + self.prompts: list[str] = [] + self.system_prompts: list[Optional[str]] = [] + self._reply = reply + self._error = error + + def generate(self, prompt: str, system_prompt: Optional[str] = None) -> str: + self.prompts.append(prompt) + self.system_prompts.append(system_prompt) + if self._error is not None: + raise self._error + return self._reply + + +def _property_type_classifier( + chat_gpt: ChatGPT, +) -> ChatGptColumnClassifier[PropertyType]: + return ChatGptColumnClassifier(chat_gpt, PropertyType, PropertyType.UNKNOWN) + + +def test_classifies_description_into_its_category() -> None: + # Arrange + chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}') + classifier = _property_type_classifier(chat_gpt) + + # Act + result = classifier.classify({"semi-detached"}) + + # Assert + assert result == {"semi-detached": PropertyType.HOUSE} + + +def test_classifies_when_reply_is_wrapped_in_a_markdown_fence() -> None: + # Arrange: ChatGPT wraps the JSON in a ```json ... ``` code fence. + chat_gpt = _FakeChatGPT(reply='```json\n{"semi-detached": "House"}\n```') + classifier = _property_type_classifier(chat_gpt) + + # Act + result = classifier.classify({"semi-detached"}) + + # Assert + assert result == {"semi-detached": PropertyType.HOUSE} + + +def test_unrecognised_category_maps_to_unknown() -> None: + # Arrange + chat_gpt = _FakeChatGPT(reply='{"garden shed": "Shed"}') + classifier = _property_type_classifier(chat_gpt) + + # Act + result = classifier.classify({"garden shed"}) + + # Assert + assert result == {"garden shed": PropertyType.UNKNOWN} + + +def test_description_omitted_from_reply_maps_to_unknown() -> None: + # Arrange: the reply classifies one description but not the other. + chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}') + classifier = _property_type_classifier(chat_gpt) + + # Act + result = classifier.classify({"semi-detached", "TBC"}) + + # Assert + assert result == { + "semi-detached": PropertyType.HOUSE, + "TBC": PropertyType.UNKNOWN, + } + + +def test_chatgpt_failure_raises_classification_error() -> None: + # Arrange + chat_gpt = _FakeChatGPT(error=ChatGPTClientError("backend unreachable")) + classifier = _property_type_classifier(chat_gpt) + + # Act / Assert + with pytest.raises(ClassificationError): + classifier.classify({"semi-detached"}) + + +def test_non_json_reply_raises_classification_error_with_the_raw_reply() -> None: + # Arrange + chat_gpt = _FakeChatGPT(reply="sorry, I can't do that") + classifier = _property_type_classifier(chat_gpt) + + # Act / Assert: the error surfaces the offending reply for diagnosis. + with pytest.raises(ClassificationError, match="sorry, I can't do that"): + classifier.classify({"semi-detached"}) + + +def test_empty_description_set_returns_empty_without_calling_chatgpt() -> None: + # Arrange + chat_gpt = _FakeChatGPT(reply='{"unused": "House"}') + classifier = _property_type_classifier(chat_gpt) + + # Act + result = classifier.classify(set()) + + # Assert + assert result == {} + assert chat_gpt.prompts == [] + + +def test_classifies_with_a_different_category_enum() -> None: + # Arrange: the same adapter classifies a WallType column. + chat_gpt = _FakeChatGPT( + reply='{"solid brick wall": "Solid brick, as built, no insulation (assumed)"}' + ) + classifier = ChatGptColumnClassifier(chat_gpt, WallType, WallType.UNKNOWN) + + # Act + result = classifier.classify({"solid brick wall"}) + + # Assert + assert result == { + "solid brick wall": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED + } + + +def test_extra_instructions_are_appended_to_the_system_prompt() -> None: + # Arrange: column-specific guidance (e.g. wall-type build-era hints) + # should reach the model verbatim, in the system prompt ahead of the + # JSON-output instruction. + chat_gpt = _FakeChatGPT(reply='{"1970s semi": "House"}') + classifier = ChatGptColumnClassifier( + chat_gpt, + PropertyType, + PropertyType.UNKNOWN, + extra_instructions="If the description carries a build decade, prefer X.", + ) + + # Act + classifier.classify({"1970s semi"}) + + # Assert: the hint sits in the system prompt, before the JSON instruction. + system_prompt = chat_gpt.system_prompts[0] + assert system_prompt is not None + assert "If the description carries a build decade, prefer X." in system_prompt + hint_index = system_prompt.index("If the description carries a build decade") + json_index = system_prompt.index("Reply with only a JSON object") + assert hint_index < json_index + + +def test_omitting_extra_instructions_leaves_the_system_prompt_unchanged() -> None: + # Arrange: a classifier without per-column guidance must still produce + # the original system prompt -- no trailing whitespace, no orphan hint. + chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}') + classifier = ChatGptColumnClassifier(chat_gpt, PropertyType, PropertyType.UNKNOWN) + + # Act + classifier.classify({"semi-detached"}) + + # Assert + system_prompt = chat_gpt.system_prompts[0] + assert system_prompt is not None + assert system_prompt == ( + "Classify each free-text description into exactly one category. " + "Categories: House, Bungalow, Flat, Maisonette, Park home. " + "Reply with only a JSON object mapping each original description " + "to its category, and nothing else." + ) diff --git a/tests/infrastructure/test_csv_s3_client.py b/tests/infrastructure/test_csv_s3_client.py index 30e27164..048a1cbe 100644 --- a/tests/infrastructure/test_csv_s3_client.py +++ b/tests/infrastructure/test_csv_s3_client.py @@ -3,7 +3,7 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from infrastructure.csv_s3_client import CsvS3Client +from infrastructure.s3.csv_s3_client import CsvS3Client from tests.infrastructure import make_boto_client BUCKET = "csv-bucket" @@ -49,3 +49,36 @@ def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None: # act / assert with pytest.raises(ValueError, match="does not match client bucket"): csv_client.read_rows("s3://other-bucket/uploads/addresses.csv") + + +def test_read_rows_indexes_duplicate_column_names(csv_client: CsvS3Client) -> None: + # arrange: the Hyde export has two columns both headed "Walls" — a + # description and a score. Without disambiguation csv.DictReader would + # collapse them onto one key and the description would be lost. + raw = "Address 1,Walls,Roofs,Walls\n1 High St,Cavity: Filled,Pitched 300mm,9.6\n" + uri = csv_client.put_object("uploads/dup.csv", raw.encode("utf-8")) + + # act + rows = csv_client.read_rows(uri) + + # assert: the first occurrence keeps its name, the second gets an index. + assert rows == [ + { + "Address 1": "1 High St", + "Walls": "Cavity: Filled", + "Roofs": "Pitched 300mm", + "Walls_1": "9.6", + } + ] + + +def test_read_rows_indexes_each_repeat_of_a_column(csv_client: CsvS3Client) -> None: + # arrange: three columns sharing one header. + raw = "Walls,Walls,Walls\nfirst,second,third\n" + uri = csv_client.put_object("uploads/triple.csv", raw.encode("utf-8")) + + # act + rows = csv_client.read_rows(uri) + + # assert + assert rows == [{"Walls": "first", "Walls_1": "second", "Walls_2": "third"}] diff --git a/tests/infrastructure/test_s3_client.py b/tests/infrastructure/test_s3_client.py index 67db4f58..bdac6be1 100644 --- a/tests/infrastructure/test_s3_client.py +++ b/tests/infrastructure/test_s3_client.py @@ -3,7 +3,7 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from infrastructure.s3_client import S3Client +from infrastructure.s3.s3_client import S3Client from tests.infrastructure import make_boto_client BUCKET = "test-bucket" diff --git a/tests/infrastructure/test_s3_uri.py b/tests/infrastructure/test_s3_uri.py index 32fd710f..f0865865 100644 --- a/tests/infrastructure/test_s3_uri.py +++ b/tests/infrastructure/test_s3_uri.py @@ -1,6 +1,6 @@ import pytest -from infrastructure.s3_uri import parse_s3_uri +from infrastructure.s3.s3_uri import parse_s3_uri def test_parses_simple_s3_uri() -> None: diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py new file mode 100644 index 00000000..bf5b13ce --- /dev/null +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -0,0 +1,469 @@ +from __future__ import annotations + +from enum import Enum +from typing import Any, Optional + +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress +from domain.epc.built_form_type import BuiltFormType +from domain.epc.property_type import PropertyType +from domain.epc.wall_type import WallType +from domain.postcode import Postcode +from domain.data_transformation.column_classifier import ColumnClassifier +from orchestration.classifiable_column import ClassifiableColumn +from orchestration.landlord_description_overrides_orchestrator import ( + LandlordDescriptionOverridesOrchestrator, +) +from repositories.landlord_overrides.landlord_override_repository import ( + LandlordOverrideRepository, +) +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, +) + + +class _StubUnstandardisedAddressRepository(UnstandardisedAddressListRepository): + """``get_col_to_description_mappings`` never touches the repo.""" + + def load_batch(self, s3_uri: str) -> AddressList: + raise NotImplementedError() + + def save_batch(self, addresses: AddressList, path_prefix: str) -> str: + raise NotImplementedError() + + +class _StubColumnClassifier(ColumnClassifier[Enum]): + """Records the descriptions it received; returns a canned mapping.""" + + def __init__(self, result: dict[str, Enum]) -> None: + self.received: Optional[set[str]] = None + self._result = result + + def classify(self, descriptions: set[str]) -> dict[str, Enum]: + self.received = descriptions + return self._result + + +class _StubLandlordOverrideRepository(LandlordOverrideRepository[Enum]): + """Records every ``upsert_all`` call so tests can assert routing.""" + + def __init__(self) -> None: + self.calls: list[tuple[int, dict[str, Enum]]] = [] + + def upsert_all( + self, portfolio_id: int, descriptions_to_values: dict[str, Enum] + ) -> None: + self.calls.append((portfolio_id, dict(descriptions_to_values))) + + +def _make_unstandardised_address( + landlord_additional_info: dict[str, str], +) -> UnstandardisedAddress: + return UnstandardisedAddress( + address="1 High St", + postcode=Postcode("AA1 1AA"), + additional_info=landlord_additional_info, + ) + + +def _orchestrator( + columns: Optional[list[ClassifiableColumn[Any]]] = None, +) -> LandlordDescriptionOverridesOrchestrator: + return LandlordDescriptionOverridesOrchestrator( + unstandardised_address_repo=_StubUnstandardisedAddressRepository(), + columns=columns or [], + ) + + +def _column( + name: str, + source_column: str, + classifier: ColumnClassifier[Any], + repo: Optional[LandlordOverrideRepository[Any]] = None, +) -> ClassifiableColumn[Any]: + return ClassifiableColumn( + name=name, + source_column=source_column, + classifier=classifier, + repo=repo or _StubLandlordOverrideRepository(), + ) + + +def test_collects_every_value_per_shared_key() -> None: + # arrange: every address carries the same keys, all values distinct. + addresses = AddressList( + [ + _make_unstandardised_address({"description": "cosy", "condition": "new"}), + _make_unstandardised_address({"description": "spacious", "condition": "worn"}), + _make_unstandardised_address({"description": "bright", "condition": "fair"}), + ] + ) + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert + assert mappings == { + "description": {"cosy", "spacious", "bright"}, + "condition": {"new", "worn", "fair"}, + } + + +def test_repeated_values_collapse_to_one_variant() -> None: + # arrange: two addresses share the same wall description. + addresses = AddressList( + [ + _make_unstandardised_address({"description": "cosy"}), + _make_unstandardised_address({"description": "cosy"}), + _make_unstandardised_address({"description": "bright"}), + ] + ) + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert: a set keeps one entry per distinct variant. + assert mappings == {"description": {"cosy", "bright"}} + + +def test_case_only_variants_collapse_to_one() -> None: + # arrange: the same description typed with inconsistent casing. + addresses = AddressList( + [ + _make_unstandardised_address({"description": "Cosy"}), + _make_unstandardised_address({"description": "cosy"}), + _make_unstandardised_address({"description": "COSY"}), + ] + ) + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert: lower-casing folds the casing typos into one variant. + assert mappings == {"description": {"cosy"}} + + +def test_comma_separated_value_splits_into_individual_entries() -> None: + # arrange: a single cell packs several descriptions, comma-separated. + addresses = AddressList( + [_make_unstandardised_address({"description": "cosy, bright, COSY"})] + ) + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert: each comma-separated part is its own trimmed, lower-cased entry. + assert mappings == {"description": {"cosy", "bright"}} + + +def test_empty_address_list_yields_empty_mapping() -> None: + # arrange / act + mappings = _orchestrator().get_col_to_description_mappings(AddressList([])) + + # assert + assert mappings == {} + + +def test_single_address_yields_single_value_per_key() -> None: + # arrange + addresses = AddressList([_make_unstandardised_address({"description": "cosy"})]) + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert + assert mappings == {"description": {"cosy"}} + + +def test_classify_columns_classifies_each_registered_column() -> None: + # arrange: addresses carry two classifiable columns. + addresses = AddressList( + [ + _make_unstandardised_address( + {"Property Type": "semi-detached", "Walls": "solid brick"} + ), + ] + ) + property_types = _StubColumnClassifier( + result={"semi-detached": PropertyType.HOUSE} + ) + wall_types = _StubColumnClassifier(result={"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}) + + # act + result = _orchestrator( + [ + _column("property_type", "Property Type", property_types), + _column("wall_type", "Walls", wall_types), + ] + ).classify_columns(addresses) + + # assert: each registered column was classified independently, keyed by name. + assert result == { + "property_type": {"semi-detached": PropertyType.HOUSE}, + "wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}, + } + + +def test_classify_columns_yields_empty_mapping_for_an_absent_column() -> None: + # arrange: a classifier is registered for a source column the addresses lack. + addresses = AddressList([_make_unstandardised_address({"Walls": "cavity"})]) + property_types = _StubColumnClassifier(result={}) + + # act + result = _orchestrator( + [_column("property_type", "Property Type", property_types)] + ).classify_columns(addresses) + + # assert: the absent column classified an empty description set. + assert result == {"property_type": {}} + assert property_types.received == set() + + +def test_classify_columns_runs_two_classifiers_against_a_shared_source_column() -> None: + # arrange: the "Property Type" landlord column feeds two classifiers -- + # PropertyType (what kind of dwelling) and BuiltFormType (how it joins + # to neighbours). Both must run against the same description set; each + # result is keyed by its column's ``name``. + addresses = AddressList( + [_make_unstandardised_address({"Property Type": "semi-detached house"})] + ) + property_types = _StubColumnClassifier( + result={"semi-detached house": PropertyType.HOUSE} + ) + built_form_types = _StubColumnClassifier( + result={"semi-detached house": BuiltFormType.SEMI_DETACHED} + ) + + # act + result = _orchestrator( + [ + _column("property_type", "Property Type", property_types), + _column("built_form_type", "Property Type", built_form_types), + ] + ).classify_columns(addresses) + + # assert: both classifiers saw the same description set, and the two + # results live under their own ``name`` keys without colliding. + assert property_types.received == {"semi-detached house"} + assert built_form_types.received == {"semi-detached house"} + assert result == { + "property_type": {"semi-detached house": PropertyType.HOUSE}, + "built_form_type": {"semi-detached house": BuiltFormType.SEMI_DETACHED}, + } + + +def test_classify_and_persist_writes_each_columns_mapping_to_its_own_repo() -> None: + # arrange: two columns with distinct repos -- the orchestrator must + # route each column's classifications to its own repo, not mix them. + addresses = AddressList( + [ + _make_unstandardised_address( + {"Property Type": "semi-detached", "Walls": "solid brick"} + ), + ] + ) + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + columns: list[ClassifiableColumn[Any]] = [ + _column( + "property_type", + "Property Type", + _StubColumnClassifier({"semi-detached": PropertyType.HOUSE}), + property_type_repo, + ), + _column( + "wall_type", + "Walls", + _StubColumnClassifier({"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}), + wall_type_repo, + ), + ] + + # act + result = _orchestrator(columns).classify_and_persist(addresses, portfolio_id=42) + + # assert: each repo received exactly its own column's mapping, under the + # given portfolio_id, and the return value mirrors classify_columns. + assert property_type_repo.calls == [(42, {"semi-detached": PropertyType.HOUSE})] + assert wall_type_repo.calls == [ + (42, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}) + ] + assert result == { + "property_type": {"semi-detached": PropertyType.HOUSE}, + "wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}, + } + + +def test_classify_and_persist_skips_upsert_for_a_column_absent_from_the_batch() -> None: + # arrange: ``Walls`` is registered but the address has no ``Walls`` column. + # The orchestrator should still classify (yielding an empty mapping) but + # must NOT call ``upsert_all`` -- an empty bulk insert is a noisy no-op. + addresses = AddressList( + [_make_unstandardised_address({"Property Type": "semi-detached"})] + ) + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + columns: list[ClassifiableColumn[Any]] = [ + _column( + "property_type", + "Property Type", + _StubColumnClassifier({"semi-detached": PropertyType.HOUSE}), + property_type_repo, + ), + _column( + "wall_type", + "Walls", + _StubColumnClassifier({}), + wall_type_repo, + ), + ] + + # act + _orchestrator(columns).classify_and_persist(addresses, portfolio_id=7) + + # assert: Property Type wrote; Walls did not. + assert property_type_repo.calls == [(7, {"semi-detached": PropertyType.HOUSE})] + assert wall_type_repo.calls == [] + + +def test_classify_from_rows_classifies_each_column_without_persisting() -> None: + # arrange: raw CSV rows (not an AddressList) carry two classifiable columns. + rows = [{"Property Type": "semi-detached", "Walls": "solid brick"}] + property_types = _StubColumnClassifier({"semi-detached": PropertyType.HOUSE}) + wall_types = _StubColumnClassifier( + {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED} + ) + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + + # act + result = _orchestrator( + [ + _column("property_type", "Property Type", property_types, property_type_repo), + _column("wall_type", "Walls", wall_types, wall_type_repo), + ] + ).classify_from_rows(rows) + + # assert: each classifier ran against its column's descriptions, keyed by + # name -- and NOT a single repo was touched (classification is DB-free, so + # the slow ChatGPT work can run before any transaction opens). + assert result == { + "property_type": {"semi-detached": PropertyType.HOUSE}, + "wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}, + } + assert property_type_repo.calls == [] + assert wall_type_repo.calls == [] + + +def test_classify_from_rows_splits_and_normalises_descriptions() -> None: + # arrange: one cell packs several descriptions with inconsistent casing, + # spread across rows. The rows path must fold them exactly like the + # AddressList path: comma-split, trimmed, lower-cased, de-duped. + rows = [ + {"Walls": "Solid Brick, cavity"}, + {"Walls": "SOLID BRICK"}, + ] + wall_types = _StubColumnClassifier({}) + + # act + _orchestrator( + [_column("wall_type", "Walls", wall_types)] + ).classify_from_rows(rows) + + # assert: the classifier saw one normalised entry per distinct variant. + assert wall_types.received == {"solid brick", "cavity"} + + +def test_classify_from_rows_yields_empty_mapping_for_an_absent_column() -> None: + # arrange: a column is registered for a header the rows lack. + rows = [{"Walls": "cavity"}] + property_types = _StubColumnClassifier({}) + + # act + result = _orchestrator( + [_column("property_type", "Property Type", property_types)] + ).classify_from_rows(rows) + + # assert: the absent column classified an empty description set. + assert result == {"property_type": {}} + assert property_types.received == set() + + +def test_persist_routes_each_columns_mapping_to_its_own_repo() -> None: + # arrange: a finished ``classified`` mapping (as classify_* would return) + # and two columns with distinct repos. + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + columns: list[ClassifiableColumn[Any]] = [ + _column("property_type", "Property Type", _StubColumnClassifier({}), property_type_repo), + _column("wall_type", "Walls", _StubColumnClassifier({}), wall_type_repo), + ] + classified: dict[str, dict[str, Enum]] = { + "property_type": {"semi-detached": PropertyType.HOUSE}, + "wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}, + } + + # act + _orchestrator(columns).persist(classified, portfolio_id=42) + + # assert: each repo received exactly its own column's mapping. + assert property_type_repo.calls == [(42, {"semi-detached": PropertyType.HOUSE})] + assert wall_type_repo.calls == [ + (42, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}) + ] + + +def test_persist_skips_empty_and_missing_mappings() -> None: + # arrange: ``property_type`` has an empty mapping; ``wall_type`` is absent + # from ``classified`` entirely. Neither should hit the DB -- and the + # missing key must not raise (``persist`` reads with ``.get``). + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + columns: list[ClassifiableColumn[Any]] = [ + _column("property_type", "Property Type", _StubColumnClassifier({}), property_type_repo), + _column("wall_type", "Walls", _StubColumnClassifier({}), wall_type_repo), + ] + classified: dict[str, dict[str, Enum]] = {"property_type": {}} + + # act + _orchestrator(columns).persist(classified, portfolio_id=7) + + # assert: no upserts at all. + assert property_type_repo.calls == [] + assert wall_type_repo.calls == [] + + +def test_classify_and_persist_from_rows_composes_classify_then_persist() -> None: + # arrange: the one-shot rows path must classify AND route to repos, so the + # convenience composition stays equivalent to calling the two in sequence. + rows = [{"Property Type": "semi-detached", "Walls": "solid brick"}] + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + columns: list[ClassifiableColumn[Any]] = [ + _column( + "property_type", + "Property Type", + _StubColumnClassifier({"semi-detached": PropertyType.HOUSE}), + property_type_repo, + ), + _column( + "wall_type", + "Walls", + _StubColumnClassifier( + {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED} + ), + wall_type_repo, + ), + ] + + # act + result = _orchestrator(columns).classify_and_persist_from_rows(rows, portfolio_id=99) + + # assert: same return shape as classify_from_rows, and each repo wrote once. + assert result == { + "property_type": {"semi-detached": PropertyType.HOUSE}, + "wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}, + } + assert property_type_repo.calls == [(99, {"semi-detached": PropertyType.HOUSE})] + assert wall_type_repo.calls == [ + (99, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}) + ] diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index a718ffbc..9ad56094 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py +++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -13,13 +13,13 @@ from sqlalchemy import Engine from sqlmodel import Session from infrastructure.address2uprn_queue_client import Address2UprnQueueClient -from infrastructure.csv_s3_client import CsvS3Client +from infrastructure.s3.csv_s3_client import CsvS3Client from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator from orchestration.task_orchestrator import TaskOrchestrator from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository from repositories.tasks.task_postgres_repository import TaskPostgresRepository -from repositories.user_address.user_address_csv_s3_repository import ( - UserAddressCsvS3Repository, +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) BUCKET = "splitter-bucket" @@ -27,7 +27,9 @@ REGION = "us-east-1" def _make_boto_client(service_name: str) -> Any: - factory: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + factory: Any = ( + boto3.client + ) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] return factory(service_name, region_name=REGION) @@ -62,7 +64,7 @@ class Harness: csv_client: CsvS3Client boto_sqs: Any queue_url: str - repo: UserAddressCsvS3Repository + repo: UnstandardisedAddressListCsvS3Repository @pytest.fixture @@ -76,7 +78,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: queue_url = cast(str, queue["QueueUrl"]) csv_client = CsvS3Client(boto_s3, BUCKET) - repo = UserAddressCsvS3Repository(csv_client, BUCKET) + repo = UnstandardisedAddressListCsvS3Repository(csv_client, BUCKET) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) # DB: ephemeral PostgreSQL TaskOrchestrator @@ -89,7 +91,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - user_address_repo=repo, + unstandardised_address_repo=repo, queue_client=queue_client, max_batch_size=3, ) @@ -169,10 +171,8 @@ def test_split_and_dispatch_creates_three_children_for_fixture( harness: Harness, ) -> None: # arrange - parent_task, parent_subtask = ( - harness.task_orchestrator.create_task_with_subtask( - task_source="manual:postcode-splitter-int" - ) + parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" ) input_uri = _upload_fixture_csv(harness.csv_client) @@ -197,10 +197,8 @@ def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri( harness: Harness, ) -> None: # arrange - parent_task, parent_subtask = ( - harness.task_orchestrator.create_task_with_subtask( - task_source="manual:postcode-splitter-int" - ) + parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" ) input_uri = _upload_fixture_csv(harness.csv_client) @@ -230,10 +228,8 @@ def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids( harness: Harness, ) -> None: # arrange - parent_task, parent_subtask = ( - harness.task_orchestrator.create_task_with_subtask( - task_source="manual:postcode-splitter-int" - ) + parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" ) input_uri = _upload_fixture_csv(harness.csv_client) @@ -267,10 +263,8 @@ def test_split_and_dispatch_returns_child_ids_in_dispatch_order( harness: Harness, ) -> None: # arrange - parent_task, parent_subtask = ( - harness.task_orchestrator.create_task_with_subtask( - task_source="manual:postcode-splitter-int" - ) + parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" ) input_uri = _upload_fixture_csv(harness.csv_client) diff --git a/tests/repositories/landlord_overrides/__init__.py b/tests/repositories/landlord_overrides/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/repositories/landlord_overrides/postgres/__init__.py b/tests/repositories/landlord_overrides/postgres/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/repositories/landlord_overrides/postgres/test_landlord_overrides_postgres_repository.py b/tests/repositories/landlord_overrides/postgres/test_landlord_overrides_postgres_repository.py new file mode 100644 index 00000000..1ce4e997 --- /dev/null +++ b/tests/repositories/landlord_overrides/postgres/test_landlord_overrides_postgres_repository.py @@ -0,0 +1,220 @@ +"""Integration tests for the consolidated landlord-overrides upsert adapter. + +``LandlordOverridesRepository`` serves every ``landlord__overrides`` +table; the table is selected by the ``…Row`` class passed at construction. The +source-aware conflict policy lives entirely in SQL (``INSERT ... ON CONFLICT +... DO UPDATE ... WHERE existing.source = 'classifier'``), so the only way to +verify it correctly distinguishes ``EXCLUDED.source`` from the qualified +``.source`` is against a real Postgres -- the ``db_engine`` fixture in +``tests/conftest.py`` spins one up per test. + +Each scenario is parametrised across two distinct categories (property type and +wall type). Running the shared body against two different ``…Row`` classes +proves the adapter writes to whichever table it was handed -- not just that one +table happens to work. +""" + +from __future__ import annotations + +from collections.abc import Iterator +from dataclasses import dataclass +from enum import Enum +from typing import cast + +import pytest +from sqlalchemy import Engine, Table +from sqlmodel import Session, SQLModel, select + +from domain.epc.property_type import PropertyType +from domain.epc.wall_type import WallType +from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import ( + LandlordOverridesRepository, +) +from infrastructure.postgres.landlord_override_enums import OverrideSource +from infrastructure.postgres.landlord_property_type_override_table import ( + LandlordPropertyTypeOverrideRow, +) +from infrastructure.postgres.landlord_wall_type_override_table import ( + LandlordWallTypeOverrideRow, +) + + +@dataclass(frozen=True) +class Category: + """One ``landlord__overrides`` table plus a few of its enum values. + + ``first``/``second`` are two distinct classifier values for re-upsert + tests; ``user_correction`` stands in for a value only a user would pick; + ``unknown`` is the category's ``UNKNOWN`` member. + """ + + id: str + row_type: type[SQLModel] + first: Enum + second: Enum + user_correction: Enum + unknown: Enum + + +CATEGORIES = [ + Category( + id="property_type", + row_type=LandlordPropertyTypeOverrideRow, + first=PropertyType.FLAT, + second=PropertyType.HOUSE, + user_correction=PropertyType.BUNGALOW, + unknown=PropertyType.UNKNOWN, + ), + Category( + id="wall_type", + row_type=LandlordWallTypeOverrideRow, + first=WallType.CAVITY_FILLED, + second=WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED, + user_correction=WallType.SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED, + unknown=WallType.UNKNOWN, + ), +] + + +@pytest.fixture +def session(db_engine: Engine) -> Iterator[Session]: + with Session(db_engine) as s: + yield s + + +@pytest.fixture(params=CATEGORIES, ids=lambda c: c.id) +def category(request: pytest.FixtureRequest) -> Category: + return request.param + + +def _select_row( + session: Session, row_type: type[SQLModel], portfolio_id: int, description: str +) -> SQLModel: + # The row class is only known as ``type[SQLModel]`` here, so its column + # descriptors aren't typed; filter via the Core ``Table`` columns instead. + table = cast(Table, getattr(row_type, "__table__")) + rows = session.exec( + select(row_type).where( + table.c.portfolio_id == portfolio_id, + table.c.description == description, + ) + ).all() + assert len(rows) == 1, f"expected exactly one row, got {len(rows)}" + return rows[0] + + +def test_inserts_a_fresh_row_with_source_classifier( + session: Session, category: Category +) -> None: + # arrange + repo = LandlordOverridesRepository[Enum](session, category.row_type) + + # act + repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.first}) + session.commit() + + # assert + row = _select_row(session, category.row_type, portfolio_id=1, description="cosy") + assert getattr(row, "value") is category.first + assert getattr(row, "source") == OverrideSource.CLASSIFIER + + +def test_reupsert_overwrites_a_classifier_row( + session: Session, category: Category +) -> None: + # arrange: a stale classifier row exists. + repo = LandlordOverridesRepository[Enum](session, category.row_type) + repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.first}) + session.commit() + + # act: re-classify with a different value. + repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.second}) + session.commit() + + # assert: the new classification wins. + row = _select_row(session, category.row_type, portfolio_id=1, description="cosy") + assert getattr(row, "value") is category.second + assert getattr(row, "source") == OverrideSource.CLASSIFIER + + +def test_reupsert_does_not_overwrite_a_user_row( + session: Session, category: Category +) -> None: + # arrange: a user has corrected the row. The classifier path never produces + # ``source = 'user'``; we install the row directly to mimic the override + # frontend. + user_row = category.row_type( + portfolio_id=1, + description="cosy", + value=category.user_correction, + source=OverrideSource.USER, + ) + session.add(user_row) + session.commit() + + # act: the classifier re-runs and tries to reclassify the same description. + # Under the source-aware conflict policy, this must be silently skipped -- + # user edits beat classifier reruns. + repo = LandlordOverridesRepository[Enum](session, category.row_type) + repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.second}) + session.commit() + + # assert: the user row is unchanged. + row = _select_row(session, category.row_type, portfolio_id=1, description="cosy") + assert getattr(row, "value") is category.user_correction + assert getattr(row, "source") == OverrideSource.USER + + +def test_upsert_keeps_other_portfolios_descriptions_independent( + session: Session, category: Category +) -> None: + # arrange / act: the unique key is ``(portfolio_id, description)``, so the + # same description for two different portfolios must coexist as two rows. + repo = LandlordOverridesRepository[Enum](session, category.row_type) + repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.first}) + repo.upsert_all(portfolio_id=2, descriptions_to_values={"cosy": category.second}) + session.commit() + + # assert: both rows survive with their own values. + assert ( + getattr(_select_row(session, category.row_type, 1, "cosy"), "value") + is category.first + ) + assert ( + getattr(_select_row(session, category.row_type, 2, "cosy"), "value") + is category.second + ) + + +def test_upsert_persists_unknown_so_a_user_can_resolve_it_later( + session: Session, category: Category +) -> None: + # arrange / act: a description the classifier couldn't resolve still lands + # -- per ADR-0002 §5 / ADR-0003 §Decision, so a future user override can + # upgrade it to a real value. + repo = LandlordOverridesRepository[Enum](session, category.row_type) + repo.upsert_all( + portfolio_id=1, + descriptions_to_values={"unparseable nonsense": category.unknown}, + ) + session.commit() + + # assert: the row exists with value=UNKNOWN, source=classifier. + row = _select_row( + session, category.row_type, portfolio_id=1, description="unparseable nonsense" + ) + assert getattr(row, "value") is category.unknown + assert getattr(row, "source") == OverrideSource.CLASSIFIER + + +def test_upsert_all_with_empty_mapping_is_a_no_op( + session: Session, category: Category +) -> None: + # arrange / act + repo = LandlordOverridesRepository[Enum](session, category.row_type) + repo.upsert_all(portfolio_id=1, descriptions_to_values={}) + session.commit() + + # assert: nothing was inserted. + rows = session.exec(select(category.row_type)).all() + assert rows == [] diff --git a/tests/repositories/unstandardised_address/__init__.py b/tests/repositories/unstandardised_address/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/repositories/user_address/conftest.py b/tests/repositories/unstandardised_address/conftest.py similarity index 100% rename from tests/repositories/user_address/conftest.py rename to tests/repositories/unstandardised_address/conftest.py diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py similarity index 68% rename from tests/repositories/user_address/test_user_address_csv_s3_repository.py rename to tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py index 9ffb250a..f86878c3 100644 --- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py +++ b/tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py @@ -3,11 +3,11 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from domain.addresses.user_address import UserAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode -from infrastructure.csv_s3_client import CsvS3Client -from repositories.user_address.user_address_csv_s3_repository import ( - UserAddressCsvS3Repository, +from infrastructure.s3.csv_s3_client import CsvS3Client +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) from tests.infrastructure import make_boto_client @@ -15,22 +15,22 @@ BUCKET = "user-address-bucket" @pytest.fixture -def repo() -> Iterator[UserAddressCsvS3Repository]: +def repo() -> Iterator[UnstandardisedAddressListCsvS3Repository]: with mock_aws(): boto_client = make_boto_client("s3") boto_client.create_bucket(Bucket=BUCKET) csv_client = CsvS3Client(boto_client, BUCKET) - yield UserAddressCsvS3Repository(csv_client, BUCKET) + yield UnstandardisedAddressListCsvS3Repository(csv_client, BUCKET) def _upload_csv( - repo: UserAddressCsvS3Repository, rows: list[dict[str, str]], key: str + repo: UnstandardisedAddressListCsvS3Repository, rows: list[dict[str, str]], key: str ) -> str: return repo._csv_client.save_rows(rows, key) # pyright: ignore[reportPrivateUsage] def test_load_batch_parses_address_postcode_and_reference( - repo: UserAddressCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -50,13 +50,13 @@ def test_load_batch_parses_address_postcode_and_reference( # assert assert len(addresses) == 1 address = addresses[0] - assert address.user_address == "1 High Street, Flat 2, Townville" + assert address.address == "1 High Street, Flat 2, Townville" assert address.postcode == Postcode("SW1A1AA") - assert address.internal_reference == "REF-001" + assert address.org_reference == "REF-001" def test_load_batch_uses_only_address_1_when_others_missing( - repo: UserAddressCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -75,13 +75,13 @@ def test_load_batch_uses_only_address_1_when_others_missing( # assert assert len(addresses) == 1 - assert addresses[0].user_address == "10 Cardiff Road" + assert addresses[0].address == "10 Cardiff Road" assert addresses[0].postcode == Postcode("CF101AA") - assert addresses[0].internal_reference == "REF-002" + assert addresses[0].org_reference == "REF-002" def test_load_batch_handles_missing_internal_reference( - repo: UserAddressCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -100,16 +100,16 @@ def test_load_batch_handles_missing_internal_reference( # assert assert len(addresses) == 1 - assert addresses[0].user_address == "5 Park Lane" + assert addresses[0].address == "5 Park Lane" assert addresses[0].postcode == Postcode("M11AA") - assert addresses[0].internal_reference is None + assert addresses[0].org_reference is None def test_load_batch_captures_full_source_row( - repo: UserAddressCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # A raw EPC-export-shaped row: the splitter must preserve every column, - # not just the ones it parses into UserAddress fields. + # not just the ones it parses into UnstandardisedAddress fields. # arrange row = { "Asset Reference": "511", @@ -124,11 +124,11 @@ def test_load_batch_captures_full_source_row( addresses = repo.load_batch(uri) # assert - assert addresses[0].source_row == row + assert addresses[0].additional_info == row def test_load_batch_raises_when_postcode_column_absent( - repo: UserAddressCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}] @@ -140,7 +140,7 @@ def test_load_batch_raises_when_postcode_column_absent( def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( - repo: UserAddressCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange row = { @@ -154,7 +154,9 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( # act saved_uri = repo.save_batch(addresses, "tasks/passthrough") - saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage] + saved_rows = repo._csv_client.read_rows( + saved_uri + ) # pyright: ignore[reportPrivateUsage] # assert assert len(saved_rows) == 1 @@ -167,16 +169,21 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( def test_save_batch_returns_uri_under_path_prefix( - repo: UserAddressCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange - addresses = [ - UserAddress( - user_address="1 High Street", - postcode=Postcode("SW1A 1AA"), - source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"}, - ), - ] + addresses = AddressList( + [ + UnstandardisedAddress( + address="1 High Street", + postcode=Postcode("SW1A 1AA"), + additional_info={ + "Address 1": "1 High Street", + "postcode": "SW1A 1AA", + }, + ), + ] + ) # act uri = repo.save_batch(addresses, "tasks/abc/batches") @@ -187,7 +194,7 @@ def test_save_batch_returns_uri_under_path_prefix( def test_save_then_reload_round_trip_preserves_columns( - repo: UserAddressCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -207,7 +214,9 @@ def test_save_then_reload_round_trip_preserves_columns( # act saved_uri = repo.save_batch(addresses, "tasks/round-trip") - saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage] + saved_rows = repo._csv_client.read_rows( + saved_uri + ) # pyright: ignore[reportPrivateUsage] # assert # Original columns come back verbatim; postcode_clean is the only addition. @@ -218,16 +227,21 @@ def test_save_then_reload_round_trip_preserves_columns( def test_save_batch_uses_unique_filename_per_call( - repo: UserAddressCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange - addresses = [ - UserAddress( - user_address="1 High Street", - postcode=Postcode("SW1A 1AA"), - source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"}, - ), - ] + addresses = AddressList( + [ + UnstandardisedAddress( + address="1 High Street", + postcode=Postcode("SW1A 1AA"), + additional_info={ + "Address 1": "1 High Street", + "postcode": "SW1A 1AA", + }, + ), + ] + ) # act uri_1 = repo.save_batch(addresses, "tasks/uniqueness") diff --git a/tests/test_lambda_packaging.py b/tests/test_lambda_packaging.py new file mode 100644 index 00000000..39990338 --- /dev/null +++ b/tests/test_lambda_packaging.py @@ -0,0 +1,267 @@ +"""Static packaging linter for Lambda container images. + +Every Lambda here ships as a Docker image that copies a *subset* of the repo +(``COPY utils/ utils/``, ``COPY backend/ backend/``, ...) and then runs a +handler via ``CMD [".handler"]``. If the handler's import graph reaches +a top-level package the Dockerfile forgot to ``COPY``, the function dies at cold +start with ``Runtime.ImportModuleError: No module named ''`` — but only +*inside the image*. In the dev/test tree every package is present, so a plain +``import`` test can't see the gap. This is exactly how ``No module named +'domain'`` reached a deployed address2UPRN. + +The RIE smoke tests (.github/workflows/_smoke_test_lambda.yml) catch this too, +but only by building the full image (minutes) and only for hand-listed services. +This test catches the same class of bug in milliseconds, locally, for *every* +handler Dockerfile — by statically computing each handler's import-time module +graph and asserting every repo file it reaches is copied into the image. + +Scope: import-time (module-level) imports only — the ones that run at Lambda +init, which is what ImportModuleError is about. Imports inside function bodies +and under ``if TYPE_CHECKING:`` are deliberately ignored. Third-party / stdlib +imports are out of scope (that's requirements.txt's job, covered by the RIE +smoke test actually installing and importing). +""" + +from __future__ import annotations + +import ast +import json +import re +from pathlib import Path +from typing import Optional + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] + +# Dockerfiles that are not Lambda handlers (test harness, dev containers). +_SKIP_DOCKERFILES = {"Dockerfile.test", "Dockerfile.test.dockerignore"} +_SKIP_PARTS = {".git", "node_modules", ".devcontainer"} + + +def _toplevel_names() -> set[str]: + """Top-level repo packages/modules — the namespace handler imports resolve + against (imports are absolute: ``domain.x``, ``backend.y``).""" + names: set[str] = set() + for p in REPO_ROOT.iterdir(): + if p.name.startswith(".") or p.name == "__pycache__": + continue + if p.is_dir(): + names.add(p.name) + elif p.suffix == ".py": + names.add(p.stem) + return names + + +_TOP = _toplevel_names() + + +def _is_type_checking(test: ast.expr) -> bool: + if isinstance(test, ast.Name): + return test.id == "TYPE_CHECKING" + if isinstance(test, ast.Attribute): + return test.attr == "TYPE_CHECKING" + return False + + +def _import_time_imports(path: Path) -> list[str]: + """Absolute module names imported when ``path`` is imported (i.e. at Lambda + init). Descends into module-level if/try/with and class bodies, but not into + function bodies (lazy) or ``if TYPE_CHECKING:`` blocks (never executed).""" + try: + tree = ast.parse(path.read_text(encoding="utf-8"), str(path)) + except (SyntaxError, UnicodeDecodeError): + return [] + out: list[str] = [] + + def visit(stmts: list[ast.stmt]) -> None: + for node in stmts: + if isinstance(node, ast.Import): + out.extend(alias.name for alias in node.names) + elif isinstance(node, ast.ImportFrom): + if not node.level and node.module: # absolute imports only + out.append(node.module) + elif isinstance(node, ast.If): + if _is_type_checking(node.test): + continue + visit(node.body) + visit(node.orelse) + elif isinstance(node, ast.Try): + visit(node.body) + visit(node.orelse) + visit(node.finalbody) + for handler in node.handlers: + visit(handler.body) + elif isinstance(node, ast.With): + visit(node.body) + elif isinstance(node, ast.ClassDef): + visit(node.body) + # FunctionDef / AsyncFunctionDef bodies are intentionally skipped. + + visit(tree.body) + return out + + +def _module_to_file(module: str) -> Optional[Path]: + """Resolve a dotted module to its repo source file (``foo.bar`` -> + ``foo/bar.py`` or ``foo/bar/__init__.py``).""" + base = REPO_ROOT.joinpath(*module.split(".")) + py = base.with_suffix(".py") + if py.is_file(): + return py + init = base / "__init__.py" + if init.is_file(): + return init + return None + + +def _import_closure(start: Path) -> dict[Path, Optional[Path]]: + """Repo files reachable from ``start`` via import-time imports, mapped to the + first file that imported each (for blame in failure messages).""" + reached: dict[Path, Optional[Path]] = {} + stack: list[tuple[Path, Optional[Path]]] = [(start, None)] + while stack: + path, importer = stack.pop() + if path in reached: + continue + reached[path] = importer + for module in _import_time_imports(path): + if module.split(".")[0] not in _TOP: + continue # stdlib / third-party — not our concern here + target = _module_to_file(module) + if target is not None and target not in reached: + stack.append((target, path)) + return reached + + +def _norm(path_token: str) -> str: + return path_token.lstrip("./").rstrip("/") + + +def _parse_handler_spec(dockerfile_text: str) -> Optional[str]: + """The ``.handler`` string from the ``CMD`` line, or None if this + isn't a Lambda handler image.""" + match = re.search(r"^CMD\s+(\[.*\]|.+)$", dockerfile_text, re.MULTILINE) + if not match: + return None + raw = match.group(1).strip() + try: + parsed = json.loads(raw) + spec = parsed[0] if isinstance(parsed, list) and parsed else raw + except json.JSONDecodeError: + spec = raw.strip('"') + return spec if isinstance(spec, str) and spec.endswith(".handler") else None + + +def _parse_copies(dockerfile_text: str) -> list[tuple[list[str], str]]: + """``COPY`` instructions as (sources, dest), dropping ``--flag`` tokens.""" + copies: list[tuple[list[str], str]] = [] + for match in re.finditer(r"^COPY\s+(.+)$", dockerfile_text, re.MULTILINE): + tokens = [t for t in match.group(1).split() if not t.startswith("--")] + if len(tokens) < 2: + continue + *sources, dest = tokens + copies.append((sources, dest)) + return copies + + +def _resolve_handler_file( + spec: str, copies: list[tuple[list[str], str]] +) -> Optional[Path]: + """Map a handler spec to its repo source file. + + Handles both in-place layouts (``backend.foo.handler`` -> ``backend/foo/ + handler.py``, present via ``COPY backend/ backend/``) and root-placed + handlers (``main.handler`` where a ``COPY /var/task/main.py`` or + ``COPY /main.py .`` puts the file at the image root).""" + module_path, _func = spec.rsplit(".", 1) + + direct = REPO_ROOT / (module_path.replace(".", "/") + ".py") + if direct.is_file(): + return direct + + # Root-placed module: find the COPY whose destination basename matches. + wanted = module_path.split("/")[-1] + ".py" + for sources, dest in copies: + dest_norm = _norm(dest) + dest_is_named_file = Path(dest_norm).name == wanted + dest_is_dir = dest_norm in ("", module_path.split("/")[-1]) or dest.endswith("/") + for src in sources: + src_path = REPO_ROOT / _norm(src) + if not src_path.is_file(): + continue + if dest_is_named_file or (dest_is_dir and src_path.name == wanted): + return src_path + return None + + +def _is_copied(rel_path: str, copies: list[tuple[list[str], str]]) -> bool: + """Whether a repo-relative file path lands in the image via some COPY.""" + rel_path = _norm(rel_path) + for sources, _dest in copies: + for src in sources: + src_norm = _norm(src) + if src_norm == "" or src_norm == rel_path or rel_path.startswith(src_norm + "/"): + return True + return False + + +def _discover_handler_dockerfiles() -> list[Path]: + found: list[Path] = [] + for path in REPO_ROOT.rglob("*Dockerfile*"): + if path.name in _SKIP_DOCKERFILES: + continue + if any(part in _SKIP_PARTS for part in path.relative_to(REPO_ROOT).parts): + continue + try: + text = path.read_text(encoding="utf-8") + except OSError: + continue + if _parse_handler_spec(text): + found.append(path) + return sorted(found) + + +_HANDLER_DOCKERFILES = _discover_handler_dockerfiles() + + +def test_handler_dockerfiles_discovered() -> None: + """Guard against the discovery silently finding nothing (e.g. a refactor + that renames Dockerfiles), which would make every check below vacuous.""" + assert _HANDLER_DOCKERFILES, "no Lambda handler Dockerfiles found under repo root" + + +@pytest.mark.parametrize( + "dockerfile", + _HANDLER_DOCKERFILES, + ids=[str(p.relative_to(REPO_ROOT)) for p in _HANDLER_DOCKERFILES], +) +def test_lambda_image_copies_full_import_closure(dockerfile: Path) -> None: + """Every repo file the handler imports at init must be COPYed into the image.""" + text = dockerfile.read_text(encoding="utf-8") + spec = _parse_handler_spec(text) + assert spec is not None # discovery guaranteed this + copies = _parse_copies(text) + + handler_file = _resolve_handler_file(spec, copies) + assert handler_file is not None, ( + f"{dockerfile.relative_to(REPO_ROOT)}: could not locate the source file " + f"for CMD handler {spec!r}. Update _resolve_handler_file if this is a new " + f"handler layout." + ) + + missing: list[str] = [] + for reached, importer in _import_closure(handler_file).items(): + rel = str(reached.relative_to(REPO_ROOT)) + if not _is_copied(rel, copies): + blame = ( + str(importer.relative_to(REPO_ROOT)) if importer else "(handler entrypoint)" + ) + missing.append(f" - {rel}\n imported by {blame}") + + assert not missing, ( + f"{dockerfile.relative_to(REPO_ROOT)} runs `{spec}` but does not COPY " + f"{len(missing)} file(s) it imports at init. The Lambda will fail at cold " + f"start with Runtime.ImportModuleError. Add the missing top-level " + f"package(s) as `COPY / /`:\n" + "\n".join(sorted(missing)) + )