mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Merge branch 'main' of https://github.com/Hestia-Homes/Model into feature/per-cert-mapper-validation
This commit is contained in:
commit
69995edec8
97 changed files with 3494 additions and 351 deletions
|
|
@ -27,3 +27,4 @@ pytest-postgresql
|
|||
# Formatting
|
||||
black==26.1.0
|
||||
boto3-stubs
|
||||
openai
|
||||
|
|
|
|||
5
.github/workflows/_deploy_lambda.yml
vendored
5
.github/workflows/_deploy_lambda.yml
vendored
|
|
@ -92,6 +92,9 @@ on:
|
|||
|
||||
TF_VAR_magicplan_api_key:
|
||||
required: false
|
||||
|
||||
TF_VAR_openai_api_key:
|
||||
required: false
|
||||
jobs:
|
||||
deploy:
|
||||
runs-on: ubuntu-latest
|
||||
|
|
@ -163,6 +166,7 @@ jobs:
|
|||
TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }}
|
||||
TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }}
|
||||
TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }}
|
||||
TF_VAR_openai_api_key: ${{ secrets.TF_VAR_openai_api_key }}
|
||||
run: |
|
||||
ECR_REPO_URL_VAR=""
|
||||
if [[ -n "${{ inputs.ecr_repo }}" ]]; then
|
||||
|
|
@ -213,6 +217,7 @@ jobs:
|
|||
TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }}
|
||||
TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }}
|
||||
TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }}
|
||||
TF_VAR_openai_api_key: ${{ secrets.TF_VAR_openai_api_key }}
|
||||
run: |
|
||||
EXTRA_VARS=""
|
||||
if [[ -n "${{ inputs.ecr_repo }}" ]]; then
|
||||
|
|
|
|||
32
.github/workflows/ddd_tests.yml
vendored
Normal file
32
.github/workflows/ddd_tests.yml
vendored
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
name: Run DDD tests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- "**"
|
||||
|
||||
# The DDD rewrite (tests/) defines SQLModel table classes that map to the same
|
||||
# physical tables as the legacy backend models. Both sets share the one global
|
||||
# SQLModel.metadata, so they cannot be imported into the same pytest process —
|
||||
# hence this runs as a separate workflow from unit_tests.yml until the legacy
|
||||
# models are retired. It also covers the Lambda packaging linter
|
||||
# (tests/test_lambda_packaging.py). Its DB is spawned in-process by
|
||||
# pytest-postgresql, so no `services: postgres` and no env secrets are required,
|
||||
# and it runs independently of (and in parallel with) the legacy unit tests.
|
||||
jobs:
|
||||
ddd-tests:
|
||||
name: DDD tests (Docker)
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build test image
|
||||
run: docker build -f Dockerfile.test -t model-test .
|
||||
|
||||
- name: Run DDD tests
|
||||
run: |
|
||||
docker run --rm \
|
||||
--network host \
|
||||
model-test pytest -vv tests/
|
||||
41
.github/workflows/deploy_terraform.yml
vendored
41
.github/workflows/deploy_terraform.yml
vendored
|
|
@ -203,6 +203,47 @@ jobs:
|
|||
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
|
||||
|
||||
# ============================================================
|
||||
# Build Landlord Description Overrides image and Push
|
||||
# ============================================================
|
||||
landlordDescriptionOverrides_image:
|
||||
needs: [determine_stage, shared_terraform]
|
||||
uses: ./.github/workflows/_build_image.yml
|
||||
with:
|
||||
ecr_repo: landlord_description_overrides-${{ needs.determine_stage.outputs.stage }}
|
||||
dockerfile_path: applications/landlord_description_overrides/Dockerfile
|
||||
build_context: .
|
||||
build_args: |
|
||||
DEV_DB_HOST=$DEV_DB_HOST
|
||||
DEV_DB_PORT=$DEV_DB_PORT
|
||||
DEV_DB_NAME=$DEV_DB_NAME
|
||||
secrets:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
|
||||
DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }}
|
||||
DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }}
|
||||
DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
|
||||
|
||||
# ============================================================
|
||||
# Deploy Landlord Description Overrides Lambda
|
||||
# ============================================================
|
||||
landlordDescriptionOverrides_lambda:
|
||||
needs: [landlordDescriptionOverrides_image, determine_stage]
|
||||
uses: ./.github/workflows/_deploy_lambda.yml
|
||||
with:
|
||||
lambda_name: landlordDescriptionOverrides
|
||||
lambda_path: deployment/terraform/lambda/landlordDescriptionOverrides
|
||||
stage: ${{ needs.determine_stage.outputs.stage }}
|
||||
ecr_repo: landlord_description_overrides-${{ needs.determine_stage.outputs.stage }}
|
||||
image_digest: ${{ needs.landlordDescriptionOverrides_image.outputs.image_digest }}
|
||||
terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
|
||||
secrets:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
|
||||
TF_VAR_openai_api_key: ${{ secrets.DEV_OPENAI_API_KEY }}
|
||||
|
||||
# ============================================================
|
||||
# Build Bulk Address2UPRN Combiner image and Push
|
||||
# ============================================================
|
||||
|
|
|
|||
10
.github/workflows/lambda_smoke_tests.yml
vendored
10
.github/workflows/lambda_smoke_tests.yml
vendored
|
|
@ -43,6 +43,16 @@ jobs:
|
|||
build_context: .
|
||||
service_name: postcode-splitter-ddd
|
||||
|
||||
# ============================================================
|
||||
# Landlord Description Overrides
|
||||
# ============================================================
|
||||
landlord_description_overrides_smoke_test:
|
||||
uses: ./.github/workflows/_smoke_test_lambda.yml
|
||||
with:
|
||||
dockerfile_path: applications/landlord_description_overrides/Dockerfile
|
||||
build_context: .
|
||||
service_name: landlord-description-overrides
|
||||
|
||||
# ============================================================
|
||||
# Bulk Address2UPRN Combiner
|
||||
# ============================================================
|
||||
|
|
|
|||
15
.github/workflows/unit_tests.yml
vendored
15
.github/workflows/unit_tests.yml
vendored
|
|
@ -61,14 +61,7 @@ jobs:
|
|||
-e DB_PORT=5432 \
|
||||
model-test pytest -vv -m 'not integration'
|
||||
|
||||
# The DDD rewrite (tests/) defines SQLModel table classes that map to the
|
||||
# same physical tables as the legacy backend models. Both sets share the
|
||||
# one global SQLModel.metadata, so they cannot be imported into the same
|
||||
# pytest process. It runs as a separate invocation until the legacy
|
||||
# models are retired. Its DB is spawned in-process by pytest-postgresql,
|
||||
# so no DB service or env is required.
|
||||
- name: Run DDD tests
|
||||
run: |
|
||||
docker run --rm \
|
||||
--network host \
|
||||
model-test pytest -vv tests/
|
||||
# The DDD rewrite (tests/) runs in its own workflow (ddd_tests.yml): its
|
||||
# SQLModel table classes map to the same physical tables as the legacy
|
||||
# backend models and share the one global SQLModel.metadata, so the two
|
||||
# suites cannot be imported into the same pytest process.
|
||||
|
|
|
|||
|
|
@ -1,7 +1,84 @@
|
|||
# Ubiquitous Language
|
||||
|
||||
This file has been **superseded by [CONTEXT.md](./CONTEXT.md)**.
|
||||
Domain terminology glossary for this project. Generated and maintained by the `/ubiquitous-language` Claude Code skill.
|
||||
|
||||
The project's domain glossary now lives at the repo root in `CONTEXT.md`, maintained by the `/grill-with-docs` skill (which replaced `/ubiquitous-language`).
|
||||
Invoke `/ubiquitous-language` in any session to extract new terms from the conversation, flag ambiguities, and update this file with canonical definitions.
|
||||
|
||||
If you arrived here from a link in `CLAUDE.md` or older docs, follow the link above. This file is kept only to preserve git history and may be removed once internal references are updated.
|
||||
---
|
||||
|
||||
## Energy Performance Certificates
|
||||
|
||||
| Term | Definition | Aliases to avoid |
|
||||
|------|------------|------------------|
|
||||
| **EPC** | An Energy Performance Certificate — a government-issued document rating a dwelling's energy efficiency from A (best) to G (worst). | "energy certificate", "energy report" |
|
||||
| **Certificate Number** | The unique identifier assigned to an EPC by the government registry. | "cert number", "EPC ID" |
|
||||
| **Registration Date** | The date an EPC was lodged with the government register; used to identify the most recent certificate for a property. | "assessment date", "submission date" |
|
||||
| **EPC Band** | A single letter A–G representing a property's current or potential energy efficiency rating. | "energy rating", "EPC grade", "EPC score" |
|
||||
| **Schema Type** | The versioned RdSAP or SAP schema that describes the structure of a certificate's raw data (e.g. `RdSAP-Schema-21.0.1`). | "schema version", "EPC format" |
|
||||
| **Domestic Certificate** | An EPC issued for a residential dwelling, as opposed to a commercial one. | "residential EPC", "home EPC" |
|
||||
|
||||
## Properties and Addresses
|
||||
|
||||
| Term | Definition | Aliases to avoid |
|
||||
|------|------------|------------------|
|
||||
| **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" |
|
||||
| **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" |
|
||||
| **Unstandardised Address** | A frozen dataclass (`domain.addresses.unstandardised_address.UnstandardisedAddress`) capturing a single address exactly as a customer supplied it, before any standardisation: a free-text `address` line (intentionally NOT normalised), a canonical `postcode` (a `Postcode` value object, sanitised on construction), an optional `org_reference` (the customer's own identifier for the property), and `additional_info` (the full source row — every column of the customer's upload, preserved verbatim). | "user address", "asset list", "raw address", "landlord address", "Hyde address" |
|
||||
| **Address List** | A nominal `NewType` over `list[UnstandardisedAddress]` (`domain.addresses.unstandardised_address.AddressList`) — a batch of unstandardised addresses, such as one customer's bulk-onboarding upload or a postcode-grouped sub-batch produced for downstream processing. Being nominal, it is constructed explicitly: `AddressList([...])`. It is the raw *input* to ingestion; the standardised *output* is a **Standardised Asset List**. | "asset list", "Hyde address list", "user addresses" |
|
||||
| **Standardised Asset List (SAL)** | A customer's property portfolio after ingestion has cleaned and standardised it — each property carrying a canonical field set (UPRN, standardised address, postcode, property type, built form, …). It is the standardised *output* of the pipeline whose raw *input* is an **Address List** of **Unstandardised Addresses**; generated by the `SALOrchestrator`. (Legacy implementation: `asset_list.AssetList` via `load_standardised_asset_list`.) | "address list" (that is the raw input), "asset register", "portfolio list" |
|
||||
| **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" |
|
||||
|
||||
## Address Matching
|
||||
|
||||
| Term | Definition | Aliases to avoid |
|
||||
|------|------------|------------------|
|
||||
| **Lexiscore** | A similarity score in [0, 1] between an unstandardised address and a candidate EPC address; combines token overlap and character-level similarity. | "score", "match score", "similarity" |
|
||||
| **Lexirank** | Dense rank of candidates sorted by lexiscore descending; rank 1 = best match. | "rank", "position" |
|
||||
| **UPRN Candidate** | An EPC search result that is a plausible match for a given unstandardised address, before scoring decides the winner. | "match candidate", "result" |
|
||||
| **Score Threshold** | The minimum lexiscore (currently 0.6) below which no match is returned even if a candidate exists. | "minimum score", "cutoff" |
|
||||
| **Ambiguous Match** | A matching outcome where two or more candidates share lexirank 1, making it impossible to select a unique winner. | "tie", "draw", "duplicate" |
|
||||
| **Best Match** | The single UPRN candidate with lexirank 1 that meets or exceeds the score threshold. | "winner", "top result" |
|
||||
|
||||
## API and Integration
|
||||
|
||||
| Term | Definition | Aliases to avoid |
|
||||
|------|------------|------------------|
|
||||
| **EPC Search Result** | A lightweight record returned by the government domestic search endpoint — contains address lines, postcode, UPRN, band, and certificate number but not the full certificate data. | "search row", "EPC row", "result" |
|
||||
| **EPC Property Data** | The fully mapped domain object produced after fetching and parsing a complete EPC certificate. | "EPC data", "certificate data", "parsed EPC" |
|
||||
| **Old EPC API** | The retired government API (`epc.opendatacommunities.org`) using HTTP Basic auth; decommissioned May 2026. | "legacy API" |
|
||||
| **New EPC API** | The replacement government API (`api.get-energy-performance-data.communities.gov.uk`) using Bearer token auth. | "new API", "current API" |
|
||||
| **Bearer Token** | The auth credential required by the new EPC API; stored in the `EPC_AUTH_TOKEN` environment variable. | "API key", "auth token", "secret" |
|
||||
|
||||
## Relationships
|
||||
|
||||
- An **EPC** belongs to exactly one **Dwelling** and has one **Certificate Number**.
|
||||
- A **Dwelling** may have multiple **EPCs** across time; the one with the most recent **Registration Date** is the current one.
|
||||
- A **UPRN** identifies a **Dwelling** permanently; it does not change when the property changes owner.
|
||||
- An **EPC Search Result** is a summary; it points to a full **EPC** via its **Certificate Number**.
|
||||
- An **Address List** is an ordered batch of **Unstandardised Addresses**; a customer's bulk-onboarding upload arrives as one.
|
||||
- Ingestion turns an **Address List** (raw input) into a **Standardised Asset List** (standardised output) — the **SAL Orchestrator** drives this.
|
||||
- **Address Matching** uses an **Unstandardised Address** and **Postcode** to find a **UPRN** by scoring **UPRN Candidates** from an EPC search.
|
||||
- A **Lexirank** of 1 with no **Ambiguous Match** and a **Lexiscore** ≥ the **Score Threshold** produces a **Best Match**.
|
||||
|
||||
## Example dialogue
|
||||
|
||||
> **Dev:** "We have an unstandardised address and postcode. How do we find the UPRN?"
|
||||
|
||||
> **Domain expert:** "Search the **New EPC API** by **Postcode** — you get back a list of **EPC Search Results** for that area. Each one has an address and a **UPRN**. Score each against the **Unstandardised Address** using the **Lexiscore**. If the top **UPRN Candidate** scores above the **Score Threshold** and there's no **Ambiguous Match**, that's your **Best Match**."
|
||||
|
||||
> **Dev:** "What if two results share the same address line 1?"
|
||||
|
||||
> **Domain expert:** "That's an **Ambiguous Match** — two candidates at **Lexirank** 1. Fall back to scoring on the full address using all address lines joined together. If that still ties, return nothing."
|
||||
|
||||
> **Dev:** "Once we have the best match, do we use the UPRN or fetch the full EPC?"
|
||||
|
||||
> **Domain expert:** "Depends on what you need. The **EPC Search Result** gives you the **EPC Band** and **Certificate Number**. If you need energy efficiency detail, use the **Certificate Number** to fetch the full **EPC Property Data**."
|
||||
|
||||
## Flagged ambiguities
|
||||
|
||||
- **"address"** appears in several senses: the **Unstandardised Address** dataclass (one customer-supplied address before standardisation), its free-text `address` field, and the normalised address lines on an **EPC Search Result**. Always qualify: "unstandardised address" vs "EPC address" or "address line 1". Within `domain/addresses/`, the dataclass is **Unstandardised Address**; in upstream ingestion contexts (CSV columns, SQS payloads) "address" may still mean the bare free-text string.
|
||||
- **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments.
|
||||
- **"user_inputed_address"** (and `user_address`) in `backend/address2UPRN/` is legacy naming — a misspelled synonym for what is now the **Unstandardised Address**. That address-matching code has not been renamed; new code should use **Unstandardised Address**.
|
||||
- **"Hyde address list"** — "Hyde" is the name of one customer, not a domain concept. A domain expert may say "the Hyde address list" because Hyde is the customer in front of them, but the generalised term is **Address List** (and **Unstandardised Address** for a single item). A customer's identity is data — it belongs in `org_reference` or `additional_info`, never in a type or module name.
|
||||
- **"address list"** vs **"asset list"** — opposite ends of the ingestion pipeline; do not conflate them. An **Address List** is the raw *input* (unstandardised addresses as the customer supplied them); a **Standardised Asset List** is the standardised *output*. The historical `AssetList` dataclass (now **Unstandardised Address**) misnamed the input an "asset list" — that mistake is what the rename corrected.
|
||||
- **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter.
|
||||
|
|
|
|||
|
|
@ -25,6 +25,9 @@ COPY infrastructure/ infrastructure/
|
|||
COPY orchestration/ orchestration/
|
||||
COPY repositories/ repositories/
|
||||
COPY utilities/ utilities/
|
||||
# domain.property.site_notes -> datatypes.epc.domain.epc_property_data; without
|
||||
# this the lambda fails at init with "No module named 'datatypes'".
|
||||
COPY datatypes/ datatypes/
|
||||
COPY applications/ applications/
|
||||
|
||||
# Place the handler at the Lambda task root so the runtime can resolve
|
||||
|
|
|
|||
34
applications/landlord_description_overrides/Dockerfile
Normal file
34
applications/landlord_description_overrides/Dockerfile
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
FROM public.ecr.aws/lambda/python:3.11
|
||||
|
||||
# Postgres host/port/database are baked into the image at build time from
|
||||
# the deploy workflow's --build-arg values (GitHub Actions DEV_DB_* secrets),
|
||||
# mirroring backend/postcode_splitter/handler/Dockerfile. They map onto the
|
||||
# POSTGRES_* names PostgresConfig.from_env reads. Username/password are NOT
|
||||
# baked in -- Terraform injects those as Lambda env vars from Secrets Manager.
|
||||
ARG DEV_DB_HOST
|
||||
ARG DEV_DB_PORT
|
||||
ARG DEV_DB_NAME
|
||||
|
||||
ENV POSTGRES_HOST=${DEV_DB_HOST}
|
||||
ENV POSTGRES_PORT=${DEV_DB_PORT}
|
||||
ENV POSTGRES_DATABASE=${DEV_DB_NAME}
|
||||
|
||||
WORKDIR /var/task
|
||||
|
||||
COPY applications/landlord_description_overrides/requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy the layered source the handler imports from. The new splitter pulls
|
||||
# only DDD-shaped packages — no pandas, no legacy backend/.
|
||||
COPY domain/ domain/
|
||||
COPY infrastructure/ infrastructure/
|
||||
COPY orchestration/ orchestration/
|
||||
COPY repositories/ repositories/
|
||||
COPY utilities/ utilities/
|
||||
COPY applications/ applications/
|
||||
|
||||
# Place the handler at the Lambda task root so the runtime can resolve
|
||||
# ``main.handler`` without an extra package prefix.
|
||||
COPY applications/landlord_description_overrides/handler.py /var/task/main.py
|
||||
|
||||
CMD ["main.handler"]
|
||||
169
applications/landlord_description_overrides/handler.py
Normal file
169
applications/landlord_description_overrides/handler.py
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import boto3
|
||||
|
||||
from applications.landlord_description_overrides.landlord_description_overrides_trigger_body import (
|
||||
LandlordDescriptionOverridesTriggerBody,
|
||||
)
|
||||
from domain.epc.built_form_type import BuiltFormType
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.roof_type import RoofType
|
||||
from domain.epc.wall_type import WallType
|
||||
from domain.epc.wall_type_construction_dates import (
|
||||
wall_type_construction_date_prompt_hint,
|
||||
)
|
||||
from infrastructure.chatgpt.chatgpt import ChatGPT
|
||||
from infrastructure.chatgpt.chatgpt_column_classifier import ChatGptColumnClassifier
|
||||
from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import (
|
||||
LandlordOverridesRepository,
|
||||
)
|
||||
from infrastructure.postgres.config import PostgresConfig
|
||||
from infrastructure.postgres.engine import commit_scope, make_engine, make_session
|
||||
from infrastructure.postgres.landlord_built_form_type_override_table import (
|
||||
LandlordBuiltFormTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_property_type_override_table import (
|
||||
LandlordPropertyTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_roof_type_override_table import (
|
||||
LandlordRoofTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_wall_type_override_table import (
|
||||
LandlordWallTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.s3.csv_s3_client import CsvS3Client
|
||||
from infrastructure.s3.s3_uri import parse_s3_uri
|
||||
from orchestration.classifiable_column import ClassifiableColumn
|
||||
from orchestration.landlord_description_overrides_orchestrator import (
|
||||
LandlordDescriptionOverridesOrchestrator,
|
||||
)
|
||||
from orchestration.task_orchestrator import TaskOrchestrator
|
||||
from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import (
|
||||
UnstandardisedAddressListCsvS3Repository,
|
||||
)
|
||||
from utilities.aws_lambda.subtask_handler import subtask_handler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _build_columns(
|
||||
column_mapping: dict[str, str], chat_gpt: ChatGPT, session: Any
|
||||
) -> list[ClassifiableColumn[Any]]:
|
||||
"""One ClassifiableColumn per mapped category.
|
||||
|
||||
``column_mapping`` is ``{category -> source CSV header}``. One header may
|
||||
feed several categories -- e.g. ``"Property Type"`` -> property_type and
|
||||
built_form_type -- which falls out naturally because each is a separate
|
||||
entry. Unknown categories are skipped.
|
||||
"""
|
||||
factories = {
|
||||
"property_type": lambda src: ClassifiableColumn(
|
||||
name="property_type",
|
||||
source_column=src,
|
||||
classifier=ChatGptColumnClassifier(
|
||||
chat_gpt, PropertyType, PropertyType.UNKNOWN
|
||||
),
|
||||
repo=LandlordOverridesRepository[PropertyType](
|
||||
session, LandlordPropertyTypeOverrideRow
|
||||
),
|
||||
),
|
||||
"built_form_type": lambda src: ClassifiableColumn(
|
||||
name="built_form_type",
|
||||
source_column=src,
|
||||
classifier=ChatGptColumnClassifier(
|
||||
chat_gpt, BuiltFormType, BuiltFormType.UNKNOWN
|
||||
),
|
||||
repo=LandlordOverridesRepository[BuiltFormType](
|
||||
session, LandlordBuiltFormTypeOverrideRow
|
||||
),
|
||||
),
|
||||
"wall_type": lambda src: ClassifiableColumn(
|
||||
name="wall_type",
|
||||
source_column=src,
|
||||
classifier=ChatGptColumnClassifier(
|
||||
chat_gpt,
|
||||
WallType,
|
||||
WallType.UNKNOWN,
|
||||
extra_instructions=wall_type_construction_date_prompt_hint(),
|
||||
),
|
||||
repo=LandlordOverridesRepository[WallType](
|
||||
session, LandlordWallTypeOverrideRow
|
||||
),
|
||||
),
|
||||
"roof_type": lambda src: ClassifiableColumn(
|
||||
name="roof_type",
|
||||
source_column=src,
|
||||
classifier=ChatGptColumnClassifier(
|
||||
chat_gpt, RoofType, RoofType.UNKNOWN
|
||||
),
|
||||
repo=LandlordOverridesRepository[RoofType](
|
||||
session, LandlordRoofTypeOverrideRow
|
||||
),
|
||||
),
|
||||
}
|
||||
|
||||
columns: list[ClassifiableColumn[Any]] = []
|
||||
for category, source_column in column_mapping.items():
|
||||
factory = factories.get(category)
|
||||
if factory is None:
|
||||
logger.warning("Unknown classifier category %r; skipping.", category)
|
||||
continue
|
||||
columns.append(factory(source_column))
|
||||
return columns
|
||||
|
||||
|
||||
@subtask_handler()
|
||||
def handler(
|
||||
body: dict[str, Any], context: Any, task_orchestrator: TaskOrchestrator
|
||||
) -> dict[str, int]:
|
||||
trigger = LandlordDescriptionOverridesTriggerBody.model_validate(body)
|
||||
|
||||
# The classifier reads a dedicated CSV of the classifier columns (raw
|
||||
# landlord headers preserved), converted from the upload by the frontend, so
|
||||
# the S3 bucket comes from the trigger URI rather than a fixed env var.
|
||||
bucket, _key = parse_s3_uri(trigger.s3_uri)
|
||||
|
||||
# boto3.client is overloaded per-service in the installed stubs; cast to Any
|
||||
# so the strict-mode checker treats it as opaque.
|
||||
boto3_client: Any = (
|
||||
boto3.client
|
||||
) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
||||
boto_s3: Any = boto3_client("s3")
|
||||
|
||||
csv_client = CsvS3Client(boto_s3, bucket)
|
||||
unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository(
|
||||
csv_client, bucket
|
||||
)
|
||||
|
||||
# Raw rows, not load_batch: the classifier CSV carries the description
|
||||
# columns but not the canonical address/postcode columns load_batch requires.
|
||||
rows = csv_client.read_rows(trigger.s3_uri)
|
||||
|
||||
engine = make_engine(PostgresConfig.from_env(os.environ))
|
||||
# The session is built up front (SQLModel sessions are lazy, so no
|
||||
# connection is checked out yet) and owned by this handler. Classification
|
||||
# runs first and calls ChatGPT, which is slow; we deliberately keep no
|
||||
# transaction open across it. Only the persistence below -- inside
|
||||
# ``commit_scope`` -- holds a connection.
|
||||
session = make_session(engine)
|
||||
try:
|
||||
chat_gpt = ChatGPT()
|
||||
columns = _build_columns(trigger.column_mapping, chat_gpt, session)
|
||||
orchestrator = LandlordDescriptionOverridesOrchestrator(
|
||||
unstandardised_address_repo=unstandardised_address_repo,
|
||||
columns=columns,
|
||||
)
|
||||
|
||||
classified = orchestrator.classify_from_rows(rows)
|
||||
|
||||
with commit_scope(session):
|
||||
orchestrator.persist(classified, portfolio_id=trigger.portfolio_id)
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
counts = {name: len(mapping) for name, mapping in classified.items()}
|
||||
for name, n in counts.items():
|
||||
logger.info("Classified %d descriptions for column %r.", n, name)
|
||||
return counts
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
|
||||
class LandlordDescriptionOverridesTriggerBody(BaseModel):
|
||||
model_config = ConfigDict(extra="allow")
|
||||
|
||||
task_id: UUID
|
||||
sub_task_id: UUID
|
||||
s3_uri: str
|
||||
# ``portfolio_id`` is ``bigint`` in the ``landlord_*_overrides`` schema --
|
||||
# Python ``int`` is unbounded so the Pydantic side stays simple; the
|
||||
# SQLModel row class pins the storage to ``BigInteger``.
|
||||
portfolio_id: int
|
||||
# category -> source CSV header (the classifier subset of the upload
|
||||
# mapping). Defaulted so a malformed/empty message classifies nothing
|
||||
# rather than failing validation.
|
||||
column_mapping: dict[str, str] = {}
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
POSTGRES_HOST=
|
||||
POSTGRES_PORT=5432
|
||||
POSTGRES_USERNAME=
|
||||
POSTGRES_PASSWORD=
|
||||
POSTGRES_DATABASE=
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
services:
|
||||
landlord_overrides:
|
||||
build:
|
||||
context: ../../../
|
||||
dockerfile: applications/landlord_description_overrides/Dockerfile
|
||||
ports:
|
||||
- "9002:8080"
|
||||
env_file:
|
||||
- .env.local
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
#!/usr/bin/env python3
|
||||
import json
|
||||
import requests
|
||||
|
||||
HOST = "localhost"
|
||||
PORT = "9002"
|
||||
|
||||
LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations"
|
||||
|
||||
payload = {"Records": [{"body": json.dumps({})}]}
|
||||
|
||||
response = requests.post(LAMBDA_URL, json=payload)
|
||||
|
||||
print("Status code:", response.status_code)
|
||||
print("Response:")
|
||||
print(response.text)
|
||||
12
applications/landlord_description_overrides/local_handler/run_local.sh
Executable file
12
applications/landlord_description_overrides/local_handler/run_local.sh
Executable file
|
|
@ -0,0 +1,12 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
if [ ! -f .env.local ]; then
|
||||
cp .env.local.example .env.local
|
||||
echo "Created .env.local from the template — fill it in, then re-run." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
docker compose build --no-cache
|
||||
docker compose up --force-recreate
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
boto3
|
||||
pydantic
|
||||
sqlmodel
|
||||
psycopg2-binary
|
||||
openai==1.93.0
|
||||
|
|
@ -9,11 +9,11 @@ from applications.postcode_splitter.postcode_splitter_trigger_body import (
|
|||
PostcodeSplitterTriggerBody,
|
||||
)
|
||||
from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
|
||||
from infrastructure.csv_s3_client import CsvS3Client
|
||||
from infrastructure.s3.csv_s3_client import CsvS3Client
|
||||
from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator
|
||||
from orchestration.task_orchestrator import TaskOrchestrator
|
||||
from repositories.user_address.user_address_csv_s3_repository import (
|
||||
UserAddressCsvS3Repository,
|
||||
from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import (
|
||||
UnstandardisedAddressListCsvS3Repository,
|
||||
)
|
||||
from utilities.aws_lambda.subtask_handler import subtask_handler
|
||||
|
||||
|
|
@ -29,17 +29,19 @@ def handler(
|
|||
|
||||
# boto3.client is overloaded per-service in the installed stubs; cast
|
||||
# to Any so the strict-mode checker treats it as opaque.
|
||||
boto3_client: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
||||
boto3_client: Any = (
|
||||
boto3.client
|
||||
) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
||||
boto_s3: Any = boto3_client("s3")
|
||||
boto_sqs: Any = boto3_client("sqs")
|
||||
|
||||
csv_client = CsvS3Client(boto_s3, bucket)
|
||||
user_address_repo = UserAddressCsvS3Repository(csv_client, bucket)
|
||||
unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository(csv_client, bucket)
|
||||
queue_client = Address2UprnQueueClient(boto_sqs, queue_url)
|
||||
|
||||
splitter = PostcodeSplitterOrchestrator(
|
||||
task_orchestrator=task_orchestrator,
|
||||
user_address_repo=user_address_repo,
|
||||
unstandardised_address_repo=unstandardised_address_repo,
|
||||
queue_client=queue_client,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -32,6 +32,12 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|||
COPY utils/ utils/
|
||||
COPY backend/ backend/
|
||||
COPY datatypes/ datatypes/
|
||||
# main.py imports infrastructure.epc_client.epc_client_service (EpcClientService);
|
||||
# without this the lambda fails at init with "No module named 'infrastructure'".
|
||||
COPY infrastructure/ infrastructure/
|
||||
# EpcClientService -> datatypes.epc.domain.mapper -> domain.sap10_calculator;
|
||||
# without this the lambda fails at init with "No module named 'domain'".
|
||||
COPY domain/ domain/
|
||||
|
||||
# Copy the handler
|
||||
COPY backend/address2UPRN/main.py .
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ from backend.app.bulk_uploads.schema import (
|
|||
CombinedResultsResponse,
|
||||
CombinerTriggerRequest,
|
||||
FlagsSummary,
|
||||
LandlordOverridesTriggerRequest,
|
||||
PostcodeSplitterTriggerRequest,
|
||||
)
|
||||
from backend.app.bulk_uploads.scoring import score_bucket
|
||||
|
|
@ -92,6 +93,26 @@ async def trigger_combiner(req: CombinerTriggerRequest):
|
|||
}
|
||||
|
||||
|
||||
@router.post("/trigger-landlord-overrides", status_code=202)
|
||||
async def trigger_landlord_overrides(req: LandlordOverridesTriggerRequest):
|
||||
settings = get_settings()
|
||||
|
||||
try:
|
||||
sqs = boto3.client("sqs", settings.AWS_DEFAULT_REGION)
|
||||
response = sqs.send_message(
|
||||
QueueUrl=settings.LANDLORD_OVERRIDES_SQS_URL,
|
||||
MessageBody=req.model_dump_json(),
|
||||
)
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"SQS error: {e}")
|
||||
|
||||
return {
|
||||
"task_id": req.task_id,
|
||||
"sub_task_id": req.sub_task_id,
|
||||
"sqs_message_id": response.get("MessageId"),
|
||||
}
|
||||
|
||||
|
||||
@router.get("/{task_id}/combined-results", response_model=CombinedResultsResponse)
|
||||
async def get_combined_results(
|
||||
task_id: UUID,
|
||||
|
|
|
|||
|
|
@ -14,6 +14,15 @@ class CombinerTriggerRequest(BaseModel):
|
|||
sub_task_id: str
|
||||
|
||||
|
||||
class LandlordOverridesTriggerRequest(BaseModel):
|
||||
task_id: str
|
||||
sub_task_id: str
|
||||
s3_uri: str
|
||||
portfolio_id: int
|
||||
# category -> source CSV header (the classifier subset of the upload mapping)
|
||||
column_mapping: dict[str, str]
|
||||
|
||||
|
||||
class FlagsSummary(BaseModel):
|
||||
duplicates: int
|
||||
missing: int
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ class Settings(BaseSettings):
|
|||
MAGICPLAN_SQS_URL: str = "changeme"
|
||||
POSTCODE_SPLITTER_SQS_URL: str = "changeme"
|
||||
COMBINER_SQS_URL: str = "changeme"
|
||||
LANDLORD_OVERRIDES_SQS_URL: str = "changeme"
|
||||
|
||||
# Third parties
|
||||
EPC_AUTH_TOKEN: str = "changeme"
|
||||
|
|
|
|||
|
|
@ -40,6 +40,9 @@ COPY --from=build-image /usr/local/lib/python3.11/site-packages/ /usr/local/lib/
|
|||
COPY ./backend/ ./backend
|
||||
COPY ./recommendations/ ./recommendations
|
||||
COPY ./utils/ ./utils/
|
||||
# engine.py -> backend.apis.GoogleSolarApi -> infrastructure.solar; without this
|
||||
# the lambda fails at init with "No module named 'infrastructure'".
|
||||
COPY ./infrastructure/ ./infrastructure/
|
||||
COPY ./etl/epc/ ./etl/epc/
|
||||
COPY ./etl/epc_clean/ ./etl/epc_clean/
|
||||
COPY ./etl/bill_savings/ ./etl/bill_savings/
|
||||
|
|
|
|||
|
|
@ -13,6 +13,12 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|||
COPY utils/ utils/
|
||||
COPY backend/ backend/
|
||||
COPY datatypes/ datatypes/
|
||||
# handler -> ecmk_service -> documents_parser.parser -> datatypes.epc.domain.mapper
|
||||
# -> domain.sap10_calculator, and -> db_writer -> epc_property model
|
||||
# -> infrastructure.postgres. Without these the lambda fails at init with
|
||||
# "No module named 'domain'" / "'infrastructure'".
|
||||
COPY domain/ domain/
|
||||
COPY infrastructure/ infrastructure/
|
||||
|
||||
# Local lambda entrypoint
|
||||
# ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"]
|
||||
|
|
|
|||
|
|
@ -10,6 +10,12 @@ WORKDIR /var/task
|
|||
COPY utils/ utils/
|
||||
COPY backend/ backend/
|
||||
COPY datatypes/ datatypes/
|
||||
# handler -> pashub_service -> documents_parser.parser -> datatypes.epc.domain.mapper
|
||||
# -> domain.sap10_calculator, and -> db_writer -> epc_property model
|
||||
# -> infrastructure.postgres. Without these the lambda fails at init with
|
||||
# "No module named 'domain'" / "'infrastructure'".
|
||||
COPY domain/ domain/
|
||||
COPY infrastructure/ infrastructure/
|
||||
|
||||
COPY backend/pashub_fetcher/handler/requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
|
|
|||
|
|
@ -14,55 +14,11 @@ class TestSearchEpcIntegration:
|
|||
def epc_auth_token(self):
|
||||
return os.getenv("EPC_AUTH_TOKEN")
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"address, postcode, uprn, skip_os, lmk_key, n_old_epcs",
|
||||
[
|
||||
# Test case 1: Valid address and postcode, skipping OS
|
||||
# In this case, the property is an individual flat but the uprn associated to the
|
||||
# EPC is for the building as a whole, possibly because there was a conversion of sorts
|
||||
("Garden Flat, 48 Bedminster Parade", "BS3 4HS", 308249, True,
|
||||
"260907a5431fa073d193cc6bbec51fbf1ba9a61845ab2503f85aa19ce3ed6afd", 1),
|
||||
|
||||
# Test case 2: Another valid address and postcode
|
||||
# In this case, the newest EPC, does not have a uprn associated to it. If we did a search by
|
||||
# uprn, we would get an old EPC
|
||||
("Flat 8, Hainton House", "DN32 9AQ", "", True,
|
||||
"bd1149a20a73397184f07a9955f872424826e70f4870c058d71be887766ee1f8", 2),
|
||||
# Test case 3: When we make a request to the API for this property, we get back results for
|
||||
# flats 1, 2 and 3. We have some logic to handle the response so that we get back flat 1
|
||||
("Flat 1, 1 Tottenham Street, London", "W1T 2AE", 5167411, True,
|
||||
"3e6414d7f15f4cf7a69dc20c469bcf043d31a49239b183f1bd0c0e1aafa23c93", 0),
|
||||
|
||||
],
|
||||
)
|
||||
def test_find_property(self, epc_auth_token, address, postcode, uprn, skip_os, lmk_key, n_old_epcs):
|
||||
"""
|
||||
Integration test for `find_property`, making actual API calls.
|
||||
"""
|
||||
# Provide your actual API keys or tokens here
|
||||
os_api_key = ""
|
||||
|
||||
# Initialize the SearchEpc instance
|
||||
epc_searcher = SearchEpc(
|
||||
address1=address,
|
||||
postcode=postcode,
|
||||
uprn=uprn,
|
||||
auth_token=epc_auth_token,
|
||||
os_api_key=os_api_key,
|
||||
)
|
||||
|
||||
# Execute the method
|
||||
epc_searcher.find_property(skip_os=skip_os)
|
||||
|
||||
# We check that we have the correct epc
|
||||
assert epc_searcher.newest_epc["lmk-key"] == lmk_key
|
||||
assert len(epc_searcher.older_epcs) == n_old_epcs
|
||||
|
||||
def test_search_housenumber(self):
|
||||
eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter'
|
||||
eg1 = "Flat A11, Mortimer House, Grendon Road, Exeter"
|
||||
res1 = SearchEpc.get_house_number(eg1, None)
|
||||
assert res1 == "A11"
|
||||
|
||||
eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL'
|
||||
eg2 = "Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL"
|
||||
res2 = SearchEpc.get_house_number(eg2, None)
|
||||
assert res2 == "A9"
|
||||
|
|
|
|||
|
|
@ -98,7 +98,9 @@ class MainHeatingDetail:
|
|||
boiler_flue_type: Optional[int] = None # TODO: make enum?
|
||||
boiler_ignition_type: Optional[int] = None # TODO: make enum?
|
||||
central_heating_pump_age: Optional[int] = None
|
||||
central_heating_pump_age_str: Optional[str] = None # str from site notes e.g. "Unknown", "Pre 2013"
|
||||
central_heating_pump_age_str: Optional[str] = (
|
||||
None # str from site notes e.g. "Unknown", "Pre 2013"
|
||||
)
|
||||
main_heating_index_number: Optional[int] = None
|
||||
sap_main_heating_code: Optional[int] = None # TODO: make enum?
|
||||
main_heating_number: Optional[int] = None
|
||||
|
|
@ -132,7 +134,7 @@ class ShowerOutlets:
|
|||
|
||||
@dataclass
|
||||
class SapHeating:
|
||||
instantaneous_wwhrs: InstantaneousWwhrs
|
||||
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
|
||||
main_heating_details: List[MainHeatingDetail]
|
||||
has_fixed_air_conditioning: bool
|
||||
cylinder_size: Optional[Union[int, str]] = (
|
||||
|
|
@ -145,7 +147,9 @@ class SapHeating:
|
|||
cylinder_insulation_type: Optional[Union[int, str]] = None
|
||||
cylinder_thermostat: Optional[str] = None
|
||||
secondary_fuel_type: Optional[int] = None
|
||||
secondary_heating_type: Optional[Union[int, str]] = None # int from API; str from site notes
|
||||
secondary_heating_type: Optional[Union[int, str]] = (
|
||||
None # int from API; str from site notes
|
||||
)
|
||||
cylinder_insulation_thickness_mm: Optional[int] = None
|
||||
# SAP10 hot-water demand inputs from sap_heating.
|
||||
number_baths: Optional[int] = None
|
||||
|
|
@ -168,7 +172,9 @@ class SapHeating:
|
|||
class SapVentilation:
|
||||
ventilation_type: Optional[str] = None
|
||||
draught_lobby: Optional[bool] = None
|
||||
pressure_test: Optional[str] = None # str from site notes e.g. "No test"; int in API via mechanical_ventilation
|
||||
pressure_test: Optional[str] = (
|
||||
None # str from site notes e.g. "No test"; int in API via mechanical_ventilation
|
||||
)
|
||||
open_flues_count: Optional[int] = None
|
||||
closed_flues_count: Optional[int] = None
|
||||
boiler_flues_count: Optional[int] = None
|
||||
|
|
@ -476,8 +482,12 @@ class SapBuildingPart:
|
|||
None # TODO: make enum/mapping?
|
||||
)
|
||||
floor_type: Optional[str] = None # str from site notes e.g. "Ground Floor"
|
||||
floor_construction_type: Optional[str] = None # str from site notes; distinct from floor_construction: int in SapFloorDimension
|
||||
floor_insulation_type_str: Optional[str] = None # str from site notes e.g. "As Built"
|
||||
floor_construction_type: Optional[str] = (
|
||||
None # str from site notes; distinct from floor_construction: int in SapFloorDimension
|
||||
)
|
||||
floor_insulation_type_str: Optional[str] = (
|
||||
None # str from site notes e.g. "As Built"
|
||||
)
|
||||
floor_u_value_known: Optional[bool] = None
|
||||
|
||||
roof_construction: Optional[int] = None
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ class SapHeating:
|
|||
cylinder_size: int
|
||||
water_heating_code: int
|
||||
water_heating_fuel: int
|
||||
instantaneous_wwhrs: InstantaneousWwhrs
|
||||
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
|
||||
main_heating_details: List[MainHeatingDetail]
|
||||
immersion_heating_type: Union[int, str]
|
||||
cylinder_insulation_type: int
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ class SapHeating:
|
|||
cylinder_size: int
|
||||
water_heating_code: int
|
||||
water_heating_fuel: int
|
||||
instantaneous_wwhrs: InstantaneousWwhrs
|
||||
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
|
||||
main_heating_details: List[MainHeatingDetail]
|
||||
immersion_heating_type: Union[int, str]
|
||||
cylinder_insulation_type: int
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ class SapHeating:
|
|||
cylinder_size: int
|
||||
water_heating_code: int
|
||||
water_heating_fuel: int
|
||||
instantaneous_wwhrs: InstantaneousWwhrs
|
||||
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
|
||||
main_heating_details: List[MainHeatingDetail]
|
||||
immersion_heating_type: Union[int, str]
|
||||
has_fixed_air_conditioning: str
|
||||
|
|
@ -86,6 +86,7 @@ class SapFloorDimension:
|
|||
@dataclass
|
||||
class SapRoomInRoof:
|
||||
"""Room-in-roof details. floor_area is a Measurement object in schema 18.0."""
|
||||
|
||||
floor_area: Measurement
|
||||
insulation: str
|
||||
roof_room_connected: str
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ class SapHeating:
|
|||
cylinder_size: int
|
||||
water_heating_code: int
|
||||
water_heating_fuel: int
|
||||
instantaneous_wwhrs: InstantaneousWwhrs
|
||||
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
|
||||
main_heating_details: List[MainHeatingDetail]
|
||||
immersion_heating_type: Union[int, str]
|
||||
has_fixed_air_conditioning: str
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ class SapHeating:
|
|||
cylinder_size: int
|
||||
water_heating_code: int
|
||||
water_heating_fuel: int
|
||||
instantaneous_wwhrs: InstantaneousWwhrs
|
||||
instantaneous_wwhrs: Optional[InstantaneousWwhrs]
|
||||
main_heating_details: List[MainHeatingDetail]
|
||||
immersion_heating_type: Union[int, str]
|
||||
has_fixed_air_conditioning: str
|
||||
|
|
@ -103,6 +103,7 @@ class SapFloorDimension:
|
|||
@dataclass
|
||||
class SapRoomInRoof:
|
||||
"""Room-in-roof details. floor_area is a plain number in schema 20.0.0 (not a Measurement object)."""
|
||||
|
||||
floor_area: Union[int, float]
|
||||
insulation: str
|
||||
roof_room_connected: str
|
||||
|
|
|
|||
|
|
@ -0,0 +1,50 @@
|
|||
data "terraform_remote_state" "shared" {
|
||||
backend = "s3"
|
||||
config = {
|
||||
bucket = "assessment-model-terraform-state"
|
||||
key = "env:/${var.stage}/terraform.tfstate"
|
||||
region = "eu-west-2"
|
||||
}
|
||||
}
|
||||
|
||||
data "aws_secretsmanager_secret_version" "db_credentials" {
|
||||
secret_id = "${var.stage}/assessment_model/db_credentials"
|
||||
}
|
||||
|
||||
locals {
|
||||
db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)
|
||||
}
|
||||
|
||||
module "lambda" {
|
||||
source = "../../modules/lambda_with_sqs"
|
||||
|
||||
name = "landlord-description-overrides"
|
||||
stage = var.stage
|
||||
|
||||
image_uri = local.image_uri
|
||||
|
||||
# The classifier calls OpenAI once per distinct description per column, so it
|
||||
# is latency-bound. 300s leaves headroom under the queue's 1000s visibility
|
||||
# timeout. batch_size = 1 keeps one upload per invocation, so a single bad
|
||||
# record cannot redrive its siblings. maximum_concurrency caps fan-out to
|
||||
# respect OpenAI rate limits.
|
||||
timeout = 300
|
||||
batch_size = 1
|
||||
maximum_concurrency = 5
|
||||
|
||||
environment = merge(
|
||||
{
|
||||
STAGE = var.stage
|
||||
LOG_LEVEL = "info"
|
||||
POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username
|
||||
POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password
|
||||
OPENAI_API_KEY = var.openai_api_key
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
# Attach S3 read policy so the handler can read the original upload CSV.
|
||||
resource "aws_iam_role_policy_attachment" "landlord_overrides_s3_read" {
|
||||
role = module.lambda.role_name
|
||||
policy_arn = data.terraform_remote_state.shared.outputs.landlord_overrides_s3_read_arn
|
||||
}
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
output "landlord_description_overrides_queue_url" {
|
||||
value = module.lambda.queue_url
|
||||
description = "URL of the Landlord Description Overrides SQS queue (wire into the FastAPI LANDLORD_OVERRIDES_SQS_URL)"
|
||||
}
|
||||
|
||||
output "landlord_description_overrides_queue_arn" {
|
||||
value = module.lambda.queue_arn
|
||||
description = "ARN of the Landlord Description Overrides SQS queue"
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
terraform {
|
||||
required_providers {
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
version = ">= 5.0"
|
||||
}
|
||||
}
|
||||
|
||||
backend "s3" {
|
||||
bucket = "landlord-description-overrides-terraform-state"
|
||||
key = "terraform.tfstate"
|
||||
region = "eu-west-2"
|
||||
}
|
||||
|
||||
required_version = ">= 1.2.0"
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
variable "lambda_name" {
|
||||
type = string
|
||||
description = "Logical name of the lambda (e.g. landlordDescriptionOverrides)"
|
||||
}
|
||||
|
||||
variable "stage" {
|
||||
description = "Deployment stage (e.g. dev, prod)"
|
||||
type = string
|
||||
}
|
||||
|
||||
variable "ecr_repo_url" {
|
||||
type = string
|
||||
description = "ECR repository URL (no tag, no digest)"
|
||||
}
|
||||
|
||||
variable "image_digest" {
|
||||
type = string
|
||||
description = "Image digest (sha256:...)"
|
||||
}
|
||||
|
||||
variable "openai_api_key" {
|
||||
type = string
|
||||
description = "OpenAI API key used by the ChatGPT column classifier"
|
||||
sensitive = true
|
||||
}
|
||||
|
||||
locals {
|
||||
image_uri = "${var.ecr_repo_url}@${var.image_digest}"
|
||||
}
|
||||
|
||||
output "resolved_image_uri" {
|
||||
value = local.image_uri
|
||||
}
|
||||
|
|
@ -268,11 +268,11 @@ output "retrofit_heat_baseline_predictions_bucket_name" {
|
|||
|
||||
// We make this bucket presignable, because we want to generate download links for the frontend
|
||||
module "retrofit_energy_assessments" {
|
||||
source = "../modules/s3_presignable_bucket"
|
||||
bucketname = "retrofit-energy-assessments-${var.stage}"
|
||||
allowed_origins = var.allowed_origins
|
||||
environment = var.stage
|
||||
enable_versioning = true
|
||||
source = "../modules/s3_presignable_bucket"
|
||||
bucketname = "retrofit-energy-assessments-${var.stage}"
|
||||
allowed_origins = var.allowed_origins
|
||||
environment = var.stage
|
||||
enable_versioning = true
|
||||
}
|
||||
|
||||
output "retrofit_energy_assessments_bucket_name" {
|
||||
|
|
@ -494,6 +494,42 @@ output "postcode_splitter_s3_read_arn" {
|
|||
value = module.postcode_splitter_s3_read.policy_arn
|
||||
}
|
||||
|
||||
################################################
|
||||
# Landlord Description Overrides – Lambda
|
||||
################################################
|
||||
module "landlord_description_overrides_state_bucket" {
|
||||
source = "../modules/tf_state_bucket"
|
||||
bucket_name = "landlord-description-overrides-terraform-state"
|
||||
}
|
||||
|
||||
module "landlord_description_overrides_registry" {
|
||||
source = "../modules/container_registry"
|
||||
name = "landlord_description_overrides"
|
||||
stage = var.stage
|
||||
}
|
||||
|
||||
# S3 policy for the landlord classifier to read the original upload CSV.
|
||||
module "landlord_overrides_s3_read" {
|
||||
source = "../modules/s3_iam_policy"
|
||||
|
||||
policy_name = "LandlordOverridesReadS3"
|
||||
# NOTE: aws_iam_policy.description is ForceNew — changing it destroys+recreates the
|
||||
# policy, which deadlocks because the policy is attached to the lambda role in the
|
||||
# separate landlordDescriptionOverrides stack (DeleteConflict). Keep this string
|
||||
# byte-for-byte identical to what's in state so the bucket change applies in-place.
|
||||
policy_description = "Allow landlord description overrides Lambda to read from retrofit-data bucket"
|
||||
bucket_arns = [
|
||||
"arn:aws:s3:::retrofit-plan-inputs-${var.stage}",
|
||||
"arn:aws:s3:::retrofit-data-${var.stage}",
|
||||
]
|
||||
actions = ["s3:GetObject", "s3:ListBucket"]
|
||||
resource_paths = ["/*"]
|
||||
}
|
||||
|
||||
output "landlord_overrides_s3_read_arn" {
|
||||
value = module.landlord_overrides_s3_read.policy_arn
|
||||
}
|
||||
|
||||
################################################
|
||||
# Bulk Address2UPRN Combiner – Lambda ECR
|
||||
################################################
|
||||
|
|
@ -729,7 +765,7 @@ module "hubspot_etl_bucket" {
|
|||
module "hubspot_etl_registry" {
|
||||
source = "../modules/container_registry"
|
||||
name = "hubspot-etl"
|
||||
stage = var.stage
|
||||
stage = var.stage
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
79
docs/adr/0003-python-writes-landlord-overrides-directly.md
Normal file
79
docs/adr/0003-python-writes-landlord-overrides-directly.md
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
# ADR-0003: Python writes landlord overrides directly to Postgres
|
||||
|
||||
**Status:** Accepted
|
||||
**Date:** 2026-05-26
|
||||
**Supersedes (in part):** [assessment-model/docs/adr/0002-landlord-override-vocabulary.md](https://github.com/.../assessment-model/blob/main/docs/adr/0002-landlord-override-vocabulary.md) — specifically the clause beginning *"Writes happen from Next.js …"*.
|
||||
|
||||
## Context
|
||||
|
||||
ADR-0002 (in the `assessment-model` TS repo) defined the `landlord_property_type_overrides` and `landlord_wall_type_overrides` tables and noted that the Model service would POST classification results to a Next.js route handler, with Next.js performing the upsert. Drizzle remained the schema source of truth.
|
||||
|
||||
That extra hop has not been built and is now judged unnecessary for the present scope:
|
||||
|
||||
- The classification result is internal — a Lambda computes it, the same Lambda persists it. No third party needs to participate in the write.
|
||||
- Drizzle remains the schema's source of truth either way: the Python adapter mirrors the schema in a SQLModel row, but the migrations stay with Drizzle. Adding a Next.js route would not change which side owns schema definition.
|
||||
- The Python lambda already lives next to a Postgres connection in the existing pipeline (`subtask`/`tasks` tables are written from Python today). Adding two more tables to that adapter surface is a small, well-understood change. Routing the same writes through Next.js would mean: lambda → JSON-over-HTTP → Next.js route → Drizzle → Postgres, instead of lambda → SQLAlchemy → Postgres. Three extra moving parts to ship, deploy, monitor, and authenticate for no behavioural gain.
|
||||
|
||||
## Decision
|
||||
|
||||
The Model service (specifically `applications/landlord_description_overrides/handler.py`) writes directly to `landlord_property_type_overrides` and `landlord_wall_type_overrides` via a SQLAlchemy-backed `LandlordOverrideRepository[E]` adapter. No Next.js route handler is required.
|
||||
|
||||
Transaction boundaries live in `infrastructure/postgres/engine.transactional_session` — a context manager that commits on clean exit and rolls back on exception. The application layer (`handler.py`) never calls `.commit()` or `.rollback()` itself; it only opens the context. Orchestration and repository code likewise never commits — keeping transaction semantics confined to one infrastructure helper.
|
||||
|
||||
The conflict policy lives in SQL and is identical for every override category. A single generic adapter, `LandlordOverridesRepository[E]`, implements it once; the target table is selected by the SQLModel `…Row` class passed at construction. Each category (property / built-form / wall / roof type) is that same adapter parameterised by its row class:
|
||||
|
||||
```sql
|
||||
INSERT INTO landlord_property_type_overrides (portfolio_id, description, value, source)
|
||||
VALUES …
|
||||
ON CONFLICT (portfolio_id, description)
|
||||
DO UPDATE SET value = EXCLUDED.value,
|
||||
source = EXCLUDED.source,
|
||||
updated_at = now()
|
||||
WHERE landlord_property_type_overrides.source = 'classifier';
|
||||
```
|
||||
|
||||
The `WHERE existing.source = 'classifier'` guard is load-bearing: it lets the classifier refresh its own past output while leaving `source = 'user'` rows untouched. This is the contract ADR-0002's `source` column was added for.
|
||||
|
||||
`UNKNOWN` values are persisted, not skipped — consistent with ADR-0002 §5. A future user override can upgrade them.
|
||||
|
||||
## Consequences
|
||||
|
||||
**Positive.**
|
||||
|
||||
- One fewer service to deploy, monitor, and authenticate.
|
||||
- The classifier and persistence live in the same process — failures surface against a single `sub_task` row, not split across two systems.
|
||||
- The Postgres adapter mirrors the existing `subtask`/`tasks` repositories, so reviewers have a precedent to compare against.
|
||||
|
||||
**Negative.**
|
||||
|
||||
- The Python repo now holds two schemas — the schema-source-of-truth Drizzle definition lives in the TS repo, and the Python `SQLModel` row class shadows it. They must stay in lockstep. Mitigations: the TS schema header comment (`landlord_overrides.ts:12`) already names the Python source-of-truth file; a future ADR may add a CI check that diffs the two.
|
||||
- The boundary that ADR-0002 anticipated for pgEnum validation (a Next.js route validating incoming values before insert) is gone. Pydantic + the Python `Enum` type catch invalid values on the producing side, and Postgres's pgEnum will reject anything that slips through.
|
||||
|
||||
## File layout
|
||||
|
||||
This ADR also fixes a placement convention for Postgres adapters going forward. The codebase currently has the ChatGPT classifier split cleanly along DDD lines — port in `domain/`, adapter in `infrastructure/chatgpt/` — but the `tasks` Postgres adapter does not follow the same shape: its concrete class lives in `repositories/tasks/`, not `infrastructure/postgres/`.
|
||||
|
||||
The convention going forward separates the persistence *behaviour* (grouped by aggregate) from the schema *mirrors* (grouped by technology, since they share pgEnums and engine metadata):
|
||||
|
||||
- **Port (protocol / abstract base):** `repositories/<aggregate>/<thing>_repository.py`
|
||||
- **Postgres repository adapter (concrete):** `infrastructure/<aggregate>/<aggregate>_postgres_repository.py`
|
||||
- **SQLModel row class (`table=True` schema mirror):** `infrastructure/postgres/<thing>_table.py`
|
||||
|
||||
The `LandlordOverridesRepository` adapter follows this convention: the concrete class — the aggregate's "talker" to Postgres — lives at `infrastructure/landlord_overrides/landlord_overrides_postgres_repository.py`, while the per-category `…Row` classes stay in `infrastructure/postgres/`. The `…Row` classes are one-per-table — each mirrors a genuinely distinct Drizzle table and `value` pgEnum, and they share the single `override_source` pgEnum instance, so they belong together in the Postgres technology bucket as schema mirrors, not duplicated logic.
|
||||
|
||||
(This refines the placement first sketched in this ADR, which put the adapter in `infrastructure/postgres/` alongside the row classes. The adapter holds no schema — only the write path — so it groups by aggregate; only the `table=True` mirrors stay tech-bucketed.)
|
||||
|
||||
**Existing outliers to relocate in a follow-up:**
|
||||
|
||||
- `repositories/tasks/task_postgres_repository.py` → `infrastructure/tasks/task_postgres_repository.py`
|
||||
- `repositories/tasks/subtask_postgres_repository.py` → `infrastructure/tasks/subtask_postgres_repository.py`
|
||||
|
||||
(Their `task_table.py` / `subtask_table.py` schema mirrors already sit correctly in `infrastructure/postgres/`.) Both moves are mechanical (import-path updates only). They are intentionally out of scope for the present PR.
|
||||
|
||||
## Out of scope (deferred to follow-up work)
|
||||
|
||||
- Relocating `task_postgres_repository.py` and `subtask_postgres_repository.py` into `infrastructure/tasks/` per the convention above.
|
||||
- ~~Extracting a shared upsert helper / base class once a third `landlord_*_overrides` column lands — until then the per-category adapters' 95%-identical bodies are kept side-by-side for direct comparison.~~ **Done.** The per-category adapter bodies were byte-identical (varying only in their row class), so they were consolidated into one generic `LandlordOverridesRepository[E]` parameterised by row class rather than waiting for a third column.
|
||||
- Switching `applications/landlord_description_overrides/handler.py` to acquire its `Session` via a `@subtask_handler()`-style decorator instead of building its own engine.
|
||||
- A cross-repo PR amending ADR-0002 to point at this ADR.
|
||||
- A CI check (or codegen) that diffs the Drizzle pgEnum literals against the Python `Enum.value` strings.
|
||||
|
|
@ -2,21 +2,21 @@ from __future__ import annotations
|
|||
|
||||
from collections.abc import Iterable, Iterator
|
||||
|
||||
from domain.addresses.user_address import UserAddress
|
||||
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
|
||||
from domain.postcode import Postcode
|
||||
|
||||
|
||||
def iter_postcode_grouped_batches(
|
||||
addresses: Iterable[UserAddress],
|
||||
addresses: Iterable[UnstandardisedAddress],
|
||||
*,
|
||||
max_batch_size: int = 500,
|
||||
) -> Iterator[list[UserAddress]]:
|
||||
) -> Iterator[AddressList]:
|
||||
if max_batch_size < 1:
|
||||
raise ValueError("max_batch_size must be >= 1")
|
||||
|
||||
groups = _group_by_postcode_in_order(addresses)
|
||||
|
||||
buffer: list[UserAddress] = []
|
||||
buffer: AddressList = AddressList([])
|
||||
for group in groups.values():
|
||||
group_len = len(group)
|
||||
|
||||
|
|
@ -26,14 +26,14 @@ def iter_postcode_grouped_batches(
|
|||
if group_len >= max_batch_size:
|
||||
if buffer:
|
||||
yield buffer
|
||||
buffer = []
|
||||
buffer = AddressList([])
|
||||
yield group
|
||||
continue
|
||||
|
||||
# Adding this group would overflow: flush buffer before appending.
|
||||
if len(buffer) + group_len > max_batch_size:
|
||||
yield buffer
|
||||
buffer = []
|
||||
buffer = AddressList([])
|
||||
|
||||
buffer.extend(group)
|
||||
|
||||
|
|
@ -43,9 +43,9 @@ def iter_postcode_grouped_batches(
|
|||
|
||||
|
||||
def _group_by_postcode_in_order(
|
||||
addresses: Iterable[UserAddress],
|
||||
) -> dict[Postcode, list[UserAddress]]:
|
||||
groups: dict[Postcode, list[UserAddress]] = {}
|
||||
addresses: Iterable[UnstandardisedAddress],
|
||||
) -> dict[Postcode, AddressList]:
|
||||
groups: dict[Postcode, AddressList] = {}
|
||||
for address in addresses:
|
||||
groups.setdefault(address.postcode, []).append(address)
|
||||
groups.setdefault(address.postcode, AddressList([])).append(address)
|
||||
return groups
|
||||
|
|
|
|||
21
domain/addresses/standardised_address_list.py
Normal file
21
domain/addresses/standardised_address_list.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import NewType, Optional
|
||||
|
||||
from domain.postcode import Postcode
|
||||
|
||||
|
||||
def _empty_source_row() -> dict[str, str]:
|
||||
return {}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class StandardisedAddress:
|
||||
address: str
|
||||
postcode: Postcode
|
||||
org_reference: Optional[str] = None
|
||||
|
||||
|
||||
# Standardised Asset List -- the cleaned output counterpart to AddressList.
|
||||
SAL = NewType("SAL", list[StandardisedAddress])
|
||||
24
domain/addresses/unstandardised_address.py
Normal file
24
domain/addresses/unstandardised_address.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, NewType
|
||||
|
||||
from domain.postcode import Postcode
|
||||
|
||||
|
||||
def _empty_source_row() -> dict[str, str]:
|
||||
return {}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class UnstandardisedAddress:
|
||||
address: str
|
||||
postcode: Postcode
|
||||
org_reference: Optional[str] = None
|
||||
additional_info: dict[str, str] = field(
|
||||
default_factory=_empty_source_row, compare=False
|
||||
)
|
||||
|
||||
|
||||
# A batch of raw, pre-standardisation addresses as supplied by a landlord.
|
||||
AddressList = NewType("AddressList", list[UnstandardisedAddress])
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from domain.postcode import Postcode
|
||||
|
||||
|
||||
def _empty_source_row() -> dict[str, str]:
|
||||
return {}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class UserAddress:
|
||||
user_address: str
|
||||
postcode: Postcode
|
||||
internal_reference: Optional[str] = None
|
||||
source_row: dict[str, str] = field(default_factory=_empty_source_row, compare=False)
|
||||
39
domain/data_transformation/column_classifier.py
Normal file
39
domain/data_transformation/column_classifier.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import Generic, TypeVar
|
||||
|
||||
E = TypeVar("E", bound=Enum)
|
||||
|
||||
|
||||
class ClassificationError(Exception):
|
||||
"""Raised when classifying a column's descriptions fails wholesale.
|
||||
|
||||
A whole-batch failure (the AI backend is unreachable, or returns a reply
|
||||
that cannot be parsed) raises this. A single description that merely
|
||||
cannot be resolved is not an error -- it maps to the enum's UNKNOWN member.
|
||||
"""
|
||||
|
||||
|
||||
class ColumnClassifier(ABC, Generic[E]):
|
||||
"""Port: resolves free-text descriptions into a category enum ``E``.
|
||||
|
||||
One classifier handles one landlord-CSV column. Implementations decide
|
||||
*how* the mapping is performed (an LLM, a lookup table, a rules engine);
|
||||
``LandlordDescriptionOverridesOrchestrator`` depends only on this interface.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def classify(self, descriptions: set[str]) -> dict[str, E]:
|
||||
"""Classify each description into a category enum member.
|
||||
|
||||
Every input description appears as a key in the result. A description
|
||||
that cannot be resolved maps to the enum's UNKNOWN member.
|
||||
|
||||
Raises:
|
||||
ClassificationError: If the classification call fails wholesale
|
||||
(e.g. the backend is unreachable or returns an unparseable
|
||||
response).
|
||||
"""
|
||||
...
|
||||
20
domain/epc/built_form_type.py
Normal file
20
domain/epc/built_form_type.py
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class BuiltFormType(Enum):
|
||||
"""A landlord-supplied built form, as resolved by the landlord-description-overrides context.
|
||||
|
||||
Mirrors the EPC built-form values. ``NOT_RECORDED`` is the legitimate
|
||||
EPC value for properties whose built form the surveyor did not capture;
|
||||
``UNKNOWN`` is the classifier fallback for landlord values that cannot be
|
||||
resolved at all.
|
||||
"""
|
||||
|
||||
DETACHED = "Detached"
|
||||
SEMI_DETACHED = "Semi-Detached"
|
||||
MID_TERRACE = "Mid-Terrace"
|
||||
END_TERRACE = "End-Terrace"
|
||||
ENCLOSED_MID_TERRACE = "Enclosed Mid-Terrace"
|
||||
ENCLOSED_END_TERRACE = "Enclosed End-Terrace"
|
||||
NOT_RECORDED = "Not Recorded"
|
||||
UNKNOWN = "Unknown"
|
||||
16
domain/epc/property_type.py
Normal file
16
domain/epc/property_type.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class PropertyType(Enum):
|
||||
"""A landlord-supplied property type, as resolved by the landlord-description-overrides context.
|
||||
|
||||
Distinct from the EPC context's ``PropertyType``: a landlord CSV value
|
||||
may be unresolvable, so this enum carries an explicit ``UNKNOWN`` member.
|
||||
"""
|
||||
|
||||
HOUSE = "House"
|
||||
BUNGALOW = "Bungalow"
|
||||
FLAT = "Flat"
|
||||
MAISONETTE = "Maisonette"
|
||||
PARK_HOME = "Park home"
|
||||
UNKNOWN = "Unknown"
|
||||
70
domain/epc/roof_type.py
Normal file
70
domain/epc/roof_type.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class RoofType(Enum):
|
||||
"""A landlord-supplied roof description, as resolved by the landlord-description-overrides context.
|
||||
|
||||
Each member is one full EPC roof-description string, combining shape
|
||||
(flat, pitched, roof room(s), thatched) with insulation state and, for
|
||||
pitched roofs, the loft-insulation depth in millimetres. Adjacency
|
||||
markers like ``(another dwelling above)`` represent a unit whose top
|
||||
boundary is another dwelling rather than a roof of its own; they are
|
||||
kept as members because they appear in the same EPC column.
|
||||
``UNKNOWN`` covers values the classifier cannot resolve -- most
|
||||
commonly raw ``Average thermal transmittance`` U-value strings that
|
||||
carry no shape/insulation information.
|
||||
"""
|
||||
|
||||
FLAT_INSULATED = "Flat, insulated"
|
||||
FLAT_INSULATED_ASSUMED = "Flat, insulated (assumed)"
|
||||
FLAT_LIMITED_INSULATION = "Flat, limited insulation"
|
||||
FLAT_LIMITED_INSULATION_ASSUMED = "Flat, limited insulation (assumed)"
|
||||
FLAT_NO_INSULATION = "Flat, no insulation"
|
||||
FLAT_NO_INSULATION_ASSUMED = "Flat, no insulation (assumed)"
|
||||
|
||||
PITCHED_INSULATED = "Pitched, insulated"
|
||||
PITCHED_INSULATED_ASSUMED = "Pitched, insulated (assumed)"
|
||||
PITCHED_INSULATED_AT_RAFTERS = "Pitched, insulated at rafters"
|
||||
PITCHED_LIMITED_INSULATION = "Pitched, limited insulation"
|
||||
PITCHED_LIMITED_INSULATION_ASSUMED = "Pitched, limited insulation (assumed)"
|
||||
PITCHED_NO_INSULATION = "Pitched, no insulation"
|
||||
PITCHED_NO_INSULATION_ASSUMED = "Pitched, no insulation (assumed)"
|
||||
PITCHED_UNKNOWN_LOFT_INSULATION = "Pitched, Unknown loft insulation"
|
||||
PITCHED_LOFT_0MM = "Pitched, 0 mm loft insulation"
|
||||
PITCHED_LOFT_12MM = "Pitched, 12 mm loft insulation"
|
||||
PITCHED_LOFT_25MM = "Pitched, 25 mm loft insulation"
|
||||
PITCHED_LOFT_50MM = "Pitched, 50 mm loft insulation"
|
||||
PITCHED_LOFT_75MM = "Pitched, 75 mm loft insulation"
|
||||
PITCHED_LOFT_100MM = "Pitched, 100 mm loft insulation"
|
||||
PITCHED_LOFT_125MM = "Pitched, 125 mm loft insulation"
|
||||
PITCHED_LOFT_150MM = "Pitched, 150 mm loft insulation"
|
||||
PITCHED_LOFT_175MM = "Pitched, 175 mm loft insulation"
|
||||
PITCHED_LOFT_200MM = "Pitched, 200 mm loft insulation"
|
||||
PITCHED_LOFT_225MM = "Pitched, 225 mm loft insulation"
|
||||
PITCHED_LOFT_250MM = "Pitched, 250 mm loft insulation"
|
||||
PITCHED_LOFT_270MM = "Pitched, 270 mm loft insulation"
|
||||
PITCHED_LOFT_300MM = "Pitched, 300 mm loft insulation"
|
||||
PITCHED_LOFT_350MM = "Pitched, 350 mm loft insulation"
|
||||
PITCHED_LOFT_400MM = "Pitched, 400 mm loft insulation"
|
||||
PITCHED_LOFT_400_PLUS_MM = "Pitched, 400+ mm loft insulation"
|
||||
|
||||
ROOF_ROOM_INSULATED = "Roof room(s), insulated"
|
||||
ROOF_ROOM_INSULATED_ASSUMED = "Roof room(s), insulated (assumed)"
|
||||
ROOF_ROOM_LIMITED_INSULATION = "Roof room(s), limited insulation"
|
||||
ROOF_ROOM_LIMITED_INSULATION_ASSUMED = "Roof room(s), limited insulation (assumed)"
|
||||
ROOF_ROOM_NO_INSULATION = "Roof room(s), no insulation"
|
||||
ROOF_ROOM_NO_INSULATION_ASSUMED = "Roof room(s), no insulation (assumed)"
|
||||
ROOF_ROOM_CEILING_INSULATED = "Roof room(s), ceiling insulated"
|
||||
ROOF_ROOM_THATCHED = "Roof room(s), thatched"
|
||||
ROOF_ROOM_THATCHED_WITH_ADDITIONAL_INSULATION = "Roof room(s), thatched with additional insulation"
|
||||
|
||||
THATCHED = "Thatched"
|
||||
THATCHED_WITH_ADDITIONAL_INSULATION = "Thatched, with additional insulation"
|
||||
|
||||
ADJACENT_ANOTHER_DWELLING_ABOVE = "(another dwelling above)"
|
||||
ADJACENT_SAME_DWELLING_ABOVE = "(same dwelling above)"
|
||||
ADJACENT_OTHER_PREMISES_ABOVE = "(other premises above)"
|
||||
ADJACENT_ANOTHER_PREMISES_ABOVE = "(another premises above)"
|
||||
ANOTHER_PREMISES_ABOVE = "Another Premises Above"
|
||||
|
||||
UNKNOWN = "Unknown"
|
||||
117
domain/epc/wall_type.py
Normal file
117
domain/epc/wall_type.py
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class WallType(Enum):
|
||||
"""A landlord-supplied wall description, as resolved by the landlord-description-overrides context.
|
||||
|
||||
Each member is one full EPC wall-description string, combining material
|
||||
(cavity, solid brick, sandstone, …) with construction/insulation state
|
||||
(as built, filled cavity, with internal insulation, …). ``UNKNOWN`` covers
|
||||
values the classifier cannot resolve — most commonly raw
|
||||
``Average thermal transmittance`` U-value strings that carry no material
|
||||
information.
|
||||
"""
|
||||
|
||||
CAVITY_FILLED = "Cavity wall, filled cavity"
|
||||
CAVITY_AS_BUILT_INSULATED_ASSUMED = (
|
||||
"Cavity wall, as built, insulated (assumed)" # 1983 - 1990
|
||||
)
|
||||
CAVITY_AS_BUILT_NO_INSULATION_ASSUMED = (
|
||||
"Cavity wall, as built, no insulation (assumed)" # Pre-1975
|
||||
)
|
||||
|
||||
CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
|
||||
"Cavity wall, as built, partial insulation (assumed)" # 1976 - 1982
|
||||
)
|
||||
CAVITY_WITH_INTERNAL_INSULATION = "Cavity wall, with internal insulation"
|
||||
CAVITY_WITH_EXTERNAL_INSULATION = "Cavity wall, with external insulation"
|
||||
CAVITY_FILLED_AND_INTERNAL_INSULATION = (
|
||||
"Cavity wall, filled cavity and internal insulation"
|
||||
)
|
||||
CAVITY_FILLED_AND_EXTERNAL_INSULATION = (
|
||||
"Cavity wall, filled cavity and external insulation"
|
||||
)
|
||||
|
||||
SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED = (
|
||||
"Solid brick, as built, no insulation (assumed)"
|
||||
)
|
||||
SOLID_BRICK_AS_BUILT_INSULATED_ASSUMED = (
|
||||
"Solid brick, as built, insulated (assumed)"
|
||||
)
|
||||
SOLID_BRICK_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
|
||||
"Solid brick, as built, partial insulation (assumed)"
|
||||
)
|
||||
SOLID_BRICK_WITH_INTERNAL_INSULATION = "Solid brick, with internal insulation"
|
||||
SOLID_BRICK_WITH_EXTERNAL_INSULATION = "Solid brick, with external insulation"
|
||||
|
||||
TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED = (
|
||||
"Timber frame, as built, no insulation (assumed)"
|
||||
)
|
||||
TIMBER_FRAME_AS_BUILT_INSULATED_ASSUMED = (
|
||||
"Timber frame, as built, insulated (assumed)"
|
||||
)
|
||||
TIMBER_FRAME_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
|
||||
"Timber frame, as built, partial insulation (assumed)"
|
||||
)
|
||||
TIMBER_FRAME_WITH_ADDITIONAL_INSULATION = "Timber frame, with additional insulation"
|
||||
|
||||
SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED = (
|
||||
"Sandstone, as built, no insulation (assumed)"
|
||||
)
|
||||
SANDSTONE_AS_BUILT_INSULATED_ASSUMED = "Sandstone, as built, insulated (assumed)"
|
||||
SANDSTONE_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
|
||||
"Sandstone, as built, partial insulation (assumed)"
|
||||
)
|
||||
SANDSTONE_WITH_INTERNAL_INSULATION = "Sandstone, with internal insulation"
|
||||
SANDSTONE_WITH_EXTERNAL_INSULATION = "Sandstone, with external insulation"
|
||||
|
||||
GRANITE_OR_WHIN_AS_BUILT_NO_INSULATION_ASSUMED = (
|
||||
"Granite or whin, as built, no insulation (assumed)"
|
||||
)
|
||||
GRANITE_OR_WHIN_AS_BUILT_INSULATED_ASSUMED = (
|
||||
"Granite or whin, as built, insulated (assumed)"
|
||||
)
|
||||
GRANITE_OR_WHIN_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
|
||||
"Granite or whin, as built, partial insulation (assumed)"
|
||||
)
|
||||
GRANITE_OR_WHIN_WITH_INTERNAL_INSULATION = (
|
||||
"Granite or whin, with internal insulation"
|
||||
)
|
||||
GRANITE_OR_WHIN_WITH_EXTERNAL_INSULATION = (
|
||||
"Granite or whin, with external insulation"
|
||||
)
|
||||
|
||||
SYSTEM_BUILT_AS_BUILT_NO_INSULATION_ASSUMED = (
|
||||
"System built, as built, no insulation (assumed)"
|
||||
)
|
||||
SYSTEM_BUILT_AS_BUILT_INSULATED_ASSUMED = (
|
||||
"System built, as built, insulated (assumed)"
|
||||
)
|
||||
SYSTEM_BUILT_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
|
||||
"System built, as built, partial insulation (assumed)"
|
||||
)
|
||||
SYSTEM_BUILT_WITH_INTERNAL_INSULATION = "System built, with internal insulation"
|
||||
SYSTEM_BUILT_WITH_EXTERNAL_INSULATION = "System built, with external insulation"
|
||||
|
||||
PARK_HOME_AS_BUILT = "Park home wall, as built"
|
||||
PARK_HOME_WITH_INTERNAL_INSULATION = "Park home wall, with internal insulation"
|
||||
PARK_HOME_WITH_EXTERNAL_INSULATION = "Park home wall, with external insulation"
|
||||
|
||||
COB_AS_BUILT = "Cob, as built"
|
||||
COB_WITH_INTERNAL_INSULATION = "Cob, with internal insulation"
|
||||
COB_WITH_EXTERNAL_INSULATION = "Cob, with external insulation"
|
||||
|
||||
CURTAIN_WALL = "Curtain wall"
|
||||
CURTAIN_WALL_AS_BUILT_NO_INSULATION_ASSUMED = (
|
||||
"Curtain Wall, as built, no insulation (assumed)"
|
||||
)
|
||||
CURTAIN_WALL_AS_BUILT_INSULATED_ASSUMED = (
|
||||
"Curtain Wall, as built, insulated (assumed)"
|
||||
)
|
||||
CURTAIN_WALL_FILLED = "Curtain Wall, filled cavity"
|
||||
CURTAIN_WALL_WITH_INTERNAL_INSULATION = "Curtain Wall, with internal insulation"
|
||||
|
||||
BASEMENT_WALL = "Basement wall"
|
||||
BASEMENT_WALL_AS_BUILT = "Basement wall, as built"
|
||||
|
||||
UNKNOWN = "Unknown"
|
||||
72
domain/epc/wall_type_construction_dates.py
Normal file
72
domain/epc/wall_type_construction_dates.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
"""Construction-date metadata for the "assumed" ``WallType`` variants.
|
||||
|
||||
The ``(assumed)`` variants of ``WallType`` are what RdSAP picks when a
|
||||
surveyor has no direct observation and falls back to the typical wall make-up
|
||||
for a property's build era. The era boundaries reflect UK Building
|
||||
Regulations milestones for cavity-wall insulation:
|
||||
|
||||
* up to 1975 -- no cavity insulation requirement
|
||||
* 1976-1982 -- partial-fill cavity (early insulation requirement)
|
||||
* 1983-1990 -- full-fill cavity (insulation required)
|
||||
|
||||
Captured here as a structured lookup so:
|
||||
|
||||
* the LLM prompt builder can render the ranges as a hint, helping the
|
||||
classifier resolve era-implying landlord descriptions to the right
|
||||
``(assumed)`` variant;
|
||||
* future date-aware paths (a deterministic year-to-variant shortcut, a
|
||||
date-keyed repo) can read from the same source instead of duplicating
|
||||
the knowledge.
|
||||
|
||||
Only the variants where we have a defensible era boundary appear here; the
|
||||
remaining ``(assumed)`` members are left out rather than guessed.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Mapping, Optional
|
||||
|
||||
from domain.epc.wall_type import WallType
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class YearRange:
|
||||
"""An inclusive year range. ``None`` on either end means "no bound"."""
|
||||
|
||||
start: Optional[int] = None
|
||||
end: Optional[int] = None
|
||||
|
||||
def __str__(self) -> str:
|
||||
if self.start is None and self.end is not None:
|
||||
return f"pre-{self.end + 1}"
|
||||
if self.start is not None and self.end is None:
|
||||
return f"{self.start}+"
|
||||
return f"{self.start}-{self.end}"
|
||||
|
||||
|
||||
WALL_TYPE_CONSTRUCTION_YEARS: Mapping[WallType, YearRange] = {
|
||||
WallType.CAVITY_AS_BUILT_NO_INSULATION_ASSUMED: YearRange(end=1975),
|
||||
WallType.CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED: YearRange(
|
||||
start=1976, end=1982
|
||||
),
|
||||
WallType.CAVITY_AS_BUILT_INSULATED_ASSUMED: YearRange(start=1983, end=1990),
|
||||
}
|
||||
|
||||
|
||||
def wall_type_construction_date_prompt_hint() -> str:
|
||||
"""Render the date metadata as a prompt fragment for the LLM classifier.
|
||||
|
||||
The fragment lists each (variant, year range) pair so the model can
|
||||
prefer the era-matching ``(assumed)`` variant when a landlord
|
||||
description carries era information (e.g. "1970s semi", "built before
|
||||
the war").
|
||||
"""
|
||||
lines = [
|
||||
f"- {wall_type.value!r}: typically built {year_range}"
|
||||
for wall_type, year_range in WALL_TYPE_CONSTRUCTION_YEARS.items()
|
||||
]
|
||||
return (
|
||||
"When the description carries construction-era information, prefer "
|
||||
"the category whose typical build year matches:\n" + "\n".join(lines)
|
||||
)
|
||||
0
infrastructure/chatgpt/__init__.py
Normal file
0
infrastructure/chatgpt/__init__.py
Normal file
60
infrastructure/chatgpt/chatgpt.py
Normal file
60
infrastructure/chatgpt/chatgpt.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from openai import OpenAI
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
|
||||
from infrastructure.chatgpt.exceptions import ChatGPTClientError
|
||||
|
||||
|
||||
class ChatGPT:
|
||||
"""Thin wrapper over the OpenAI Chat Completions API.
|
||||
|
||||
Sends a single prompt and returns the assistant's reply as plain text.
|
||||
"""
|
||||
|
||||
DEFAULT_MODEL = "gpt-4o-mini"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
api_key: Optional[str] = None,
|
||||
model: Optional[str] = None,
|
||||
) -> None:
|
||||
key = api_key or os.environ.get("OPENAI_API_KEY")
|
||||
if not key:
|
||||
raise ChatGPTClientError(
|
||||
"No OpenAI API key provided. "
|
||||
"Pass api_key or set the OPENAI_API_KEY environment variable."
|
||||
)
|
||||
self._client = OpenAI(api_key=key)
|
||||
self._model = model or self.DEFAULT_MODEL
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompt: str,
|
||||
system_prompt: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Send a prompt to the model and return its reply text.
|
||||
|
||||
Args:
|
||||
prompt: The user message to send.
|
||||
system_prompt: Optional instruction that sets the model's behaviour.
|
||||
|
||||
Raises:
|
||||
OpenAiClientError: If the model returns an empty response.
|
||||
"""
|
||||
messages: list[ChatCompletionMessageParam] = []
|
||||
if system_prompt:
|
||||
messages.append({"role": "system", "content": system_prompt})
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
response = self._client.chat.completions.create(
|
||||
model=self._model,
|
||||
messages=messages,
|
||||
)
|
||||
content = response.choices[0].message.content
|
||||
if content is None:
|
||||
raise ChatGPTClientError("ChatGPT returned an empty response.")
|
||||
return content
|
||||
99
infrastructure/chatgpt/chatgpt_column_classifier.py
Normal file
99
infrastructure/chatgpt/chatgpt_column_classifier.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from enum import Enum
|
||||
from typing import Any, Optional, TypeVar
|
||||
|
||||
from domain.data_transformation.column_classifier import (
|
||||
ClassificationError,
|
||||
ColumnClassifier,
|
||||
)
|
||||
from infrastructure.chatgpt.chatgpt import ChatGPT
|
||||
from infrastructure.chatgpt.exceptions import ChatGPTClientError
|
||||
|
||||
E = TypeVar("E", bound=Enum)
|
||||
|
||||
|
||||
class ChatGptColumnClassifier(ColumnClassifier[E]):
|
||||
"""ColumnClassifier backed by ChatGPT, parametrised by a category enum.
|
||||
|
||||
The same classification path -- prompt, JSON parsing, UNKNOWN fallback --
|
||||
serves any category enum; only ``category_enum`` and its ``unknown``
|
||||
member differ between columns.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
chat_gpt: ChatGPT,
|
||||
category_enum: type[E],
|
||||
unknown: E,
|
||||
extra_instructions: Optional[str] = None,
|
||||
) -> None:
|
||||
self._chat_gpt = chat_gpt
|
||||
self._category_enum = category_enum
|
||||
self._unknown = unknown
|
||||
# Free-form column-specific guidance appended to the system prompt
|
||||
# ahead of the JSON-output instruction. Lets each column ship its
|
||||
# own hints (e.g. wall-type construction-era ranges) without the
|
||||
# generic classifier knowing what they are.
|
||||
self._extra_instructions = extra_instructions
|
||||
|
||||
def classify(self, descriptions: set[str]) -> dict[str, E]:
|
||||
if not descriptions:
|
||||
return {}
|
||||
try:
|
||||
reply = self._chat_gpt.generate(
|
||||
prompt=json.dumps(sorted(descriptions)),
|
||||
system_prompt=self._system_prompt(),
|
||||
)
|
||||
except ChatGPTClientError as error:
|
||||
raise ClassificationError(
|
||||
f"ChatGPT classification failed for "
|
||||
f"{self._category_enum.__name__}."
|
||||
) from error
|
||||
try:
|
||||
raw: dict[str, Any] = json.loads(self._strip_code_fence(reply))
|
||||
except json.JSONDecodeError as error:
|
||||
raise ClassificationError(
|
||||
f"ChatGPT returned a reply that is not valid JSON: {reply!r}"
|
||||
) from error
|
||||
return {
|
||||
description: self._to_category(raw.get(description))
|
||||
for description in descriptions
|
||||
}
|
||||
|
||||
def _system_prompt(self) -> str:
|
||||
categories = ", ".join(
|
||||
member.value
|
||||
for member in self._category_enum
|
||||
if member is not self._unknown
|
||||
)
|
||||
parts = [
|
||||
"Classify each free-text description into exactly one category. ",
|
||||
f"Categories: {categories}. ",
|
||||
]
|
||||
if self._extra_instructions:
|
||||
parts.append(self._extra_instructions + " ")
|
||||
parts.append(
|
||||
"Reply with only a JSON object mapping each original description "
|
||||
"to its category, and nothing else."
|
||||
)
|
||||
return "".join(parts)
|
||||
|
||||
def _to_category(self, value: Any) -> E:
|
||||
"""Map a reply value to a category member, defaulting to UNKNOWN."""
|
||||
try:
|
||||
return self._category_enum(value)
|
||||
except ValueError:
|
||||
return self._unknown
|
||||
|
||||
@staticmethod
|
||||
def _strip_code_fence(reply: str) -> str:
|
||||
"""Remove a surrounding markdown code fence, if ChatGPT added one."""
|
||||
text = reply.strip()
|
||||
if not text.startswith("```"):
|
||||
return text
|
||||
lines = text.splitlines()[1:]
|
||||
if lines and lines[-1].strip() == "```":
|
||||
lines = lines[:-1]
|
||||
return "\n".join(lines)
|
||||
2
infrastructure/chatgpt/exceptions.py
Normal file
2
infrastructure/chatgpt/exceptions.py
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
class ChatGPTClientError(Exception):
|
||||
"""Base for all OpenAI client errors."""
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
import csv
|
||||
from io import StringIO
|
||||
|
||||
from infrastructure.s3_client import S3Client
|
||||
from infrastructure.s3_uri import parse_s3_uri
|
||||
|
||||
|
||||
class CsvS3Client(S3Client):
|
||||
def read_rows(self, s3_uri: str) -> list[dict[str, str]]:
|
||||
bucket, key = parse_s3_uri(s3_uri)
|
||||
if bucket != self.bucket:
|
||||
raise ValueError(
|
||||
f"s3_uri bucket {bucket!r} does not match client bucket {self.bucket!r}"
|
||||
)
|
||||
raw = self.get_object(key)
|
||||
try:
|
||||
text = raw.decode("utf-8-sig")
|
||||
except UnicodeDecodeError:
|
||||
# Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8.
|
||||
text = raw.decode("cp1252")
|
||||
|
||||
reader = csv.DictReader(StringIO(text))
|
||||
return [dict(row) for row in reader]
|
||||
|
||||
def save_rows(self, rows: list[dict[str, str]], key: str) -> str:
|
||||
if not rows:
|
||||
raise ValueError("Cannot save an empty rows list: header is unknown")
|
||||
buffer = StringIO()
|
||||
fieldnames = list(rows[0].keys())
|
||||
writer = csv.DictWriter(buffer, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(rows)
|
||||
return self.put_object(key, buffer.getvalue().encode("utf-8"))
|
||||
0
infrastructure/landlord_overrides/__init__.py
Normal file
0
infrastructure/landlord_overrides/__init__.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
"""One Postgres adapter for every ``landlord_<category>_overrides`` table.
|
||||
|
||||
The four override categories (property / built-form / wall / roof type) share
|
||||
an identical write path: the same ``(portfolio_id, description) -> value`` row
|
||||
shape and the same source-aware conflict policy (ADR-0003 §Decision fixes one
|
||||
policy for all of them). The only thing that varies per category is the target
|
||||
table, which is selected by the SQLModel row class handed to the constructor.
|
||||
|
||||
So a single generic adapter serves all four -- callers parameterise it by the
|
||||
category enum ``E`` and pass the matching ``…Row`` class:
|
||||
|
||||
LandlordOverridesRepository[PropertyType](session, LandlordPropertyTypeOverrideRow)
|
||||
|
||||
This adapter -- the aggregate's "talker" to Postgres -- lives under
|
||||
``infrastructure/landlord_overrides/``, grouped by aggregate. The ``table=True``
|
||||
SQLModel row classes it writes through stay in ``infrastructure/postgres/`` as
|
||||
schema mirrors (they share the ``override_source`` pgEnum and register against
|
||||
the same engine metadata). See ADR-0003 §File layout.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from enum import Enum
|
||||
from typing import TypeVar, cast
|
||||
|
||||
from sqlalchemy import Table
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from sqlmodel import Session, SQLModel
|
||||
|
||||
from infrastructure.postgres.landlord_override_enums import OverrideSource
|
||||
from repositories.landlord_overrides.landlord_override_repository import (
|
||||
LandlordOverrideRepository,
|
||||
)
|
||||
|
||||
E = TypeVar("E", bound=Enum)
|
||||
|
||||
|
||||
class LandlordOverridesRepository(LandlordOverrideRepository[E]):
|
||||
"""Writes classifier overrides to the ``landlord_<category>_overrides`` table
|
||||
backing ``row_type``.
|
||||
|
||||
``row_type`` is the SQLModel mirror of the target table (e.g.
|
||||
``LandlordPropertyTypeOverrideRow``); its category enum must match the type
|
||||
argument ``E`` the caller binds. The pairing is a convention, not a type
|
||||
constraint -- the row classes are not generic over their value enum -- so
|
||||
keep the two in step at the call site.
|
||||
"""
|
||||
|
||||
def __init__(self, session: Session, row_type: type[SQLModel]) -> None:
|
||||
self._session = session
|
||||
# SQLModel's class-level ``__table__`` is injected at runtime on
|
||||
# ``table=True`` classes but isn't exposed by the stubs; pin it to
|
||||
# ``Table`` via ``getattr`` so the dialect insert helper carries
|
||||
# through with strict types.
|
||||
self._table: Table = cast(Table, getattr(row_type, "__table__"))
|
||||
|
||||
def upsert_all(
|
||||
self,
|
||||
portfolio_id: int,
|
||||
descriptions_to_values: dict[str, E],
|
||||
) -> None:
|
||||
if not descriptions_to_values:
|
||||
return
|
||||
|
||||
now = datetime.now(timezone.utc)
|
||||
rows = [
|
||||
{
|
||||
"portfolio_id": portfolio_id,
|
||||
"description": description,
|
||||
"value": value.value,
|
||||
"source": OverrideSource.CLASSIFIER,
|
||||
"created_at": now,
|
||||
"updated_at": now,
|
||||
}
|
||||
for description, value in descriptions_to_values.items()
|
||||
]
|
||||
|
||||
stmt = pg_insert(self._table).values(rows)
|
||||
|
||||
# The classifier may refresh its own past output, but must never
|
||||
# overwrite a user correction -- the ``WHERE existing.source =
|
||||
# 'classifier'`` guard enforces that. See ADR-0003 §Decision.
|
||||
stmt = stmt.on_conflict_do_update(
|
||||
index_elements=["portfolio_id", "description"],
|
||||
set_={
|
||||
"value": stmt.excluded.value,
|
||||
"source": stmt.excluded.source,
|
||||
"updated_at": stmt.excluded.updated_at,
|
||||
},
|
||||
where=self._table.c.source == OverrideSource.CLASSIFIER,
|
||||
)
|
||||
|
||||
# SQLModel re-exports SQLAlchemy's ``Session.execute``; one of the
|
||||
# overload signatures is marked deprecated in stubs, which fires
|
||||
# here even though our INSERT path is the supported one.
|
||||
self._session.execute(stmt) # pyright: ignore[reportDeprecated]
|
||||
0
infrastructure/openai/__init__.py
Normal file
0
infrastructure/openai/__init__.py
Normal file
2
infrastructure/openai/exceptions.py
Normal file
2
infrastructure/openai/exceptions.py
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
class OpenAiClientError(Exception):
|
||||
"""Base for all OpenAI client errors."""
|
||||
60
infrastructure/openai/openai_client.py
Normal file
60
infrastructure/openai/openai_client.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from openai import OpenAI
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
|
||||
from infrastructure.openai.exceptions import OpenAiClientError
|
||||
|
||||
|
||||
class OpenAiChatClient:
|
||||
"""Thin wrapper over the OpenAI Chat Completions API.
|
||||
|
||||
Sends a single prompt and returns the assistant's reply as plain text.
|
||||
"""
|
||||
|
||||
DEFAULT_MODEL = "gpt-4o-mini"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
api_key: Optional[str] = None,
|
||||
model: Optional[str] = None,
|
||||
) -> None:
|
||||
key = api_key or os.environ.get("OPENAI_API_KEY")
|
||||
if not key:
|
||||
raise OpenAiClientError(
|
||||
"No OpenAI API key provided. "
|
||||
"Pass api_key or set the OPENAI_API_KEY environment variable."
|
||||
)
|
||||
self._client = OpenAI(api_key=key)
|
||||
self._model = model or self.DEFAULT_MODEL
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompt: str,
|
||||
system_prompt: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Send a prompt to the model and return its reply text.
|
||||
|
||||
Args:
|
||||
prompt: The user message to send.
|
||||
system_prompt: Optional instruction that sets the model's behaviour.
|
||||
|
||||
Raises:
|
||||
OpenAiClientError: If the model returns an empty response.
|
||||
"""
|
||||
messages: list[ChatCompletionMessageParam] = []
|
||||
if system_prompt:
|
||||
messages.append({"role": "system", "content": system_prompt})
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
response = self._client.chat.completions.create(
|
||||
model=self._model,
|
||||
messages=messages,
|
||||
)
|
||||
content = response.choices[0].message.content
|
||||
if content is None:
|
||||
raise OpenAiClientError("OpenAI returned an empty response.")
|
||||
return content
|
||||
|
|
@ -1,3 +1,6 @@
|
|||
from collections.abc import Iterator
|
||||
from contextlib import contextmanager
|
||||
|
||||
from sqlalchemy.engine import Engine
|
||||
from sqlmodel import Session, create_engine
|
||||
|
||||
|
|
@ -16,3 +19,42 @@ def make_engine(config: PostgresConfig) -> Engine:
|
|||
|
||||
def make_session(engine: Engine) -> Session:
|
||||
return Session(engine)
|
||||
|
||||
|
||||
@contextmanager # pyright: ignore[reportDeprecated]
|
||||
def transactional_session(engine: Engine) -> Iterator[Session]:
|
||||
"""Yield a session whose lifecycle owns the transaction.
|
||||
|
||||
On clean exit the session commits; on any exception it rolls back and
|
||||
re-raises. Either way the session is closed. Callers in the application
|
||||
layer can do their work inside the ``with`` block without ever invoking
|
||||
``.commit()`` / ``.rollback()`` themselves -- transaction semantics stay
|
||||
in the infrastructure layer.
|
||||
"""
|
||||
session = Session(engine)
|
||||
try:
|
||||
yield session
|
||||
session.commit()
|
||||
except Exception:
|
||||
session.rollback()
|
||||
raise
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
@contextmanager # pyright: ignore[reportDeprecated]
|
||||
def commit_scope(session: Session) -> Iterator[Session]:
|
||||
"""Commit a caller-owned session on clean exit; roll back on error.
|
||||
|
||||
Like ``transactional_session`` but for a session the caller already holds
|
||||
and will close itself. Use it to keep slow, non-DB work *outside* the
|
||||
transaction: build the session, run the slow work, then enter
|
||||
``commit_scope`` only for the persistence -- so a connection is checked out
|
||||
(SQLModel sessions are lazy) for the shortest possible window.
|
||||
"""
|
||||
try:
|
||||
yield session
|
||||
session.commit()
|
||||
except Exception:
|
||||
session.rollback()
|
||||
raise
|
||||
|
|
|
|||
|
|
@ -0,0 +1,69 @@
|
|||
"""SQLModel mirror of the ``landlord_built_form_type_overrides`` Drizzle table.
|
||||
|
||||
The schema source of truth lives in the ``assessment-model`` TS repo
|
||||
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
|
||||
this row class only mirrors the columns so the Python lambda can read/write.
|
||||
See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only
|
||||
differences are the table name, the ``built_form_type`` pgEnum on ``value``,
|
||||
and the unique-constraint name.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import ClassVar
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import BigInteger, Column, UniqueConstraint
|
||||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.built_form_type import BuiltFormType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
class LandlordBuiltFormTypeOverrideRow(SQLModel, table=True):
|
||||
__tablename__: ClassVar[str] = "landlord_built_form_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
__table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
UniqueConstraint(
|
||||
"portfolio_id",
|
||||
"description",
|
||||
name="landlord_built_form_type_overrides_portfolio_description_unique",
|
||||
),
|
||||
)
|
||||
|
||||
id: UUID = Field(default_factory=uuid4, primary_key=True)
|
||||
|
||||
# bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
|
||||
# mapping is 32-bit Integer and would overflow once portfolio IDs exceed
|
||||
# 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
|
||||
# not declared here -- the ``portfolio`` table is not modelled in Python.
|
||||
portfolio_id: int = Field(
|
||||
sa_column=Column(BigInteger, nullable=False, index=True),
|
||||
)
|
||||
|
||||
description: str = Field(nullable=False)
|
||||
|
||||
value: BuiltFormType = Field(
|
||||
sa_column=Column(
|
||||
SAEnum(
|
||||
BuiltFormType,
|
||||
name="built_form_type",
|
||||
values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
|
||||
),
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
|
||||
# Shared SAEnum -- see ``landlord_override_enums`` for why this single
|
||||
# instance is reused by every ``landlord_*_overrides`` row class.
|
||||
source: str = Field(
|
||||
sa_column=Column(override_source_sa_enum, nullable=False),
|
||||
)
|
||||
|
||||
created_at: datetime = Field(
|
||||
default_factory=lambda: datetime.now(timezone.utc),
|
||||
nullable=False,
|
||||
)
|
||||
updated_at: datetime = Field(
|
||||
default_factory=lambda: datetime.now(timezone.utc),
|
||||
nullable=False,
|
||||
)
|
||||
35
infrastructure/postgres/landlord_override_enums.py
Normal file
35
infrastructure/postgres/landlord_override_enums.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
"""Shared pgEnum definitions used by every ``landlord_*_overrides`` row class.
|
||||
|
||||
The ``override_source`` pgEnum is referenced by both
|
||||
``landlord_property_type_overrides`` and ``landlord_wall_type_overrides``
|
||||
(per the Drizzle schema -- see ``landlord_overrides.ts``). Defining it once
|
||||
here and reusing the same SQLAlchemy ``Enum`` instance across both row
|
||||
classes keeps SQLModel's metadata coherent: ``create_all`` emits exactly one
|
||||
``CREATE TYPE override_source`` statement, not two parallel ones colliding
|
||||
on the same pgEnum name.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from sqlalchemy import Enum as SAEnum
|
||||
|
||||
|
||||
class OverrideSource:
|
||||
"""Mirror of the ``override_source`` pgEnum.
|
||||
|
||||
Drizzle defines this as ``('classifier', 'user')`` in
|
||||
``landlord_overrides.ts``. Modelled here as string constants so callers
|
||||
don't sprinkle magic strings; the column is constrained by Postgres,
|
||||
and the only Python-side producer (the classifier path) writes the
|
||||
literal ``OverrideSource.CLASSIFIER``.
|
||||
"""
|
||||
|
||||
CLASSIFIER = "classifier"
|
||||
USER = "user"
|
||||
|
||||
|
||||
override_source_sa_enum = SAEnum(
|
||||
OverrideSource.CLASSIFIER,
|
||||
OverrideSource.USER,
|
||||
name="override_source",
|
||||
)
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
"""SQLModel mirror of the ``landlord_property_type_overrides`` Drizzle table.
|
||||
|
||||
The schema source of truth lives in the ``assessment-model`` TS repo
|
||||
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
|
||||
this row class only mirrors the columns so the Python lambda can read/write.
|
||||
See ADR-0003.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import ClassVar
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import BigInteger, Column, UniqueConstraint
|
||||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.property_type import PropertyType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
class LandlordPropertyTypeOverrideRow(SQLModel, table=True):
|
||||
__tablename__: ClassVar[str] = "landlord_property_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
__table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
UniqueConstraint(
|
||||
"portfolio_id",
|
||||
"description",
|
||||
name="landlord_property_type_overrides_portfolio_description_unique",
|
||||
),
|
||||
)
|
||||
|
||||
id: UUID = Field(default_factory=uuid4, primary_key=True)
|
||||
|
||||
# bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
|
||||
# mapping is 32-bit Integer and would overflow once portfolio IDs exceed
|
||||
# 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
|
||||
# not declared here -- the ``portfolio`` table is not modelled in Python.
|
||||
portfolio_id: int = Field(
|
||||
sa_column=Column(BigInteger, nullable=False, index=True),
|
||||
)
|
||||
|
||||
description: str = Field(nullable=False)
|
||||
|
||||
value: PropertyType = Field(
|
||||
sa_column=Column(
|
||||
SAEnum(
|
||||
PropertyType,
|
||||
name="property_type",
|
||||
values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
|
||||
),
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
|
||||
# Shared SAEnum -- see ``landlord_override_enums`` for why this single
|
||||
# instance is reused by every ``landlord_*_overrides`` row class.
|
||||
source: str = Field(
|
||||
sa_column=Column(override_source_sa_enum, nullable=False),
|
||||
)
|
||||
|
||||
created_at: datetime = Field(
|
||||
default_factory=lambda: datetime.now(timezone.utc),
|
||||
nullable=False,
|
||||
)
|
||||
updated_at: datetime = Field(
|
||||
default_factory=lambda: datetime.now(timezone.utc),
|
||||
nullable=False,
|
||||
)
|
||||
69
infrastructure/postgres/landlord_roof_type_override_table.py
Normal file
69
infrastructure/postgres/landlord_roof_type_override_table.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
"""SQLModel mirror of the ``landlord_roof_type_overrides`` Drizzle table.
|
||||
|
||||
The schema source of truth lives in the ``assessment-model`` TS repo
|
||||
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
|
||||
this row class only mirrors the columns so the Python lambda can read/write.
|
||||
See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only
|
||||
differences are the table name, the ``roof_type`` pgEnum on ``value``, and
|
||||
the unique-constraint name.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import ClassVar
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import BigInteger, Column, UniqueConstraint
|
||||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.roof_type import RoofType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
class LandlordRoofTypeOverrideRow(SQLModel, table=True):
|
||||
__tablename__: ClassVar[str] = "landlord_roof_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
__table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
UniqueConstraint(
|
||||
"portfolio_id",
|
||||
"description",
|
||||
name="landlord_roof_type_overrides_portfolio_description_unique",
|
||||
),
|
||||
)
|
||||
|
||||
id: UUID = Field(default_factory=uuid4, primary_key=True)
|
||||
|
||||
# bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
|
||||
# mapping is 32-bit Integer and would overflow once portfolio IDs exceed
|
||||
# 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
|
||||
# not declared here -- the ``portfolio`` table is not modelled in Python.
|
||||
portfolio_id: int = Field(
|
||||
sa_column=Column(BigInteger, nullable=False, index=True),
|
||||
)
|
||||
|
||||
description: str = Field(nullable=False)
|
||||
|
||||
value: RoofType = Field(
|
||||
sa_column=Column(
|
||||
SAEnum(
|
||||
RoofType,
|
||||
name="roof_type",
|
||||
values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
|
||||
),
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
|
||||
# Shared SAEnum -- see ``landlord_override_enums`` for why this single
|
||||
# instance is reused by every ``landlord_*_overrides`` row class.
|
||||
source: str = Field(
|
||||
sa_column=Column(override_source_sa_enum, nullable=False),
|
||||
)
|
||||
|
||||
created_at: datetime = Field(
|
||||
default_factory=lambda: datetime.now(timezone.utc),
|
||||
nullable=False,
|
||||
)
|
||||
updated_at: datetime = Field(
|
||||
default_factory=lambda: datetime.now(timezone.utc),
|
||||
nullable=False,
|
||||
)
|
||||
69
infrastructure/postgres/landlord_wall_type_override_table.py
Normal file
69
infrastructure/postgres/landlord_wall_type_override_table.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
"""SQLModel mirror of the ``landlord_wall_type_overrides`` Drizzle table.
|
||||
|
||||
The schema source of truth lives in the ``assessment-model`` TS repo
|
||||
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
|
||||
this row class only mirrors the columns so the Python lambda can read/write.
|
||||
See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only
|
||||
differences are the table name, the ``wall_type`` pgEnum on ``value``, and
|
||||
the unique-constraint name.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import ClassVar
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import BigInteger, Column, UniqueConstraint
|
||||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.wall_type import WallType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
class LandlordWallTypeOverrideRow(SQLModel, table=True):
|
||||
__tablename__: ClassVar[str] = "landlord_wall_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
__table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
|
||||
UniqueConstraint(
|
||||
"portfolio_id",
|
||||
"description",
|
||||
name="landlord_wall_type_overrides_portfolio_description_unique",
|
||||
),
|
||||
)
|
||||
|
||||
id: UUID = Field(default_factory=uuid4, primary_key=True)
|
||||
|
||||
# bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
|
||||
# mapping is 32-bit Integer and would overflow once portfolio IDs exceed
|
||||
# 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
|
||||
# not declared here -- the ``portfolio`` table is not modelled in Python.
|
||||
portfolio_id: int = Field(
|
||||
sa_column=Column(BigInteger, nullable=False, index=True),
|
||||
)
|
||||
|
||||
description: str = Field(nullable=False)
|
||||
|
||||
value: WallType = Field(
|
||||
sa_column=Column(
|
||||
SAEnum(
|
||||
WallType,
|
||||
name="wall_type",
|
||||
values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
|
||||
),
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
|
||||
# Shared SAEnum -- see ``landlord_override_enums`` for why this single
|
||||
# instance is reused by every ``landlord_*_overrides`` row class.
|
||||
source: str = Field(
|
||||
sa_column=Column(override_source_sa_enum, nullable=False),
|
||||
)
|
||||
|
||||
created_at: datetime = Field(
|
||||
default_factory=lambda: datetime.now(timezone.utc),
|
||||
nullable=False,
|
||||
)
|
||||
updated_at: datetime = Field(
|
||||
default_factory=lambda: datetime.now(timezone.utc),
|
||||
nullable=False,
|
||||
)
|
||||
0
infrastructure/s3/__init__.py
Normal file
0
infrastructure/s3/__init__.py
Normal file
62
infrastructure/s3/csv_s3_client.py
Normal file
62
infrastructure/s3/csv_s3_client.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
import csv
|
||||
from io import StringIO
|
||||
|
||||
from infrastructure.s3.s3_client import S3Client
|
||||
from infrastructure.s3.s3_uri import parse_s3_uri
|
||||
|
||||
|
||||
def _dedupe_fieldnames(fieldnames: list[str]) -> list[str]:
|
||||
"""Disambiguate repeated CSV headers by appending an index.
|
||||
|
||||
The first occurrence keeps its name; each later one becomes
|
||||
``name_1``, ``name_2`` … so duplicate columns survive as distinct
|
||||
keys instead of collapsing onto one (last-wins) dict entry.
|
||||
"""
|
||||
deduped: list[str] = []
|
||||
counts: dict[str, int] = {}
|
||||
for name in fieldnames:
|
||||
if name not in counts:
|
||||
counts[name] = 0
|
||||
deduped.append(name)
|
||||
continue
|
||||
counts[name] += 1
|
||||
candidate = f"{name}_{counts[name]}"
|
||||
while candidate in counts:
|
||||
counts[name] += 1
|
||||
candidate = f"{name}_{counts[name]}"
|
||||
counts[candidate] = 0
|
||||
deduped.append(candidate)
|
||||
return deduped
|
||||
|
||||
|
||||
class CsvS3Client(S3Client):
|
||||
def read_rows(self, s3_uri: str) -> list[dict[str, str]]:
|
||||
bucket, key = parse_s3_uri(s3_uri)
|
||||
if bucket != self.bucket:
|
||||
raise ValueError(
|
||||
f"s3_uri bucket {bucket!r} does not match client bucket {self.bucket!r}"
|
||||
)
|
||||
raw = self.get_object(key)
|
||||
try:
|
||||
text = raw.decode("utf-8-sig")
|
||||
except UnicodeDecodeError:
|
||||
# Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8.
|
||||
text = raw.decode("cp1252")
|
||||
|
||||
buffer = StringIO(text)
|
||||
header = next(csv.reader(buffer), None)
|
||||
if header is None:
|
||||
return []
|
||||
fieldnames = _dedupe_fieldnames(header)
|
||||
reader = csv.DictReader(buffer, fieldnames=fieldnames)
|
||||
return [dict(row) for row in reader]
|
||||
|
||||
def save_rows(self, rows: list[dict[str, str]], key: str) -> str:
|
||||
if not rows:
|
||||
raise ValueError("Cannot save an empty rows list: header is unknown")
|
||||
buffer = StringIO()
|
||||
fieldnames = list(rows[0].keys())
|
||||
writer = csv.DictWriter(buffer, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(rows)
|
||||
return self.put_object(key, buffer.getvalue().encode("utf-8"))
|
||||
37
orchestration/classifiable_column.py
Normal file
37
orchestration/classifiable_column.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Generic, TypeVar
|
||||
|
||||
from domain.data_transformation.column_classifier import ColumnClassifier
|
||||
from repositories.landlord_overrides.landlord_override_repository import (
|
||||
LandlordOverrideRepository,
|
||||
)
|
||||
|
||||
E = TypeVar("E", bound=Enum)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ClassifiableColumn(Generic[E]):
|
||||
"""Pairs a column's classifier with the repository that persists its results.
|
||||
|
||||
The orchestrator registers one ``ClassifiableColumn`` per
|
||||
(source column, target enum) pair. Bundling the classifier and the
|
||||
repository together makes the "this enum lands in this table" invariant
|
||||
structural -- the handler can no longer wire ``PropertyType``
|
||||
classifications to a ``WallType`` repo by keying two dicts with the same
|
||||
string.
|
||||
|
||||
``source_column`` is the landlord-CSV header to read from; ``name`` is the
|
||||
unique key the orchestrator uses to report this classification's results
|
||||
(and the key the handler logs). Two ``ClassifiableColumn``s may share a
|
||||
``source_column`` -- e.g. the ``"Property Type"`` CSV column feeds both
|
||||
``PropertyType`` and ``BuiltFormType`` classifiers off the same free-text
|
||||
description -- but each must have a unique ``name``.
|
||||
"""
|
||||
|
||||
name: str
|
||||
source_column: str
|
||||
classifier: ColumnClassifier[E]
|
||||
repo: LandlordOverrideRepository[E]
|
||||
142
orchestration/landlord_description_overrides_orchestrator.py
Normal file
142
orchestration/landlord_description_overrides_orchestrator.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from domain.addresses.unstandardised_address import AddressList
|
||||
from orchestration.classifiable_column import ClassifiableColumn
|
||||
from repositories.unstandardised_address.unstandardised_address_list_repository import (
|
||||
UnstandardisedAddressListRepository,
|
||||
)
|
||||
|
||||
|
||||
class LandlordDescriptionOverridesOrchestrator:
|
||||
def __init__(
|
||||
self,
|
||||
unstandardised_address_repo: UnstandardisedAddressListRepository,
|
||||
columns: list[ClassifiableColumn[Any]],
|
||||
) -> None:
|
||||
self._unstandardised_address_repo = unstandardised_address_repo
|
||||
# Each entry is one (source CSV column, target enum) classification.
|
||||
# Two entries may share ``source_column`` -- e.g. ``"Property Type"``
|
||||
# feeds both PropertyType and BuiltFormType classifiers -- so the
|
||||
# registry is a list rather than a dict keyed by header.
|
||||
self._columns = columns
|
||||
|
||||
def get_unstandardised_addresses(
|
||||
self,
|
||||
input_s3_uri: str,
|
||||
) -> AddressList:
|
||||
return self._unstandardised_address_repo.load_batch(input_s3_uri)
|
||||
|
||||
def get_col_to_description_mappings(
|
||||
self, list_of_unstandardised_address: AddressList
|
||||
) -> dict[str, set[str]]:
|
||||
mappings: dict[str, set[str]] = {}
|
||||
for unstandardised_address in list_of_unstandardised_address:
|
||||
for key, value in unstandardised_address.additional_info.items():
|
||||
bucket = mappings.setdefault(key, set())
|
||||
# A comma-separated value is several descriptions in one cell;
|
||||
# split it so each is its own entry. Lower-case so case-only
|
||||
# typos collapse to one variant.
|
||||
for variant in value.split(","):
|
||||
variant = variant.strip().lower()
|
||||
if variant:
|
||||
bucket.add(variant)
|
||||
return mappings
|
||||
|
||||
def classify_columns(
|
||||
self, addresses: AddressList
|
||||
) -> dict[str, dict[str, Enum]]:
|
||||
"""Classify every registered column's descriptions.
|
||||
|
||||
Returns a mapping of ``ClassifiableColumn.name`` to
|
||||
``{description: category}``. A registered column whose ``source_column``
|
||||
is absent from the addresses contributes an empty inner mapping.
|
||||
"""
|
||||
col_to_desc = self.get_col_to_description_mappings(addresses)
|
||||
return {
|
||||
column.name: column.classifier.classify(
|
||||
col_to_desc.get(column.source_column, set())
|
||||
)
|
||||
for column in self._columns
|
||||
}
|
||||
|
||||
def persist(
|
||||
self, classified: dict[str, dict[str, Enum]], portfolio_id: int
|
||||
) -> None:
|
||||
"""Persist already-classified results via each column's repository.
|
||||
|
||||
``classified`` is keyed by ``ClassifiableColumn.name`` -- the shape
|
||||
``classify_columns`` and ``classify_from_rows`` return. Each non-empty
|
||||
mapping is written through the column's own repo under
|
||||
``source = 'classifier'``; an empty mapping (a registered column absent
|
||||
from this batch) skips the DB round-trip.
|
||||
|
||||
The orchestrator does not commit -- the caller owns the transaction
|
||||
boundary, and is expected to open it only around this call so the
|
||||
slow classification never holds a connection.
|
||||
"""
|
||||
for column in self._columns:
|
||||
mapping = classified.get(column.name)
|
||||
if not mapping:
|
||||
continue
|
||||
column.repo.upsert_all(portfolio_id, mapping)
|
||||
|
||||
def classify_and_persist(
|
||||
self, addresses: AddressList, portfolio_id: int
|
||||
) -> dict[str, dict[str, Enum]]:
|
||||
"""Classify every registered column and persist the results.
|
||||
|
||||
Returns the same shape as ``classify_columns`` so callers can log
|
||||
per-column counts.
|
||||
"""
|
||||
classified = self.classify_columns(addresses)
|
||||
self.persist(classified, portfolio_id)
|
||||
return classified
|
||||
|
||||
def classify_from_rows(
|
||||
self, rows: list[dict[str, str]]
|
||||
) -> dict[str, dict[str, Enum]]:
|
||||
"""Classify raw CSV rows without touching the database.
|
||||
|
||||
The classification half of ``classify_and_persist_from_rows``, split
|
||||
out so a caller can run the slow ChatGPT work *before* opening a
|
||||
transaction and then write the finished results with ``persist`` inside
|
||||
one short-lived connection.
|
||||
|
||||
Unlike the ``AddressList`` path this builds no ``AddressList``, so it
|
||||
has no canonical address/postcode requirement -- the classifier only
|
||||
needs the raw description cells. Used when reading the original
|
||||
landlord upload (raw headers) rather than the address-matching CSV.
|
||||
"""
|
||||
col_to_desc = self._descriptions_from_rows(rows)
|
||||
return {
|
||||
column.name: column.classifier.classify(
|
||||
col_to_desc.get(column.source_column, set())
|
||||
)
|
||||
for column in self._columns
|
||||
}
|
||||
|
||||
def classify_and_persist_from_rows(
|
||||
self, rows: list[dict[str, str]], portfolio_id: int
|
||||
) -> dict[str, dict[str, Enum]]:
|
||||
"""Classify + persist straight from raw CSV rows in one call.
|
||||
|
||||
A convenience composition of ``classify_from_rows`` + ``persist``.
|
||||
Prefer calling the two separately when classification is slow, so the
|
||||
transaction opens only around ``persist`` (see the Lambda handler).
|
||||
"""
|
||||
classified = self.classify_from_rows(rows)
|
||||
self.persist(classified, portfolio_id)
|
||||
return classified
|
||||
|
||||
@staticmethod
|
||||
def _descriptions_from_rows(rows: list[dict[str, str]]) -> dict[str, set[str]]:
|
||||
mappings: dict[str, set[str]] = {}
|
||||
for row in rows:
|
||||
for key, value in row.items():
|
||||
bucket = mappings.setdefault(key, set())
|
||||
for variant in (value or "").split(","):
|
||||
variant = variant.strip().lower()
|
||||
if variant:
|
||||
bucket.add(variant)
|
||||
return mappings
|
||||
|
|
@ -5,19 +5,21 @@ from uuid import UUID
|
|||
from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
|
||||
from orchestration.task_orchestrator import TaskOrchestrator
|
||||
from domain.addresses.postcode_batching import iter_postcode_grouped_batches
|
||||
from repositories.user_address.user_address_repository import UserAddressRepository
|
||||
from repositories.unstandardised_address.unstandardised_address_list_repository import (
|
||||
UnstandardisedAddressListRepository,
|
||||
)
|
||||
|
||||
|
||||
class PostcodeSplitterOrchestrator:
|
||||
def __init__(
|
||||
self,
|
||||
task_orchestrator: TaskOrchestrator,
|
||||
user_address_repo: UserAddressRepository,
|
||||
unstandardised_address_repo: UnstandardisedAddressListRepository,
|
||||
queue_client: Address2UprnQueueClient,
|
||||
max_batch_size: int = 500,
|
||||
) -> None:
|
||||
self._task_orchestrator = task_orchestrator
|
||||
self._user_address_repo = user_address_repo
|
||||
self._unstandardised_address_repo = unstandardised_address_repo
|
||||
self._queue_client = queue_client
|
||||
self._max_batch_size = max_batch_size
|
||||
|
||||
|
|
@ -28,7 +30,7 @@ class PostcodeSplitterOrchestrator:
|
|||
parent_subtask_id: UUID,
|
||||
input_s3_uri: str,
|
||||
) -> list[UUID]:
|
||||
addresses = self._user_address_repo.load_batch(input_s3_uri)
|
||||
addresses = self._unstandardised_address_repo.load_batch(input_s3_uri)
|
||||
path_prefix = (
|
||||
f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}"
|
||||
)
|
||||
|
|
@ -37,7 +39,7 @@ class PostcodeSplitterOrchestrator:
|
|||
for batch in iter_postcode_grouped_batches(
|
||||
addresses, max_batch_size=self._max_batch_size
|
||||
):
|
||||
batch_uri = self._user_address_repo.save_batch(batch, path_prefix)
|
||||
batch_uri = self._unstandardised_address_repo.save_batch(batch, path_prefix)
|
||||
child = self._task_orchestrator.create_child_subtask(
|
||||
parent_task_id,
|
||||
inputs={
|
||||
|
|
|
|||
|
|
@ -26,5 +26,6 @@ testpaths =
|
|||
etl/hubspot/tests
|
||||
etl/spatial/tests
|
||||
domain/sap10_ml/tests
|
||||
tests/
|
||||
markers =
|
||||
integration: mark a test as an integration test
|
||||
|
|
|
|||
0
repositories/landlord_overrides/__init__.py
Normal file
0
repositories/landlord_overrides/__init__.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import Generic, TypeVar
|
||||
|
||||
E = TypeVar("E", bound=Enum)
|
||||
|
||||
|
||||
class LandlordOverrideRepository(ABC, Generic[E]):
|
||||
"""Port: persists landlord (description -> category) overrides for a portfolio.
|
||||
|
||||
One repository implementation targets one ``landlord_<category>_overrides``
|
||||
table. The category enum ``E`` (e.g. ``PropertyType``, ``WallType``) determines
|
||||
which table the adapter writes to; the orchestrator depends only on this
|
||||
interface and never names a concrete table.
|
||||
|
||||
A single concrete adapter,
|
||||
``infrastructure/landlord_overrides/landlord_overrides_postgres_repository.LandlordOverridesRepository``,
|
||||
serves every category (see ADR-0003) -- it is parameterised by the
|
||||
SQLModel row class for the target table.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def upsert_all(
|
||||
self,
|
||||
portfolio_id: int,
|
||||
descriptions_to_values: dict[str, E],
|
||||
) -> None:
|
||||
"""Upsert each ``(portfolio_id, description) -> value`` row with ``source='classifier'``.
|
||||
|
||||
On conflict with an existing row whose ``source = 'classifier'``, the row
|
||||
is updated (value, source, updated_at). On conflict with a row whose
|
||||
``source = 'user'``, the existing row is preserved -- the classifier
|
||||
never overwrites a user correction. See ADR-0003 §Decision.
|
||||
|
||||
An empty ``descriptions_to_values`` mapping is a no-op; callers may
|
||||
skip this call entirely when they have nothing to write.
|
||||
"""
|
||||
...
|
||||
0
repositories/unstandardised_address/__init__.py
Normal file
0
repositories/unstandardised_address/__init__.py
Normal file
|
|
@ -4,10 +4,12 @@ import uuid
|
|||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from domain.addresses.user_address import UserAddress
|
||||
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
|
||||
from domain.postcode import Postcode
|
||||
from infrastructure.csv_s3_client import CsvS3Client
|
||||
from repositories.user_address.user_address_repository import UserAddressRepository
|
||||
from infrastructure.s3.csv_s3_client import CsvS3Client
|
||||
from repositories.unstandardised_address.unstandardised_address_list_repository import (
|
||||
UnstandardisedAddressListRepository,
|
||||
)
|
||||
|
||||
_ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3")
|
||||
_POSTCODE_COLUMN: str = "postcode"
|
||||
|
|
@ -15,42 +17,45 @@ _INTERNAL_REFERENCE_COLUMN: str = "Internal Reference"
|
|||
_POSTCODE_CLEAN_COLUMN: str = "postcode_clean"
|
||||
|
||||
|
||||
class UserAddressCsvS3Repository(UserAddressRepository):
|
||||
class UnstandardisedAddressListCsvS3Repository(UnstandardisedAddressListRepository):
|
||||
def __init__(self, csv_client: CsvS3Client, bucket: str) -> None:
|
||||
self._csv_client = csv_client
|
||||
self._bucket = bucket
|
||||
|
||||
def load_batch(self, s3_uri: str) -> list[UserAddress]:
|
||||
def load_batch(self, s3_uri: str) -> AddressList:
|
||||
rows = self._csv_client.read_rows(s3_uri)
|
||||
if rows and _POSTCODE_COLUMN not in rows[0]:
|
||||
raise ValueError(
|
||||
f"Input CSV {s3_uri} has no {_POSTCODE_COLUMN!r} column; "
|
||||
f"columns present: {sorted(rows[0])}"
|
||||
)
|
||||
addresses: list[UserAddress] = []
|
||||
addresses: AddressList = AddressList([])
|
||||
for row in rows:
|
||||
parts = [
|
||||
row[col].strip()
|
||||
for col in _ADDRESS_COLUMNS
|
||||
if col in row and row[col].strip()
|
||||
]
|
||||
user_address = ", ".join(parts)
|
||||
unstandardised_address = ", ".join(parts)
|
||||
postcode = row.get(_POSTCODE_COLUMN, "")
|
||||
raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip()
|
||||
internal_reference: Optional[str] = raw_ref or None
|
||||
addresses.append(
|
||||
UserAddress(
|
||||
user_address=user_address,
|
||||
UnstandardisedAddress(
|
||||
address=unstandardised_address,
|
||||
postcode=Postcode(postcode),
|
||||
internal_reference=internal_reference,
|
||||
source_row=row,
|
||||
org_reference=internal_reference,
|
||||
additional_info=row,
|
||||
)
|
||||
)
|
||||
return addresses
|
||||
|
||||
def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str:
|
||||
def save_batch(self, addresses: AddressList, path_prefix: str) -> str:
|
||||
rows: list[dict[str, str]] = [
|
||||
{**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)}
|
||||
{
|
||||
**addr.additional_info,
|
||||
_POSTCODE_CLEAN_COLUMN: str(addr.postcode),
|
||||
}
|
||||
for addr in addresses
|
||||
]
|
||||
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from domain.addresses.unstandardised_address import AddressList
|
||||
|
||||
|
||||
class UnstandardisedAddressListRepository(ABC):
|
||||
@abstractmethod
|
||||
def load_batch(self, s3_uri: str) -> AddressList: ...
|
||||
|
||||
@abstractmethod
|
||||
def save_batch(self, addresses: AddressList, path_prefix: str) -> str: ...
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from domain.addresses.user_address import UserAddress
|
||||
|
||||
|
||||
class UserAddressRepository(ABC):
|
||||
@abstractmethod
|
||||
def load_batch(self, s3_uri: str) -> list[UserAddress]: ...
|
||||
|
||||
@abstractmethod
|
||||
def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: ...
|
||||
|
|
@ -10,4 +10,5 @@ fuzzywuzzy
|
|||
pymupdf
|
||||
playwright==1.58.0
|
||||
msal
|
||||
moto[s3,sqs]
|
||||
moto[s3,sqs]
|
||||
openai==1.93.0
|
||||
|
|
@ -1,17 +1,17 @@
|
|||
import pytest
|
||||
|
||||
from domain.addresses.postcode_batching import iter_postcode_grouped_batches
|
||||
from domain.addresses.user_address import UserAddress
|
||||
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
|
||||
from domain.postcode import Postcode
|
||||
|
||||
|
||||
def _addrs(postcode: str, n: int) -> list[UserAddress]:
|
||||
return [
|
||||
UserAddress(
|
||||
user_address=f"{i} {postcode} Street", postcode=Postcode(postcode)
|
||||
)
|
||||
for i in range(n)
|
||||
]
|
||||
def _addrs(postcode: str, n: int) -> AddressList:
|
||||
return AddressList(
|
||||
[
|
||||
UnstandardisedAddress(address=f"{i} {postcode} Street", postcode=Postcode(postcode))
|
||||
for i in range(n)
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def test_empty_input_yields_no_batches() -> None:
|
||||
|
|
@ -74,9 +74,7 @@ def test_oversize_group_flushes_existing_buffer_first() -> None:
|
|||
big = _addrs("BB2 2BB", 7)
|
||||
tail = _addrs("CC3 3CC", 1)
|
||||
# act
|
||||
batches = list(
|
||||
iter_postcode_grouped_batches(small + big + tail, max_batch_size=5)
|
||||
)
|
||||
batches = list(iter_postcode_grouped_batches(small + big + tail, max_batch_size=5))
|
||||
# assert
|
||||
assert len(batches) == 3
|
||||
assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"]
|
||||
|
|
|
|||
96
tests/domain/addresses/test_unstandardised_address.py
Normal file
96
tests/domain/addresses/test_unstandardised_address.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
import dataclasses
|
||||
|
||||
import pytest
|
||||
|
||||
from domain.addresses.unstandardised_address import UnstandardisedAddress
|
||||
from domain.postcode import Postcode
|
||||
|
||||
|
||||
def test_unstandardised_address_holds_postcode_value_object() -> None:
|
||||
# act
|
||||
addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa"))
|
||||
# assert
|
||||
assert addr.postcode == Postcode("SW1A1AA")
|
||||
|
||||
|
||||
def test_unstandardised_address_preserves_unstandardised_address_verbatim() -> None:
|
||||
# The free-text unstandardised_address string is intentionally NOT normalised --
|
||||
# only the postcode is canonicalised, and that happens inside Postcode.
|
||||
# act
|
||||
addr = UnstandardisedAddress(address=" 1 The Street ", postcode=Postcode("SW1A1AA"))
|
||||
# assert
|
||||
assert addr.address == " 1 The Street "
|
||||
|
||||
|
||||
def test_unstandardised_address_internal_reference_defaults_to_none() -> None:
|
||||
# act
|
||||
addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA"))
|
||||
# assert
|
||||
assert addr.org_reference is None
|
||||
|
||||
|
||||
def test_unstandardised_address_internal_reference_accepted() -> None:
|
||||
# act
|
||||
addr = UnstandardisedAddress(
|
||||
address="1 The Street",
|
||||
postcode=Postcode("SW1A1AA"),
|
||||
org_reference="cust-42",
|
||||
)
|
||||
# assert
|
||||
assert addr.org_reference == "cust-42"
|
||||
|
||||
|
||||
def test_unstandardised_address_is_frozen() -> None:
|
||||
# arrange
|
||||
addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA"))
|
||||
# act / assert
|
||||
with pytest.raises(dataclasses.FrozenInstanceError):
|
||||
addr.postcode = Postcode("OTHER") # type: ignore[misc]
|
||||
|
||||
|
||||
def test_unstandardised_address_equality_uses_canonical_postcode() -> None:
|
||||
# Postcode sanitises eagerly, so addresses built from different surface
|
||||
# forms of the same postcode compare equal.
|
||||
# arrange
|
||||
a = UnstandardisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa"))
|
||||
b = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA"))
|
||||
# act / assert
|
||||
assert a == b
|
||||
|
||||
|
||||
def test_unstandardised_address_source_row_defaults_to_empty_dict() -> None:
|
||||
# act
|
||||
addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA"))
|
||||
# assert
|
||||
assert addr.additional_info == {}
|
||||
|
||||
|
||||
def test_unstandardised_address_carries_source_row() -> None:
|
||||
# arrange
|
||||
row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"}
|
||||
# act
|
||||
addr = UnstandardisedAddress(
|
||||
address="1 The Street",
|
||||
postcode=Postcode("SW1A 1AA"),
|
||||
additional_info=row,
|
||||
)
|
||||
# assert
|
||||
assert addr.additional_info == row
|
||||
|
||||
|
||||
def test_unstandardised_address_equality_ignores_source_row() -> None:
|
||||
# source_row is excluded from equality (and hashing): identity stays
|
||||
# defined by the parsed fields.
|
||||
# arrange
|
||||
a = UnstandardisedAddress(
|
||||
address="1 The Street",
|
||||
postcode=Postcode("SW1A1AA"),
|
||||
additional_info={"x": "1"},
|
||||
)
|
||||
b = UnstandardisedAddress(
|
||||
address="1 The Street",
|
||||
postcode=Postcode("SW1A1AA"),
|
||||
additional_info={"y": "2"},
|
||||
)
|
||||
# act / assert
|
||||
assert a == b
|
||||
|
|
@ -1,98 +0,0 @@
|
|||
import dataclasses
|
||||
|
||||
import pytest
|
||||
|
||||
from domain.addresses.user_address import UserAddress
|
||||
from domain.postcode import Postcode
|
||||
|
||||
|
||||
def test_user_address_holds_postcode_value_object() -> None:
|
||||
# act
|
||||
addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
|
||||
# assert
|
||||
assert addr.postcode == Postcode("SW1A1AA")
|
||||
|
||||
|
||||
def test_user_address_preserves_user_address_verbatim() -> None:
|
||||
# The free-text user_address string is intentionally NOT normalised --
|
||||
# only the postcode is canonicalised, and that happens inside Postcode.
|
||||
# act
|
||||
addr = UserAddress(
|
||||
user_address=" 1 The Street ", postcode=Postcode("SW1A1AA")
|
||||
)
|
||||
# assert
|
||||
assert addr.user_address == " 1 The Street "
|
||||
|
||||
|
||||
def test_user_address_internal_reference_defaults_to_none() -> None:
|
||||
# act
|
||||
addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
|
||||
# assert
|
||||
assert addr.internal_reference is None
|
||||
|
||||
|
||||
def test_user_address_internal_reference_accepted() -> None:
|
||||
# act
|
||||
addr = UserAddress(
|
||||
user_address="1 The Street",
|
||||
postcode=Postcode("SW1A1AA"),
|
||||
internal_reference="cust-42",
|
||||
)
|
||||
# assert
|
||||
assert addr.internal_reference == "cust-42"
|
||||
|
||||
|
||||
def test_user_address_is_frozen() -> None:
|
||||
# arrange
|
||||
addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
|
||||
# act / assert
|
||||
with pytest.raises(dataclasses.FrozenInstanceError):
|
||||
addr.postcode = Postcode("OTHER") # type: ignore[misc]
|
||||
|
||||
|
||||
def test_user_address_equality_uses_canonical_postcode() -> None:
|
||||
# Postcode sanitises eagerly, so addresses built from different surface
|
||||
# forms of the same postcode compare equal.
|
||||
# arrange
|
||||
a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
|
||||
b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
|
||||
# act / assert
|
||||
assert a == b
|
||||
|
||||
|
||||
def test_user_address_source_row_defaults_to_empty_dict() -> None:
|
||||
# act
|
||||
addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
|
||||
# assert
|
||||
assert addr.source_row == {}
|
||||
|
||||
|
||||
def test_user_address_carries_source_row() -> None:
|
||||
# arrange
|
||||
row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"}
|
||||
# act
|
||||
addr = UserAddress(
|
||||
user_address="1 The Street",
|
||||
postcode=Postcode("SW1A 1AA"),
|
||||
source_row=row,
|
||||
)
|
||||
# assert
|
||||
assert addr.source_row == row
|
||||
|
||||
|
||||
def test_user_address_equality_ignores_source_row() -> None:
|
||||
# source_row is excluded from equality (and hashing): identity stays
|
||||
# defined by the parsed fields.
|
||||
# arrange
|
||||
a = UserAddress(
|
||||
user_address="1 The Street",
|
||||
postcode=Postcode("SW1A1AA"),
|
||||
source_row={"x": "1"},
|
||||
)
|
||||
b = UserAddress(
|
||||
user_address="1 The Street",
|
||||
postcode=Postcode("SW1A1AA"),
|
||||
source_row={"y": "2"},
|
||||
)
|
||||
# act / assert
|
||||
assert a == b
|
||||
0
tests/infrastructure/chatgpt/__init__.py
Normal file
0
tests/infrastructure/chatgpt/__init__.py
Normal file
185
tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py
Normal file
185
tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from domain.data_transformation.column_classifier import ClassificationError
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.wall_type import WallType
|
||||
from infrastructure.chatgpt.chatgpt import ChatGPT
|
||||
from infrastructure.chatgpt.chatgpt_column_classifier import (
|
||||
ChatGptColumnClassifier,
|
||||
)
|
||||
from infrastructure.chatgpt.exceptions import ChatGPTClientError
|
||||
|
||||
|
||||
class _FakeChatGPT(ChatGPT):
|
||||
"""Hand-written ChatGPT stand-in: returns a canned reply, records prompts."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
reply: str = "{}",
|
||||
error: Optional[Exception] = None,
|
||||
) -> None:
|
||||
self.prompts: list[str] = []
|
||||
self.system_prompts: list[Optional[str]] = []
|
||||
self._reply = reply
|
||||
self._error = error
|
||||
|
||||
def generate(self, prompt: str, system_prompt: Optional[str] = None) -> str:
|
||||
self.prompts.append(prompt)
|
||||
self.system_prompts.append(system_prompt)
|
||||
if self._error is not None:
|
||||
raise self._error
|
||||
return self._reply
|
||||
|
||||
|
||||
def _property_type_classifier(
|
||||
chat_gpt: ChatGPT,
|
||||
) -> ChatGptColumnClassifier[PropertyType]:
|
||||
return ChatGptColumnClassifier(chat_gpt, PropertyType, PropertyType.UNKNOWN)
|
||||
|
||||
|
||||
def test_classifies_description_into_its_category() -> None:
|
||||
# Arrange
|
||||
chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}')
|
||||
classifier = _property_type_classifier(chat_gpt)
|
||||
|
||||
# Act
|
||||
result = classifier.classify({"semi-detached"})
|
||||
|
||||
# Assert
|
||||
assert result == {"semi-detached": PropertyType.HOUSE}
|
||||
|
||||
|
||||
def test_classifies_when_reply_is_wrapped_in_a_markdown_fence() -> None:
|
||||
# Arrange: ChatGPT wraps the JSON in a ```json ... ``` code fence.
|
||||
chat_gpt = _FakeChatGPT(reply='```json\n{"semi-detached": "House"}\n```')
|
||||
classifier = _property_type_classifier(chat_gpt)
|
||||
|
||||
# Act
|
||||
result = classifier.classify({"semi-detached"})
|
||||
|
||||
# Assert
|
||||
assert result == {"semi-detached": PropertyType.HOUSE}
|
||||
|
||||
|
||||
def test_unrecognised_category_maps_to_unknown() -> None:
|
||||
# Arrange
|
||||
chat_gpt = _FakeChatGPT(reply='{"garden shed": "Shed"}')
|
||||
classifier = _property_type_classifier(chat_gpt)
|
||||
|
||||
# Act
|
||||
result = classifier.classify({"garden shed"})
|
||||
|
||||
# Assert
|
||||
assert result == {"garden shed": PropertyType.UNKNOWN}
|
||||
|
||||
|
||||
def test_description_omitted_from_reply_maps_to_unknown() -> None:
|
||||
# Arrange: the reply classifies one description but not the other.
|
||||
chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}')
|
||||
classifier = _property_type_classifier(chat_gpt)
|
||||
|
||||
# Act
|
||||
result = classifier.classify({"semi-detached", "TBC"})
|
||||
|
||||
# Assert
|
||||
assert result == {
|
||||
"semi-detached": PropertyType.HOUSE,
|
||||
"TBC": PropertyType.UNKNOWN,
|
||||
}
|
||||
|
||||
|
||||
def test_chatgpt_failure_raises_classification_error() -> None:
|
||||
# Arrange
|
||||
chat_gpt = _FakeChatGPT(error=ChatGPTClientError("backend unreachable"))
|
||||
classifier = _property_type_classifier(chat_gpt)
|
||||
|
||||
# Act / Assert
|
||||
with pytest.raises(ClassificationError):
|
||||
classifier.classify({"semi-detached"})
|
||||
|
||||
|
||||
def test_non_json_reply_raises_classification_error_with_the_raw_reply() -> None:
|
||||
# Arrange
|
||||
chat_gpt = _FakeChatGPT(reply="sorry, I can't do that")
|
||||
classifier = _property_type_classifier(chat_gpt)
|
||||
|
||||
# Act / Assert: the error surfaces the offending reply for diagnosis.
|
||||
with pytest.raises(ClassificationError, match="sorry, I can't do that"):
|
||||
classifier.classify({"semi-detached"})
|
||||
|
||||
|
||||
def test_empty_description_set_returns_empty_without_calling_chatgpt() -> None:
|
||||
# Arrange
|
||||
chat_gpt = _FakeChatGPT(reply='{"unused": "House"}')
|
||||
classifier = _property_type_classifier(chat_gpt)
|
||||
|
||||
# Act
|
||||
result = classifier.classify(set())
|
||||
|
||||
# Assert
|
||||
assert result == {}
|
||||
assert chat_gpt.prompts == []
|
||||
|
||||
|
||||
def test_classifies_with_a_different_category_enum() -> None:
|
||||
# Arrange: the same adapter classifies a WallType column.
|
||||
chat_gpt = _FakeChatGPT(
|
||||
reply='{"solid brick wall": "Solid brick, as built, no insulation (assumed)"}'
|
||||
)
|
||||
classifier = ChatGptColumnClassifier(chat_gpt, WallType, WallType.UNKNOWN)
|
||||
|
||||
# Act
|
||||
result = classifier.classify({"solid brick wall"})
|
||||
|
||||
# Assert
|
||||
assert result == {
|
||||
"solid brick wall": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED
|
||||
}
|
||||
|
||||
|
||||
def test_extra_instructions_are_appended_to_the_system_prompt() -> None:
|
||||
# Arrange: column-specific guidance (e.g. wall-type build-era hints)
|
||||
# should reach the model verbatim, in the system prompt ahead of the
|
||||
# JSON-output instruction.
|
||||
chat_gpt = _FakeChatGPT(reply='{"1970s semi": "House"}')
|
||||
classifier = ChatGptColumnClassifier(
|
||||
chat_gpt,
|
||||
PropertyType,
|
||||
PropertyType.UNKNOWN,
|
||||
extra_instructions="If the description carries a build decade, prefer X.",
|
||||
)
|
||||
|
||||
# Act
|
||||
classifier.classify({"1970s semi"})
|
||||
|
||||
# Assert: the hint sits in the system prompt, before the JSON instruction.
|
||||
system_prompt = chat_gpt.system_prompts[0]
|
||||
assert system_prompt is not None
|
||||
assert "If the description carries a build decade, prefer X." in system_prompt
|
||||
hint_index = system_prompt.index("If the description carries a build decade")
|
||||
json_index = system_prompt.index("Reply with only a JSON object")
|
||||
assert hint_index < json_index
|
||||
|
||||
|
||||
def test_omitting_extra_instructions_leaves_the_system_prompt_unchanged() -> None:
|
||||
# Arrange: a classifier without per-column guidance must still produce
|
||||
# the original system prompt -- no trailing whitespace, no orphan hint.
|
||||
chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}')
|
||||
classifier = ChatGptColumnClassifier(chat_gpt, PropertyType, PropertyType.UNKNOWN)
|
||||
|
||||
# Act
|
||||
classifier.classify({"semi-detached"})
|
||||
|
||||
# Assert
|
||||
system_prompt = chat_gpt.system_prompts[0]
|
||||
assert system_prompt is not None
|
||||
assert system_prompt == (
|
||||
"Classify each free-text description into exactly one category. "
|
||||
"Categories: House, Bungalow, Flat, Maisonette, Park home. "
|
||||
"Reply with only a JSON object mapping each original description "
|
||||
"to its category, and nothing else."
|
||||
)
|
||||
|
|
@ -3,7 +3,7 @@ from collections.abc import Iterator
|
|||
import pytest
|
||||
from moto import mock_aws
|
||||
|
||||
from infrastructure.csv_s3_client import CsvS3Client
|
||||
from infrastructure.s3.csv_s3_client import CsvS3Client
|
||||
from tests.infrastructure import make_boto_client
|
||||
|
||||
BUCKET = "csv-bucket"
|
||||
|
|
@ -49,3 +49,36 @@ def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None:
|
|||
# act / assert
|
||||
with pytest.raises(ValueError, match="does not match client bucket"):
|
||||
csv_client.read_rows("s3://other-bucket/uploads/addresses.csv")
|
||||
|
||||
|
||||
def test_read_rows_indexes_duplicate_column_names(csv_client: CsvS3Client) -> None:
|
||||
# arrange: the Hyde export has two columns both headed "Walls" — a
|
||||
# description and a score. Without disambiguation csv.DictReader would
|
||||
# collapse them onto one key and the description would be lost.
|
||||
raw = "Address 1,Walls,Roofs,Walls\n1 High St,Cavity: Filled,Pitched 300mm,9.6\n"
|
||||
uri = csv_client.put_object("uploads/dup.csv", raw.encode("utf-8"))
|
||||
|
||||
# act
|
||||
rows = csv_client.read_rows(uri)
|
||||
|
||||
# assert: the first occurrence keeps its name, the second gets an index.
|
||||
assert rows == [
|
||||
{
|
||||
"Address 1": "1 High St",
|
||||
"Walls": "Cavity: Filled",
|
||||
"Roofs": "Pitched 300mm",
|
||||
"Walls_1": "9.6",
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def test_read_rows_indexes_each_repeat_of_a_column(csv_client: CsvS3Client) -> None:
|
||||
# arrange: three columns sharing one header.
|
||||
raw = "Walls,Walls,Walls\nfirst,second,third\n"
|
||||
uri = csv_client.put_object("uploads/triple.csv", raw.encode("utf-8"))
|
||||
|
||||
# act
|
||||
rows = csv_client.read_rows(uri)
|
||||
|
||||
# assert
|
||||
assert rows == [{"Walls": "first", "Walls_1": "second", "Walls_2": "third"}]
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ from collections.abc import Iterator
|
|||
import pytest
|
||||
from moto import mock_aws
|
||||
|
||||
from infrastructure.s3_client import S3Client
|
||||
from infrastructure.s3.s3_client import S3Client
|
||||
from tests.infrastructure import make_boto_client
|
||||
|
||||
BUCKET = "test-bucket"
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
|
||||
from infrastructure.s3_uri import parse_s3_uri
|
||||
from infrastructure.s3.s3_uri import parse_s3_uri
|
||||
|
||||
|
||||
def test_parses_simple_s3_uri() -> None:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,469 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Any, Optional
|
||||
|
||||
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
|
||||
from domain.epc.built_form_type import BuiltFormType
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.wall_type import WallType
|
||||
from domain.postcode import Postcode
|
||||
from domain.data_transformation.column_classifier import ColumnClassifier
|
||||
from orchestration.classifiable_column import ClassifiableColumn
|
||||
from orchestration.landlord_description_overrides_orchestrator import (
|
||||
LandlordDescriptionOverridesOrchestrator,
|
||||
)
|
||||
from repositories.landlord_overrides.landlord_override_repository import (
|
||||
LandlordOverrideRepository,
|
||||
)
|
||||
from repositories.unstandardised_address.unstandardised_address_list_repository import (
|
||||
UnstandardisedAddressListRepository,
|
||||
)
|
||||
|
||||
|
||||
class _StubUnstandardisedAddressRepository(UnstandardisedAddressListRepository):
|
||||
"""``get_col_to_description_mappings`` never touches the repo."""
|
||||
|
||||
def load_batch(self, s3_uri: str) -> AddressList:
|
||||
raise NotImplementedError()
|
||||
|
||||
def save_batch(self, addresses: AddressList, path_prefix: str) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class _StubColumnClassifier(ColumnClassifier[Enum]):
|
||||
"""Records the descriptions it received; returns a canned mapping."""
|
||||
|
||||
def __init__(self, result: dict[str, Enum]) -> None:
|
||||
self.received: Optional[set[str]] = None
|
||||
self._result = result
|
||||
|
||||
def classify(self, descriptions: set[str]) -> dict[str, Enum]:
|
||||
self.received = descriptions
|
||||
return self._result
|
||||
|
||||
|
||||
class _StubLandlordOverrideRepository(LandlordOverrideRepository[Enum]):
|
||||
"""Records every ``upsert_all`` call so tests can assert routing."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.calls: list[tuple[int, dict[str, Enum]]] = []
|
||||
|
||||
def upsert_all(
|
||||
self, portfolio_id: int, descriptions_to_values: dict[str, Enum]
|
||||
) -> None:
|
||||
self.calls.append((portfolio_id, dict(descriptions_to_values)))
|
||||
|
||||
|
||||
def _make_unstandardised_address(
|
||||
landlord_additional_info: dict[str, str],
|
||||
) -> UnstandardisedAddress:
|
||||
return UnstandardisedAddress(
|
||||
address="1 High St",
|
||||
postcode=Postcode("AA1 1AA"),
|
||||
additional_info=landlord_additional_info,
|
||||
)
|
||||
|
||||
|
||||
def _orchestrator(
|
||||
columns: Optional[list[ClassifiableColumn[Any]]] = None,
|
||||
) -> LandlordDescriptionOverridesOrchestrator:
|
||||
return LandlordDescriptionOverridesOrchestrator(
|
||||
unstandardised_address_repo=_StubUnstandardisedAddressRepository(),
|
||||
columns=columns or [],
|
||||
)
|
||||
|
||||
|
||||
def _column(
|
||||
name: str,
|
||||
source_column: str,
|
||||
classifier: ColumnClassifier[Any],
|
||||
repo: Optional[LandlordOverrideRepository[Any]] = None,
|
||||
) -> ClassifiableColumn[Any]:
|
||||
return ClassifiableColumn(
|
||||
name=name,
|
||||
source_column=source_column,
|
||||
classifier=classifier,
|
||||
repo=repo or _StubLandlordOverrideRepository(),
|
||||
)
|
||||
|
||||
|
||||
def test_collects_every_value_per_shared_key() -> None:
|
||||
# arrange: every address carries the same keys, all values distinct.
|
||||
addresses = AddressList(
|
||||
[
|
||||
_make_unstandardised_address({"description": "cosy", "condition": "new"}),
|
||||
_make_unstandardised_address({"description": "spacious", "condition": "worn"}),
|
||||
_make_unstandardised_address({"description": "bright", "condition": "fair"}),
|
||||
]
|
||||
)
|
||||
|
||||
# act
|
||||
mappings = _orchestrator().get_col_to_description_mappings(addresses)
|
||||
|
||||
# assert
|
||||
assert mappings == {
|
||||
"description": {"cosy", "spacious", "bright"},
|
||||
"condition": {"new", "worn", "fair"},
|
||||
}
|
||||
|
||||
|
||||
def test_repeated_values_collapse_to_one_variant() -> None:
|
||||
# arrange: two addresses share the same wall description.
|
||||
addresses = AddressList(
|
||||
[
|
||||
_make_unstandardised_address({"description": "cosy"}),
|
||||
_make_unstandardised_address({"description": "cosy"}),
|
||||
_make_unstandardised_address({"description": "bright"}),
|
||||
]
|
||||
)
|
||||
|
||||
# act
|
||||
mappings = _orchestrator().get_col_to_description_mappings(addresses)
|
||||
|
||||
# assert: a set keeps one entry per distinct variant.
|
||||
assert mappings == {"description": {"cosy", "bright"}}
|
||||
|
||||
|
||||
def test_case_only_variants_collapse_to_one() -> None:
|
||||
# arrange: the same description typed with inconsistent casing.
|
||||
addresses = AddressList(
|
||||
[
|
||||
_make_unstandardised_address({"description": "Cosy"}),
|
||||
_make_unstandardised_address({"description": "cosy"}),
|
||||
_make_unstandardised_address({"description": "COSY"}),
|
||||
]
|
||||
)
|
||||
|
||||
# act
|
||||
mappings = _orchestrator().get_col_to_description_mappings(addresses)
|
||||
|
||||
# assert: lower-casing folds the casing typos into one variant.
|
||||
assert mappings == {"description": {"cosy"}}
|
||||
|
||||
|
||||
def test_comma_separated_value_splits_into_individual_entries() -> None:
|
||||
# arrange: a single cell packs several descriptions, comma-separated.
|
||||
addresses = AddressList(
|
||||
[_make_unstandardised_address({"description": "cosy, bright, COSY"})]
|
||||
)
|
||||
|
||||
# act
|
||||
mappings = _orchestrator().get_col_to_description_mappings(addresses)
|
||||
|
||||
# assert: each comma-separated part is its own trimmed, lower-cased entry.
|
||||
assert mappings == {"description": {"cosy", "bright"}}
|
||||
|
||||
|
||||
def test_empty_address_list_yields_empty_mapping() -> None:
|
||||
# arrange / act
|
||||
mappings = _orchestrator().get_col_to_description_mappings(AddressList([]))
|
||||
|
||||
# assert
|
||||
assert mappings == {}
|
||||
|
||||
|
||||
def test_single_address_yields_single_value_per_key() -> None:
|
||||
# arrange
|
||||
addresses = AddressList([_make_unstandardised_address({"description": "cosy"})])
|
||||
|
||||
# act
|
||||
mappings = _orchestrator().get_col_to_description_mappings(addresses)
|
||||
|
||||
# assert
|
||||
assert mappings == {"description": {"cosy"}}
|
||||
|
||||
|
||||
def test_classify_columns_classifies_each_registered_column() -> None:
|
||||
# arrange: addresses carry two classifiable columns.
|
||||
addresses = AddressList(
|
||||
[
|
||||
_make_unstandardised_address(
|
||||
{"Property Type": "semi-detached", "Walls": "solid brick"}
|
||||
),
|
||||
]
|
||||
)
|
||||
property_types = _StubColumnClassifier(
|
||||
result={"semi-detached": PropertyType.HOUSE}
|
||||
)
|
||||
wall_types = _StubColumnClassifier(result={"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED})
|
||||
|
||||
# act
|
||||
result = _orchestrator(
|
||||
[
|
||||
_column("property_type", "Property Type", property_types),
|
||||
_column("wall_type", "Walls", wall_types),
|
||||
]
|
||||
).classify_columns(addresses)
|
||||
|
||||
# assert: each registered column was classified independently, keyed by name.
|
||||
assert result == {
|
||||
"property_type": {"semi-detached": PropertyType.HOUSE},
|
||||
"wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED},
|
||||
}
|
||||
|
||||
|
||||
def test_classify_columns_yields_empty_mapping_for_an_absent_column() -> None:
|
||||
# arrange: a classifier is registered for a source column the addresses lack.
|
||||
addresses = AddressList([_make_unstandardised_address({"Walls": "cavity"})])
|
||||
property_types = _StubColumnClassifier(result={})
|
||||
|
||||
# act
|
||||
result = _orchestrator(
|
||||
[_column("property_type", "Property Type", property_types)]
|
||||
).classify_columns(addresses)
|
||||
|
||||
# assert: the absent column classified an empty description set.
|
||||
assert result == {"property_type": {}}
|
||||
assert property_types.received == set()
|
||||
|
||||
|
||||
def test_classify_columns_runs_two_classifiers_against_a_shared_source_column() -> None:
|
||||
# arrange: the "Property Type" landlord column feeds two classifiers --
|
||||
# PropertyType (what kind of dwelling) and BuiltFormType (how it joins
|
||||
# to neighbours). Both must run against the same description set; each
|
||||
# result is keyed by its column's ``name``.
|
||||
addresses = AddressList(
|
||||
[_make_unstandardised_address({"Property Type": "semi-detached house"})]
|
||||
)
|
||||
property_types = _StubColumnClassifier(
|
||||
result={"semi-detached house": PropertyType.HOUSE}
|
||||
)
|
||||
built_form_types = _StubColumnClassifier(
|
||||
result={"semi-detached house": BuiltFormType.SEMI_DETACHED}
|
||||
)
|
||||
|
||||
# act
|
||||
result = _orchestrator(
|
||||
[
|
||||
_column("property_type", "Property Type", property_types),
|
||||
_column("built_form_type", "Property Type", built_form_types),
|
||||
]
|
||||
).classify_columns(addresses)
|
||||
|
||||
# assert: both classifiers saw the same description set, and the two
|
||||
# results live under their own ``name`` keys without colliding.
|
||||
assert property_types.received == {"semi-detached house"}
|
||||
assert built_form_types.received == {"semi-detached house"}
|
||||
assert result == {
|
||||
"property_type": {"semi-detached house": PropertyType.HOUSE},
|
||||
"built_form_type": {"semi-detached house": BuiltFormType.SEMI_DETACHED},
|
||||
}
|
||||
|
||||
|
||||
def test_classify_and_persist_writes_each_columns_mapping_to_its_own_repo() -> None:
|
||||
# arrange: two columns with distinct repos -- the orchestrator must
|
||||
# route each column's classifications to its own repo, not mix them.
|
||||
addresses = AddressList(
|
||||
[
|
||||
_make_unstandardised_address(
|
||||
{"Property Type": "semi-detached", "Walls": "solid brick"}
|
||||
),
|
||||
]
|
||||
)
|
||||
property_type_repo = _StubLandlordOverrideRepository()
|
||||
wall_type_repo = _StubLandlordOverrideRepository()
|
||||
columns: list[ClassifiableColumn[Any]] = [
|
||||
_column(
|
||||
"property_type",
|
||||
"Property Type",
|
||||
_StubColumnClassifier({"semi-detached": PropertyType.HOUSE}),
|
||||
property_type_repo,
|
||||
),
|
||||
_column(
|
||||
"wall_type",
|
||||
"Walls",
|
||||
_StubColumnClassifier({"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}),
|
||||
wall_type_repo,
|
||||
),
|
||||
]
|
||||
|
||||
# act
|
||||
result = _orchestrator(columns).classify_and_persist(addresses, portfolio_id=42)
|
||||
|
||||
# assert: each repo received exactly its own column's mapping, under the
|
||||
# given portfolio_id, and the return value mirrors classify_columns.
|
||||
assert property_type_repo.calls == [(42, {"semi-detached": PropertyType.HOUSE})]
|
||||
assert wall_type_repo.calls == [
|
||||
(42, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED})
|
||||
]
|
||||
assert result == {
|
||||
"property_type": {"semi-detached": PropertyType.HOUSE},
|
||||
"wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED},
|
||||
}
|
||||
|
||||
|
||||
def test_classify_and_persist_skips_upsert_for_a_column_absent_from_the_batch() -> None:
|
||||
# arrange: ``Walls`` is registered but the address has no ``Walls`` column.
|
||||
# The orchestrator should still classify (yielding an empty mapping) but
|
||||
# must NOT call ``upsert_all`` -- an empty bulk insert is a noisy no-op.
|
||||
addresses = AddressList(
|
||||
[_make_unstandardised_address({"Property Type": "semi-detached"})]
|
||||
)
|
||||
property_type_repo = _StubLandlordOverrideRepository()
|
||||
wall_type_repo = _StubLandlordOverrideRepository()
|
||||
columns: list[ClassifiableColumn[Any]] = [
|
||||
_column(
|
||||
"property_type",
|
||||
"Property Type",
|
||||
_StubColumnClassifier({"semi-detached": PropertyType.HOUSE}),
|
||||
property_type_repo,
|
||||
),
|
||||
_column(
|
||||
"wall_type",
|
||||
"Walls",
|
||||
_StubColumnClassifier({}),
|
||||
wall_type_repo,
|
||||
),
|
||||
]
|
||||
|
||||
# act
|
||||
_orchestrator(columns).classify_and_persist(addresses, portfolio_id=7)
|
||||
|
||||
# assert: Property Type wrote; Walls did not.
|
||||
assert property_type_repo.calls == [(7, {"semi-detached": PropertyType.HOUSE})]
|
||||
assert wall_type_repo.calls == []
|
||||
|
||||
|
||||
def test_classify_from_rows_classifies_each_column_without_persisting() -> None:
|
||||
# arrange: raw CSV rows (not an AddressList) carry two classifiable columns.
|
||||
rows = [{"Property Type": "semi-detached", "Walls": "solid brick"}]
|
||||
property_types = _StubColumnClassifier({"semi-detached": PropertyType.HOUSE})
|
||||
wall_types = _StubColumnClassifier(
|
||||
{"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}
|
||||
)
|
||||
property_type_repo = _StubLandlordOverrideRepository()
|
||||
wall_type_repo = _StubLandlordOverrideRepository()
|
||||
|
||||
# act
|
||||
result = _orchestrator(
|
||||
[
|
||||
_column("property_type", "Property Type", property_types, property_type_repo),
|
||||
_column("wall_type", "Walls", wall_types, wall_type_repo),
|
||||
]
|
||||
).classify_from_rows(rows)
|
||||
|
||||
# assert: each classifier ran against its column's descriptions, keyed by
|
||||
# name -- and NOT a single repo was touched (classification is DB-free, so
|
||||
# the slow ChatGPT work can run before any transaction opens).
|
||||
assert result == {
|
||||
"property_type": {"semi-detached": PropertyType.HOUSE},
|
||||
"wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED},
|
||||
}
|
||||
assert property_type_repo.calls == []
|
||||
assert wall_type_repo.calls == []
|
||||
|
||||
|
||||
def test_classify_from_rows_splits_and_normalises_descriptions() -> None:
|
||||
# arrange: one cell packs several descriptions with inconsistent casing,
|
||||
# spread across rows. The rows path must fold them exactly like the
|
||||
# AddressList path: comma-split, trimmed, lower-cased, de-duped.
|
||||
rows = [
|
||||
{"Walls": "Solid Brick, cavity"},
|
||||
{"Walls": "SOLID BRICK"},
|
||||
]
|
||||
wall_types = _StubColumnClassifier({})
|
||||
|
||||
# act
|
||||
_orchestrator(
|
||||
[_column("wall_type", "Walls", wall_types)]
|
||||
).classify_from_rows(rows)
|
||||
|
||||
# assert: the classifier saw one normalised entry per distinct variant.
|
||||
assert wall_types.received == {"solid brick", "cavity"}
|
||||
|
||||
|
||||
def test_classify_from_rows_yields_empty_mapping_for_an_absent_column() -> None:
|
||||
# arrange: a column is registered for a header the rows lack.
|
||||
rows = [{"Walls": "cavity"}]
|
||||
property_types = _StubColumnClassifier({})
|
||||
|
||||
# act
|
||||
result = _orchestrator(
|
||||
[_column("property_type", "Property Type", property_types)]
|
||||
).classify_from_rows(rows)
|
||||
|
||||
# assert: the absent column classified an empty description set.
|
||||
assert result == {"property_type": {}}
|
||||
assert property_types.received == set()
|
||||
|
||||
|
||||
def test_persist_routes_each_columns_mapping_to_its_own_repo() -> None:
|
||||
# arrange: a finished ``classified`` mapping (as classify_* would return)
|
||||
# and two columns with distinct repos.
|
||||
property_type_repo = _StubLandlordOverrideRepository()
|
||||
wall_type_repo = _StubLandlordOverrideRepository()
|
||||
columns: list[ClassifiableColumn[Any]] = [
|
||||
_column("property_type", "Property Type", _StubColumnClassifier({}), property_type_repo),
|
||||
_column("wall_type", "Walls", _StubColumnClassifier({}), wall_type_repo),
|
||||
]
|
||||
classified: dict[str, dict[str, Enum]] = {
|
||||
"property_type": {"semi-detached": PropertyType.HOUSE},
|
||||
"wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED},
|
||||
}
|
||||
|
||||
# act
|
||||
_orchestrator(columns).persist(classified, portfolio_id=42)
|
||||
|
||||
# assert: each repo received exactly its own column's mapping.
|
||||
assert property_type_repo.calls == [(42, {"semi-detached": PropertyType.HOUSE})]
|
||||
assert wall_type_repo.calls == [
|
||||
(42, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED})
|
||||
]
|
||||
|
||||
|
||||
def test_persist_skips_empty_and_missing_mappings() -> None:
|
||||
# arrange: ``property_type`` has an empty mapping; ``wall_type`` is absent
|
||||
# from ``classified`` entirely. Neither should hit the DB -- and the
|
||||
# missing key must not raise (``persist`` reads with ``.get``).
|
||||
property_type_repo = _StubLandlordOverrideRepository()
|
||||
wall_type_repo = _StubLandlordOverrideRepository()
|
||||
columns: list[ClassifiableColumn[Any]] = [
|
||||
_column("property_type", "Property Type", _StubColumnClassifier({}), property_type_repo),
|
||||
_column("wall_type", "Walls", _StubColumnClassifier({}), wall_type_repo),
|
||||
]
|
||||
classified: dict[str, dict[str, Enum]] = {"property_type": {}}
|
||||
|
||||
# act
|
||||
_orchestrator(columns).persist(classified, portfolio_id=7)
|
||||
|
||||
# assert: no upserts at all.
|
||||
assert property_type_repo.calls == []
|
||||
assert wall_type_repo.calls == []
|
||||
|
||||
|
||||
def test_classify_and_persist_from_rows_composes_classify_then_persist() -> None:
|
||||
# arrange: the one-shot rows path must classify AND route to repos, so the
|
||||
# convenience composition stays equivalent to calling the two in sequence.
|
||||
rows = [{"Property Type": "semi-detached", "Walls": "solid brick"}]
|
||||
property_type_repo = _StubLandlordOverrideRepository()
|
||||
wall_type_repo = _StubLandlordOverrideRepository()
|
||||
columns: list[ClassifiableColumn[Any]] = [
|
||||
_column(
|
||||
"property_type",
|
||||
"Property Type",
|
||||
_StubColumnClassifier({"semi-detached": PropertyType.HOUSE}),
|
||||
property_type_repo,
|
||||
),
|
||||
_column(
|
||||
"wall_type",
|
||||
"Walls",
|
||||
_StubColumnClassifier(
|
||||
{"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}
|
||||
),
|
||||
wall_type_repo,
|
||||
),
|
||||
]
|
||||
|
||||
# act
|
||||
result = _orchestrator(columns).classify_and_persist_from_rows(rows, portfolio_id=99)
|
||||
|
||||
# assert: same return shape as classify_from_rows, and each repo wrote once.
|
||||
assert result == {
|
||||
"property_type": {"semi-detached": PropertyType.HOUSE},
|
||||
"wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED},
|
||||
}
|
||||
assert property_type_repo.calls == [(99, {"semi-detached": PropertyType.HOUSE})]
|
||||
assert wall_type_repo.calls == [
|
||||
(99, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED})
|
||||
]
|
||||
|
|
@ -13,13 +13,13 @@ from sqlalchemy import Engine
|
|||
from sqlmodel import Session
|
||||
|
||||
from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
|
||||
from infrastructure.csv_s3_client import CsvS3Client
|
||||
from infrastructure.s3.csv_s3_client import CsvS3Client
|
||||
from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator
|
||||
from orchestration.task_orchestrator import TaskOrchestrator
|
||||
from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository
|
||||
from repositories.tasks.task_postgres_repository import TaskPostgresRepository
|
||||
from repositories.user_address.user_address_csv_s3_repository import (
|
||||
UserAddressCsvS3Repository,
|
||||
from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import (
|
||||
UnstandardisedAddressListCsvS3Repository,
|
||||
)
|
||||
|
||||
BUCKET = "splitter-bucket"
|
||||
|
|
@ -27,7 +27,9 @@ REGION = "us-east-1"
|
|||
|
||||
|
||||
def _make_boto_client(service_name: str) -> Any:
|
||||
factory: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
||||
factory: Any = (
|
||||
boto3.client
|
||||
) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
||||
return factory(service_name, region_name=REGION)
|
||||
|
||||
|
||||
|
|
@ -62,7 +64,7 @@ class Harness:
|
|||
csv_client: CsvS3Client
|
||||
boto_sqs: Any
|
||||
queue_url: str
|
||||
repo: UserAddressCsvS3Repository
|
||||
repo: UnstandardisedAddressListCsvS3Repository
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
@ -76,7 +78,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]:
|
|||
queue_url = cast(str, queue["QueueUrl"])
|
||||
|
||||
csv_client = CsvS3Client(boto_s3, BUCKET)
|
||||
repo = UserAddressCsvS3Repository(csv_client, BUCKET)
|
||||
repo = UnstandardisedAddressListCsvS3Repository(csv_client, BUCKET)
|
||||
queue_client = Address2UprnQueueClient(boto_sqs, queue_url)
|
||||
|
||||
# DB: ephemeral PostgreSQL TaskOrchestrator
|
||||
|
|
@ -89,7 +91,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]:
|
|||
|
||||
splitter = PostcodeSplitterOrchestrator(
|
||||
task_orchestrator=task_orchestrator,
|
||||
user_address_repo=repo,
|
||||
unstandardised_address_repo=repo,
|
||||
queue_client=queue_client,
|
||||
max_batch_size=3,
|
||||
)
|
||||
|
|
@ -169,10 +171,8 @@ def test_split_and_dispatch_creates_three_children_for_fixture(
|
|||
harness: Harness,
|
||||
) -> None:
|
||||
# arrange
|
||||
parent_task, parent_subtask = (
|
||||
harness.task_orchestrator.create_task_with_subtask(
|
||||
task_source="manual:postcode-splitter-int"
|
||||
)
|
||||
parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask(
|
||||
task_source="manual:postcode-splitter-int"
|
||||
)
|
||||
input_uri = _upload_fixture_csv(harness.csv_client)
|
||||
|
||||
|
|
@ -197,10 +197,8 @@ def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri(
|
|||
harness: Harness,
|
||||
) -> None:
|
||||
# arrange
|
||||
parent_task, parent_subtask = (
|
||||
harness.task_orchestrator.create_task_with_subtask(
|
||||
task_source="manual:postcode-splitter-int"
|
||||
)
|
||||
parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask(
|
||||
task_source="manual:postcode-splitter-int"
|
||||
)
|
||||
input_uri = _upload_fixture_csv(harness.csv_client)
|
||||
|
||||
|
|
@ -230,10 +228,8 @@ def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids(
|
|||
harness: Harness,
|
||||
) -> None:
|
||||
# arrange
|
||||
parent_task, parent_subtask = (
|
||||
harness.task_orchestrator.create_task_with_subtask(
|
||||
task_source="manual:postcode-splitter-int"
|
||||
)
|
||||
parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask(
|
||||
task_source="manual:postcode-splitter-int"
|
||||
)
|
||||
input_uri = _upload_fixture_csv(harness.csv_client)
|
||||
|
||||
|
|
@ -267,10 +263,8 @@ def test_split_and_dispatch_returns_child_ids_in_dispatch_order(
|
|||
harness: Harness,
|
||||
) -> None:
|
||||
# arrange
|
||||
parent_task, parent_subtask = (
|
||||
harness.task_orchestrator.create_task_with_subtask(
|
||||
task_source="manual:postcode-splitter-int"
|
||||
)
|
||||
parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask(
|
||||
task_source="manual:postcode-splitter-int"
|
||||
)
|
||||
input_uri = _upload_fixture_csv(harness.csv_client)
|
||||
|
||||
|
|
|
|||
0
tests/repositories/landlord_overrides/__init__.py
Normal file
0
tests/repositories/landlord_overrides/__init__.py
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
"""Integration tests for the consolidated landlord-overrides upsert adapter.
|
||||
|
||||
``LandlordOverridesRepository`` serves every ``landlord_<category>_overrides``
|
||||
table; the table is selected by the ``…Row`` class passed at construction. The
|
||||
source-aware conflict policy lives entirely in SQL (``INSERT ... ON CONFLICT
|
||||
... DO UPDATE ... WHERE existing.source = 'classifier'``), so the only way to
|
||||
verify it correctly distinguishes ``EXCLUDED.source`` from the qualified
|
||||
``<table>.source`` is against a real Postgres -- the ``db_engine`` fixture in
|
||||
``tests/conftest.py`` spins one up per test.
|
||||
|
||||
Each scenario is parametrised across two distinct categories (property type and
|
||||
wall type). Running the shared body against two different ``…Row`` classes
|
||||
proves the adapter writes to whichever table it was handed -- not just that one
|
||||
table happens to work.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterator
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import cast
|
||||
|
||||
import pytest
|
||||
from sqlalchemy import Engine, Table
|
||||
from sqlmodel import Session, SQLModel, select
|
||||
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.wall_type import WallType
|
||||
from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import (
|
||||
LandlordOverridesRepository,
|
||||
)
|
||||
from infrastructure.postgres.landlord_override_enums import OverrideSource
|
||||
from infrastructure.postgres.landlord_property_type_override_table import (
|
||||
LandlordPropertyTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_wall_type_override_table import (
|
||||
LandlordWallTypeOverrideRow,
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Category:
|
||||
"""One ``landlord_<category>_overrides`` table plus a few of its enum values.
|
||||
|
||||
``first``/``second`` are two distinct classifier values for re-upsert
|
||||
tests; ``user_correction`` stands in for a value only a user would pick;
|
||||
``unknown`` is the category's ``UNKNOWN`` member.
|
||||
"""
|
||||
|
||||
id: str
|
||||
row_type: type[SQLModel]
|
||||
first: Enum
|
||||
second: Enum
|
||||
user_correction: Enum
|
||||
unknown: Enum
|
||||
|
||||
|
||||
CATEGORIES = [
|
||||
Category(
|
||||
id="property_type",
|
||||
row_type=LandlordPropertyTypeOverrideRow,
|
||||
first=PropertyType.FLAT,
|
||||
second=PropertyType.HOUSE,
|
||||
user_correction=PropertyType.BUNGALOW,
|
||||
unknown=PropertyType.UNKNOWN,
|
||||
),
|
||||
Category(
|
||||
id="wall_type",
|
||||
row_type=LandlordWallTypeOverrideRow,
|
||||
first=WallType.CAVITY_FILLED,
|
||||
second=WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED,
|
||||
user_correction=WallType.SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED,
|
||||
unknown=WallType.UNKNOWN,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def session(db_engine: Engine) -> Iterator[Session]:
|
||||
with Session(db_engine) as s:
|
||||
yield s
|
||||
|
||||
|
||||
@pytest.fixture(params=CATEGORIES, ids=lambda c: c.id)
|
||||
def category(request: pytest.FixtureRequest) -> Category:
|
||||
return request.param
|
||||
|
||||
|
||||
def _select_row(
|
||||
session: Session, row_type: type[SQLModel], portfolio_id: int, description: str
|
||||
) -> SQLModel:
|
||||
# The row class is only known as ``type[SQLModel]`` here, so its column
|
||||
# descriptors aren't typed; filter via the Core ``Table`` columns instead.
|
||||
table = cast(Table, getattr(row_type, "__table__"))
|
||||
rows = session.exec(
|
||||
select(row_type).where(
|
||||
table.c.portfolio_id == portfolio_id,
|
||||
table.c.description == description,
|
||||
)
|
||||
).all()
|
||||
assert len(rows) == 1, f"expected exactly one row, got {len(rows)}"
|
||||
return rows[0]
|
||||
|
||||
|
||||
def test_inserts_a_fresh_row_with_source_classifier(
|
||||
session: Session, category: Category
|
||||
) -> None:
|
||||
# arrange
|
||||
repo = LandlordOverridesRepository[Enum](session, category.row_type)
|
||||
|
||||
# act
|
||||
repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.first})
|
||||
session.commit()
|
||||
|
||||
# assert
|
||||
row = _select_row(session, category.row_type, portfolio_id=1, description="cosy")
|
||||
assert getattr(row, "value") is category.first
|
||||
assert getattr(row, "source") == OverrideSource.CLASSIFIER
|
||||
|
||||
|
||||
def test_reupsert_overwrites_a_classifier_row(
|
||||
session: Session, category: Category
|
||||
) -> None:
|
||||
# arrange: a stale classifier row exists.
|
||||
repo = LandlordOverridesRepository[Enum](session, category.row_type)
|
||||
repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.first})
|
||||
session.commit()
|
||||
|
||||
# act: re-classify with a different value.
|
||||
repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.second})
|
||||
session.commit()
|
||||
|
||||
# assert: the new classification wins.
|
||||
row = _select_row(session, category.row_type, portfolio_id=1, description="cosy")
|
||||
assert getattr(row, "value") is category.second
|
||||
assert getattr(row, "source") == OverrideSource.CLASSIFIER
|
||||
|
||||
|
||||
def test_reupsert_does_not_overwrite_a_user_row(
|
||||
session: Session, category: Category
|
||||
) -> None:
|
||||
# arrange: a user has corrected the row. The classifier path never produces
|
||||
# ``source = 'user'``; we install the row directly to mimic the override
|
||||
# frontend.
|
||||
user_row = category.row_type(
|
||||
portfolio_id=1,
|
||||
description="cosy",
|
||||
value=category.user_correction,
|
||||
source=OverrideSource.USER,
|
||||
)
|
||||
session.add(user_row)
|
||||
session.commit()
|
||||
|
||||
# act: the classifier re-runs and tries to reclassify the same description.
|
||||
# Under the source-aware conflict policy, this must be silently skipped --
|
||||
# user edits beat classifier reruns.
|
||||
repo = LandlordOverridesRepository[Enum](session, category.row_type)
|
||||
repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.second})
|
||||
session.commit()
|
||||
|
||||
# assert: the user row is unchanged.
|
||||
row = _select_row(session, category.row_type, portfolio_id=1, description="cosy")
|
||||
assert getattr(row, "value") is category.user_correction
|
||||
assert getattr(row, "source") == OverrideSource.USER
|
||||
|
||||
|
||||
def test_upsert_keeps_other_portfolios_descriptions_independent(
|
||||
session: Session, category: Category
|
||||
) -> None:
|
||||
# arrange / act: the unique key is ``(portfolio_id, description)``, so the
|
||||
# same description for two different portfolios must coexist as two rows.
|
||||
repo = LandlordOverridesRepository[Enum](session, category.row_type)
|
||||
repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": category.first})
|
||||
repo.upsert_all(portfolio_id=2, descriptions_to_values={"cosy": category.second})
|
||||
session.commit()
|
||||
|
||||
# assert: both rows survive with their own values.
|
||||
assert (
|
||||
getattr(_select_row(session, category.row_type, 1, "cosy"), "value")
|
||||
is category.first
|
||||
)
|
||||
assert (
|
||||
getattr(_select_row(session, category.row_type, 2, "cosy"), "value")
|
||||
is category.second
|
||||
)
|
||||
|
||||
|
||||
def test_upsert_persists_unknown_so_a_user_can_resolve_it_later(
|
||||
session: Session, category: Category
|
||||
) -> None:
|
||||
# arrange / act: a description the classifier couldn't resolve still lands
|
||||
# -- per ADR-0002 §5 / ADR-0003 §Decision, so a future user override can
|
||||
# upgrade it to a real value.
|
||||
repo = LandlordOverridesRepository[Enum](session, category.row_type)
|
||||
repo.upsert_all(
|
||||
portfolio_id=1,
|
||||
descriptions_to_values={"unparseable nonsense": category.unknown},
|
||||
)
|
||||
session.commit()
|
||||
|
||||
# assert: the row exists with value=UNKNOWN, source=classifier.
|
||||
row = _select_row(
|
||||
session, category.row_type, portfolio_id=1, description="unparseable nonsense"
|
||||
)
|
||||
assert getattr(row, "value") is category.unknown
|
||||
assert getattr(row, "source") == OverrideSource.CLASSIFIER
|
||||
|
||||
|
||||
def test_upsert_all_with_empty_mapping_is_a_no_op(
|
||||
session: Session, category: Category
|
||||
) -> None:
|
||||
# arrange / act
|
||||
repo = LandlordOverridesRepository[Enum](session, category.row_type)
|
||||
repo.upsert_all(portfolio_id=1, descriptions_to_values={})
|
||||
session.commit()
|
||||
|
||||
# assert: nothing was inserted.
|
||||
rows = session.exec(select(category.row_type)).all()
|
||||
assert rows == []
|
||||
0
tests/repositories/unstandardised_address/__init__.py
Normal file
0
tests/repositories/unstandardised_address/__init__.py
Normal file
|
|
@ -3,11 +3,11 @@ from collections.abc import Iterator
|
|||
import pytest
|
||||
from moto import mock_aws
|
||||
|
||||
from domain.addresses.user_address import UserAddress
|
||||
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
|
||||
from domain.postcode import Postcode
|
||||
from infrastructure.csv_s3_client import CsvS3Client
|
||||
from repositories.user_address.user_address_csv_s3_repository import (
|
||||
UserAddressCsvS3Repository,
|
||||
from infrastructure.s3.csv_s3_client import CsvS3Client
|
||||
from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import (
|
||||
UnstandardisedAddressListCsvS3Repository,
|
||||
)
|
||||
from tests.infrastructure import make_boto_client
|
||||
|
||||
|
|
@ -15,22 +15,22 @@ BUCKET = "user-address-bucket"
|
|||
|
||||
|
||||
@pytest.fixture
|
||||
def repo() -> Iterator[UserAddressCsvS3Repository]:
|
||||
def repo() -> Iterator[UnstandardisedAddressListCsvS3Repository]:
|
||||
with mock_aws():
|
||||
boto_client = make_boto_client("s3")
|
||||
boto_client.create_bucket(Bucket=BUCKET)
|
||||
csv_client = CsvS3Client(boto_client, BUCKET)
|
||||
yield UserAddressCsvS3Repository(csv_client, BUCKET)
|
||||
yield UnstandardisedAddressListCsvS3Repository(csv_client, BUCKET)
|
||||
|
||||
|
||||
def _upload_csv(
|
||||
repo: UserAddressCsvS3Repository, rows: list[dict[str, str]], key: str
|
||||
repo: UnstandardisedAddressListCsvS3Repository, rows: list[dict[str, str]], key: str
|
||||
) -> str:
|
||||
return repo._csv_client.save_rows(rows, key) # pyright: ignore[reportPrivateUsage]
|
||||
|
||||
|
||||
def test_load_batch_parses_address_postcode_and_reference(
|
||||
repo: UserAddressCsvS3Repository,
|
||||
repo: UnstandardisedAddressListCsvS3Repository,
|
||||
) -> None:
|
||||
# arrange
|
||||
rows = [
|
||||
|
|
@ -50,13 +50,13 @@ def test_load_batch_parses_address_postcode_and_reference(
|
|||
# assert
|
||||
assert len(addresses) == 1
|
||||
address = addresses[0]
|
||||
assert address.user_address == "1 High Street, Flat 2, Townville"
|
||||
assert address.address == "1 High Street, Flat 2, Townville"
|
||||
assert address.postcode == Postcode("SW1A1AA")
|
||||
assert address.internal_reference == "REF-001"
|
||||
assert address.org_reference == "REF-001"
|
||||
|
||||
|
||||
def test_load_batch_uses_only_address_1_when_others_missing(
|
||||
repo: UserAddressCsvS3Repository,
|
||||
repo: UnstandardisedAddressListCsvS3Repository,
|
||||
) -> None:
|
||||
# arrange
|
||||
rows = [
|
||||
|
|
@ -75,13 +75,13 @@ def test_load_batch_uses_only_address_1_when_others_missing(
|
|||
|
||||
# assert
|
||||
assert len(addresses) == 1
|
||||
assert addresses[0].user_address == "10 Cardiff Road"
|
||||
assert addresses[0].address == "10 Cardiff Road"
|
||||
assert addresses[0].postcode == Postcode("CF101AA")
|
||||
assert addresses[0].internal_reference == "REF-002"
|
||||
assert addresses[0].org_reference == "REF-002"
|
||||
|
||||
|
||||
def test_load_batch_handles_missing_internal_reference(
|
||||
repo: UserAddressCsvS3Repository,
|
||||
repo: UnstandardisedAddressListCsvS3Repository,
|
||||
) -> None:
|
||||
# arrange
|
||||
rows = [
|
||||
|
|
@ -100,16 +100,16 @@ def test_load_batch_handles_missing_internal_reference(
|
|||
|
||||
# assert
|
||||
assert len(addresses) == 1
|
||||
assert addresses[0].user_address == "5 Park Lane"
|
||||
assert addresses[0].address == "5 Park Lane"
|
||||
assert addresses[0].postcode == Postcode("M11AA")
|
||||
assert addresses[0].internal_reference is None
|
||||
assert addresses[0].org_reference is None
|
||||
|
||||
|
||||
def test_load_batch_captures_full_source_row(
|
||||
repo: UserAddressCsvS3Repository,
|
||||
repo: UnstandardisedAddressListCsvS3Repository,
|
||||
) -> None:
|
||||
# A raw EPC-export-shaped row: the splitter must preserve every column,
|
||||
# not just the ones it parses into UserAddress fields.
|
||||
# not just the ones it parses into UnstandardisedAddress fields.
|
||||
# arrange
|
||||
row = {
|
||||
"Asset Reference": "511",
|
||||
|
|
@ -124,11 +124,11 @@ def test_load_batch_captures_full_source_row(
|
|||
addresses = repo.load_batch(uri)
|
||||
|
||||
# assert
|
||||
assert addresses[0].source_row == row
|
||||
assert addresses[0].additional_info == row
|
||||
|
||||
|
||||
def test_load_batch_raises_when_postcode_column_absent(
|
||||
repo: UserAddressCsvS3Repository,
|
||||
repo: UnstandardisedAddressListCsvS3Repository,
|
||||
) -> None:
|
||||
# arrange
|
||||
rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}]
|
||||
|
|
@ -140,7 +140,7 @@ def test_load_batch_raises_when_postcode_column_absent(
|
|||
|
||||
|
||||
def test_save_batch_passes_through_all_columns_and_appends_postcode_clean(
|
||||
repo: UserAddressCsvS3Repository,
|
||||
repo: UnstandardisedAddressListCsvS3Repository,
|
||||
) -> None:
|
||||
# arrange
|
||||
row = {
|
||||
|
|
@ -154,7 +154,9 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean(
|
|||
|
||||
# act
|
||||
saved_uri = repo.save_batch(addresses, "tasks/passthrough")
|
||||
saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage]
|
||||
saved_rows = repo._csv_client.read_rows(
|
||||
saved_uri
|
||||
) # pyright: ignore[reportPrivateUsage]
|
||||
|
||||
# assert
|
||||
assert len(saved_rows) == 1
|
||||
|
|
@ -167,16 +169,21 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean(
|
|||
|
||||
|
||||
def test_save_batch_returns_uri_under_path_prefix(
|
||||
repo: UserAddressCsvS3Repository,
|
||||
repo: UnstandardisedAddressListCsvS3Repository,
|
||||
) -> None:
|
||||
# arrange
|
||||
addresses = [
|
||||
UserAddress(
|
||||
user_address="1 High Street",
|
||||
postcode=Postcode("SW1A 1AA"),
|
||||
source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
|
||||
),
|
||||
]
|
||||
addresses = AddressList(
|
||||
[
|
||||
UnstandardisedAddress(
|
||||
address="1 High Street",
|
||||
postcode=Postcode("SW1A 1AA"),
|
||||
additional_info={
|
||||
"Address 1": "1 High Street",
|
||||
"postcode": "SW1A 1AA",
|
||||
},
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# act
|
||||
uri = repo.save_batch(addresses, "tasks/abc/batches")
|
||||
|
|
@ -187,7 +194,7 @@ def test_save_batch_returns_uri_under_path_prefix(
|
|||
|
||||
|
||||
def test_save_then_reload_round_trip_preserves_columns(
|
||||
repo: UserAddressCsvS3Repository,
|
||||
repo: UnstandardisedAddressListCsvS3Repository,
|
||||
) -> None:
|
||||
# arrange
|
||||
rows = [
|
||||
|
|
@ -207,7 +214,9 @@ def test_save_then_reload_round_trip_preserves_columns(
|
|||
|
||||
# act
|
||||
saved_uri = repo.save_batch(addresses, "tasks/round-trip")
|
||||
saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage]
|
||||
saved_rows = repo._csv_client.read_rows(
|
||||
saved_uri
|
||||
) # pyright: ignore[reportPrivateUsage]
|
||||
|
||||
# assert
|
||||
# Original columns come back verbatim; postcode_clean is the only addition.
|
||||
|
|
@ -218,16 +227,21 @@ def test_save_then_reload_round_trip_preserves_columns(
|
|||
|
||||
|
||||
def test_save_batch_uses_unique_filename_per_call(
|
||||
repo: UserAddressCsvS3Repository,
|
||||
repo: UnstandardisedAddressListCsvS3Repository,
|
||||
) -> None:
|
||||
# arrange
|
||||
addresses = [
|
||||
UserAddress(
|
||||
user_address="1 High Street",
|
||||
postcode=Postcode("SW1A 1AA"),
|
||||
source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
|
||||
),
|
||||
]
|
||||
addresses = AddressList(
|
||||
[
|
||||
UnstandardisedAddress(
|
||||
address="1 High Street",
|
||||
postcode=Postcode("SW1A 1AA"),
|
||||
additional_info={
|
||||
"Address 1": "1 High Street",
|
||||
"postcode": "SW1A 1AA",
|
||||
},
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# act
|
||||
uri_1 = repo.save_batch(addresses, "tasks/uniqueness")
|
||||
267
tests/test_lambda_packaging.py
Normal file
267
tests/test_lambda_packaging.py
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
"""Static packaging linter for Lambda container images.
|
||||
|
||||
Every Lambda here ships as a Docker image that copies a *subset* of the repo
|
||||
(``COPY utils/ utils/``, ``COPY backend/ backend/``, ...) and then runs a
|
||||
handler via ``CMD ["<module>.handler"]``. If the handler's import graph reaches
|
||||
a top-level package the Dockerfile forgot to ``COPY``, the function dies at cold
|
||||
start with ``Runtime.ImportModuleError: No module named '<pkg>'`` — but only
|
||||
*inside the image*. In the dev/test tree every package is present, so a plain
|
||||
``import`` test can't see the gap. This is exactly how ``No module named
|
||||
'domain'`` reached a deployed address2UPRN.
|
||||
|
||||
The RIE smoke tests (.github/workflows/_smoke_test_lambda.yml) catch this too,
|
||||
but only by building the full image (minutes) and only for hand-listed services.
|
||||
This test catches the same class of bug in milliseconds, locally, for *every*
|
||||
handler Dockerfile — by statically computing each handler's import-time module
|
||||
graph and asserting every repo file it reaches is copied into the image.
|
||||
|
||||
Scope: import-time (module-level) imports only — the ones that run at Lambda
|
||||
init, which is what ImportModuleError is about. Imports inside function bodies
|
||||
and under ``if TYPE_CHECKING:`` are deliberately ignored. Third-party / stdlib
|
||||
imports are out of scope (that's requirements.txt's job, covered by the RIE
|
||||
smoke test actually installing and importing).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
# Dockerfiles that are not Lambda handlers (test harness, dev containers).
|
||||
_SKIP_DOCKERFILES = {"Dockerfile.test", "Dockerfile.test.dockerignore"}
|
||||
_SKIP_PARTS = {".git", "node_modules", ".devcontainer"}
|
||||
|
||||
|
||||
def _toplevel_names() -> set[str]:
|
||||
"""Top-level repo packages/modules — the namespace handler imports resolve
|
||||
against (imports are absolute: ``domain.x``, ``backend.y``)."""
|
||||
names: set[str] = set()
|
||||
for p in REPO_ROOT.iterdir():
|
||||
if p.name.startswith(".") or p.name == "__pycache__":
|
||||
continue
|
||||
if p.is_dir():
|
||||
names.add(p.name)
|
||||
elif p.suffix == ".py":
|
||||
names.add(p.stem)
|
||||
return names
|
||||
|
||||
|
||||
_TOP = _toplevel_names()
|
||||
|
||||
|
||||
def _is_type_checking(test: ast.expr) -> bool:
|
||||
if isinstance(test, ast.Name):
|
||||
return test.id == "TYPE_CHECKING"
|
||||
if isinstance(test, ast.Attribute):
|
||||
return test.attr == "TYPE_CHECKING"
|
||||
return False
|
||||
|
||||
|
||||
def _import_time_imports(path: Path) -> list[str]:
|
||||
"""Absolute module names imported when ``path`` is imported (i.e. at Lambda
|
||||
init). Descends into module-level if/try/with and class bodies, but not into
|
||||
function bodies (lazy) or ``if TYPE_CHECKING:`` blocks (never executed)."""
|
||||
try:
|
||||
tree = ast.parse(path.read_text(encoding="utf-8"), str(path))
|
||||
except (SyntaxError, UnicodeDecodeError):
|
||||
return []
|
||||
out: list[str] = []
|
||||
|
||||
def visit(stmts: list[ast.stmt]) -> None:
|
||||
for node in stmts:
|
||||
if isinstance(node, ast.Import):
|
||||
out.extend(alias.name for alias in node.names)
|
||||
elif isinstance(node, ast.ImportFrom):
|
||||
if not node.level and node.module: # absolute imports only
|
||||
out.append(node.module)
|
||||
elif isinstance(node, ast.If):
|
||||
if _is_type_checking(node.test):
|
||||
continue
|
||||
visit(node.body)
|
||||
visit(node.orelse)
|
||||
elif isinstance(node, ast.Try):
|
||||
visit(node.body)
|
||||
visit(node.orelse)
|
||||
visit(node.finalbody)
|
||||
for handler in node.handlers:
|
||||
visit(handler.body)
|
||||
elif isinstance(node, ast.With):
|
||||
visit(node.body)
|
||||
elif isinstance(node, ast.ClassDef):
|
||||
visit(node.body)
|
||||
# FunctionDef / AsyncFunctionDef bodies are intentionally skipped.
|
||||
|
||||
visit(tree.body)
|
||||
return out
|
||||
|
||||
|
||||
def _module_to_file(module: str) -> Optional[Path]:
|
||||
"""Resolve a dotted module to its repo source file (``foo.bar`` ->
|
||||
``foo/bar.py`` or ``foo/bar/__init__.py``)."""
|
||||
base = REPO_ROOT.joinpath(*module.split("."))
|
||||
py = base.with_suffix(".py")
|
||||
if py.is_file():
|
||||
return py
|
||||
init = base / "__init__.py"
|
||||
if init.is_file():
|
||||
return init
|
||||
return None
|
||||
|
||||
|
||||
def _import_closure(start: Path) -> dict[Path, Optional[Path]]:
|
||||
"""Repo files reachable from ``start`` via import-time imports, mapped to the
|
||||
first file that imported each (for blame in failure messages)."""
|
||||
reached: dict[Path, Optional[Path]] = {}
|
||||
stack: list[tuple[Path, Optional[Path]]] = [(start, None)]
|
||||
while stack:
|
||||
path, importer = stack.pop()
|
||||
if path in reached:
|
||||
continue
|
||||
reached[path] = importer
|
||||
for module in _import_time_imports(path):
|
||||
if module.split(".")[0] not in _TOP:
|
||||
continue # stdlib / third-party — not our concern here
|
||||
target = _module_to_file(module)
|
||||
if target is not None and target not in reached:
|
||||
stack.append((target, path))
|
||||
return reached
|
||||
|
||||
|
||||
def _norm(path_token: str) -> str:
|
||||
return path_token.lstrip("./").rstrip("/")
|
||||
|
||||
|
||||
def _parse_handler_spec(dockerfile_text: str) -> Optional[str]:
|
||||
"""The ``<module>.handler`` string from the ``CMD`` line, or None if this
|
||||
isn't a Lambda handler image."""
|
||||
match = re.search(r"^CMD\s+(\[.*\]|.+)$", dockerfile_text, re.MULTILINE)
|
||||
if not match:
|
||||
return None
|
||||
raw = match.group(1).strip()
|
||||
try:
|
||||
parsed = json.loads(raw)
|
||||
spec = parsed[0] if isinstance(parsed, list) and parsed else raw
|
||||
except json.JSONDecodeError:
|
||||
spec = raw.strip('"')
|
||||
return spec if isinstance(spec, str) and spec.endswith(".handler") else None
|
||||
|
||||
|
||||
def _parse_copies(dockerfile_text: str) -> list[tuple[list[str], str]]:
|
||||
"""``COPY`` instructions as (sources, dest), dropping ``--flag`` tokens."""
|
||||
copies: list[tuple[list[str], str]] = []
|
||||
for match in re.finditer(r"^COPY\s+(.+)$", dockerfile_text, re.MULTILINE):
|
||||
tokens = [t for t in match.group(1).split() if not t.startswith("--")]
|
||||
if len(tokens) < 2:
|
||||
continue
|
||||
*sources, dest = tokens
|
||||
copies.append((sources, dest))
|
||||
return copies
|
||||
|
||||
|
||||
def _resolve_handler_file(
|
||||
spec: str, copies: list[tuple[list[str], str]]
|
||||
) -> Optional[Path]:
|
||||
"""Map a handler spec to its repo source file.
|
||||
|
||||
Handles both in-place layouts (``backend.foo.handler`` -> ``backend/foo/
|
||||
handler.py``, present via ``COPY backend/ backend/``) and root-placed
|
||||
handlers (``main.handler`` where a ``COPY <src> /var/task/main.py`` or
|
||||
``COPY <src>/main.py .`` puts the file at the image root)."""
|
||||
module_path, _func = spec.rsplit(".", 1)
|
||||
|
||||
direct = REPO_ROOT / (module_path.replace(".", "/") + ".py")
|
||||
if direct.is_file():
|
||||
return direct
|
||||
|
||||
# Root-placed module: find the COPY whose destination basename matches.
|
||||
wanted = module_path.split("/")[-1] + ".py"
|
||||
for sources, dest in copies:
|
||||
dest_norm = _norm(dest)
|
||||
dest_is_named_file = Path(dest_norm).name == wanted
|
||||
dest_is_dir = dest_norm in ("", module_path.split("/")[-1]) or dest.endswith("/")
|
||||
for src in sources:
|
||||
src_path = REPO_ROOT / _norm(src)
|
||||
if not src_path.is_file():
|
||||
continue
|
||||
if dest_is_named_file or (dest_is_dir and src_path.name == wanted):
|
||||
return src_path
|
||||
return None
|
||||
|
||||
|
||||
def _is_copied(rel_path: str, copies: list[tuple[list[str], str]]) -> bool:
|
||||
"""Whether a repo-relative file path lands in the image via some COPY."""
|
||||
rel_path = _norm(rel_path)
|
||||
for sources, _dest in copies:
|
||||
for src in sources:
|
||||
src_norm = _norm(src)
|
||||
if src_norm == "" or src_norm == rel_path or rel_path.startswith(src_norm + "/"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _discover_handler_dockerfiles() -> list[Path]:
|
||||
found: list[Path] = []
|
||||
for path in REPO_ROOT.rglob("*Dockerfile*"):
|
||||
if path.name in _SKIP_DOCKERFILES:
|
||||
continue
|
||||
if any(part in _SKIP_PARTS for part in path.relative_to(REPO_ROOT).parts):
|
||||
continue
|
||||
try:
|
||||
text = path.read_text(encoding="utf-8")
|
||||
except OSError:
|
||||
continue
|
||||
if _parse_handler_spec(text):
|
||||
found.append(path)
|
||||
return sorted(found)
|
||||
|
||||
|
||||
_HANDLER_DOCKERFILES = _discover_handler_dockerfiles()
|
||||
|
||||
|
||||
def test_handler_dockerfiles_discovered() -> None:
|
||||
"""Guard against the discovery silently finding nothing (e.g. a refactor
|
||||
that renames Dockerfiles), which would make every check below vacuous."""
|
||||
assert _HANDLER_DOCKERFILES, "no Lambda handler Dockerfiles found under repo root"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dockerfile",
|
||||
_HANDLER_DOCKERFILES,
|
||||
ids=[str(p.relative_to(REPO_ROOT)) for p in _HANDLER_DOCKERFILES],
|
||||
)
|
||||
def test_lambda_image_copies_full_import_closure(dockerfile: Path) -> None:
|
||||
"""Every repo file the handler imports at init must be COPYed into the image."""
|
||||
text = dockerfile.read_text(encoding="utf-8")
|
||||
spec = _parse_handler_spec(text)
|
||||
assert spec is not None # discovery guaranteed this
|
||||
copies = _parse_copies(text)
|
||||
|
||||
handler_file = _resolve_handler_file(spec, copies)
|
||||
assert handler_file is not None, (
|
||||
f"{dockerfile.relative_to(REPO_ROOT)}: could not locate the source file "
|
||||
f"for CMD handler {spec!r}. Update _resolve_handler_file if this is a new "
|
||||
f"handler layout."
|
||||
)
|
||||
|
||||
missing: list[str] = []
|
||||
for reached, importer in _import_closure(handler_file).items():
|
||||
rel = str(reached.relative_to(REPO_ROOT))
|
||||
if not _is_copied(rel, copies):
|
||||
blame = (
|
||||
str(importer.relative_to(REPO_ROOT)) if importer else "(handler entrypoint)"
|
||||
)
|
||||
missing.append(f" - {rel}\n imported by {blame}")
|
||||
|
||||
assert not missing, (
|
||||
f"{dockerfile.relative_to(REPO_ROOT)} runs `{spec}` but does not COPY "
|
||||
f"{len(missing)} file(s) it imports at init. The Lambda will fail at cold "
|
||||
f"start with Runtime.ImportModuleError. Add the missing top-level "
|
||||
f"package(s) as `COPY <pkg>/ <pkg>/`:\n" + "\n".join(sorted(missing))
|
||||
)
|
||||
Loading…
Add table
Reference in a new issue