added roofs

This commit is contained in:
Jun-te Kim 2026-05-26 16:18:26 +00:00
parent 8422041215
commit 36f4c32904
8 changed files with 378 additions and 31 deletions

View file

@ -11,7 +11,11 @@ from applications.landlord_description_overrides.landlord_description_overrides_
from domain.addresses.unstandardised_address import AddressList
from domain.landlord_description_overrides.built_form_type import BuiltFormType
from domain.landlord_description_overrides.property_type import PropertyType
from domain.landlord_description_overrides.roof_type import RoofType
from domain.landlord_description_overrides.wall_type import WallType
from domain.landlord_description_overrides.wall_type_construction_dates import (
wall_type_construction_date_prompt_hint,
)
from infrastructure.chatgpt.chatgpt import ChatGPT
from infrastructure.chatgpt.chatgpt_column_classifier import ChatGptColumnClassifier
from infrastructure.postgres.config import PostgresConfig
@ -22,6 +26,9 @@ from infrastructure.postgres.landlord_built_form_type_override_postgres_reposito
from infrastructure.postgres.landlord_property_type_override_postgres_repository import (
LandlordPropertyTypeOverridePostgresRepository,
)
from infrastructure.postgres.landlord_roof_type_override_postgres_repository import (
LandlordRoofTypeOverridePostgresRepository,
)
from infrastructure.postgres.landlord_wall_type_override_postgres_repository import (
LandlordWallTypeOverridePostgresRepository,
)
@ -98,10 +105,21 @@ def handler(
name="wall_type",
source_column="Walls",
classifier=ChatGptColumnClassifier(
chat_gpt, WallType, WallType.UNKNOWN
chat_gpt,
WallType,
WallType.UNKNOWN,
extra_instructions=wall_type_construction_date_prompt_hint(),
),
repo=LandlordWallTypeOverridePostgresRepository(session),
),
ClassifiableColumn(
name="roof_type",
source_column="Roofs",
classifier=ChatGptColumnClassifier(
chat_gpt, RoofType, RoofType.UNKNOWN
),
repo=LandlordRoofTypeOverridePostgresRepository(session),
),
]
orchestrator = LandlordDescriptionOverridesOrchestrator(

View file

@ -13,40 +13,83 @@ class WallType(Enum):
"""
CAVITY_FILLED = "Cavity wall, filled cavity"
CAVITY_AS_BUILT_INSULATED_ASSUMED = "Cavity wall, as built, insulated (assumed)"
CAVITY_AS_BUILT_NO_INSULATION_ASSUMED = "Cavity wall, as built, no insulation (assumed)"
CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Cavity wall, as built, partial insulation (assumed)"
CAVITY_AS_BUILT_INSULATED_ASSUMED = (
"Cavity wall, as built, insulated (assumed)" # 1983 - 1990
)
CAVITY_AS_BUILT_NO_INSULATION_ASSUMED = (
"Cavity wall, as built, no insulation (assumed)" # Pre-1975
)
CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"Cavity wall, as built, partial insulation (assumed)" # 1976 - 1982
)
CAVITY_WITH_INTERNAL_INSULATION = "Cavity wall, with internal insulation"
CAVITY_WITH_EXTERNAL_INSULATION = "Cavity wall, with external insulation"
CAVITY_FILLED_AND_INTERNAL_INSULATION = "Cavity wall, filled cavity and internal insulation"
CAVITY_FILLED_AND_EXTERNAL_INSULATION = "Cavity wall, filled cavity and external insulation"
CAVITY_FILLED_AND_INTERNAL_INSULATION = (
"Cavity wall, filled cavity and internal insulation"
)
CAVITY_FILLED_AND_EXTERNAL_INSULATION = (
"Cavity wall, filled cavity and external insulation"
)
SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED = "Solid brick, as built, no insulation (assumed)"
SOLID_BRICK_AS_BUILT_INSULATED_ASSUMED = "Solid brick, as built, insulated (assumed)"
SOLID_BRICK_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Solid brick, as built, partial insulation (assumed)"
SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED = (
"Solid brick, as built, no insulation (assumed)"
)
SOLID_BRICK_AS_BUILT_INSULATED_ASSUMED = (
"Solid brick, as built, insulated (assumed)"
)
SOLID_BRICK_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"Solid brick, as built, partial insulation (assumed)"
)
SOLID_BRICK_WITH_INTERNAL_INSULATION = "Solid brick, with internal insulation"
SOLID_BRICK_WITH_EXTERNAL_INSULATION = "Solid brick, with external insulation"
TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED = "Timber frame, as built, no insulation (assumed)"
TIMBER_FRAME_AS_BUILT_INSULATED_ASSUMED = "Timber frame, as built, insulated (assumed)"
TIMBER_FRAME_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Timber frame, as built, partial insulation (assumed)"
TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED = (
"Timber frame, as built, no insulation (assumed)"
)
TIMBER_FRAME_AS_BUILT_INSULATED_ASSUMED = (
"Timber frame, as built, insulated (assumed)"
)
TIMBER_FRAME_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"Timber frame, as built, partial insulation (assumed)"
)
TIMBER_FRAME_WITH_ADDITIONAL_INSULATION = "Timber frame, with additional insulation"
SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED = "Sandstone, as built, no insulation (assumed)"
SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED = (
"Sandstone, as built, no insulation (assumed)"
)
SANDSTONE_AS_BUILT_INSULATED_ASSUMED = "Sandstone, as built, insulated (assumed)"
SANDSTONE_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Sandstone, as built, partial insulation (assumed)"
SANDSTONE_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"Sandstone, as built, partial insulation (assumed)"
)
SANDSTONE_WITH_INTERNAL_INSULATION = "Sandstone, with internal insulation"
SANDSTONE_WITH_EXTERNAL_INSULATION = "Sandstone, with external insulation"
GRANITE_OR_WHIN_AS_BUILT_NO_INSULATION_ASSUMED = "Granite or whin, as built, no insulation (assumed)"
GRANITE_OR_WHIN_AS_BUILT_INSULATED_ASSUMED = "Granite or whin, as built, insulated (assumed)"
GRANITE_OR_WHIN_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Granite or whin, as built, partial insulation (assumed)"
GRANITE_OR_WHIN_WITH_INTERNAL_INSULATION = "Granite or whin, with internal insulation"
GRANITE_OR_WHIN_WITH_EXTERNAL_INSULATION = "Granite or whin, with external insulation"
GRANITE_OR_WHIN_AS_BUILT_NO_INSULATION_ASSUMED = (
"Granite or whin, as built, no insulation (assumed)"
)
GRANITE_OR_WHIN_AS_BUILT_INSULATED_ASSUMED = (
"Granite or whin, as built, insulated (assumed)"
)
GRANITE_OR_WHIN_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"Granite or whin, as built, partial insulation (assumed)"
)
GRANITE_OR_WHIN_WITH_INTERNAL_INSULATION = (
"Granite or whin, with internal insulation"
)
GRANITE_OR_WHIN_WITH_EXTERNAL_INSULATION = (
"Granite or whin, with external insulation"
)
SYSTEM_BUILT_AS_BUILT_NO_INSULATION_ASSUMED = "System built, as built, no insulation (assumed)"
SYSTEM_BUILT_AS_BUILT_INSULATED_ASSUMED = "System built, as built, insulated (assumed)"
SYSTEM_BUILT_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "System built, as built, partial insulation (assumed)"
SYSTEM_BUILT_AS_BUILT_NO_INSULATION_ASSUMED = (
"System built, as built, no insulation (assumed)"
)
SYSTEM_BUILT_AS_BUILT_INSULATED_ASSUMED = (
"System built, as built, insulated (assumed)"
)
SYSTEM_BUILT_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
"System built, as built, partial insulation (assumed)"
)
SYSTEM_BUILT_WITH_INTERNAL_INSULATION = "System built, with internal insulation"
SYSTEM_BUILT_WITH_EXTERNAL_INSULATION = "System built, with external insulation"
@ -59,8 +102,12 @@ class WallType(Enum):
COB_WITH_EXTERNAL_INSULATION = "Cob, with external insulation"
CURTAIN_WALL = "Curtain wall"
CURTAIN_WALL_AS_BUILT_NO_INSULATION_ASSUMED = "Curtain Wall, as built, no insulation (assumed)"
CURTAIN_WALL_AS_BUILT_INSULATED_ASSUMED = "Curtain Wall, as built, insulated (assumed)"
CURTAIN_WALL_AS_BUILT_NO_INSULATION_ASSUMED = (
"Curtain Wall, as built, no insulation (assumed)"
)
CURTAIN_WALL_AS_BUILT_INSULATED_ASSUMED = (
"Curtain Wall, as built, insulated (assumed)"
)
CURTAIN_WALL_FILLED = "Curtain Wall, filled cavity"
CURTAIN_WALL_WITH_INTERNAL_INSULATION = "Curtain Wall, with internal insulation"

View file

@ -0,0 +1,72 @@
"""Construction-date metadata for the "assumed" ``WallType`` variants.
The ``(assumed)`` variants of ``WallType`` are what RdSAP picks when a
surveyor has no direct observation and falls back to the typical wall make-up
for a property's build era. The era boundaries reflect UK Building
Regulations milestones for cavity-wall insulation:
* up to 1975 -- no cavity insulation requirement
* 1976-1982 -- partial-fill cavity (early insulation requirement)
* 1983-1990 -- full-fill cavity (insulation required)
Captured here as a structured lookup so:
* the LLM prompt builder can render the ranges as a hint, helping the
classifier resolve era-implying landlord descriptions to the right
``(assumed)`` variant;
* future date-aware paths (a deterministic year-to-variant shortcut, a
date-keyed repo) can read from the same source instead of duplicating
the knowledge.
Only the variants where we have a defensible era boundary appear here; the
remaining ``(assumed)`` members are left out rather than guessed.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Mapping, Optional
from domain.landlord_description_overrides.wall_type import WallType
@dataclass(frozen=True)
class YearRange:
    """An inclusive year range. ``None`` on either end means "no bound".

    Both bounds are inclusive, so ``YearRange(end=1975)`` covers 1975 itself
    and renders as ``"pre-1976"``.

    ``str()`` produces a short human-readable label:

    * no bounds at all      -> ``"any year"``
    * only an upper bound   -> ``"pre-<end + 1>"``
    * only a lower bound    -> ``"<start>+"``
    * a single year         -> ``"<year>"``
    * both bounds           -> ``"<start>-<end>"``
    """

    start: Optional[int] = None
    end: Optional[int] = None

    def __str__(self) -> str:
        # Work on locals so ``Optional`` narrowing holds for both bounds.
        start, end = self.start, self.end
        if start is None:
            # A fully unbounded range must not render as "None-None".
            if end is None:
                return "any year"
            # ``end`` is inclusive, so the exclusive "pre-" label is end + 1.
            return f"pre-{end + 1}"
        if end is None:
            return f"{start}+"
        if start == end:
            # Collapse a one-year range instead of printing "1990-1990".
            return str(start)
        return f"{start}-{end}"
# Typical construction era for each "(assumed)" cavity-wall variant.
# Bounds are inclusive. Only the three cavity variants with a defensible era
# boundary are listed; every other ``WallType`` member is deliberately absent.
# Insertion order is the order the prompt hint lists the variants in.
WALL_TYPE_CONSTRUCTION_YEARS: Mapping[WallType, YearRange] = {
    # No lower bound: anything built up to and including 1975.
    WallType.CAVITY_AS_BUILT_NO_INSULATION_ASSUMED: YearRange(end=1975),
    WallType.CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED: YearRange(
        start=1976, end=1982
    ),
    WallType.CAVITY_AS_BUILT_INSULATED_ASSUMED: YearRange(start=1983, end=1990),
}
def wall_type_construction_date_prompt_hint() -> str:
    """Build the construction-era hint handed to the LLM classifier.

    Returns a one-sentence instruction followed by one bullet per entry of
    ``WALL_TYPE_CONSTRUCTION_YEARS``. Each bullet shows the category label
    (the enum value, quoted via ``repr``) and its typical build-year range,
    so the model can pick the era-matching ``(assumed)`` variant when a
    landlord description carries era information (e.g. "1970s semi").
    """
    header = (
        "When the description carries construction-era information, prefer "
        "the category whose typical build year matches:\n"
    )
    # One bullet per (variant, era) pair, in the mapping's own order.
    bullets = "\n".join(
        f"- {variant.value!r}: typically built {era}"
        for variant, era in WALL_TYPE_CONSTRUCTION_YEARS.items()
    )
    return header + bullets

View file

@ -2,7 +2,7 @@ from __future__ import annotations
import json
from enum import Enum
from typing import Any, TypeVar
from typing import Any, Optional, TypeVar
from domain.landlord_description_overrides.column_classifier import (
ClassificationError,
@ -27,10 +27,16 @@ class ChatGptColumnClassifier(ColumnClassifier[E]):
chat_gpt: ChatGPT,
category_enum: type[E],
unknown: E,
extra_instructions: Optional[str] = None,
) -> None:
self._chat_gpt = chat_gpt
self._category_enum = category_enum
self._unknown = unknown
# Free-form column-specific guidance appended to the system prompt
# ahead of the JSON-output instruction. Lets each column ship its
# own hints (e.g. wall-type construction-era ranges) without the
# generic classifier knowing what they are.
self._extra_instructions = extra_instructions
def classify(self, descriptions: set[str]) -> dict[str, E]:
if not descriptions:
@ -62,12 +68,17 @@ class ChatGptColumnClassifier(ColumnClassifier[E]):
for member in self._category_enum
if member is not self._unknown
)
return (
"Classify each free-text description into exactly one category. "
f"Categories: {categories}. "
parts = [
"Classify each free-text description into exactly one category. ",
f"Categories: {categories}. ",
]
if self._extra_instructions:
parts.append(self._extra_instructions + " ")
parts.append(
"Reply with only a JSON object mapping each original description "
"to its category, and nothing else."
)
return "".join(parts)
def _to_category(self, value: Any) -> E:
"""Map a reply value to a category member, defaulting to UNKNOWN."""

View file

@ -0,0 +1,80 @@
"""Postgres adapter for ``LandlordOverrideRepository[RoofType]``.
Writes to ``landlord_roof_type_overrides`` (Drizzle-managed; mirrored by
``LandlordRoofTypeOverrideRow``). The conflict policy lives in the SQL --
see ADR-0003 §Decision. Shape mirrors
``LandlordPropertyTypeOverridePostgresRepository``; the duplication is
deliberate while there are only a handful of override columns -- if the
duplication becomes painful, extract a shared upsert helper then.
"""
from __future__ import annotations
from datetime import datetime, timezone
from typing import cast
from sqlalchemy import Table
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlmodel import Session
from domain.landlord_description_overrides.roof_type import RoofType
from infrastructure.postgres.landlord_override_enums import OverrideSource
from infrastructure.postgres.landlord_roof_type_override_table import (
LandlordRoofTypeOverrideRow,
)
from repositories.landlord_overrides.landlord_override_repository import (
LandlordOverrideRepository,
)
class LandlordRoofTypeOverridePostgresRepository(
    LandlordOverrideRepository[RoofType]
):
    """Writes classifier-produced roof-type overrides through a SQLModel session."""

    def __init__(self, session: Session) -> None:
        self._session = session

    def upsert_all(
        self,
        portfolio_id: int,
        descriptions_to_values: dict[str, RoofType],
    ) -> None:
        """Insert or refresh one override row per description.

        Args:
            portfolio_id: Portfolio the descriptions belong to.
            descriptions_to_values: Landlord description -> classified roof
                type. An empty mapping is a no-op.

        Rows are written with ``source = CLASSIFIER``. On a
        ``(portfolio_id, description)`` conflict the existing row is updated
        only if its own source is ``CLASSIFIER``. The statement is executed
        on the session; this method does not commit.
        """
        if not descriptions_to_values:
            return

        # One timestamp for the whole batch so every row agrees.
        timestamp = datetime.now(timezone.utc)
        payload = [
            {
                "portfolio_id": portfolio_id,
                "description": text,
                "value": roof_type.value,
                "source": OverrideSource.CLASSIFIER,
                "created_at": timestamp,
                "updated_at": timestamp,
            }
            for text, roof_type in descriptions_to_values.items()
        ]

        # SQLModel's class-level ``__table__`` is injected at runtime on
        # ``table=True`` classes but isn't exposed by the stubs; pin it to
        # ``Table`` via ``getattr`` so the dialect insert helper below
        # carries through with strict types.
        table: Table = cast(Table, getattr(LandlordRoofTypeOverrideRow, "__table__"))
        insert_stmt = pg_insert(table).values(payload)

        # Columns refreshed from the incoming (EXCLUDED) row on conflict.
        refreshed = {
            column: insert_stmt.excluded[column]
            for column in ("value", "source", "updated_at")
        }

        # The classifier may refresh its own past output, but must never
        # overwrite a user correction -- the ``WHERE existing.source =
        # 'classifier'`` guard enforces that. See ADR-0003 §Decision.
        upsert_stmt = insert_stmt.on_conflict_do_update(
            index_elements=["portfolio_id", "description"],
            set_=refreshed,
            where=table.c.source == OverrideSource.CLASSIFIER,
        )

        # SQLModel re-exports SQLAlchemy's ``Session.execute``; one of the
        # overload signatures is marked deprecated in stubs, which fires
        # here even though our INSERT path is the supported one.
        self._session.execute(upsert_stmt)  # pyright: ignore[reportDeprecated]

View file

@ -0,0 +1,69 @@
"""SQLModel mirror of the ``landlord_roof_type_overrides`` Drizzle table.
The schema source of truth lives in the ``assessment-model`` TS repo
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
this row class only mirrors the columns so the Python lambda can read/write.
See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only
differences are the table name, the ``roof_type`` pgEnum on ``value``, and
the unique-constraint name.
"""
from datetime import datetime, timezone
from typing import ClassVar
from uuid import UUID, uuid4
from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.landlord_description_overrides.roof_type import RoofType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
class LandlordRoofTypeOverrideRow(SQLModel, table=True):
    """Row of ``landlord_roof_type_overrides``: one roof-type value per
    ``(portfolio_id, description)`` pair, plus who produced it and when.
    """

    __tablename__: ClassVar[str] = "landlord_roof_type_overrides"  # pyright: ignore[reportIncompatibleVariableOverride]
    # One override per description within a portfolio.
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = (  # pyright: ignore[reportIncompatibleVariableOverride]
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_roof_type_overrides_portfolio_description_unique",
        ),
    )
    # Primary key; generated with ``uuid4`` when the caller supplies none.
    id: UUID = Field(default_factory=uuid4, primary_key=True)
    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )
    # The landlord's free-text roof description being overridden.
    description: str = Field(nullable=False)
    # Mapped to the ``roof_type`` enum type. ``values_callable`` makes
    # SQLAlchemy persist each member's ``.value`` string rather than its
    # Python member name (the default for ``Enum`` columns).
    value: RoofType = Field(
        sa_column=Column(
            SAEnum(
                RoofType,
                name="roof_type",
                values_callable=lambda cls: [m.value for m in cls],  # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )
    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )
    # Both timestamps default to the current time in UTC (timezone-aware)
    # when a row object is built without explicit values.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )

View file

@ -46,7 +46,7 @@ def main() -> int:
print(f" - {c}")
return 0
column = "roof_description"
column = "wall "
series = df[column] if args.keep_na else df[column].dropna()
for value in series.unique():
print(value)

View file

@ -23,11 +23,13 @@ class _FakeChatGPT(ChatGPT):
error: Optional[Exception] = None,
) -> None:
self.prompts: list[str] = []
self.system_prompts: list[Optional[str]] = []
self._reply = reply
self._error = error
def generate(self, prompt: str, system_prompt: Optional[str] = None) -> str:
self.prompts.append(prompt)
self.system_prompts.append(system_prompt)
if self._error is not None:
raise self._error
return self._reply
@ -125,11 +127,59 @@ def test_empty_description_set_returns_empty_without_calling_chatgpt() -> None:
def test_classifies_with_a_different_category_enum() -> None:
    # Arrange: the same adapter classifies a WallType column. The fake reply
    # uses the full category label, which is the value of the enum member
    # asserted below.
    chat_gpt = _FakeChatGPT(
        reply='{"solid brick wall": "Solid brick, as built, no insulation (assumed)"}'
    )
    classifier = ChatGptColumnClassifier(chat_gpt, WallType, WallType.UNKNOWN)

    # Act
    result = classifier.classify({"solid brick wall"})

    # Assert: the label resolves to the matching WallType member.
    assert result == {
        "solid brick wall": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED
    }
def test_extra_instructions_are_appended_to_the_system_prompt() -> None:
    # Arrange: a classifier carrying column-specific guidance (e.g. wall-type
    # build-era hints). The guidance must reach the model verbatim, inside
    # the system prompt and ahead of the JSON-output instruction.
    hint = "If the description carries a build decade, prefer X."
    chat_gpt = _FakeChatGPT(reply='{"1970s semi": "House"}')
    classifier = ChatGptColumnClassifier(
        chat_gpt,
        PropertyType,
        PropertyType.UNKNOWN,
        extra_instructions=hint,
    )

    # Act
    classifier.classify({"1970s semi"})

    # Assert: the hint is present, and positioned before the JSON instruction.
    system_prompt = chat_gpt.system_prompts[0]
    assert system_prompt is not None
    assert hint in system_prompt
    position_of_hint = system_prompt.index("If the description carries a build decade")
    position_of_json_rule = system_prompt.index("Reply with only a JSON object")
    assert position_of_hint < position_of_json_rule
def test_omitting_extra_instructions_leaves_the_system_prompt_unchanged() -> None:
    # Arrange: no per-column guidance is supplied, so the classifier must
    # emit exactly the original system prompt -- no trailing whitespace and
    # no orphan hint.
    expected_prompt = (
        "Classify each free-text description into exactly one category. "
        "Categories: House, Bungalow, Flat, Maisonette, Park home. "
        "Reply with only a JSON object mapping each original description "
        "to its category, and nothing else."
    )
    chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}')
    classifier = ChatGptColumnClassifier(chat_gpt, PropertyType, PropertyType.UNKNOWN)

    # Act
    classifier.classify({"semi-detached"})

    # Assert: byte-for-byte equality with the baseline prompt.
    actual_prompt = chat_gpt.system_prompts[0]
    assert actual_prompt is not None
    assert actual_prompt == expected_prompt