From 36f4c32904a40f76e7c07a153cc96c41c925ebe6 Mon Sep 17 00:00:00 2001
From: Jun-te Kim
Date: Tue, 26 May 2026 16:18:26 +0000
Subject: [PATCH] added roofs

---
 .../landlord_description_overrides/handler.py | 20 +++-
 .../wall_type.py                              | 93 ++++++++++++++-----
 .../wall_type_construction_dates.py           | 72 ++++++++++++++
 .../chatgpt/chatgpt_column_classifier.py      | 19 +++-
 ..._roof_type_override_postgres_repository.py | 80 ++++++++++++++++
 .../landlord_roof_type_override_table.py      | 69 ++++++++++++++
 playground.py                                 |  2 +-
 .../chatgpt/test_chatgpt_column_classifier.py | 54 ++++++++++-
 8 files changed, 378 insertions(+), 31 deletions(-)
 create mode 100644 domain/landlord_description_overrides/wall_type_construction_dates.py
 create mode 100644 infrastructure/postgres/landlord_roof_type_override_postgres_repository.py
 create mode 100644 infrastructure/postgres/landlord_roof_type_override_table.py

diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py
index ff16925e..7b7b60af 100644
--- a/applications/landlord_description_overrides/handler.py
+++ b/applications/landlord_description_overrides/handler.py
@@ -11,7 +11,11 @@ from applications.landlord_description_overrides.landlord_description_overrides_
 from domain.addresses.unstandardised_address import AddressList
 from domain.landlord_description_overrides.built_form_type import BuiltFormType
 from domain.landlord_description_overrides.property_type import PropertyType
+from domain.landlord_description_overrides.roof_type import RoofType
 from domain.landlord_description_overrides.wall_type import WallType
+from domain.landlord_description_overrides.wall_type_construction_dates import (
+    wall_type_construction_date_prompt_hint,
+)
 from infrastructure.chatgpt.chatgpt import ChatGPT
 from infrastructure.chatgpt.chatgpt_column_classifier import ChatGptColumnClassifier
 from infrastructure.postgres.config import PostgresConfig
@@ -22,6 +26,9 @@ from infrastructure.postgres.landlord_built_form_type_override_postgres_reposito
 from infrastructure.postgres.landlord_property_type_override_postgres_repository import (
     LandlordPropertyTypeOverridePostgresRepository,
 )
+from infrastructure.postgres.landlord_roof_type_override_postgres_repository import (
+    LandlordRoofTypeOverridePostgresRepository,
+)
 from infrastructure.postgres.landlord_wall_type_override_postgres_repository import (
     LandlordWallTypeOverridePostgresRepository,
 )
@@ -98,10 +105,21 @@ def handler(
             name="wall_type",
             source_column="Walls",
             classifier=ChatGptColumnClassifier(
-                chat_gpt, WallType, WallType.UNKNOWN
+                chat_gpt,
+                WallType,
+                WallType.UNKNOWN,
+                extra_instructions=wall_type_construction_date_prompt_hint(),
             ),
             repo=LandlordWallTypeOverridePostgresRepository(session),
         ),
+        ClassifiableColumn(
+            name="roof_type",
+            source_column="Roofs",
+            classifier=ChatGptColumnClassifier(
+                chat_gpt, RoofType, RoofType.UNKNOWN
+            ),
+            repo=LandlordRoofTypeOverridePostgresRepository(session),
+        ),
     ]
 
     orchestrator = LandlordDescriptionOverridesOrchestrator(
diff --git a/domain/landlord_description_overrides/wall_type.py b/domain/landlord_description_overrides/wall_type.py
index 42b90da6..1466f82d 100644
--- a/domain/landlord_description_overrides/wall_type.py
+++ b/domain/landlord_description_overrides/wall_type.py
@@ -13,40 +13,83 @@ class WallType(Enum):
     """
 
     CAVITY_FILLED = "Cavity wall, filled cavity"
-    CAVITY_AS_BUILT_INSULATED_ASSUMED = "Cavity wall, as built, insulated (assumed)"
-    CAVITY_AS_BUILT_NO_INSULATION_ASSUMED = "Cavity wall, as built, no insulation (assumed)"
-    CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Cavity wall, as built, partial insulation (assumed)"
+    CAVITY_AS_BUILT_INSULATED_ASSUMED = (
+        "Cavity wall, as built, insulated (assumed)"  # 1983 - 1990
+    )
+    CAVITY_AS_BUILT_NO_INSULATION_ASSUMED = (
+        "Cavity wall, as built, no insulation (assumed)"  # Pre-1976
+    )
+
+    CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
+        "Cavity wall, as built, partial insulation (assumed)"  # 1976 - 1982
+    )
     CAVITY_WITH_INTERNAL_INSULATION = "Cavity wall, with internal insulation"
     CAVITY_WITH_EXTERNAL_INSULATION = "Cavity wall, with external insulation"
-    CAVITY_FILLED_AND_INTERNAL_INSULATION = "Cavity wall, filled cavity and internal insulation"
-    CAVITY_FILLED_AND_EXTERNAL_INSULATION = "Cavity wall, filled cavity and external insulation"
+    CAVITY_FILLED_AND_INTERNAL_INSULATION = (
+        "Cavity wall, filled cavity and internal insulation"
+    )
+    CAVITY_FILLED_AND_EXTERNAL_INSULATION = (
+        "Cavity wall, filled cavity and external insulation"
+    )
 
-    SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED = "Solid brick, as built, no insulation (assumed)"
-    SOLID_BRICK_AS_BUILT_INSULATED_ASSUMED = "Solid brick, as built, insulated (assumed)"
-    SOLID_BRICK_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Solid brick, as built, partial insulation (assumed)"
+    SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED = (
+        "Solid brick, as built, no insulation (assumed)"
+    )
+    SOLID_BRICK_AS_BUILT_INSULATED_ASSUMED = (
+        "Solid brick, as built, insulated (assumed)"
+    )
+    SOLID_BRICK_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
+        "Solid brick, as built, partial insulation (assumed)"
+    )
     SOLID_BRICK_WITH_INTERNAL_INSULATION = "Solid brick, with internal insulation"
     SOLID_BRICK_WITH_EXTERNAL_INSULATION = "Solid brick, with external insulation"
 
-    TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED = "Timber frame, as built, no insulation (assumed)"
-    TIMBER_FRAME_AS_BUILT_INSULATED_ASSUMED = "Timber frame, as built, insulated (assumed)"
-    TIMBER_FRAME_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Timber frame, as built, partial insulation (assumed)"
+    TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED = (
+        "Timber frame, as built, no insulation (assumed)"
+    )
+    TIMBER_FRAME_AS_BUILT_INSULATED_ASSUMED = (
+        "Timber frame, as built, insulated (assumed)"
+    )
+    TIMBER_FRAME_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
+        "Timber frame, as built, partial insulation (assumed)"
+    )
     TIMBER_FRAME_WITH_ADDITIONAL_INSULATION = "Timber frame, with additional insulation"
 
-    SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED = "Sandstone, as built, no insulation (assumed)"
+    SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED = (
+        "Sandstone, as built, no insulation (assumed)"
+    )
     SANDSTONE_AS_BUILT_INSULATED_ASSUMED = "Sandstone, as built, insulated (assumed)"
-    SANDSTONE_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Sandstone, as built, partial insulation (assumed)"
+    SANDSTONE_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
+        "Sandstone, as built, partial insulation (assumed)"
+    )
     SANDSTONE_WITH_INTERNAL_INSULATION = "Sandstone, with internal insulation"
     SANDSTONE_WITH_EXTERNAL_INSULATION = "Sandstone, with external insulation"
 
-    GRANITE_OR_WHIN_AS_BUILT_NO_INSULATION_ASSUMED = "Granite or whin, as built, no insulation (assumed)"
-    GRANITE_OR_WHIN_AS_BUILT_INSULATED_ASSUMED = "Granite or whin, as built, insulated (assumed)"
-    GRANITE_OR_WHIN_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Granite or whin, as built, partial insulation (assumed)"
-    GRANITE_OR_WHIN_WITH_INTERNAL_INSULATION = "Granite or whin, with internal insulation"
-    GRANITE_OR_WHIN_WITH_EXTERNAL_INSULATION = "Granite or whin, with external insulation"
+    GRANITE_OR_WHIN_AS_BUILT_NO_INSULATION_ASSUMED = (
+        "Granite or whin, as built, no insulation (assumed)"
+    )
+    GRANITE_OR_WHIN_AS_BUILT_INSULATED_ASSUMED = (
+        "Granite or whin, as built, insulated (assumed)"
+    )
+    GRANITE_OR_WHIN_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
+        "Granite or whin, as built, partial insulation (assumed)"
+    )
+    GRANITE_OR_WHIN_WITH_INTERNAL_INSULATION = (
+        "Granite or whin, with internal insulation"
+    )
+    GRANITE_OR_WHIN_WITH_EXTERNAL_INSULATION = (
+        "Granite or whin, with external insulation"
+    )
 
-    SYSTEM_BUILT_AS_BUILT_NO_INSULATION_ASSUMED = "System built, as built, no insulation (assumed)"
-    SYSTEM_BUILT_AS_BUILT_INSULATED_ASSUMED = "System built, as built, insulated (assumed)"
-    SYSTEM_BUILT_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "System built, as built, partial insulation (assumed)"
+    SYSTEM_BUILT_AS_BUILT_NO_INSULATION_ASSUMED = (
+        "System built, as built, no insulation (assumed)"
+    )
+    SYSTEM_BUILT_AS_BUILT_INSULATED_ASSUMED = (
+        "System built, as built, insulated (assumed)"
+    )
+    SYSTEM_BUILT_AS_BUILT_PARTIAL_INSULATION_ASSUMED = (
+        "System built, as built, partial insulation (assumed)"
+    )
     SYSTEM_BUILT_WITH_INTERNAL_INSULATION = "System built, with internal insulation"
     SYSTEM_BUILT_WITH_EXTERNAL_INSULATION = "System built, with external insulation"
 
@@ -59,8 +102,12 @@ class WallType(Enum):
     COB_WITH_EXTERNAL_INSULATION = "Cob, with external insulation"
 
     CURTAIN_WALL = "Curtain wall"
-    CURTAIN_WALL_AS_BUILT_NO_INSULATION_ASSUMED = "Curtain Wall, as built, no insulation (assumed)"
-    CURTAIN_WALL_AS_BUILT_INSULATED_ASSUMED = "Curtain Wall, as built, insulated (assumed)"
+    CURTAIN_WALL_AS_BUILT_NO_INSULATION_ASSUMED = (
+        "Curtain Wall, as built, no insulation (assumed)"
+    )
+    CURTAIN_WALL_AS_BUILT_INSULATED_ASSUMED = (
+        "Curtain Wall, as built, insulated (assumed)"
+    )
     CURTAIN_WALL_FILLED = "Curtain Wall, filled cavity"
     CURTAIN_WALL_WITH_INTERNAL_INSULATION = "Curtain Wall, with internal insulation"
 
diff --git a/domain/landlord_description_overrides/wall_type_construction_dates.py b/domain/landlord_description_overrides/wall_type_construction_dates.py
new file mode 100644
index 00000000..4cd869b3
--- /dev/null
+++ b/domain/landlord_description_overrides/wall_type_construction_dates.py
@@ -0,0 +1,72 @@
+"""Construction-date metadata for the "assumed" ``WallType`` variants.
+
+The ``(assumed)`` variants of ``WallType`` are what RdSAP picks when a
+surveyor has no direct observation and falls back to the typical wall make-up
+for a property's build era. The era boundaries reflect UK Building
+Regulations milestones for cavity-wall insulation:
+
+* up to 1975 -- no cavity insulation requirement
+* 1976-1982 -- partial-fill cavity (early insulation requirement)
+* 1983-1990 -- full-fill cavity (insulation required)
+
+Captured here as a structured lookup so:
+
+* the LLM prompt builder can render the ranges as a hint, helping the
+  classifier resolve era-implying landlord descriptions to the right
+  ``(assumed)`` variant;
+* future date-aware paths (a deterministic year-to-variant shortcut, a
+  date-keyed repo) can read from the same source instead of duplicating
+  the knowledge.
+
+Only the variants where we have a defensible era boundary appear here; the
+remaining ``(assumed)`` members are left out rather than guessed.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Mapping, Optional
+
+from domain.landlord_description_overrides.wall_type import WallType
+
+
+@dataclass(frozen=True)
+class YearRange:
+    """An inclusive year range. ``None`` on either end means "no bound"."""
+
+    start: Optional[int] = None
+    end: Optional[int] = None
+
+    def __str__(self) -> str:
+        # Open ends render as "pre-N" / "N+"; a fully open range is "any year".
+        if self.start is None:
+            return "any year" if self.end is None else f"pre-{self.end + 1}"
+        upper = "+" if self.end is None else f"-{self.end}"
+        return f"{self.start}{upper}"
+
+
+WALL_TYPE_CONSTRUCTION_YEARS: Mapping[WallType, YearRange] = {
+    WallType.CAVITY_AS_BUILT_NO_INSULATION_ASSUMED: YearRange(end=1975),
+    WallType.CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED: YearRange(
+        start=1976, end=1982
+    ),
+    WallType.CAVITY_AS_BUILT_INSULATED_ASSUMED: YearRange(start=1983, end=1990),
+}
+
+
+def wall_type_construction_date_prompt_hint() -> str:
+    """Render the date metadata as a prompt fragment for the LLM classifier.
+
+    The fragment lists each (variant, year range) pair so the model can
+    prefer the era-matching ``(assumed)`` variant when a landlord
+    description carries era information (e.g. "1970s semi", "built before
+    the war").
+    """
+    lines = [
+        f"- {wall_type.value!r}: typically built {year_range}"
+        for wall_type, year_range in WALL_TYPE_CONSTRUCTION_YEARS.items()
+    ]
+    return (
+        "When the description carries construction-era information, prefer "
+        "the category whose typical build year matches:\n" + "\n".join(lines)
+    )
diff --git a/infrastructure/chatgpt/chatgpt_column_classifier.py b/infrastructure/chatgpt/chatgpt_column_classifier.py
index b23e7c2e..2ce66299 100644
--- a/infrastructure/chatgpt/chatgpt_column_classifier.py
+++ b/infrastructure/chatgpt/chatgpt_column_classifier.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import json
 from enum import Enum
-from typing import Any, TypeVar
+from typing import Any, Optional, TypeVar
 
 from domain.landlord_description_overrides.column_classifier import (
     ClassificationError,
@@ -27,10 +27,16 @@ class ChatGptColumnClassifier(ColumnClassifier[E]):
         chat_gpt: ChatGPT,
         category_enum: type[E],
         unknown: E,
+        extra_instructions: Optional[str] = None,
     ) -> None:
         self._chat_gpt = chat_gpt
         self._category_enum = category_enum
         self._unknown = unknown
+        # Free-form column-specific guidance appended to the system prompt
+        # ahead of the JSON-output instruction. Lets each column ship its
+        # own hints (e.g. wall-type construction-era ranges) without the
+        # generic classifier knowing what they are.
+        self._extra_instructions = extra_instructions
 
     def classify(self, descriptions: set[str]) -> dict[str, E]:
         if not descriptions:
@@ -62,12 +68,17 @@ class ChatGptColumnClassifier(ColumnClassifier[E]):
             for member in self._category_enum
             if member is not self._unknown
         )
-        return (
-            "Classify each free-text description into exactly one category. "
-            f"Categories: {categories}. "
+        parts = [
+            "Classify each free-text description into exactly one category. ",
+            f"Categories: {categories}. ",
+        ]
+        if self._extra_instructions:
+            parts.append(self._extra_instructions + " ")
+        parts.append(
             "Reply with only a JSON object mapping each original description "
             "to its category, and nothing else."
         )
+        return "".join(parts)
 
     def _to_category(self, value: Any) -> E:
         """Map a reply value to a category member, defaulting to UNKNOWN."""
diff --git a/infrastructure/postgres/landlord_roof_type_override_postgres_repository.py b/infrastructure/postgres/landlord_roof_type_override_postgres_repository.py
new file mode 100644
index 00000000..b5b570bc
--- /dev/null
+++ b/infrastructure/postgres/landlord_roof_type_override_postgres_repository.py
@@ -0,0 +1,80 @@
+"""Postgres adapter for ``LandlordOverrideRepository[RoofType]``.
+
+Writes to ``landlord_roof_type_overrides`` (Drizzle-managed; mirrored by
+``LandlordRoofTypeOverrideRow``). The conflict policy lives in the SQL --
+see ADR-0003 §Decision. Shape mirrors
+``LandlordPropertyTypeOverridePostgresRepository``; the duplication is
+deliberate while there are only a handful of override columns -- if the
+duplication becomes painful, extract a shared upsert helper then.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import cast
+
+from sqlalchemy import Table
+from sqlalchemy.dialects.postgresql import insert as pg_insert
+from sqlmodel import Session
+
+from domain.landlord_description_overrides.roof_type import RoofType
+from infrastructure.postgres.landlord_override_enums import OverrideSource
+from infrastructure.postgres.landlord_roof_type_override_table import (
+    LandlordRoofTypeOverrideRow,
+)
+from repositories.landlord_overrides.landlord_override_repository import (
+    LandlordOverrideRepository,
+)
+
+
+class LandlordRoofTypeOverridePostgresRepository(
+    LandlordOverrideRepository[RoofType]
+):
+    def __init__(self, session: Session) -> None:
+        self._session = session
+
+    def upsert_all(
+        self,
+        portfolio_id: int,
+        descriptions_to_values: dict[str, RoofType],
+    ) -> None:
+        if not descriptions_to_values:
+            return
+
+        now = datetime.now(timezone.utc)
+        rows = [
+            {
+                "portfolio_id": portfolio_id,
+                "description": description,
+                "value": value.value,
+                "source": OverrideSource.CLASSIFIER,
+                "created_at": now,
+                "updated_at": now,
+            }
+            for description, value in descriptions_to_values.items()
+        ]
+
+        # SQLModel's class-level ``__table__`` is injected at runtime on
+        # ``table=True`` classes but isn't exposed by the stubs; pin it to
+        # ``Table`` via ``getattr`` so the dialect insert helper below
+        # carries through with strict types.
+        table: Table = cast(Table, getattr(LandlordRoofTypeOverrideRow, "__table__"))
+        stmt = pg_insert(table).values(rows)
+
+        # The classifier may refresh its own past output, but must never
+        # overwrite a user correction -- the ``WHERE existing.source =
+        # 'classifier'`` guard enforces that. See ADR-0003 §Decision.
+        stmt = stmt.on_conflict_do_update(
+            index_elements=["portfolio_id", "description"],
+            set_={
+                "value": stmt.excluded.value,
+                "source": stmt.excluded.source,
+                "updated_at": stmt.excluded.updated_at,
+            },
+            where=table.c.source == OverrideSource.CLASSIFIER,
+        )
+
+        # SQLModel re-exports SQLAlchemy's ``Session.execute``; one of the
+        # overload signatures is marked deprecated in stubs, which fires
+        # here even though our INSERT path is the supported one.
+        self._session.execute(stmt)  # pyright: ignore[reportDeprecated]
diff --git a/infrastructure/postgres/landlord_roof_type_override_table.py b/infrastructure/postgres/landlord_roof_type_override_table.py
new file mode 100644
index 00000000..f0cea945
--- /dev/null
+++ b/infrastructure/postgres/landlord_roof_type_override_table.py
@@ -0,0 +1,69 @@
+"""SQLModel mirror of the ``landlord_roof_type_overrides`` Drizzle table.
+
+The schema source of truth lives in the ``assessment-model`` TS repo
+(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
+this row class only mirrors the columns so the Python lambda can read/write.
+See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only
+differences are the table name, the ``roof_type`` pgEnum on ``value``, and
+the unique-constraint name.
+"""
+
+from datetime import datetime, timezone
+from typing import ClassVar
+from uuid import UUID, uuid4
+
+from sqlalchemy import BigInteger, Column, UniqueConstraint
+from sqlalchemy import Enum as SAEnum
+from sqlmodel import Field, SQLModel
+
+from domain.landlord_description_overrides.roof_type import RoofType
+from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
+
+
+class LandlordRoofTypeOverrideRow(SQLModel, table=True):
+    __tablename__: ClassVar[str] = "landlord_roof_type_overrides"  # pyright: ignore[reportIncompatibleVariableOverride]
+    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = (  # pyright: ignore[reportIncompatibleVariableOverride]
+        UniqueConstraint(
+            "portfolio_id",
+            "description",
+            name="landlord_roof_type_overrides_portfolio_description_unique",
+        ),
+    )
+
+    id: UUID = Field(default_factory=uuid4, primary_key=True)
+
+    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
+    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
+    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
+    # not declared here -- the ``portfolio`` table is not modelled in Python.
+    portfolio_id: int = Field(
+        sa_column=Column(BigInteger, nullable=False, index=True),
+    )
+
+    description: str = Field(nullable=False)
+
+    value: RoofType = Field(
+        sa_column=Column(
+            SAEnum(
+                RoofType,
+                name="roof_type",
+                values_callable=lambda cls: [m.value for m in cls],  # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
+            ),
+            nullable=False,
+        ),
+    )
+
+    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
+    # instance is reused by every ``landlord_*_overrides`` row class.
+    source: str = Field(
+        sa_column=Column(override_source_sa_enum, nullable=False),
+    )
+
+    created_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc),
+        nullable=False,
+    )
+    updated_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc),
+        nullable=False,
+    )
diff --git a/playground.py b/playground.py
index d116dcf9..5e9001e1 100644
--- a/playground.py
+++ b/playground.py
@@ -46,7 +46,7 @@ def main() -> int:
             print(f" - {c}")
         return 0
 
-    column = "roof_description"
+    column = "wall "
     series = df[column] if args.keep_na else df[column].dropna()
     for value in series.unique():
         print(value)
diff --git a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py
index 8a07ecec..4cdf4dfe 100644
--- a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py
+++ b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py
@@ -23,11 +23,13 @@ class _FakeChatGPT(ChatGPT):
         error: Optional[Exception] = None,
     ) -> None:
         self.prompts: list[str] = []
+        self.system_prompts: list[Optional[str]] = []
         self._reply = reply
         self._error = error
 
     def generate(self, prompt: str, system_prompt: Optional[str] = None) -> str:
         self.prompts.append(prompt)
+        self.system_prompts.append(system_prompt)
         if self._error is not None:
             raise self._error
         return self._reply
@@ -125,11 +127,59 @@ def test_empty_description_set_returns_empty_without_calling_chatgpt() -> None:
 
 def test_classifies_with_a_different_category_enum() -> None:
     # Arrange: the same adapter classifies a WallType column.
-    chat_gpt = _FakeChatGPT(reply='{"solid brick wall": "Solid Brick"}')
+    chat_gpt = _FakeChatGPT(
+        reply='{"solid brick wall": "Solid brick, as built, no insulation (assumed)"}'
+    )
     classifier = ChatGptColumnClassifier(chat_gpt, WallType, WallType.UNKNOWN)
 
     # Act
     result = classifier.classify({"solid brick wall"})
 
     # Assert
-    assert result == {"solid brick wall": WallType.SOLID_BRICK}
+    assert result == {
+        "solid brick wall": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED
+    }
+
+
+def test_extra_instructions_are_appended_to_the_system_prompt() -> None:
+    # Arrange: column-specific guidance (e.g. wall-type build-era hints)
+    # should reach the model verbatim, in the system prompt ahead of the
+    # JSON-output instruction.
+    chat_gpt = _FakeChatGPT(reply='{"1970s semi": "House"}')
+    classifier = ChatGptColumnClassifier(
+        chat_gpt,
+        PropertyType,
+        PropertyType.UNKNOWN,
+        extra_instructions="If the description carries a build decade, prefer X.",
+    )
+
+    # Act
+    classifier.classify({"1970s semi"})
+
+    # Assert: the hint sits in the system prompt, before the JSON instruction.
+    system_prompt = chat_gpt.system_prompts[0]
+    assert system_prompt is not None
+    assert "If the description carries a build decade, prefer X." in system_prompt
+    hint_index = system_prompt.index("If the description carries a build decade")
+    json_index = system_prompt.index("Reply with only a JSON object")
+    assert hint_index < json_index
+
+
+def test_omitting_extra_instructions_leaves_the_system_prompt_unchanged() -> None:
+    # Arrange: a classifier without per-column guidance must still produce
+    # the original system prompt -- no trailing whitespace, no orphan hint.
+    chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}')
+    classifier = ChatGptColumnClassifier(chat_gpt, PropertyType, PropertyType.UNKNOWN)
+
+    # Act
+    classifier.classify({"semi-detached"})
+
+    # Assert
+    system_prompt = chat_gpt.system_prompts[0]
+    assert system_prompt is not None
+    assert system_prompt == (
+        "Classify each free-text description into exactly one category. "
+        "Categories: House, Bungalow, Flat, Maisonette, Park home. "
+        "Reply with only a JSON object mapping each original description "
+        "to its category, and nothing else."
+    )