Merge branch 'feautre/hyde_upload_and_extend_landlord_overrides' into feature/hyde_make_it_more_accurate_with_tests

This commit is contained in:
Jun-te Kim 2026-06-20 07:26:35 +00:00
commit abd4bbc2d0
63 changed files with 4420 additions and 26 deletions

5
.gitignore vendored
View file

@ -313,3 +313,8 @@ scripts/eon/epc_cache.pkl
scripts/hyde/.elmhurst-session/
scripts/hyde/elmhurst_downloads/
scripts/hyde/.elmhurst-creds.json
# Hyde property-overrides script artifacts
overrides_cache.json
overrides_unknowns.csv
overrides_edits.csv

View file

@ -7,11 +7,16 @@ import boto3
from applications.landlord_description_overrides.landlord_description_overrides_trigger_body import (
LandlordDescriptionOverridesTriggerBody,
)
from domain.epc.built_form_type import BuiltFormType
from domain.epc.property_type import PropertyType
from domain.epc.roof_type import RoofType
from domain.epc.wall_type import WallType
from domain.epc.wall_type_construction_dates import (
from domain.epc.property_overrides.built_form_type import BuiltFormType
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
from domain.epc.property_overrides.glazing_type import GlazingType
from domain.epc.property_overrides.main_fuel_type import MainFuelType
from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType
from domain.epc.property_overrides.property_type import PropertyType
from domain.epc.property_overrides.roof_type import RoofType
from domain.epc.property_overrides.water_heating_type import WaterHeatingType
from domain.epc.property_overrides.wall_type import WallType
from domain.epc.property_overrides.wall_type_construction_dates import (
wall_type_construction_date_prompt_hint,
)
from infrastructure.chatgpt.chatgpt import ChatGPT
@ -24,6 +29,21 @@ from infrastructure.postgres.engine import commit_scope, make_engine, make_sessi
from infrastructure.postgres.landlord_built_form_type_override_table import (
LandlordBuiltFormTypeOverrideRow,
)
from infrastructure.postgres.landlord_construction_age_band_override_table import (
LandlordConstructionAgeBandOverrideRow,
)
from infrastructure.postgres.landlord_glazing_override_table import (
LandlordGlazingOverrideRow,
)
from infrastructure.postgres.landlord_main_fuel_override_table import (
LandlordMainFuelOverrideRow,
)
from infrastructure.postgres.landlord_main_heating_system_override_table import (
LandlordMainHeatingSystemOverrideRow,
)
from infrastructure.postgres.landlord_water_heating_override_table import (
LandlordWaterHeatingOverrideRow,
)
from infrastructure.postgres.landlord_property_type_override_table import (
LandlordPropertyTypeOverrideRow,
)
@ -102,6 +122,56 @@ def _build_columns(
session, LandlordRoofTypeOverrideRow
),
),
"main_fuel": lambda src: ClassifiableColumn(
name="main_fuel",
source_column=src,
classifier=ChatGptColumnClassifier(
chat_gpt, MainFuelType, MainFuelType.UNKNOWN
),
repo=LandlordOverridesRepository[MainFuelType](
session, LandlordMainFuelOverrideRow
),
),
"glazing": lambda src: ClassifiableColumn(
name="glazing",
source_column=src,
classifier=ChatGptColumnClassifier(
chat_gpt, GlazingType, GlazingType.UNKNOWN
),
repo=LandlordOverridesRepository[GlazingType](
session, LandlordGlazingOverrideRow
),
),
"construction_age_band": lambda src: ClassifiableColumn(
name="construction_age_band",
source_column=src,
classifier=ChatGptColumnClassifier(
chat_gpt, ConstructionAgeBand, ConstructionAgeBand.UNKNOWN
),
repo=LandlordOverridesRepository[ConstructionAgeBand](
session, LandlordConstructionAgeBandOverrideRow
),
),
"water_heating": lambda src: ClassifiableColumn(
name="water_heating",
source_column=src,
classifier=ChatGptColumnClassifier(
chat_gpt, WaterHeatingType, WaterHeatingType.UNKNOWN
),
repo=LandlordOverridesRepository[WaterHeatingType](
session, LandlordWaterHeatingOverrideRow
),
),
"main_heating_system": lambda src: ClassifiableColumn(
name="main_heating_system",
source_column=src,
classifier=ChatGptColumnClassifier(
chat_gpt, MainHeatingSystemType, MainHeatingSystemType.UNKNOWN
),
repo=LandlordOverridesRepository[MainHeatingSystemType](
session, LandlordMainHeatingSystemOverrideRow
),
),
}
columns: list[ClassifiableColumn[Any]] = []

View file

@ -19,7 +19,7 @@ variable "image_digest" {
variable "maximum_concurrency" {
type = number
default = 2
default = 20
description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit."
}

View file

@ -0,0 +1,39 @@
"""Map a Landlord-Override construction-age-band value to a fabric Simulation
Overlay.
A construction-age-band value is the RdSAP England-&-Wales letter code (A..M)
the calculator's U-value cascades key on (`SapBuildingPart.construction_age_band`,
read via `.strip().upper()` against the letter-code bands). The overlay targets
the override's building part and sets the band; an unrecognised code produces no
overlay. Re-dating a part re-derives its construction-default U-values, so this
is the highest-leverage fabric override.
"""
from __future__ import annotations
from typing import Optional
from datatypes.epc.domain.epc_property_data import BuildingPartIdentifier
from domain.modelling.simulation import BuildingPartOverlay, EpcSimulation
# The RdSAP England-&-Wales construction age bands, held as bare letter codes
# (A through M) so membership can be tested on the normalised input directly.
_VALID_AGE_BANDS: frozenset[str] = frozenset("ABCDEFGHIJKLM")


def age_band_overlay_for(
    age_band_value: str, building_part: int
) -> Optional[EpcSimulation]:
    """Build the overlay that re-dates one building part, if the band is valid.

    The incoming value is stripped of surrounding whitespace and upper-cased
    before being checked against the A..M letter set; anything else yields
    ``None`` (no overlay).

    ``building_part`` selects the target: ``0`` addresses the main part, any
    other value is handed to ``BuildingPartIdentifier.extension``.
    """
    letter = age_band_value.strip().upper()
    if letter in _VALID_AGE_BANDS:
        if building_part == 0:
            target = BuildingPartIdentifier.MAIN
        else:
            target = BuildingPartIdentifier.extension(building_part)
        part_overlay = BuildingPartOverlay(construction_age_band=letter)
        return EpcSimulation(building_parts={target: part_overlay})
    return None

View file

@ -0,0 +1,36 @@
"""Map a Landlord-Override glazing value to a glazing Simulation Overlay.
A glazing value is one canonical glazing description carrying type + era
("Double glazing, 2002 or later", "Single glazing", "Triple glazing, 2002 or
later"). The calculator derives each window's U-value from its SAP10
`glazing_type` code via the RdSAP Table 24 cascade, so the overlay decomposes
the value into that code and emits a whole-dwelling `GlazingOverlay` (a landlord
describes the dwelling's glazing as a whole, with no per-window geometry, so
`building_part` is ignored). `_fold_glazing` expands it across every window.
Unresolvable values produce no overlay.
"""
from __future__ import annotations
from typing import Optional
from domain.modelling.simulation import EpcSimulation, GlazingOverlay
# Lookup from each canonical glazing description to the SAP10 glazing-type code
# (the Table 24 / `u_window` cascade enum, `_GLAZING_CODE_TO_UWINDOW` in
# heat_transmission). Keys are matched exactly, including case and punctuation.
_GLAZING_CODES: dict[str, int] = {
    "Single glazing": 1,
    "Double glazing, 2002 or later": 2,
    "Double glazing, pre-2002": 3,
    "Triple glazing, pre-2002": 6,
    "Triple glazing, 2002 or later": 9,
}


def glazing_overlay_for(
    glazing_value: str, building_part: int
) -> Optional[EpcSimulation]:
    """Translate a canonical glazing description into a whole-dwelling overlay.

    Returns an ``EpcSimulation`` carrying a ``GlazingOverlay`` with the mapped
    glazing-type code, or ``None`` when the description is not a key of
    ``_GLAZING_CODES``. ``building_part`` is accepted for signature parity with
    the other overlay builders but is not used here.
    """
    if glazing_value not in _GLAZING_CODES:
        return None
    sap_code = _GLAZING_CODES[glazing_value]
    return EpcSimulation(glazing=GlazingOverlay(glazing_type=sap_code))

View file

@ -0,0 +1,41 @@
"""Map a Landlord-Override main-fuel value to a heating Simulation Overlay.
A main-fuel value is one canonical gov-EPC `main_fuel` description ("mains gas",
"electricity", etc.). The calculator reads the dwelling's primary fuel from
`main_heating_details[0].main_fuel_type` as the RdSAP **int code**, so the
overlay decomposes the value into that code and emits a whole-dwelling
`HeatingOverlay` (fuel is not a per-building-part attribute, so `building_part`
is ignored). Codes follow the modern RdSAP-20/21 `(not community)` family the
gov-EPC API baseline uses. Unresolvable values produce no overlay.
"""
from __future__ import annotations
from typing import Optional
from domain.modelling.simulation import EpcSimulation, HeatingOverlay
# Canonical fuel description -> RdSAP-20/21 `main_fuel` int code (epc_codes.csv
# `main_fuel`). The table holds both `(not community)` and community entries.
# Keys are matched exactly, including case.
# NOTE(review): confirm the dual-fuel code against epc_codes.csv -- the published
# RdSAP fuel enumeration is believed to list dual fuel (mineral + wood) as 9 and
# 10 as a legacy electricity code. Value left unchanged pending verification.
_FUEL_CODES: dict[str, int] = {
    "mains gas": 26,
    "mains gas (community)": 20,
    "LPG (bulk)": 27,
    "bottled LPG": 3,
    "LPG special condition": 17,
    "oil": 28,
    "electricity": 29,
    "electricity (community)": 25,
    "house coal": 33,
    "smokeless coal": 15,
    "dual fuel (mineral and wood)": 10,
    "biomass (community)": 31,
}


def fuel_overlay_for(
    main_fuel_value: str, building_part: int
) -> Optional[EpcSimulation]:
    """Translate a canonical main-fuel description into a heating overlay.

    Returns an ``EpcSimulation`` whose ``HeatingOverlay`` sets
    ``main_fuel_type`` to the mapped int code, or ``None`` when the description
    is not a key of ``_FUEL_CODES``. ``building_part`` is not used.
    """
    try:
        fuel_code = _FUEL_CODES[main_fuel_value]
    except KeyError:
        return None
    return EpcSimulation(heating=HeatingOverlay(main_fuel_type=fuel_code))

View file

@ -0,0 +1,46 @@
"""Map a Landlord-Override main-heating-system value to a heating Simulation Overlay.
A main-heating-system value is one canonical system archetype ("Gas boiler,
combi", "Electric storage heaters, fan"). The calculator reads the primary
system's `sap_main_heating_code` (SAP Table 4a/4b), so the overlay maps the
archetype to a representative code and emits a whole-dwelling `HeatingOverlay`
targeting `main_heating_details[0]` (`building_part` is ignored). It composes
field-wise with the main_fuel / water_heating overlays.
The SEDBUK A-G efficiency band the Hyde "Heating" column carries is NOT honoured
yet (no efficiency slot on the overlay/MainHeatingDetail) -- archetypes map to
their modern/condensing Table 4b code, so an old low-rated boiler is currently
modelled at the condensing efficiency. Heat pumps and community heating (which
resolve via main_heating_index_number / community codes, not a Table 4b code)
are left UNKNOWN until modelled. Unresolvable values produce no overlay.
"""
from __future__ import annotations
from typing import Optional
from domain.modelling.simulation import EpcSimulation, HeatingOverlay
# Canonical system archetype -> representative `sap_main_heating_code` (SAP
# Table 4b boiler rows / Table 4a). Each archetype points at its
# modern/condensing variant because the A-G efficiency band is deferred:
# 102 regular condensing, 104 condensing combi, 120 CPSU, 404 fan storage
# heaters, 191 direct-acting electric boiler.
_MAIN_HEATING_CODES: dict[str, int] = {
    "Gas boiler, combi": 104,
    "Gas boiler, regular": 102,
    "Gas CPSU": 120,
    "Electric storage heaters, old": 401,
    "Electric storage heaters, slimline": 402,
    "Electric storage heaters, convector": 403,
    "Electric storage heaters, fan": 404,
    "Direct-acting electric": 191,
}


def main_heating_overlay_for(
    main_heating_value: str, building_part: int
) -> Optional[EpcSimulation]:
    """Translate a canonical heating-system archetype into a heating overlay.

    Returns an ``EpcSimulation`` whose ``HeatingOverlay`` sets
    ``sap_main_heating_code`` to the mapped code, or ``None`` when the archetype
    is not a key of ``_MAIN_HEATING_CODES``. ``building_part`` is not used.
    """
    if main_heating_value in _MAIN_HEATING_CODES:
        system_code = _MAIN_HEATING_CODES[main_heating_value]
        return EpcSimulation(
            heating=HeatingOverlay(sap_main_heating_code=system_code)
        )
    return None

View file

@ -0,0 +1,47 @@
"""Map a Landlord-Override water-heating value to a heating Simulation Overlay.
A water-heating value is one canonical "<system>, <fuel>" description ("From main
system, mains gas", "Electric immersion, electricity"). The calculator reads the
hot-water arrangement from `sap_heating.water_heating_code` (the SAP Table 4a
system code) and `water_heating_fuel`, so the overlay decomposes the value into
those two int codes and emits a whole-dwelling `HeatingOverlay` (water heating is
not per-building-part, so `building_part` is ignored). It composes field-wise with
the main_fuel / main_heating overlays. Unresolvable values produce no overlay.
"""
from __future__ import annotations
from typing import Optional
from domain.modelling.simulation import EpcSimulation, HeatingOverlay
# Canonical "<system>, <fuel>" description -> (water_heating_code,
# water_heating_fuel). System codes are SAP Table 4a: 901 "from main system"
# (inherit-from-main), 903 "electric immersion", 911 "boiler/circulator for
# water heating only" (gas). Fuel codes are the modern RdSAP "(not community)"
# family (26 mains gas, 29 electricity), matching the main_fuel overlay.
_WATER_HEATING_CODES: dict[str, tuple[int, int]] = {
    "From main system, mains gas": (901, 26),
    "From main system, electricity": (901, 29),
    "From main system, oil": (901, 28),
    "From main system, LPG (bulk)": (901, 27),
    "From main system, bottled LPG": (901, 3),
    "From main system, house coal": (901, 33),
    "Electric immersion, electricity": (903, 29),
    "Gas boiler/circulator, mains gas": (911, 26),
}


def water_heating_overlay_for(
    water_heating_value: str, building_part: int
) -> Optional[EpcSimulation]:
    """Translate a canonical water-heating description into a heating overlay.

    Returns an ``EpcSimulation`` whose ``HeatingOverlay`` sets both
    ``water_heating_code`` and ``water_heating_fuel`` from the mapped pair, or
    ``None`` when the description is not a key of ``_WATER_HEATING_CODES``.
    ``building_part`` is not used.
    """
    pair = _WATER_HEATING_CODES.get(water_heating_value)
    if pair is not None:
        return EpcSimulation(
            heating=HeatingOverlay(
                water_heating_code=pair[0],
                water_heating_fuel=pair[1],
            )
        )
    return None

View file

@ -0,0 +1,4 @@
"""Landlord property-override classifier vocabulary — the category enums a
landlord description resolves into, plus their valuecode helpers. The classifier
target for the property_overrides chain (mirrors the property_overrides table /
override_component pgEnum). Distinct from the EPC-context types of the same name."""

View file

@ -0,0 +1,31 @@
from enum import Enum
class ConstructionAgeBand(Enum):
    """Construction age band supplied by a landlord, as classified by the
    landlord-description-overrides context.

    Member values are the bare RdSAP England-&-Wales age-band letter codes
    (A..M) that the calculator's U-value cascades read from
    `SapBuildingPart.construction_age_band` -- the same representation the
    gov-EPC API lodges. The construction-age-band Simulation Overlay
    (``domain/epc/property_overlays/construction_age_band_overlay.py``) writes
    the letter through unchanged, so the values MUST remain bare letters.

    The year range lives in each member's *name* purely for readability.
    ``UNKNOWN`` is the fallback for descriptions the classifier cannot resolve;
    it leaves the lodged cert's age band untouched.
    """

    # Letter codes in chronological order.
    A_BEFORE_1900 = "A"
    B_1900_1929 = "B"
    C_1930_1949 = "C"
    D_1950_1966 = "D"
    E_1967_1975 = "E"
    F_1976_1982 = "F"
    G_1983_1990 = "G"
    H_1991_1995 = "H"
    I_1996_2002 = "I"
    J_2003_2006 = "J"
    K_2007_2011 = "K"
    L_2012_2022 = "L"
    M_2023_ONWARDS = "M"
    # Classifier fallback.
    UNKNOWN = "Unknown"

View file

@ -0,0 +1,24 @@
from enum import Enum
class GlazingType(Enum):
    """Glazing description supplied by a landlord, as classified by the
    landlord-description-overrides context.

    Member values are the canonical glazing descriptions (type plus era) that
    the glazing Simulation Overlay
    (``domain/epc/property_overlays/glazing_overlay.py``) turns into the SAP10
    ``glazing_type`` code read by the calculator's Table-24 cascade. The values
    MUST therefore stay in lock-step with that overlay's ``_GLAZING_CODES``
    keys. Era is significant: pre-2002 and 2002-onward double glazing resolve
    to different codes (and U-values).

    ``UNKNOWN`` is the fallback for descriptions the classifier cannot resolve,
    and for any glazing not yet given a verified overlay code; it leaves the
    lodged cert's glazing untouched.
    """

    SINGLE = "Single glazing"
    DOUBLE_POST_2002 = "Double glazing, 2002 or later"
    DOUBLE_PRE_2002 = "Double glazing, pre-2002"
    TRIPLE_PRE_2002 = "Triple glazing, pre-2002"
    TRIPLE_POST_2002 = "Triple glazing, 2002 or later"
    # Classifier fallback.
    UNKNOWN = "Unknown"

View file

@ -0,0 +1,29 @@
from enum import Enum
class MainFuelType(Enum):
    """Main-fuel description supplied by a landlord, as classified by the
    landlord-description-overrides context.

    Member values are the canonical fuel descriptions that the main-fuel
    Simulation Overlay (``domain/epc/property_overlays/main_fuel_overlay.py``)
    turns into the RdSAP ``main_fuel`` int code the calculator reads. The
    values MUST therefore stay in lock-step with that overlay's ``_FUEL_CODES``
    keys.

    ``UNKNOWN`` is the fallback for descriptions the classifier cannot resolve,
    and for any fuel not yet given a verified overlay code; it leaves the
    lodged cert's fuel untouched rather than guessing.
    """

    MAINS_GAS = "mains gas"
    MAINS_GAS_COMMUNITY = "mains gas (community)"
    ELECTRICITY = "electricity"
    ELECTRICITY_COMMUNITY = "electricity (community)"
    LPG_BULK = "LPG (bulk)"
    LPG_BOTTLED = "bottled LPG"
    LPG_SPECIAL_CONDITION = "LPG special condition"
    OIL = "oil"
    HOUSE_COAL = "house coal"
    SMOKELESS_COAL = "smokeless coal"
    DUAL_FUEL_MINERAL_WOOD = "dual fuel (mineral and wood)"
    BIOMASS_COMMUNITY = "biomass (community)"
    # Classifier fallback.
    UNKNOWN = "Unknown"

View file

@ -0,0 +1,27 @@
from enum import Enum
class MainHeatingSystemType(Enum):
    """Main-heating-system description supplied by a landlord, as classified by
    the landlord-description-overrides context.

    Member values are the canonical system archetypes that the main-heating
    Simulation Overlay
    (``domain/epc/property_overlays/main_heating_system_overlay.py``) maps to a
    representative SAP ``sap_main_heating_code``. The values MUST therefore
    stay in lock-step with that overlay's ``_MAIN_HEATING_CODES`` keys.

    The SEDBUK A-G efficiency band carried by the Hyde "Heating" column is NOT
    modelled yet (deferred), so each archetype maps to its modern/condensing
    code. ``UNKNOWN`` is the fallback for descriptions the classifier cannot
    resolve and for the not-yet-modelled systems (heat pumps, community
    heating).
    """

    GAS_COMBI = "Gas boiler, combi"
    GAS_REGULAR = "Gas boiler, regular"
    GAS_CPSU = "Gas CPSU"
    ELECTRIC_STORAGE_OLD = "Electric storage heaters, old"
    ELECTRIC_STORAGE_SLIMLINE = "Electric storage heaters, slimline"
    ELECTRIC_STORAGE_CONVECTOR = "Electric storage heaters, convector"
    ELECTRIC_STORAGE_FAN = "Electric storage heaters, fan"
    DIRECT_ELECTRIC = "Direct-acting electric"
    # Classifier fallback.
    UNKNOWN = "Unknown"

View file

@ -27,7 +27,7 @@ from __future__ import annotations
from dataclasses import dataclass
from typing import Mapping, Optional
from domain.epc.wall_type import WallType
from domain.epc.property_overrides.wall_type import WallType
@dataclass(frozen=True)

View file

@ -0,0 +1,26 @@
from enum import Enum
class WaterHeatingType(Enum):
    """Water-heating description supplied by a landlord, as classified by the
    landlord-description-overrides context.

    Member values are the canonical "<system>, <fuel>" descriptions that the
    water-heating Simulation Overlay
    (``domain/epc/property_overlays/water_heating_overlay.py``) turns into the
    SAP ``water_heating_code`` + ``water_heating_fuel`` int codes the
    calculator reads. The values MUST therefore stay in lock-step with that
    overlay's ``_WATER_HEATING_CODES`` keys.

    ``UNKNOWN`` is the fallback for descriptions the classifier cannot resolve,
    and for any combination not yet given verified codes; it leaves the lodged
    cert's hot-water arrangement untouched.
    """

    FROM_MAIN_MAINS_GAS = "From main system, mains gas"
    FROM_MAIN_ELECTRICITY = "From main system, electricity"
    FROM_MAIN_OIL = "From main system, oil"
    FROM_MAIN_LPG_BULK = "From main system, LPG (bulk)"
    FROM_MAIN_BOTTLED_LPG = "From main system, bottled LPG"
    FROM_MAIN_HOUSE_COAL = "From main system, house coal"
    ELECTRIC_IMMERSION = "Electric immersion, electricity"
    GAS_BOILER_CIRCULATOR_MAINS_GAS = "Gas boiler/circulator, mains gas"
    # Classifier fallback.
    UNKNOWN = "Unknown"

View file

@ -19,6 +19,7 @@ from datatypes.epc.domain.epc_property_data import (
)
from domain.modelling.simulation import (
EpcSimulation,
GlazingOverlay,
HeatingOverlay,
LightingOverlay,
SecondaryHeatingOverlay,
@ -53,6 +54,8 @@ def apply_simulations(
)
if simulation.lighting is not None:
_fold_lighting(result, simulation.lighting)
if simulation.glazing is not None:
_fold_glazing(result, simulation.glazing)
if simulation.heating is not None:
_fold_heating(result, simulation.heating)
if simulation.secondary_heating is not None:
@ -202,6 +205,21 @@ def _fold_window(window: SapWindow, overlay: WindowOverlay) -> None:
details.solar_transmittance = overlay.solar_transmittance
def _fold_glazing(epc: EpcPropertyData, overlay: GlazingOverlay) -> None:
    """Apply a whole-dwelling `GlazingOverlay` to every window, in place.

    For each entry in `epc.sap_windows` this sets `glazing_type` to the
    overlay's code and resets `window_transmission_details` to `None`, so
    `heat_transmission`'s Table-24 cascade re-derives U from the new type (the
    lodged U described the old, mis-recorded glazing). A landlord glazing
    override carries no per-window geometry, so it applies uniformly; the
    expansion lives here because the baseline window list is known only at
    fold time.

    An overlay whose `glazing_type` is `None` is a no-op.
    """
    corrected_type = overlay.glazing_type
    if corrected_type is not None:
        for pane in epc.sap_windows:
            pane.glazing_type = corrected_type
            pane.window_transmission_details = None
def _fold_ventilation(
baseline: Optional[SapVentilation], overlay: VentilationOverlay
) -> SapVentilation:

View file

@ -28,6 +28,12 @@ class BuildingPartOverlay:
# RdSAP England-&-Wales construction age band — the letter code A..M the
# calculator's U-value cascades key on (`SapBuildingPart.construction_age_band`).
# Left `None` by Measures (retrofits don't change build era); set by a Landlord
# Override that corrects the lodged age band, which re-derives this part's
# fabric U-value defaults. Folds onto the part via the generic field loop.
construction_age_band: Optional[str] = None
# The wall material (RdSAP `wall_construction` code). Left `None` by Measures
# — insulating a wall doesn't change its material — but set by a Landlord
# Override that corrects the construction itself (ADR-0032).
wall_construction: Optional[int] = None
wall_insulation_type: Optional[int] = None
# Added solid-wall insulation depth (mm) — drives the calculator's Table 6
@ -73,6 +79,28 @@ class WindowOverlay:
solar_transmittance: Optional[float] = None
@dataclass(frozen=True)
class GlazingOverlay:
    """Partial, all-optional description of the dwelling's glazing as a whole —
    the correction a Landlord Override applies when the lodged glazing is wrong.

    This differs from `WindowOverlay`, which is keyed by `sap_windows` index:
    a landlord describes glazing for the whole dwelling ("Double glazing, 2002
    or later") with no per-window geometry. The overlay builder never sees the
    baseline window list, so it emits a single `GlazingOverlay` and
    `_fold_glazing` fans it out over every `sap_windows` entry.

    `glazing_type` holds the SAP10 glazing-type code (Table 24 / `u_window`
    cascade: 1=single, 2=double 2002-2021, 3=double pre-2002, 9=triple 2002+,
    etc.). The fold writes it to every window AND clears each window's lodged
    transmission U-value, so the Table-24 cascade re-derives the corrected U
    from the new type (the lodged U belonged to the OLD, mis-recorded glazing).

    A field left as `None` means "keep the baseline value".
    """

    glazing_type: Optional[int] = None
@dataclass(frozen=True)
class LightingOverlay:
"""All-optional partial of the dwelling's fixed-lighting bulb counts — the
@ -220,6 +248,7 @@ class EpcSimulation:
windows: Mapping[int, WindowOverlay] = field(default_factory=_no_windows)
ventilation: Optional[VentilationOverlay] = None
lighting: Optional[LightingOverlay] = None
glazing: Optional[GlazingOverlay] = None
heating: Optional[HeatingOverlay] = None
secondary_heating: Optional[SecondaryHeatingOverlay] = None
solar: Optional[SolarOverlay] = None

View file

@ -25,9 +25,24 @@ from infrastructure.postgres.landlord_property_type_override_table import (
from infrastructure.postgres.landlord_roof_type_override_table import (
LandlordRoofTypeOverrideRow,
)
from infrastructure.postgres.landlord_construction_age_band_override_table import (
LandlordConstructionAgeBandOverrideRow,
)
from infrastructure.postgres.landlord_glazing_override_table import (
LandlordGlazingOverrideRow,
)
from infrastructure.postgres.landlord_main_fuel_override_table import (
LandlordMainFuelOverrideRow,
)
from infrastructure.postgres.landlord_main_heating_system_override_table import (
LandlordMainHeatingSystemOverrideRow,
)
from infrastructure.postgres.landlord_wall_type_override_table import (
LandlordWallTypeOverrideRow,
)
from infrastructure.postgres.landlord_water_heating_override_table import (
LandlordWaterHeatingOverrideRow,
)
from repositories.landlord_overrides.landlord_override_reader import (
LandlordOverrideReader,
)
@ -38,6 +53,11 @@ _ROW_TYPES: dict[str, type] = {
"built_form_type": LandlordBuiltFormTypeOverrideRow,
"wall_type": LandlordWallTypeOverrideRow,
"roof_type": LandlordRoofTypeOverrideRow,
"main_fuel": LandlordMainFuelOverrideRow,
"glazing": LandlordGlazingOverrideRow,
"construction_age_band": LandlordConstructionAgeBandOverrideRow,
"water_heating": LandlordWaterHeatingOverrideRow,
"main_heating_system": LandlordMainHeatingSystemOverrideRow,
}

View file

@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.built_form_type import BuiltFormType
from domain.epc.property_overrides.built_form_type import BuiltFormType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum

View file

@ -0,0 +1,73 @@
"""SQLModel mirror of the ``landlord_construction_age_band_overrides`` Drizzle table.
The schema source of truth lives in the ``assessment-model`` TS repo
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
this row class only mirrors the columns so the Python lambda can read/write.
See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only
differences are the table name, the ``construction_age_band`` pgEnum on
``value``, and the unique-constraint name.
"""
from datetime import datetime, timezone
from typing import ClassVar
from uuid import UUID, uuid4
from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
class LandlordConstructionAgeBandOverrideRow(SQLModel, table=True):
    """Row mapping for the ``landlord_construction_age_band_overrides`` table.

    Each row pairs a free-text landlord ``description`` within a portfolio with
    the ``ConstructionAgeBand`` it resolves to. The ``(portfolio_id,
    description)`` pair is declared unique.
    """

    __tablename__: ClassVar[str] = "landlord_construction_age_band_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
        # NB: shortened (drop the redundant ``_overrides``) to stay within
        # PostgreSQL's 63-char identifier limit -- the full
        # ``landlord_construction_age_band_overrides_portfolio_description_unique``
        # is 69 chars and would be silently truncated, diverging from Drizzle.
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_construction_age_band_portfolio_description_unique",
        ),
    )
    # Primary key; a fresh UUID4 is generated per instance when not supplied.
    id: UUID = Field(default_factory=uuid4, primary_key=True)
    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )
    # The landlord's raw description text; half of the unique key.
    description: str = Field(nullable=False)
    # Resolved age band. ``values_callable`` makes the database enum use the
    # members' ``.value`` strings (the letter codes) rather than member names.
    value: ConstructionAgeBand = Field(
        sa_column=Column(
            SAEnum(
                ConstructionAgeBand,
                name="construction_age_band",
                values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )
    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )
    # Both timestamps default to the current UTC time at instantiation.
    # NOTE(review): nothing in this class refreshes ``updated_at`` on UPDATE --
    # presumably the writer or the database handles that; confirm.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )

View file

@ -0,0 +1,69 @@
"""SQLModel mirror of the ``landlord_glazing_overrides`` Drizzle table.
The schema source of truth lives in the ``assessment-model`` TS repo
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
this row class only mirrors the columns so the Python lambda can read/write.
See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only
differences are the table name, the ``glazing`` pgEnum on ``value``, and the
unique-constraint name.
"""
from datetime import datetime, timezone
from typing import ClassVar
from uuid import UUID, uuid4
from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.property_overrides.glazing_type import GlazingType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
class LandlordGlazingOverrideRow(SQLModel, table=True):
    """Row mapping for the ``landlord_glazing_overrides`` table.

    Each row pairs a free-text landlord ``description`` within a portfolio with
    the ``GlazingType`` it resolves to. The ``(portfolio_id, description)``
    pair is declared unique.
    """

    __tablename__: ClassVar[str] = "landlord_glazing_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_glazing_overrides_portfolio_description_unique",
        ),
    )
    # Primary key; a fresh UUID4 is generated per instance when not supplied.
    id: UUID = Field(default_factory=uuid4, primary_key=True)
    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )
    # The landlord's raw description text; half of the unique key.
    description: str = Field(nullable=False)
    # Resolved glazing type. ``values_callable`` makes the database enum use
    # the members' ``.value`` strings rather than member names.
    value: GlazingType = Field(
        sa_column=Column(
            SAEnum(
                GlazingType,
                name="glazing",
                values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )
    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )
    # Both timestamps default to the current UTC time at instantiation.
    # NOTE(review): nothing in this class refreshes ``updated_at`` on UPDATE --
    # presumably the writer or the database handles that; confirm.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )

View file

@ -0,0 +1,69 @@
"""SQLModel mirror of the ``landlord_main_fuel_overrides`` Drizzle table.
The schema source of truth lives in the ``assessment-model`` TS repo
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
this row class only mirrors the columns so the Python lambda can read/write.
See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only
differences are the table name, the ``main_fuel`` pgEnum on ``value``, and
the unique-constraint name.
"""
from datetime import datetime, timezone
from typing import ClassVar
from uuid import UUID, uuid4
from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.property_overrides.main_fuel_type import MainFuelType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
class LandlordMainFuelOverrideRow(SQLModel, table=True):
    """Row mapping for the ``landlord_main_fuel_overrides`` table.

    Each row pairs a free-text landlord ``description`` within a portfolio with
    the ``MainFuelType`` it resolves to. The ``(portfolio_id, description)``
    pair is declared unique.
    """

    __tablename__: ClassVar[str] = "landlord_main_fuel_overrides" # pyright: ignore[reportIncompatibleVariableOverride]
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride]
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_main_fuel_overrides_portfolio_description_unique",
        ),
    )
    # Primary key; a fresh UUID4 is generated per instance when not supplied.
    id: UUID = Field(default_factory=uuid4, primary_key=True)
    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )
    # The landlord's raw description text; half of the unique key.
    description: str = Field(nullable=False)
    # Resolved main fuel. ``values_callable`` makes the database enum use the
    # members' ``.value`` strings rather than member names.
    value: MainFuelType = Field(
        sa_column=Column(
            SAEnum(
                MainFuelType,
                name="main_fuel",
                values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )
    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )
    # Both timestamps default to the current UTC time at instantiation.
    # NOTE(review): nothing in this class refreshes ``updated_at`` on UPDATE --
    # presumably the writer or the database handles that; confirm.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )

View file

@ -0,0 +1,71 @@
"""SQLModel mirror of the ``landlord_main_heating_system_overrides`` Drizzle table.
The schema source of truth lives in the ``assessment-model`` TS repo
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
this row class only mirrors the columns so the Python lambda can read/write.
See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only
differences are the table name, the ``main_heating_system`` pgEnum on ``value``,
and the unique-constraint name.
"""
from datetime import datetime, timezone
from typing import ClassVar
from uuid import UUID, uuid4
from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
class LandlordMainHeatingSystemOverrideRow(SQLModel, table=True):
    """One landlord main-heating-system override row.

    Maps to ``landlord_main_heating_system_overrides``: a
    ``MainHeatingSystemType`` value stored against a (portfolio_id,
    description) pair, plus the override source and creation/update timestamps.
    """

    __tablename__: ClassVar[str] = "landlord_main_heating_system_overrides"  # pyright: ignore[reportIncompatibleVariableOverride]
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = (  # pyright: ignore[reportIncompatibleVariableOverride]
        # A given description may appear at most once per portfolio.
        # Shortened (drop the redundant ``_overrides``) to stay within
        # PostgreSQL's 63-char identifier limit; mirrors the Drizzle name.
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_main_heating_system_portfolio_description_unique",
        ),
    )
    # Primary key; a random UUID4 is generated in Python when none is supplied.
    id: UUID = Field(default_factory=uuid4, primary_key=True)
    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )
    # Second half of the unique key. Presumably the landlord's free-text
    # description that the override applies to -- confirm against the writer.
    description: str = Field(nullable=False)
    # The override value itself, stored in the ``main_heating_system`` enum type.
    value: MainHeatingSystemType = Field(
        sa_column=Column(
            SAEnum(
                MainHeatingSystemType,
                name="main_heating_system",
                # Hand SQLAlchemy each member's ``.value`` as the stored label
                # instead of its default of the member names.
                values_callable=lambda cls: [m.value for m in cls],  # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )
    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )
    # Defaults to the current UTC time when the row object is constructed.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    # NOTE(review): same construction-time default as ``created_at``; no
    # ``onupdate`` hook is declared here, so it only changes when the writer
    # sets it explicitly -- confirm the upsert path does so.
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )

View file

@ -14,7 +14,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.property_type import PropertyType
from domain.epc.property_overrides.property_type import PropertyType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum

View file

@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.roof_type import RoofType
from domain.epc.property_overrides.roof_type import RoofType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum

View file

@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.wall_type import WallType
from domain.epc.property_overrides.wall_type import WallType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum

View file

@ -0,0 +1,69 @@
"""SQLModel mirror of the ``landlord_water_heating_overrides`` Drizzle table.
The schema source of truth lives in the ``assessment-model`` TS repo
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
this row class only mirrors the columns so the Python lambda can read/write.
See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only
differences are the table name, the ``water_heating`` pgEnum on ``value``, and
the unique-constraint name.
"""
from datetime import datetime, timezone
from typing import ClassVar
from uuid import UUID, uuid4
from sqlalchemy import BigInteger, Column, UniqueConstraint
from sqlalchemy import Enum as SAEnum
from sqlmodel import Field, SQLModel
from domain.epc.property_overrides.water_heating_type import WaterHeatingType
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
class LandlordWaterHeatingOverrideRow(SQLModel, table=True):
    """One landlord water-heating override row.

    Maps to ``landlord_water_heating_overrides``: a ``WaterHeatingType`` value
    stored against a (portfolio_id, description) pair, plus the override
    source and creation/update timestamps.
    """

    __tablename__: ClassVar[str] = "landlord_water_heating_overrides"  # pyright: ignore[reportIncompatibleVariableOverride]
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = (  # pyright: ignore[reportIncompatibleVariableOverride]
        # A given description may appear at most once per portfolio.
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_water_heating_overrides_portfolio_description_unique",
        ),
    )
    # Primary key; a random UUID4 is generated in Python when none is supplied.
    id: UUID = Field(default_factory=uuid4, primary_key=True)
    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )
    # Second half of the unique key. Presumably the landlord's free-text
    # description that the override applies to -- confirm against the writer.
    description: str = Field(nullable=False)
    # The override value itself, stored in the ``water_heating`` enum type.
    value: WaterHeatingType = Field(
        sa_column=Column(
            SAEnum(
                WaterHeatingType,
                name="water_heating",
                # Hand SQLAlchemy each member's ``.value`` as the stored label
                # instead of its default of the member names.
                values_callable=lambda cls: [m.value for m in cls],  # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )
    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )
    # Defaults to the current UTC time when the row object is constructed.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    # NOTE(review): same construction-time default as ``created_at``; no
    # ``onupdate`` hook is declared here, so it only changes when the writer
    # sets it explicitly -- confirm the upsert path does so.
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )

View file

@ -27,6 +27,11 @@ override_component_sa_enum = SAEnum(
"roof_type",
"property_type",
"built_form_type",
"main_fuel",
"glazing",
"construction_age_band",
"water_heating",
"main_heating_system",
name="override_component",
)

View file

@ -14,10 +14,10 @@ from typing import Any, Optional
from uuid import UUID
from domain.epc.built_form_type import BuiltFormType
from domain.epc.property_type import PropertyType
from domain.epc.roof_type import RoofType
from domain.epc.wall_type import WallType
from domain.epc.property_overrides.built_form_type import BuiltFormType
from domain.epc.property_overrides.property_type import PropertyType
from domain.epc.property_overrides.roof_type import RoofType
from domain.epc.property_overrides.wall_type import WallType
from repositories.bulk_upload.bulk_upload_status_writer import BulkUploadStatusWriter
from repositories.landlord_overrides.landlord_override_reader import (
LandlordOverrideReader,

View file

@ -29,6 +29,17 @@ from domain.epc.property_overlays.attribute_overlay import (
built_form_overlay_for,
property_type_overlay_for,
)
from domain.epc.property_overlays.construction_age_band_overlay import (
age_band_overlay_for,
)
from domain.epc.property_overlays.glazing_overlay import glazing_overlay_for
from domain.epc.property_overlays.main_fuel_overlay import fuel_overlay_for
from domain.epc.property_overlays.main_heating_system_overlay import (
main_heating_overlay_for,
)
from domain.epc.property_overlays.water_heating_overlay import (
water_heating_overlay_for,
)
from domain.epc.property_overlays.roof_type_overlay import roof_overlay_for
from domain.epc.property_overlays.wall_type_overlay import wall_overlay_for
from domain.modelling.simulation import EpcSimulation
@ -43,6 +54,11 @@ _COMPONENT_OVERLAYS: dict[str, Callable[[str, int], Optional[EpcSimulation]]] =
"roof_type": roof_overlay_for,
"property_type": property_type_overlay_for,
"built_form_type": built_form_overlay_for,
"main_fuel": fuel_overlay_for,
"glazing": glazing_overlay_for,
"construction_age_band": age_band_overlay_for,
"water_heating": water_heating_overlay_for,
"main_heating_system": main_heating_overlay_for,
}

View file

@ -13,7 +13,7 @@ from __future__ import annotations
from typing import Optional
from domain.epc.override_code_mapping import (
from domain.epc.property_overrides.override_code_mapping import (
built_form_to_code,
property_type_to_code,
)

View file

@ -0,0 +1,353 @@
"""Fill the DOMNA columns in the AddressProfilingResults spreadsheet.
Input: scripts/manipulation(2).xlsx, sheet "AddressProfilingResults", columns
Organisation Reference | UPRN | DOMNA FOUND UPRN | DOMNA FOUND ADDRESS | Address | Postcode
Per-row rule ("if there's a UPRN in the UPRN column we're done"):
* UPRN present AND Address present -> nothing to do (already sorted).
* UPRN present AND Address missing -> reverse-lookup the address from the UPRN
via the EPC API -> DOMNA FOUND ADDRESS.
* UPRN missing AND Address present -> resolve a UPRN from address + postcode
(EPC API, then Ordnance Survey) -> writes
DOMNA FOUND UPRN + DOMNA FOUND ADDRESS.
* not resolvable -> marked "NOT FOUND" and listed in the
unresolved report.
Relaxed matching (this batch only; production AddressMatch is untouched): the
landlord writes flats as "3 GLADYS COURT" while EPC stores "Flat 3 Gladys
Court", which the production matcher hard-rejects. So per address we try several
query variants (the full string, just the first comma-segment, and a
"Flat <n> ..." form) and keep the best-scoring, unambiguous match. The unit
number must still match exactly (AddressMatch zeroes mismatched numbers), so a
wrong-unit match stays unlikely. Each fill carries its score + source so you can
spot-check (DOMNA SCORE / DOMNA SOURCE).
Rows that already have a DOMNA FOUND UPRN are skipped (idempotent / resumable).
python -m scripts.fill_domna_addresses
python -m scripts.fill_domna_addresses --limit 200 # smoke test first N
Keys come from backend/.env (OPEN_EPC_API_TOKEN, ORDNANCE_SURVEY_API_KEY). Run
from the worktree root (import trap).
"""
from __future__ import annotations
import argparse
import os
import re
import sys
from pathlib import Path
from typing import Optional
import pandas as pd
_REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
from backend.address2UPRN.main import get_epc_data_with_postcode # noqa: E402
from backend.address2UPRN.scoring import all_uprns_match, rank_address_similarity # noqa: E402
from backend.ordnanceSurvey.helpers import ( # noqa: E402
lookup_os_places,
os_places_results_to_dataframe,
)
from backend.utils.addressMatch import AddressMatch # noqa: E402
from datatypes.epc.search import EpcSearchResult # noqa: E402
from infrastructure.epc_client.epc_client_service import EpcClientService # noqa: E402
from scripts.resolve_uprns_for_finaliser import clean_postcode, load_keys # noqa: E402
SHEET = "AddressProfilingResults"
UPRN_COL = "UPRN"
ADDRESS_COL = "Address"
POSTCODE_COL = "Postcode"
REF_COL = "Organisation Reference"
FOUND_UPRN_COL = "DOMNA FOUND UPRN"
FOUND_ADDRESS_COL = "DOMNA FOUND ADDRESS"
SCORE_COL = "DOMNA SCORE"
SOURCE_COL = "DOMNA SOURCE"
NOT_FOUND = "NOT FOUND"
# EPC matches are tight (short addresses) so we hold the production 0.7 bar; OS
# addresses carry more trailing tokens, so a slightly lower bar is appropriate.
EPC_THRESHOLD = 0.7
OS_THRESHOLD = 0.6
_DEFAULT_IN = _REPO_ROOT / "scripts" / "manipulation(2).xlsx"
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "manipulation_filled.xlsx"
_DEFAULT_UNRESOLVED = _REPO_ROOT / "scripts" / "manipulation_unresolved.csv"
# A resolved hit: (uprn, matched_address, score, source).
Hit = tuple[str, str, float, str]
def cell_str(value: object) -> str:
    """Return a spreadsheet cell as stripped text.

    ``None`` and any value whose text form is "nan" (case-insensitive, which
    is how a pandas NaN stringifies) both collapse to the empty string.
    """
    text = "" if value is None else str(value).strip()
    if text.lower() == "nan":
        return ""
    return text
def parse_uprn_cell(value: object) -> Optional[int]:
    """Read a UPRN cell that pandas loaded as float64 back into an int.

    Args:
        value: Raw cell content (float, int, str, NaN or ``None``).

    Returns:
        The UPRN as an ``int``, or ``None`` when the cell is blank or does not
        hold a finite number.
    """
    text = cell_str(value)
    if not text:
        return None
    try:
        # Go through float so "100012345.0" (pandas' float rendering) parses.
        return int(float(text))
    except (ValueError, OverflowError):
        # ValueError: not numeric. OverflowError: "inf"/"infinity" parse as a
        # float but cannot be converted to int -- treat both as "no UPRN".
        return None
def address_variants(address: str) -> list[str]:
    """Build the ordered list of query strings to try for one input address.

    The candidates are, in order: the whole (stripped) address, its first
    comma-separated segment, and -- only when that segment starts with a
    digit -- "Flat " + segment and "Flat " + whole address. This bridges the
    landlord form "3 GLADYS COURT, 260 REIGATE ROAD" and the EPC form
    "Flat 3 Gladys Court".

    Empty candidates are dropped and duplicates are removed
    case-insensitively, keeping the first spelling seen.
    """
    whole = address.strip()
    head_segment = whole.split(",")[0].strip()

    candidates = [whole, head_segment]
    if re.match(r"^\d", head_segment):  # starts with a unit/house number
        candidates += ["Flat " + head_segment, "Flat " + whole]

    # dict preserves insertion order, so setdefault keeps the first spelling
    # for each lower-cased key.
    unique: dict[str, str] = {}
    for candidate in candidates:
        if candidate:
            unique.setdefault(candidate.lower(), candidate)
    return list(unique.values())
def resolve_epc_relaxed(
    address: str,
    postcode_clean: str,
    epc_cache: dict[str, pd.DataFrame],
    threshold: float = EPC_THRESHOLD,
) -> Optional[Hit]:
    """Pick the best unambiguous EPC match over all variants of ``address``.

    The EPC rows for ``postcode_clean`` are fetched once and memoised in
    ``epc_cache``. Each address variant is ranked against those rows; a
    variant can only replace the current winner if its top score is strictly
    higher, all rank-1 rows agree on a single UPRN, and that UPRN is not
    blank/"nan".

    Returns:
        ``(uprn, matched_address, score, "epc")`` when the winning score is at
        least ``threshold``, otherwise ``None``.
    """
    candidates = epc_cache.get(postcode_clean)
    if candidates is None:
        candidates = get_epc_data_with_postcode(postcode=postcode_clean)
        epc_cache[postcode_clean] = candidates
    if candidates.empty:
        return None

    winner: Optional[Hit] = None
    for query in address_variants(address):
        ranked = rank_address_similarity(candidates, user_address=query)
        if ranked.empty:
            continue
        head = ranked.iloc[0]
        top_score = float(head["lexiscore"])
        # Only a strictly better score can displace the current winner.
        if winner is not None and top_score <= winner[2]:
            continue
        leaders = ranked[ranked["lexirank"] == 1]
        leader_uprn = leaders.iloc[0]["uprn"]
        # rank-1 rows must agree on one UPRN, else it's ambiguous — skip.
        if not all_uprns_match(leaders, leader_uprn):
            continue
        uprn_text = str(leader_uprn)
        if uprn_text in ("", "nan"):
            continue
        winner = (uprn_text, str(head["address"]), top_score, "epc")

    if winner is not None and winner[2] >= threshold:
        return winner
    return None
def resolve_os_relaxed(
    address: str,
    postcode_clean: str,
    os_api_key: str,
    os_cache: dict[str, pd.DataFrame],
    threshold: float = OS_THRESHOLD,
) -> Optional[Hit]:
    """Best OS Places match across the address variants (cached per postcode).

    Args:
        address: Landlord-supplied address text.
        postcode_clean: Normalised postcode used for the lookup and as the
            cache key.
        os_api_key: Ordnance Survey API key passed to ``lookup_os_places``.
        os_cache: Per-postcode memo of OS results. A failed lookup is cached
            as an empty frame, so it is not retried within the run.
        threshold: Minimum ``AddressMatch.score`` for a hit to be returned.

    Returns:
        ``(uprn, matched_address, score, "ordnance_survey")`` for the
        highest-scoring candidate that carries a usable UPRN, or ``None`` when
        nothing reaches ``threshold``.
    """
    places_df = os_cache.get(postcode_clean)
    if places_df is None:
        response = lookup_os_places(postcode_clean, os_api_key)
        if response.get("status") == 200 and "data" in response:
            places_df = os_places_results_to_dataframe(response["data"])
        else:
            places_df = pd.DataFrame()
        os_cache[postcode_clean] = places_df
    if places_df.empty or "ADDRESS" not in places_df.columns:
        return None

    # Extract (uprn, address) once instead of once per variant, and drop
    # records without a usable UPRN: a "hit" with a blank/NaN UPRN would be
    # written to the sheet as resolved while carrying no identifier. This
    # mirrors the blank/"nan" guard in resolve_epc_relaxed.
    candidates: list[tuple[str, str]] = []
    for rec in places_df.to_dict(orient="records"):
        uprn = str(rec.get("UPRN", "")).strip()
        if uprn.lower() in ("", "nan", "none"):
            continue
        candidates.append((uprn, str(rec.get("ADDRESS", ""))))

    best: Optional[Hit] = None
    for variant in address_variants(address):
        for uprn, candidate in candidates:
            score = AddressMatch.score(variant, candidate)
            if best is None or score > best[2]:
                best = (uprn, candidate, score, "ordnance_survey")
    return best if best is not None and best[2] >= threshold else None
def _address_from_search(result: EpcSearchResult) -> str:
parts = [
result.address_line_1,
result.address_line_2,
result.address_line_3,
result.address_line_4,
result.post_town,
]
return ", ".join(p.strip() for p in parts if p and p.strip())
def reverse_address_from_uprn(
    uprn: int,
    postcode_clean: str,
    service: EpcClientService,
    search_cache: dict[str, list[EpcSearchResult]],
) -> Optional[str]:
    """Look up the EPC address that belongs to an already-known UPRN.

    The postcode search is run at most once per postcode (memoised in
    ``search_cache``). The first result whose ``uprn`` converts to the same
    integer as ``uprn`` is formatted via ``_address_from_search``; ``None``
    is returned when no result matches.
    """
    hits = search_cache.get(postcode_clean)
    if hits is None:
        hits = service.search_by_postcode(postcode_clean)
        search_cache[postcode_clean] = hits

    matching = (
        hit for hit in hits if hit.uprn is not None and int(hit.uprn) == uprn
    )
    found = next(matching, None)
    if found is None:
        return None
    return _address_from_search(found)
def fill(df: pd.DataFrame, *, os_api_key: Optional[str]) -> list[dict[str, str]]:
    """Fill the DOMNA columns in place. Returns the unresolved rows.

    Each row is handled by exactly one of these paths:

    * UPRN and address both present, or a DOMNA FOUND UPRN already written
      (other than "NOT FOUND"): counted as skipped.
    * UPRN present, address missing ("Case B"): the address is looked up from
      the UPRN's postcode via the EPC search service.
    * UPRN missing, address present ("Case A"): a UPRN is resolved from the
      EPC data first and, if that yields nothing, from OS Places.
    * Neither present ("Case C"): marked not found.

    Args:
        df: The sheet as a DataFrame; mutated in place. Missing DOMNA columns
            are created.
        os_api_key: Ordnance Survey key; the OS fallback is skipped when falsy.

    Returns:
        One dict per row that could not be resolved, with the keys
        "Organisation Reference", "reason", "Address" and "Postcode".
    """
    # Create any missing output columns, then force the two text outputs to
    # object dtype so string values can be written into them.
    for col in (FOUND_UPRN_COL, FOUND_ADDRESS_COL, SCORE_COL, SOURCE_COL):
        if col not in df.columns:
            df[col] = ""
    df[FOUND_UPRN_COL] = df[FOUND_UPRN_COL].astype("object")
    df[FOUND_ADDRESS_COL] = df[FOUND_ADDRESS_COL].astype("object")
    # Without an EPC token both the reverse lookup and the EPC forward
    # resolution are skipped (see the ``service`` / ``token`` checks below).
    token = os.environ.get("OPEN_EPC_API_TOKEN")
    service = EpcClientService(auth_token=token) if token else None
    # Per-postcode memoisation so each postcode hits each API at most once.
    epc_cache: dict[str, pd.DataFrame] = {}
    os_cache: dict[str, pd.DataFrame] = {}
    search_cache: dict[str, list[EpcSearchResult]] = {}
    unresolved: list[dict[str, str]] = []
    resolved_uprn = resolved_addr = skipped = 0
    total = len(df)
    for n, idx in enumerate(df.index, start=1):
        ref = cell_str(df.at[idx, REF_COL])
        given_uprn = parse_uprn_cell(df.at[idx, UPRN_COL])
        address = cell_str(df.at[idx, ADDRESS_COL])
        postcode_raw = cell_str(df.at[idx, POSTCODE_COL])
        postcode_clean = clean_postcode(postcode_raw)
        # Already sorted (UPRN + address) or already filled by a prior run.
        if given_uprn is not None and address:
            skipped += 1
            continue
        if cell_str(df.at[idx, FOUND_UPRN_COL]) and cell_str(df.at[idx, FOUND_UPRN_COL]) != NOT_FOUND:
            skipped += 1
            continue

        # Closure over this iteration's idx/ref/address/etc.; it is only ever
        # called within the same iteration, so late binding is not an issue.
        def mark_not_found(reason: str) -> None:
            # Leave the found-UPRN cell blank when the row already has a UPRN
            # (only the address is missing in that case).
            df.at[idx, FOUND_UPRN_COL] = NOT_FOUND if given_uprn is None else ""
            df.at[idx, FOUND_ADDRESS_COL] = NOT_FOUND
            df.at[idx, SOURCE_COL] = "not_found"
            unresolved.append(
                {
                    "Organisation Reference": ref,
                    "reason": reason,
                    "Address": address,
                    "Postcode": postcode_raw,
                }
            )

        # Case B — UPRN present, address missing: reverse-lookup the address.
        if given_uprn is not None and not address:
            found: Optional[str] = None
            if service is not None and postcode_clean:
                try:
                    found = reverse_address_from_uprn(
                        given_uprn, postcode_clean, service, search_cache
                    )
                except Exception as exc:
                    # Best effort: a failed lookup is logged and the row falls
                    # through to "not found" instead of aborting the batch.
                    print(f" reverse failed {ref} {given_uprn}: {exc}")
            if found:
                df.at[idx, FOUND_ADDRESS_COL] = found
                df.at[idx, SOURCE_COL] = "epc_reverse"
                resolved_addr += 1
            else:
                mark_not_found("no address for UPRN")
            continue
        # Case A — no UPRN, has address: resolve a UPRN.
        if given_uprn is None and address:
            if not postcode_clean:
                mark_not_found("no postcode")
                continue
            hit: Optional[Hit] = None
            if token:
                try:
                    hit = resolve_epc_relaxed(address, postcode_clean, epc_cache)
                except Exception as exc:
                    print(f" EPC failed {ref} {postcode_clean}: {exc}")
            # OS Places is only consulted when EPC produced no hit.
            if hit is None and os_api_key:
                try:
                    hit = resolve_os_relaxed(address, postcode_clean, os_api_key, os_cache)
                except Exception as exc:
                    print(f" OS failed {ref} {postcode_clean}: {exc}")
            if hit is not None:
                uprn, matched, score, source = hit
                df.at[idx, FOUND_UPRN_COL] = uprn
                df.at[idx, FOUND_ADDRESS_COL] = matched
                df.at[idx, SCORE_COL] = round(score, 4)
                df.at[idx, SOURCE_COL] = source
                resolved_uprn += 1
            else:
                mark_not_found("no UPRN match")
            # NOTE(review): progress is only printed from this branch, so a
            # multiple-of-100 row that was skipped or handled as Case B/C
            # produces no progress line.
            if n % 100 == 0:
                print(
                    f"[{n}/{total}] resolved={resolved_uprn} not_found={len(unresolved)}"
                )
            continue
        # Case C — neither a UPRN nor an address.
        mark_not_found("no UPRN and no address")
    print(
        f"\nResolved {resolved_uprn} UPRNs, {resolved_addr} addresses; "
        f"{skipped} already sorted/done; {len(unresolved)} not found."
    )
    return unresolved
def _parse_args() -> argparse.Namespace:
    """Parse the command line: input/output/unresolved paths and a row limit."""
    cli = argparse.ArgumentParser(description=__doc__)
    # The three path options differ only in flag name, dest and default.
    path_options = (
        ("--in", {"dest": "inp", "default": _DEFAULT_IN}),
        ("--out", {"default": _DEFAULT_OUT}),
        ("--unresolved", {"default": _DEFAULT_UNRESOLVED}),
    )
    for flag, extra in path_options:
        cli.add_argument(flag, type=Path, **extra)
    cli.add_argument("--limit", type=int, default=None, help="process first N rows")
    return cli.parse_args()
def main() -> int:
    """Load the sheet, fill the DOMNA columns, and write the results.

    Writes the filled sheet to ``--out`` and, when any rows were unresolved,
    a CSV of them to ``--unresolved``. Always returns exit code 0.
    """
    cli = _parse_args()
    # Only the OS key is used here; ``fill`` reads the EPC token from the
    # environment itself.
    _unused_epc_token, os_key = load_keys()

    sheet = pd.read_excel(cli.inp, sheet_name=SHEET)
    if cli.limit is not None:
        sheet = sheet.head(cli.limit).copy()
    print(f"Loaded {len(sheet)} rows from {cli.inp} [{SHEET}]")

    misses = fill(sheet, os_api_key=os_key)

    sheet.to_excel(cli.out, sheet_name=SHEET, index=False)
    print(f"Wrote filled sheet -> {cli.out}")
    if not misses:
        return 0
    pd.DataFrame(misses).to_csv(cli.unresolved, index=False)
    print(f"Wrote {len(misses)} unresolved rows -> {cli.unresolved}")
    return 0
if __name__ == "__main__":
sys.exit(main())

View file

@ -0,0 +1,331 @@
"""Insert resolved manipulation_filled rows into the FE-owned ``property`` table.
Reuses the bulk_upload_finaliser's own row->PropertyIdentityInsert mapping
(``BulkUploadFinaliserOrchestrator._row_to_insert``) and the same
``PropertyPostgresRepository.insert_all`` the Lambda uses, so a row inserted here
is identical to one the real finaliser would write. The status-writer /
property_overrides path is skipped; this only populates ``property`` (no
BulkUpload task needed).
Insert is ON CONFLICT (portfolio_id, uprn) DO NOTHING, so re-running is safe.
# one random resolved row into portfolio 796, then read it back
python -m scripts.finalise_to_property_table --portfolio 796 --one
# a specific Organisation Reference
python -m scripts.finalise_to_property_table --portfolio 796 --ref 56100000101
# the whole sheet (resolved rows only by default; --include-unmatched to add
# null-UPRN rows too)
python -m scripts.finalise_to_property_table --portfolio 796 --all
Postgres target comes from the root .env (POSTGRES_*). Run from the worktree root.
"""
from __future__ import annotations
import argparse
import os
import sys
from pathlib import Path
from typing import Optional
import pandas as pd
from dotenv import load_dotenv
from sqlmodel import select
_REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
from infrastructure.postgres.config import PostgresConfig # noqa: E402
from infrastructure.postgres.engine import commit_scope, make_engine, make_session # noqa: E402
from infrastructure.postgres.property_table import PropertyRow # noqa: E402
from orchestration.bulk_upload_finaliser_orchestrator import ( # noqa: E402
BulkUploadFinaliserOrchestrator,
)
from repositories.property.property_postgres_repository import ( # noqa: E402
PropertyPostgresRepository,
)
from repositories.property.property_repository import PropertyIdentityInsert # noqa: E402
from scripts.fill_domna_addresses import ( # noqa: E402
ADDRESS_COL,
FOUND_ADDRESS_COL,
FOUND_UPRN_COL,
POSTCODE_COL,
REF_COL,
SCORE_COL,
SHEET,
UPRN_COL,
NOT_FOUND,
cell_str,
parse_uprn_cell,
)
_DEFAULT_IN = _REPO_ROOT / "scripts" / "manipulation_filled.xlsx"
def _final_uprn(row: pd.Series) -> Optional[int]:
    """Return the row's authoritative UPRN, or ``None`` if it has none.

    The landlord-supplied UPRN wins; otherwise the DOMNA-found UPRN is used
    unless that cell is blank or holds the NOT FOUND marker.
    """
    supplied = parse_uprn_cell(row.get(UPRN_COL))
    if supplied is not None:
        return supplied
    resolved = cell_str(row.get(FOUND_UPRN_COL))
    if not resolved or resolved == NOT_FOUND:
        return None
    return parse_uprn_cell(resolved)
def to_combiner_row(row: pd.Series) -> dict[str, str]:
    """Translate one spreadsheet row into the combiner-output dict.

    The matched address is the DOMNA-found address when there is one (the
    NOT FOUND marker counts as none); failing that, the landlord's own address
    is used, but only for rows that came with a UPRN. All values are strings,
    with "" standing in for anything missing.
    """
    landlord_address = cell_str(row.get(ADDRESS_COL))
    has_given_uprn = parse_uprn_cell(row.get(UPRN_COL)) is not None
    final = _final_uprn(row)

    resolved_address = cell_str(row.get(FOUND_ADDRESS_COL))
    if resolved_address == NOT_FOUND:
        resolved_address = ""

    if resolved_address:
        matched_address = resolved_address
    elif has_given_uprn:
        # Rows that already had a UPRN + address keep their given address.
        matched_address = landlord_address
    else:
        matched_address = ""

    combiner: dict[str, str] = {}
    combiner["Address 1"] = landlord_address
    combiner["Address 2"] = ""
    combiner["Address 3"] = ""
    combiner["postcode"] = cell_str(row.get(POSTCODE_COL))
    combiner["Internal Reference"] = cell_str(row.get(REF_COL))
    combiner["address2uprn_uprn"] = str(final) if final is not None else ""
    combiner["address2uprn_address"] = matched_address
    combiner["address2uprn_lexiscore"] = cell_str(row.get(SCORE_COL))
    return combiner
def load_rows(
    path: Path, *, include_unmatched: bool
) -> tuple[pd.DataFrame, list[dict[str, str]]]:
    """Read the sheet and build one combiner row per kept spreadsheet row.

    Unless ``include_unmatched`` is set, rows for which no final UPRN can be
    determined are filtered out first. Returns the (filtered, re-indexed)
    frame together with the combiner rows, in the same order.
    """
    sheet = pd.read_excel(path, sheet_name=SHEET).reset_index(drop=True)

    if not include_unmatched:

        def has_uprn(record: pd.Series) -> bool:
            return _final_uprn(record) is not None

        mask = sheet.apply(has_uprn, axis=1)
        sheet = sheet[mask].reset_index(drop=True)

    combiner_rows: list[dict[str, str]] = []
    for _, record in sheet.iterrows():
        combiner_rows.append(to_combiner_row(record))
    return sheet, combiner_rows
def dedupe_by_uprn(
    rows: list[dict[str, str]],
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Keep the first row per UPRN; return (kept, dropped collisions).

    The DB INSERT collapses duplicate (portfolio, uprn) via ON CONFLICT DO
    NOTHING anyway, so this just makes the collision explicit (the dropped rows
    are written out for review) rather than letting an arbitrary ref win silently.

    Rows with no UPRN (``address2uprn_uprn == ""``, which only occur with
    ``--include-unmatched``) are always kept: they are distinct properties
    that merely lack an identifier, not duplicates of one another.

    Args:
        rows: Combiner rows, each carrying an ``address2uprn_uprn`` key.

    Returns:
        ``(kept, dropped)``, both in input order.
    """
    seen: set[str] = set()
    kept: list[dict[str, str]] = []
    dropped: list[dict[str, str]] = []
    for row in rows:
        uprn = row["address2uprn_uprn"]
        if not uprn:
            # No UPRN -> nothing to collide on.
            kept.append(row)
        elif uprn in seen:
            dropped.append(row)
        else:
            seen.add(uprn)
            kept.append(row)
    return kept, dropped
# Force-reload teardown order (bottom-up). property_overrides is ON DELETE
# CASCADE so it clears itself when the property goes; everything below is NO
# ACTION and must be deleted first, deepest child first.
# property -> epc_property -> {these children}
_EPC_CHILD_TABLES = (
"epc_energy_element",
"epc_window",
"epc_main_heating_detail",
"epc_renewable_heat_incentive",
"epc_building_part",
"epc_flat_details",
)
# property -> {these direct dependents}, deleted after the epc children
_PROPERTY_DEPENDENTS = ("epc_property", "plan")
_INSERT_CHUNK = 4000 # 9 cols/row -> well under psycopg2's 65535-param limit
def _reset_portfolio(session: object, portfolio_id: int) -> int:
    """Remove every property of a portfolio together with its dependants.

    Deletion runs bottom-up: the ``_EPC_CHILD_TABLES`` rows first, then the
    ``_PROPERTY_DEPENDENTS`` rows, and finally the ``property`` rows. The
    portfolio id is always passed as a bound parameter; only the table names
    (module constants) are interpolated into the SQL.

    Returns the number of property rows deleted (property_overrides cascade).
    """
    from sqlalchemy import text

    def run(sql: str):
        return session.execute(  # type: ignore[attr-defined]
            text(sql), {"pid": portfolio_id}
        )

    property_ids_sql = "SELECT id FROM property WHERE portfolio_id = :pid"
    epc_ids_sql = (
        f"SELECT id FROM epc_property WHERE property_id IN ({property_ids_sql})"
    )

    for child in _EPC_CHILD_TABLES:
        run(f"DELETE FROM {child} WHERE epc_property_id IN ({epc_ids_sql})")
    for dependent in _PROPERTY_DEPENDENTS:
        run(f"DELETE FROM {dependent} WHERE property_id IN ({property_ids_sql})")

    outcome = run("DELETE FROM property WHERE portfolio_id = :pid")
    return outcome.rowcount
def clean_reload(
    rows: list[dict[str, str]], portfolio_id: int, *, reset: bool
) -> tuple[int, int]:
    """Insert ``rows`` in chunks, optionally wiping the portfolio first.

    The optional reset and every chunked insert share one ``commit_scope``,
    so they commit together. The session is always closed.

    Returns (properties_deleted, properties_inserted).
    """
    to_insert = BulkUploadFinaliserOrchestrator._row_to_insert
    payload: list[PropertyIdentityInsert] = [
        to_insert(row, portfolio_id) for row in rows
    ]
    engine = _engine()
    session = make_session(engine)
    removed = 0
    added = 0
    try:
        repository = PropertyPostgresRepository(session)
        with commit_scope(session):
            if reset:
                removed = _reset_portfolio(session, portfolio_id)
            offset = 0
            while offset < len(payload):
                batch = payload[offset : offset + _INSERT_CHUNK]
                added += repository.insert_all(batch)
                offset += _INSERT_CHUNK
    finally:
        session.close()
    return removed, added
def _engine():
    """Build a Postgres engine configured from the repo-root ``.env`` file."""
    dotenv_path = _REPO_ROOT / ".env"
    load_dotenv(dotenv_path)
    config = PostgresConfig.from_env(os.environ)
    return make_engine(config)
def insert_rows(rows: list[dict[str, str]], portfolio_id: int) -> int:
    """Map ``rows`` with the finaliser's mapper and insert them in one commit.

    Returns whatever the repository's ``insert_all`` reports as inserted.
    """
    to_insert = BulkUploadFinaliserOrchestrator._row_to_insert
    payload: list[PropertyIdentityInsert] = [
        to_insert(row, portfolio_id) for row in rows
    ]
    engine = _engine()
    session = make_session(engine)
    try:
        repository = PropertyPostgresRepository(session)
        with commit_scope(session):
            count = repository.insert_all(payload)
    finally:
        session.close()
    return count
def fetch_by_ref(portfolio_id: int, ref: str) -> list[PropertyRow]:
    """Return the ``property`` rows of a portfolio with the given landlord ref.

    Used to read back what was just inserted; the session is always closed.
    """
    in_portfolio = PropertyRow.portfolio_id == portfolio_id
    has_ref = PropertyRow.landlord_property_id == ref
    session = make_session(_engine())
    try:
        query = select(PropertyRow).where(in_portfolio, has_ref)
        found = session.exec(query).all()
        return list(found)
    finally:
        session.close()
def _show(row: dict[str, str], insert: PropertyIdentityInsert) -> None:
print("\nSource (combiner) row:")
for k, v in row.items():
print(f" {k}: {v!r}")
print("\nMapped PropertyIdentityInsert:")
for k, v in insert.__dict__.items():
print(f" {k}: {v!r}")
def _parse_args() -> argparse.Namespace:
    """Parse the command line.

    ``--portfolio`` is mandatory, and exactly one of ``--one`` / ``--ref`` /
    ``--all`` must be chosen; everything else is optional.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    cli.add_argument("--portfolio", type=int, required=True)

    # Exactly one selection mode per invocation.
    mode = cli.add_mutually_exclusive_group(required=True)
    mode.add_argument("--one", action="store_true", help="one random resolved row")
    mode.add_argument("--ref", help="a specific Organisation Reference")
    mode.add_argument("--all", action="store_true", help="every row")

    cli.add_argument(
        "--include-unmatched",
        action="store_true",
        help="also insert rows with no UPRN (null-UPRN property rows)",
    )
    reset_help = (
        "(with --all) DELETE all properties in the portfolio first "
        "(cascades property_overrides; clears plan/epc_property)"
    )
    cli.add_argument("--reset", action="store_true", help=reset_help)
    collisions_default = _REPO_ROOT / "scripts" / "manipulation_collisions.csv"
    cli.add_argument(
        "--collisions",
        type=Path,
        default=collisions_default,
        help="where to write rows dropped as duplicate-UPRN collisions",
    )
    cli.add_argument("--seed", type=int, default=0, help="random seed for --one")
    return cli.parse_args()
def main() -> int:
    """Insert rows from the filled sheet into ``property`` and report.

    ``--all`` dedupes by UPRN (writing collisions to ``--collisions``),
    optionally resets the portfolio, and bulk-inserts. ``--ref`` / ``--one``
    insert a single row, print its mapping, and read it back.

    Returns:
        0 on success; 1 when the requested ``--ref`` is not among the loaded
        rows, or when ``--one`` is used and there are no rows to pick from.
    """
    args = _parse_args()
    _, rows = load_rows(args.inp, include_unmatched=args.include_unmatched)
    print(f"Loaded {len(rows)} candidate rows from {args.inp}")
    if args.all:
        kept, dropped = dedupe_by_uprn(rows)
        if dropped:
            pd.DataFrame(dropped).to_csv(args.collisions, index=False)
            print(
                f"{len(dropped)} duplicate-UPRN rows dropped -> {args.collisions} "
                f"({len(kept)} unique to insert)"
            )
        deleted, inserted = clean_reload(kept, args.portfolio, reset=args.reset)
        if args.reset:
            print(f"Deleted {deleted} existing properties in portfolio {args.portfolio}.")
        print(f"Inserted {inserted} properties into portfolio {args.portfolio}.")
        return 0
    # Single-row paths: pick the row, show the mapping, insert, read back.
    if args.ref:
        match = [r for r in rows if r["Internal Reference"] == args.ref]
        if not match:
            print(f"No resolved row with Organisation Reference {args.ref!r}.")
            return 1
        row = match[0]
    else:  # --one: deterministic "random" pick via seed
        if not rows:
            # Guard the modulo below: an empty sheet would otherwise raise
            # ZeroDivisionError instead of a readable message.
            print("No candidate rows to pick from.")
            return 1
        idx = (args.seed * 7919) % len(rows)
        row = rows[idx]
    ref = row["Internal Reference"]
    insert = BulkUploadFinaliserOrchestrator._row_to_insert(row, args.portfolio)
    _show(row, insert)
    inserted = insert_rows([row], args.portfolio)
    print(
        f"\ninsert_all -> {inserted} new row(s) "
        f"(0 means it already existed; ON CONFLICT DO NOTHING)."
    )
    print(f"\nproperty rows for portfolio {args.portfolio}, ref {ref!r}:")
    for pr in fetch_by_ref(args.portfolio, ref):
        print(
            f" id={pr.id} uprn={pr.uprn} address={pr.address!r} "
            f"postcode={pr.postcode!r} status={pr.creation_status} "
            f"lexiscore={pr.lexiscore}"
        )
    return 0
if __name__ == "__main__":
sys.exit(main())

View file

@ -0,0 +1,44 @@
# Resume prompt — finish the Hyde portfolio-796 property_overrides run (after Khalim review)
Paste the block below to continue. It tells the assistant to review the unknown-override
decisions with me, verify them, confirm before writing, then run the remaining steps.
---
We paused the Hyde property-overrides bulk load to review the UNKNOWN classifications with
Khalim. Pick it back up.
**Context (already done):**
- Target is **portfolio 796** in DevAssessmentModelDB (NOT 795 — 795 is empty).
- Script: `scripts/hyde/build_property_overrides.py`. Pass 1 (`classify`) is DONE — the
`landlord_*_overrides` ledger is populated; re-running classify is free (cache hits).
- The 19 unresolved descriptions are documented in `scripts/hyde/unknowns_review.md`, with
proposed values already written to `overrides_edits.csv` (gitignored).
- Env (DB creds + `OPENAI_API_KEY`) is in `/workspaces/home/github/Model/.env`; load it with
python-dotenv and set `POSTGRES_DRIVER=psycopg2`. Writes are idempotent upserts (unique on
`property_id, override_component, building_part`) — safe to re-run, never duplicates.
**Do this, in order:**
1. **Ask me what Khalim decided** for the unknowns. The one real judgement call is the
flat-roof reading: `Flat: As Built` (1,172 rows) + `Flat: Unknown` (194) → which of
`Flat, no insulation (assumed)` / `Flat, insulated (assumed)` / `Flat, limited insulation
(assumed)`. The `construction_age_band` bands (29,829 rows) are deterministic (band = first
letter) — keep as-is unless I say otherwise. Confirm the other roof/wall proposals too.
2. **Update `overrides_edits.csv`** (`corrected_value` column) to match Khalim's decisions.
3. Run `validate --edits overrides_edits.csv` and fix anything it rejects.
4. **Show me the final edits + the planned write counts, and WAIT for my explicit go-ahead
before any `--apply`.** Do not write to the DB before I confirm.
5. On my go-ahead:
- `apply-edits --edits overrides_edits.csv --portfolio-id 796 --apply` (user corrections → ledger)
- `write --excel scripts/hyde/hyde_property_overrides.xlsx --portfolio-id 796` (DRY RUN —
report unmatched org_refs + unresolved across all 31,773 first)
- then the same `write ... --apply`
6. `verify --portfolio-id 796 --org-ref <org_ref>` — run it once per org_ref, for a few
org_refs (`verify` takes a single `--org-ref`) — to confirm property_overrides + overlays landed.
7. Remind me about the deferred **age-classifier prompt-hint fix** for the production lambda
(the live frontend will hit the same `"D: 1950-1966"` → UNKNOWN until that lands).
Every DB command loads env from `/workspaces/home/github/Model/.env`. Read-only checks
(`verify`, dry-run `write`) are fine to run unprompted; anything `--apply` needs my confirm.
---

View file

@ -0,0 +1,437 @@
"""Build ``property_overrides`` for a portfolio from the Hyde Excel, bypassing the
frontend + lambdas, using the ``landlord_*_overrides`` tables as the durable
classification ledger.
Why the ledger (not a throwaway cache): ``landlord_*_overrides`` stores
``(portfolio_id, description) -> value`` with a ``source`` (classifier|user).
* Re-runs classify only descriptions NOT already stored -> saves ChatGPT calls.
* Human corrections are stored as ``source=user`` and the classifier is
forbidden from overwriting them (ADR-0003) -> edits are permanent.
Then we resolve the vocab + match each row to a ``property.id`` by **org_ref**
(Excel "Organisation Reference" -> property.landlord_property_id) and upsert
``property_overrides`` (the fact layer the SAP overlay reads).
Subcommands:
list-values print each component's valid override values (reference)
classify --excel f --portfolio-id 795
PASS 1: classify cache-misses via ChatGPT,
upsert to landlord tables, write
overrides_unknowns.csv (with allowed_values)
validate --edits overrides_edits.csv
check a hand-edited file: every corrected_value
must be a valid enum value (suggests fixes)
apply-edits --edits overrides_edits.csv --portfolio-id 795 [--apply]
upsert validated corrections as source=user
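write --excel f --portfolio-id 795 [--apply]
PASS 2: build + upsert property_overrides from vocab
verify --portfolio-id 795 --org-ref REF
show one property's property_overrides rows and the
overlays they produce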
Env: POSTGRES_* (PostgresConfig.from_env) and OPENAI_API_KEY (ChatGPT).
"""
from __future__ import annotations
import argparse
import csv
import difflib
import logging
import os
from collections import Counter
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Optional
import pandas as pd # pyright: ignore[reportMissingTypeStubs]
from sqlalchemy import Table, text
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlmodel import SQLModel
from domain.epc.property_overrides.built_form_type import BuiltFormType
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
from domain.epc.property_overrides.glazing_type import GlazingType
from domain.epc.property_overrides.main_fuel_type import MainFuelType
from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType
from domain.epc.property_overrides.property_type import PropertyType
from domain.epc.property_overrides.roof_type import RoofType
from domain.epc.property_overrides.wall_type import WallType
from domain.epc.property_overrides.wall_type_construction_dates import (
wall_type_construction_date_prompt_hint,
)
from domain.epc.property_overrides.water_heating_type import WaterHeatingType
from infrastructure.chatgpt.chatgpt import ChatGPT
from infrastructure.chatgpt.chatgpt_column_classifier import ChatGptColumnClassifier
from infrastructure.landlord_overrides.landlord_override_reader_postgres_repository import (
LandlordOverrideReaderPostgresRepository,
)
from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import (
LandlordOverridesRepository,
)
from infrastructure.postgres.config import PostgresConfig
from infrastructure.postgres.engine import commit_scope, make_engine, make_session
from infrastructure.postgres.landlord_built_form_type_override_table import (
LandlordBuiltFormTypeOverrideRow,
)
from infrastructure.postgres.landlord_construction_age_band_override_table import (
LandlordConstructionAgeBandOverrideRow,
)
from infrastructure.postgres.landlord_glazing_override_table import (
LandlordGlazingOverrideRow,
)
from infrastructure.postgres.landlord_main_fuel_override_table import (
LandlordMainFuelOverrideRow,
)
from infrastructure.postgres.landlord_main_heating_system_override_table import (
LandlordMainHeatingSystemOverrideRow,
)
from infrastructure.postgres.landlord_override_enums import OverrideSource
from infrastructure.postgres.landlord_property_type_override_table import (
LandlordPropertyTypeOverrideRow,
)
from infrastructure.postgres.landlord_roof_type_override_table import (
LandlordRoofTypeOverrideRow,
)
from infrastructure.postgres.landlord_wall_type_override_table import (
LandlordWallTypeOverrideRow,
)
from infrastructure.postgres.landlord_water_heating_override_table import (
LandlordWaterHeatingOverrideRow,
)
from repositories.property.landlord_override_overlays import overlays_from
from repositories.property.property_override_postgres_repository import (
PropertyOverridePostgresRepository,
)
from repositories.property.property_override_repository import PropertyOverrideInsert
from repositories.property.property_overrides_postgres_reader import (
PropertyOverridesPostgresReader,
)
# CLI logging: INFO level, message text only (the format has no level/timestamp fields).
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("build_property_overrides")
# Excel column holding each row's org_ref; used to filter rows and to match
# them against property.landlord_property_id.
ORG_REF_COLUMN = "Organisation Reference"
# Review file written by `classify`; a relative path, so it resolves against
# the current working directory.
UNKNOWNS_PATH = "overrides_unknowns.csv"
@dataclass(frozen=True)
class ComponentSpec:
    """Immutable description of one override component and its Excel source."""

    component: str  # name used as the vocab key and as override_component
    enum_cls: type[Enum]  # enum whose member values are the valid overrides
    unknown: Enum  # sentinel member meaning "could not be classified"
    row_type: type[SQLModel]  # landlord_*_overrides row model for this component
    excel_header: str  # Excel column the descriptions are read from
    per_building_part: bool  # comma = building parts (wall/roof/age) vs whole-dwelling
    extra_instructions: Optional[str] = None  # passed through to the classifier

    def allowed_values(self) -> list[str]:
        """Return the sorted override values a human may pick (UNKNOWN excluded)."""
        choices = [
            member.value for member in self.enum_cls if member is not self.unknown
        ]
        choices.sort()
        return choices
def _component_specs() -> list[ComponentSpec]:
    """Return the spec for every override component, in processing order.

    Note that ``property_type`` and ``built_form_type`` both read the
    "Property Type" Excel column.
    """
    return [
        ComponentSpec(
            component="property_type",
            enum_cls=PropertyType,
            unknown=PropertyType.UNKNOWN,
            row_type=LandlordPropertyTypeOverrideRow,
            excel_header="Property Type",
            per_building_part=False,
        ),
        ComponentSpec(
            component="built_form_type",
            enum_cls=BuiltFormType,
            unknown=BuiltFormType.UNKNOWN,
            row_type=LandlordBuiltFormTypeOverrideRow,
            excel_header="Property Type",
            per_building_part=False,
        ),
        ComponentSpec(
            component="wall_type",
            enum_cls=WallType,
            unknown=WallType.UNKNOWN,
            row_type=LandlordWallTypeOverrideRow,
            excel_header="Walls",
            per_building_part=True,
            extra_instructions=wall_type_construction_date_prompt_hint(),
        ),
        ComponentSpec(
            component="roof_type",
            enum_cls=RoofType,
            unknown=RoofType.UNKNOWN,
            row_type=LandlordRoofTypeOverrideRow,
            excel_header="Roofs",
            per_building_part=True,
        ),
        ComponentSpec(
            component="construction_age_band",
            enum_cls=ConstructionAgeBand,
            unknown=ConstructionAgeBand.UNKNOWN,
            row_type=LandlordConstructionAgeBandOverrideRow,
            excel_header="Age",
            per_building_part=True,
        ),
        ComponentSpec(
            component="main_fuel",
            enum_cls=MainFuelType,
            unknown=MainFuelType.UNKNOWN,
            row_type=LandlordMainFuelOverrideRow,
            excel_header="Main Fuel",
            per_building_part=False,
        ),
        ComponentSpec(
            component="glazing",
            enum_cls=GlazingType,
            unknown=GlazingType.UNKNOWN,
            row_type=LandlordGlazingOverrideRow,
            excel_header="Glazing",
            per_building_part=False,
        ),
        ComponentSpec(
            component="water_heating",
            enum_cls=WaterHeatingType,
            unknown=WaterHeatingType.UNKNOWN,
            row_type=LandlordWaterHeatingOverrideRow,
            excel_header="Hot Water",
            per_building_part=False,
        ),
        ComponentSpec(
            component="main_heating_system",
            enum_cls=MainHeatingSystemType,
            unknown=MainHeatingSystemType.UNKNOWN,
            row_type=LandlordMainHeatingSystemOverrideRow,
            excel_header="Heating",
            per_building_part=False,
        ),
    ]
def _specs_by_component() -> dict[str, ComponentSpec]:
    """Index the component specs by their ``component`` name."""
    lookup: dict[str, ComponentSpec] = {}
    for spec in _component_specs():
        lookup[spec.component] = spec
    return lookup
def _norm(s: Any) -> str:
"""Vocab key normalisation — mirrors the orchestrator (strip + lower)."""
return str(s or "").strip().lower()
def _split_entries(cell: Any, per_building_part: bool) -> list[str]:
raw = "" if cell is None else str(cell)
if not raw.strip():
return []
if not per_building_part:
return [raw.strip()]
return [part.strip() for part in raw.split(",") if part.strip()]
def _load_rows(excel: str, sheet: str) -> list[dict[str, Any]]:
    """Read one sheet of the workbook and return it as a list of row dicts."""
    frame = pd.read_excel(excel, sheet_name=sheet)
    return frame.to_dict(orient="records")  # type: ignore[return-value]
def _filter_rows(rows: list[dict[str, Any]], org_ref: Optional[str],
limit: Optional[int]) -> list[dict[str, Any]]:
"""Narrow to one property (--org-ref) or the first N rows (--limit) for a
cheap smoke test before the full run."""
if org_ref:
rows = [r for r in rows if str(r.get(ORG_REF_COLUMN, "")).strip() == org_ref]
if limit:
rows = rows[:limit]
return rows
def _distinct_entries(rows: list[dict[str, Any]], spec: ComponentSpec) -> Counter[str]:
    """Count how often each distinct description occurs in the spec's column."""
    return Counter(
        entry
        for row in rows
        for entry in _split_entries(row.get(spec.excel_header), spec.per_building_part)
    )
# --------------------------------------------------------------------------- #
def list_values(_: argparse.Namespace) -> None:
    """Print, per component, the override values an edits file may use."""
    for spec in _component_specs():
        print(f"\n## {spec.component} (Excel: {spec.excel_header})")
        for value in spec.allowed_values():
            print(f" {value}")
def validate(args: argparse.Namespace) -> None:
    """Reject a hand-edited corrections CSV that uses values outside the enums.

    Rows with a blank ``corrected_value`` are ignored. Every other row must name
    a known ``component`` and a ``corrected_value`` from that component's
    allowed values; each offending row is logged with its row number.

    Raises:
        SystemExit: if at least one row is invalid.
    """
    specs = _specs_by_component()
    failures = 0
    with open(args.edits, newline="") as handle:
        # Numbering starts at 2: row 1 of the file is the CSV header.
        for line_no, record in enumerate(csv.DictReader(handle), start=2):
            corrected = (record.get("corrected_value") or "").strip()
            if not corrected:
                continue
            comp = (record.get("component") or "").strip()
            spec = specs.get(comp)
            if spec is None:
                logger.error("row %d: unknown component %r", line_no, comp)
                failures += 1
                continue
            choices = spec.allowed_values()
            if corrected in choices:
                continue
            hint = difflib.get_close_matches(corrected, choices, n=2)
            if hint:
                suffix = f" Did you mean: {hint}?"
            else:
                suffix = " Run 'list-values' for the allowed set."
            logger.error("row %d [%s]: %r is not a valid value.%s",
                         line_no, comp, corrected, suffix)
            failures += 1
    if failures:
        raise SystemExit(f"{failures} invalid corrected_value(s) — fix them before apply-edits.")
    logger.info("All corrected values are valid enum values. ✓")
def _db_session() -> Any:
    """Open a database session configured from the process environment."""
    config = PostgresConfig.from_env(os.environ)
    engine = make_engine(config)
    return make_session(engine)
def classify(args: argparse.Namespace) -> None:
    """PASS 1: classify not-yet-stored descriptions and write the review CSV.

    For each component, descriptions whose normalised form is missing from the
    portfolio's stored vocab are sent to the ChatGPT classifier and the results
    upserted (one commit per component). Every description that ends up
    UNKNOWN or without a value is written to ``UNKNOWNS_PATH``, highest row
    count first, with the allowed values alongside.
    """
    all_rows = _load_rows(args.excel, args.sheet)
    rows = _filter_rows(all_rows, args.org_ref, args.limit)
    logger.info("Classifying over %d row(s).", len(rows))
    chat_gpt = ChatGPT()
    session = _db_session()
    reader = LandlordOverrideReaderPostgresRepository(session)
    try:
        vocab = reader.load_for_portfolio(args.portfolio_id)  # {component: {desc: value}}
        unknown_rows: list[tuple[str, str, int, str]] = []
        for spec in _component_specs():
            counts = _distinct_entries(rows, spec)
            known = vocab.get(spec.component, {})  # already-classified (cache)
            to_classify = {d for d in counts if _norm(d) not in known}
            cached = len(counts) - len(to_classify)
            logger.info("%-22s %4d distinct | %4d cached | %4d to classify",
                        spec.component, len(counts), cached, len(to_classify))

            resolved: dict[str, Enum] = {}
            if to_classify:
                classifier: ChatGptColumnClassifier[Any] = ChatGptColumnClassifier(
                    chat_gpt, spec.enum_cls, spec.unknown,
                    extra_instructions=spec.extra_instructions)
                resolved = classifier.classify(to_classify)
                repo: LandlordOverridesRepository[Any] = LandlordOverridesRepository(
                    session, spec.row_type)
                with commit_scope(session):
                    # store keyed on the normalised description (matches the reader/finaliser lookup)
                    repo.upsert_all(
                        args.portfolio_id,
                        {_norm(d): m for d, m in resolved.items()},
                    )

            # collect UNKNOWNs (freshly classified + anything cached as UNKNOWN) for review
            unk = spec.unknown.value
            allowed = " | ".join(spec.allowed_values())
            for desc, n in counts.items():
                fresh = resolved.get(desc)
                # A fresh classification wins; otherwise use the stored value.
                value = fresh.value if fresh else known.get(_norm(desc))
                if value is None or value == unk:
                    unknown_rows.append((spec.component, desc, n, allowed))

        with open(UNKNOWNS_PATH, "w", newline="") as f:
            out = csv.writer(f)
            out.writerow(["component", "description", "count", "corrected_value", "allowed_values"])
            for comp, desc, n, allowed in sorted(unknown_rows, key=lambda r: (-r[2])):
                out.writerow([comp, desc, n, "", allowed])
        logger.info("\nWrote %s — fill 'corrected_value' (must match 'allowed_values'), "
                    "then: validate -> apply-edits -> write.", UNKNOWNS_PATH)
    finally:
        session.close()
def _upsert_user_corrections(session: Any, portfolio_id: int,
                             by_component: dict[str, dict[str, str]]) -> int:
    """Upsert validated human corrections as source=user (always wins on conflict).

    Args:
        session: Open DB session; the caller owns the transaction.
        portfolio_id: Portfolio the corrections belong to.
        by_component: ``{component: {description: corrected_value}}``.

    Returns:
        Number of rows sent to the database.
    """
    specs = _specs_by_component()
    stamp = datetime.now(timezone.utc)
    sent = 0
    for comp, mapping in by_component.items():
        spec = specs[comp]
        table: Table = getattr(spec.row_type, "__table__")
        payload = []
        for description, value in mapping.items():
            payload.append({
                "portfolio_id": portfolio_id,
                "description": _norm(description),
                "value": value,
                "source": OverrideSource.USER,
                "created_at": stamp,
                "updated_at": stamp,
            })
        if not payload:
            continue
        insert_stmt = pg_insert(table).values(payload)
        # On a (portfolio_id, description) clash, overwrite value and source
        # with the user correction; created_at is left untouched.
        upsert_stmt = insert_stmt.on_conflict_do_update(
            index_elements=["portfolio_id", "description"],
            set_={
                "value": insert_stmt.excluded.value,
                "source": insert_stmt.excluded.source,
                "updated_at": insert_stmt.excluded.updated_at,
            },
        )
        session.execute(upsert_stmt)
        sent += len(payload)
    return sent
def apply_edits(args: argparse.Namespace) -> None:
    """Store the corrections from a validated edits CSV as user overrides.

    The file is validated first, so nothing touches the DB when it contains an
    invalid value. Without ``--apply`` only the number of corrections is
    reported and no database session is opened.

    Raises:
        SystemExit: propagated from ``validate`` when the file is invalid.
    """
    validate(args)  # fail before touching the DB
    specs = _specs_by_component()
    by_component: dict[str, dict[str, str]] = {}
    with open(args.edits, newline="") as f:
        for r in csv.DictReader(f):
            val = (r.get("corrected_value") or "").strip()
            # Strip exactly as validate() does; otherwise a row it accepted
            # (e.g. " roof_type") would be silently skipped here.
            comp = (r.get("component") or "").strip()
            if val and comp in specs:
                by_component.setdefault(comp, {})[r["description"]] = val

    if not args.apply:
        total = sum(len(m) for m in by_component.values())
        logger.info("DRY RUN — %d user corrections ready. Re-run with --apply.", total)
        return

    # Only a real write needs a session.
    session = _db_session()
    try:
        with commit_scope(session):
            n = _upsert_user_corrections(session, args.portfolio_id, by_component)
        logger.info("Upserted %d user corrections (source=user).", n)
    finally:
        session.close()
def _org_ref_to_property_id(session: Any, portfolio_id: int) -> dict[str, int]:
    """Map each stripped ``landlord_property_id`` in the portfolio to its ``property.id``."""
    query = text("SELECT landlord_property_id, id FROM property "
                 "WHERE portfolio_id = :pid AND landlord_property_id IS NOT NULL")
    mapping: dict[str, int] = {}
    for ref, pid in session.execute(query, {"pid": portfolio_id}):
        mapping[str(ref).strip()] = int(pid)
    return mapping
def write(args: argparse.Namespace) -> None:
    """PASS 2: build ``property_overrides`` rows from the vocab and upsert them.

    Each Excel row is matched to a property by org_ref; rows with no match are
    counted as unmatched. For every component entry the stored vocab value is
    looked up; entries with no value, or with the UNKNOWN value, are counted as
    unresolved and skipped. Without ``--apply`` the first ten planned rows are
    logged and nothing is written.
    """
    rows = _filter_rows(_load_rows(args.excel, args.sheet), args.org_ref, args.limit)
    logger.info("Writing over %d row(s).", len(rows))
    session = _db_session()
    reader = LandlordOverrideReaderPostgresRepository(session)
    try:
        vocab = reader.load_for_portfolio(args.portfolio_id)
        org_ref_map = _org_ref_to_property_id(session, args.portfolio_id)
        logger.info("Portfolio %d: %d properties with org_ref.", args.portfolio_id, len(org_ref_map))

        specs = _component_specs()
        inserts: list[PropertyOverrideInsert] = []
        unmatched: Counter[str] = Counter()
        unresolved: Counter[str] = Counter()
        for row in rows:
            org_ref = str(row.get(ORG_REF_COLUMN, "")).strip()
            property_id = org_ref_map.get(org_ref)
            if property_id is None:
                unmatched[org_ref] += 1
                continue
            for spec in specs:
                comp_vocab = vocab.get(spec.component, {})
                entries = _split_entries(row.get(spec.excel_header), spec.per_building_part)
                # building_part is the entry's position in the cell, so a
                # skipped entry does not renumber the ones after it.
                for building_part, entry in enumerate(entries):
                    value = comp_vocab.get(_norm(entry))
                    if value and value != spec.unknown.value:
                        inserts.append(PropertyOverrideInsert(
                            property_id=property_id,
                            portfolio_id=args.portfolio_id,
                            building_part=building_part,
                            override_component=spec.component,
                            override_value=value,
                            original_spreadsheet_description=entry,
                        ))
                    else:
                        unresolved[f"{spec.component}: {entry}"] += 1

        logger.info("Built %d rows | %d unmatched org_refs | %d unresolved",
                    len(inserts), sum(unmatched.values()), sum(unresolved.values()))
        if unresolved:
            logger.info("Top unresolved (need apply-edits): %s", unresolved.most_common(10))

        if not args.apply:
            logger.info("DRY RUN — not writing. Re-run with --apply.")
            for planned in inserts[:10]:
                logger.info(" %s", planned)
            return

        with commit_scope(session):
            affected = PropertyOverridePostgresRepository(session).upsert_all(inserts)
        logger.info("Upserted %d property_overrides.", affected)
    finally:
        session.close()
def verify(args: argparse.Namespace) -> None:
    """For one property (by org_ref): show the persisted property_overrides rows
    and the EpcSimulation overlays they produce — the end-to-end proof that the
    chain reaches the SAP overlay surface.

    Raises:
        SystemExit: if the org_ref is not found in the portfolio.
    """
    session = _db_session()
    try:
        org_ref_map = _org_ref_to_property_id(session, args.portfolio_id)
        # Map keys are stripped, so strip the CLI value the same way.
        property_id = org_ref_map.get(args.org_ref.strip())
        if property_id is None:
            raise SystemExit(f"org_ref {args.org_ref!r} not found in portfolio {args.portfolio_id}.")
        reader = PropertyOverridesPostgresReader(lambda: session)
        resolved = reader.overrides_for(property_id)
        # Separator between the two numbers: "%d%d" printed the id and the row
        # count as one fused number.
        logger.info("property_id %d — %d property_overrides rows:", property_id, len(resolved.rows))
        for r in resolved.rows:
            logger.info(" part %d | %-22s = %s", r.building_part, r.override_component, r.override_value)
        overlays = overlays_from(resolved)
        logger.info("\n-> %d EpcSimulation overlay(s) produced (what the SAP calc applies):", len(overlays))
        for o in overlays:
            logger.info(" %s", o)
    finally:
        session.close()
def main() -> None:
    """Parse the command line and dispatch to the chosen subcommand."""
    parser = argparse.ArgumentParser(description=__doc__)
    commands = parser.add_subparsers(dest="cmd", required=True)

    commands.add_parser("list-values").set_defaults(func=list_values)

    validate_cmd = commands.add_parser("validate")
    validate_cmd.add_argument("--edits", required=True)
    validate_cmd.set_defaults(func=validate)

    classify_cmd = commands.add_parser("classify")
    _add_excel_args(classify_cmd)
    classify_cmd.set_defaults(func=classify)

    edits_cmd = commands.add_parser("apply-edits")
    edits_cmd.add_argument("--edits", required=True)
    edits_cmd.add_argument("--portfolio-id", type=int, required=True)
    edits_cmd.add_argument("--apply", action="store_true")
    edits_cmd.set_defaults(func=apply_edits)

    write_cmd = commands.add_parser("write")
    _add_excel_args(write_cmd)
    write_cmd.add_argument("--apply", action="store_true")
    write_cmd.set_defaults(func=write)

    verify_cmd = commands.add_parser("verify")
    verify_cmd.add_argument("--portfolio-id", type=int, required=True)
    verify_cmd.add_argument("--org-ref", required=True)
    verify_cmd.set_defaults(func=verify)

    args = parser.parse_args()
    args.func(args)


def _add_excel_args(parser: argparse.ArgumentParser) -> None:
    """Attach the Excel-input options shared by ``classify`` and ``write``."""
    parser.add_argument("--excel", required=True)
    parser.add_argument("--sheet", default="AddressProfilingResults")
    parser.add_argument("--portfolio-id", type=int, required=True)
    parser.add_argument("--org-ref", default=None, help="smoke test: only this property's org_ref")
    parser.add_argument("--limit", type=int, default=None, help="smoke test: first N rows")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,53 @@
# Hyde portfolio 796 — UNKNOWN overrides for review
After ChatGPT classification, **19 distinct descriptions** did not auto-resolve (out of ~440 distinct across all components). Grouped below with a **proposed value** (must be one of the allowed enum values) + the row count it affects. Nothing is written to the DB until these are confirmed.
## 1. construction_age_band — 29,829 rows (DETERMINISTIC, no judgement)
The classifier didn't extract the band letter in batch, but the band IS the leading letter, so these are mapped mechanically (`"D: 1950-1966"` → `D`). Just confirm the approach.
| description | → band | rows |
|---|---|---|
| D: 1950-1966 | `D` | 4,978 |
| K: 2007-2011 | `K` | 4,201 |
| I: 1996-2002 | `I` | 3,708 |
| B: 1900-1929 | `B` | 3,222 |
| H: 1991-1995 | `H` | 2,747 |
| E: 1967-1975 | `E` | 2,479 |
| J: 2003-2006 | `J` | 2,221 |
| F: 1976-1982 | `F` | 2,071 |
| C: 1930-1949 | `C` | 1,840 |
| G: 1983-1990 | `G` | 1,615 |
| A: pre-1900 | `A` | 615 |
| M: 2023 onwards | `M` | 132 |
## 2. roof_type (flat roofs) — 1,473 rows (NEEDS KHALIM'S CALL)
Flat-roof insulation drives the SAP roof U-value. **`Flat: As Built` (1,172) + `Flat: Unknown` (194) are the load-bearing decision** — proposed conservatively as *no insulation (assumed)*.
| description | proposed value | rows | alt options |
|---|---|---|---|
| Flat: As Built | `Flat, no insulation (assumed)` | 1,172 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
| Flat: Unknown | `Flat, no insulation (assumed)` | 194 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
| Flat: 150mm | `Flat, insulated` | 59 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
| Flat: 100mm | `Flat, insulated` | 32 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
| Flat: 50mm | `Flat, limited insulation` | 13 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
| SameDwellingAbove | `(same dwelling above)` | 3 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
## 3. wall_type — 7 rows
| description | proposed value | rows |
|---|---|---|
| TimberFrame: Internal | `Timber frame, with additional insulation` | 7 |
## How to apply after review
Edit the `corrected_value` column of `overrides_edits.csv`, then:
```
python scripts/hyde/build_property_overrides.py validate --edits overrides_edits.csv
python scripts/hyde/build_property_overrides.py apply-edits --edits overrides_edits.csv --portfolio-id 796 --apply
python scripts/hyde/build_property_overrides.py write --excel scripts/hyde/hyde_property_overrides.xlsx --portfolio-id 796 --apply
```
> Note: a proper fix for the age classifier (a prompt hint so the production lambda extracts the band letter) is a separate follow-up; these script edits handle this run.

View file

@ -0,0 +1,159 @@
"""Tally the EPC schema versions across the hyde list (manipulation_filled UPRNs).
For every resolved UPRN we look up its EPC certificate's ``schemaType`` (e.g.
``RdSAP-Schema-21.0.1``, ``RdSAP-Schema-17.1``, ``SAP-Schema-16.2``). The
gov EPC ``/api/domestic/search`` endpoint returns ``schemaType`` per row, so one
search-per-postcode covers every UPRN in that postcode — far cheaper than a
certificate fetch per UPRN. The latest cert (max registrationDate) wins per UPRN.
Outputs: a per-schema-version tally with one example UPRN each, plus a CSV
mapping every UPRN -> schema version.
python -m scripts.hyde_epc_schema_versions
python -m scripts.hyde_epc_schema_versions --workers 8 --out scripts/hyde_schema_versions.csv
Reads OPEN_EPC_API_TOKEN from backend/.env. Run from the worktree root.
"""
from __future__ import annotations
import argparse
import os
import sys
import time
from collections import Counter, defaultdict
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any, Optional
import httpx
from dotenv import load_dotenv
# Directory one level above this file's folder (parents[1]); used as the base
# for the default input/output paths and for locating backend/.env.
_REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(_REPO_ROOT))  # worktree root first — avoid the import trap
# These imports must follow the sys.path change above, hence the E402 waivers.
from scripts.fill_domna_addresses import clean_postcode  # noqa: E402
from scripts.finalise_to_property_table import load_rows  # noqa: E402
# EPC API host and the per-postcode search endpoint queried by search_postcode().
_BASE = "https://api.get-energy-performance-data.communities.gov.uk"
_SEARCH = f"{_BASE}/api/domestic/search"
# Schema label given to a UPRN that the EPC search returned no schema for.
NOT_IN_EPC = "NOT_IN_EPC"
# Defaults for --in and --out.
_DEFAULT_IN = _REPO_ROOT / "scripts" / "manipulation_filled.xlsx"
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "hyde_schema_versions.csv"
def search_postcode(
    client: httpx.Client, postcode: str, headers: dict[str, str]
) -> list[dict[str, Any]]:
    """Return the search rows for a postcode, retrying on rate-limit (429).

    Args:
        client: HTTP client used for the request.
        postcode: Postcode passed as the ``postcode`` query parameter.
        headers: Request headers (authorisation etc.).

    Returns:
        The response's ``data`` list; ``[]`` for a 400/404, a missing or null
        ``data`` field, or when all five attempts were rate-limited.

    Raises:
        httpx.HTTPStatusError: for any other non-success status.
    """
    for attempt in range(5):
        resp = client.get(_SEARCH, params={"postcode": postcode}, headers=headers, timeout=30)
        if resp.status_code == 429:
            # Retry-After may be a number of seconds or an HTTP-date; only the
            # numeric form is parsed, anything else falls back to 2 seconds.
            try:
                retry_after = float(resp.headers.get("Retry-After", "2"))
            except ValueError:
                retry_after = 2.0
            # Clamp to [0, 10] so a negative or NaN header cannot break sleep().
            time.sleep(max(0.0, min(retry_after, 10)) * (attempt + 1))
            continue
        # 400 = malformed postcode (data-entry typo), 404 = no certs — skip both.
        if resp.status_code in (400, 404):
            return []
        resp.raise_for_status()
        return resp.json().get("data") or []
    # Still best-effort, but say so: otherwise this postcode's UPRNs look
    # exactly like "no certificate found".
    print(f"  gave up on postcode {postcode!r} after 5 rate-limited attempts", file=sys.stderr)
    return []
def build_uprn_schema_map(
    postcodes: list[str], token: str, workers: int
) -> dict[int, tuple[str, str]]:
    """Map UPRN -> (schemaType, registrationDate) for the latest cert per UPRN.

    One search per postcode (concurrent); later we look our UPRNs up in here.

    Args:
        postcodes: Postcodes to search, one request each.
        token: Bearer token for the EPC API.
        workers: Size of the thread pool.

    Returns:
        For each UPRN seen, the schema and registration date of the row with
        the greatest ``registrationDate`` (compared as strings). Rows without
        a schema, or whose UPRN is missing or not an integer, are skipped.
    """
    headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
    by_uprn: dict[int, tuple[str, str]] = {}
    total = len(postcodes)

    def fetch(pc: str) -> list[dict[str, Any]]:
        with httpx.Client() as client:
            return search_postcode(client, pc, headers)

    with ThreadPoolExecutor(max_workers=workers) as pool:
        for done, rows in enumerate(pool.map(fetch, postcodes), start=1):
            for row in rows:
                schema = row.get("schemaType")
                reg = row.get("registrationDate") or ""
                if not schema:
                    continue
                try:
                    uprn = int(row.get("uprn"))
                except (TypeError, ValueError):
                    # Missing, blank or non-numeric UPRN: skip the row rather
                    # than abort the whole run.
                    continue
                prev = by_uprn.get(uprn)
                # Keep the latest-registered cert's schema for this UPRN.
                if prev is None or reg > prev[1]:
                    by_uprn[uprn] = (str(schema), str(reg))
            if done % 250 == 0:
                print(f" searched {done}/{total} postcodes, {len(by_uprn)} uprns seen")
    return by_uprn
def _parse_args() -> argparse.Namespace:
    """Parse ``--in``, ``--out`` and ``--workers`` from the command line."""
    cli = argparse.ArgumentParser(description=__doc__)
    options = (
        (("--in",), {"dest": "inp", "type": Path, "default": _DEFAULT_IN}),
        (("--out",), {"type": Path, "default": _DEFAULT_OUT}),
        (("--workers",), {"type": int, "default": 8}),
    )
    for flags, settings in options:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
def main() -> int:
    """Tally EPC schema versions for the resolved UPRNs and write the mapping CSV.

    Returns:
        0 on success, 2 when ``OPEN_EPC_API_TOKEN`` is not set.
    """
    import csv  # local import, as in the original function

    args = _parse_args()
    load_dotenv(_REPO_ROOT / "backend" / ".env")
    token = os.environ.get("OPEN_EPC_API_TOKEN")
    if not token:
        print("OPEN_EPC_API_TOKEN not set (backend/.env)")
        return 2

    _, rows = load_rows(args.inp, include_unmatched=False)
    # (uprn, postcode_clean, address) for every row that carries a UPRN
    pairs: list[tuple[int, str, str]] = [
        (int(r["address2uprn_uprn"]), clean_postcode(r["postcode"]), r["address2uprn_address"])
        for r in rows
        if r["address2uprn_uprn"]
    ]
    postcodes = sorted({pc for _, pc, _ in pairs if pc})
    print(f"{len(pairs)} UPRNs across {len(postcodes)} unique postcodes")

    by_uprn = build_uprn_schema_map(postcodes, token, args.workers)
    print(f"EPC search returned schema for {len(by_uprn)} distinct UPRNs")

    # Resolve each hyde UPRN to its schema version (first occurrence wins).
    tally: Counter[str] = Counter()
    example: dict[str, tuple[int, str]] = {}
    out_lines: list[tuple[int, str, str, str]] = []  # uprn, schema, postcode, address
    seen: set[int] = set()
    for uprn, pc, address in pairs:
        if uprn not in seen:
            seen.add(uprn)
            schema, _reg = by_uprn.get(uprn, (NOT_IN_EPC, ""))
            tally[schema] += 1
            if schema not in example:
                example[schema] = (uprn, address)
            out_lines.append((uprn, schema, pc, address))

    # Write the full per-UPRN mapping.
    with args.out.open("w", newline="", encoding="utf-8") as fh:
        sheet = csv.writer(fh)
        sheet.writerow(["uprn", "schema_version", "postcode", "matched_address"])
        sheet.writerows(out_lines)

    print(f"\nSchema versions across {len(seen)} distinct UPRNs:\n")
    print(f" {'schema version':<26} {'count':>7} example UPRN")
    print(f" {'-'*26} {'-'*7} {'-'*12}")
    for schema, count in tally.most_common():
        ex_uprn, ex_addr = example[schema]
        print(f" {schema:<26} {count:>7} {ex_uprn} ({ex_addr})")
    print(f"\nFull mapping -> {args.out}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,169 @@
"""Compare our step-1 UPRN resolution against the old "Ara output" data.
The Ara data lives in scripts/lisasrequest/Durkan data.xlsx, sheet "Ara output",
and carries UPRNs from our previous dataset. It is NOT treated as ground truth —
this just lines it up against what we found / didn't find so a human can eyeball
the differences. (We read the xlsx, not the CSV export: the CSV mangled half the
UPRNs to Excel scientific notation, e.g. ``1.00023E+11``; the xlsx keeps them
intact, so every comparison below is exact.)
Join key is (postcode, leading number, first street word), since the UPRN is the
thing under comparison and Ara's address strings differ from the landlord input.
Each of our rows lands in one comparison bucket:
match both found a UPRN and they are equal.
differ both found a UPRN and they differ.
we_only we resolved a UPRN, Ara had none for this address.
ara_only we did NOT resolve, but Ara had a UPRN <- recovery candidates.
both_missing neither resolved a UPRN.
no_ara_record the Ara sheet had no row matching this address at all.
python -m scripts.lisasrequest.compare_to_ara
"""
from __future__ import annotations
import argparse
import csv
import re
import sys
from collections import Counter, OrderedDict
from pathlib import Path
from typing import Optional
import pandas as pd
# Directory two levels above this file's folder (parents[2]); base for the
# default input/output paths below.
_REPO_ROOT = Path(__file__).resolve().parents[2]
# Columns read from our own CSV rows in compare().
ADDRESS_COL = "address"
POSTCODE_COL = "postcode"
OUR_UPRN_COL = "domna_address_uprn"
OUR_SOURCE_COL = "domna_source"
# Columns read from the Ara sheet rows.
ARA_UPRN_COL = "EPC_B.uprn"
ARA_ADDRESS_COL = "EPC_B.address"
ARA_POSTCODE_COL = "EPC_B.postcode"
# Worksheet of the Ara workbook that load_ara() reads.
ARA_SHEET = "Ara output"
# Defaults for --ours, --ara and --out.
_OUR_IN = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_domna_filled.csv"
_ARA_IN = _REPO_ROOT / "scripts" / "lisasrequest" / "Durkan data.xlsx"
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_vs_ara.csv"
# Join key: (postcode without spaces, leading number, first street word).
Key = tuple[str, str, str]
def norm_key(address: str, postcode: str) -> Key:
    """Build the join key for an address.

    The key is (upper-cased postcode with spaces removed, first digit run with
    an optional trailing letter, first alphabetic run that is not ``FLAT``).
    A part with no match is the empty string.
    """
    shouted = address.upper()
    number_match = re.search(r"\d+[A-Z]?", shouted)
    leading_number = number_match.group(0) if number_match else ""
    first_word = next(
        (word for word in re.findall(r"[A-Z]+", shouted) if word != "FLAT"),
        "",
    )
    return (postcode.upper().replace(" ", ""), leading_number, first_word)
def load_ara(path: Path) -> tuple[dict[Key, dict[str, str]], int]:
    """Index the Ara-output xlsx sheet by join key (first row wins).

    Returns (index, duplicates). Read as strings so UPRNs keep their full value.
    Rows with a blank Ara address are skipped; a row whose key is already
    indexed only increments the duplicate count.
    """
    frame = pd.read_excel(path, sheet_name=ARA_SHEET, dtype=str)
    index: dict[Key, dict[str, str]] = OrderedDict()
    duplicates = 0
    for record in frame.fillna("").to_dict(orient="records"):
        address = str(record.get(ARA_ADDRESS_COL) or "").strip()
        if not address:
            continue
        # Prefer the Ara postcode column; fall back to the plain postcode column.
        raw_postcode = record.get(ARA_POSTCODE_COL) or record.get(POSTCODE_COL) or ""
        key = norm_key(address, str(raw_postcode).strip())
        if key not in index:
            index[key] = record
        else:
            duplicates += 1
    return index, duplicates
def classify(
    our_uprn: str, our_found: bool, ara: Optional[dict[str, str]]
) -> tuple[str, str, str]:
    """Return (comparison, ara_uprn, ara_address) for one of our rows.

    ``ara`` is the matching Ara row, or ``None`` when the sheet has no row for
    this address (bucket ``no_ara_record``). Otherwise the bucket depends on
    which side has a UPRN and, when both do, on whether the two are equal.
    """
    if ara is None:
        return ("no_ara_record", "", "")

    ara_uprn = (ara.get(ARA_UPRN_COL) or "").strip()
    ara_address = (ara.get(ARA_ADDRESS_COL) or "").strip()

    if our_found:
        if not ara_uprn:
            bucket = "we_only"
        elif our_uprn == ara_uprn:
            bucket = "match"
        else:
            bucket = "differ"
    else:
        bucket = "ara_only" if ara_uprn else "both_missing"
    return (bucket, ara_uprn, ara_address)
def compare(
    our_rows: list[dict[str, str]], ara_index: dict[Key, dict[str, str]]
) -> list[dict[str, str]]:
    """Produce one comparison record per row of ours, in input order.

    A row of ours counts as "found" only when it has a UPRN and its source is
    not "not_found". The Ara row is looked up via ``norm_key`` on our address
    and postcode.
    """

    def _compare_one(row: dict[str, str]) -> dict[str, str]:
        address = (row.get(ADDRESS_COL) or "").strip()
        postcode = (row.get(POSTCODE_COL) or "").strip()
        our_uprn = (row.get(OUR_UPRN_COL) or "").strip()
        our_source = (row.get(OUR_SOURCE_COL) or "").strip()
        our_found = bool(our_uprn) and our_source != "not_found"
        ara_row = ara_index.get(norm_key(address, postcode))
        comparison, ara_uprn, ara_address = classify(our_uprn, our_found, ara_row)
        return {
            "address": address,
            "postcode": postcode,
            "our_uprn": our_uprn,
            "our_source": our_source,
            "ara_uprn": ara_uprn,
            "ara_address": ara_address,
            "comparison": comparison,
        }

    return [_compare_one(row) for row in our_rows]
def main() -> int:
    """Compare our resolved UPRNs with the Ara output and write the result CSV.

    Reads our rows from ``--ours`` (CSV) and the Ara index from ``--ara``
    (xlsx), writes one comparison row per input row to ``--out``, and prints a
    tally per comparison outcome.

    Returns:
        0, used as the process exit status.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--ours", type=Path, default=_OUR_IN)
    parser.add_argument("--ara", type=Path, default=_ARA_IN)
    parser.add_argument("--out", type=Path, default=_DEFAULT_OUT)
    args = parser.parse_args()

    with args.ours.open(newline="", encoding="utf-8-sig") as fh:
        our_rows = [dict(r) for r in csv.DictReader(fh)]
    ara_index, dupes = load_ara(args.ara)
    print(f"Loaded {len(our_rows)} of our rows; {len(ara_index)} Ara keys "
          f"({dupes} duplicate Ara rows ignored).")

    result = compare(our_rows, ara_index)
    # Spelled out rather than taken from result[0]: an input CSV with no data
    # rows used to raise IndexError here. Must mirror the keys compare() emits;
    # DictWriter raises on an unexpected key, so drift fails loudly.
    fieldnames = [
        "address",
        "postcode",
        "our_uprn",
        "our_source",
        "ara_uprn",
        "ara_address",
        "comparison",
    ]
    with args.out.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(result)

    counts = Counter(r["comparison"] for r in result)
    print(f"\nComparison of {len(result)} rows -> {args.out}")
    for name in (
        "match",
        "differ",
        "we_only",
        "ara_only",
        "both_missing",
        "no_ara_record",
    ):
        print(f" {name}: {counts.get(name, 0)}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,142 @@
"""EPC SAP-schema check for portfolio 805, and whether each is mapper-supported.
For every UPRN currently in the ``property`` table for portfolio 805, look up its
latest EPC certificate's ``schemaType`` (one /api/domestic/search per postcode,
reusing scripts.hyde_epc_schema_versions) and check it against the schemas the
EpcPropertyData mapper actually handles
(``EpcPropertyDataMapper.from_api_response``, datatypes/epc/domain/mapper.py).
Prints a per-schema tally with a supported? flag and an example UPRN, and writes
the full per-UPRN mapping to durkan_805_schema_check.csv.
python -m scripts.lisasrequest.durkan_805_schema_check
python -m scripts.lisasrequest.durkan_805_schema_check --portfolio 805 --workers 8
Reads OPEN_EPC_API_TOKEN from backend/.env and POSTGRES_* from the root .env.
Run from the worktree root.
"""
from __future__ import annotations
import argparse
import csv
import os
import sys
from collections import Counter
from pathlib import Path
from dotenv import load_dotenv
from sqlmodel import select
_REPO_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
from infrastructure.postgres.config import PostgresConfig # noqa: E402
from infrastructure.postgres.engine import make_engine, make_session # noqa: E402
from infrastructure.postgres.property_table import PropertyRow # noqa: E402
from scripts.fill_domna_addresses import clean_postcode # noqa: E402
from scripts.hyde_epc_schema_versions import ( # noqa: E402
NOT_IN_EPC,
build_uprn_schema_map,
)
# Schemas EpcPropertyDataMapper.from_api_response dispatches on (everything else
# raises "Unsupported EPC schema"). Keep in sync with mapper.py:2539-2603.
# Used in this script only for membership tests against a looked-up schemaType.
SUPPORTED_SCHEMAS = frozenset(
    {
        "RdSAP-Schema-17.0",
        "RdSAP-Schema-17.1",
        "RdSAP-Schema-18.0",
        "RdSAP-Schema-19.0",
        "RdSAP-Schema-20.0.0",
        "RdSAP-Schema-21.0.0",
        "RdSAP-Schema-21.0.1",
        "SAP-Schema-16.0",
        "SAP-Schema-16.2",
        "SAP-Schema-16.3",
        "SAP-Schema-17.0",
        "SAP-Schema-17.1",
        "SAP-Schema-18.0.0",
    }
)
# Default destination of the per-UPRN CSV written by main().
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_805_schema_check.csv"
def load_portfolio_uprns(portfolio_id: int) -> list[tuple[int, str]]:
    """Fetch (uprn, postcode) for each property row in the portfolio that has a UPRN.

    Loads the root ``.env`` first, then opens a session from the environment's
    Postgres settings. Rows whose UPRN is NULL are left out; a NULL postcode
    becomes "". The session is always closed.
    """
    load_dotenv(_REPO_ROOT / ".env")
    session = make_session(make_engine(PostgresConfig.from_env(os.environ)))
    try:
        query = select(PropertyRow.uprn, PropertyRow.postcode).where(
            PropertyRow.portfolio_id == portfolio_id
        )
        return [
            (int(uprn), str(postcode or ""))
            for uprn, postcode in session.exec(query).all()
            if uprn is not None
        ]
    finally:
        session.close()
def main() -> int:
    """Tally the EPC schema of every UPRN in a portfolio and flag mapper support.

    Writes one CSV row per distinct UPRN (uprn, schema_version,
    mapper_supported, postcode) to ``--out`` and prints a per-schema table,
    most frequent schema first.

    Returns:
        2 when OPEN_EPC_API_TOKEN is not set, otherwise 0.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--portfolio", type=int, default=805)
    cli.add_argument("--out", type=Path, default=_DEFAULT_OUT)
    cli.add_argument("--workers", type=int, default=8)
    args = cli.parse_args()

    load_dotenv(_REPO_ROOT / "backend" / ".env")
    token = os.environ.get("OPEN_EPC_API_TOKEN")
    if not token:
        print("OPEN_EPC_API_TOKEN not set (backend/.env)")
        return 2

    pairs = load_portfolio_uprns(args.portfolio)
    postcodes = sorted({clean_postcode(pc) for _, pc in pairs if pc})
    print(
        f"Portfolio {args.portfolio}: {len(pairs)} UPRNs across "
        f"{len(postcodes)} unique postcodes"
    )
    by_uprn = build_uprn_schema_map(postcodes, token, args.workers)
    print(f"EPC search returned a schema for {len(by_uprn)} distinct UPRNs")

    tally: Counter[str] = Counter()
    example: dict[str, int] = {}
    rows_out: list[tuple[int, str, str, str]] = []  # uprn, schema, supported, postcode
    seen: set[int] = set()
    for uprn, pc in pairs:
        # A UPRN can appear on several property rows; report it once.
        if uprn in seen:
            continue
        seen.add(uprn)
        schema = by_uprn.get(uprn, (NOT_IN_EPC, ""))[0]
        tally.update([schema])
        if schema not in example:
            example[schema] = uprn
        flag = "yes" if schema in SUPPORTED_SCHEMAS else "no"
        rows_out.append((uprn, schema, flag, clean_postcode(pc)))

    with args.out.open("w", newline="", encoding="utf-8") as fh:
        out = csv.writer(fh)
        out.writerow(["uprn", "schema_version", "mapper_supported", "postcode"])
        out.writerows(rows_out)

    supported_count = sum(
        count for schema, count in tally.items() if schema in SUPPORTED_SCHEMAS
    )
    print(f"\nSchema versions across {len(seen)} distinct UPRNs in portfolio "
          f"{args.portfolio}:\n")
    print(f" {'schema version':<26} {'count':>5} {'supported?':<10} example UPRN")
    print(f" {'-' * 26} {'-' * 5} {'-' * 10} {'-' * 12}")
    for schema, count in tally.most_common():
        # Upper-case "NO" in the console table so unsupported schemas stand out.
        supported = "yes" if schema in SUPPORTED_SCHEMAS else "NO"
        print(f" {schema:<26} {count:>5} {supported:<10} {example[schema]}")
    print(
        f"\nMapper-supported: {supported_count}/{len(seen)} UPRNs. "
        f"Full mapping -> {args.out}"
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,200 @@
"""Step 1 (Durkan portfolio): resolve a UPRN per CSV row via EPC then OS.
Input: scripts/lisasrequest/260611_Sample_Seed_Portfolio_Durkan_split_addresses(Split Addresses).csv
columns include ``address`` and ``postcode``.
Every row carries an address and none carry a UPRN, so there is a single case:
* resolve a UPRN from ``address`` + ``postcode`` via the EPC API (relaxed
address variants, threshold 0.7), then Ordnance Survey Places as a fallback
(threshold 0.6).
* not resolvable -> domna_source = "not_found"; uprn/address/score left empty.
Writes a NEW CSV = every original column, in order, plus four DOMNA columns:
domna_address_found the canonical address EPC/OS returned (matched string)
domna_address_uprn the resolved UPRN ("" when unresolved)
domna_lexiscore the match score in [0, 1] ("" when unresolved)
domna_source epc / ordnance_survey / not_found
This is the human-review file; step 2 (resolve_uprns_for_finaliser) reshapes it
into the finaliser columns without re-hitting the APIs.
python -m scripts.lisasrequest.fill_domna_address
python -m scripts.lisasrequest.fill_domna_address --limit 20 # smoke test
Resolution reuses the relaxed matchers from scripts.fill_domna_addresses. Keys
come from backend/.env (OPEN_EPC_API_TOKEN, ORDNANCE_SURVEY_API_KEY). Run from
the worktree root (import trap).
"""
from __future__ import annotations
import argparse
import csv
import sys
from pathlib import Path
from typing import Optional
import pandas as pd
_REPO_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
from scripts.fill_domna_addresses import ( # noqa: E402
Hit,
resolve_epc_relaxed,
resolve_os_relaxed,
)
from scripts.resolve_uprns_for_finaliser import clean_postcode, load_keys # noqa: E402
# Input columns read from each source row.
ADDRESS_COL = "address"
POSTCODE_COL = "postcode"
# Result columns that fill() sets on every row.
FOUND_ADDRESS_COL = "domna_address_found"
FOUND_UPRN_COL = "domna_address_uprn"
LEXISCORE_COL = "domna_lexiscore"
SOURCE_COL = "domna_source"
# domna_source value written when no UPRN could be resolved.
NOT_FOUND = "not_found"
# Order in which write_rows() appends the result columns after the input header.
_RESULT_COLS = (FOUND_ADDRESS_COL, FOUND_UPRN_COL, LEXISCORE_COL, SOURCE_COL)
# Default input and output CSVs, both under scripts/lisasrequest/.
_CSV_NAME = "260611_Sample_Seed_Portfolio_Durkan_split_addresses(Split Addresses).csv"
_DEFAULT_IN = _REPO_ROOT / "scripts" / "lisasrequest" / _CSV_NAME
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_domna_filled.csv"
def read_rows(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Load a CSV file as (rows, fieldnames), keeping the header's column order.

    The file is decoded as utf-8-sig, so a leading BOM is dropped.
    """
    with path.open(newline="", encoding="utf-8-sig") as handle:
        reader = csv.DictReader(handle)
        header = list(reader.fieldnames or [])
        records = list(map(dict, reader))
    return records, header
def resolve_one(
    address: str,
    postcode_raw: str,
    *,
    epc_token: Optional[str],
    os_api_key: Optional[str],
    epc_cache: dict[str, pd.DataFrame],
    os_cache: dict[str, pd.DataFrame],
) -> Optional[Hit]:
    """Resolve a single address: relaxed EPC match first, OS Places second.

    Returns None when the address or the cleaned postcode is empty, or when no
    enabled source yields a hit. A source is enabled only when its key is set.
    A failing lookup is reported on stdout and treated as "no hit" so one bad
    row cannot abort the run.
    """
    postcode_clean = clean_postcode(postcode_raw)
    if not (address and postcode_clean):
        return None

    found: Optional[Hit] = None
    if epc_token:
        try:
            found = resolve_epc_relaxed(address, postcode_clean, epc_cache)
        except Exception as exc:  # best-effort: log and fall through to OS
            print(f" EPC failed {address!r} / {postcode_clean}: {exc}")

    if found is not None or not os_api_key:
        return found

    try:
        return resolve_os_relaxed(address, postcode_clean, os_api_key, os_cache)
    except Exception as exc:  # best-effort: log and report no match
        print(f" OS failed {address!r} / {postcode_clean}: {exc}")
        return None
def fill(
    rows: list[dict[str, str]],
    *,
    epc_token: Optional[str],
    os_api_key: Optional[str],
) -> tuple[int, int, int]:
    """Set the four DOMNA columns on every row, mutating the rows in place.

    One EPC cache and one OS cache are shared across all rows. A progress line
    is printed per row.

    Returns:
        (epc_hits, os_hits, not_found). Any hit whose source is not "epc" is
        counted as an OS hit.
    """
    epc_cache: dict[str, pd.DataFrame] = {}
    os_cache: dict[str, pd.DataFrame] = {}
    counts = {"epc": 0, "os": 0, "missing": 0}
    total = len(rows)

    for position, row in enumerate(rows, start=1):
        address = str(row.get(ADDRESS_COL, "") or "").strip()
        postcode_raw = str(row.get(POSTCODE_COL, "") or "").strip()
        hit = resolve_one(
            address,
            postcode_raw,
            epc_token=epc_token,
            os_api_key=os_api_key,
            epc_cache=epc_cache,
            os_cache=os_cache,
        )
        if hit is not None:
            uprn, matched, score, source = hit
            row.update(
                {
                    FOUND_ADDRESS_COL: matched,
                    FOUND_UPRN_COL: uprn,
                    LEXISCORE_COL: str(round(score, 4)),
                    SOURCE_COL: source,
                }
            )
            counts["epc" if source == "epc" else "os"] += 1
        else:
            row.update(
                {
                    FOUND_ADDRESS_COL: "",
                    FOUND_UPRN_COL: "",
                    LEXISCORE_COL: "",
                    SOURCE_COL: NOT_FOUND,
                }
            )
            counts["missing"] += 1
        print(
            f"[{position}/{total}] {address!r} -> "
            f"{row[FOUND_UPRN_COL] or '(no match)'} ({row[SOURCE_COL]})"
        )

    return counts["epc"], counts["os"], counts["missing"]
def write_rows(rows: list[dict[str, str]], path: Path, fieldnames: list[str]) -> None:
    """Write ``rows`` as CSV: the input columns first, then any DOMNA column not already present.

    Keys on a row that are not in the resulting header are ignored.
    """
    header = list(fieldnames) + [
        col for col in _RESULT_COLS if col not in fieldnames
    ]
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=header, extrasaction="ignore")
        out.writeheader()
        out.writerows(rows)
def _parse_args() -> argparse.Namespace:
    """Parse the CLI: ``--in`` (stored as ``inp``), ``--out`` and an optional ``--limit``."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    cli.add_argument("--out", type=Path, default=_DEFAULT_OUT)
    cli.add_argument("--limit", type=int, default=None, help="process first N rows")
    namespace = cli.parse_args()
    return namespace
def main() -> int:
    """Run step 1: read the input CSV, resolve every row, write the filled CSV.

    A missing API key only disables that source (with a notice); the run still
    proceeds. ``--limit`` truncates the input to its first N rows.

    Returns:
        0, used as the process exit status.
    """
    args = _parse_args()
    epc_token, os_api_key = load_keys()
    notices = (
        (epc_token, "OPEN_EPC_API_TOKEN not set (backend/.env) — EPC resolution disabled"),
        (os_api_key, "ORDNANCE_SURVEY_API_KEY not set (backend/.env) — OS fallback disabled"),
    )
    for key, notice in notices:
        if not key:
            print(notice)

    rows, fieldnames = read_rows(args.inp)
    if args.limit is not None:
        rows = rows[: args.limit]
    print(f"Loaded {len(rows)} rows from {args.inp}")

    epc_hits, os_hits, not_found = fill(
        rows, epc_token=epc_token, os_api_key=os_api_key
    )
    write_rows(rows, args.out, fieldnames)

    resolved = epc_hits + os_hits
    print(
        f"\nResolved {resolved}/{len(rows)} "
        f"(epc={epc_hits}, ordnance_survey={os_hits}); {not_found} not found."
    )
    print(f"Wrote filled CSV -> {args.out}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,111 @@
"""Step 3 (Durkan portfolio): insert the reshaped rows into the ``property`` table.
Reads durkan_finaliser_input.csv (step 2) and, per row, maps it with the real
finaliser mapper (``BulkUploadFinaliserOrchestrator._row_to_insert``) and inserts
via the same ``PropertyPostgresRepository.insert_all`` the Lambda uses so a row
written here is identical to one the production finaliser would write. Insert is
ON CONFLICT (portfolio_id, uprn) DO NOTHING, so re-running is safe.
DRY RUN BY DEFAULT: it dedupes, reports, and writes the collisions file but does
NOT touch the database. Add --commit to actually insert.
# preview only (no DB writes): dedupe + mapping report
python -m scripts.lisasrequest.finalise_to_property_table --portfolio 805
# actually insert
python -m scripts.lisasrequest.finalise_to_property_table --portfolio 805 --commit
Postgres target comes from the root .env (POSTGRES_*). Run from the worktree root.
"""
from __future__ import annotations
import argparse
import csv
import sys
from pathlib import Path
_REPO_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
from scripts.finalise_to_property_table import ( # noqa: E402
dedupe_by_uprn,
insert_rows,
)
# Default finaliser-input CSV and the file that receives dropped duplicate-UPRN rows.
_DEFAULT_IN = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_finaliser_input.csv"
_DEFAULT_COLLISIONS = (
    _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_finaliser_collisions.csv"
)
# Finaliser-input column names shown by _preview().
UPRN_COL = "address2uprn_uprn"
MATCHED_ADDRESS_COL = "address2uprn_address"
POSTCODE_COL = "postcode"
LEXISCORE_COL = "address2uprn_lexiscore"
def read_rows(path: Path) -> list[dict[str, str]]:
    """Load every data row of a CSV as a plain dict (utf-8-sig, so a BOM is dropped)."""
    with path.open(newline="", encoding="utf-8-sig") as handle:
        records = list(map(dict, csv.DictReader(handle)))
    return records
def _preview(rows: list[dict[str, str]]) -> None:
"""Show the first few rows as they will be inserted (no DB, no mapper call).
The finalise step applies the standard finaliser mapper
(BulkUploadFinaliserOrchestrator) on insert; the fields below are its inputs.
"""
print("\nSample rows to insert (uprn | matched address | postcode | lexiscore):")
for row in rows[:3]:
print(
f" {row.get(UPRN_COL)} | {row.get(MATCHED_ADDRESS_COL)!r} | "
f"{row.get(POSTCODE_COL)!r} | {row.get(LEXISCORE_COL)}"
)
def main() -> int:
    """Run step 3: dedupe the finaliser rows, preview them, insert only with --commit.

    Duplicate-UPRN rows removed by ``dedupe_by_uprn`` are written to the
    collisions CSV. Without ``--commit`` nothing is inserted.

    Returns:
        0, used as the process exit status.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    cli.add_argument("--portfolio", type=int, required=True)
    cli.add_argument(
        "--commit",
        action="store_true",
        help="actually insert into property (default is a dry-run preview)",
    )
    cli.add_argument("--collisions", type=Path, default=_DEFAULT_COLLISIONS)
    args = cli.parse_args()

    rows = read_rows(args.inp)
    print(f"Loaded {len(rows)} finaliser rows from {args.inp}")

    kept, dropped = dedupe_by_uprn(rows)
    if not dropped:
        print(f"No duplicate-UPRN collisions; {len(kept)} unique rows to insert.")
    else:
        with args.collisions.open("w", newline="", encoding="utf-8") as fh:
            out = csv.DictWriter(fh, fieldnames=list(dropped[0].keys()))
            out.writeheader()
            out.writerows(dropped)
        print(
            f"{len(dropped)} duplicate-UPRN rows dropped -> {args.collisions} "
            f"({len(kept)} unique to insert)"
        )

    _preview(kept)

    if args.commit:
        inserted = insert_rows(kept, args.portfolio)
        print(
            f"\nInserted {inserted} new properties into portfolio {args.portfolio} "
            f"({len(kept) - inserted} already existed; ON CONFLICT DO NOTHING)."
        )
    else:
        print(
            f"\nDRY RUN — nothing written. {len(kept)} rows would be inserted into "
            f"portfolio {args.portfolio}. Re-run with --commit to write."
        )
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,212 @@
"""Step 2 (Durkan portfolio): split step-1 matches, reshape the confident ones.
Reads durkan_domna_filled.csv (step 1) and SPLITS it in two — no re-resolution,
just column work:
* Rows we cannot confidently insert are held back to a client-clarification CSV
(durkan_client_clarification.csv) for Khalim to take to the client. Reasons:
not_found_no_match no UPRN was resolved.
no_flat_level_uprn a block of flats all collapsed onto one building
UPRN OS/EPC carry no flat-level records, so we
can't tell the flats apart.
unit_number_mismatch the matched house number differs from the input
(e.g. "9 ..." matched "9A ..."), so the property is
ambiguous.
* Every remaining row is reshaped into the columns the finaliser reads
(bulk_upload_finaliser_orchestrator), written to durkan_finaliser_input.csv
ready for step 3:
Address 1/2/3 | postcode | Internal Reference | address2uprn_uprn
| address2uprn_address | address2uprn_lexiscore
Internal Reference is left blank (landlord_property_id null, by decision).
python -m scripts.lisasrequest.resolve_uprns_for_finaliser
This stage hits no APIs. The held rows are not lost: once the client confirms
them, they can be appended to the finaliser input by hand.
"""
from __future__ import annotations
import argparse
import csv
import sys
from collections import Counter
from pathlib import Path
from typing import Optional
_REPO_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
from scripts.lisasrequest.fill_domna_address import ( # noqa: E402
ADDRESS_COL,
FOUND_ADDRESS_COL,
FOUND_UPRN_COL,
LEXISCORE_COL,
POSTCODE_COL,
SOURCE_COL,
)
from scripts.lisasrequest.review_flags import address_numbers, input_unit # noqa: E402
# Finaliser input columns — must match bulk_upload_finaliser_orchestrator
# (ADDRESS_COLS / POSTCODE_COL / INTERNAL_REF_COL / UPRN_COL /
# MATCHED_ADDRESS_COL / LEXISCORE_COL). Hard-coded to keep this a light,
# stdlib-only reshape; step 3 imports the real orchestrator and will fail loudly
# if these ever drift.
FIN_ADDRESS_1, FIN_ADDRESS_2, FIN_ADDRESS_3 = "Address 1", "Address 2", "Address 3"
FIN_POSTCODE = "postcode"
FIN_INTERNAL_REF = "Internal Reference"
FIN_UPRN = "address2uprn_uprn"
FIN_MATCHED_ADDRESS = "address2uprn_address"
FIN_LEXISCORE = "address2uprn_lexiscore"
# Header (and column order) of the finaliser-input CSV.
_FINALISER_COLS = [
    FIN_ADDRESS_1,
    FIN_ADDRESS_2,
    FIN_ADDRESS_3,
    FIN_POSTCODE,
    FIN_INTERNAL_REF,
    FIN_UPRN,
    FIN_MATCHED_ADDRESS,
    FIN_LEXISCORE,
]
# Client-clarification report columns (kept human-readable for the client).
CONTEXT_COLS = ["address", "postcode", "No.", "Address Block"]
DOMNA_COLS = [FOUND_ADDRESS_COL, FOUND_UPRN_COL, LEXISCORE_COL, SOURCE_COL]
REASON_COL = "clarification_reason"
ACTION_COL = "action_needed"
# Header (and column order) of the client-clarification CSV.
_CLARIFY_COLS = CONTEXT_COLS + DOMNA_COLS + [REASON_COL, ACTION_COL]
# Sort rank of each reason in the clarification CSV, and the order main() prints them.
_REASON_ORDER = {
    "not_found_no_match": 0,
    "no_flat_level_uprn": 1,
    "unit_number_mismatch": 2,
}
# Client-facing instruction written to the action column for each reason.
_REASON_ACTION = {
    "not_found_no_match": "No UPRN found for this address — please confirm the "
    "exact address or provide the UPRN.",
    "no_flat_level_uprn": "Address registers hold only the building, not the "
    "individual flats — please provide a UPRN per flat, or confirm a "
    "building-level record is acceptable.",
    "unit_number_mismatch": "Closest match has a different unit number (see "
    "domna_address_found) — please confirm the correct property / UPRN.",
}
# Default step-1 input and the two output CSVs, all under scripts/lisasrequest/.
_DEFAULT_IN = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_domna_filled.csv"
_DEFAULT_FINALISER = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_finaliser_input.csv"
_DEFAULT_CLARIFY = (
    _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_client_clarification.csv"
)
def read_rows(path: Path) -> list[dict[str, str]]:
    """Load every data row of a CSV as a plain dict (utf-8-sig, so a BOM is dropped)."""
    with path.open(newline="", encoding="utf-8-sig") as handle:
        records = list(map(dict, csv.DictReader(handle)))
    return records
def clarification_reason(
    row: dict[str, str], uprn_counts: Counter[str]
) -> Optional[str]:
    """Name the reason a row must be held for the client, or None when it can be finalised.

    Checks, in order of precedence:
      * "not_found_no_match"   - no UPRN, or the source is "not_found".
      * "no_flat_level_uprn"   - the UPRN is shared by more than one input row
                                 (whether or not the unit number matches, a
                                 shared UPRN would collide at finalise).
      * "unit_number_mismatch" - the input's unit number is absent from the
                                 matched address.
    """
    uprn = row.get(FOUND_UPRN_COL, "")
    if not uprn or row.get(SOURCE_COL) == "not_found":
        return "not_found_no_match"

    unit = input_unit(row.get(ADDRESS_COL, ""))
    unit_absent = bool(unit) and unit not in address_numbers(
        row.get(FOUND_ADDRESS_COL, "")
    )
    if uprn_counts[uprn] > 1:
        return "no_flat_level_uprn"
    return "unit_number_mismatch" if unit_absent else None
def to_finaliser_row(row: dict[str, str]) -> dict[str, str]:
    """Map a step-1 row onto the finaliser's input columns.

    Address 2/3 and Internal Reference are always blank (landlord_property_id
    null, by decision); the other columns are copied from the step-1 row, with
    "" for any that are missing.
    """
    # (finaliser column, step-1 column or None for an always-blank column)
    layout = (
        (FIN_ADDRESS_1, ADDRESS_COL),
        (FIN_ADDRESS_2, None),
        (FIN_ADDRESS_3, None),
        (FIN_POSTCODE, POSTCODE_COL),
        (FIN_INTERNAL_REF, None),
        (FIN_UPRN, FOUND_UPRN_COL),
        (FIN_MATCHED_ADDRESS, FOUND_ADDRESS_COL),
        (FIN_LEXISCORE, LEXISCORE_COL),
    )
    return {
        target: "" if source is None else row.get(source, "")
        for target, source in layout
    }
def to_clarify_row(row: dict[str, str], reason: str) -> dict[str, str]:
    """Build a client-clarification record: context and DOMNA columns plus reason and action.

    Raises:
        KeyError: if ``reason`` has no entry in ``_REASON_ACTION``.
    """
    record = {col: row.get(col, "") for col in (*CONTEXT_COLS, *DOMNA_COLS)}
    record.update({REASON_COL: reason, ACTION_COL: _REASON_ACTION[reason]})
    return record
def split(
    rows: list[dict[str, str]],
    *,
    accept_unit_mismatch: bool = False,
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Partition step-1 rows into (finaliser_rows, clarification_rows).

    A row goes to the finaliser when it has no clarification reason, or when
    its only reason is ``unit_number_mismatch`` and ``accept_unit_mismatch`` is
    set (a near-miss like 9 -> 9A the client has already confirmed). Everything
    else is held back. Clarification rows are ordered by reason, otherwise
    keeping input order.
    """
    all_uprns = (r.get(FOUND_UPRN_COL, "") for r in rows)
    uprn_counts: Counter[str] = Counter(u for u in all_uprns if u)

    finaliser: list[dict[str, str]] = []
    clarify: list[dict[str, str]] = []
    for row in rows:
        reason = clarification_reason(row, uprn_counts)
        waived = accept_unit_mismatch and reason == "unit_number_mismatch"
        if reason is not None and not waived:
            clarify.append(to_clarify_row(row, reason))
        else:
            finaliser.append(to_finaliser_row(row))

    def _rank(record: dict[str, str]) -> int:
        return _REASON_ORDER.get(record[REASON_COL], 9)

    clarify.sort(key=_rank)
    return finaliser, clarify
def write_csv(rows: list[dict[str, str]], path: Path, fieldnames: list[str]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV with exactly the ``fieldnames`` columns.

    Row keys outside ``fieldnames`` are ignored.
    """
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=fieldnames, extrasaction="ignore")
        out.writeheader()
        for record in rows:
            out.writerow(record)
def main() -> int:
    """Run step 2: split the step-1 CSV into finaliser input and client-clarification CSVs.

    Prints how many rows went each way, then the held-back count per reason.

    Returns:
        0, used as the process exit status.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    cli.add_argument("--finaliser-out", type=Path, default=_DEFAULT_FINALISER)
    cli.add_argument("--clarify-out", type=Path, default=_DEFAULT_CLARIFY)
    cli.add_argument(
        "--accept-unit-mismatch",
        action="store_true",
        help="reshape unit_number_mismatch rows (e.g. 9->9A) into the finaliser "
        "input instead of holding them for the client",
    )
    args = cli.parse_args()

    rows = read_rows(args.inp)
    finaliser, clarify = split(rows, accept_unit_mismatch=args.accept_unit_mismatch)
    for records, target, header in (
        (finaliser, args.finaliser_out, _FINALISER_COLS),
        (clarify, args.clarify_out, _CLARIFY_COLS),
    ):
        write_csv(records, target, header)

    counts: Counter[str] = Counter()
    counts.update(record[REASON_COL] for record in clarify)
    print(f"Read {len(rows)} step-1 rows.")
    print(f" -> {len(finaliser)} confident rows reshaped -> {args.finaliser_out}")
    print(f" -> {len(clarify)} held for client -> {args.clarify_out}")
    for reason in _REASON_ORDER:
        print(f" {reason}: {counts.get(reason, 0)}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,135 @@
"""Flag step-1 matches that need a human eye, for review before finalising.
Reads durkan_domna_filled.csv (the step-1 output) and writes a review CSV of
only the rows carrying at least one flag, newest-doubt-first:
not_found no UPRN resolved at all.
unit_not_in_match the input flat/house number does NOT appear in the matched
address the high-precision "wrong property" signal. Two
shapes: a near-miss ("9 VANBRUGH" matched "9A, VANBRUGH")
or a flat collapsing onto its building ("FLAT 1, 20 WARWICK"
matched "20, WARWICK ROAD").
dup_uprn the same UPRN was resolved for >1 input row typically a
block of flats all collapsing onto the building UPRN; all
but one will be dropped at finalise.
low_score lexiscore < 0.70 (a weak match, just over the OS bar). NOTE:
on its own this is noisy truncated EPC addresses and extra
locality tokens push correct matches below 0.70. Treat it as
informational unless paired with one of the flags above.
python -m scripts.lisasrequest.review_flags
"""
from __future__ import annotations
import argparse
import csv
import re
import sys
from collections import Counter
from pathlib import Path
_REPO_ROOT = Path(__file__).resolve().parents[2]
# Column names read from the step-1 CSV.
ADDRESS_COL = "address"
POSTCODE_COL = "postcode"
FOUND_ADDRESS_COL = "domna_address_found"
FOUND_UPRN_COL = "domna_address_uprn"
LEXISCORE_COL = "domna_lexiscore"
SOURCE_COL = "domna_source"
# Rows scoring strictly below this get the low_score flag.
LOW_SCORE = 0.70
# Default step-1 input and review-CSV output, both under scripts/lisasrequest/.
_DEFAULT_IN = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_domna_filled.csv"
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_review_flags.csv"
# Review CSV header. flag_rows() copies every column but the last from the
# input row and fills the trailing "flags" column itself.
_REVIEW_COLS = [
    ADDRESS_COL,
    POSTCODE_COL,
    FOUND_ADDRESS_COL,
    FOUND_UPRN_COL,
    LEXISCORE_COL,
    SOURCE_COL,
    "flags",
]
def input_unit(address: str) -> str:
    """Return the unit number that identifies an input address, upper-cased.

    The number following the word FLAT wins; failing that, the number the
    address starts with. "" when the address has neither.
    """
    text = address.upper()
    candidates = (
        re.search(r"\bFLAT\s+(\d+[A-Z]?)", text),
        re.match(r"\s*(\d+[A-Z]?)\b", text),
    )
    for found in candidates:
        if found:
            return found.group(1)
    return ""
def address_numbers(address: str) -> set[str]:
    """Collect every standalone number token in an address, upper-cased (e.g. {"3", "20"})."""
    text = address.upper()
    return {found.group(0) for found in re.finditer(r"\b\d+[A-Z]?\b", text)}
def _score(value: str) -> float:
try:
return float(value)
except (TypeError, ValueError):
return 0.0
def flag_rows(rows: list[dict[str, str]]) -> list[dict[str, str]]:
"""Return the flagged subset, each with a ';'-joined ``flags`` field."""
uprn_counts = Counter(
r.get(FOUND_UPRN_COL, "") for r in rows if r.get(FOUND_UPRN_COL)
)
flagged: list[dict[str, str]] = []
for row in rows:
uprn = row.get(FOUND_UPRN_COL, "")
source = row.get(SOURCE_COL, "")
flags: list[str] = []
if source == "not_found" or not uprn:
flags.append("not_found")
else:
unit = input_unit(row.get(ADDRESS_COL, ""))
if unit and unit not in address_numbers(row.get(FOUND_ADDRESS_COL, "")):
flags.append("unit_not_in_match")
if uprn_counts[uprn] > 1:
flags.append("dup_uprn")
if _score(row.get(LEXISCORE_COL, "")) < LOW_SCORE:
flags.append("low_score")
if flags:
flagged.append({**{c: row.get(c, "") for c in _REVIEW_COLS[:-1]},
"flags": ";".join(flags)})
# not_found first, then mismatches, then dup/low.
order = {"not_found": 0, "unit_not_in_match": 1, "dup_uprn": 2, "low_score": 3}
flagged.sort(key=lambda r: order.get(r["flags"].split(";")[0], 9))
return flagged
def main() -> int:
    """Read the step-1 CSV, write the flagged rows to the review CSV, print a per-flag tally.

    Returns:
        0, used as the process exit status.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    cli.add_argument("--out", type=Path, default=_DEFAULT_OUT)
    args = cli.parse_args()

    with args.inp.open(newline="", encoding="utf-8-sig") as fh:
        rows = list(map(dict, csv.DictReader(fh)))

    flagged = flag_rows(rows)
    with args.out.open("w", newline="", encoding="utf-8") as fh:
        out = csv.DictWriter(fh, fieldnames=_REVIEW_COLS, extrasaction="ignore")
        out.writeheader()
        out.writerows(flagged)

    # A row can carry several flags, so count each flag separately.
    counts: Counter[str] = Counter()
    for record in flagged:
        counts.update(record["flags"].split(";"))

    print(f"{len(flagged)}/{len(rows)} rows flagged for review -> {args.out}")
    for name in ("not_found", "unit_not_in_match", "dup_uprn", "low_score"):
        print(f" {name}: {counts.get(name, 0)}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,328 @@
"""Resolve a CSV of addresses to UPRNs, ready to feed the bulk-upload finaliser.
Takes a CSV with `Address 1/2/3` + `postcode` columns and, per row, resolves a
UPRN by trying in order the new EPC API (address2uprn), the historic EPC S3
dataset, then the Ordnance Survey Places API as a fallback. Whichever source
wins, the result is written into the SAME three columns the finaliser reads
(`bulk_upload_finaliser_orchestrator`):
address2uprn_uprn UPRN integer (empty when unresolved)
address2uprn_address the matched address
address2uprn_lexiscore the match score in [0, 1]
A `resolution_source` diagnostic column (epc / epc_historic / ordnance_survey /
none) is appended too; the finaliser ignores unknown columns. All original
columns are preserved in their original order, so the output CSV drops straight
into the finaliser.
python -m scripts.resolve_uprns_for_finaliser input.csv -o resolved.csv
# OS-only / EPC-only, custom postcode column, custom OS score threshold
python -m scripts.resolve_uprns_for_finaliser in.csv -o out.csv --no-epc
python -m scripts.resolve_uprns_for_finaliser in.csv -o out.csv --postcode-col Postcode --os-threshold 0.6
Keys are read from backend/.env: OPEN_EPC_API_TOKEN (EPC) and
ORDNANCE_SURVEY_API_KEY (OS Places). Run from the worktree root (import trap).
The module-level functions (`load_keys`, `read_rows`, `resolve_row`, `process`,
`write_rows`) are written to be driven line-by-line from a REPL as well as via
the CLI.
"""
from __future__ import annotations
import argparse
import csv
import os
import sys
from pathlib import Path
from typing import Optional
import pandas as pd
from dotenv import load_dotenv
_REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
from backend.address2UPRN.main import ( # noqa: E402
get_epc_data_with_postcode,
get_uprn_from_historic_epc,
get_uprn_with_epc_df,
)
from backend.ordnanceSurvey.helpers import ( # noqa: E402
lookup_os_places,
os_places_results_to_dataframe,
)
from backend.utils.addressMatch import AddressMatch # noqa: E402
# Columns the finaliser reads (bulk_upload_finaliser_orchestrator).
UPRN_COL = "address2uprn_uprn"
MATCHED_ADDRESS_COL = "address2uprn_address"
LEXISCORE_COL = "address2uprn_lexiscore"
# Diagnostic column naming the source that produced the match.
SOURCE_COL = "resolution_source"
_RESULT_COLS = (UPRN_COL, MATCHED_ADDRESS_COL, LEXISCORE_COL, SOURCE_COL)
# A resolved hit: (uprn, matched_address, lexiscore, source).
Resolution = tuple[str, str, float, str]
def load_keys() -> tuple[Optional[str], Optional[str]]:
    """Return (epc_token, os_api_key) after loading backend/.env into the process env.

    Each value is None when its variable (OPEN_EPC_API_TOKEN /
    ORDNANCE_SURVEY_API_KEY) is not set.
    """
    load_dotenv(_REPO_ROOT / "backend" / ".env")
    return (
        os.environ.get("OPEN_EPC_API_TOKEN"),
        os.environ.get("ORDNANCE_SURVEY_API_KEY"),
    )
def read_rows(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Load a CSV file as (rows, fieldnames), keeping the header's column order.

    The file is decoded as utf-8-sig, so a leading BOM is dropped.
    """
    with path.open(newline="", encoding="utf-8-sig") as handle:
        reader = csv.DictReader(handle)
        header = list(reader.fieldnames or [])
        records = list(map(dict, reader))
    return records, header
def clean_postcode(postcode: str) -> str:
    """Sanitise to the no-space upper form the EPC/OS lookups expect (e.g. E84SQ).

    All whitespace is removed, not just ASCII spaces: spreadsheet exports often
    carry tabs or non-breaking spaces inside a postcode, and those previously
    survived into the lookup key.
    """
    # str.split() with no argument splits on any Unicode whitespace run.
    return "".join(postcode.split()).upper()
def build_address(row: dict[str, str]) -> str:
    """Join Address 1/2/3 with single spaces, the same way the address2uprn lambda does.

    Each part is stripped; a missing or empty part contributes an empty string
    (so an empty middle part leaves a double space). The joined result is
    stripped at both ends.
    """
    parts = []
    for col in ("Address 1", "Address 2", "Address 3"):
        parts.append(str(row.get(col, "") or "").strip())
    return " ".join(parts).strip()
def resolve_epc(
    address: str, postcode_clean: str, epc_cache: dict[str, pd.DataFrame]
) -> Optional[Resolution]:
    """Resolve via the new EPC API first, then fall back to the historic EPC lookup.

    `epc_cache` is mutated: the EPC frame fetched for a postcode is stored under
    that postcode, so passing the same dict across rows fetches each postcode
    only once (as long as the stored value is not None).

    Returns a `Resolution` tagged "epc" or "epc_historic", or None when neither
    lookup produced a match.
    """
    frame = epc_cache.get(postcode_clean)
    if frame is None:
        frame = epc_cache[postcode_clean] = get_epc_data_with_postcode(
            postcode=postcode_clean
        )

    current = get_uprn_with_epc_df(
        user_inputed_address=address, epc_df=frame, verbose=True
    )
    # Only a tuple result from the current-EPC match is treated as a hit.
    if isinstance(current, tuple):
        hit, source = current, "epc"
    else:
        hit = get_uprn_from_historic_epc(
            user_inputed_address=address, postcode=postcode_clean
        )
        if hit is None:
            return None
        source = "epc_historic"

    uprn, matched, score = hit
    return str(uprn), str(matched), float(score), source
def resolve_os(
    address: str,
    postcode_clean: str,
    os_api_key: str,
    os_cache: dict[str, pd.DataFrame],
    threshold: float,
) -> Optional[Resolution]:
    """Resolve via OS Places: the highest-scoring address at or above `threshold`.

    `os_cache` is mutated to hold one frame per postcode. A response whose
    status is not 200, or that has no "data" key, is cached as an empty frame.
    Returns None when the frame is empty, lacks an "ADDRESS" column, or no
    candidate reaches the threshold; ties keep the earliest candidate.
    """
    frame = os_cache.get(postcode_clean)
    if frame is None:
        reply = lookup_os_places(postcode_clean, os_api_key)
        usable = reply.get("status") == 200 and "data" in reply
        frame = os_places_results_to_dataframe(reply["data"]) if usable else pd.DataFrame()
        os_cache[postcode_clean] = frame

    if frame.empty or "ADDRESS" not in frame.columns:
        return None

    # Iterate plain records — avoids pandas' partially-unknown indexing types.
    records: list[dict[str, object]] = frame.to_dict(orient="records")
    winner: Optional[Resolution] = None
    for record in records:
        text = str(record.get("ADDRESS", ""))
        rating = AddressMatch.score(address, text)
        if not rating >= threshold:
            continue
        if winner is None or rating > winner[2]:
            winner = (str(record.get("UPRN", "")), text, rating, "ordnance_survey")
    return winner
def resolve_row(
    row: dict[str, str],
    *,
    epc_token: Optional[str],
    os_api_key: Optional[str],
    epc_cache: dict[str, pd.DataFrame],
    os_cache: dict[str, pd.DataFrame],
    postcode_col: str,
    use_epc: bool,
    use_os: bool,
    os_threshold: float,
    validate_postcode: bool,
) -> dict[str, str]:
    """Fill the finaliser columns on `row` (mutated in place) and return it.

    EPC (new + historic) is tried first, then OS Places. A resolver runs only
    when its `use_*` flag is set and its key is truthy. When nothing matches —
    blank address or postcode, a postcode rejected by validation, or no hit —
    the three result columns are written empty and `resolution_source` is "none".
    An exception raised by a resolver is printed and the next step is tried.
    """
    address = build_address(row)
    postcode_clean = clean_postcode(str(row.get(postcode_col, "") or ""))

    def finish(hit: Optional[Resolution]) -> dict[str, str]:
        # An unmatched row gets empty result cells and the "none" marker.
        if hit is None:
            cells = ("", "", "", "none")
        else:
            cells = (hit[0], hit[1], str(hit[2]), hit[3])
        for column, value in zip(_RESULT_COLS, cells):
            row[column] = value
        return row

    if not (address and postcode_clean):
        return finish(None)
    if validate_postcode and not AddressMatch.is_valid_postcode(postcode_clean):
        return finish(None)

    if use_epc and epc_token:
        try:
            epc_hit = resolve_epc(address, postcode_clean, epc_cache)
            if epc_hit is not None:
                return finish(epc_hit)
        except Exception as exc:  # keep going on a per-row API/lookup failure
            print(f" EPC lookup failed for {address!r} / {postcode_clean}: {exc}")

    if use_os and os_api_key:
        try:
            os_hit = resolve_os(
                address, postcode_clean, os_api_key, os_cache, os_threshold
            )
            if os_hit is not None:
                return finish(os_hit)
        except Exception as exc:
            print(f" OS lookup failed for {address!r} / {postcode_clean}: {exc}")

    return finish(None)
def process(
    rows: list[dict[str, str]],
    *,
    epc_token: Optional[str],
    os_api_key: Optional[str],
    postcode_col: str = "postcode",
    use_epc: bool = True,
    use_os: bool = True,
    os_threshold: float = 0.5,
    validate_postcode: bool = True,
) -> list[dict[str, str]]:
    """Resolve each row in turn and return the same (mutated) list.

    One EPC cache and one OS cache are shared across all rows. A progress line
    is printed after every row so REPL/CLI runs show where they are.
    """
    epc_cache: dict[str, pd.DataFrame] = {}
    os_cache: dict[str, pd.DataFrame] = {}
    total = len(rows)
    for position, row in enumerate(rows, start=1):
        resolve_row(
            row,
            epc_token=epc_token,
            os_api_key=os_api_key,
            epc_cache=epc_cache,
            os_cache=os_cache,
            postcode_col=postcode_col,
            use_epc=use_epc,
            use_os=use_os,
            os_threshold=os_threshold,
            validate_postcode=validate_postcode,
        )
        outcome = row[UPRN_COL] or "(no match)"
        print(
            f"[{position}/{total}] {build_address(row)!r} -> "
            f"{outcome} ({row[SOURCE_COL]})"
        )
    return rows
def write_rows(rows: list[dict[str, str]], path: Path, fieldnames: list[str]) -> None:
    """Write `rows` to `path` as UTF-8 CSV.

    The header is the input `fieldnames` in order, followed by any result
    column not already present. Row keys outside the header are ignored.
    """
    missing = [col for col in _RESULT_COLS if col not in fieldnames]
    header = [*fieldnames, *missing]
    with path.open("w", newline="", encoding="utf-8") as handle:
        sink = csv.DictWriter(handle, fieldnames=header, extrasaction="ignore")
        sink.writeheader()
        sink.writerows(rows)
def _parse_args() -> argparse.Namespace:
    """Parse the command-line arguments for the resolver script.

    Returns:
        The parsed namespace: ``input``, ``out``, ``postcode_col``, ``no_epc``,
        ``no_os``, ``os_threshold``, ``no_validate_postcode`` and ``limit``.

    Raises:
        SystemExit: on any argparse usage error, including a negative
            ``--limit``. A negative value would make ``rows[:limit]`` drop the
            LAST N rows rather than keep the first N, so it is rejected here.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input", type=Path, help="input CSV (Address 1/2/3 + postcode)")
    parser.add_argument(
        "-o", "--out", type=Path, required=True, help="output CSV for the finaliser"
    )
    parser.add_argument("--postcode-col", default="postcode", help="postcode column name")
    parser.add_argument("--no-epc", action="store_true", help="skip EPC resolution")
    parser.add_argument("--no-os", action="store_true", help="skip Ordnance Survey fallback")
    parser.add_argument(
        "--os-threshold", type=float, default=0.5, help="min OS match score (default 0.5)"
    )
    parser.add_argument(
        "--no-validate-postcode",
        action="store_true",
        help="skip the postcodes.io validity check (one HTTP call per postcode)",
    )
    parser.add_argument("--limit", type=int, default=None, help="process only the first N rows")
    args = parser.parse_args()
    if args.limit is not None and args.limit < 0:
        # parser.error prints the usage line and exits with status 2.
        parser.error("--limit must be zero or a positive integer")
    return args
def main() -> int:
    """Run the CLI end to end: read the CSV, resolve each row, write the output.

    Returns 0 after writing the output, or 2 when no resolver can run (both
    disabled by flags and/or missing keys).
    """
    args = _parse_args()
    epc_token, os_api_key = load_keys()
    use_epc, use_os = not args.no_epc, not args.no_os

    # A requested resolver whose key is missing is switched off with a notice.
    if use_epc and not epc_token:
        print("OPEN_EPC_API_TOKEN not set (backend/.env) — EPC resolution disabled")
        use_epc = False
    if use_os and not os_api_key:
        print("ORDNANCE_SURVEY_API_KEY not set (backend/.env) — OS fallback disabled")
        use_os = False
    if not (use_epc or use_os):
        print("No resolver enabled (missing keys or both --no-* flags). Nothing to do.")
        return 2

    rows, fieldnames = read_rows(args.input)
    limit = args.limit
    if limit is not None:
        rows = rows[:limit]
    print(f"Loaded {len(rows)} rows from {args.input}")

    process(
        rows,
        epc_token=epc_token,
        os_api_key=os_api_key,
        postcode_col=args.postcode_col,
        use_epc=use_epc,
        use_os=use_os,
        os_threshold=args.os_threshold,
        validate_postcode=not args.no_validate_postcode,
    )
    write_rows(rows, args.out, fieldnames)

    # A row counts as resolved when its UPRN cell is non-empty.
    resolved = len([entry for entry in rows if entry.get(UPRN_COL)])
    print(f"\nResolved {resolved}/{len(rows)} rows. Wrote {args.out}")
    return 0
if __name__ == "__main__":
    # Exit with main()'s return value as the process status.
    raise SystemExit(main())

View file

@ -0,0 +1,79 @@
"""The landlord-description-overrides handler's column wiring (`_build_columns`).
A `column_mapping` entry of ``{category -> source header}`` must produce a
ClassifiableColumn that reads the named header and classifies into the
category's enum. This pins the main_fuel, glazing, construction_age_band,
water_heating and main_heating_system categories onto the wiring.
"""
from __future__ import annotations
from typing import cast
from applications.landlord_description_overrides.handler import _build_columns # pyright: ignore[reportPrivateUsage]
from infrastructure.chatgpt.chatgpt import ChatGPT
def test_build_columns_wires_a_main_fuel_classifier_column() -> None:
    """A ``main_fuel`` mapping entry yields one column reading "Main Fuel"."""
    # Arrange — the factory only stores the injected collaborators, so a bare
    # object stands in for the (I/O-bound) ChatGPT client and None for the DB
    # session.
    stub_client = cast(ChatGPT, object())

    # Act
    built = _build_columns({"main_fuel": "Main Fuel"}, stub_client, None)

    # Assert — exactly one column, carrying the category name and its header.
    assert len(built) == 1
    only = built[0]
    assert only.name == "main_fuel"
    assert only.source_column == "Main Fuel"
def test_build_columns_wires_a_glazing_classifier_column() -> None:
    """A ``glazing`` mapping entry yields one column reading "Glazing"."""
    # Arrange
    stub_client = cast(ChatGPT, object())

    # Act
    built = _build_columns({"glazing": "Glazing"}, stub_client, None)

    # Assert — exactly one column, carrying the category name and its header.
    assert len(built) == 1
    only = built[0]
    assert only.name == "glazing"
    assert only.source_column == "Glazing"
def test_build_columns_wires_a_construction_age_band_classifier_column() -> None:
    """A ``construction_age_band`` entry yields one column reading "Age"."""
    # Arrange
    stub_client = cast(ChatGPT, object())

    # Act
    built = _build_columns({"construction_age_band": "Age"}, stub_client, None)

    # Assert — exactly one column, carrying the category name and its header.
    assert len(built) == 1
    only = built[0]
    assert only.name == "construction_age_band"
    assert only.source_column == "Age"
def test_build_columns_wires_a_water_heating_classifier_column() -> None:
    """A ``water_heating`` mapping entry yields one column reading "Hot Water"."""
    # Arrange
    stub_client = cast(ChatGPT, object())

    # Act
    built = _build_columns({"water_heating": "Hot Water"}, stub_client, None)

    # Assert
    assert len(built) == 1
    only = built[0]
    assert only.name == "water_heating"
    assert only.source_column == "Hot Water"
def test_build_columns_wires_a_main_heating_system_classifier_column() -> None:
    """A ``main_heating_system`` entry yields one column reading "Heating"."""
    # Arrange
    stub_client = cast(ChatGPT, object())

    # Act
    built = _build_columns({"main_heating_system": "Heating"}, stub_client, None)

    # Assert
    assert len(built) == 1
    only = built[0]
    assert only.name == "main_heating_system"
    assert only.source_column == "Heating"

View file

@ -0,0 +1,109 @@
"""The Landlord-Override construction-age-band → fabric Simulation Overlay.
An age-band value resolves to the RdSAP letter code the calculator's U-value
cascades read from `SapBuildingPart.construction_age_band`; the overlay targets
the override's building part.
"""
from __future__ import annotations
import pytest
from datatypes.epc.domain.epc_property_data import (
BuildingPartIdentifier,
EpcPropertyData,
SapBuildingPart,
)
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
from domain.epc.property_overlays.construction_age_band_overlay import (
age_band_overlay_for,
)
from domain.modelling.scoring.overlay_applicator import apply_simulations
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
build_epc,
)
def _part(
    epc: EpcPropertyData, identifier: BuildingPartIdentifier
) -> SapBuildingPart:
    """Return the first building part of `epc` whose identifier is `identifier`.

    Raises StopIteration when no part carries that identifier.
    """
    matching = (
        part for part in epc.sap_building_parts if part.identifier is identifier
    )
    return next(matching)
def test_age_band_overlays_the_main_building_part() -> None:
    """Band "B" for building part 0 lands on the MAIN building-part overlay."""
    # Act — band B (1900-1929) on the main building part.
    produced = age_band_overlay_for("B", 0)

    # Assert
    assert produced is not None
    main_overlay = produced.building_parts[BuildingPartIdentifier.MAIN]
    assert main_overlay.construction_age_band == "B"
def test_age_band_overlay_targets_the_extension_building_part() -> None:
    """Building part 1 is addressed as EXTENSION_1 in the overlay."""
    # Act — building_part 1 is the first extension.
    produced = age_band_overlay_for("L", 1)

    # Assert
    assert produced is not None
    assert BuildingPartIdentifier.EXTENSION_1 in produced.building_parts
    extension_overlay = produced.building_parts[BuildingPartIdentifier.EXTENSION_1]
    assert extension_overlay.construction_age_band == "L"
def test_lowercase_age_band_is_normalised_to_its_letter_code() -> None:
    """A lower-case band letter is stored upper-cased on the overlay."""
    # Act
    produced = age_band_overlay_for("d", 0)

    # Assert — the calculator upper-cases the band; the overlay stores it upper.
    assert produced is not None
    main_overlay = produced.building_parts[BuildingPartIdentifier.MAIN]
    assert main_overlay.construction_age_band == "D"
@pytest.mark.parametrize("age_band_value", ["Z", "", "1900-1929", "Unknown"])
def test_unrecognised_age_band_produces_no_overlay(age_band_value: str) -> None:
    """Values that are not a recognised band letter yield no overlay."""
    # Act
    produced = age_band_overlay_for(age_band_value, 0)

    # Assert
    assert produced is None
def test_age_band_override_re_dates_the_main_part_only() -> None:
    """Applying a main-part band override leaves the extension's band alone."""
    # Arrange — baseline main + extension are both band B; the landlord corrects
    # the main building's age band to F (1976-1982).
    starting_epc = build_epc()
    correction = age_band_overlay_for("F", 0)
    assert correction is not None

    # Act
    updated_epc = apply_simulations(starting_epc, [correction])

    # Assert — the main part is re-dated (its U-value cascade now keys on F); the
    # extension is left untouched.
    main_part = _part(updated_epc, BuildingPartIdentifier.MAIN)
    assert main_part.construction_age_band == "F"
    extension_part = _part(updated_epc, BuildingPartIdentifier.EXTENSION_1)
    assert extension_part.construction_age_band == "B"
@pytest.mark.parametrize(
    "member",
    [band for band in ConstructionAgeBand if band is not ConstructionAgeBand.UNKNOWN],
)
def test_every_resolvable_age_band_value_decodes_to_an_overlay(
    member: ConstructionAgeBand,
) -> None:
    """Every non-UNKNOWN ConstructionAgeBand value must produce an overlay."""
    # A classifier emits a ConstructionAgeBand value; if the overlay can't decode
    # it the override silently no-ops, so each resolvable member is checked.
    # Act
    produced = age_band_overlay_for(member.value, 0)

    # Assert
    assert produced is not None

View file

@ -0,0 +1,90 @@
"""The Landlord-Override glazing → glazing Simulation Overlay mapping.
A glazing value resolves to the SAP10 `glazing_type` code the calculator's
Table-24 cascade reads; the overlay is whole-dwelling (expanded across every
window by `_fold_glazing`).
"""
from __future__ import annotations
import pytest
from domain.epc.property_overrides.glazing_type import GlazingType
from domain.epc.property_overlays.glazing_overlay import glazing_overlay_for
from domain.modelling.scoring.overlay_applicator import apply_simulations
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
build_epc,
)
def test_double_glazing_post_2002_overlays_its_glazing_code() -> None:
    """"Double glazing, 2002 or later" decodes to glazing_type 2."""
    # Act
    produced = glazing_overlay_for("Double glazing, 2002 or later", 0)

    # Assert — double glazing 2002-2021 is SAP10 glazing_type code 2.
    assert produced is not None
    glazing = produced.glazing
    assert glazing is not None
    assert glazing.glazing_type == 2
@pytest.mark.parametrize(
    ("glazing_value", "code"),
    [
        ("Single glazing", 1),
        ("Double glazing, pre-2002", 3),
        ("Triple glazing, 2002 or later", 9),
        ("Triple glazing, pre-2002", 6),
    ],
)
def test_glazing_types_decode_to_their_sap_codes(
    glazing_value: str, code: int
) -> None:
    """Each listed glazing description decodes to its expected glazing_type."""
    # Act
    produced = glazing_overlay_for(glazing_value, 0)

    # Assert
    assert produced is not None
    glazing = produced.glazing
    assert glazing is not None
    assert glazing.glazing_type == code
@pytest.mark.parametrize("glazing_value", ["Unknown", ""])
def test_unresolvable_glazing_produces_no_overlay(glazing_value: str) -> None:
    """"Unknown" and the empty string yield no glazing overlay."""
    # Act
    produced = glazing_overlay_for(glazing_value, 0)

    # Assert
    assert produced is None
def test_glazing_override_remaps_every_window_and_clears_lodged_u() -> None:
    """A single-glazing override rewrites every window and drops its lodged U."""
    # Arrange — baseline windows are double glazed (code 2, lodged U 2.8); the
    # landlord corrects the whole dwelling to single glazing.
    starting_epc = build_epc()
    assert len(starting_epc.sap_windows) > 1
    correction = glazing_overlay_for("Single glazing", 0)
    assert correction is not None

    # Act
    updated_epc = apply_simulations(starting_epc, [correction])

    # Assert — every window flips to single (code 1) and its lodged transmission
    # U is cleared so the Table-24 cascade re-derives U from the new type.
    assert all(window.glazing_type == 1 for window in updated_epc.sap_windows)
    assert all(
        window.window_transmission_details is None
        for window in updated_epc.sap_windows
    )
@pytest.mark.parametrize(
    "member", [kind for kind in GlazingType if kind is not GlazingType.UNKNOWN]
)
def test_every_resolvable_glazing_value_decodes_to_a_code(
    member: GlazingType,
) -> None:
    """Every non-UNKNOWN GlazingType value must produce an overlay."""
    # A classifier emits a GlazingType value; if the overlay can't decode it the
    # override silently no-ops, so each resolvable member is checked.
    # Act
    produced = glazing_overlay_for(member.value, 0)

    # Assert
    assert produced is not None

View file

@ -0,0 +1,115 @@
"""The Landlord-Override main-fuel → heating Simulation Overlay mapping.
A main-fuel value resolves to the RdSAP `main_fuel_type` int code the calculator
reads from the dwelling's primary heating system; the overlay is whole-dwelling.
"""
from __future__ import annotations
import pytest
from domain.epc.property_overrides.main_fuel_type import MainFuelType
from domain.epc.property_overlays.main_fuel_overlay import fuel_overlay_for
from domain.modelling.scoring.overlay_applicator import apply_simulations
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
build_epc,
)
def test_mains_gas_overlays_the_primary_fuel() -> None:
    """"mains gas" decodes to main_fuel_type 26 on the heating overlay."""
    # Act
    produced = fuel_overlay_for("mains gas", 0)

    # Assert — mains gas (not community) is RdSAP main_fuel code 26.
    assert produced is not None
    heating = produced.heating
    assert heating is not None
    assert heating.main_fuel_type == 26
@pytest.mark.parametrize(
    ("main_fuel_value", "code"),
    [
        ("electricity", 29),
        ("LPG (bulk)", 27),
        ("oil", 28),
        ("house coal", 33),
    ],
)
def test_fuels_decode_to_their_modern_not_community_codes(
    main_fuel_value: str, code: int
) -> None:
    """Each listed fuel decodes to its expected main_fuel_type code."""
    # Act
    produced = fuel_overlay_for(main_fuel_value, 0)

    # Assert
    assert produced is not None
    heating = produced.heating
    assert heating is not None
    assert heating.main_fuel_type == code
@pytest.mark.parametrize(
    ("main_fuel_value", "code"),
    [
        ("bottled LPG", 3),
        ("LPG special condition", 17),
        ("electricity (community)", 25),
        ("biomass (community)", 31),
        ("dual fuel (mineral and wood)", 10),
        ("smokeless coal", 15),
    ],
)
def test_more_fuels_decode_to_their_codes(main_fuel_value: str, code: int) -> None:
    """Further fuel descriptions decode to their expected main_fuel_type codes."""
    # Act
    produced = fuel_overlay_for(main_fuel_value, 0)

    # Assert
    assert produced is not None
    heating = produced.heating
    assert heating is not None
    assert heating.main_fuel_type == code
def test_community_mains_gas_is_a_distinct_fuel_code() -> None:
    """"mains gas (community)" decodes to 20, not the non-community 26."""
    # Act
    produced = fuel_overlay_for("mains gas (community)", 0)

    # Assert — community mains gas is code 20, distinct from 26 (not community).
    assert produced is not None
    heating = produced.heating
    assert heating is not None
    assert heating.main_fuel_type == 20
@pytest.mark.parametrize("main_fuel_value", ["Unknown", "", "no heating or hot water"])
def test_unresolvable_fuel_produces_no_overlay(main_fuel_value: str) -> None:
    """The listed unresolvable fuel values yield no overlay."""
    # Act
    produced = fuel_overlay_for(main_fuel_value, 0)

    # Assert
    assert produced is None
def test_fuel_override_remaps_the_primary_systems_fuel_on_the_epc() -> None:
    """Applying an electricity override sets the first main system's fuel to 29."""
    # Arrange — a landlord correction that the dwelling runs on electricity.
    starting_epc = build_epc()
    correction = fuel_overlay_for("electricity", 0)
    assert correction is not None

    # Act
    updated_epc = apply_simulations(starting_epc, [correction])

    # Assert — the calculator reads the primary fuel from main_heating_details[0].
    primary = updated_epc.sap_heating.main_heating_details[0]
    assert primary.main_fuel_type == 29
@pytest.mark.parametrize(
    "member", [fuel for fuel in MainFuelType if fuel is not MainFuelType.UNKNOWN]
)
def test_every_resolvable_fuel_value_decodes_to_a_code(member: MainFuelType) -> None:
    """Every non-UNKNOWN MainFuelType value must produce an overlay."""
    # A classifier emits a MainFuelType value; if the overlay can't decode it the
    # override silently no-ops, so each resolvable member is checked.
    # Act
    produced = fuel_overlay_for(member.value, 0)

    # Assert
    assert produced is not None

View file

@ -0,0 +1,140 @@
"""The Landlord-Override main-heating-system → heating Simulation Overlay mapping.
A main-heating-system value resolves to the SAP `sap_main_heating_code` the
calculator reads from the primary system; the overlay is whole-dwelling.
"""
from __future__ import annotations
import pytest
from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType
from domain.epc.property_overlays.main_fuel_overlay import fuel_overlay_for
from domain.epc.property_overlays.main_heating_system_overlay import (
main_heating_overlay_for,
)
from domain.epc.property_overlays.water_heating_overlay import (
water_heating_overlay_for,
)
from domain.modelling.scoring.overlay_applicator import apply_simulations
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
build_epc,
)
def test_gas_combi_overlays_the_primary_heating_code() -> None:
    """"Gas boiler, combi" decodes to sap_main_heating_code 104."""
    # Act
    produced = main_heating_overlay_for("Gas boiler, combi", 0)

    # Assert — condensing combi is SAP Table 4b code 104.
    assert produced is not None
    heating = produced.heating
    assert heating is not None
    assert heating.sap_main_heating_code == 104
@pytest.mark.parametrize(
    ("main_heating_value", "code"),
    [
        ("Gas boiler, regular", 102),
        ("Gas CPSU", 120),
        ("Electric storage heaters, fan", 404),
        ("Direct-acting electric", 191),
    ],
)
def test_heating_archetypes_decode_to_their_sap_codes(
    main_heating_value: str, code: int
) -> None:
    """Each listed heating archetype decodes to its sap_main_heating_code."""
    # Act
    produced = main_heating_overlay_for(main_heating_value, 0)

    # Assert
    assert produced is not None
    heating = produced.heating
    assert heating is not None
    assert heating.sap_main_heating_code == code
@pytest.mark.parametrize(
    ("main_heating_value", "code"),
    [
        ("Electric storage heaters, old", 401),
        ("Electric storage heaters, slimline", 402),
        ("Electric storage heaters, convector", 403),
    ],
)
def test_storage_heater_subtypes_decode_to_their_codes(
    main_heating_value: str, code: int
) -> None:
    """Each storage-heater subtype decodes to its own sap_main_heating_code."""
    # Act
    produced = main_heating_overlay_for(main_heating_value, 0)

    # Assert
    assert produced is not None
    heating = produced.heating
    assert heating is not None
    assert heating.sap_main_heating_code == code
@pytest.mark.parametrize(
    "main_heating_value",
    ["Unknown", "", "Air source heat pump", "Community heating"],
)
def test_unresolvable_or_unmodelled_heating_produces_no_overlay(
    main_heating_value: str,
) -> None:
    """Unknown, blank and not-yet-modelled heating values yield no overlay."""
    # Heat pumps (main_heating_index_number) and community heating (community
    # codes) don't map to a Table 4b sap_main_heating_code yet — no overlay.
    # Act
    produced = main_heating_overlay_for(main_heating_value, 0)

    # Assert
    assert produced is None
def test_main_heating_override_remaps_the_primary_system_code() -> None:
    """Applying a regular-gas-boiler override sets the first system's code to 102."""
    # Arrange
    starting_epc = build_epc()
    correction = main_heating_overlay_for("Gas boiler, regular", 0)
    assert correction is not None

    # Act
    updated_epc = apply_simulations(starting_epc, [correction])

    # Assert — the calculator reads the code off main_heating_details[0].
    primary = updated_epc.sap_heating.main_heating_details[0]
    assert primary.sap_main_heating_code == 102
def test_the_three_heating_overrides_compose_without_conflict() -> None:
    """Fuel, water-heating and main-heating overrides each land on their own field."""
    # Arrange — main_fuel, water_heating and main_heating_system all fold onto one
    # HeatingOverlay surface but set DISJOINT fields, so they compose (the
    # field-disjoint design that makes precedence moot for these three).
    starting_epc = build_epc()
    candidates = [
        fuel_overlay_for("electricity", 0),
        water_heating_overlay_for("Electric immersion, electricity", 0),
        main_heating_overlay_for("Electric storage heaters, fan", 0),
    ]
    assert all(candidate is not None for candidate in candidates)
    # Filtering again narrows the element type for the applicator call.
    corrections = [candidate for candidate in candidates if candidate is not None]

    # Act
    updated_epc = apply_simulations(starting_epc, corrections)

    # Assert — each override landed on its own field.
    sap_heating = updated_epc.sap_heating
    primary = sap_heating.main_heating_details[0]
    assert primary.main_fuel_type == 29
    assert primary.sap_main_heating_code == 404
    assert sap_heating.water_heating_code == 903
    assert sap_heating.water_heating_fuel == 29
@pytest.mark.parametrize(
    "member",
    [
        system
        for system in MainHeatingSystemType
        if system is not MainHeatingSystemType.UNKNOWN
    ],
)
def test_every_resolvable_main_heating_value_decodes(
    member: MainHeatingSystemType,
) -> None:
    """Every non-UNKNOWN MainHeatingSystemType value must produce an overlay."""
    # Act
    produced = main_heating_overlay_for(member.value, 0)

    # Assert
    assert produced is not None

View file

@ -12,12 +12,12 @@ from typing import Optional
import pytest
from domain.epc.built_form_type import BuiltFormType
from domain.epc.override_code_mapping import (
from domain.epc.property_overrides.built_form_type import BuiltFormType
from domain.epc.property_overrides.override_code_mapping import (
built_form_to_code,
property_type_to_code,
)
from domain.epc.property_type import PropertyType
from domain.epc.property_overrides.property_type import PropertyType
def test_house_maps_to_gov_code_zero() -> None:

View file

@ -0,0 +1,111 @@
"""The Landlord-Override water-heating → heating Simulation Overlay mapping.
A water-heating value resolves to the SAP `water_heating_code` (system) and
`water_heating_fuel` the calculator reads; the overlay is whole-dwelling.
"""
from __future__ import annotations
import pytest
from domain.epc.property_overlays.water_heating_overlay import (
water_heating_overlay_for,
)
from domain.epc.property_overrides.water_heating_type import WaterHeatingType
from domain.modelling.scoring.overlay_applicator import apply_simulations
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
build_epc,
)
def test_from_main_system_mains_gas_overlays_water_heating() -> None:
    """"From main system, mains gas" decodes to code 901 with fuel 26."""
    # Act
    produced = water_heating_overlay_for("From main system, mains gas", 0)

    # Assert — "from main system" is water_heating_code 901, mains gas fuel 26.
    assert produced is not None
    heating = produced.heating
    assert heating is not None
    assert heating.water_heating_code == 901
    assert heating.water_heating_fuel == 26
@pytest.mark.parametrize(
    ("water_heating_value", "code", "fuel"),
    [
        ("From main system, electricity", 901, 29),
        ("Electric immersion, electricity", 903, 29),
    ],
)
def test_water_heating_systems_decode_to_their_codes(
    water_heating_value: str, code: int, fuel: int
) -> None:
    """Each listed arrangement decodes to its system code and fuel code."""
    # Act
    produced = water_heating_overlay_for(water_heating_value, 0)

    # Assert
    assert produced is not None
    heating = produced.heating
    assert heating is not None
    assert heating.water_heating_code == code
    assert heating.water_heating_fuel == fuel
@pytest.mark.parametrize(
    ("water_heating_value", "code", "fuel"),
    [
        ("From main system, oil", 901, 28),
        ("From main system, LPG (bulk)", 901, 27),
        ("From main system, bottled LPG", 901, 3),
        ("From main system, house coal", 901, 33),
        # "boiler/circulator for water heating only" is SAP Table 4a code 911.
        ("Gas boiler/circulator, mains gas", 911, 26),
    ],
)
def test_more_water_heating_combos_decode_to_their_codes(
    water_heating_value: str, code: int, fuel: int
) -> None:
    """Further system/fuel combinations decode to their expected code pair."""
    # Act
    produced = water_heating_overlay_for(water_heating_value, 0)

    # Assert
    assert produced is not None
    heating = produced.heating
    assert heating is not None
    assert heating.water_heating_code == code
    assert heating.water_heating_fuel == fuel
@pytest.mark.parametrize("water_heating_value", ["Unknown", ""])
def test_unresolvable_water_heating_produces_no_overlay(
    water_heating_value: str,
) -> None:
    """"Unknown" and the empty string yield no water-heating overlay."""
    # Act
    produced = water_heating_overlay_for(water_heating_value, 0)

    # Assert
    assert produced is None
def test_water_heating_override_remaps_the_hot_water_arrangement() -> None:
    """Applying an electric-immersion override sets code 903 and fuel 29."""
    # Arrange — landlord correction: HW is a separate electric immersion.
    starting_epc = build_epc()
    correction = water_heating_overlay_for("Electric immersion, electricity", 0)
    assert correction is not None

    # Act
    updated_epc = apply_simulations(starting_epc, [correction])

    # Assert — the calculator reads these off sap_heating.
    sap_heating = updated_epc.sap_heating
    assert sap_heating.water_heating_code == 903
    assert sap_heating.water_heating_fuel == 29
@pytest.mark.parametrize(
    "member",
    [kind for kind in WaterHeatingType if kind is not WaterHeatingType.UNKNOWN],
)
def test_every_resolvable_water_heating_value_decodes(
    member: WaterHeatingType,
) -> None:
    """Every non-UNKNOWN WaterHeatingType value must produce an overlay."""
    # Act
    produced = water_heating_overlay_for(member.value, 0)

    # Assert
    assert produced is not None

View file

@ -5,8 +5,8 @@ from typing import Optional
import pytest
from domain.data_transformation.column_classifier import ClassificationError
from domain.epc.property_type import PropertyType
from domain.epc.wall_type import WallType
from domain.epc.property_overrides.property_type import PropertyType
from domain.epc.property_overrides.wall_type import WallType
from infrastructure.chatgpt.chatgpt import ChatGPT
from infrastructure.chatgpt.chatgpt_column_classifier import (
ChatGptColumnClassifier,

View file

@ -4,9 +4,9 @@ from enum import Enum
from typing import Any, Optional
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
from domain.epc.built_form_type import BuiltFormType
from domain.epc.property_type import PropertyType
from domain.epc.wall_type import WallType
from domain.epc.property_overrides.built_form_type import BuiltFormType
from domain.epc.property_overrides.property_type import PropertyType
from domain.epc.property_overrides.wall_type import WallType
from domain.postcode import Postcode
from domain.data_transformation.column_classifier import ColumnClassifier
from orchestration.classifiable_column import ClassifiableColumn

View file

@ -25,8 +25,8 @@ import pytest
from sqlalchemy import Engine, Table
from sqlmodel import Session, SQLModel, select
from domain.epc.property_type import PropertyType
from domain.epc.wall_type import WallType
from domain.epc.property_overrides.property_type import PropertyType
from domain.epc.property_overrides.wall_type import WallType
from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import (
LandlordOverridesRepository,
)

View file

@ -47,6 +47,88 @@ def test_each_resolvable_component_produces_an_overlay() -> None:
assert len(overlays) == 4
def test_main_fuel_row_produces_a_heating_fuel_overlay() -> None:
    """A single main_fuel row becomes one overlay carrying main_fuel_type 26."""
    # Arrange
    fuel_row = ResolvedPropertyOverride("main_fuel", 0, "mains gas")
    overrides = ResolvedPropertyOverrides(rows=(fuel_row,))

    # Act
    produced = overlays_from(overrides)

    # Assert
    assert len(produced) == 1
    heating = produced[0].heating
    assert heating is not None
    assert heating.main_fuel_type == 26
def test_glazing_row_produces_a_glazing_overlay() -> None:
    """A single glazing row becomes one overlay carrying glazing_type 2."""
    # Arrange
    glazing_row = ResolvedPropertyOverride(
        "glazing", 0, "Double glazing, 2002 or later"
    )
    overrides = ResolvedPropertyOverrides(rows=(glazing_row,))

    # Act
    produced = overlays_from(overrides)

    # Assert
    assert len(produced) == 1
    glazing = produced[0].glazing
    assert glazing is not None
    assert glazing.glazing_type == 2
def test_construction_age_band_row_produces_a_building_part_overlay() -> None:
    """A single age-band row becomes one overlay on the MAIN building part."""
    # Arrange
    band_row = ResolvedPropertyOverride("construction_age_band", 0, "B")
    overrides = ResolvedPropertyOverrides(rows=(band_row,))

    # Act
    produced = overlays_from(overrides)

    # Assert
    assert len(produced) == 1
    main_overlay = produced[0].building_parts[BuildingPartIdentifier.MAIN]
    assert main_overlay.construction_age_band == "B"
def test_water_heating_row_produces_a_heating_overlay() -> None:
    """A single water_heating row becomes one overlay with code 901 / fuel 26."""
    # Arrange
    water_row = ResolvedPropertyOverride(
        "water_heating", 0, "From main system, mains gas"
    )
    overrides = ResolvedPropertyOverrides(rows=(water_row,))

    # Act
    produced = overlays_from(overrides)

    # Assert
    assert len(produced) == 1
    heating = produced[0].heating
    assert heating is not None
    assert heating.water_heating_code == 901
    assert heating.water_heating_fuel == 26
def test_main_heating_system_row_produces_a_heating_overlay() -> None:
    """A single main_heating_system row becomes one overlay carrying code 104."""
    # Arrange
    heating_row = ResolvedPropertyOverride(
        "main_heating_system", 0, "Gas boiler, combi"
    )
    overrides = ResolvedPropertyOverrides(rows=(heating_row,))

    # Act
    produced = overlays_from(overrides)

    # Assert
    assert len(produced) == 1
    heating = produced[0].heating
    assert heating is not None
    assert heating.sap_main_heating_code == 104
def test_unresolvable_rows_are_skipped() -> None:
# Arrange — an "Unknown" property type and an unmapped wall material.
overrides = ResolvedPropertyOverrides(

View file

@ -0,0 +1,35 @@
"""Every override component must be wired through the WHOLE chain.
The finaliser reader (`_ROW_TYPES`, component -> landlord table) and the overlay
registry (`_COMPONENT_OVERLAYS`, component -> overlay mapper) must cover exactly
the same set of components. If a component is classified + stored but has no
reader entry, the finaliser silently never writes its `property_overrides` rows;
if it has no overlay entry, the row never reaches the calculator. This guard
keeps the two registries in lock-step (it would have caught the missing
main_fuel / glazing / construction_age_band reader entries).
"""
from __future__ import annotations
from typing import cast
from infrastructure.landlord_overrides.landlord_override_reader_postgres_repository import (
_ROW_TYPES, # pyright: ignore[reportPrivateUsage]
)
from infrastructure.postgres.property_override_table import override_component_sa_enum
from repositories.property.landlord_override_overlays import (
_COMPONENT_OVERLAYS, # pyright: ignore[reportPrivateUsage]
)
def test_reader_and_overlay_registries_cover_the_same_components() -> None:
    """The reader registry and the overlay registry share one component set."""
    # Arrange
    reader_components = set(_ROW_TYPES)
    overlay_components = set(_COMPONENT_OVERLAYS)

    # Assert
    assert reader_components == overlay_components
def test_override_component_pgenum_covers_every_component() -> None:
    """The ``override_component`` pgEnum mirror must list every overlay component.

    If a component is missing from the mirror, writing/reading a new-component
    row through it throws a LookupError against Postgres (caught live on the
    Hyde portfolio-796 run).
    """
    # Arrange — `enums` is read via getattr + cast, presumably to keep the type
    # checker satisfied about the attribute's type.
    declared_values = cast(list[str], getattr(override_component_sa_enum, "enums"))
    pgenum_components = set(declared_values)
    overlay_components = set(_COMPONENT_OVERLAYS)

    # Assert
    assert pgenum_components == overlay_components

View file

@ -0,0 +1,95 @@
"""End-to-end smoke of the Hyde override script for ONE property, against a real
(ephemeral) Postgres. Seeds the landlord vocab (simulating post-classify, so no
ChatGPT) + a minimal ``property`` row, then runs the script's real
``write`` + ``verify`` paths and asserts property_overrides + overlays land.
"""
from __future__ import annotations
import argparse
from typing import Any
from sqlalchemy import Engine, text
from sqlmodel import Session
import scripts.hyde.build_property_overrides as b
from domain.epc.property_overrides.built_form_type import BuiltFormType
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
from domain.epc.property_overrides.glazing_type import GlazingType
from domain.epc.property_overrides.main_fuel_type import MainFuelType
from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType
from domain.epc.property_overrides.property_type import PropertyType
from domain.epc.property_overrides.roof_type import RoofType
from domain.epc.property_overrides.wall_type import WallType
from domain.epc.property_overrides.water_heating_type import WaterHeatingType
from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import (
LandlordOverridesRepository,
)
from repositories.property.landlord_override_overlays import overlays_from
from repositories.property.property_overrides_postgres_reader import (
PropertyOverridesPostgresReader,
)
# Portfolio id and landlord property reference of the single property under test.
PORTFOLIO = 795
ORG_REF = "55180004001"
# Workbook path passed to the script's ``write`` path as ``excel``.
EXCEL = "scripts/hyde/hyde_property_overrides.xlsx"

# component -> (raw Excel entry, enum member): what ChatGPT WOULD resolve this
# property's 9 descriptions to. Seeded into the landlord ledger by the test.
SEED: dict[str, tuple[str, Any]] = {
    "property_type": (
        "House: MidTerrace",
        PropertyType.HOUSE,
    ),
    "built_form_type": (
        "House: MidTerrace",
        BuiltFormType.MID_TERRACE,
    ),
    "wall_type": (
        "TimberFrame: AsBuilt",
        WallType.TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED,
    ),
    "roof_type": (
        "PitchedNormalLoftAccess: 300mm",
        RoofType.PITCHED_LOFT_300MM,
    ),
    "construction_age_band": (
        "L: 2012-2022",
        ConstructionAgeBand.L_2012_2022,
    ),
    "main_fuel": (
        "Gas: Mains Gas",
        MainFuelType.MAINS_GAS,
    ),
    "glazing": (
        "100% Double glazing 2002 or later",
        GlazingType.DOUBLE_POST_2002,
    ),
    "water_heating": (
        "From main heating system: Mains Gas",
        WaterHeatingType.FROM_MAIN_MAINS_GAS,
    ),
    "main_heating_system": (
        "Boiler: C rated Combi",
        MainHeatingSystemType.GAS_COMBI,
    ),
}
def test_one_property_end_to_end(db_engine: Engine, monkeypatch: Any) -> None:
    """Run the script's real ``write`` and ``verify`` for one seeded property.

    Creates a minimal ``property`` table and seeds the classifier ledger on
    ``db_engine``, redirects the script's ``_db_session`` to that engine, then
    checks that nine ``property_overrides`` rows are written and that they
    come back as nine overlays.
    """
    component_specs = b._specs_by_component()  # pyright: ignore[reportPrivateUsage]

    def open_session() -> Session:
        # Single session factory bound to the test engine.
        return Session(db_engine)

    # Arrange — minimal FE-owned `property` table + the one row we'll match by org_ref.
    create_property_table = text(
        "CREATE TABLE property (id bigint PRIMARY KEY, portfolio_id bigint, "
        "landlord_property_id text)"
    )
    insert_property = text("INSERT INTO property VALUES (1, :p, :ref)")
    with open_session() as session:
        session.execute(create_property_table)  # pyright: ignore[reportDeprecated]
        session.execute(  # pyright: ignore[reportDeprecated]
            insert_property, {"p": PORTFOLIO, "ref": ORG_REF}
        )
        # Seed the classifier ledger (keyed on normalised description).
        for component, (raw_entry, enum_member) in SEED.items():
            ledger: LandlordOverridesRepository[Any] = LandlordOverridesRepository(
                session, component_specs[component].row_type
            )
            ledger.upsert_all(
                PORTFOLIO,
                {b._norm(raw_entry): enum_member},  # pyright: ignore[reportPrivateUsage]
            )
        session.commit()

    # Point the script at the ephemeral engine.
    monkeypatch.setattr(b, "_db_session", open_session)

    # Act — run the real write() for this one property.
    write_args = argparse.Namespace(
        excel=EXCEL,
        sheet="AddressProfilingResults",
        portfolio_id=PORTFOLIO,
        org_ref=ORG_REF,
        limit=None,
        apply=True,
    )
    b.write(write_args)

    select_overrides = text(
        "SELECT override_component, building_part, override_value "
        "FROM property_overrides WHERE property_id = 1 ORDER BY override_component"
    )
    with open_session() as session:
        rows = list(session.execute(select_overrides))  # pyright: ignore[reportDeprecated]
    value_by_component = {name: value for name, _part, value in rows}

    # Assert — the checked components each have a property_overrides row with
    # the resolved value.
    expected_values = {
        "main_fuel": "mains gas",
        "glazing": "Double glazing, 2002 or later",
        "construction_age_band": "L",
        "main_heating_system": "Gas boiler, combi",
        "water_heating": "From main system, mains gas",
    }
    for checked_component, expected in expected_values.items():
        assert value_by_component[checked_component] == expected
    assert len(rows) == 9  # all 9 components

    # The overrides reach the SAP overlay surface.
    verify_args = argparse.Namespace(portfolio_id=PORTFOLIO, org_ref=ORG_REF)
    b.verify(verify_args)  # exercises verify()
    stored_overrides = PropertyOverridesPostgresReader(open_session).overrides_for(1)
    overlays = overlays_from(stored_overrides)
    assert len(overlays) == 9
    assert any(
        overlay.heating is not None and overlay.heating.main_fuel_type == 26
        for overlay in overlays
    )
    assert any(
        overlay.glazing is not None and overlay.glazing.glazing_type == 2
        for overlay in overlays
    )