diff --git a/.gitignore b/.gitignore index 9c77b311..0bf9e0e9 100644 --- a/.gitignore +++ b/.gitignore @@ -313,3 +313,8 @@ scripts/eon/epc_cache.pkl scripts/hyde/.elmhurst-session/ scripts/hyde/elmhurst_downloads/ scripts/hyde/.elmhurst-creds.json + +# Hyde property-overrides script artifacts +overrides_cache.json +overrides_unknowns.csv +overrides_edits.csv diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index 5e689a45..4020bc51 100644 --- a/applications/landlord_description_overrides/handler.py +++ b/applications/landlord_description_overrides/handler.py @@ -7,11 +7,16 @@ import boto3 from applications.landlord_description_overrides.landlord_description_overrides_trigger_body import ( LandlordDescriptionOverridesTriggerBody, ) -from domain.epc.built_form_type import BuiltFormType -from domain.epc.property_type import PropertyType -from domain.epc.roof_type import RoofType -from domain.epc.wall_type import WallType -from domain.epc.wall_type_construction_dates import ( +from domain.epc.property_overrides.built_form_type import BuiltFormType +from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand +from domain.epc.property_overrides.glazing_type import GlazingType +from domain.epc.property_overrides.main_fuel_type import MainFuelType +from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType +from domain.epc.property_overrides.property_type import PropertyType +from domain.epc.property_overrides.roof_type import RoofType +from domain.epc.property_overrides.water_heating_type import WaterHeatingType +from domain.epc.property_overrides.wall_type import WallType +from domain.epc.property_overrides.wall_type_construction_dates import ( wall_type_construction_date_prompt_hint, ) from infrastructure.chatgpt.chatgpt import ChatGPT @@ -24,6 +29,21 @@ from infrastructure.postgres.engine import commit_scope, make_engine, make_sessi from 
infrastructure.postgres.landlord_built_form_type_override_table import ( LandlordBuiltFormTypeOverrideRow, ) +from infrastructure.postgres.landlord_construction_age_band_override_table import ( + LandlordConstructionAgeBandOverrideRow, +) +from infrastructure.postgres.landlord_glazing_override_table import ( + LandlordGlazingOverrideRow, +) +from infrastructure.postgres.landlord_main_fuel_override_table import ( + LandlordMainFuelOverrideRow, +) +from infrastructure.postgres.landlord_main_heating_system_override_table import ( + LandlordMainHeatingSystemOverrideRow, +) +from infrastructure.postgres.landlord_water_heating_override_table import ( + LandlordWaterHeatingOverrideRow, +) from infrastructure.postgres.landlord_property_type_override_table import ( LandlordPropertyTypeOverrideRow, ) @@ -102,6 +122,56 @@ def _build_columns( session, LandlordRoofTypeOverrideRow ), ), + "main_fuel": lambda src: ClassifiableColumn( + name="main_fuel", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, MainFuelType, MainFuelType.UNKNOWN + ), + repo=LandlordOverridesRepository[MainFuelType]( + session, LandlordMainFuelOverrideRow + ), + ), + "glazing": lambda src: ClassifiableColumn( + name="glazing", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, GlazingType, GlazingType.UNKNOWN + ), + repo=LandlordOverridesRepository[GlazingType]( + session, LandlordGlazingOverrideRow + ), + ), + "construction_age_band": lambda src: ClassifiableColumn( + name="construction_age_band", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, ConstructionAgeBand, ConstructionAgeBand.UNKNOWN + ), + repo=LandlordOverridesRepository[ConstructionAgeBand]( + session, LandlordConstructionAgeBandOverrideRow + ), + ), + "water_heating": lambda src: ClassifiableColumn( + name="water_heating", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, WaterHeatingType, WaterHeatingType.UNKNOWN + ), + 
repo=LandlordOverridesRepository[WaterHeatingType]( + session, LandlordWaterHeatingOverrideRow + ), + ), + "main_heating_system": lambda src: ClassifiableColumn( + name="main_heating_system", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, MainHeatingSystemType, MainHeatingSystemType.UNKNOWN + ), + repo=LandlordOverridesRepository[MainHeatingSystemType]( + session, LandlordMainHeatingSystemOverrideRow + ), + ), } columns: list[ClassifiableColumn[Any]] = [] diff --git a/deployment/terraform/lambda/hubspot_deal_etl/variables.tf b/deployment/terraform/lambda/hubspot_deal_etl/variables.tf index 84f0e567..189822f0 100644 --- a/deployment/terraform/lambda/hubspot_deal_etl/variables.tf +++ b/deployment/terraform/lambda/hubspot_deal_etl/variables.tf @@ -19,7 +19,7 @@ variable "image_digest" { variable "maximum_concurrency" { type = number - default = 2 + default = 20 description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit." } diff --git a/domain/epc/property_overlays/construction_age_band_overlay.py b/domain/epc/property_overlays/construction_age_band_overlay.py new file mode 100644 index 00000000..55054168 --- /dev/null +++ b/domain/epc/property_overlays/construction_age_band_overlay.py @@ -0,0 +1,39 @@ +"""Map a Landlord-Override construction-age-band value to a fabric Simulation +Overlay. + +A construction-age-band value is the RdSAP England-&-Wales letter code (A..M) +the calculator's U-value cascades key on (`SapBuildingPart.construction_age_band`, +read via `.strip().upper()` against the letter-code bands). The overlay targets +the override's building part and sets the band; an unrecognised code produces no +overlay. Re-dating a part re-derives its construction-default U-values, so this +is the highest-leverage fabric override. 
+""" + +from __future__ import annotations + +from typing import Optional + +from datatypes.epc.domain.epc_property_data import BuildingPartIdentifier +from domain.modelling.simulation import BuildingPartOverlay, EpcSimulation + +# RdSAP England-&-Wales construction age bands (letter codes A..M). +_VALID_AGE_BANDS: frozenset[str] = frozenset("ABCDEFGHIJKLM") + + +def age_band_overlay_for( + age_band_value: str, building_part: int +) -> Optional[EpcSimulation]: + band = age_band_value.strip().upper() + if band not in _VALID_AGE_BANDS: + return None + + identifier = ( + BuildingPartIdentifier.MAIN + if building_part == 0 + else BuildingPartIdentifier.extension(building_part) + ) + return EpcSimulation( + building_parts={ + identifier: BuildingPartOverlay(construction_age_band=band) + } + ) diff --git a/domain/epc/property_overlays/glazing_overlay.py b/domain/epc/property_overlays/glazing_overlay.py new file mode 100644 index 00000000..d692181d --- /dev/null +++ b/domain/epc/property_overlays/glazing_overlay.py @@ -0,0 +1,36 @@ +"""Map a Landlord-Override glazing value to a glazing Simulation Overlay. + +A glazing value is one canonical glazing description carrying type + era +("Double glazing, 2002 or later", "Single glazing", "Triple glazing, 2002 or +later"). The calculator derives each window's U-value from its SAP10 +`glazing_type` code via the RdSAP Table 24 cascade, so the overlay decomposes +the value into that code and emits a whole-dwelling `GlazingOverlay` (a landlord +describes the dwelling's glazing as a whole, with no per-window geometry, so +`building_part` is ignored). `_fold_glazing` expands it across every window. +Unresolvable values produce no overlay. +""" + +from __future__ import annotations + +from typing import Optional + +from domain.modelling.simulation import EpcSimulation, GlazingOverlay + +# Canonical glazing description → SAP10 glazing-type code (the Table 24 / +# `u_window` cascade enum, `_GLAZING_CODE_TO_UWINDOW` in heat_transmission). 
+_GLAZING_CODES: dict[str, int] = { + "Single glazing": 1, + "Double glazing, 2002 or later": 2, + "Double glazing, pre-2002": 3, + "Triple glazing, pre-2002": 6, + "Triple glazing, 2002 or later": 9, +} + + +def glazing_overlay_for( + glazing_value: str, building_part: int +) -> Optional[EpcSimulation]: + code = _GLAZING_CODES.get(glazing_value) + if code is None: + return None + return EpcSimulation(glazing=GlazingOverlay(glazing_type=code)) diff --git a/domain/epc/property_overlays/main_fuel_overlay.py b/domain/epc/property_overlays/main_fuel_overlay.py new file mode 100644 index 00000000..cc482122 --- /dev/null +++ b/domain/epc/property_overlays/main_fuel_overlay.py @@ -0,0 +1,41 @@ +"""Map a Landlord-Override main-fuel value to a heating Simulation Overlay. + +A main-fuel value is one canonical gov-EPC `main_fuel` description ("mains gas", +"electricity", …). The calculator reads the dwelling's primary fuel from +`main_heating_details[0].main_fuel_type` as the RdSAP **int code**, so the +overlay decomposes the value into that code and emits a whole-dwelling +`HeatingOverlay` (fuel is not a per-building-part attribute, so `building_part` +is ignored). Codes follow the modern RdSAP-20/21 `(not community)` family the +gov-EPC API baseline uses. Unresolvable values produce no overlay. +""" + +from __future__ import annotations + +from typing import Optional + +from domain.modelling.simulation import EpcSimulation, HeatingOverlay + +# RdSAP-20/21 `main_fuel` `(not community)` codes (epc_codes.csv `main_fuel`). 
+_FUEL_CODES: dict[str, int] = { + "mains gas": 26, + "mains gas (community)": 20, + "LPG (bulk)": 27, + "bottled LPG": 3, + "LPG special condition": 17, + "oil": 28, + "electricity": 29, + "electricity (community)": 25, + "house coal": 33, + "smokeless coal": 15, + "dual fuel (mineral and wood)": 10, + "biomass (community)": 31, +} + + +def fuel_overlay_for( + main_fuel_value: str, building_part: int +) -> Optional[EpcSimulation]: + code = _FUEL_CODES.get(main_fuel_value) + if code is None: + return None + return EpcSimulation(heating=HeatingOverlay(main_fuel_type=code)) diff --git a/domain/epc/property_overlays/main_heating_system_overlay.py b/domain/epc/property_overlays/main_heating_system_overlay.py new file mode 100644 index 00000000..1f06eabc --- /dev/null +++ b/domain/epc/property_overlays/main_heating_system_overlay.py @@ -0,0 +1,46 @@ +"""Map a Landlord-Override main-heating-system value to a heating Simulation Overlay. + +A main-heating-system value is one canonical system archetype ("Gas boiler, +combi", "Electric storage heaters, fan"). The calculator reads the primary +system's `sap_main_heating_code` (SAP Table 4a/4b), so the overlay maps the +archetype to a representative code and emits a whole-dwelling `HeatingOverlay` +targeting `main_heating_details[0]` (`building_part` is ignored). It composes +field-wise with the main_fuel / water_heating overlays. + +The SEDBUK A-G efficiency band the Hyde "Heating" column carries is NOT honoured +yet (no efficiency slot on the overlay/MainHeatingDetail) -- archetypes map to +their modern/condensing Table 4b code, so an old low-rated boiler is currently +modelled at the condensing efficiency. Heat pumps and community heating (which +resolve via main_heating_index_number / community codes, not a Table 4b code) +are left UNKNOWN until modelled. Unresolvable values produce no overlay. 
+""" + +from __future__ import annotations + +from typing import Optional + +from domain.modelling.simulation import EpcSimulation, HeatingOverlay + +# Canonical system archetype → representative `sap_main_heating_code` (SAP Table +# 4b boiler rows / Table 4a). Codes map to the modern/condensing variant (A-G +# efficiency deferred): 102 regular condensing, 104 condensing combi, 120 CPSU, +# 404 fan storage heaters, 191 direct-acting electric boiler. +_MAIN_HEATING_CODES: dict[str, int] = { + "Gas boiler, combi": 104, + "Gas boiler, regular": 102, + "Gas CPSU": 120, + "Electric storage heaters, old": 401, + "Electric storage heaters, slimline": 402, + "Electric storage heaters, convector": 403, + "Electric storage heaters, fan": 404, + "Direct-acting electric": 191, +} + + +def main_heating_overlay_for( + main_heating_value: str, building_part: int +) -> Optional[EpcSimulation]: + code = _MAIN_HEATING_CODES.get(main_heating_value) + if code is None: + return None + return EpcSimulation(heating=HeatingOverlay(sap_main_heating_code=code)) diff --git a/domain/epc/property_overlays/water_heating_overlay.py b/domain/epc/property_overlays/water_heating_overlay.py new file mode 100644 index 00000000..3ec952de --- /dev/null +++ b/domain/epc/property_overlays/water_heating_overlay.py @@ -0,0 +1,47 @@ +"""Map a Landlord-Override water-heating value to a heating Simulation Overlay. + +A water-heating value is one canonical "{system}, {fuel}" description ("From main +system, mains gas", "Electric immersion, electricity"). The calculator reads the +hot-water arrangement from `sap_heating.water_heating_code` (the SAP Table 4a +system code) and `water_heating_fuel`, so the overlay decomposes the value into +those two int codes and emits a whole-dwelling `HeatingOverlay` (water heating is +not per-building-part, so `building_part` is ignored). It composes field-wise with +the main_fuel / main_heating overlays. Unresolvable values produce no overlay.
+""" + +from __future__ import annotations + +from typing import Optional + +from domain.modelling.simulation import EpcSimulation, HeatingOverlay + +# Canonical "{system}, {fuel}" description → (water_heating_code, water_heating_fuel). +# water_heating_code: 901 "from main system" (SAP Table 4a inherit-from-main), +# 903 "electric immersion". Fuel codes are the modern RdSAP "(not community)" +# family (26 mains gas, 29 electricity), matching the main_fuel overlay. +_WATER_HEATING_CODES: dict[str, tuple[int, int]] = { + "From main system, mains gas": (901, 26), + "From main system, electricity": (901, 29), + "From main system, oil": (901, 28), + "From main system, LPG (bulk)": (901, 27), + "From main system, bottled LPG": (901, 3), + "From main system, house coal": (901, 33), + "Electric immersion, electricity": (903, 29), + # "boiler/circulator for water heating only" — SAP Table 4a code 911 (gas). + "Gas boiler/circulator, mains gas": (911, 26), +} + + +def water_heating_overlay_for( + water_heating_value: str, building_part: int +) -> Optional[EpcSimulation]: + codes = _WATER_HEATING_CODES.get(water_heating_value) + if codes is None: + return None + water_heating_code, water_heating_fuel = codes + return EpcSimulation( + heating=HeatingOverlay( + water_heating_code=water_heating_code, + water_heating_fuel=water_heating_fuel, + ) + ) diff --git a/domain/epc/property_overrides/__init__.py b/domain/epc/property_overrides/__init__.py new file mode 100644 index 00000000..e11710a4 --- /dev/null +++ b/domain/epc/property_overrides/__init__.py @@ -0,0 +1,4 @@ +"""Landlord property-override classifier vocabulary — the category enums a +landlord description resolves into, plus their value→code helpers. The classifier +target for the property_overrides chain (mirrors the property_overrides table / +override_component pgEnum).
Distinct from the EPC-context types of the same name.""" diff --git a/domain/epc/built_form_type.py b/domain/epc/property_overrides/built_form_type.py similarity index 100% rename from domain/epc/built_form_type.py rename to domain/epc/property_overrides/built_form_type.py diff --git a/domain/epc/property_overrides/construction_age_band.py b/domain/epc/property_overrides/construction_age_band.py new file mode 100644 index 00000000..83f6e9a8 --- /dev/null +++ b/domain/epc/property_overrides/construction_age_band.py @@ -0,0 +1,31 @@ +from enum import Enum + + +class ConstructionAgeBand(Enum): + """A landlord-supplied construction age band, as resolved by the + landlord-description-overrides context. + + Each member's value is the RdSAP England-&-Wales age-band **letter code** + (A..M) the calculator's U-value cascades read from + `SapBuildingPart.construction_age_band` — the same representation the gov-EPC + API lodges. The construction-age-band Simulation Overlay + (``domain/epc/property_overlays/construction_age_band_overlay.py``) sets the + letter directly, so these values MUST stay the bare letter codes. Member + names carry the year ranges for readability. ``UNKNOWN`` covers values the + classifier cannot resolve (it leaves the lodged cert's age band untouched). + """ + + A_BEFORE_1900 = "A" + B_1900_1929 = "B" + C_1930_1949 = "C" + D_1950_1966 = "D" + E_1967_1975 = "E" + F_1976_1982 = "F" + G_1983_1990 = "G" + H_1991_1995 = "H" + I_1996_2002 = "I" + J_2003_2006 = "J" + K_2007_2011 = "K" + L_2012_2022 = "L" + M_2023_ONWARDS = "M" + UNKNOWN = "Unknown" diff --git a/domain/epc/property_overrides/glazing_type.py b/domain/epc/property_overrides/glazing_type.py new file mode 100644 index 00000000..ee3ad267 --- /dev/null +++ b/domain/epc/property_overrides/glazing_type.py @@ -0,0 +1,24 @@ +from enum import Enum + + +class GlazingType(Enum): + """A landlord-supplied glazing description, as resolved by the + landlord-description-overrides context. 
+ + Each member's value is the canonical glazing description (type + era) that + the glazing Simulation Overlay + (``domain/epc/property_overlays/glazing_overlay.py``) decomposes into the + SAP10 ``glazing_type`` code the calculator's Table-24 cascade reads — so the + member values here MUST stay in lock-step with that overlay's + ``_GLAZING_CODES`` keys. The era matters: double-glazing pre-2002 and + 2002-onward resolve to different codes (and U-values). ``UNKNOWN`` covers + values the classifier cannot resolve, and any glazing not yet given a + verified overlay code (it leaves the lodged cert's glazing untouched). + """ + + SINGLE = "Single glazing" + DOUBLE_POST_2002 = "Double glazing, 2002 or later" + DOUBLE_PRE_2002 = "Double glazing, pre-2002" + TRIPLE_PRE_2002 = "Triple glazing, pre-2002" + TRIPLE_POST_2002 = "Triple glazing, 2002 or later" + UNKNOWN = "Unknown" diff --git a/domain/epc/property_overrides/main_fuel_type.py b/domain/epc/property_overrides/main_fuel_type.py new file mode 100644 index 00000000..8427b9cf --- /dev/null +++ b/domain/epc/property_overrides/main_fuel_type.py @@ -0,0 +1,29 @@ +from enum import Enum + + +class MainFuelType(Enum): + """A landlord-supplied main-fuel description, as resolved by the + landlord-description-overrides context. + + Each member's value is the canonical fuel description that the main-fuel + Simulation Overlay (``domain/epc/property_overlays/main_fuel_overlay.py``) + decomposes into the RdSAP ``main_fuel`` int code the calculator reads — so + the member values here MUST stay in lock-step with that overlay's + ``_FUEL_CODES`` keys. ``UNKNOWN`` covers values the classifier cannot + resolve, and also any fuel not yet given a verified overlay code (it leaves + the lodged cert's fuel untouched rather than guessing). 
+ """ + + MAINS_GAS = "mains gas" + MAINS_GAS_COMMUNITY = "mains gas (community)" + ELECTRICITY = "electricity" + ELECTRICITY_COMMUNITY = "electricity (community)" + LPG_BULK = "LPG (bulk)" + LPG_BOTTLED = "bottled LPG" + LPG_SPECIAL_CONDITION = "LPG special condition" + OIL = "oil" + HOUSE_COAL = "house coal" + SMOKELESS_COAL = "smokeless coal" + DUAL_FUEL_MINERAL_WOOD = "dual fuel (mineral and wood)" + BIOMASS_COMMUNITY = "biomass (community)" + UNKNOWN = "Unknown" diff --git a/domain/epc/property_overrides/main_heating_system_type.py b/domain/epc/property_overrides/main_heating_system_type.py new file mode 100644 index 00000000..bea14e6a --- /dev/null +++ b/domain/epc/property_overrides/main_heating_system_type.py @@ -0,0 +1,27 @@ +from enum import Enum + + +class MainHeatingSystemType(Enum): + """A landlord-supplied main-heating-system description, as resolved by the + landlord-description-overrides context. + + Each member's value is the canonical system archetype that the main-heating + Simulation Overlay + (``domain/epc/property_overlays/main_heating_system_overlay.py``) maps to a + representative SAP ``sap_main_heating_code`` — so the member values MUST stay + in lock-step with that overlay's ``_MAIN_HEATING_CODES`` keys. The SEDBUK A-G + efficiency band the Hyde "Heating" column carries is NOT modelled yet + (deferred), so archetypes map to their modern/condensing code. ``UNKNOWN`` + covers values the classifier cannot resolve and the not-yet-modelled systems + (heat pumps, community heating). 
+ """ + + GAS_COMBI = "Gas boiler, combi" + GAS_REGULAR = "Gas boiler, regular" + GAS_CPSU = "Gas CPSU" + ELECTRIC_STORAGE_OLD = "Electric storage heaters, old" + ELECTRIC_STORAGE_SLIMLINE = "Electric storage heaters, slimline" + ELECTRIC_STORAGE_CONVECTOR = "Electric storage heaters, convector" + ELECTRIC_STORAGE_FAN = "Electric storage heaters, fan" + DIRECT_ELECTRIC = "Direct-acting electric" + UNKNOWN = "Unknown" diff --git a/domain/epc/override_code_mapping.py b/domain/epc/property_overrides/override_code_mapping.py similarity index 100% rename from domain/epc/override_code_mapping.py rename to domain/epc/property_overrides/override_code_mapping.py diff --git a/domain/epc/property_type.py b/domain/epc/property_overrides/property_type.py similarity index 100% rename from domain/epc/property_type.py rename to domain/epc/property_overrides/property_type.py diff --git a/domain/epc/roof_type.py b/domain/epc/property_overrides/roof_type.py similarity index 100% rename from domain/epc/roof_type.py rename to domain/epc/property_overrides/roof_type.py diff --git a/domain/epc/wall_type.py b/domain/epc/property_overrides/wall_type.py similarity index 100% rename from domain/epc/wall_type.py rename to domain/epc/property_overrides/wall_type.py diff --git a/domain/epc/wall_type_construction_dates.py b/domain/epc/property_overrides/wall_type_construction_dates.py similarity index 97% rename from domain/epc/wall_type_construction_dates.py rename to domain/epc/property_overrides/wall_type_construction_dates.py index 0eccc44c..04deda23 100644 --- a/domain/epc/wall_type_construction_dates.py +++ b/domain/epc/property_overrides/wall_type_construction_dates.py @@ -27,7 +27,7 @@ from __future__ import annotations from dataclasses import dataclass from typing import Mapping, Optional -from domain.epc.wall_type import WallType +from domain.epc.property_overrides.wall_type import WallType @dataclass(frozen=True) diff --git a/domain/epc/property_overrides/water_heating_type.py 
b/domain/epc/property_overrides/water_heating_type.py new file mode 100644 index 00000000..e85a195d --- /dev/null +++ b/domain/epc/property_overrides/water_heating_type.py @@ -0,0 +1,26 @@ +from enum import Enum + + +class WaterHeatingType(Enum): + """A landlord-supplied water-heating description, as resolved by the + landlord-description-overrides context. + + Each member's value is the canonical "{system}, {fuel}" description that the + water-heating Simulation Overlay + (``domain/epc/property_overlays/water_heating_overlay.py``) decomposes into + the SAP ``water_heating_code`` + ``water_heating_fuel`` int codes the + calculator reads — so the member values MUST stay in lock-step with that + overlay's ``_WATER_HEATING_CODES`` keys. ``UNKNOWN`` covers values the + classifier cannot resolve, and any combination not yet given verified codes + (it leaves the lodged cert's hot-water arrangement untouched). + """ + + FROM_MAIN_MAINS_GAS = "From main system, mains gas" + FROM_MAIN_ELECTRICITY = "From main system, electricity" + FROM_MAIN_OIL = "From main system, oil" + FROM_MAIN_LPG_BULK = "From main system, LPG (bulk)" + FROM_MAIN_BOTTLED_LPG = "From main system, bottled LPG" + FROM_MAIN_HOUSE_COAL = "From main system, house coal" + ELECTRIC_IMMERSION = "Electric immersion, electricity" + GAS_BOILER_CIRCULATOR_MAINS_GAS = "Gas boiler/circulator, mains gas" + UNKNOWN = "Unknown" diff --git a/domain/modelling/scoring/overlay_applicator.py b/domain/modelling/scoring/overlay_applicator.py index 9c83724a..961f4e02 100644 --- a/domain/modelling/scoring/overlay_applicator.py +++ b/domain/modelling/scoring/overlay_applicator.py @@ -19,6 +19,7 @@ from datatypes.epc.domain.epc_property_data import ( ) from domain.modelling.simulation import ( EpcSimulation, + GlazingOverlay, HeatingOverlay, LightingOverlay, SecondaryHeatingOverlay, @@ -53,6 +54,8 @@ def apply_simulations( ) if simulation.lighting is not None: _fold_lighting(result, simulation.lighting) + if simulation.glazing is not None: +
_fold_glazing(result, simulation.glazing) if simulation.heating is not None: _fold_heating(result, simulation.heating) if simulation.secondary_heating is not None: @@ -202,6 +205,21 @@ def _fold_window(window: SapWindow, overlay: WindowOverlay) -> None: details.solar_transmittance = overlay.solar_transmittance +def _fold_glazing(epc: EpcPropertyData, overlay: GlazingOverlay) -> None: + """Expand a whole-dwelling `GlazingOverlay` across every window: set each + window's `glazing_type` to the corrected SAP10 code AND clear its lodged + transmission U, so `heat_transmission`'s Table-24 cascade re-derives U from + the new type (the lodged U was for the old, mis-recorded glazing). A landlord + glazing override carries no per-window geometry, so it applies uniformly — + the expansion lives here because the baseline window list is known only at + fold time.""" + if overlay.glazing_type is None: + return + for window in epc.sap_windows: + window.glazing_type = overlay.glazing_type + window.window_transmission_details = None + + def _fold_ventilation( baseline: Optional[SapVentilation], overlay: VentilationOverlay ) -> SapVentilation: diff --git a/domain/modelling/simulation.py b/domain/modelling/simulation.py index 624826f6..caee5fb5 100644 --- a/domain/modelling/simulation.py +++ b/domain/modelling/simulation.py @@ -28,6 +28,12 @@ class BuildingPartOverlay: + # RdSAP England-&-Wales construction age band — the letter code A..M the + # calculator's U-value cascades key on (`SapBuildingPart.construction_age_band`). + # Left `None` by Measures (retrofits don't change build era); set by a Landlord + # Override that corrects the lodged age band, which re-derives this part's + # fabric U-value defaults. Folds onto the part via the generic field loop. + construction_age_band: Optional[str] = None # The wall material (RdSAP `wall_construction` code). Left `None` by Measures # — insulating a wall doesn't change its material — but set by a Landlord # Override that corrects the construction itself (ADR-0032).
wall_construction: Optional[int] = None wall_insulation_type: Optional[int] = None # Added solid-wall insulation depth (mm) — drives the calculator's Table 6 @@ -73,6 +79,28 @@ class WindowOverlay: solar_transmittance: Optional[float] = None +@dataclass(frozen=True) +class GlazingOverlay: + """All-optional partial of the dwelling's whole-glazing state — the + correction a Landlord Override makes when the lodged glazing is wrong. + + Unlike a per-window `WindowOverlay` (keyed by `sap_windows` index), this + targets no single window: a landlord describes the dwelling's glazing as a + whole ("Double glazing, 2002 or later") with no per-window geometry, so the + overlay builder (which never sees the baseline window list) emits one of + these and `_fold_glazing` expands it across every `sap_windows` entry. + + `glazing_type` is the SAP10 glazing-type code (Table 24 / `u_window` + cascade: 1=single, 2=double 2002-2021, 3=double pre-2002, 9=triple 2002+, + …). The fold sets it on every window AND clears each window's lodged + transmission U-value, so the Table-24 cascade re-derives the corrected U + from the new type (the lodged U was for the OLD, mis-recorded glazing). + A `None` field means "leave the baseline value unchanged".
+ """ + + glazing_type: Optional[int] = None + + @dataclass(frozen=True) class LightingOverlay: """All-optional partial of the dwelling's fixed-lighting bulb counts — the @@ -220,6 +248,7 @@ class EpcSimulation: windows: Mapping[int, WindowOverlay] = field(default_factory=_no_windows) ventilation: Optional[VentilationOverlay] = None lighting: Optional[LightingOverlay] = None + glazing: Optional[GlazingOverlay] = None heating: Optional[HeatingOverlay] = None secondary_heating: Optional[SecondaryHeatingOverlay] = None solar: Optional[SolarOverlay] = None diff --git a/infrastructure/landlord_overrides/landlord_override_reader_postgres_repository.py b/infrastructure/landlord_overrides/landlord_override_reader_postgres_repository.py index 79b851f3..4d02b0d5 100644 --- a/infrastructure/landlord_overrides/landlord_override_reader_postgres_repository.py +++ b/infrastructure/landlord_overrides/landlord_override_reader_postgres_repository.py @@ -25,9 +25,24 @@ from infrastructure.postgres.landlord_property_type_override_table import ( from infrastructure.postgres.landlord_roof_type_override_table import ( LandlordRoofTypeOverrideRow, ) +from infrastructure.postgres.landlord_construction_age_band_override_table import ( + LandlordConstructionAgeBandOverrideRow, +) +from infrastructure.postgres.landlord_glazing_override_table import ( + LandlordGlazingOverrideRow, +) +from infrastructure.postgres.landlord_main_fuel_override_table import ( + LandlordMainFuelOverrideRow, +) +from infrastructure.postgres.landlord_main_heating_system_override_table import ( + LandlordMainHeatingSystemOverrideRow, +) from infrastructure.postgres.landlord_wall_type_override_table import ( LandlordWallTypeOverrideRow, ) +from infrastructure.postgres.landlord_water_heating_override_table import ( + LandlordWaterHeatingOverrideRow, +) from repositories.landlord_overrides.landlord_override_reader import ( LandlordOverrideReader, ) @@ -38,6 +53,11 @@ _ROW_TYPES: dict[str, type] = { "built_form_type": 
LandlordBuiltFormTypeOverrideRow, "wall_type": LandlordWallTypeOverrideRow, "roof_type": LandlordRoofTypeOverrideRow, + "main_fuel": LandlordMainFuelOverrideRow, + "glazing": LandlordGlazingOverrideRow, + "construction_age_band": LandlordConstructionAgeBandOverrideRow, + "water_heating": LandlordWaterHeatingOverrideRow, + "main_heating_system": LandlordMainHeatingSystemOverrideRow, } diff --git a/infrastructure/postgres/landlord_built_form_type_override_table.py b/infrastructure/postgres/landlord_built_form_type_override_table.py index ec93ba27..5f343613 100644 --- a/infrastructure/postgres/landlord_built_form_type_override_table.py +++ b/infrastructure/postgres/landlord_built_form_type_override_table.py @@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint from sqlalchemy import Enum as SAEnum from sqlmodel import Field, SQLModel -from domain.epc.built_form_type import BuiltFormType +from domain.epc.property_overrides.built_form_type import BuiltFormType from infrastructure.postgres.landlord_override_enums import override_source_sa_enum diff --git a/infrastructure/postgres/landlord_construction_age_band_override_table.py b/infrastructure/postgres/landlord_construction_age_band_override_table.py new file mode 100644 index 00000000..0c8f9119 --- /dev/null +++ b/infrastructure/postgres/landlord_construction_age_band_override_table.py @@ -0,0 +1,73 @@ +"""SQLModel mirror of the ``landlord_construction_age_band_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only +differences are the table name, the ``construction_age_band`` pgEnum on +``value``, and the unique-constraint name. 
+""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordConstructionAgeBandOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_construction_age_band_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + # NB: shortened (drop the redundant ``_overrides``) to stay within + # PostgreSQL's 63-char identifier limit -- the full + # ``landlord_construction_age_band_overrides_portfolio_description_unique`` + # is 69 chars and would be silently truncated, diverging from Drizzle. + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_construction_age_band_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python.
+ portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: ConstructionAgeBand = Field( + sa_column=Column( + SAEnum( + ConstructionAgeBand, + name="construction_age_band", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. + source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/postgres/landlord_glazing_override_table.py b/infrastructure/postgres/landlord_glazing_override_table.py new file mode 100644 index 00000000..b25dea54 --- /dev/null +++ b/infrastructure/postgres/landlord_glazing_override_table.py @@ -0,0 +1,69 @@ +"""SQLModel mirror of the ``landlord_glazing_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only +differences are the table name, the ``glazing`` pgEnum on ``value``, and the +unique-constraint name. 
+""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.epc.property_overrides.glazing_type import GlazingType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordGlazingOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_glazing_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_glazing_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: GlazingType = Field( + sa_column=Column( + SAEnum( + GlazingType, + name="glazing", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. 
+ source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/postgres/landlord_main_fuel_override_table.py b/infrastructure/postgres/landlord_main_fuel_override_table.py new file mode 100644 index 00000000..3012a2e9 --- /dev/null +++ b/infrastructure/postgres/landlord_main_fuel_override_table.py @@ -0,0 +1,69 @@ +"""SQLModel mirror of the ``landlord_main_fuel_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only +differences are the table name, the ``main_fuel`` pgEnum on ``value``, and +the unique-constraint name. 
+""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.epc.property_overrides.main_fuel_type import MainFuelType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordMainFuelOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_main_fuel_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_main_fuel_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: MainFuelType = Field( + sa_column=Column( + SAEnum( + MainFuelType, + name="main_fuel", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. 
+ source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/postgres/landlord_main_heating_system_override_table.py b/infrastructure/postgres/landlord_main_heating_system_override_table.py new file mode 100644 index 00000000..d9862089 --- /dev/null +++ b/infrastructure/postgres/landlord_main_heating_system_override_table.py @@ -0,0 +1,71 @@ +"""SQLModel mirror of the ``landlord_main_heating_system_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only +differences are the table name, the ``main_heating_system`` pgEnum on ``value``, +and the unique-constraint name. +""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordMainHeatingSystemOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_main_heating_system_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + # Shortened (drop the redundant ``_overrides``) to stay within + # PostgreSQL's 63-char identifier limit; mirrors the Drizzle name. 
+ UniqueConstraint( + "portfolio_id", + "description", + name="landlord_main_heating_system_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: MainHeatingSystemType = Field( + sa_column=Column( + SAEnum( + MainHeatingSystemType, + name="main_heating_system", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. 
+ source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/postgres/landlord_property_type_override_table.py b/infrastructure/postgres/landlord_property_type_override_table.py index ae9377cd..2718fb34 100644 --- a/infrastructure/postgres/landlord_property_type_override_table.py +++ b/infrastructure/postgres/landlord_property_type_override_table.py @@ -14,7 +14,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint from sqlalchemy import Enum as SAEnum from sqlmodel import Field, SQLModel -from domain.epc.property_type import PropertyType +from domain.epc.property_overrides.property_type import PropertyType from infrastructure.postgres.landlord_override_enums import override_source_sa_enum diff --git a/infrastructure/postgres/landlord_roof_type_override_table.py b/infrastructure/postgres/landlord_roof_type_override_table.py index 58bd61ff..659c5c64 100644 --- a/infrastructure/postgres/landlord_roof_type_override_table.py +++ b/infrastructure/postgres/landlord_roof_type_override_table.py @@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint from sqlalchemy import Enum as SAEnum from sqlmodel import Field, SQLModel -from domain.epc.roof_type import RoofType +from domain.epc.property_overrides.roof_type import RoofType from infrastructure.postgres.landlord_override_enums import override_source_sa_enum diff --git a/infrastructure/postgres/landlord_wall_type_override_table.py b/infrastructure/postgres/landlord_wall_type_override_table.py index b5097164..7a3c70ae 100644 --- a/infrastructure/postgres/landlord_wall_type_override_table.py +++ b/infrastructure/postgres/landlord_wall_type_override_table.py @@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, 
UniqueConstraint from sqlalchemy import Enum as SAEnum from sqlmodel import Field, SQLModel -from domain.epc.wall_type import WallType +from domain.epc.property_overrides.wall_type import WallType from infrastructure.postgres.landlord_override_enums import override_source_sa_enum diff --git a/infrastructure/postgres/landlord_water_heating_override_table.py b/infrastructure/postgres/landlord_water_heating_override_table.py new file mode 100644 index 00000000..039b5a62 --- /dev/null +++ b/infrastructure/postgres/landlord_water_heating_override_table.py @@ -0,0 +1,69 @@ +"""SQLModel mirror of the ``landlord_water_heating_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only +differences are the table name, the ``water_heating`` pgEnum on ``value``, and +the unique-constraint name. 
+""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.epc.property_overrides.water_heating_type import WaterHeatingType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordWaterHeatingOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_water_heating_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_water_heating_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: WaterHeatingType = Field( + sa_column=Column( + SAEnum( + WaterHeatingType, + name="water_heating", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. 
+ source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/postgres/property_override_table.py b/infrastructure/postgres/property_override_table.py index 3132ddf5..40490144 100644 --- a/infrastructure/postgres/property_override_table.py +++ b/infrastructure/postgres/property_override_table.py @@ -27,6 +27,11 @@ override_component_sa_enum = SAEnum( "roof_type", "property_type", "built_form_type", + "main_fuel", + "glazing", + "construction_age_band", + "water_heating", + "main_heating_system", name="override_component", ) diff --git a/orchestration/bulk_upload_finaliser_orchestrator.py b/orchestration/bulk_upload_finaliser_orchestrator.py index 4aa49ab8..1d707a8d 100644 --- a/orchestration/bulk_upload_finaliser_orchestrator.py +++ b/orchestration/bulk_upload_finaliser_orchestrator.py @@ -14,10 +14,10 @@ from typing import Any, Optional from uuid import UUID -from domain.epc.built_form_type import BuiltFormType -from domain.epc.property_type import PropertyType -from domain.epc.roof_type import RoofType -from domain.epc.wall_type import WallType +from domain.epc.property_overrides.built_form_type import BuiltFormType +from domain.epc.property_overrides.property_type import PropertyType +from domain.epc.property_overrides.roof_type import RoofType +from domain.epc.property_overrides.wall_type import WallType from repositories.bulk_upload.bulk_upload_status_writer import BulkUploadStatusWriter from repositories.landlord_overrides.landlord_override_reader import ( LandlordOverrideReader, diff --git a/repositories/property/landlord_override_overlays.py b/repositories/property/landlord_override_overlays.py index ab8d7ca3..6c40d009 100644 --- a/repositories/property/landlord_override_overlays.py +++ 
b/repositories/property/landlord_override_overlays.py @@ -29,6 +29,17 @@ from domain.epc.property_overlays.attribute_overlay import ( built_form_overlay_for, property_type_overlay_for, ) +from domain.epc.property_overlays.construction_age_band_overlay import ( + age_band_overlay_for, +) +from domain.epc.property_overlays.glazing_overlay import glazing_overlay_for +from domain.epc.property_overlays.main_fuel_overlay import fuel_overlay_for +from domain.epc.property_overlays.main_heating_system_overlay import ( + main_heating_overlay_for, +) +from domain.epc.property_overlays.water_heating_overlay import ( + water_heating_overlay_for, +) from domain.epc.property_overlays.roof_type_overlay import roof_overlay_for from domain.epc.property_overlays.wall_type_overlay import wall_overlay_for from domain.modelling.simulation import EpcSimulation @@ -43,6 +54,11 @@ _COMPONENT_OVERLAYS: dict[str, Callable[[str, int], Optional[EpcSimulation]]] = "roof_type": roof_overlay_for, "property_type": property_type_overlay_for, "built_form_type": built_form_overlay_for, + "main_fuel": fuel_overlay_for, + "glazing": glazing_overlay_for, + "construction_age_band": age_band_overlay_for, + "water_heating": water_heating_overlay_for, + "main_heating_system": main_heating_overlay_for, } diff --git a/repositories/property/override_backed_prediction_attributes_reader.py b/repositories/property/override_backed_prediction_attributes_reader.py index 5befd3b3..1c5a4b71 100644 --- a/repositories/property/override_backed_prediction_attributes_reader.py +++ b/repositories/property/override_backed_prediction_attributes_reader.py @@ -13,7 +13,7 @@ from __future__ import annotations from typing import Optional -from domain.epc.override_code_mapping import ( +from domain.epc.property_overrides.override_code_mapping import ( built_form_to_code, property_type_to_code, ) diff --git a/scripts/fill_domna_addresses.py b/scripts/fill_domna_addresses.py new file mode 100644 index 00000000..e4a7e18b --- 
/dev/null +++ b/scripts/fill_domna_addresses.py @@ -0,0 +1,353 @@ +"""Fill the DOMNA columns in the AddressProfilingResults spreadsheet. + +Input: scripts/manipulation(2).xlsx, sheet "AddressProfilingResults", columns + Organisation Reference | UPRN | DOMNA FOUND UPRN | DOMNA FOUND ADDRESS | Address | Postcode + +Per-row rule ("if there's a UPRN in the UPRN column we're done"): + + * UPRN present AND Address present -> nothing to do (already sorted). + * UPRN present AND Address missing -> reverse-lookup the address from the UPRN + via the EPC API -> DOMNA FOUND ADDRESS. + * UPRN missing AND Address present -> resolve a UPRN from address + postcode + (EPC API, then Ordnance Survey) -> writes + DOMNA FOUND UPRN + DOMNA FOUND ADDRESS. + * not resolvable -> marked "NOT FOUND" and listed in the + unresolved report. + +Relaxed matching (this batch only — production AddressMatch is untouched): the +landlord writes flats as "3 GLADYS COURT" while EPC stores "Flat 3 Gladys +Court", which the production matcher hard-rejects. So per address we try several +query variants — the full string, just the first comma-segment, and a +"Flat ..." form — and keep the best-scoring, unambiguous match. The unit +number must still match exactly (AddressMatch zeroes mismatched numbers), so a +wrong-unit match stays unlikely. Each fill carries its score + source so you can +spot-check (DOMNA SCORE / DOMNA SOURCE). + +Rows that already have a DOMNA FOUND UPRN are skipped (idempotent / resumable). + + python -m scripts.fill_domna_addresses + python -m scripts.fill_domna_addresses --limit 200 # smoke test first N + +Keys come from backend/.env (OPEN_EPC_API_TOKEN, ORDNANCE_SURVEY_API_KEY). Run +from the worktree root (import trap). 
+""" + +from __future__ import annotations + +import argparse +import os +import re +import sys +from pathlib import Path +from typing import Optional + +import pandas as pd + +_REPO_ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap + +from backend.address2UPRN.main import get_epc_data_with_postcode # noqa: E402 +from backend.address2UPRN.scoring import all_uprns_match, rank_address_similarity # noqa: E402 +from backend.ordnanceSurvey.helpers import ( # noqa: E402 + lookup_os_places, + os_places_results_to_dataframe, +) +from backend.utils.addressMatch import AddressMatch # noqa: E402 +from datatypes.epc.search import EpcSearchResult # noqa: E402 +from infrastructure.epc_client.epc_client_service import EpcClientService # noqa: E402 +from scripts.resolve_uprns_for_finaliser import clean_postcode, load_keys # noqa: E402 + +SHEET = "AddressProfilingResults" +UPRN_COL = "UPRN" +ADDRESS_COL = "Address" +POSTCODE_COL = "Postcode" +REF_COL = "Organisation Reference" +FOUND_UPRN_COL = "DOMNA FOUND UPRN" +FOUND_ADDRESS_COL = "DOMNA FOUND ADDRESS" +SCORE_COL = "DOMNA SCORE" +SOURCE_COL = "DOMNA SOURCE" +NOT_FOUND = "NOT FOUND" + +# EPC matches are tight (short addresses) so we hold the production 0.7 bar; OS +# addresses carry more trailing tokens, so a slightly lower bar is appropriate. +EPC_THRESHOLD = 0.7 +OS_THRESHOLD = 0.6 + +_DEFAULT_IN = _REPO_ROOT / "scripts" / "manipulation(2).xlsx" +_DEFAULT_OUT = _REPO_ROOT / "scripts" / "manipulation_filled.xlsx" +_DEFAULT_UNRESOLVED = _REPO_ROOT / "scripts" / "manipulation_unresolved.csv" + +# A resolved hit: (uprn, matched_address, score, source). 
def cell_str(value: object) -> str:
    """Coerce a spreadsheet cell to a trimmed string.

    Returns "" for ``None`` and for any value whose string form is "nan"
    (case-insensitive), which is how a missing cell stringifies once pandas
    has loaded it as a float NaN.
    """
    if value is None:
        return ""
    text = str(value).strip()
    return "" if text.lower() == "nan" else text


def parse_uprn_cell(value: object) -> Optional[int]:
    """Read a UPRN cell back into an int, or ``None`` when it holds no UPRN.

    Accepts ints, digit strings, and floats/float-looking strings with an
    integral value (e.g. ``100012345678.0``).

    Returns ``None`` for blank/NaN cells, non-numeric text (such as the
    "NOT FOUND" marker), non-finite values, and non-integral numbers.
    Truncating ``12.7`` to ``12`` would fabricate a UPRN, so it is rejected
    instead.
    """
    text = cell_str(value)
    if not text:
        return None
    # Plain digit strings are parsed exactly; going through float would lose
    # precision above 2**53.
    if text.isascii() and text.isdigit():
        return int(text)
    try:
        number = float(text)
    except ValueError:
        return None
    # is_integer() is False for inf/nan as well as for fractional values, so
    # int() below can never raise OverflowError.
    return int(number) if number.is_integer() else None
def resolve_os_relaxed(
    address: str,
    postcode_clean: str,
    os_api_key: str,
    os_cache: dict[str, pd.DataFrame],
    threshold: float = OS_THRESHOLD,
) -> Optional[Hit]:
    """Best OS Places match across the address variants (cached per postcode).

    Args:
        address: The landlord-supplied address to resolve.
        postcode_clean: Normalised postcode; also the key into ``os_cache``.
        os_api_key: Ordnance Survey API key passed to ``lookup_os_places``.
        os_cache: Per-postcode cache of OS Places results. It is updated in
            place, and a failed lookup is cached as an empty frame so the
            postcode is not queried again during this run.
        threshold: Minimum ``AddressMatch`` score for a hit to be returned.

    Returns:
        ``(uprn, matched_address, score, "ordnance_survey")`` for the
        best-scoring candidate, or ``None`` when the postcode has no usable
        candidates or the best score is below ``threshold``.
    """
    places_df = os_cache.get(postcode_clean)
    if places_df is None:
        response = lookup_os_places(postcode_clean, os_api_key)
        if response.get("status") == 200 and "data" in response:
            places_df = os_places_results_to_dataframe(response["data"])
        else:
            places_df = pd.DataFrame()
        os_cache[postcode_clean] = places_df
    if places_df.empty or "ADDRESS" not in places_df.columns:
        return None

    # Keep only records that carry a usable UPRN, mirroring the
    # ``uprn in ("", "nan")`` guard in the EPC resolver. Without this a
    # record with a missing UPRN could win and be written out as a
    # "resolved" row with a blank UPRN.
    candidates: list[tuple[str, str]] = []
    for rec in places_df.to_dict(orient="records"):
        raw_uprn = rec.get("UPRN", "")
        if isinstance(raw_uprn, float) and raw_uprn.is_integer():
            # A float-typed column would otherwise stringify as "123.0".
            raw_uprn = int(raw_uprn)
        uprn = str(raw_uprn).strip()
        if uprn.lower() in ("", "nan", "none"):
            continue
        candidates.append((uprn, str(rec.get("ADDRESS", ""))))

    best: Optional[Hit] = None
    # Variant-outer / record-inner order and the strict ``>`` are kept so ties
    # resolve exactly as before (first best-scoring pair wins).
    for variant in address_variants(address):
        for uprn, candidate in candidates:
            score = AddressMatch.score(variant, candidate)
            if best is None or score > best[2]:
                best = (uprn, candidate, score, "ordnance_survey")
    return best if best is not None and best[2] >= threshold else None
(cached).""" + results = search_cache.get(postcode_clean) + if results is None: + results = service.search_by_postcode(postcode_clean) + search_cache[postcode_clean] = results + for result in results: + if result.uprn is not None and int(result.uprn) == uprn: + return _address_from_search(result) + return None + + +def fill(df: pd.DataFrame, *, os_api_key: Optional[str]) -> list[dict[str, str]]: + """Fill the DOMNA columns in place. Returns the unresolved rows.""" + for col in (FOUND_UPRN_COL, FOUND_ADDRESS_COL, SCORE_COL, SOURCE_COL): + if col not in df.columns: + df[col] = "" + df[FOUND_UPRN_COL] = df[FOUND_UPRN_COL].astype("object") + df[FOUND_ADDRESS_COL] = df[FOUND_ADDRESS_COL].astype("object") + + token = os.environ.get("OPEN_EPC_API_TOKEN") + service = EpcClientService(auth_token=token) if token else None + epc_cache: dict[str, pd.DataFrame] = {} + os_cache: dict[str, pd.DataFrame] = {} + search_cache: dict[str, list[EpcSearchResult]] = {} + + unresolved: list[dict[str, str]] = [] + resolved_uprn = resolved_addr = skipped = 0 + total = len(df) + + for n, idx in enumerate(df.index, start=1): + ref = cell_str(df.at[idx, REF_COL]) + given_uprn = parse_uprn_cell(df.at[idx, UPRN_COL]) + address = cell_str(df.at[idx, ADDRESS_COL]) + postcode_raw = cell_str(df.at[idx, POSTCODE_COL]) + postcode_clean = clean_postcode(postcode_raw) + + # Already sorted (UPRN + address) or already filled by a prior run. 
+ if given_uprn is not None and address: + skipped += 1 + continue + if cell_str(df.at[idx, FOUND_UPRN_COL]) and cell_str(df.at[idx, FOUND_UPRN_COL]) != NOT_FOUND: + skipped += 1 + continue + + def mark_not_found(reason: str) -> None: + df.at[idx, FOUND_UPRN_COL] = NOT_FOUND if given_uprn is None else "" + df.at[idx, FOUND_ADDRESS_COL] = NOT_FOUND + df.at[idx, SOURCE_COL] = "not_found" + unresolved.append( + { + "Organisation Reference": ref, + "reason": reason, + "Address": address, + "Postcode": postcode_raw, + } + ) + + # Case B — UPRN present, address missing: reverse-lookup the address. + if given_uprn is not None and not address: + found: Optional[str] = None + if service is not None and postcode_clean: + try: + found = reverse_address_from_uprn( + given_uprn, postcode_clean, service, search_cache + ) + except Exception as exc: + print(f" reverse failed {ref} {given_uprn}: {exc}") + if found: + df.at[idx, FOUND_ADDRESS_COL] = found + df.at[idx, SOURCE_COL] = "epc_reverse" + resolved_addr += 1 + else: + mark_not_found("no address for UPRN") + continue + + # Case A — no UPRN, has address: resolve a UPRN. 
+ if given_uprn is None and address: + if not postcode_clean: + mark_not_found("no postcode") + continue + hit: Optional[Hit] = None + if token: + try: + hit = resolve_epc_relaxed(address, postcode_clean, epc_cache) + except Exception as exc: + print(f" EPC failed {ref} {postcode_clean}: {exc}") + if hit is None and os_api_key: + try: + hit = resolve_os_relaxed(address, postcode_clean, os_api_key, os_cache) + except Exception as exc: + print(f" OS failed {ref} {postcode_clean}: {exc}") + if hit is not None: + uprn, matched, score, source = hit + df.at[idx, FOUND_UPRN_COL] = uprn + df.at[idx, FOUND_ADDRESS_COL] = matched + df.at[idx, SCORE_COL] = round(score, 4) + df.at[idx, SOURCE_COL] = source + resolved_uprn += 1 + else: + mark_not_found("no UPRN match") + if n % 100 == 0: + print( + f"[{n}/{total}] resolved={resolved_uprn} not_found={len(unresolved)}" + ) + continue + + # Case C — neither a UPRN nor an address. + mark_not_found("no UPRN and no address") + + print( + f"\nResolved {resolved_uprn} UPRNs, {resolved_addr} addresses; " + f"{skipped} already sorted/done; {len(unresolved)} not found." 
+ ) + return unresolved + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN) + parser.add_argument("--out", type=Path, default=_DEFAULT_OUT) + parser.add_argument("--unresolved", type=Path, default=_DEFAULT_UNRESOLVED) + parser.add_argument("--limit", type=int, default=None, help="process first N rows") + return parser.parse_args() + + +def main() -> int: + args = _parse_args() + _epc_token, os_api_key = load_keys() + + df = pd.read_excel(args.inp, sheet_name=SHEET) + if args.limit is not None: + df = df.head(args.limit).copy() + print(f"Loaded {len(df)} rows from {args.inp} [{SHEET}]") + + unresolved = fill(df, os_api_key=os_api_key) + + df.to_excel(args.out, sheet_name=SHEET, index=False) + print(f"Wrote filled sheet -> {args.out}") + if unresolved: + pd.DataFrame(unresolved).to_csv(args.unresolved, index=False) + print(f"Wrote {len(unresolved)} unresolved rows -> {args.unresolved}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/finalise_to_property_table.py b/scripts/finalise_to_property_table.py new file mode 100644 index 00000000..751e8c59 --- /dev/null +++ b/scripts/finalise_to_property_table.py @@ -0,0 +1,331 @@ +"""Insert resolved manipulation_filled rows into the FE-owned ``property`` table. + +Reuses the bulk_upload_finaliser's own row->PropertyIdentityInsert mapping +(``BulkUploadFinaliserOrchestrator._row_to_insert``) and the same +``PropertyPostgresRepository.insert_all`` the Lambda uses, so a row inserted here +is identical to one the real finaliser would write. The status-writer / +property_overrides path is skipped — this only populates ``property`` (no +BulkUpload task needed). + +Insert is ON CONFLICT (portfolio_id, uprn) DO NOTHING, so re-running is safe. 
+ + # one random resolved row into portfolio 796, then read it back + python -m scripts.finalise_to_property_table --portfolio 796 --one + + # a specific Organisation Reference + python -m scripts.finalise_to_property_table --portfolio 796 --ref 56100000101 + + # the whole sheet (resolved rows only by default; --include-unmatched to add + # null-UPRN rows too) + python -m scripts.finalise_to_property_table --portfolio 796 --all + +Postgres target comes from the root .env (POSTGRES_*). Run from the worktree root. +""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path +from typing import Optional + +import pandas as pd +from dotenv import load_dotenv +from sqlmodel import select + +_REPO_ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap + +from infrastructure.postgres.config import PostgresConfig # noqa: E402 +from infrastructure.postgres.engine import commit_scope, make_engine, make_session # noqa: E402 +from infrastructure.postgres.property_table import PropertyRow # noqa: E402 +from orchestration.bulk_upload_finaliser_orchestrator import ( # noqa: E402 + BulkUploadFinaliserOrchestrator, +) +from repositories.property.property_postgres_repository import ( # noqa: E402 + PropertyPostgresRepository, +) +from repositories.property.property_repository import PropertyIdentityInsert # noqa: E402 +from scripts.fill_domna_addresses import ( # noqa: E402 + ADDRESS_COL, + FOUND_ADDRESS_COL, + FOUND_UPRN_COL, + POSTCODE_COL, + REF_COL, + SCORE_COL, + SHEET, + UPRN_COL, + NOT_FOUND, + cell_str, + parse_uprn_cell, +) + +_DEFAULT_IN = _REPO_ROOT / "scripts" / "manipulation_filled.xlsx" + + +def _final_uprn(row: pd.Series) -> Optional[int]: + """The authoritative UPRN: the given one, else the DOMNA-found one.""" + given = parse_uprn_cell(row.get(UPRN_COL)) + if given is not None: + return given + found = cell_str(row.get(FOUND_UPRN_COL)) + if found 
def dedupe_by_uprn(
    rows: list[dict[str, str]],
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Keep the first row per UPRN; return (kept, dropped collisions).

    The DB INSERT collapses duplicate (portfolio, uprn) via ON CONFLICT DO
    NOTHING anyway, so this just makes the collision explicit (the dropped rows
    are written out for review) rather than letting an arbitrary ref win silently.

    Rows whose ``address2uprn_uprn`` is empty (the unmatched rows admitted by
    ``--include-unmatched``) have no UPRN to collide on, so they are always
    kept. Previously they all shared the key ``""`` and every one after the
    first was reported as a duplicate-UPRN collision and dropped.

    Args:
        rows: combiner-shaped rows; each must carry an ``address2uprn_uprn`` key.

    Returns:
        ``(kept, dropped)`` — both preserve the input order.
    """
    seen: set[str] = set()
    kept: list[dict[str, str]] = []
    dropped: list[dict[str, str]] = []
    for row in rows:
        uprn = row["address2uprn_uprn"]
        if not uprn:
            # No UPRN -> nothing to dedupe on; never treat blanks as one shared key.
            kept.append(row)
            continue
        if uprn in seen:
            dropped.append(row)
        else:
            seen.add(uprn)
            kept.append(row)
    return kept, dropped
One transaction. + + Returns (properties_deleted, properties_inserted). + """ + inserts: list[PropertyIdentityInsert] = [ + BulkUploadFinaliserOrchestrator._row_to_insert(r, portfolio_id) for r in rows + ] + engine = _engine() + session = make_session(engine) + deleted = 0 + inserted = 0 + try: + repo = PropertyPostgresRepository(session) + with commit_scope(session): + if reset: + deleted = _reset_portfolio(session, portfolio_id) + for start in range(0, len(inserts), _INSERT_CHUNK): + inserted += repo.insert_all(inserts[start : start + _INSERT_CHUNK]) + finally: + session.close() + return deleted, inserted + + +def _engine(): + load_dotenv(_REPO_ROOT / ".env") + return make_engine(PostgresConfig.from_env(os.environ)) + + +def insert_rows(rows: list[dict[str, str]], portfolio_id: int) -> int: + """Insert via the finaliser's mapper + repository. Returns rows inserted.""" + inserts: list[PropertyIdentityInsert] = [ + BulkUploadFinaliserOrchestrator._row_to_insert(r, portfolio_id) for r in rows + ] + engine = _engine() + session = make_session(engine) + try: + repo = PropertyPostgresRepository(session) + with commit_scope(session): + inserted = repo.insert_all(inserts) + finally: + session.close() + return inserted + + +def fetch_by_ref(portfolio_id: int, ref: str) -> list[PropertyRow]: + """Read back inserted rows for one Organisation Reference (for verification).""" + engine = _engine() + session = make_session(engine) + try: + stmt = select(PropertyRow).where( + PropertyRow.portfolio_id == portfolio_id, + PropertyRow.landlord_property_id == ref, + ) + return list(session.exec(stmt).all()) + finally: + session.close() + + +def _show(row: dict[str, str], insert: PropertyIdentityInsert) -> None: + print("\nSource (combiner) row:") + for k, v in row.items(): + print(f" {k}: {v!r}") + print("\nMapped PropertyIdentityInsert:") + for k, v in insert.__dict__.items(): + print(f" {k}: {v!r}") + + +def _parse_args() -> argparse.Namespace: + parser = 
def main() -> int:
    """Load the filled sheet and insert rows into ``property`` per the CLI mode.

    ``--all`` dedupes by UPRN, writes any dropped collisions to ``--collisions``
    and chunk-inserts the rest (after wiping the portfolio when ``--reset`` is
    given). ``--ref`` / ``--one`` pick a single row, print its mapping, insert
    it and read it back for verification.

    Returns:
        Process exit code: 0 on success, 1 when no row could be selected.
    """
    args = _parse_args()
    # The DataFrame half of load_rows() is not needed here; only the combiner rows are.
    _df, rows = load_rows(args.inp, include_unmatched=args.include_unmatched)
    print(f"Loaded {len(rows)} candidate rows from {args.inp}")

    if args.reset and not args.all:
        # --reset is only honoured on the --all path; say so rather than ignore it silently.
        print("--reset has no effect without --all; ignoring it.", file=sys.stderr)

    if args.all:
        kept, dropped = dedupe_by_uprn(rows)
        if dropped:
            pd.DataFrame(dropped).to_csv(args.collisions, index=False)
            print(
                f"{len(dropped)} duplicate-UPRN rows dropped -> {args.collisions} "
                f"({len(kept)} unique to insert)"
            )
        deleted, inserted = clean_reload(kept, args.portfolio, reset=args.reset)
        if args.reset:
            print(f"Deleted {deleted} existing properties in portfolio {args.portfolio}.")
        print(f"Inserted {inserted} properties into portfolio {args.portfolio}.")
        return 0

    # Single-row paths: pick the row, show the mapping, insert, read back.
    if args.ref:
        match = [r for r in rows if r["Internal Reference"] == args.ref]
        if not match:
            print(f"No resolved row with Organisation Reference {args.ref!r}.")
            return 1
        row = match[0]
    else:  # --one: deterministic "random" pick via seed
        if not rows:
            # Guard the modulo below: an empty sheet used to raise ZeroDivisionError.
            print(f"No candidate rows in {args.inp}; nothing to insert.")
            return 1
        idx = (args.seed * 7919) % len(rows)
        row = rows[idx]

    ref = row["Internal Reference"]
    insert = BulkUploadFinaliserOrchestrator._row_to_insert(row, args.portfolio)
    _show(row, insert)

    inserted = insert_rows([row], args.portfolio)
    print(
        f"\ninsert_all -> {inserted} new row(s) "
        f"(0 means it already existed; ON CONFLICT DO NOTHING)."
    )

    print(f"\nproperty rows for portfolio {args.portfolio}, ref {ref!r}:")
    for pr in fetch_by_ref(args.portfolio, ref):
        print(
            f" id={pr.id} uprn={pr.uprn} address={pr.address!r} "
            f"postcode={pr.postcode!r} status={pr.creation_status} "
            f"lexiscore={pr.lexiscore}"
        )
    return 0
+- The 19 unresolved descriptions are documented in `scripts/hyde/unknowns_review.md`, with + proposed values already written to `overrides_edits.csv` (gitignored). +- Env (DB creds + `OPENAI_API_KEY`) is in `/workspaces/home/github/Model/.env`; load it with + python-dotenv and set `POSTGRES_DRIVER=psycopg2`. Writes are idempotent upserts (unique on + `property_id, override_component, building_part`) — safe to re-run, never duplicates. + +**Do this, in order:** +1. **Ask me what Khalim decided** for the unknowns. The one real judgement call is the + flat-roof reading: `Flat: As Built` (1,172 rows) + `Flat: Unknown` (194) → which of + `Flat, no insulation (assumed)` / `Flat, insulated (assumed)` / `Flat, limited insulation + (assumed)`. The `construction_age_band` bands (29,829 rows) are deterministic (band = first + letter) — keep as-is unless I say otherwise. Confirm the other roof/wall proposals too. +2. **Update `overrides_edits.csv`** (`corrected_value` column) to match Khalim's decisions. +3. Run `validate --edits overrides_edits.csv` and fix anything it rejects. +4. **Show me the final edits + the planned write counts, and WAIT for my explicit go-ahead + before any `--apply`.** Do not write to the DB before I confirm. +5. On my go-ahead: + - `apply-edits --edits overrides_edits.csv --portfolio-id 796 --apply` (user corrections → ledger) + - `write --excel scripts/hyde/hyde_property_overrides.xlsx --portfolio-id 796` (DRY RUN — + report unmatched org_refs + unresolved across all 31,773 first) + - then the same `write ... --apply` +6. `verify --portfolio-id 796 --org-ref ORG_REF` (any one Organisation Reference from the sheet) to confirm property_overrides + + overlays landed. +7. Remind me about the deferred **age-classifier prompt-hint fix** for the production lambda + (the live frontend will hit the same `"D: 1950-1966"` → UNKNOWN until that lands). + +Every DB command loads env from `/workspaces/home/github/Model/.env`.
Read-only checks +(`verify`, dry-run `write`) are fine to run unprompted; anything `--apply` needs my confirm. + +--- diff --git a/scripts/hyde/build_property_overrides.py b/scripts/hyde/build_property_overrides.py new file mode 100644 index 00000000..dc77b520 --- /dev/null +++ b/scripts/hyde/build_property_overrides.py @@ -0,0 +1,437 @@ +"""Build ``property_overrides`` for a portfolio from the Hyde Excel, bypassing the +frontend + lambdas, using the ``landlord_*_overrides`` tables as the durable +classification ledger. + +Why the ledger (not a throwaway cache): ``landlord_*_overrides`` stores +``(portfolio_id, description) -> value`` with a ``source`` (classifier|user). + * Re-runs classify only descriptions NOT already stored -> saves ChatGPT calls. + * Human corrections are stored as ``source=user`` and the classifier is + forbidden from overwriting them (ADR-0003) -> edits are permanent. +Then we resolve the vocab + match each row to a ``property.id`` by **org_ref** +(Excel "Organisation Reference" -> property.landlord_property_id) and upsert +``property_overrides`` (the fact layer the SAP overlay reads). + +Subcommands: + list-values print each component's valid override values (reference) + classify --excel f --portfolio-id 795 + PASS 1: classify cache-misses via ChatGPT, + upsert to landlord tables, write + overrides_unknowns.csv (with allowed_values) + validate --edits overrides_edits.csv + check a hand-edited file: every corrected_value + must be a valid enum value (suggests fixes) + apply-edits --edits overrides_edits.csv --portfolio-id 795 [--apply] + upsert validated corrections as source=user + write --excel f --portfolio-id 795 [--apply] + PASS 2: build + upsert property_overrides from vocab + +Env: POSTGRES_* (PostgresConfig.from_env) and OPENAI_API_KEY (ChatGPT). 
+""" + +from __future__ import annotations + +import argparse +import csv +import difflib +import logging +import os +from collections import Counter +from dataclasses import dataclass +from datetime import datetime, timezone +from enum import Enum +from typing import Any, Optional + +import pandas as pd # pyright: ignore[reportMissingTypeStubs] +from sqlalchemy import Table, text +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlmodel import SQLModel + +from domain.epc.property_overrides.built_form_type import BuiltFormType +from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand +from domain.epc.property_overrides.glazing_type import GlazingType +from domain.epc.property_overrides.main_fuel_type import MainFuelType +from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType +from domain.epc.property_overrides.property_type import PropertyType +from domain.epc.property_overrides.roof_type import RoofType +from domain.epc.property_overrides.wall_type import WallType +from domain.epc.property_overrides.wall_type_construction_dates import ( + wall_type_construction_date_prompt_hint, +) +from domain.epc.property_overrides.water_heating_type import WaterHeatingType +from infrastructure.chatgpt.chatgpt import ChatGPT +from infrastructure.chatgpt.chatgpt_column_classifier import ChatGptColumnClassifier +from infrastructure.landlord_overrides.landlord_override_reader_postgres_repository import ( + LandlordOverrideReaderPostgresRepository, +) +from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import ( + LandlordOverridesRepository, +) +from infrastructure.postgres.config import PostgresConfig +from infrastructure.postgres.engine import commit_scope, make_engine, make_session +from infrastructure.postgres.landlord_built_form_type_override_table import ( + LandlordBuiltFormTypeOverrideRow, +) +from infrastructure.postgres.landlord_construction_age_band_override_table 
import ( + LandlordConstructionAgeBandOverrideRow, +) +from infrastructure.postgres.landlord_glazing_override_table import ( + LandlordGlazingOverrideRow, +) +from infrastructure.postgres.landlord_main_fuel_override_table import ( + LandlordMainFuelOverrideRow, +) +from infrastructure.postgres.landlord_main_heating_system_override_table import ( + LandlordMainHeatingSystemOverrideRow, +) +from infrastructure.postgres.landlord_override_enums import OverrideSource +from infrastructure.postgres.landlord_property_type_override_table import ( + LandlordPropertyTypeOverrideRow, +) +from infrastructure.postgres.landlord_roof_type_override_table import ( + LandlordRoofTypeOverrideRow, +) +from infrastructure.postgres.landlord_wall_type_override_table import ( + LandlordWallTypeOverrideRow, +) +from infrastructure.postgres.landlord_water_heating_override_table import ( + LandlordWaterHeatingOverrideRow, +) +from repositories.property.landlord_override_overlays import overlays_from +from repositories.property.property_override_postgres_repository import ( + PropertyOverridePostgresRepository, +) +from repositories.property.property_override_repository import PropertyOverrideInsert +from repositories.property.property_overrides_postgres_reader import ( + PropertyOverridesPostgresReader, +) + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger("build_property_overrides") + +ORG_REF_COLUMN = "Organisation Reference" +UNKNOWNS_PATH = "overrides_unknowns.csv" + + +@dataclass(frozen=True) +class ComponentSpec: + component: str + enum_cls: type[Enum] + unknown: Enum + row_type: type[SQLModel] + excel_header: str + per_building_part: bool # comma = building parts (wall/roof/age) vs whole-dwelling + extra_instructions: Optional[str] = None + + def allowed_values(self) -> list[str]: + """Valid override values a human may pick (excludes UNKNOWN).""" + return sorted(m.value for m in self.enum_cls if m is not self.unknown) + + +def 
_component_specs() -> list[ComponentSpec]: + return [ + ComponentSpec("property_type", PropertyType, PropertyType.UNKNOWN, LandlordPropertyTypeOverrideRow, "Property Type", False), + ComponentSpec("built_form_type", BuiltFormType, BuiltFormType.UNKNOWN, LandlordBuiltFormTypeOverrideRow, "Property Type", False), + ComponentSpec("wall_type", WallType, WallType.UNKNOWN, LandlordWallTypeOverrideRow, "Walls", True, wall_type_construction_date_prompt_hint()), + ComponentSpec("roof_type", RoofType, RoofType.UNKNOWN, LandlordRoofTypeOverrideRow, "Roofs", True), + ComponentSpec("construction_age_band", ConstructionAgeBand, ConstructionAgeBand.UNKNOWN, LandlordConstructionAgeBandOverrideRow, "Age", True), + ComponentSpec("main_fuel", MainFuelType, MainFuelType.UNKNOWN, LandlordMainFuelOverrideRow, "Main Fuel", False), + ComponentSpec("glazing", GlazingType, GlazingType.UNKNOWN, LandlordGlazingOverrideRow, "Glazing", False), + ComponentSpec("water_heating", WaterHeatingType, WaterHeatingType.UNKNOWN, LandlordWaterHeatingOverrideRow, "Hot Water", False), + ComponentSpec("main_heating_system", MainHeatingSystemType, MainHeatingSystemType.UNKNOWN, LandlordMainHeatingSystemOverrideRow, "Heating", False), + ] + + +def _specs_by_component() -> dict[str, ComponentSpec]: + return {s.component: s for s in _component_specs()} + + +def _norm(s: Any) -> str: + """Vocab key normalisation — mirrors the orchestrator (strip + lower).""" + return str(s or "").strip().lower() + + +def _split_entries(cell: Any, per_building_part: bool) -> list[str]: + raw = "" if cell is None else str(cell) + if not raw.strip(): + return [] + if not per_building_part: + return [raw.strip()] + return [part.strip() for part in raw.split(",") if part.strip()] + + +def _load_rows(excel: str, sheet: str) -> list[dict[str, Any]]: + return pd.read_excel(excel, sheet_name=sheet).to_dict(orient="records") # type: ignore[return-value] + + +def _filter_rows(rows: list[dict[str, Any]], org_ref: Optional[str], + limit: 
Optional[int]) -> list[dict[str, Any]]: + """Narrow to one property (--org-ref) or the first N rows (--limit) for a + cheap smoke test before the full run.""" + if org_ref: + rows = [r for r in rows if str(r.get(ORG_REF_COLUMN, "")).strip() == org_ref] + if limit: + rows = rows[:limit] + return rows + + +def _distinct_entries(rows: list[dict[str, Any]], spec: ComponentSpec) -> Counter[str]: + counts: Counter[str] = Counter() + for row in rows: + for entry in _split_entries(row.get(spec.excel_header), spec.per_building_part): + counts[entry] += 1 + return counts + + +# --------------------------------------------------------------------------- # +def list_values(_: argparse.Namespace) -> None: + """Print the valid override values per component (the reference for edits).""" + for spec in _component_specs(): + print(f"\n## {spec.component} (Excel: {spec.excel_header})") + for v in spec.allowed_values(): + print(f" {v}") + + +def validate(args: argparse.Namespace) -> None: + """Check a hand-edited CSV: every corrected_value must be a valid enum value.""" + specs = _specs_by_component() + bad = 0 + with open(args.edits, newline="") as f: + for i, r in enumerate(csv.DictReader(f), start=2): + val = (r.get("corrected_value") or "").strip() + if not val: + continue + comp = (r.get("component") or "").strip() + spec = specs.get(comp) + if spec is None: + logger.error("row %d: unknown component %r", i, comp) + bad += 1 + continue + if val not in spec.allowed_values(): + hint = difflib.get_close_matches(val, spec.allowed_values(), n=2) + logger.error("row %d [%s]: %r is not a valid value.%s", + i, comp, val, + f" Did you mean: {hint}?" if hint else + " Run 'list-values' for the allowed set.") + bad += 1 + if bad: + raise SystemExit(f"{bad} invalid corrected_value(s) — fix them before apply-edits.") + logger.info("All corrected values are valid enum values. 
✓") + + +def _db_session() -> Any: + return make_session(make_engine(PostgresConfig.from_env(os.environ))) + + +def classify(args: argparse.Namespace) -> None: + rows = _filter_rows(_load_rows(args.excel, args.sheet), args.org_ref, args.limit) + logger.info("Classifying over %d row(s).", len(rows)) + chat_gpt = ChatGPT() + session = _db_session() + reader = LandlordOverrideReaderPostgresRepository(session) + try: + vocab = reader.load_for_portfolio(args.portfolio_id) # {component: {desc: value}} + unknown_rows: list[tuple[str, str, int, str]] = [] + + for spec in _component_specs(): + counts = _distinct_entries(rows, spec) + known = vocab.get(spec.component, {}) # already-classified (cache) + to_classify = {d for d in counts if _norm(d) not in known} + logger.info("%-22s %4d distinct | %4d cached | %4d to classify", + spec.component, len(counts), len(counts) - len(to_classify), len(to_classify)) + + resolved: dict[str, Enum] = {} + if to_classify: + classifier: ChatGptColumnClassifier[Any] = ChatGptColumnClassifier( + chat_gpt, spec.enum_cls, spec.unknown, extra_instructions=spec.extra_instructions) + resolved = classifier.classify(to_classify) + repo: LandlordOverridesRepository[Any] = LandlordOverridesRepository(session, spec.row_type) + with commit_scope(session): + # store keyed on the normalised description (matches the reader/finaliser lookup) + repo.upsert_all(args.portfolio_id, {_norm(d): m for d, m in resolved.items()}) + + # collect UNKNOWNs (freshly classified + anything cached as UNKNOWN) for review + unk = spec.unknown.value + for desc, n in counts.items(): + v = resolved.get(desc).value if desc in resolved and resolved[desc] else known.get(_norm(desc)) # type: ignore[union-attr] + if v is None or v == unk: + allowed = " | ".join(spec.allowed_values()) + unknown_rows.append((spec.component, desc, n, allowed)) + + with open(UNKNOWNS_PATH, "w", newline="") as f: + w = csv.writer(f) + w.writerow(["component", "description", "count", "corrected_value", 
def apply_edits(args: argparse.Namespace) -> None:
    """Upsert the hand-edited corrections in ``args.edits`` as ``source=user``.

    The file is validated first, so nothing reaches the DB if any
    ``corrected_value`` is invalid. Without ``--apply`` this is a dry run that
    only reports how many corrections are ready; it does not open a DB session.

    Args:
        args: parsed CLI namespace with ``edits``, ``portfolio_id`` and ``apply``.

    Raises:
        SystemExit: propagated from ``validate`` when the edits file is invalid.
    """
    validate(args)  # fail before touching the DB
    specs = _specs_by_component()
    by_component: dict[str, dict[str, str]] = {}
    with open(args.edits, newline="") as f:
        for line_no, r in enumerate(csv.DictReader(f), start=2):
            val = (r.get("corrected_value") or "").strip()
            # Strip exactly as validate() does; otherwise a padded component
            # passes validation and is then silently skipped here.
            comp = (r.get("component") or "").strip()
            if not val or comp not in specs:
                continue
            desc = r.get("description") or ""
            if not desc.strip():
                # A correction with no description can never match a vocab key.
                logger.warning("row %d [%s]: empty description — skipped", line_no, comp)
                continue
            by_component.setdefault(comp, {})[desc] = val

    if not args.apply:
        total = sum(len(m) for m in by_component.values())
        logger.info("DRY RUN — %d user corrections ready. Re-run with --apply.", total)
        return

    session = _db_session()
    try:
        with commit_scope(session):
            n = _upsert_user_corrections(session, args.portfolio_id, by_component)
        logger.info("Upserted %d user corrections (source=user).", n)
    finally:
        session.close()
len(inserts), sum(unmatched.values()), sum(unresolved.values())) + if unresolved: + logger.info("Top unresolved (need apply-edits): %s", unresolved.most_common(10)) + if not args.apply: + logger.info("DRY RUN — not writing. Re-run with --apply.") + for ins in inserts[:10]: + logger.info(" %s", ins) + return + with commit_scope(session): + affected = PropertyOverridePostgresRepository(session).upsert_all(inserts) + logger.info("Upserted %d property_overrides.", affected) + finally: + session.close() + + +def verify(args: argparse.Namespace) -> None: + """For one property (by org_ref): show the persisted property_overrides rows + and the EpcSimulation overlays they produce — the end-to-end proof that the + chain reaches the SAP overlay surface.""" + session = _db_session() + try: + org_ref_map = _org_ref_to_property_id(session, args.portfolio_id) + property_id = org_ref_map.get(args.org_ref) + if property_id is None: + raise SystemExit(f"org_ref {args.org_ref!r} not found in portfolio {args.portfolio_id}.") + reader = PropertyOverridesPostgresReader(lambda: session) + resolved = reader.overrides_for(property_id) + logger.info("property_id %d — %d property_overrides rows:", property_id, len(resolved.rows)) + for r in resolved.rows: + logger.info(" part %d | %-22s = %s", r.building_part, r.override_component, r.override_value) + overlays = overlays_from(resolved) + logger.info("\n-> %d EpcSimulation overlay(s) produced (what the SAP calc applies):", len(overlays)) + for o in overlays: + logger.info(" %s", o) + finally: + session.close() + + +def main() -> None: + p = argparse.ArgumentParser(description=__doc__) + sub = p.add_subparsers(dest="cmd", required=True) + + sub.add_parser("list-values").set_defaults(func=list_values) + + v = sub.add_parser("validate") + v.add_argument("--edits", required=True) + v.set_defaults(func=validate) + + c = sub.add_parser("classify") + c.add_argument("--excel", required=True) + c.add_argument("--sheet", 
default="AddressProfilingResults") + c.add_argument("--portfolio-id", type=int, required=True) + c.add_argument("--org-ref", default=None, help="smoke test: only this property's org_ref") + c.add_argument("--limit", type=int, default=None, help="smoke test: first N rows") + c.set_defaults(func=classify) + + a = sub.add_parser("apply-edits") + a.add_argument("--edits", required=True) + a.add_argument("--portfolio-id", type=int, required=True) + a.add_argument("--apply", action="store_true") + a.set_defaults(func=apply_edits) + + w = sub.add_parser("write") + w.add_argument("--excel", required=True) + w.add_argument("--sheet", default="AddressProfilingResults") + w.add_argument("--portfolio-id", type=int, required=True) + w.add_argument("--org-ref", default=None, help="smoke test: only this property's org_ref") + w.add_argument("--limit", type=int, default=None, help="smoke test: first N rows") + w.add_argument("--apply", action="store_true") + w.set_defaults(func=write) + + vf = sub.add_parser("verify") + vf.add_argument("--portfolio-id", type=int, required=True) + vf.add_argument("--org-ref", required=True) + vf.set_defaults(func=verify) + + args = p.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/scripts/hyde/unknowns_review.md b/scripts/hyde/unknowns_review.md new file mode 100644 index 00000000..44514221 --- /dev/null +++ b/scripts/hyde/unknowns_review.md @@ -0,0 +1,53 @@ +# Hyde portfolio 796 — UNKNOWN overrides for review + +After ChatGPT classification, **19 distinct descriptions** did not auto-resolve (out of ~440 distinct across all components). Grouped below with a **proposed value** (must be one of the allowed enum values) + the row count it affects. Nothing is written to the DB until these are confirmed. + +## 1. 
construction_age_band — 29,829 rows (DETERMINISTIC, no judgement) + +The classifier didn't extract the band letter in batch, but the band IS the leading letter, so these are mapped mechanically (`"D: 1950-1966"` → `D`). Just confirm the approach. + +| description | → band | rows | +|---|---|---| +| D: 1950-1966 | `D` | 4,978 | +| K: 2007-2011 | `K` | 4,201 | +| I: 1996-2002 | `I` | 3,708 | +| B: 1900-1929 | `B` | 3,222 | +| H: 1991-1995 | `H` | 2,747 | +| E: 1967-1975 | `E` | 2,479 | +| J: 2003-2006 | `J` | 2,221 | +| F: 1976-1982 | `F` | 2,071 | +| C: 1930-1949 | `C` | 1,840 | +| G: 1983-1990 | `G` | 1,615 | +| A: pre-1900 | `A` | 615 | +| M: 2023 onwards | `M` | 132 | + +## 2. roof_type (flat roofs) — 1,473 rows (NEEDS KHALIM'S CALL) + +Flat-roof insulation drives the SAP roof U-value. **`Flat: As Built` (1,172) + `Flat: Unknown` (194) are the load-bearing decision** — proposed conservatively as *no insulation (assumed)*. + +| description | proposed value | rows | alt options | +|---|---|---|---| +| Flat: As Built | `Flat, no insulation (assumed)` | 1,172 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) | +| Flat: Unknown | `Flat, no insulation (assumed)` | 194 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) | +| Flat: 150mm | `Flat, insulated` | 59 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) | +| Flat: 100mm | `Flat, insulated` | 32 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) | +| Flat: 50mm | `Flat, limited insulation` | 13 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) | +| SameDwellingAbove | `(same dwelling above)` | 3 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) | + +## 3. 
wall_type — 7 rows + +| description | proposed value | rows | +|---|---|---| +| TimberFrame: Internal | `Timber frame, with additional insulation` | 7 | + +## How to apply after review + +Edit the `corrected_value` column of `overrides_edits.csv`, then: + +``` +python scripts/hyde/build_property_overrides.py validate --edits overrides_edits.csv +python scripts/hyde/build_property_overrides.py apply-edits --edits overrides_edits.csv --portfolio-id 796 --apply +python scripts/hyde/build_property_overrides.py write --excel scripts/hyde/hyde_property_overrides.xlsx --portfolio-id 796 --apply +``` + +> Note: a proper fix for the age classifier (a prompt hint so the production lambda extracts the band letter) is a separate follow-up; these script edits handle this run. \ No newline at end of file diff --git a/scripts/hyde_epc_schema_versions.py b/scripts/hyde_epc_schema_versions.py new file mode 100644 index 00000000..7bb882ac --- /dev/null +++ b/scripts/hyde_epc_schema_versions.py @@ -0,0 +1,159 @@ +"""Tally the EPC schema versions across the hyde list (manipulation_filled UPRNs). + +For every resolved UPRN we look up its EPC certificate's ``schemaType`` (e.g. +``RdSAP-Schema-21.0.1``, ``RdSAP-Schema-17.1``, ``SAP-Schema-16.2``). The +gov EPC ``/api/domestic/search`` endpoint returns ``schemaType`` per row, so one +search-per-postcode covers every UPRN in that postcode — far cheaper than a +certificate fetch per UPRN. The latest cert (max registrationDate) wins per UPRN. + +Outputs: a per-schema-version tally with one example UPRN each, plus a CSV +mapping every UPRN -> schema version. + + python -m scripts.hyde_epc_schema_versions + python -m scripts.hyde_epc_schema_versions --workers 8 --out scripts/hyde_schema_versions.csv + +Reads OPEN_EPC_API_TOKEN from backend/.env. Run from the worktree root. 
def search_postcode(
    client: httpx.Client, postcode: str, headers: dict[str, str]
) -> list[dict[str, Any]]:
    """Return the EPC search rows for a postcode, retrying on rate-limit (429).

    Makes up to five attempts. A 429 response sleeps for the server's
    ``Retry-After`` (clamped to 0-10 s and scaled by the attempt number) and
    then retries. A 400 or 404 yields an empty list. Any other response goes
    through ``resp.raise_for_status()`` before its ``data`` list is returned.

    If every attempt is rate-limited, the postcode is reported on stdout and an
    empty list is returned so a long batch run keeps going; the message makes
    it clear the postcode was skipped rather than genuinely empty.
    """
    max_attempts = 5
    for attempt in range(max_attempts):
        resp = client.get(_SEARCH, params={"postcode": postcode}, headers=headers, timeout=30)
        if resp.status_code == 429:
            # Retry-After may be delay-seconds or an HTTP-date; only the numeric
            # form is honoured here, anything else falls back to 2 seconds
            # instead of crashing the whole run with a ValueError.
            try:
                retry_after = float(resp.headers.get("Retry-After", "2"))
            except ValueError:
                retry_after = 2.0
            # Clamp to [0, 10]: time.sleep() rejects negative durations.
            time.sleep(max(0.0, min(retry_after, 10.0)) * (attempt + 1))
            continue
        # 400 = malformed postcode (data-entry typo), 404 = no certs — skip both.
        if resp.status_code in (400, 404):
            return []
        resp.raise_for_status()
        return resp.json().get("data", [])
    # Every attempt was rate-limited. Say so: otherwise the caller cannot tell
    # "skipped" apart from "no certificates at this postcode".
    print(
        f"  still rate-limited after {max_attempts} attempts; "
        f"skipping postcode {postcode!r}"
    )
    return []
def main() -> int:
    """Tally EPC schema versions for the input UPRNs and write the mapping CSV.

    Returns 2 when OPEN_EPC_API_TOKEN is not set, otherwise 0 after printing the
    per-schema tally and writing one CSV row per distinct UPRN to ``--out``.
    """
    args = _parse_args()
    load_dotenv(_REPO_ROOT / "backend" / ".env")
    token = os.environ.get("OPEN_EPC_API_TOKEN")
    if not token:
        print("OPEN_EPC_API_TOKEN not set (backend/.env)")
        return 2

    _, rows = load_rows(args.inp, include_unmatched=False)
    pairs: list[tuple[int, str, str]] = []  # (uprn, postcode_clean, address)
    for r in rows:
        uprn = r["address2uprn_uprn"]
        if uprn:
            pairs.append((int(uprn), clean_postcode(r["postcode"]), r["address2uprn_address"]))
    # Only non-empty cleaned postcodes are searched. A UPRN whose postcode
    # cleans to "" stays in `pairs`, so it can only be resolved if some other
    # searched postcode happens to return it; otherwise it is tallied below
    # as NOT_IN_EPC.
    postcodes = sorted({pc for _, pc, _ in pairs if pc})
    print(f"{len(pairs)} UPRNs across {len(postcodes)} unique postcodes")

    by_uprn = build_uprn_schema_map(postcodes, token, args.workers)
    print(f"EPC search returned schema for {len(by_uprn)} distinct UPRNs")

    # Resolve each hyde UPRN to its schema version.
    tally: Counter[str] = Counter()
    # First (uprn, address) seen for each schema, used as the printed example.
    example: dict[str, tuple[int, str]] = {}
    out_lines: list[tuple[int, str, str, str]] = []  # uprn, schema, postcode, address
    seen: set[int] = set()
    for uprn, pc, address in pairs:
        # A UPRN listed more than once is counted (and written) only once.
        if uprn in seen:
            continue
        seen.add(uprn)
        # UPRNs the search did not return are bucketed under NOT_IN_EPC.
        schema = by_uprn.get(uprn, (NOT_IN_EPC, ""))[0]
        tally[schema] += 1
        example.setdefault(schema, (uprn, address))
        out_lines.append((uprn, schema, pc, address))

    # Write the full per-UPRN mapping.
    import csv

    with args.out.open("w", newline="", encoding="utf-8") as fh:
        w = csv.writer(fh)
        w.writerow(["uprn", "schema_version", "postcode", "matched_address"])
        w.writerows(out_lines)

    # Console summary, most common schema first.
    print(f"\nSchema versions across {len(seen)} distinct UPRNs:\n")
    print(f" {'schema version':<26} {'count':>7} example UPRN")
    print(f" {'-'*26} {'-'*7} {'-'*12}")
    for schema, count in tally.most_common():
        ex_uprn, ex_addr = example[schema]
        print(f" {schema:<26} {count:>7} {ex_uprn} ({ex_addr})")
    print(f"\nFull mapping -> {args.out}")
    return 0
def norm_key(address: str, postcode: str) -> Key:
    """Build the (postcode, leading number, first word) join key for an address.

    * postcode: upper-cased with spaces removed.
    * leading number: the first run of digits in the address, plus one directly
      following letter if present ("9a" -> "9A"); "" when there is no digit.
    * first word: the first run of letters in the upper-cased address that is
      not "FLAT"; "" when there is none.

    Note that a letter suffix on the number is itself a run of letters, so
    "9A High Street" yields "A" (not "HIGH") as its first word.
    """
    compact_postcode = "".join(postcode.upper().split(" "))
    text = address.upper()

    number_match = re.search(r"\d+[A-Z]?", text)
    leading_number = number_match.group(0) if number_match else ""

    first_word = next(
        (token for token in re.findall(r"[A-Z]+", text) if token != "FLAT"),
        "",
    )
    return (compact_postcode, leading_number, first_word)
def main() -> int:
    """Compare our resolved UPRNs with the Ara sheet and write the result CSV.

    Reads our step-1 CSV (``--ours``) and the Ara workbook (``--ara``), writes
    one comparison row per input row to ``--out`` and prints a per-bucket tally.

    Returns 0 on success, or 1 when there are no rows to compare (in which case
    no output file is written).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--ours", type=Path, default=_OUR_IN)
    parser.add_argument("--ara", type=Path, default=_ARA_IN)
    parser.add_argument("--out", type=Path, default=_DEFAULT_OUT)
    args = parser.parse_args()

    with args.ours.open(newline="", encoding="utf-8-sig") as fh:
        our_rows = [dict(r) for r in csv.DictReader(fh)]
    ara_index, dupes = load_ara(args.ara)
    print(f"Loaded {len(our_rows)} of our rows; {len(ara_index)} Ara keys "
          f"({dupes} duplicate Ara rows ignored).")

    result = compare(our_rows, ara_index)
    if not result:
        # The CSV header is taken from the first result row, so an empty input
        # would otherwise die with an IndexError on result[0].
        print(f"No rows found in {args.ours}; nothing to compare.")
        return 1

    fieldnames = list(result[0].keys())
    with args.out.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(result)

    counts = Counter(r["comparison"] for r in result)
    print(f"\nComparison of {len(result)} rows -> {args.out}")
    # Fixed bucket order so the summary reads the same on every run.
    for name in (
        "match",
        "differ",
        "we_only",
        "ara_only",
        "both_missing",
        "no_ara_record",
    ):
        print(f" {name}: {counts.get(name, 0)}")
    return 0
+ + python -m scripts.lisasrequest.durkan_805_schema_check + python -m scripts.lisasrequest.durkan_805_schema_check --portfolio 805 --workers 8 + +Reads OPEN_EPC_API_TOKEN from backend/.env and POSTGRES_* from the root .env. +Run from the worktree root. +""" + +from __future__ import annotations + +import argparse +import csv +import os +import sys +from collections import Counter +from pathlib import Path + +from dotenv import load_dotenv +from sqlmodel import select + +_REPO_ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap + +from infrastructure.postgres.config import PostgresConfig # noqa: E402 +from infrastructure.postgres.engine import make_engine, make_session # noqa: E402 +from infrastructure.postgres.property_table import PropertyRow # noqa: E402 +from scripts.fill_domna_addresses import clean_postcode # noqa: E402 +from scripts.hyde_epc_schema_versions import ( # noqa: E402 + NOT_IN_EPC, + build_uprn_schema_map, +) + +# Schemas EpcPropertyDataMapper.from_api_response dispatches on (everything else +# raises "Unsupported EPC schema"). Keep in sync with mapper.py:2539-2603. 
def load_portfolio_uprns(portfolio_id: int) -> list[tuple[int, str]]:
    """Fetch ``(uprn, postcode)`` for the portfolio's properties that have a UPRN.

    Loads the repo-root ``.env`` first, then builds the Postgres config from the
    environment. Rows whose UPRN is NULL are left out; a missing postcode is
    returned as ``""``. The session is closed whether or not the query succeeds.
    """
    load_dotenv(_REPO_ROOT / ".env")
    config = PostgresConfig.from_env(os.environ)
    session = make_session(make_engine(config))
    try:
        query = select(PropertyRow.uprn, PropertyRow.postcode).where(
            PropertyRow.portfolio_id == portfolio_id
        )
        return [
            (int(uprn), str(postcode or ""))
            for uprn, postcode in session.exec(query).all()
            if uprn is not None
        ]
    finally:
        session.close()
supported, postcode + seen: set[int] = set() + for uprn, pc in pairs: + if uprn in seen: + continue + seen.add(uprn) + schema = by_uprn.get(uprn, (NOT_IN_EPC, ""))[0] + supported = "yes" if schema in SUPPORTED_SCHEMAS else "no" + tally[schema] += 1 + example.setdefault(schema, uprn) + rows_out.append((uprn, schema, supported, clean_postcode(pc))) + + with args.out.open("w", newline="", encoding="utf-8") as fh: + writer = csv.writer(fh) + writer.writerow(["uprn", "schema_version", "mapper_supported", "postcode"]) + writer.writerows(rows_out) + + supported_count = sum(c for s, c in tally.items() if s in SUPPORTED_SCHEMAS) + print(f"\nSchema versions across {len(seen)} distinct UPRNs in portfolio " + f"{args.portfolio}:\n") + print(f" {'schema version':<26} {'count':>5} {'supported?':<10} example UPRN") + print(f" {'-' * 26} {'-' * 5} {'-' * 10} {'-' * 12}") + for schema, count in tally.most_common(): + supported = "yes" if schema in SUPPORTED_SCHEMAS else "NO" + print(f" {schema:<26} {count:>5} {supported:<10} {example[schema]}") + print( + f"\nMapper-supported: {supported_count}/{len(seen)} UPRNs. " + f"Full mapping -> {args.out}" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/lisasrequest/fill_domna_address.py b/scripts/lisasrequest/fill_domna_address.py new file mode 100644 index 00000000..98d70071 --- /dev/null +++ b/scripts/lisasrequest/fill_domna_address.py @@ -0,0 +1,200 @@ +"""Step 1 (Durkan portfolio): resolve a UPRN per CSV row via EPC then OS. + +Input: scripts/lisasrequest/260611_Sample_Seed_Portfolio_Durkan_split_addresses(Split Addresses).csv + columns include ``address`` and ``postcode``. + +Every row carries an address and none carry a UPRN, so there is a single case: + + * resolve a UPRN from ``address`` + ``postcode`` via the EPC API (relaxed + address variants, threshold 0.7), then Ordnance Survey Places as a fallback + (threshold 0.6). 
def read_rows(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Load a CSV file and return ``(rows, fieldnames)``.

    The file is read as UTF-8 with an optional BOM. ``fieldnames`` is the header
    in file order (an empty list when the file has no header row); each row is a
    plain ``dict`` keyed by those column names.
    """
    with path.open(newline="", encoding="utf-8-sig") as handle:
        parsed = csv.DictReader(handle)
        header = parsed.fieldnames
        columns = list(header) if header else []
        records = list(map(dict, parsed))
    return records, columns
def write_rows(rows: list[dict[str, str]], path: Path, fieldnames: list[str]) -> None:
    """Write ``rows`` to ``path`` as CSV: input columns first, DOMNA columns last.

    The header is ``fieldnames`` in the given order, followed by any result
    column from ``_RESULT_COLS`` that is not already among them. Keys in a row
    that are not part of the header are ignored.
    """
    header = [
        *fieldnames,
        *(col for col in _RESULT_COLS if col not in fieldnames),
    ]
    with path.open("w", newline="", encoding="utf-8") as sink:
        out = csv.DictWriter(sink, fieldnames=header, extrasaction="ignore")
        out.writeheader()
        out.writerows(rows)
+ if not os_api_key: + print("ORDNANCE_SURVEY_API_KEY not set (backend/.env) — OS fallback disabled") + + rows, fieldnames = read_rows(args.inp) + if args.limit is not None: + rows = rows[: args.limit] + print(f"Loaded {len(rows)} rows from {args.inp}") + + epc_hits, os_hits, not_found = fill( + rows, epc_token=epc_token, os_api_key=os_api_key + ) + + write_rows(rows, args.out, fieldnames) + resolved = epc_hits + os_hits + print( + f"\nResolved {resolved}/{len(rows)} " + f"(epc={epc_hits}, ordnance_survey={os_hits}); {not_found} not found." + ) + print(f"Wrote filled CSV -> {args.out}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/lisasrequest/finalise_to_property_table.py b/scripts/lisasrequest/finalise_to_property_table.py new file mode 100644 index 00000000..eee66a0f --- /dev/null +++ b/scripts/lisasrequest/finalise_to_property_table.py @@ -0,0 +1,111 @@ +"""Step 3 (Durkan portfolio): insert the reshaped rows into the ``property`` table. + +Reads durkan_finaliser_input.csv (step 2) and, per row, maps it with the real +finaliser mapper (``BulkUploadFinaliserOrchestrator._row_to_insert``) and inserts +via the same ``PropertyPostgresRepository.insert_all`` the Lambda uses — so a row +written here is identical to one the production finaliser would write. Insert is +ON CONFLICT (portfolio_id, uprn) DO NOTHING, so re-running is safe. + +DRY RUN BY DEFAULT — it dedupes, reports, and writes the collisions file but does +NOT touch the database. Add --commit to actually insert. + + # preview only (no DB writes): dedupe + mapping report + python -m scripts.lisasrequest.finalise_to_property_table --portfolio 805 + + # actually insert + python -m scripts.lisasrequest.finalise_to_property_table --portfolio 805 --commit + +Postgres target comes from the root .env (POSTGRES_*). Run from the worktree root. 
def read_rows(path: Path) -> list[dict[str, str]]:
    """Load every data row of a CSV file as a plain ``dict`` keyed by its header.

    The file is read as UTF-8 with an optional BOM; an empty file gives ``[]``.
    """
    with path.open(newline="", encoding="utf-8-sig") as handle:
        records = list(map(dict, csv.DictReader(handle)))
    return records
def main() -> int:
    """Dedupe the finaliser rows, preview them, and insert only with --commit.

    Without ``--commit`` this is a dry run: the collisions file may still be
    written, but ``insert_rows`` is never called. Returns 0 in both modes.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    parser.add_argument("--portfolio", type=int, required=True)
    parser.add_argument(
        "--commit",
        action="store_true",
        help="actually insert into property (default is a dry-run preview)",
    )
    parser.add_argument("--collisions", type=Path, default=_DEFAULT_COLLISIONS)
    args = parser.parse_args()

    rows = read_rows(args.inp)
    print(f"Loaded {len(rows)} finaliser rows from {args.inp}")

    kept, dropped = dedupe_by_uprn(rows)
    if dropped:
        # The collisions CSV is written even on a dry run so it can be reviewed
        # before committing; its header comes from the first dropped row.
        with args.collisions.open("w", newline="", encoding="utf-8") as fh:
            writer = csv.DictWriter(fh, fieldnames=list(dropped[0].keys()))
            writer.writeheader()
            writer.writerows(dropped)
        print(
            f"{len(dropped)} duplicate-UPRN rows dropped -> {args.collisions} "
            f"({len(kept)} unique to insert)"
        )
    else:
        print(f"No duplicate-UPRN collisions; {len(kept)} unique rows to insert.")

    _preview(kept)

    # Dry run stops here: nothing below this point runs without --commit.
    if not args.commit:
        print(
            f"\nDRY RUN — nothing written. {len(kept)} rows would be inserted into "
            f"portfolio {args.portfolio}. Re-run with --commit to write."
        )
        return 0

    inserted = insert_rows(kept, args.portfolio)
    # The difference between kept and inserted is reported as rows that were
    # already present in the portfolio.
    print(
        f"\nInserted {inserted} new properties into portfolio {args.portfolio} "
        f"({len(kept) - inserted} already existed; ON CONFLICT DO NOTHING)."
    )
    return 0
+""" + +from __future__ import annotations + +import argparse +import csv +import sys +from collections import Counter +from pathlib import Path +from typing import Optional + +_REPO_ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap + +from scripts.lisasrequest.fill_domna_address import ( # noqa: E402 + ADDRESS_COL, + FOUND_ADDRESS_COL, + FOUND_UPRN_COL, + LEXISCORE_COL, + POSTCODE_COL, + SOURCE_COL, +) +from scripts.lisasrequest.review_flags import address_numbers, input_unit # noqa: E402 + +# Finaliser input columns — must match bulk_upload_finaliser_orchestrator +# (ADDRESS_COLS / POSTCODE_COL / INTERNAL_REF_COL / UPRN_COL / +# MATCHED_ADDRESS_COL / LEXISCORE_COL). Hard-coded to keep this a light, +# stdlib-only reshape; step 3 imports the real orchestrator and will fail loudly +# if these ever drift. +FIN_ADDRESS_1, FIN_ADDRESS_2, FIN_ADDRESS_3 = "Address 1", "Address 2", "Address 3" +FIN_POSTCODE = "postcode" +FIN_INTERNAL_REF = "Internal Reference" +FIN_UPRN = "address2uprn_uprn" +FIN_MATCHED_ADDRESS = "address2uprn_address" +FIN_LEXISCORE = "address2uprn_lexiscore" +_FINALISER_COLS = [ + FIN_ADDRESS_1, + FIN_ADDRESS_2, + FIN_ADDRESS_3, + FIN_POSTCODE, + FIN_INTERNAL_REF, + FIN_UPRN, + FIN_MATCHED_ADDRESS, + FIN_LEXISCORE, +] + +# Client-clarification report columns (kept human-readable for the client). 
def clarification_reason(
    row: dict[str, str], uprn_counts: Counter[str]
) -> Optional[str]:
    """Return the reason a row must be held for clarification, or ``None``.

    * ``"not_found_no_match"``   - the source is "not_found" or the UPRN is empty.
    * ``"no_flat_level_uprn"``   - the row's UPRN is shared by more than one row
      (``uprn_counts[uprn] > 1``), whether or not the unit number matched.
    * ``"unit_number_mismatch"`` - the UPRN is unique, but the unit taken from
      the input address is not among the numbers of the matched address.
    * ``None``                   - the row is safe to finalise.
    """
    uprn = row.get(FOUND_UPRN_COL, "")
    if not uprn or row.get(SOURCE_COL) == "not_found":
        return "not_found_no_match"

    unit = input_unit(row.get(ADDRESS_COL, ""))
    unit_missing = False
    if unit:
        matched_numbers = address_numbers(row.get(FOUND_ADDRESS_COL, ""))
        unit_missing = unit not in matched_numbers

    shared_uprn = uprn_counts[uprn] > 1
    if shared_uprn:
        # A shared UPRN collides at finalise even when the unit number is right.
        return "no_flat_level_uprn"
    if unit_missing:
        return "unit_number_mismatch"
    return None
+ """ + uprn_counts: Counter[str] = Counter( + r.get(FOUND_UPRN_COL, "") for r in rows if r.get(FOUND_UPRN_COL) + ) + finaliser: list[dict[str, str]] = [] + clarify: list[dict[str, str]] = [] + for row in rows: + reason = clarification_reason(row, uprn_counts) + if reason is None or ( + accept_unit_mismatch and reason == "unit_number_mismatch" + ): + finaliser.append(to_finaliser_row(row)) + else: + clarify.append(to_clarify_row(row, reason)) + clarify.sort(key=lambda r: _REASON_ORDER.get(r[REASON_COL], 9)) + return finaliser, clarify + + +def write_csv(rows: list[dict[str, str]], path: Path, fieldnames: list[str]) -> None: + with path.open("w", newline="", encoding="utf-8") as fh: + writer = csv.DictWriter(fh, fieldnames=fieldnames, extrasaction="ignore") + writer.writeheader() + writer.writerows(rows) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN) + parser.add_argument("--finaliser-out", type=Path, default=_DEFAULT_FINALISER) + parser.add_argument("--clarify-out", type=Path, default=_DEFAULT_CLARIFY) + parser.add_argument( + "--accept-unit-mismatch", + action="store_true", + help="reshape unit_number_mismatch rows (e.g. 
9->9A) into the finaliser " + "input instead of holding them for the client", + ) + args = parser.parse_args() + + rows = read_rows(args.inp) + finaliser, clarify = split(rows, accept_unit_mismatch=args.accept_unit_mismatch) + + write_csv(finaliser, args.finaliser_out, _FINALISER_COLS) + write_csv(clarify, args.clarify_out, _CLARIFY_COLS) + + counts = Counter(r[REASON_COL] for r in clarify) + print(f"Read {len(rows)} step-1 rows.") + print(f" -> {len(finaliser)} confident rows reshaped -> {args.finaliser_out}") + print(f" -> {len(clarify)} held for client -> {args.clarify_out}") + for reason in _REASON_ORDER: + print(f" {reason}: {counts.get(reason, 0)}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/lisasrequest/review_flags.py b/scripts/lisasrequest/review_flags.py new file mode 100644 index 00000000..4bf710e2 --- /dev/null +++ b/scripts/lisasrequest/review_flags.py @@ -0,0 +1,135 @@ +"""Flag step-1 matches that need a human eye, for review before finalising. + +Reads durkan_domna_filled.csv (the step-1 output) and writes a review CSV of +only the rows carrying at least one flag, newest-doubt-first: + + not_found no UPRN resolved at all. + unit_not_in_match the input flat/house number does NOT appear in the matched + address — the high-precision "wrong property" signal. Two + shapes: a near-miss ("9 VANBRUGH" matched "9A, VANBRUGH") + or a flat collapsing onto its building ("FLAT 1, 20 WARWICK" + matched "20, WARWICK ROAD"). + dup_uprn the same UPRN was resolved for >1 input row — typically a + block of flats all collapsing onto the building UPRN; all + but one will be dropped at finalise. + low_score lexiscore < 0.70 (a weak match, just over the OS bar). NOTE: + on its own this is noisy — truncated EPC addresses and extra + locality tokens push correct matches below 0.70. Treat it as + informational unless paired with one of the flags above. 
# Repository root, three directories above this module. Chained ``.parent``
# names the same directory as ``parents[2]`` but cannot raise IndexError when
# the module is loaded from a shallow path.
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent

# Column names of the step-1 CSV.
ADDRESS_COL = "address"
POSTCODE_COL = "postcode"
FOUND_ADDRESS_COL = "domna_address_found"
FOUND_UPRN_COL = "domna_address_uprn"
LEXISCORE_COL = "domna_lexiscore"
SOURCE_COL = "domna_source"
# Scores strictly below this are flagged ``low_score``.
LOW_SCORE = 0.70

_DEFAULT_IN = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_domna_filled.csv"
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_review_flags.csv"

_REVIEW_COLS = [
    ADDRESS_COL,
    POSTCODE_COL,
    FOUND_ADDRESS_COL,
    FOUND_UPRN_COL,
    LEXISCORE_COL,
    SOURCE_COL,
    "flags",
]

# Flag names in review-priority order: not_found first, then mismatches, then
# dup/low. Used both to sort the review file and to print the summary, so the
# two can never disagree.
_FLAG_ORDER = ("not_found", "unit_not_in_match", "dup_uprn", "low_score")

# Compiled once; all three are applied to upper-cased text.
_FLAT_UNIT_RE = re.compile(r"\bFLAT\s+(\d+[A-Z]?)")
_LEADING_UNIT_RE = re.compile(r"\s*(\d+[A-Z]?)\b")
_NUMBER_TOKEN_RE = re.compile(r"\b\d+[A-Z]?\b")


def input_unit(address: str) -> str:
    """Return the salient unit number of an input address, upper-cased.

    That is the number following ``FLAT`` when there is one (``"Flat 3a, 20
    Warwick Road"`` -> ``"3A"``), otherwise the leading house number
    (``"20 Warwick Road"`` -> ``"20"``), otherwise ``""``. A missing (None)
    address is treated as empty.
    """
    upper = (address or "").upper()
    flat = _FLAT_UNIT_RE.search(upper)
    if flat:
        return flat.group(1)
    lead = _LEADING_UNIT_RE.match(upper)
    return lead.group(1) if lead else ""


def address_numbers(address: str) -> set[str]:
    """Return every standalone number token in an address, upper-cased.

    A token is a run of digits with an optional single trailing letter, e.g.
    ``"Flat 3, 20 Warwick Rd"`` -> ``{"3", "20"}``. A missing (None) address
    yields the empty set.
    """
    return set(_NUMBER_TOKEN_RE.findall((address or "").upper()))


def _score(value: str) -> float:
    """Parse a lexiscore cell; anything that is not a real number counts as 0.0.

    Empty, None and non-numeric cells parse to 0.0, and so does NaN: every
    ``<`` comparison with NaN is False, so a NaN score would otherwise slip
    past the ``low_score`` check as if it were a strong match.
    """
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0.0
    # NaN is the only float that is not equal to itself.
    return score if score == score else 0.0


def flag_rows(rows: list[dict[str, str]]) -> list[dict[str, str]]:
    """Return the rows that need review, each with a ';'-joined ``flags`` field.

    A row with no resolved UPRN (or source ``not_found``) gets only
    ``not_found``. Any other row may collect, in this order:
    ``unit_not_in_match`` (its unit number is absent from the matched address),
    ``dup_uprn`` (its UPRN was resolved for more than one row) and
    ``low_score`` (score below ``LOW_SCORE``). Rows without flags are dropped.
    The result holds only the ``_REVIEW_COLS`` columns and is sorted by each
    row's first flag, following ``_FLAG_ORDER``.

    Cells that are None (``csv.DictReader`` fills short rows with None) are
    treated as empty rather than raising.
    """
    uprn_counts = Counter(
        uprn for uprn in (r.get(FOUND_UPRN_COL) for r in rows) if uprn
    )
    rank = {name: position for position, name in enumerate(_FLAG_ORDER)}

    flagged: list[dict[str, str]] = []
    for row in rows:
        uprn = row.get(FOUND_UPRN_COL) or ""
        flags: list[str] = []

        if row.get(SOURCE_COL) == "not_found" or not uprn:
            flags.append("not_found")
        else:
            unit = input_unit(row.get(ADDRESS_COL) or "")
            if unit and unit not in address_numbers(row.get(FOUND_ADDRESS_COL) or ""):
                flags.append("unit_not_in_match")
            if uprn_counts[uprn] > 1:
                flags.append("dup_uprn")
            if _score(row.get(LEXISCORE_COL, "")) < LOW_SCORE:
                flags.append("low_score")

        if flags:
            review_row = {col: row.get(col, "") for col in _REVIEW_COLS[:-1]}
            review_row["flags"] = ";".join(flags)
            flagged.append(review_row)

    # Stable sort: rows sharing a first flag keep their input order.
    flagged.sort(key=lambda r: rank.get(r["flags"].split(";")[0], len(rank)))
    return flagged


def main() -> int:
    """CLI entry point: read the step-1 CSV, write the review CSV, print per-flag counts."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    parser.add_argument("--out", type=Path, default=_DEFAULT_OUT)
    args = parser.parse_args()

    # utf-8-sig: a leading BOM must not end up inside the first header name.
    with args.inp.open(newline="", encoding="utf-8-sig") as fh:
        rows = [dict(r) for r in csv.DictReader(fh)]

    flagged = flag_rows(rows)
    with args.out.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=_REVIEW_COLS, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(flagged)

    # One row can carry several flags, so the counts may sum to more than the
    # number of flagged rows.
    counts = Counter(f for r in flagged for f in r["flags"].split(";"))
    print(f"{len(flagged)}/{len(rows)} rows flagged for review -> {args.out}")
    for name in _FLAG_ORDER:
        print(f" {name}: {counts.get(name, 0)}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Whichever source +wins, the result is written into the SAME three columns the finaliser reads +(`bulk_upload_finaliser_orchestrator`): + + address2uprn_uprn UPRN integer (empty when unresolved) + address2uprn_address the matched address + address2uprn_lexiscore the match score in [0, 1] + +A `resolution_source` diagnostic column (epc / epc_historic / ordnance_survey / +none) is appended too — the finaliser ignores unknown columns. All original +columns are preserved in their original order, so the output CSV drops straight +into the finaliser. + + python -m scripts.resolve_uprns_for_finaliser input.csv -o resolved.csv + + # OS-only / EPC-only, custom postcode column, custom OS score threshold + python -m scripts.resolve_uprns_for_finaliser in.csv -o out.csv --no-epc + python -m scripts.resolve_uprns_for_finaliser in.csv -o out.csv --postcode-col Postcode --os-threshold 0.6 + +Keys are read from backend/.env: OPEN_EPC_API_TOKEN (EPC) and +ORDNANCE_SURVEY_API_KEY (OS Places). Run from the worktree root (import trap). + +The module-level functions (`load_keys`, `read_rows`, `resolve_row`, `process`, +`write_rows`) are written to be driven line-by-line from a REPL as well as via +the CLI. +""" + +from __future__ import annotations + +import argparse +import csv +import os +import sys +from pathlib import Path +from typing import Optional + +import pandas as pd +from dotenv import load_dotenv + +_REPO_ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap + +from backend.address2UPRN.main import ( # noqa: E402 + get_epc_data_with_postcode, + get_uprn_from_historic_epc, + get_uprn_with_epc_df, +) +from backend.ordnanceSurvey.helpers import ( # noqa: E402 + lookup_os_places, + os_places_results_to_dataframe, +) +from backend.utils.addressMatch import AddressMatch # noqa: E402 + +# Columns the finaliser reads (bulk_upload_finaliser_orchestrator). 
UPRN_COL = "address2uprn_uprn"
MATCHED_ADDRESS_COL = "address2uprn_address"
LEXISCORE_COL = "address2uprn_lexiscore"
SOURCE_COL = "resolution_source"
_RESULT_COLS = (UPRN_COL, MATCHED_ADDRESS_COL, LEXISCORE_COL, SOURCE_COL)

# A resolved hit: (uprn, matched_address, lexiscore, source).
Resolution = tuple[str, str, float, str]


def load_keys() -> tuple[Optional[str], Optional[str]]:
    """Load (epc_token, os_api_key) from backend/.env (and the process env).

    Either element is None when its variable (OPEN_EPC_API_TOKEN /
    ORDNANCE_SURVEY_API_KEY) is not set.
    """
    load_dotenv(_REPO_ROOT / "backend" / ".env")
    epc_token = os.environ.get("OPEN_EPC_API_TOKEN")
    os_api_key = os.environ.get("ORDNANCE_SURVEY_API_KEY")
    return epc_token, os_api_key


def read_rows(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Read a CSV into (rows, fieldnames). Preserves column order."""
    # utf-8-sig: a leading BOM must not end up inside the first header name.
    with path.open(newline="", encoding="utf-8-sig") as fh:
        reader = csv.DictReader(fh)
        fieldnames = list(reader.fieldnames or [])
        rows = [dict(row) for row in reader]
    return rows, fieldnames


def clean_postcode(postcode: str) -> str:
    """Sanitise to the no-space upper form the EPC/OS lookups expect (e.g. E84SQ).

    Every whitespace character is removed, not only ASCII spaces, so a tab or
    a non-breaking space inside a pasted postcode does not survive into the
    lookup key.
    """
    return "".join(postcode.split()).upper()


def build_address(row: dict[str, str]) -> str:
    """Concatenate Address 1/2/3 the same way the address2uprn lambda does.

    Each part is stripped (missing or None parts count as empty), the parts
    are joined with single spaces and the result is stripped again.
    """
    return " ".join(
        str(row.get(col, "") or "").strip() for col in ("Address 1", "Address 2", "Address 3")
    ).strip()


def resolve_epc(
    address: str, postcode_clean: str, epc_cache: dict[str, pd.DataFrame]
) -> Optional[Resolution]:
    """Resolve via the new EPC API (cached per postcode), then historic EPC S3.

    `epc_cache` is mutated to memoise one EPC API call per postcode — pass the
    same dict across rows so a postcode is only fetched once. Returns None when
    neither source produces a match.
    """
    epc_df = epc_cache.get(postcode_clean)
    if epc_df is None:
        epc_df = get_epc_data_with_postcode(postcode=postcode_clean)
        epc_cache[postcode_clean] = epc_df

    result = get_uprn_with_epc_df(
        user_inputed_address=address, epc_df=epc_df, verbose=True
    )
    # Only a tuple result is treated as a hit; anything else falls through to
    # the historic dataset.
    if isinstance(result, tuple):
        uprn, matched, score = result
        return str(uprn), str(matched), float(score), "epc"

    historic = get_uprn_from_historic_epc(
        user_inputed_address=address, postcode=postcode_clean
    )
    if historic is not None:
        uprn, matched, score = historic
        return str(uprn), str(matched), float(score), "epc_historic"

    return None


def _uprn_text(value: object) -> str:
    """Render a UPRN cell as plain text; "" when the cell is missing (None / NaN).

    A whole-number float is written without its ".0", because a column that
    contains a gap is typically float-typed by pandas.
    """
    if value is None:
        return ""
    if isinstance(value, float):
        if value != value:  # NaN is the only float not equal to itself
            return ""
        if value.is_integer():
            return str(int(value))
    return str(value).strip()


def resolve_os(
    address: str,
    postcode_clean: str,
    os_api_key: str,
    os_cache: dict[str, pd.DataFrame],
    threshold: float,
) -> Optional[Resolution]:
    """Resolve via the OS Places API: best-scoring address at or above `threshold`.

    `os_cache` memoises one OS Places call per postcode; a failed call is
    memoised as an empty frame, so it is not retried for later rows. Candidates
    without a UPRN are skipped: a hit with no UPRN cannot be used downstream
    and would otherwise be reported as resolved. Of equally scored candidates
    the first one wins.
    """
    places_df = os_cache.get(postcode_clean)
    if places_df is None:
        response = lookup_os_places(postcode_clean, os_api_key)
        if response.get("status") != 200 or "data" not in response:
            places_df = pd.DataFrame()
        else:
            places_df = os_places_results_to_dataframe(response["data"])
        os_cache[postcode_clean] = places_df

    if places_df.empty or "ADDRESS" not in places_df.columns:
        return None

    # Iterate plain records — avoids pandas' partially-unknown indexing types.
    records: list[dict[str, object]] = places_df.to_dict(orient="records")
    best: Optional[Resolution] = None
    for rec in records:
        uprn = _uprn_text(rec.get("UPRN"))
        if not uprn:
            continue
        candidate = str(rec.get("ADDRESS", ""))
        score = AddressMatch.score(address, candidate)
        if score >= threshold and (best is None or score > best[2]):
            best = (uprn, candidate, score, "ordnance_survey")
    return best


def _postcode_is_valid(postcode_clean: str, cache: Optional[dict[str, bool]]) -> bool:
    """Validate a postcode, asking the validator at most once per postcode.

    Verdicts are memoised in `cache` when one is supplied. If the validator
    itself raises, the error is reported and the postcode is let through
    uncached: the resolvers then decide, and a later row retries the check.
    """
    if cache is not None and postcode_clean in cache:
        return cache[postcode_clean]
    try:
        valid = bool(AddressMatch.is_valid_postcode(postcode_clean))
    except Exception as exc:  # a validator failure must not abort the batch
        print(f" Postcode validation failed for {postcode_clean}: {exc}")
        return True
    if cache is not None:
        cache[postcode_clean] = valid
    return valid


def resolve_row(
    row: dict[str, str],
    *,
    epc_token: Optional[str],
    os_api_key: Optional[str],
    epc_cache: dict[str, pd.DataFrame],
    os_cache: dict[str, pd.DataFrame],
    postcode_col: str,
    use_epc: bool,
    use_os: bool,
    os_threshold: float,
    validate_postcode: bool,
    postcode_validity: Optional[dict[str, bool]] = None,
) -> dict[str, str]:
    """Resolve one row in place and return it with the finaliser columns filled.

    Tries EPC (new + historic) first, then OS Places. On no match the three
    result columns are written empty and `resolution_source` is "none". The
    same happens, without any lookup, when the address or postcode is empty or
    the postcode fails validation.

    A resolver runs only when its `use_*` flag is set and its key is present.
    An exception raised by a resolver is printed and treated as "no match from
    that resolver". Pass one `postcode_validity` dict across rows so each
    postcode is validated only once.
    """
    address = build_address(row)
    postcode_clean = clean_postcode(str(row.get(postcode_col, "") or ""))

    def write(res: Optional[Resolution]) -> dict[str, str]:
        if res is None:
            row[UPRN_COL] = ""
            row[MATCHED_ADDRESS_COL] = ""
            row[LEXISCORE_COL] = ""
            row[SOURCE_COL] = "none"
        else:
            uprn, matched, score, source = res
            row[UPRN_COL] = uprn
            row[MATCHED_ADDRESS_COL] = matched
            row[LEXISCORE_COL] = str(score)
            row[SOURCE_COL] = source
        return row

    if not address or not postcode_clean:
        return write(None)

    if validate_postcode and not _postcode_is_valid(postcode_clean, postcode_validity):
        return write(None)

    if use_epc and epc_token:
        try:
            res = resolve_epc(address, postcode_clean, epc_cache)
            if res is not None:
                return write(res)
        except Exception as exc:  # keep going on a per-row API/lookup failure
            print(f" EPC lookup failed for {address!r} / {postcode_clean}: {exc}")

    if use_os and os_api_key:
        try:
            res = resolve_os(address, postcode_clean, os_api_key, os_cache, os_threshold)
            if res is not None:
                return write(res)
        except Exception as exc:  # keep going on a per-row API/lookup failure
            print(f" OS lookup failed for {address!r} / {postcode_clean}: {exc}")

    return write(None)


def process(
    rows: list[dict[str, str]],
    *,
    epc_token: Optional[str],
    os_api_key: Optional[str],
    postcode_col: str = "postcode",
    use_epc: bool = True,
    use_os: bool = True,
    os_threshold: float = 0.5,
    validate_postcode: bool = True,
) -> list[dict[str, str]]:
    """Resolve every row, printing a per-row line so REPL/CLI progress is visible.

    The rows are updated in place and the same list is returned. The EPC, OS
    and postcode-validity caches live for the duration of this call.
    """
    epc_cache: dict[str, pd.DataFrame] = {}
    os_cache: dict[str, pd.DataFrame] = {}
    postcode_validity: dict[str, bool] = {}
    for i, row in enumerate(rows, start=1):
        resolve_row(
            row,
            epc_token=epc_token,
            os_api_key=os_api_key,
            epc_cache=epc_cache,
            os_cache=os_cache,
            postcode_col=postcode_col,
            use_epc=use_epc,
            use_os=use_os,
            os_threshold=os_threshold,
            validate_postcode=validate_postcode,
            postcode_validity=postcode_validity,
        )
        print(
            f"[{i}/{len(rows)}] {build_address(row)!r} -> "
            f"{row[UPRN_COL] or '(no match)'} ({row[SOURCE_COL]})"
        )
    return rows


def write_rows(rows: list[dict[str, str]], path: Path, fieldnames: list[str]) -> None:
    """Write rows to CSV, preserving input columns and appending the result columns.

    A result column already present in `fieldnames` keeps its position.
    """
    out_fields = list(fieldnames)
    for col in _RESULT_COLS:
        if col not in out_fields:
            out_fields.append(col)
    with path.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=out_fields, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)


def _parse_args() -> argparse.Namespace:
    """Parse the command line (input CSV, output CSV and resolver switches)."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input", type=Path, help="input CSV (Address 1/2/3 + postcode)")
    parser.add_argument(
        "-o", "--out", type=Path, required=True, help="output CSV for the finaliser"
    )
    parser.add_argument("--postcode-col", default="postcode", help="postcode column name")
    parser.add_argument("--no-epc", action="store_true", help="skip EPC resolution")
    parser.add_argument("--no-os", action="store_true", help="skip Ordnance Survey fallback")
    parser.add_argument(
        "--os-threshold", type=float, default=0.5, help="min OS match score (default 0.5)"
    )
    parser.add_argument(
        "--no-validate-postcode",
        action="store_true",
        help="skip the postcodes.io validity check (one HTTP call per postcode)",
    )
    parser.add_argument("--limit", type=int, default=None, help="process only the first N rows")
    return parser.parse_args()


def main() -> int:
    """CLI entry point.

    Returns 2 without reading the input when no resolver is usable (keys
    missing or both --no-* flags given); otherwise resolves the rows, writes
    the output CSV and returns 0.
    """
    args = _parse_args()
    epc_token, os_api_key = load_keys()

    use_epc = not args.no_epc
    use_os = not args.no_os
    if use_epc and not epc_token:
        print("OPEN_EPC_API_TOKEN not set (backend/.env) — EPC resolution disabled")
        use_epc = False
    if use_os and not os_api_key:
        print("ORDNANCE_SURVEY_API_KEY not set (backend/.env) — OS fallback disabled")
        use_os = False
    if not use_epc and not use_os:
        print("No resolver enabled (missing keys or both --no-* flags). Nothing to do.")
        return 2

    rows, fieldnames = read_rows(args.input)
    if args.limit is not None:
        rows = rows[: args.limit]
    print(f"Loaded {len(rows)} rows from {args.input}")

    process(
        rows,
        epc_token=epc_token,
        os_api_key=os_api_key,
        postcode_col=args.postcode_col,
        use_epc=use_epc,
        use_os=use_os,
        os_threshold=args.os_threshold,
        validate_postcode=not args.no_validate_postcode,
    )

    write_rows(rows, args.out, fieldnames)
    matched = sum(1 for r in rows if r.get(UPRN_COL))
    print(f"\nResolved {matched}/{len(rows)} rows. Wrote {args.out}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
"""Column wiring of the landlord-description-overrides handler (`_build_columns`).

For a `column_mapping` entry of ``{category -> source header}`` the factory must
return exactly one column, named after the category and reading the given
header. There is one test per category wired in by this change.
"""

from __future__ import annotations

from typing import cast

from applications.landlord_description_overrides.handler import _build_columns  # pyright: ignore[reportPrivateUsage]
from infrastructure.chatgpt.chatgpt import ChatGPT


def _wired(category: str, header: str) -> list[tuple[str, str]]:
    """Build the columns for one mapping entry; return their (name, source_column) pairs."""
    # The factory only stores the injected collaborators, so a bare object
    # stands in for the (I/O-bound) ChatGPT client and None for the DB session.
    stand_in_client = cast(ChatGPT, object())
    built = _build_columns({category: header}, stand_in_client, None)
    return [(column.name, column.source_column) for column in built]


def test_build_columns_wires_a_main_fuel_classifier_column() -> None:
    # Exactly one column, named main_fuel, reading the "Main Fuel" header.
    assert _wired("main_fuel", "Main Fuel") == [("main_fuel", "Main Fuel")]


def test_build_columns_wires_a_glazing_classifier_column() -> None:
    # Exactly one column, named glazing, reading the "Glazing" header.
    assert _wired("glazing", "Glazing") == [("glazing", "Glazing")]


def test_build_columns_wires_a_construction_age_band_classifier_column() -> None:
    # Exactly one column, named construction_age_band, reading the "Age" header.
    assert _wired("construction_age_band", "Age") == [("construction_age_band", "Age")]


def test_build_columns_wires_a_water_heating_classifier_column() -> None:
    # Exactly one column, named water_heating, reading the "Hot Water" header.
    assert _wired("water_heating", "Hot Water") == [("water_heating", "Hot Water")]


def test_build_columns_wires_a_main_heating_system_classifier_column() -> None:
    # Exactly one column, named main_heating_system, reading the "Heating" header.
    assert _wired("main_heating_system", "Heating") == [("main_heating_system", "Heating")]
"""The Landlord-Override construction-age-band → fabric Simulation Overlay.

An age-band value resolves to the RdSAP letter code the calculator's U-value
cascades read from `SapBuildingPart.construction_age_band`; the overlay targets
the override's building part.
"""

from __future__ import annotations

import pytest

from datatypes.epc.domain.epc_property_data import (
    BuildingPartIdentifier,
    EpcPropertyData,
    SapBuildingPart,
)
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
from domain.epc.property_overlays.construction_age_band_overlay import (
    age_band_overlay_for,
)
from domain.modelling.scoring.overlay_applicator import apply_simulations
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
    build_epc,
)

MAIN = BuildingPartIdentifier.MAIN
EXTENSION_1 = BuildingPartIdentifier.EXTENSION_1

# Every band a classifier can emit, apart from the UNKNOWN sentinel.
_RESOLVABLE_BANDS = [
    band for band in ConstructionAgeBand if band is not ConstructionAgeBand.UNKNOWN
]


def _part(
    epc: EpcPropertyData, identifier: BuildingPartIdentifier
) -> SapBuildingPart:
    """Return the first building part of ``epc`` carrying ``identifier``."""
    return next(filter(lambda part: part.identifier is identifier, epc.sap_building_parts))


def test_age_band_overlays_the_main_building_part() -> None:
    # Band B (1900-1929) requested for building part 0, the main building.
    simulation = age_band_overlay_for("B", 0)

    assert simulation is not None
    assert simulation.building_parts[MAIN].construction_age_band == "B"


def test_age_band_overlay_targets_the_extension_building_part() -> None:
    # building_part 1 is the first extension.
    simulation = age_band_overlay_for("L", 1)

    assert simulation is not None
    targeted = simulation.building_parts
    assert EXTENSION_1 in targeted
    assert targeted[EXTENSION_1].construction_age_band == "L"


def test_lowercase_age_band_is_normalised_to_its_letter_code() -> None:
    simulation = age_band_overlay_for("d", 0)

    # The calculator upper-cases the band; the overlay stores it upper-cased.
    assert simulation is not None
    assert simulation.building_parts[MAIN].construction_age_band == "D"


@pytest.mark.parametrize("age_band_value", ["Z", "", "1900-1929", "Unknown"])
def test_unrecognised_age_band_produces_no_overlay(age_band_value: str) -> None:
    assert age_band_overlay_for(age_band_value, 0) is None


def test_age_band_override_re_dates_the_main_part_only() -> None:
    # Baseline main + extension are both band B; the landlord corrects the main
    # building's age band to F (1976-1982).
    baseline = build_epc()
    correction = age_band_overlay_for("F", 0)
    assert correction is not None

    corrected = apply_simulations(baseline, [correction])

    # The main part is re-dated (its U-value cascade now keys on F); the
    # extension is left untouched.
    assert _part(corrected, MAIN).construction_age_band == "F"
    assert _part(corrected, EXTENSION_1).construction_age_band == "B"


@pytest.mark.parametrize("member", _RESOLVABLE_BANDS)
def test_every_resolvable_age_band_value_decodes_to_an_overlay(
    member: ConstructionAgeBand,
) -> None:
    # A classifier emits a ConstructionAgeBand value; if the overlay can't
    # decode it the override silently no-ops. Every non-UNKNOWN member must
    # resolve.
    assert age_band_overlay_for(member.value, 0) is not None
"""The Landlord-Override glazing → glazing Simulation Overlay mapping.

A glazing value resolves to the SAP10 `glazing_type` code the calculator's
Table-24 cascade reads; the overlay is whole-dwelling (expanded across every
window by `_fold_glazing`).
"""

from __future__ import annotations

import pytest

from domain.epc.property_overrides.glazing_type import GlazingType
from domain.epc.property_overlays.glazing_overlay import glazing_overlay_for
from domain.modelling.scoring.overlay_applicator import apply_simulations
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
    build_epc,
)

# Every glazing value a classifier can emit, apart from the UNKNOWN sentinel.
_RESOLVABLE_GLAZING = [
    glazing for glazing in GlazingType if glazing is not GlazingType.UNKNOWN
]


def _decoded_glazing_code(glazing_value: str) -> object:
    """Return the `glazing_type` code the overlay for ``glazing_value`` carries."""
    simulation = glazing_overlay_for(glazing_value, 0)
    assert simulation is not None
    assert simulation.glazing is not None
    return simulation.glazing.glazing_type


def test_double_glazing_post_2002_overlays_its_glazing_code() -> None:
    # Double glazing 2002-2021 is SAP10 glazing_type code 2.
    assert _decoded_glazing_code("Double glazing, 2002 or later") == 2


@pytest.mark.parametrize(
    ("glazing_value", "code"),
    [
        ("Single glazing", 1),
        ("Double glazing, pre-2002", 3),
        ("Triple glazing, 2002 or later", 9),
        ("Triple glazing, pre-2002", 6),
    ],
)
def test_glazing_types_decode_to_their_sap_codes(
    glazing_value: str, code: int
) -> None:
    assert _decoded_glazing_code(glazing_value) == code


@pytest.mark.parametrize("glazing_value", ["Unknown", ""])
def test_unresolvable_glazing_produces_no_overlay(glazing_value: str) -> None:
    assert glazing_overlay_for(glazing_value, 0) is None


def test_glazing_override_remaps_every_window_and_clears_lodged_u() -> None:
    # Baseline windows are double glazed (code 2, lodged U 2.8); the landlord
    # corrects the whole dwelling to single glazing.
    baseline = build_epc()
    assert len(baseline.sap_windows) > 1
    correction = glazing_overlay_for("Single glazing", 0)
    assert correction is not None

    windows = apply_simulations(baseline, [correction]).sap_windows

    # Every window flips to single (code 1) and its lodged transmission U is
    # cleared so the Table-24 cascade re-derives U from the new type.
    assert [window.glazing_type for window in windows] == [1] * len(windows)
    assert not [w for w in windows if w.window_transmission_details is not None]


@pytest.mark.parametrize("member", _RESOLVABLE_GLAZING)
def test_every_resolvable_glazing_value_decodes_to_a_code(
    member: GlazingType,
) -> None:
    # A classifier emits a GlazingType value; if the overlay can't decode it
    # the override silently no-ops. Every non-UNKNOWN member must resolve.
    assert glazing_overlay_for(member.value, 0) is not None
"""The Landlord-Override main-fuel → heating Simulation Overlay mapping.

A main-fuel value resolves to the RdSAP `main_fuel_type` int code the calculator
reads from the dwelling's primary heating system; the overlay is whole-dwelling.
"""

from __future__ import annotations

import pytest

from domain.epc.property_overrides.main_fuel_type import MainFuelType
from domain.epc.property_overlays.main_fuel_overlay import fuel_overlay_for
from domain.modelling.scoring.overlay_applicator import apply_simulations
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
    build_epc,
)

# Every fuel a classifier can emit, apart from the UNKNOWN sentinel.
_RESOLVABLE_FUELS = [fuel for fuel in MainFuelType if fuel is not MainFuelType.UNKNOWN]


def _decoded_fuel_code(main_fuel_value: str) -> object:
    """Return the `main_fuel_type` code the overlay for ``main_fuel_value`` carries."""
    simulation = fuel_overlay_for(main_fuel_value, 0)
    assert simulation is not None
    assert simulation.heating is not None
    return simulation.heating.main_fuel_type


def test_mains_gas_overlays_the_primary_fuel() -> None:
    # Mains gas (not community) is RdSAP main_fuel code 26.
    assert _decoded_fuel_code("mains gas") == 26


@pytest.mark.parametrize(
    ("main_fuel_value", "code"),
    [
        ("electricity", 29),
        ("LPG (bulk)", 27),
        ("oil", 28),
        ("house coal", 33),
    ],
)
def test_fuels_decode_to_their_modern_not_community_codes(
    main_fuel_value: str, code: int
) -> None:
    assert _decoded_fuel_code(main_fuel_value) == code


@pytest.mark.parametrize(
    ("main_fuel_value", "code"),
    [
        ("bottled LPG", 3),
        ("LPG special condition", 17),
        ("electricity (community)", 25),
        ("biomass (community)", 31),
        ("dual fuel (mineral and wood)", 10),
        ("smokeless coal", 15),
    ],
)
def test_more_fuels_decode_to_their_codes(main_fuel_value: str, code: int) -> None:
    assert _decoded_fuel_code(main_fuel_value) == code


def test_community_mains_gas_is_a_distinct_fuel_code() -> None:
    # Community mains gas is code 20, distinct from 26 (not community).
    assert _decoded_fuel_code("mains gas (community)") == 20


@pytest.mark.parametrize("main_fuel_value", ["Unknown", "", "no heating or hot water"])
def test_unresolvable_fuel_produces_no_overlay(main_fuel_value: str) -> None:
    assert fuel_overlay_for(main_fuel_value, 0) is None


def test_fuel_override_remaps_the_primary_systems_fuel_on_the_epc() -> None:
    # A landlord correction that the dwelling runs on electricity.
    baseline = build_epc()
    correction = fuel_overlay_for("electricity", 0)
    assert correction is not None

    corrected = apply_simulations(baseline, [correction])

    # The calculator reads the primary fuel from main_heating_details[0].
    primary_system = corrected.sap_heating.main_heating_details[0]
    assert primary_system.main_fuel_type == 29


@pytest.mark.parametrize("member", _RESOLVABLE_FUELS)
def test_every_resolvable_fuel_value_decodes_to_a_code(member: MainFuelType) -> None:
    # A classifier emits a MainFuelType value; if the overlay can't decode it
    # the override silently no-ops. Every non-UNKNOWN member must resolve.
    assert fuel_overlay_for(member.value, 0) is not None
+""" + +from __future__ import annotations + +import pytest + +from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType +from domain.epc.property_overlays.main_fuel_overlay import fuel_overlay_for +from domain.epc.property_overlays.main_heating_system_overlay import ( + main_heating_overlay_for, +) +from domain.epc.property_overlays.water_heating_overlay import ( + water_heating_overlay_for, +) +from domain.modelling.scoring.overlay_applicator import apply_simulations +from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import ( + build_epc, +) + + +def test_gas_combi_overlays_the_primary_heating_code() -> None: + # Act + simulation = main_heating_overlay_for("Gas boiler, combi", 0) + + # Assert — condensing combi is SAP Table 4b code 104. + assert simulation is not None + assert simulation.heating is not None + assert simulation.heating.sap_main_heating_code == 104 + + +@pytest.mark.parametrize( + ("main_heating_value", "code"), + [ + ("Gas boiler, regular", 102), + ("Gas CPSU", 120), + ("Electric storage heaters, fan", 404), + ("Direct-acting electric", 191), + ], +) +def test_heating_archetypes_decode_to_their_sap_codes( + main_heating_value: str, code: int +) -> None: + # Act + simulation = main_heating_overlay_for(main_heating_value, 0) + + # Assert + assert simulation is not None + assert simulation.heating is not None + assert simulation.heating.sap_main_heating_code == code + + +@pytest.mark.parametrize( + ("main_heating_value", "code"), + [ + ("Electric storage heaters, old", 401), + ("Electric storage heaters, slimline", 402), + ("Electric storage heaters, convector", 403), + ], +) +def test_storage_heater_subtypes_decode_to_their_codes( + main_heating_value: str, code: int +) -> None: + # Act + simulation = main_heating_overlay_for(main_heating_value, 0) + + # Assert + assert simulation is not None + assert simulation.heating is not None + assert simulation.heating.sap_main_heating_code == code + + 
+@pytest.mark.parametrize( + "main_heating_value", + ["Unknown", "", "Air source heat pump", "Community heating"], +) +def test_unresolvable_or_unmodelled_heating_produces_no_overlay( + main_heating_value: str, +) -> None: + # Heat pumps (main_heating_index_number) and community heating (community + # codes) don't map to a Table 4b sap_main_heating_code yet — no overlay. + + # Act + simulation = main_heating_overlay_for(main_heating_value, 0) + + # Assert + assert simulation is None + + +def test_main_heating_override_remaps_the_primary_system_code() -> None: + # Arrange + baseline = build_epc() + overlay = main_heating_overlay_for("Gas boiler, regular", 0) + assert overlay is not None + + # Act + result = apply_simulations(baseline, [overlay]) + + # Assert — the calculator reads the code off main_heating_details[0]. + assert result.sap_heating.main_heating_details[0].sap_main_heating_code == 102 + + +def test_the_three_heating_overrides_compose_without_conflict() -> None: + # Arrange — main_fuel, water_heating and main_heating_system all fold onto one + # HeatingOverlay surface but set DISJOINT fields, so they compose (the + # field-disjoint design that makes precedence moot for these three). + baseline = build_epc() + overlays = [ + fuel_overlay_for("electricity", 0), + water_heating_overlay_for("Electric immersion, electricity", 0), + main_heating_overlay_for("Electric storage heaters, fan", 0), + ] + assert all(o is not None for o in overlays) + + # Act + result = apply_simulations(baseline, [o for o in overlays if o is not None]) + + # Assert — each override landed on its own field. 
+ main = result.sap_heating.main_heating_details[0] + assert main.main_fuel_type == 29 + assert main.sap_main_heating_code == 404 + assert result.sap_heating.water_heating_code == 903 + assert result.sap_heating.water_heating_fuel == 29 + + +@pytest.mark.parametrize( + "member", + [m for m in MainHeatingSystemType if m is not MainHeatingSystemType.UNKNOWN], +) +def test_every_resolvable_main_heating_value_decodes( + member: MainHeatingSystemType, +) -> None: + # Act + simulation = main_heating_overlay_for(member.value, 0) + + # Assert + assert simulation is not None diff --git a/tests/domain/epc/test_override_code_mapping.py b/tests/domain/epc/test_override_code_mapping.py index b22ad0a5..5e462e08 100644 --- a/tests/domain/epc/test_override_code_mapping.py +++ b/tests/domain/epc/test_override_code_mapping.py @@ -12,12 +12,12 @@ from typing import Optional import pytest -from domain.epc.built_form_type import BuiltFormType -from domain.epc.override_code_mapping import ( +from domain.epc.property_overrides.built_form_type import BuiltFormType +from domain.epc.property_overrides.override_code_mapping import ( built_form_to_code, property_type_to_code, ) -from domain.epc.property_type import PropertyType +from domain.epc.property_overrides.property_type import PropertyType def test_house_maps_to_gov_code_zero() -> None: diff --git a/tests/domain/epc/test_water_heating_overlay.py b/tests/domain/epc/test_water_heating_overlay.py new file mode 100644 index 00000000..a1955aa7 --- /dev/null +++ b/tests/domain/epc/test_water_heating_overlay.py @@ -0,0 +1,111 @@ +"""The Landlord-Override water-heating → heating Simulation Overlay mapping. + +A water-heating value resolves to the SAP `water_heating_code` (system) and +`water_heating_fuel` the calculator reads; the overlay is whole-dwelling. 
+""" + +from __future__ import annotations + +import pytest + +from domain.epc.property_overlays.water_heating_overlay import ( + water_heating_overlay_for, +) +from domain.epc.property_overrides.water_heating_type import WaterHeatingType +from domain.modelling.scoring.overlay_applicator import apply_simulations +from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import ( + build_epc, +) + + +def test_from_main_system_mains_gas_overlays_water_heating() -> None: + # Act + simulation = water_heating_overlay_for("From main system, mains gas", 0) + + # Assert — "from main system" is water_heating_code 901, mains gas fuel 26. + assert simulation is not None + assert simulation.heating is not None + assert simulation.heating.water_heating_code == 901 + assert simulation.heating.water_heating_fuel == 26 + + +@pytest.mark.parametrize( + ("water_heating_value", "code", "fuel"), + [ + ("From main system, electricity", 901, 29), + ("Electric immersion, electricity", 903, 29), + ], +) +def test_water_heating_systems_decode_to_their_codes( + water_heating_value: str, code: int, fuel: int +) -> None: + # Act + simulation = water_heating_overlay_for(water_heating_value, 0) + + # Assert + assert simulation is not None + assert simulation.heating is not None + assert simulation.heating.water_heating_code == code + assert simulation.heating.water_heating_fuel == fuel + + +@pytest.mark.parametrize( + ("water_heating_value", "code", "fuel"), + [ + ("From main system, oil", 901, 28), + ("From main system, LPG (bulk)", 901, 27), + ("From main system, bottled LPG", 901, 3), + ("From main system, house coal", 901, 33), + # "boiler/circulator for water heating only" is SAP Table 4a code 911. 
+ ("Gas boiler/circulator, mains gas", 911, 26), + ], +) +def test_more_water_heating_combos_decode_to_their_codes( + water_heating_value: str, code: int, fuel: int +) -> None: + # Act + simulation = water_heating_overlay_for(water_heating_value, 0) + + # Assert + assert simulation is not None + assert simulation.heating is not None + assert simulation.heating.water_heating_code == code + assert simulation.heating.water_heating_fuel == fuel + + +@pytest.mark.parametrize("water_heating_value", ["Unknown", ""]) +def test_unresolvable_water_heating_produces_no_overlay( + water_heating_value: str, +) -> None: + # Act + simulation = water_heating_overlay_for(water_heating_value, 0) + + # Assert + assert simulation is None + + +def test_water_heating_override_remaps_the_hot_water_arrangement() -> None: + # Arrange — landlord correction: HW is a separate electric immersion. + baseline = build_epc() + overlay = water_heating_overlay_for("Electric immersion, electricity", 0) + assert overlay is not None + + # Act + result = apply_simulations(baseline, [overlay]) + + # Assert — the calculator reads these off sap_heating. 
+ assert result.sap_heating.water_heating_code == 903 + assert result.sap_heating.water_heating_fuel == 29 + + +@pytest.mark.parametrize( + "member", [m for m in WaterHeatingType if m is not WaterHeatingType.UNKNOWN] +) +def test_every_resolvable_water_heating_value_decodes( + member: WaterHeatingType, +) -> None: + # Act + simulation = water_heating_overlay_for(member.value, 0) + + # Assert + assert simulation is not None diff --git a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py index 425d2625..0070af49 100644 --- a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py +++ b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py @@ -5,8 +5,8 @@ from typing import Optional import pytest from domain.data_transformation.column_classifier import ClassificationError -from domain.epc.property_type import PropertyType -from domain.epc.wall_type import WallType +from domain.epc.property_overrides.property_type import PropertyType +from domain.epc.property_overrides.wall_type import WallType from infrastructure.chatgpt.chatgpt import ChatGPT from infrastructure.chatgpt.chatgpt_column_classifier import ( ChatGptColumnClassifier, diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index bf5b13ce..2339c615 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -4,9 +4,9 @@ from enum import Enum from typing import Any, Optional from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress -from domain.epc.built_form_type import BuiltFormType -from domain.epc.property_type import PropertyType -from domain.epc.wall_type import WallType +from domain.epc.property_overrides.built_form_type import BuiltFormType +from 
domain.epc.property_overrides.property_type import PropertyType +from domain.epc.property_overrides.wall_type import WallType from domain.postcode import Postcode from domain.data_transformation.column_classifier import ColumnClassifier from orchestration.classifiable_column import ClassifiableColumn diff --git a/tests/repositories/landlord_overrides/postgres/test_landlord_overrides_postgres_repository.py b/tests/repositories/landlord_overrides/postgres/test_landlord_overrides_postgres_repository.py index 1ce4e997..31ca2b17 100644 --- a/tests/repositories/landlord_overrides/postgres/test_landlord_overrides_postgres_repository.py +++ b/tests/repositories/landlord_overrides/postgres/test_landlord_overrides_postgres_repository.py @@ -25,8 +25,8 @@ import pytest from sqlalchemy import Engine, Table from sqlmodel import Session, SQLModel, select -from domain.epc.property_type import PropertyType -from domain.epc.wall_type import WallType +from domain.epc.property_overrides.property_type import PropertyType +from domain.epc.property_overrides.wall_type import WallType from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import ( LandlordOverridesRepository, ) diff --git a/tests/repositories/property/test_landlord_override_overlays.py b/tests/repositories/property/test_landlord_override_overlays.py index 20c79c1f..1f790aa8 100644 --- a/tests/repositories/property/test_landlord_override_overlays.py +++ b/tests/repositories/property/test_landlord_override_overlays.py @@ -47,6 +47,88 @@ def test_each_resolvable_component_produces_an_overlay() -> None: assert len(overlays) == 4 +def test_main_fuel_row_produces_a_heating_fuel_overlay() -> None: + # Arrange + overrides = ResolvedPropertyOverrides( + rows=(ResolvedPropertyOverride("main_fuel", 0, "mains gas"),) + ) + + # Act + overlays = overlays_from(overrides) + + # Assert + assert len(overlays) == 1 + assert overlays[0].heating is not None + assert overlays[0].heating.main_fuel_type == 26 + + +def 
test_glazing_row_produces_a_glazing_overlay() -> None: + # Arrange + overrides = ResolvedPropertyOverrides( + rows=(ResolvedPropertyOverride("glazing", 0, "Double glazing, 2002 or later"),) + ) + + # Act + overlays = overlays_from(overrides) + + # Assert + assert len(overlays) == 1 + assert overlays[0].glazing is not None + assert overlays[0].glazing.glazing_type == 2 + + +def test_construction_age_band_row_produces_a_building_part_overlay() -> None: + # Arrange + overrides = ResolvedPropertyOverrides( + rows=(ResolvedPropertyOverride("construction_age_band", 0, "B"),) + ) + + # Act + overlays = overlays_from(overrides) + + # Assert + assert len(overlays) == 1 + main = overlays[0].building_parts[BuildingPartIdentifier.MAIN] + assert main.construction_age_band == "B" + + +def test_water_heating_row_produces_a_heating_overlay() -> None: + # Arrange + overrides = ResolvedPropertyOverrides( + rows=( + ResolvedPropertyOverride( + "water_heating", 0, "From main system, mains gas" + ), + ) + ) + + # Act + overlays = overlays_from(overrides) + + # Assert + assert len(overlays) == 1 + assert overlays[0].heating is not None + assert overlays[0].heating.water_heating_code == 901 + assert overlays[0].heating.water_heating_fuel == 26 + + +def test_main_heating_system_row_produces_a_heating_overlay() -> None: + # Arrange + overrides = ResolvedPropertyOverrides( + rows=( + ResolvedPropertyOverride("main_heating_system", 0, "Gas boiler, combi"), + ) + ) + + # Act + overlays = overlays_from(overrides) + + # Assert + assert len(overlays) == 1 + assert overlays[0].heating is not None + assert overlays[0].heating.sap_main_heating_code == 104 + + def test_unresolvable_rows_are_skipped() -> None: # Arrange — an "Unknown" property type and an unmapped wall material. 
overrides = ResolvedPropertyOverrides( diff --git a/tests/repositories/property/test_override_component_consistency.py b/tests/repositories/property/test_override_component_consistency.py new file mode 100644 index 00000000..d3801b0d --- /dev/null +++ b/tests/repositories/property/test_override_component_consistency.py @@ -0,0 +1,35 @@ +"""Every override component must be wired through the WHOLE chain. + +The finaliser reader (`_ROW_TYPES`, component -> landlord table) and the overlay +registry (`_COMPONENT_OVERLAYS`, component -> overlay mapper) must cover exactly +the same set of components. If a component is classified + stored but has no +reader entry, the finaliser silently never writes its `property_overrides` rows; +if it has no overlay entry, the row never reaches the calculator. This guard +keeps the two registries in lock-step (it would have caught the missing +main_fuel / glazing / construction_age_band reader entries). +""" + +from __future__ import annotations + +from typing import cast + +from infrastructure.landlord_overrides.landlord_override_reader_postgres_repository import ( + _ROW_TYPES, # pyright: ignore[reportPrivateUsage] +) +from infrastructure.postgres.property_override_table import override_component_sa_enum +from repositories.property.landlord_override_overlays import ( + _COMPONENT_OVERLAYS, # pyright: ignore[reportPrivateUsage] +) + + +def test_reader_and_overlay_registries_cover_the_same_components() -> None: + # Assert + assert set(_ROW_TYPES) == set(_COMPONENT_OVERLAYS) + + +def test_override_component_pgenum_covers_every_component() -> None: + # The property_overrides.override_component pgEnum mirror must list every + # component, or writing/reading a new-component row through it throws a + # LookupError against Postgres (caught live on the Hyde portfolio-796 run). 
+ pgenum_values = cast(list[str], getattr(override_component_sa_enum, "enums")) + assert set(pgenum_values) == set(_COMPONENT_OVERLAYS) diff --git a/tests/scripts/test_build_property_overrides_smoke.py b/tests/scripts/test_build_property_overrides_smoke.py new file mode 100644 index 00000000..59db45e5 --- /dev/null +++ b/tests/scripts/test_build_property_overrides_smoke.py @@ -0,0 +1,95 @@ +"""End-to-end smoke of the Hyde override script for ONE property, against a real +(ephemeral) Postgres. Seeds the landlord vocab (simulating post-classify, so no +ChatGPT) + a minimal ``property`` row, then runs the script's real +``write`` + ``verify`` paths and asserts property_overrides + overlays land. +""" + +from __future__ import annotations + +import argparse +from typing import Any + +from sqlalchemy import Engine, text +from sqlmodel import Session + +import scripts.hyde.build_property_overrides as b +from domain.epc.property_overrides.built_form_type import BuiltFormType +from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand +from domain.epc.property_overrides.glazing_type import GlazingType +from domain.epc.property_overrides.main_fuel_type import MainFuelType +from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType +from domain.epc.property_overrides.property_type import PropertyType +from domain.epc.property_overrides.roof_type import RoofType +from domain.epc.property_overrides.wall_type import WallType +from domain.epc.property_overrides.water_heating_type import WaterHeatingType +from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import ( + LandlordOverridesRepository, +) +from repositories.property.landlord_override_overlays import overlays_from +from repositories.property.property_overrides_postgres_reader import ( + PropertyOverridesPostgresReader, +) + +PORTFOLIO = 795 +ORG_REF = "55180004001" +EXCEL = "scripts/hyde/hyde_property_overrides.xlsx" + +# What ChatGPT WOULD 
resolve this property's 9 descriptions to (component -> +# (raw Excel entry, enum member)). Seeded into the landlord ledger. +SEED = { + "property_type": ("House: MidTerrace", PropertyType.HOUSE), + "built_form_type": ("House: MidTerrace", BuiltFormType.MID_TERRACE), + "wall_type": ("TimberFrame: AsBuilt", WallType.TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED), + "roof_type": ("PitchedNormalLoftAccess: 300mm", RoofType.PITCHED_LOFT_300MM), + "construction_age_band": ("L: 2012-2022", ConstructionAgeBand.L_2012_2022), + "main_fuel": ("Gas: Mains Gas", MainFuelType.MAINS_GAS), + "glazing": ("100% Double glazing 2002 or later", GlazingType.DOUBLE_POST_2002), + "water_heating": ("From main heating system: Mains Gas", WaterHeatingType.FROM_MAIN_MAINS_GAS), + "main_heating_system": ("Boiler: C rated Combi", MainHeatingSystemType.GAS_COMBI), +} + + +def test_one_property_end_to_end(db_engine: Engine, monkeypatch: Any) -> None: + specs = b._specs_by_component() # pyright: ignore[reportPrivateUsage] + + # minimal FE-owned `property` table + the one row we'll match by org_ref + with Session(db_engine) as s: + s.execute(text( # pyright: ignore[reportDeprecated] + "CREATE TABLE property (id bigint PRIMARY KEY, portfolio_id bigint, " + "landlord_property_id text)")) + s.execute(text("INSERT INTO property VALUES (1, :p, :ref)"), # pyright: ignore[reportDeprecated] + {"p": PORTFOLIO, "ref": ORG_REF}) + # seed the classifier ledger (keyed on normalised description) + for comp, (raw, member) in SEED.items(): + repo: LandlordOverridesRepository[Any] = LandlordOverridesRepository( + s, specs[comp].row_type) + repo.upsert_all(PORTFOLIO, {b._norm(raw): member}) # pyright: ignore[reportPrivateUsage] + s.commit() + + # point the script at the ephemeral engine + monkeypatch.setattr(b, "_db_session", lambda: Session(db_engine)) + + # --- run the real write() for this one property --- + b.write(argparse.Namespace(excel=EXCEL, sheet="AddressProfilingResults", + portfolio_id=PORTFOLIO, 
org_ref=ORG_REF, limit=None, apply=True)) + + with Session(db_engine) as s: + rows = list(s.execute(text( # pyright: ignore[reportDeprecated] + "SELECT override_component, building_part, override_value " + "FROM property_overrides WHERE property_id = 1 ORDER BY override_component"))) + got = {c: v for c, _, v in rows} + # every seeded component produced a property_overrides row with the resolved value + assert got["main_fuel"] == "mains gas" + assert got["glazing"] == "Double glazing, 2002 or later" + assert got["construction_age_band"] == "L" + assert got["main_heating_system"] == "Gas boiler, combi" + assert got["water_heating"] == "From main system, mains gas" + assert len(rows) == 9 # all 9 components + + # --- the overrides reach the SAP overlay surface --- + b.verify(argparse.Namespace(portfolio_id=PORTFOLIO, org_ref=ORG_REF)) # exercises verify() + overlays = overlays_from( + PropertyOverridesPostgresReader(lambda: Session(db_engine)).overrides_for(1)) + assert len(overlays) == 9 + assert any(o.heating is not None and o.heating.main_fuel_type == 26 for o in overlays) + assert any(o.glazing is not None and o.glazing.glazing_type == 2 for o in overlays)