mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
Merge branch 'feautre/hyde_upload_and_extend_landlord_overrides' into feature/hyde_make_it_more_accurate_with_tests
This commit is contained in:
commit
abd4bbc2d0
63 changed files with 4420 additions and 26 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
|
@ -313,3 +313,8 @@ scripts/eon/epc_cache.pkl
|
|||
scripts/hyde/.elmhurst-session/
|
||||
scripts/hyde/elmhurst_downloads/
|
||||
scripts/hyde/.elmhurst-creds.json
|
||||
|
||||
# Hyde property-overrides script artifacts
|
||||
overrides_cache.json
|
||||
overrides_unknowns.csv
|
||||
overrides_edits.csv
|
||||
|
|
|
|||
|
|
@ -7,11 +7,16 @@ import boto3
|
|||
from applications.landlord_description_overrides.landlord_description_overrides_trigger_body import (
|
||||
LandlordDescriptionOverridesTriggerBody,
|
||||
)
|
||||
from domain.epc.built_form_type import BuiltFormType
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.roof_type import RoofType
|
||||
from domain.epc.wall_type import WallType
|
||||
from domain.epc.wall_type_construction_dates import (
|
||||
from domain.epc.property_overrides.built_form_type import BuiltFormType
|
||||
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
|
||||
from domain.epc.property_overrides.glazing_type import GlazingType
|
||||
from domain.epc.property_overrides.main_fuel_type import MainFuelType
|
||||
from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType
|
||||
from domain.epc.property_overrides.property_type import PropertyType
|
||||
from domain.epc.property_overrides.roof_type import RoofType
|
||||
from domain.epc.property_overrides.water_heating_type import WaterHeatingType
|
||||
from domain.epc.property_overrides.wall_type import WallType
|
||||
from domain.epc.property_overrides.wall_type_construction_dates import (
|
||||
wall_type_construction_date_prompt_hint,
|
||||
)
|
||||
from infrastructure.chatgpt.chatgpt import ChatGPT
|
||||
|
|
@ -24,6 +29,21 @@ from infrastructure.postgres.engine import commit_scope, make_engine, make_sessi
|
|||
from infrastructure.postgres.landlord_built_form_type_override_table import (
|
||||
LandlordBuiltFormTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_construction_age_band_override_table import (
|
||||
LandlordConstructionAgeBandOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_glazing_override_table import (
|
||||
LandlordGlazingOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_main_fuel_override_table import (
|
||||
LandlordMainFuelOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_main_heating_system_override_table import (
|
||||
LandlordMainHeatingSystemOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_water_heating_override_table import (
|
||||
LandlordWaterHeatingOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_property_type_override_table import (
|
||||
LandlordPropertyTypeOverrideRow,
|
||||
)
|
||||
|
|
@ -102,6 +122,56 @@ def _build_columns(
|
|||
session, LandlordRoofTypeOverrideRow
|
||||
),
|
||||
),
|
||||
"main_fuel": lambda src: ClassifiableColumn(
|
||||
name="main_fuel",
|
||||
source_column=src,
|
||||
classifier=ChatGptColumnClassifier(
|
||||
chat_gpt, MainFuelType, MainFuelType.UNKNOWN
|
||||
),
|
||||
repo=LandlordOverridesRepository[MainFuelType](
|
||||
session, LandlordMainFuelOverrideRow
|
||||
),
|
||||
),
|
||||
"glazing": lambda src: ClassifiableColumn(
|
||||
name="glazing",
|
||||
source_column=src,
|
||||
classifier=ChatGptColumnClassifier(
|
||||
chat_gpt, GlazingType, GlazingType.UNKNOWN
|
||||
),
|
||||
repo=LandlordOverridesRepository[GlazingType](
|
||||
session, LandlordGlazingOverrideRow
|
||||
),
|
||||
),
|
||||
"construction_age_band": lambda src: ClassifiableColumn(
|
||||
name="construction_age_band",
|
||||
source_column=src,
|
||||
classifier=ChatGptColumnClassifier(
|
||||
chat_gpt, ConstructionAgeBand, ConstructionAgeBand.UNKNOWN
|
||||
),
|
||||
repo=LandlordOverridesRepository[ConstructionAgeBand](
|
||||
session, LandlordConstructionAgeBandOverrideRow
|
||||
),
|
||||
),
|
||||
"water_heating": lambda src: ClassifiableColumn(
|
||||
name="water_heating",
|
||||
source_column=src,
|
||||
classifier=ChatGptColumnClassifier(
|
||||
chat_gpt, WaterHeatingType, WaterHeatingType.UNKNOWN
|
||||
),
|
||||
repo=LandlordOverridesRepository[WaterHeatingType](
|
||||
session, LandlordWaterHeatingOverrideRow
|
||||
),
|
||||
),
|
||||
"main_heating_system": lambda src: ClassifiableColumn(
|
||||
name="main_heating_system",
|
||||
source_column=src,
|
||||
classifier=ChatGptColumnClassifier(
|
||||
chat_gpt, MainHeatingSystemType, MainHeatingSystemType.UNKNOWN
|
||||
),
|
||||
repo=LandlordOverridesRepository[MainHeatingSystemType](
|
||||
session, LandlordMainHeatingSystemOverrideRow
|
||||
),
|
||||
),
|
||||
}
|
||||
|
||||
columns: list[ClassifiableColumn[Any]] = []
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ variable "image_digest" {
|
|||
|
||||
variable "maximum_concurrency" {
|
||||
type = number
|
||||
default = 2
|
||||
default = 20
|
||||
description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit."
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,39 @@
|
|||
"""Map a Landlord-Override construction-age-band value to a fabric Simulation
|
||||
Overlay.
|
||||
|
||||
A construction-age-band value is the RdSAP England-&-Wales letter code (A..M)
|
||||
the calculator's U-value cascades key on (`SapBuildingPart.construction_age_band`,
|
||||
read via `.strip().upper()` against the letter-code bands). The overlay targets
|
||||
the override's building part and sets the band; an unrecognised code produces no
|
||||
overlay. Re-dating a part re-derives its construction-default U-values, so this
|
||||
is the highest-leverage fabric override.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from datatypes.epc.domain.epc_property_data import BuildingPartIdentifier
|
||||
from domain.modelling.simulation import BuildingPartOverlay, EpcSimulation
|
||||
|
||||
# The thirteen RdSAP England-&-Wales age-band letter codes, A through M.
_VALID_AGE_BANDS: frozenset[str] = frozenset("ABCDEFGHIJKLM")


def age_band_overlay_for(
    age_band_value: str, building_part: int
) -> Optional[EpcSimulation]:
    """Build the overlay that re-dates one building part.

    Args:
        age_band_value: The override's age band. Surrounding whitespace is
            stripped and the text upper-cased before it is checked against
            the A..M letter codes.
        building_part: ``0`` targets the main building part; any other value
            is handed to ``BuildingPartIdentifier.extension``.

    Returns:
        An ``EpcSimulation`` whose only content is a ``BuildingPartOverlay``
        carrying the normalised letter, or ``None`` when the normalised value
        is not one of the A..M codes.
    """
    letter = age_band_value.strip().upper()
    if letter in _VALID_AGE_BANDS:
        if building_part == 0:
            target = BuildingPartIdentifier.MAIN
        else:
            target = BuildingPartIdentifier.extension(building_part)
        part_overlay = BuildingPartOverlay(construction_age_band=letter)
        return EpcSimulation(building_parts={target: part_overlay})
    # Anything outside A..M (including "Unknown") yields no overlay.
    return None
|
||||
36
domain/epc/property_overlays/glazing_overlay.py
Normal file
36
domain/epc/property_overlays/glazing_overlay.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
"""Map a Landlord-Override glazing value to a glazing Simulation Overlay.
|
||||
|
||||
A glazing value is one canonical glazing description carrying type + era
|
||||
("Double glazing, 2002 or later", "Single glazing", "Triple glazing, 2002 or
|
||||
later"). The calculator derives each window's U-value from its SAP10
|
||||
`glazing_type` code via the RdSAP Table 24 cascade, so the overlay decomposes
|
||||
the value into that code and emits a whole-dwelling `GlazingOverlay` (a landlord
|
||||
describes the dwelling's glazing as a whole, with no per-window geometry, so
|
||||
`building_part` is ignored). `_fold_glazing` expands it across every window.
|
||||
Unresolvable values produce no overlay.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from domain.modelling.simulation import EpcSimulation, GlazingOverlay
|
||||
|
||||
# SAP10 glazing-type code for each canonical glazing description (the Table 24 /
# `u_window` cascade enum, `_GLAZING_CODE_TO_UWINDOW` in heat_transmission).
_GLAZING_CODES: dict[str, int] = {
    "Single glazing": 1,
    "Double glazing, 2002 or later": 2,
    "Double glazing, pre-2002": 3,
    "Triple glazing, pre-2002": 6,
    "Triple glazing, 2002 or later": 9,
}


def glazing_overlay_for(
    glazing_value: str, building_part: int
) -> Optional[EpcSimulation]:
    """Build the whole-dwelling glazing overlay for a canonical description.

    Args:
        glazing_value: Canonical glazing description; it must match a
            ``_GLAZING_CODES`` key exactly.
        building_part: Accepted but not used by this function.

    Returns:
        An ``EpcSimulation`` carrying a ``GlazingOverlay`` with the mapped
        glazing-type code, or ``None`` when the description has no entry in
        the table.
    """
    try:
        glazing_code = _GLAZING_CODES[glazing_value]
    except KeyError:
        # Unrecognised description: emit no overlay.
        return None
    whole_dwelling = GlazingOverlay(glazing_type=glazing_code)
    return EpcSimulation(glazing=whole_dwelling)
|
||||
41
domain/epc/property_overlays/main_fuel_overlay.py
Normal file
41
domain/epc/property_overlays/main_fuel_overlay.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
"""Map a Landlord-Override main-fuel value to a heating Simulation Overlay.
|
||||
|
||||
A main-fuel value is one canonical gov-EPC `main_fuel` description ("mains gas",
|
||||
"electricity", …). The calculator reads the dwelling's primary fuel from
|
||||
`main_heating_details[0].main_fuel_type` as the RdSAP **int code**, so the
|
||||
overlay decomposes the value into that code and emits a whole-dwelling
|
||||
`HeatingOverlay` (fuel is not a per-building-part attribute, so `building_part`
|
||||
is ignored). Codes follow the modern RdSAP-20/21 `(not community)` family the
|
||||
gov-EPC API baseline uses; the explicitly `(community)` fuels keep their own
|
||||
community codes. Unresolvable values produce no overlay.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from domain.modelling.simulation import EpcSimulation, HeatingOverlay
|
||||
|
||||
# RdSAP-20/21 `main_fuel` codes (epc_codes.csv `main_fuel`), keyed by canonical
# fuel description. Both `(not community)` and explicit `(community)` entries
# are present.
_FUEL_CODES: dict[str, int] = {
    "mains gas": 26,
    "mains gas (community)": 20,
    "LPG (bulk)": 27,
    "bottled LPG": 3,
    "LPG special condition": 17,
    "oil": 28,
    "electricity": 29,
    "electricity (community)": 25,
    "house coal": 33,
    "smokeless coal": 15,
    # NOTE(review): the published RdSAP `main_fuel` list appears to give 9 for
    # dual fuel (mineral + wood) and 10 for legacy electricity -- confirm this
    # value against epc_codes.csv.
    "dual fuel (mineral and wood)": 10,
    "biomass (community)": 31,
}


def fuel_overlay_for(
    main_fuel_value: str, building_part: int
) -> Optional[EpcSimulation]:
    """Build the whole-dwelling heating overlay that sets the main fuel.

    Args:
        main_fuel_value: Canonical fuel description; it must match a
            ``_FUEL_CODES`` key exactly.
        building_part: Accepted but not used by this function.

    Returns:
        An ``EpcSimulation`` carrying a ``HeatingOverlay`` with
        ``main_fuel_type`` set to the mapped code, or ``None`` when the
        description has no entry in the table.
    """
    if main_fuel_value not in _FUEL_CODES:
        # Unrecognised fuel: emit no overlay.
        return None
    fuel_code = _FUEL_CODES[main_fuel_value]
    return EpcSimulation(heating=HeatingOverlay(main_fuel_type=fuel_code))
|
||||
46
domain/epc/property_overlays/main_heating_system_overlay.py
Normal file
46
domain/epc/property_overlays/main_heating_system_overlay.py
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
"""Map a Landlord-Override main-heating-system value to a heating Simulation Overlay.
|
||||
|
||||
A main-heating-system value is one canonical system archetype ("Gas boiler,
|
||||
combi", "Electric storage heaters, fan"). The calculator reads the primary
|
||||
system's `sap_main_heating_code` (SAP Table 4a/4b), so the overlay maps the
|
||||
archetype to a representative code and emits a whole-dwelling `HeatingOverlay`
|
||||
targeting `main_heating_details[0]` (`building_part` is ignored). It composes
|
||||
field-wise with the main_fuel / water_heating overlays.
|
||||
|
||||
The SEDBUK A-G efficiency band the Hyde "Heating" column carries is NOT honoured
|
||||
yet (no efficiency slot on the overlay/MainHeatingDetail) -- archetypes map to
|
||||
their modern/condensing Table 4b code, so an old low-rated boiler is currently
|
||||
modelled at the condensing efficiency. Heat pumps and community heating (which
|
||||
resolve via main_heating_index_number / community codes, not a Table 4b code)
|
||||
are left UNKNOWN until modelled. Unresolvable values produce no overlay.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from domain.modelling.simulation import EpcSimulation, HeatingOverlay
|
||||
|
||||
# Representative `sap_main_heating_code` (SAP Table 4b boiler rows / Table 4a)
# for each canonical system archetype. Boilers map to the modern/condensing
# variant (A-G efficiency deferred): 102 regular condensing, 104 condensing
# combi, 120 CPSU; storage heaters 401 old, 402 slimline, 403 convector,
# 404 fan; 191 direct-acting electric boiler.
_MAIN_HEATING_CODES: dict[str, int] = {
    "Gas boiler, combi": 104,
    "Gas boiler, regular": 102,
    "Gas CPSU": 120,
    "Electric storage heaters, old": 401,
    "Electric storage heaters, slimline": 402,
    "Electric storage heaters, convector": 403,
    "Electric storage heaters, fan": 404,
    "Direct-acting electric": 191,
}


def main_heating_overlay_for(
    main_heating_value: str, building_part: int
) -> Optional[EpcSimulation]:
    """Build the whole-dwelling heating overlay that sets the main system code.

    Args:
        main_heating_value: Canonical system archetype; it must match a
            ``_MAIN_HEATING_CODES`` key exactly.
        building_part: Accepted but not used by this function.

    Returns:
        An ``EpcSimulation`` carrying a ``HeatingOverlay`` with
        ``sap_main_heating_code`` set to the mapped code, or ``None`` when the
        archetype has no entry in the table.
    """
    try:
        system_code = _MAIN_HEATING_CODES[main_heating_value]
    except KeyError:
        # Unrecognised archetype: emit no overlay.
        return None
    primary_system = HeatingOverlay(sap_main_heating_code=system_code)
    return EpcSimulation(heating=primary_system)
|
||||
47
domain/epc/property_overlays/water_heating_overlay.py
Normal file
47
domain/epc/property_overlays/water_heating_overlay.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
"""Map a Landlord-Override water-heating value to a heating Simulation Overlay.
|
||||
|
||||
A water-heating value is one canonical "<system>, <fuel>" description ("From main
|
||||
system, mains gas", "Electric immersion, electricity"). The calculator reads the
|
||||
hot-water arrangement from `sap_heating.water_heating_code` (the SAP Table 4a
|
||||
system code) and `water_heating_fuel`, so the overlay decomposes the value into
|
||||
those two int codes and emits a whole-dwelling `HeatingOverlay` (water heating is
|
||||
not per-building-part, so `building_part` is ignored). It composes field-wise with
|
||||
the main_fuel / main_heating overlays. Unresolvable values produce no overlay.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from domain.modelling.simulation import EpcSimulation, HeatingOverlay
|
||||
|
||||
# (water_heating_code, water_heating_fuel) for each canonical "system, fuel"
# description. System codes are SAP Table 4a: 901 "from main system"
# (inherit-from-main), 903 "electric immersion", 911 "boiler/circulator for
# water heating only". Fuel codes reuse the values in the main_fuel overlay
# (26 mains gas, 27 LPG bulk, 28 oil, 29 electricity, 33 house coal,
# 3 bottled LPG).
_WATER_HEATING_CODES: dict[str, tuple[int, int]] = {
    "From main system, mains gas": (901, 26),
    "From main system, electricity": (901, 29),
    "From main system, oil": (901, 28),
    "From main system, LPG (bulk)": (901, 27),
    "From main system, bottled LPG": (901, 3),
    "From main system, house coal": (901, 33),
    "Electric immersion, electricity": (903, 29),
    "Gas boiler/circulator, mains gas": (911, 26),
}


def water_heating_overlay_for(
    water_heating_value: str, building_part: int
) -> Optional[EpcSimulation]:
    """Build the whole-dwelling heating overlay that sets the hot-water codes.

    Args:
        water_heating_value: Canonical "system, fuel" description; it must
            match a ``_WATER_HEATING_CODES`` key exactly.
        building_part: Accepted but not used by this function.

    Returns:
        An ``EpcSimulation`` carrying a ``HeatingOverlay`` with both
        ``water_heating_code`` and ``water_heating_fuel`` set, or ``None``
        when the description has no entry in the table.
    """
    if water_heating_value not in _WATER_HEATING_CODES:
        # Unrecognised combination: emit no overlay.
        return None
    system_code, fuel_code = _WATER_HEATING_CODES[water_heating_value]
    hot_water = HeatingOverlay(
        water_heating_code=system_code,
        water_heating_fuel=fuel_code,
    )
    return EpcSimulation(heating=hot_water)
|
||||
4
domain/epc/property_overrides/__init__.py
Normal file
4
domain/epc/property_overrides/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
"""Landlord property-override classifier vocabulary — the category enums a
|
||||
landlord description resolves into, plus their value→code helpers. The classifier
|
||||
target for the property_overrides chain (mirrors the property_overrides table /
|
||||
override_component pgEnum). Distinct from the EPC-context types of the same name."""
|
||||
31
domain/epc/property_overrides/construction_age_band.py
Normal file
31
domain/epc/property_overrides/construction_age_band.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class ConstructionAgeBand(Enum):
    """Construction age band supplied by a landlord, as resolved by the
    landlord-description-overrides context.

    Member values are the bare RdSAP England-&-Wales letter codes (A..M) that
    the calculator's U-value cascades read from
    `SapBuildingPart.construction_age_band`, the same representation the
    gov-EPC API lodges. The construction-age-band Simulation Overlay
    (``domain/epc/property_overlays/construction_age_band_overlay.py``) writes
    the letter straight through, so the values MUST remain the bare letters.
    The year ranges live in the member names purely for readability.

    ``UNKNOWN`` is used when the classifier cannot resolve a description; it
    leaves the lodged certificate's age band untouched.
    """

    # Letter codes in chronological order.
    A_BEFORE_1900 = "A"
    B_1900_1929 = "B"
    C_1930_1949 = "C"
    D_1950_1966 = "D"
    E_1967_1975 = "E"
    F_1976_1982 = "F"
    G_1983_1990 = "G"
    H_1991_1995 = "H"
    I_1996_2002 = "I"
    J_2003_2006 = "J"
    K_2007_2011 = "K"
    L_2012_2022 = "L"
    M_2023_ONWARDS = "M"
    # Sentinel for unresolved descriptions.
    UNKNOWN = "Unknown"
|
||||
24
domain/epc/property_overrides/glazing_type.py
Normal file
24
domain/epc/property_overrides/glazing_type.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class GlazingType(Enum):
    """Glazing description supplied by a landlord, as resolved by the
    landlord-description-overrides context.

    Member values are the canonical glazing descriptions (type plus era) that
    the glazing Simulation Overlay
    (``domain/epc/property_overlays/glazing_overlay.py``) turns into the SAP10
    ``glazing_type`` code read by the calculator's Table-24 cascade. The values
    MUST therefore stay in lock-step with that overlay's ``_GLAZING_CODES``
    keys. The era is significant: pre-2002 and 2002-onward double glazing
    resolve to different codes (and U-values).

    ``UNKNOWN`` covers descriptions the classifier cannot resolve and any
    glazing that has no verified overlay code yet; it leaves the lodged
    certificate's glazing untouched.
    """

    SINGLE = "Single glazing"
    # Double glazing, split by installation era.
    DOUBLE_POST_2002 = "Double glazing, 2002 or later"
    DOUBLE_PRE_2002 = "Double glazing, pre-2002"
    # Triple glazing, split by installation era.
    TRIPLE_PRE_2002 = "Triple glazing, pre-2002"
    TRIPLE_POST_2002 = "Triple glazing, 2002 or later"
    # Sentinel for unresolved descriptions.
    UNKNOWN = "Unknown"
|
||||
29
domain/epc/property_overrides/main_fuel_type.py
Normal file
29
domain/epc/property_overrides/main_fuel_type.py
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class MainFuelType(Enum):
    """Main-fuel description supplied by a landlord, as resolved by the
    landlord-description-overrides context.

    Member values are the canonical fuel descriptions that the main-fuel
    Simulation Overlay (``domain/epc/property_overlays/main_fuel_overlay.py``)
    turns into the RdSAP ``main_fuel`` int code the calculator reads. The
    values MUST therefore stay in lock-step with that overlay's
    ``_FUEL_CODES`` keys.

    ``UNKNOWN`` covers descriptions the classifier cannot resolve and any fuel
    that has no verified overlay code yet; it leaves the lodged certificate's
    fuel untouched rather than guessing.
    """

    # Gas and electricity, individual and community supply.
    MAINS_GAS = "mains gas"
    MAINS_GAS_COMMUNITY = "mains gas (community)"
    ELECTRICITY = "electricity"
    ELECTRICITY_COMMUNITY = "electricity (community)"
    # LPG variants.
    LPG_BULK = "LPG (bulk)"
    LPG_BOTTLED = "bottled LPG"
    LPG_SPECIAL_CONDITION = "LPG special condition"
    # Oil, solid fuels and biomass.
    OIL = "oil"
    HOUSE_COAL = "house coal"
    SMOKELESS_COAL = "smokeless coal"
    DUAL_FUEL_MINERAL_WOOD = "dual fuel (mineral and wood)"
    BIOMASS_COMMUNITY = "biomass (community)"
    # Sentinel for unresolved descriptions.
    UNKNOWN = "Unknown"
|
||||
27
domain/epc/property_overrides/main_heating_system_type.py
Normal file
27
domain/epc/property_overrides/main_heating_system_type.py
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class MainHeatingSystemType(Enum):
    """Main-heating-system description supplied by a landlord, as resolved by
    the landlord-description-overrides context.

    Member values are the canonical system archetypes that the main-heating
    Simulation Overlay
    (``domain/epc/property_overlays/main_heating_system_overlay.py``) maps to a
    representative SAP ``sap_main_heating_code``. The values MUST therefore
    stay in lock-step with that overlay's ``_MAIN_HEATING_CODES`` keys.

    The SEDBUK A-G efficiency band carried by the Hyde "Heating" column is NOT
    modelled yet (deferred), so each archetype maps to its modern/condensing
    code. ``UNKNOWN`` covers descriptions the classifier cannot resolve as
    well as the systems not modelled yet (heat pumps, community heating).
    """

    # Gas-fired systems.
    GAS_COMBI = "Gas boiler, combi"
    GAS_REGULAR = "Gas boiler, regular"
    GAS_CPSU = "Gas CPSU"
    # Electric storage heaters, by heater style.
    ELECTRIC_STORAGE_OLD = "Electric storage heaters, old"
    ELECTRIC_STORAGE_SLIMLINE = "Electric storage heaters, slimline"
    ELECTRIC_STORAGE_CONVECTOR = "Electric storage heaters, convector"
    ELECTRIC_STORAGE_FAN = "Electric storage heaters, fan"
    DIRECT_ELECTRIC = "Direct-acting electric"
    # Sentinel for unresolved or not-yet-modelled systems.
    UNKNOWN = "Unknown"
|
||||
|
|
@ -27,7 +27,7 @@ from __future__ import annotations
|
|||
from dataclasses import dataclass
|
||||
from typing import Mapping, Optional
|
||||
|
||||
from domain.epc.wall_type import WallType
|
||||
from domain.epc.property_overrides.wall_type import WallType
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
26
domain/epc/property_overrides/water_heating_type.py
Normal file
26
domain/epc/property_overrides/water_heating_type.py
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class WaterHeatingType(Enum):
    """Water-heating description supplied by a landlord, as resolved by the
    landlord-description-overrides context.

    Member values are the canonical "system, fuel" descriptions that the
    water-heating Simulation Overlay
    (``domain/epc/property_overlays/water_heating_overlay.py``) turns into the
    SAP ``water_heating_code`` + ``water_heating_fuel`` int codes the
    calculator reads. The values MUST therefore stay in lock-step with that
    overlay's ``_WATER_HEATING_CODES`` keys.

    ``UNKNOWN`` covers descriptions the classifier cannot resolve and any
    combination that has no verified codes yet; it leaves the lodged
    certificate's hot-water arrangement untouched.
    """

    # Hot water drawn from the main heating system, by fuel.
    FROM_MAIN_MAINS_GAS = "From main system, mains gas"
    FROM_MAIN_ELECTRICITY = "From main system, electricity"
    FROM_MAIN_OIL = "From main system, oil"
    FROM_MAIN_LPG_BULK = "From main system, LPG (bulk)"
    FROM_MAIN_BOTTLED_LPG = "From main system, bottled LPG"
    FROM_MAIN_HOUSE_COAL = "From main system, house coal"
    # Dedicated water heaters.
    ELECTRIC_IMMERSION = "Electric immersion, electricity"
    GAS_BOILER_CIRCULATOR_MAINS_GAS = "Gas boiler/circulator, mains gas"
    # Sentinel for unresolved descriptions.
    UNKNOWN = "Unknown"
|
||||
|
|
@ -19,6 +19,7 @@ from datatypes.epc.domain.epc_property_data import (
|
|||
)
|
||||
from domain.modelling.simulation import (
|
||||
EpcSimulation,
|
||||
GlazingOverlay,
|
||||
HeatingOverlay,
|
||||
LightingOverlay,
|
||||
SecondaryHeatingOverlay,
|
||||
|
|
@ -53,6 +54,8 @@ def apply_simulations(
|
|||
)
|
||||
if simulation.lighting is not None:
|
||||
_fold_lighting(result, simulation.lighting)
|
||||
if simulation.glazing is not None:
|
||||
_fold_glazing(result, simulation.glazing)
|
||||
if simulation.heating is not None:
|
||||
_fold_heating(result, simulation.heating)
|
||||
if simulation.secondary_heating is not None:
|
||||
|
|
@ -202,6 +205,21 @@ def _fold_window(window: SapWindow, overlay: WindowOverlay) -> None:
|
|||
details.solar_transmittance = overlay.solar_transmittance
|
||||
|
||||
|
||||
def _fold_glazing(epc: EpcPropertyData, overlay: GlazingOverlay) -> None:
    """Apply a whole-dwelling `GlazingOverlay` to every entry in
    `epc.sap_windows`, in place.

    Each window has its `glazing_type` set to the overlay's code and its
    `window_transmission_details` reset to `None`, so that
    `heat_transmission`'s Table-24 cascade re-derives U from the new type (the
    lodged U described the old, mis-recorded glazing). A landlord glazing
    override carries no per-window geometry, so it is applied uniformly; the
    expansion happens here because the baseline window list is only known at
    fold time.

    An overlay whose `glazing_type` is `None` leaves every window unchanged.
    """
    corrected_code = overlay.glazing_type
    if corrected_code is not None:
        for sap_window in epc.sap_windows:
            sap_window.glazing_type = corrected_code
            # NOTE(review): this drops the whole details object, not only a
            # U-value -- confirm nothing else stored on it needs preserving.
            sap_window.window_transmission_details = None
|
||||
|
||||
|
||||
def _fold_ventilation(
|
||||
baseline: Optional[SapVentilation], overlay: VentilationOverlay
|
||||
) -> SapVentilation:
|
||||
|
|
|
|||
|
|
@ -28,6 +28,12 @@ class BuildingPartOverlay:
|
|||
# RdSAP England-&-Wales construction age band — the letter code A..M the
# calculator's U-value cascades key on (`SapBuildingPart.construction_age_band`).
# Left `None` by Measures (retrofits don't change build era); set by a Landlord
# Override that corrects the lodged age band, which re-derives this part's
# fabric U-value defaults. Folds onto the part via the generic field loop.
construction_age_band: Optional[str] = None
# The wall material (RdSAP `wall_construction` code). Left `None` by Measures
# — insulating a wall doesn't change its material — but set by a Landlord
# Override that corrects the construction itself (ADR-0032).
wall_construction: Optional[int] = None
|
||||
wall_insulation_type: Optional[int] = None
|
||||
# Added solid-wall insulation depth (mm) — drives the calculator's Table 6
|
||||
|
|
@ -73,6 +79,28 @@ class WindowOverlay:
|
|||
solar_transmittance: Optional[float] = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class GlazingOverlay:
    """Partial, all-optional description of the dwelling's glazing as a whole —
    the correction a Landlord Override applies when the lodged glazing is
    wrong.

    This differs from the per-window `WindowOverlay` (keyed by `sap_windows`
    index) in that it addresses no individual window. A landlord describes the
    glazing of the whole dwelling ("Double glazing, 2002 or later") without
    per-window geometry, and the overlay builder never sees the baseline
    window list, so it emits one of these and `_fold_glazing` spreads it over
    every `sap_windows` entry.

    `glazing_type` holds the SAP10 glazing-type code (Table 24 / `u_window`
    cascade: 1=single, 2=double 2002-2021, 3=double pre-2002, 9=triple 2002+,
    …). When folded, the code is written to every window and each window's
    lodged transmission U-value is cleared, so the Table-24 cascade re-derives
    the corrected U from the new type (the lodged U belonged to the OLD,
    mis-recorded glazing). A field left as `None` means "keep the baseline
    value".
    """

    # SAP10 glazing-type code; `None` leaves the baseline glazing unchanged.
    glazing_type: Optional[int] = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class LightingOverlay:
|
||||
"""All-optional partial of the dwelling's fixed-lighting bulb counts — the
|
||||
|
|
@ -220,6 +248,7 @@ class EpcSimulation:
|
|||
windows: Mapping[int, WindowOverlay] = field(default_factory=_no_windows)
|
||||
ventilation: Optional[VentilationOverlay] = None
|
||||
lighting: Optional[LightingOverlay] = None
|
||||
glazing: Optional[GlazingOverlay] = None
|
||||
heating: Optional[HeatingOverlay] = None
|
||||
secondary_heating: Optional[SecondaryHeatingOverlay] = None
|
||||
solar: Optional[SolarOverlay] = None
|
||||
|
|
|
|||
|
|
@ -25,9 +25,24 @@ from infrastructure.postgres.landlord_property_type_override_table import (
|
|||
from infrastructure.postgres.landlord_roof_type_override_table import (
|
||||
LandlordRoofTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_construction_age_band_override_table import (
|
||||
LandlordConstructionAgeBandOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_glazing_override_table import (
|
||||
LandlordGlazingOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_main_fuel_override_table import (
|
||||
LandlordMainFuelOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_main_heating_system_override_table import (
|
||||
LandlordMainHeatingSystemOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_wall_type_override_table import (
|
||||
LandlordWallTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_water_heating_override_table import (
|
||||
LandlordWaterHeatingOverrideRow,
|
||||
)
|
||||
from repositories.landlord_overrides.landlord_override_reader import (
|
||||
LandlordOverrideReader,
|
||||
)
|
||||
|
|
@ -38,6 +53,11 @@ _ROW_TYPES: dict[str, type] = {
|
|||
"built_form_type": LandlordBuiltFormTypeOverrideRow,
|
||||
"wall_type": LandlordWallTypeOverrideRow,
|
||||
"roof_type": LandlordRoofTypeOverrideRow,
|
||||
"main_fuel": LandlordMainFuelOverrideRow,
|
||||
"glazing": LandlordGlazingOverrideRow,
|
||||
"construction_age_band": LandlordConstructionAgeBandOverrideRow,
|
||||
"water_heating": LandlordWaterHeatingOverrideRow,
|
||||
"main_heating_system": LandlordMainHeatingSystemOverrideRow,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint
|
|||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.built_form_type import BuiltFormType
|
||||
from domain.epc.property_overrides.built_form_type import BuiltFormType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,73 @@
|
|||
"""SQLModel mirror of the ``landlord_construction_age_band_overrides`` Drizzle table.
|
||||
|
||||
The schema source of truth lives in the ``assessment-model`` TS repo
|
||||
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
|
||||
this row class only mirrors the columns so the Python lambda can read/write.
|
||||
See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only
|
||||
differences are the table name, the ``construction_age_band`` pgEnum on
|
||||
``value``, and the unique-constraint name.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import ClassVar
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import BigInteger, Column, UniqueConstraint
|
||||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
class LandlordConstructionAgeBandOverrideRow(SQLModel, table=True):
    """Row mapping for the ``landlord_construction_age_band_overrides`` table.

    Each row holds one ``ConstructionAgeBand`` override for a
    ``(portfolio_id, description)`` pair; the unique constraint declared in
    ``__table_args__`` allows at most one row per pair.
    """

    __tablename__: ClassVar[str] = "landlord_construction_age_band_overrides"  # pyright: ignore[reportIncompatibleVariableOverride]
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = (  # pyright: ignore[reportIncompatibleVariableOverride]
        # NB: shortened (drop the redundant ``_overrides``) to stay within
        # PostgreSQL's 63-char identifier limit -- the full
        # ``landlord_construction_age_band_overrides_portfolio_description_unique``
        # is 69 chars and would be truncated, diverging from Drizzle.
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_construction_age_band_portfolio_description_unique",
        ),
    )

    # Primary key; a fresh uuid4 is generated when the caller supplies none.
    id: UUID = Field(default_factory=uuid4, primary_key=True)

    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )

    # Second half of the unique key (see ``__table_args__``).
    description: str = Field(nullable=False)

    # Stored in the database enum type named ``construction_age_band``;
    # ``values_callable`` makes SQLAlchemy persist each member's ``.value``
    # instead of its Python member name.
    value: ConstructionAgeBand = Field(
        sa_column=Column(
            SAEnum(
                ConstructionAgeBand,
                name="construction_age_band",
                values_callable=lambda cls: [m.value for m in cls],  # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )

    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )

    # Both timestamps default to the current UTC time when the row object is
    # constructed.
    # NOTE(review): no on-update hook is declared here, so this model does not
    # refresh ``updated_at`` itself -- confirm the writer or the DB does.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
|
||||
69
infrastructure/postgres/landlord_glazing_override_table.py
Normal file
69
infrastructure/postgres/landlord_glazing_override_table.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
"""SQLModel mirror of the ``landlord_glazing_overrides`` Drizzle table.
|
||||
|
||||
The schema source of truth lives in the ``assessment-model`` TS repo
|
||||
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
|
||||
this row class only mirrors the columns so the Python lambda can read/write.
|
||||
See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only
|
||||
differences are the table name, the ``glazing`` pgEnum on ``value``, and the
|
||||
unique-constraint name.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import ClassVar
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import BigInteger, Column, UniqueConstraint
|
||||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.property_overrides.glazing_type import GlazingType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
class LandlordGlazingOverrideRow(SQLModel, table=True):
    """Row mapping for the ``landlord_glazing_overrides`` table.

    Each row holds one ``GlazingType`` override for a
    ``(portfolio_id, description)`` pair; the unique constraint declared in
    ``__table_args__`` allows at most one row per pair.
    """

    __tablename__: ClassVar[str] = "landlord_glazing_overrides"  # pyright: ignore[reportIncompatibleVariableOverride]
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = (  # pyright: ignore[reportIncompatibleVariableOverride]
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_glazing_overrides_portfolio_description_unique",
        ),
    )

    # Primary key; a fresh uuid4 is generated when the caller supplies none.
    id: UUID = Field(default_factory=uuid4, primary_key=True)

    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )

    # Second half of the unique key (see ``__table_args__``).
    description: str = Field(nullable=False)

    # Stored in the database enum type named ``glazing``; ``values_callable``
    # makes SQLAlchemy persist each member's ``.value`` instead of its Python
    # member name.
    value: GlazingType = Field(
        sa_column=Column(
            SAEnum(
                GlazingType,
                name="glazing",
                values_callable=lambda cls: [m.value for m in cls],  # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )

    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )

    # Both timestamps default to the current UTC time when the row object is
    # constructed.
    # NOTE(review): no on-update hook is declared here, so this model does not
    # refresh ``updated_at`` itself -- confirm the writer or the DB does.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
|
||||
69
infrastructure/postgres/landlord_main_fuel_override_table.py
Normal file
69
infrastructure/postgres/landlord_main_fuel_override_table.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
"""SQLModel mirror of the ``landlord_main_fuel_overrides`` Drizzle table.
|
||||
|
||||
The schema source of truth lives in the ``assessment-model`` TS repo
|
||||
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
|
||||
this row class only mirrors the columns so the Python lambda can read/write.
|
||||
See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only
|
||||
differences are the table name, the ``main_fuel`` pgEnum on ``value``, and
|
||||
the unique-constraint name.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import ClassVar
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import BigInteger, Column, UniqueConstraint
|
||||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.property_overrides.main_fuel_type import MainFuelType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
class LandlordMainFuelOverrideRow(SQLModel, table=True):
    """Row mapping for the ``landlord_main_fuel_overrides`` table.

    Each row holds one ``MainFuelType`` override for a
    ``(portfolio_id, description)`` pair; the unique constraint declared in
    ``__table_args__`` allows at most one row per pair.
    """

    __tablename__: ClassVar[str] = "landlord_main_fuel_overrides"  # pyright: ignore[reportIncompatibleVariableOverride]
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = (  # pyright: ignore[reportIncompatibleVariableOverride]
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_main_fuel_overrides_portfolio_description_unique",
        ),
    )

    # Primary key; a fresh uuid4 is generated when the caller supplies none.
    id: UUID = Field(default_factory=uuid4, primary_key=True)

    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )

    # Second half of the unique key (see ``__table_args__``).
    description: str = Field(nullable=False)

    # Stored in the database enum type named ``main_fuel``; ``values_callable``
    # makes SQLAlchemy persist each member's ``.value`` instead of its Python
    # member name.
    value: MainFuelType = Field(
        sa_column=Column(
            SAEnum(
                MainFuelType,
                name="main_fuel",
                values_callable=lambda cls: [m.value for m in cls],  # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )

    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )

    # Both timestamps default to the current UTC time when the row object is
    # constructed.
    # NOTE(review): no on-update hook is declared here, so this model does not
    # refresh ``updated_at`` itself -- confirm the writer or the DB does.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
"""SQLModel mirror of the ``landlord_main_heating_system_overrides`` Drizzle table.
|
||||
|
||||
The schema source of truth lives in the ``assessment-model`` TS repo
|
||||
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
|
||||
this row class only mirrors the columns so the Python lambda can read/write.
|
||||
See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only
|
||||
differences are the table name, the ``main_heating_system`` pgEnum on ``value``,
|
||||
and the unique-constraint name.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import ClassVar
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import BigInteger, Column, UniqueConstraint
|
||||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
class LandlordMainHeatingSystemOverrideRow(SQLModel, table=True):
    """Row mapping for the ``landlord_main_heating_system_overrides`` table.

    Each row holds one ``MainHeatingSystemType`` override for a
    ``(portfolio_id, description)`` pair; the unique constraint declared in
    ``__table_args__`` allows at most one row per pair.
    """

    __tablename__: ClassVar[str] = "landlord_main_heating_system_overrides"  # pyright: ignore[reportIncompatibleVariableOverride]
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = (  # pyright: ignore[reportIncompatibleVariableOverride]
        # Shortened (drop the redundant ``_overrides``) to stay within
        # PostgreSQL's 63-char identifier limit; mirrors the Drizzle name.
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_main_heating_system_portfolio_description_unique",
        ),
    )

    # Primary key; a fresh uuid4 is generated when the caller supplies none.
    id: UUID = Field(default_factory=uuid4, primary_key=True)

    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )

    # Second half of the unique key (see ``__table_args__``).
    description: str = Field(nullable=False)

    # Stored in the database enum type named ``main_heating_system``;
    # ``values_callable`` makes SQLAlchemy persist each member's ``.value``
    # instead of its Python member name.
    value: MainHeatingSystemType = Field(
        sa_column=Column(
            SAEnum(
                MainHeatingSystemType,
                name="main_heating_system",
                values_callable=lambda cls: [m.value for m in cls],  # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )

    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )

    # Both timestamps default to the current UTC time when the row object is
    # constructed.
    # NOTE(review): no on-update hook is declared here, so this model does not
    # refresh ``updated_at`` itself -- confirm the writer or the DB does.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
|
||||
|
|
@ -14,7 +14,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint
|
|||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.property_overrides.property_type import PropertyType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint
|
|||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.roof_type import RoofType
|
||||
from domain.epc.property_overrides.roof_type import RoofType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint
|
|||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.wall_type import WallType
|
||||
from domain.epc.property_overrides.wall_type import WallType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,69 @@
|
|||
"""SQLModel mirror of the ``landlord_water_heating_overrides`` Drizzle table.
|
||||
|
||||
The schema source of truth lives in the ``assessment-model`` TS repo
|
||||
(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there;
|
||||
this row class only mirrors the columns so the Python lambda can read/write.
|
||||
See ADR-0003. Shape mirrors ``LandlordWallTypeOverrideRow`` -- the only
|
||||
differences are the table name, the ``water_heating`` pgEnum on ``value``, and
|
||||
the unique-constraint name.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import ClassVar
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import BigInteger, Column, UniqueConstraint
|
||||
from sqlalchemy import Enum as SAEnum
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
from domain.epc.property_overrides.water_heating_type import WaterHeatingType
|
||||
from infrastructure.postgres.landlord_override_enums import override_source_sa_enum
|
||||
|
||||
|
||||
class LandlordWaterHeatingOverrideRow(SQLModel, table=True):
    """Row mapping for the ``landlord_water_heating_overrides`` table.

    Each row holds one ``WaterHeatingType`` override for a
    ``(portfolio_id, description)`` pair; the unique constraint declared in
    ``__table_args__`` allows at most one row per pair.
    """

    __tablename__: ClassVar[str] = "landlord_water_heating_overrides"  # pyright: ignore[reportIncompatibleVariableOverride]
    __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = (  # pyright: ignore[reportIncompatibleVariableOverride]
        UniqueConstraint(
            "portfolio_id",
            "description",
            name="landlord_water_heating_overrides_portfolio_description_unique",
        ),
    )

    # Primary key; a fresh uuid4 is generated when the caller supplies none.
    id: UUID = Field(default_factory=uuid4, primary_key=True)

    # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int
    # mapping is 32-bit Integer and would overflow once portfolio IDs exceed
    # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration,
    # not declared here -- the ``portfolio`` table is not modelled in Python.
    portfolio_id: int = Field(
        sa_column=Column(BigInteger, nullable=False, index=True),
    )

    # Second half of the unique key (see ``__table_args__``).
    description: str = Field(nullable=False)

    # Stored in the database enum type named ``water_heating``;
    # ``values_callable`` makes SQLAlchemy persist each member's ``.value``
    # instead of its Python member name.
    value: WaterHeatingType = Field(
        sa_column=Column(
            SAEnum(
                WaterHeatingType,
                name="water_heating",
                values_callable=lambda cls: [m.value for m in cls],  # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
            ),
            nullable=False,
        ),
    )

    # Shared SAEnum -- see ``landlord_override_enums`` for why this single
    # instance is reused by every ``landlord_*_overrides`` row class.
    source: str = Field(
        sa_column=Column(override_source_sa_enum, nullable=False),
    )

    # Both timestamps default to the current UTC time when the row object is
    # constructed.
    # NOTE(review): no on-update hook is declared here, so this model does not
    # refresh ``updated_at`` itself -- confirm the writer or the DB does.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
|
||||
|
|
@ -27,6 +27,11 @@ override_component_sa_enum = SAEnum(
|
|||
"roof_type",
|
||||
"property_type",
|
||||
"built_form_type",
|
||||
"main_fuel",
|
||||
"glazing",
|
||||
"construction_age_band",
|
||||
"water_heating",
|
||||
"main_heating_system",
|
||||
name="override_component",
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -14,10 +14,10 @@ from typing import Any, Optional
|
|||
|
||||
from uuid import UUID
|
||||
|
||||
from domain.epc.built_form_type import BuiltFormType
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.roof_type import RoofType
|
||||
from domain.epc.wall_type import WallType
|
||||
from domain.epc.property_overrides.built_form_type import BuiltFormType
|
||||
from domain.epc.property_overrides.property_type import PropertyType
|
||||
from domain.epc.property_overrides.roof_type import RoofType
|
||||
from domain.epc.property_overrides.wall_type import WallType
|
||||
from repositories.bulk_upload.bulk_upload_status_writer import BulkUploadStatusWriter
|
||||
from repositories.landlord_overrides.landlord_override_reader import (
|
||||
LandlordOverrideReader,
|
||||
|
|
|
|||
|
|
@ -29,6 +29,17 @@ from domain.epc.property_overlays.attribute_overlay import (
|
|||
built_form_overlay_for,
|
||||
property_type_overlay_for,
|
||||
)
|
||||
from domain.epc.property_overlays.construction_age_band_overlay import (
|
||||
age_band_overlay_for,
|
||||
)
|
||||
from domain.epc.property_overlays.glazing_overlay import glazing_overlay_for
|
||||
from domain.epc.property_overlays.main_fuel_overlay import fuel_overlay_for
|
||||
from domain.epc.property_overlays.main_heating_system_overlay import (
|
||||
main_heating_overlay_for,
|
||||
)
|
||||
from domain.epc.property_overlays.water_heating_overlay import (
|
||||
water_heating_overlay_for,
|
||||
)
|
||||
from domain.epc.property_overlays.roof_type_overlay import roof_overlay_for
|
||||
from domain.epc.property_overlays.wall_type_overlay import wall_overlay_for
|
||||
from domain.modelling.simulation import EpcSimulation
|
||||
|
|
@ -43,6 +54,11 @@ _COMPONENT_OVERLAYS: dict[str, Callable[[str, int], Optional[EpcSimulation]]] =
|
|||
"roof_type": roof_overlay_for,
|
||||
"property_type": property_type_overlay_for,
|
||||
"built_form_type": built_form_overlay_for,
|
||||
"main_fuel": fuel_overlay_for,
|
||||
"glazing": glazing_overlay_for,
|
||||
"construction_age_band": age_band_overlay_for,
|
||||
"water_heating": water_heating_overlay_for,
|
||||
"main_heating_system": main_heating_overlay_for,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ from __future__ import annotations
|
|||
|
||||
from typing import Optional
|
||||
|
||||
from domain.epc.override_code_mapping import (
|
||||
from domain.epc.property_overrides.override_code_mapping import (
|
||||
built_form_to_code,
|
||||
property_type_to_code,
|
||||
)
|
||||
|
|
|
|||
353
scripts/fill_domna_addresses.py
Normal file
353
scripts/fill_domna_addresses.py
Normal file
|
|
@ -0,0 +1,353 @@
|
|||
"""Fill the DOMNA columns in the AddressProfilingResults spreadsheet.
|
||||
|
||||
Input: scripts/manipulation(2).xlsx, sheet "AddressProfilingResults", columns
|
||||
Organisation Reference | UPRN | DOMNA FOUND UPRN | DOMNA FOUND ADDRESS | Address | Postcode
|
||||
|
||||
Per-row rule ("if there's a UPRN in the UPRN column we're done"):
|
||||
|
||||
* UPRN present AND Address present -> nothing to do (already sorted).
|
||||
* UPRN present AND Address missing -> reverse-lookup the address from the UPRN
|
||||
via the EPC API -> DOMNA FOUND ADDRESS.
|
||||
* UPRN missing AND Address present -> resolve a UPRN from address + postcode
|
||||
(EPC API, then Ordnance Survey) -> writes
|
||||
DOMNA FOUND UPRN + DOMNA FOUND ADDRESS.
|
||||
* not resolvable -> marked "NOT FOUND" and listed in the
|
||||
unresolved report.
|
||||
|
||||
Relaxed matching (this batch only — production AddressMatch is untouched): the
|
||||
landlord writes flats as "3 GLADYS COURT" while EPC stores "Flat 3 Gladys
|
||||
Court", which the production matcher hard-rejects. So per address we try several
|
||||
query variants — the full string, just the first comma-segment, and a
|
||||
"Flat <n> ..." form — and keep the best-scoring, unambiguous match. The unit
|
||||
number must still match exactly (AddressMatch zeroes mismatched numbers), so a
|
||||
wrong-unit match stays unlikely. Each fill carries its score + source so you can
|
||||
spot-check (DOMNA SCORE / DOMNA SOURCE).
|
||||
|
||||
Rows that already have a DOMNA FOUND UPRN are skipped (idempotent / resumable).
|
||||
|
||||
python -m scripts.fill_domna_addresses
|
||||
python -m scripts.fill_domna_addresses --limit 200 # smoke test first N
|
||||
|
||||
Keys come from backend/.env (OPEN_EPC_API_TOKEN, ORDNANCE_SURVEY_API_KEY). Run
|
||||
from the worktree root (import trap).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
|
||||
|
||||
from backend.address2UPRN.main import get_epc_data_with_postcode # noqa: E402
|
||||
from backend.address2UPRN.scoring import all_uprns_match, rank_address_similarity # noqa: E402
|
||||
from backend.ordnanceSurvey.helpers import ( # noqa: E402
|
||||
lookup_os_places,
|
||||
os_places_results_to_dataframe,
|
||||
)
|
||||
from backend.utils.addressMatch import AddressMatch # noqa: E402
|
||||
from datatypes.epc.search import EpcSearchResult # noqa: E402
|
||||
from infrastructure.epc_client.epc_client_service import EpcClientService # noqa: E402
|
||||
from scripts.resolve_uprns_for_finaliser import clean_postcode, load_keys # noqa: E402
|
||||
|
||||
# --- Workbook layout -------------------------------------------------------
SHEET = "AddressProfilingResults"

# Columns supplied by the landlord.
REF_COL = "Organisation Reference"
UPRN_COL = "UPRN"
ADDRESS_COL = "Address"
POSTCODE_COL = "Postcode"

# Columns this script writes.
FOUND_UPRN_COL = "DOMNA FOUND UPRN"
FOUND_ADDRESS_COL = "DOMNA FOUND ADDRESS"
SCORE_COL = "DOMNA SCORE"
SOURCE_COL = "DOMNA SOURCE"

# Marker written into the DOMNA columns when nothing could be resolved.
NOT_FOUND = "NOT FOUND"

# --- Match thresholds ------------------------------------------------------
# EPC addresses are short, so matches are tight and the production 0.7 bar is
# kept. OS addresses carry more trailing tokens, hence the slightly lower bar.
EPC_THRESHOLD = 0.7
OS_THRESHOLD = 0.6

# --- Default file locations (all under <repo>/scripts) ---------------------
_DEFAULT_IN = _REPO_ROOT.joinpath("scripts", "manipulation(2).xlsx")
_DEFAULT_OUT = _REPO_ROOT.joinpath("scripts", "manipulation_filled.xlsx")
_DEFAULT_UNRESOLVED = _REPO_ROOT.joinpath("scripts", "manipulation_unresolved.csv")

# One resolved match, as a tuple of (uprn, matched_address, score, source).
Hit = tuple[str, str, float, str]
|
||||
|
||||
|
||||
def cell_str(value: object) -> str:
    """Return a spreadsheet cell as stripped text.

    ``None`` and any value whose text form is "nan" (case-insensitive, after
    stripping) are treated as missing and yield the empty string.
    """
    if value is not None:
        stripped = str(value).strip()
        if stripped.lower() != "nan":
            return stripped
    return ""
|
||||
|
||||
|
||||
def parse_uprn_cell(value: object) -> Optional[int]:
    """Read a UPRN cell that pandas loaded as float64 back into an int.

    Args:
        value: Raw cell value (float, int, str, NaN or None).

    Returns:
        The UPRN as an ``int``, or ``None`` when the cell is blank or does not
        hold a finite number.
    """
    text = cell_str(value)
    if not text:
        return None
    try:
        return int(float(text))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text. OverflowError: "inf"/"-inf" parse as
        # floats but cannot be converted to int -- treat them as missing
        # instead of aborting the whole batch.
        return None
|
||||
|
||||
|
||||
def address_variants(address: str) -> list[str]:
    """Build the list of query strings to try for one landlord address.

    Landlord flats read "3 GLADYS COURT, 260 REIGATE ROAD" while EPC stores
    "Flat 3 Gladys Court". The candidates are therefore, in order: the full
    string, its first comma-segment, and -- when that segment starts with a
    digit -- both of those prefixed with "Flat ". Empty strings and
    case-insensitive duplicates are dropped, keeping the first occurrence.
    """
    full = address.strip()
    head = full.split(",")[0].strip()

    candidates = [full, head]
    if re.match(r"^\d", head):  # leading unit/house number
        candidates += ["Flat " + head, "Flat " + full]

    # dict preserves insertion order; setdefault keeps the first spelling seen.
    unique: dict[str, str] = {}
    for candidate in candidates:
        if candidate:
            unique.setdefault(candidate.lower(), candidate)
    return list(unique.values())
|
||||
|
||||
|
||||
def resolve_epc_relaxed(
    address: str,
    postcode_clean: str,
    epc_cache: dict[str, pd.DataFrame],
    threshold: float = EPC_THRESHOLD,
) -> Optional[Hit]:
    """Pick the best unambiguous EPC match over all address variants.

    The EPC rows for a postcode are fetched once and kept in *epc_cache*.
    Each variant is ranked against those rows; a variant only replaces the
    current winner when its top score is strictly higher, its rank-1 rows all
    share one UPRN, and that UPRN is not blank.

    Returns:
        ``(uprn, matched_address, score, "epc")`` when the winning score is at
        least *threshold*, otherwise ``None``.
    """
    frame = epc_cache.get(postcode_clean)
    if frame is None:
        frame = get_epc_data_with_postcode(postcode=postcode_clean)
        epc_cache[postcode_clean] = frame
    if frame.empty:
        return None

    winner: Optional[Hit] = None
    for query in address_variants(address):
        ranked = rank_address_similarity(frame, user_address=query)
        if ranked.empty:
            continue

        top_row = ranked.iloc[0]
        top_score = float(top_row["lexiscore"])
        if winner is not None and top_score <= winner[2]:
            continue

        # Every rank-1 row has to point at the same UPRN; otherwise the match
        # is ambiguous and this variant is ignored.
        leaders = ranked[ranked["lexirank"] == 1]
        leader_uprn = leaders.iloc[0]["uprn"]
        if not all_uprns_match(leaders, leader_uprn):
            continue

        uprn_text = str(leader_uprn)
        if uprn_text in ("", "nan"):
            continue

        winner = (uprn_text, str(top_row["address"]), top_score, "epc")

    if winner is not None and winner[2] >= threshold:
        return winner
    return None
|
||||
|
||||
|
||||
def resolve_os_relaxed(
    address: str,
    postcode_clean: str,
    os_api_key: str,
    os_cache: dict[str, pd.DataFrame],
    threshold: float = OS_THRESHOLD,
) -> Optional[Hit]:
    """Best OS Places match across the address variants (cached per postcode).

    The OS Places rows for a postcode are fetched once and kept in *os_cache*;
    a non-200 response or one without ``"data"`` is cached as an empty frame.
    Every (variant, record) pair is scored with ``AddressMatch.score`` and the
    first strictly-highest score wins.

    Records without a usable UPRN are ignored, matching the blank-UPRN guard
    in the EPC resolver: a hit with no UPRN cannot fill the sheet and would
    otherwise be reported as resolved.

    Returns:
        ``(uprn, matched_address, score, "ordnance_survey")`` when the best
        score is at least *threshold*, otherwise ``None``.
    """
    places_df = os_cache.get(postcode_clean)
    if places_df is None:
        response = lookup_os_places(postcode_clean, os_api_key)
        if response.get("status") == 200 and "data" in response:
            places_df = os_places_results_to_dataframe(response["data"])
        else:
            places_df = pd.DataFrame()
        os_cache[postcode_clean] = places_df
    if places_df.empty or "ADDRESS" not in places_df.columns:
        return None

    # Keep only records that carry a UPRN; computed once, reused per variant.
    candidates: list[tuple[str, str]] = []
    for rec in places_df.to_dict(orient="records"):
        uprn = str(rec.get("UPRN", ""))
        if uprn.strip().lower() in ("", "nan", "none"):
            continue
        candidates.append((uprn, str(rec.get("ADDRESS", ""))))

    best: Optional[Hit] = None
    for variant in address_variants(address):
        for uprn, candidate in candidates:
            score = AddressMatch.score(variant, candidate)
            if best is None or score > best[2]:
                best = (uprn, candidate, score, "ordnance_survey")
    return best if best is not None and best[2] >= threshold else None
|
||||
|
||||
|
||||
def _address_from_search(result: EpcSearchResult) -> str:
    """Join the non-blank address lines and post town of *result* with ", "."""
    fields = (
        result.address_line_1,
        result.address_line_2,
        result.address_line_3,
        result.address_line_4,
        result.post_town,
    )
    pieces: list[str] = []
    for field in fields:
        # Skip None, empty and whitespace-only parts.
        if field and field.strip():
            pieces.append(field.strip())
    return ", ".join(pieces)
|
||||
|
||||
|
||||
def reverse_address_from_uprn(
    uprn: int,
    postcode_clean: str,
    service: EpcClientService,
    search_cache: dict[str, list[EpcSearchResult]],
) -> Optional[str]:
    """Look up the EPC address recorded for a known UPRN.

    The postcode is searched through *service* (once per postcode, memoised in
    *search_cache*) and the first result whose UPRN equals *uprn* is formatted
    into a single address string. Returns ``None`` when no result matches.
    """
    hits = search_cache.get(postcode_clean)
    if hits is None:
        hits = service.search_by_postcode(postcode_clean)
        search_cache[postcode_clean] = hits

    match = next(
        (hit for hit in hits if hit.uprn is not None and int(hit.uprn) == uprn),
        None,
    )
    if match is None:
        return None
    return _address_from_search(match)
|
||||
|
||||
|
||||
def fill(df: pd.DataFrame, *, os_api_key: Optional[str]) -> list[dict[str, str]]:
    """Fill the DOMNA columns of *df* in place and return the unresolved rows.

    Per row:

    * UPRN and address both present, or a DOMNA FOUND UPRN already filled by
      an earlier run -> skipped.
    * UPRN present, address missing -> reverse-lookup the address via EPC.
    * UPRN missing, address present -> resolve a UPRN via EPC, then OS Places.
    * neither present, or nothing resolved -> marked ``NOT FOUND``.

    The EPC token is read from the ``OPEN_EPC_API_TOKEN`` environment
    variable; without it the EPC lookups are skipped. Lookup exceptions are
    printed and treated as "no result" so one bad row cannot stop the batch.

    Args:
        df: Sheet contents; mutated in place.
        os_api_key: Ordnance Survey key; the OS fallback is skipped when falsy.

    Returns:
        One dict per unresolved row with the keys ``Organisation Reference``,
        ``reason``, ``Address`` and ``Postcode``.
    """
    for col in (FOUND_UPRN_COL, FOUND_ADDRESS_COL, SCORE_COL, SOURCE_COL):
        if col not in df.columns:
            df[col] = ""
        # An all-blank column read from Excel arrives as float64. Coerce every
        # DOMNA column to object so strings can be assigned cell-by-cell
        # without an incompatible-dtype upcast.
        df[col] = df[col].astype("object")

    token = os.environ.get("OPEN_EPC_API_TOKEN")
    service = EpcClientService(auth_token=token) if token else None
    # Per-postcode caches, so each postcode hits each API at most once.
    epc_cache: dict[str, pd.DataFrame] = {}
    os_cache: dict[str, pd.DataFrame] = {}
    search_cache: dict[str, list[EpcSearchResult]] = {}

    unresolved: list[dict[str, str]] = []
    resolved_uprn = resolved_addr = skipped = 0
    total = len(df)

    for n, idx in enumerate(df.index, start=1):
        ref = cell_str(df.at[idx, REF_COL])
        given_uprn = parse_uprn_cell(df.at[idx, UPRN_COL])
        address = cell_str(df.at[idx, ADDRESS_COL])
        postcode_raw = cell_str(df.at[idx, POSTCODE_COL])
        postcode_clean = clean_postcode(postcode_raw)

        # Already sorted (UPRN + address) or already filled by a prior run.
        if given_uprn is not None and address:
            skipped += 1
            continue
        if cell_str(df.at[idx, FOUND_UPRN_COL]) and cell_str(df.at[idx, FOUND_UPRN_COL]) != NOT_FOUND:
            skipped += 1
            continue

        def mark_not_found(reason: str) -> None:
            # Only called within the current iteration, so the loop variables
            # it closes over always refer to this row.
            df.at[idx, FOUND_UPRN_COL] = NOT_FOUND if given_uprn is None else ""
            df.at[idx, FOUND_ADDRESS_COL] = NOT_FOUND
            df.at[idx, SOURCE_COL] = "not_found"
            unresolved.append(
                {
                    "Organisation Reference": ref,
                    "reason": reason,
                    "Address": address,
                    "Postcode": postcode_raw,
                }
            )

        # Case B — UPRN present, address missing: reverse-lookup the address.
        if given_uprn is not None and not address:
            found: Optional[str] = None
            if service is not None and postcode_clean:
                try:
                    found = reverse_address_from_uprn(
                        given_uprn, postcode_clean, service, search_cache
                    )
                except Exception as exc:
                    print(f"  reverse failed {ref} {given_uprn}: {exc}")
            if found:
                df.at[idx, FOUND_ADDRESS_COL] = found
                df.at[idx, SOURCE_COL] = "epc_reverse"
                resolved_addr += 1
            else:
                mark_not_found("no address for UPRN")
            continue

        # Case A — no UPRN, has address: resolve a UPRN.
        if given_uprn is None and address:
            if not postcode_clean:
                mark_not_found("no postcode")
                continue
            hit: Optional[Hit] = None
            if token:
                try:
                    hit = resolve_epc_relaxed(address, postcode_clean, epc_cache)
                except Exception as exc:
                    print(f"  EPC failed {ref} {postcode_clean}: {exc}")
            # OS Places is only consulted when EPC produced nothing.
            if hit is None and os_api_key:
                try:
                    hit = resolve_os_relaxed(address, postcode_clean, os_api_key, os_cache)
                except Exception as exc:
                    print(f"  OS failed {ref} {postcode_clean}: {exc}")
            if hit is not None:
                uprn, matched, score, source = hit
                df.at[idx, FOUND_UPRN_COL] = uprn
                df.at[idx, FOUND_ADDRESS_COL] = matched
                df.at[idx, SCORE_COL] = round(score, 4)
                df.at[idx, SOURCE_COL] = source
                resolved_uprn += 1
            else:
                mark_not_found("no UPRN match")
            # Progress is reported from this branch only.
            if n % 100 == 0:
                print(
                    f"[{n}/{total}] resolved={resolved_uprn} not_found={len(unresolved)}"
                )
            continue

        # Case C — neither a UPRN nor an address.
        mark_not_found("no UPRN and no address")

    print(
        f"\nResolved {resolved_uprn} UPRNs, {resolved_addr} addresses; "
        f"{skipped} already sorted/done; {len(unresolved)} not found."
    )
    return unresolved
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
    """Parse the command line: three path options plus an optional row limit."""
    cli = argparse.ArgumentParser(description=__doc__)
    # The path-valued options differ only in flag, destination and default.
    for flag, extra in (
        ("--in", {"dest": "inp", "default": _DEFAULT_IN}),
        ("--out", {"default": _DEFAULT_OUT}),
        ("--unresolved", {"default": _DEFAULT_UNRESOLVED}),
    ):
        cli.add_argument(flag, type=Path, **extra)
    cli.add_argument("--limit", type=int, default=None, help="process first N rows")
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Load the sheet, fill it via ``fill`` and write the results.

    Writes the filled sheet to ``--out`` and, when any rows were left
    unresolved, a CSV of them to ``--unresolved``. Always returns 0.
    """
    options = _parse_args()
    _unused_epc_token, os_key = load_keys()

    sheet = pd.read_excel(options.inp, sheet_name=SHEET)
    # --limit keeps only the first N rows (copied so fill() mutates its own frame).
    frame = sheet if options.limit is None else sheet.head(options.limit).copy()
    print(f"Loaded {len(frame)} rows from {options.inp} [{SHEET}]")

    leftovers = fill(frame, os_api_key=os_key)

    frame.to_excel(options.out, sheet_name=SHEET, index=False)
    print(f"Wrote filled sheet -> {options.out}")
    if not leftovers:
        return 0

    pd.DataFrame(leftovers).to_csv(options.unresolved, index=False)
    print(f"Wrote {len(leftovers)} unresolved rows -> {options.unresolved}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
331
scripts/finalise_to_property_table.py
Normal file
331
scripts/finalise_to_property_table.py
Normal file
|
|
@ -0,0 +1,331 @@
|
|||
"""Insert resolved manipulation_filled rows into the FE-owned ``property`` table.
|
||||
|
||||
Reuses the bulk_upload_finaliser's own row->PropertyIdentityInsert mapping
|
||||
(``BulkUploadFinaliserOrchestrator._row_to_insert``) and the same
|
||||
``PropertyPostgresRepository.insert_all`` the Lambda uses, so a row inserted here
|
||||
is identical to one the real finaliser would write. The status-writer /
|
||||
property_overrides path is skipped — this only populates ``property`` (no
|
||||
BulkUpload task needed).
|
||||
|
||||
Insert is ON CONFLICT (portfolio_id, uprn) DO NOTHING, so re-running is safe.
|
||||
|
||||
# one resolved row (picked deterministically from --seed; the default seed picks the first) into portfolio 796, then read it back
|
||||
python -m scripts.finalise_to_property_table --portfolio 796 --one
|
||||
|
||||
# a specific Organisation Reference
|
||||
python -m scripts.finalise_to_property_table --portfolio 796 --ref 56100000101
|
||||
|
||||
# the whole sheet (resolved rows only by default; --include-unmatched to add
|
||||
# null-UPRN rows too)
|
||||
python -m scripts.finalise_to_property_table --portfolio 796 --all
|
||||
|
||||
Postgres target comes from the root .env (POSTGRES_*). Run from the worktree root.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
from dotenv import load_dotenv
|
||||
from sqlmodel import select
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
|
||||
|
||||
from infrastructure.postgres.config import PostgresConfig # noqa: E402
|
||||
from infrastructure.postgres.engine import commit_scope, make_engine, make_session # noqa: E402
|
||||
from infrastructure.postgres.property_table import PropertyRow # noqa: E402
|
||||
from orchestration.bulk_upload_finaliser_orchestrator import ( # noqa: E402
|
||||
BulkUploadFinaliserOrchestrator,
|
||||
)
|
||||
from repositories.property.property_postgres_repository import ( # noqa: E402
|
||||
PropertyPostgresRepository,
|
||||
)
|
||||
from repositories.property.property_repository import PropertyIdentityInsert # noqa: E402
|
||||
from scripts.fill_domna_addresses import ( # noqa: E402
|
||||
ADDRESS_COL,
|
||||
FOUND_ADDRESS_COL,
|
||||
FOUND_UPRN_COL,
|
||||
POSTCODE_COL,
|
||||
REF_COL,
|
||||
SCORE_COL,
|
||||
SHEET,
|
||||
UPRN_COL,
|
||||
NOT_FOUND,
|
||||
cell_str,
|
||||
parse_uprn_cell,
|
||||
)
|
||||
|
||||
_DEFAULT_IN = _REPO_ROOT / "scripts" / "manipulation_filled.xlsx"
|
||||
|
||||
|
||||
def _final_uprn(row: pd.Series) -> Optional[int]:
    """Return the UPRN to use for a row: the supplied one if present, otherwise the found one.

    Returns ``None`` when the row has no supplied UPRN and the found-UPRN cell
    is empty or holds the ``NOT_FOUND`` marker.
    """
    supplied = parse_uprn_cell(row.get(UPRN_COL))
    if supplied is not None:
        return supplied

    resolved = cell_str(row.get(FOUND_UPRN_COL))
    if not resolved or resolved == NOT_FOUND:
        return None
    return parse_uprn_cell(resolved)
|
||||
|
||||
|
||||
def to_combiner_row(row: pd.Series) -> dict[str, str]:
    """Translate one spreadsheet row into the combiner-output dict the finaliser consumes."""
    supplied_uprn = parse_uprn_cell(row.get(UPRN_COL))
    source_address = cell_str(row.get(ADDRESS_COL))
    final_uprn = _final_uprn(row)
    found_address = cell_str(row.get(FOUND_ADDRESS_COL))

    # Matched address: prefer the resolved address; fall back to the given
    # address only for rows that arrived with their own UPRN.
    if found_address and found_address != NOT_FOUND:
        matched_address = found_address
    elif supplied_uprn is not None:
        matched_address = source_address
    else:
        matched_address = ""

    combiner: dict[str, str] = {}
    combiner["Address 1"] = source_address
    combiner["Address 2"] = ""
    combiner["Address 3"] = ""
    combiner["postcode"] = cell_str(row.get(POSTCODE_COL))
    combiner["Internal Reference"] = cell_str(row.get(REF_COL))
    combiner["address2uprn_uprn"] = str(final_uprn) if final_uprn is not None else ""
    combiner["address2uprn_address"] = matched_address
    combiner["address2uprn_lexiscore"] = cell_str(row.get(SCORE_COL))
    return combiner
|
||||
|
||||
|
||||
def load_rows(
    path: Path, *, include_unmatched: bool
) -> tuple[pd.DataFrame, list[dict[str, str]]]:
    """Read the sheet and build the combiner rows.

    Unless ``include_unmatched`` is set, rows for which ``_final_uprn`` yields
    ``None`` are removed first. Returns the (possibly filtered) frame together
    with one combiner dict per remaining row, in sheet order.
    """
    frame = pd.read_excel(path, sheet_name=SHEET).reset_index(drop=True)
    if not include_unmatched:
        has_uprn = frame.apply(lambda record: _final_uprn(record) is not None, axis=1)
        frame = frame[has_uprn].reset_index(drop=True)

    combiner_rows: list[dict[str, str]] = []
    for _position, record in frame.iterrows():
        combiner_rows.append(to_combiner_row(record))
    return frame, combiner_rows
|
||||
|
||||
|
||||
def dedupe_by_uprn(
    rows: list[dict[str, str]],
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Keep the first row per UPRN; return (kept, dropped collisions).

    The DB INSERT collapses duplicate (portfolio, uprn) via ON CONFLICT DO
    NOTHING anyway, so this just makes the collision explicit (the dropped rows
    are written out for review) rather than letting an arbitrary ref win silently.

    Rows whose ``address2uprn_uprn`` is empty (the null-UPRN rows that
    ``--include-unmatched`` lets through) have no UPRN to collide on, so they
    are always kept; previously every such row after the first was reported as
    a duplicate and dropped.

    Args:
        rows: Combiner rows, each carrying an ``address2uprn_uprn`` key.

    Returns:
        ``(kept, dropped)``, both preserving the input order.
    """
    seen: set[str] = set()
    kept: list[dict[str, str]] = []
    dropped: list[dict[str, str]] = []
    for row in rows:
        uprn = row["address2uprn_uprn"]
        if not uprn:
            # No UPRN at all: "" is the absence of a key, not a shared key.
            kept.append(row)
            continue
        if uprn in seen:
            dropped.append(row)
        else:
            seen.add(uprn)
            kept.append(row)
    return kept, dropped
|
||||
|
||||
|
||||
# Force-reload teardown order (bottom-up). property_overrides is ON DELETE
# CASCADE so it clears itself when the property goes; everything below is NO
# ACTION and must be deleted first, deepest child first.
#   property -> epc_property -> {these children}
# _reset_portfolio deletes from these by epc_property_id.
_EPC_CHILD_TABLES = (
    "epc_energy_element",
    "epc_window",
    "epc_main_heating_detail",
    "epc_renewable_heat_incentive",
    "epc_building_part",
    "epc_flat_details",
)
# property -> {these direct dependents}, deleted after the epc children
# (_reset_portfolio deletes from these by property_id).
_PROPERTY_DEPENDENTS = ("epc_property", "plan")
# Rows passed to each insert_all call in clean_reload.
_INSERT_CHUNK = 4000  # 9 cols/row -> well under psycopg2's 65535-param limit
|
||||
|
||||
|
||||
def _reset_portfolio(session: object, portfolio_id: int) -> int:
    """Remove every property of a portfolio, clearing its dependents first.

    Deletes, in order: the epc_property child tables, the direct property
    dependents, then the property rows themselves.

    Returns the rowcount of the final ``DELETE FROM property`` statement.
    """
    from sqlalchemy import text

    property_ids = "SELECT id FROM property WHERE portfolio_id = :pid"
    epc_property_ids = f"SELECT id FROM epc_property WHERE property_id IN ({property_ids})"

    def run(sql: str):
        # Every statement binds the same single :pid parameter.
        return session.execute(  # type: ignore[attr-defined]
            text(sql),
            {"pid": portfolio_id},
        )

    teardown = [
        f"DELETE FROM {table} WHERE epc_property_id IN ({epc_property_ids})"
        for table in _EPC_CHILD_TABLES
    ]
    teardown += [
        f"DELETE FROM {table} WHERE property_id IN ({property_ids})"
        for table in _PROPERTY_DEPENDENTS
    ]
    for statement in teardown:
        run(statement)

    outcome = run("DELETE FROM property WHERE portfolio_id = :pid")
    return outcome.rowcount
|
||||
|
||||
|
||||
def clean_reload(
    rows: list[dict[str, str]], portfolio_id: int, *, reset: bool
) -> tuple[int, int]:
    """Insert ``rows`` in chunks, wiping the portfolio first when ``reset`` is set.

    The optional wipe and all chunked inserts run inside a single
    ``commit_scope``. Returns ``(properties_deleted, properties_inserted)``.
    """
    to_insert: list[PropertyIdentityInsert] = []
    for source_row in rows:
        to_insert.append(
            BulkUploadFinaliserOrchestrator._row_to_insert(source_row, portfolio_id)
        )

    session = make_session(_engine())
    removed, added = 0, 0
    try:
        repository = PropertyPostgresRepository(session)
        with commit_scope(session):
            if reset:
                removed = _reset_portfolio(session, portfolio_id)
            offset = 0
            while offset < len(to_insert):
                batch = to_insert[offset : offset + _INSERT_CHUNK]
                added += repository.insert_all(batch)
                offset += _INSERT_CHUNK
    finally:
        session.close()
    return removed, added
|
||||
|
||||
|
||||
def _engine():
    """Load the repo-root ``.env`` and build an engine from the resulting environment."""
    env_file = _REPO_ROOT / ".env"
    load_dotenv(env_file)
    config = PostgresConfig.from_env(os.environ)
    return make_engine(config)
|
||||
|
||||
|
||||
def insert_rows(rows: list[dict[str, str]], portfolio_id: int) -> int:
    """Map ``rows`` with the finaliser's mapper and insert them in one batch.

    Returns whatever ``insert_all`` reports as the number of rows inserted.
    """
    to_insert: list[PropertyIdentityInsert] = []
    for source_row in rows:
        to_insert.append(
            BulkUploadFinaliserOrchestrator._row_to_insert(source_row, portfolio_id)
        )

    session = make_session(_engine())
    try:
        repository = PropertyPostgresRepository(session)
        with commit_scope(session):
            count = repository.insert_all(to_insert)
    finally:
        session.close()
    return count
|
||||
|
||||
|
||||
def fetch_by_ref(portfolio_id: int, ref: str) -> list[PropertyRow]:
    """Return the property rows of a portfolio whose landlord_property_id equals ``ref``."""
    session = make_session(_engine())
    try:
        in_portfolio = PropertyRow.portfolio_id == portfolio_id
        has_ref = PropertyRow.landlord_property_id == ref
        query = select(PropertyRow).where(in_portfolio, has_ref)
        matches = session.exec(query).all()
        return list(matches)
    finally:
        session.close()
|
||||
|
||||
|
||||
def _show(row: dict[str, str], insert: PropertyIdentityInsert) -> None:
|
||||
print("\nSource (combiner) row:")
|
||||
for k, v in row.items():
|
||||
print(f" {k}: {v!r}")
|
||||
print("\nMapped PropertyIdentityInsert:")
|
||||
for k, v in insert.__dict__.items():
|
||||
print(f" {k}: {v!r}")
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
    """Parse the command line.

    ``--portfolio`` is required, and exactly one of ``--one`` / ``--ref`` /
    ``--all`` must be given.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    cli.add_argument("--portfolio", type=int, required=True)

    # Exactly one selection mode per invocation.
    mode = cli.add_mutually_exclusive_group(required=True)
    mode.add_argument("--one", action="store_true", help="one random resolved row")
    mode.add_argument("--ref", help="a specific Organisation Reference")
    mode.add_argument("--all", action="store_true", help="every row")

    unmatched_help = "also insert rows with no UPRN (null-UPRN property rows)"
    cli.add_argument("--include-unmatched", action="store_true", help=unmatched_help)

    reset_help = (
        "(with --all) DELETE all properties in the portfolio first "
        "(cascades property_overrides; clears plan/epc_property)"
    )
    cli.add_argument("--reset", action="store_true", help=reset_help)

    collisions_default = _REPO_ROOT / "scripts" / "manipulation_collisions.csv"
    cli.add_argument(
        "--collisions",
        type=Path,
        default=collisions_default,
        help="where to write rows dropped as duplicate-UPRN collisions",
    )
    cli.add_argument("--seed", type=int, default=0, help="random seed for --one")
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Run the selected mode and return a process exit code.

    ``--all``: de-duplicate by UPRN (writing collisions to ``--collisions``),
    then insert via ``clean_reload`` (wiping the portfolio first with
    ``--reset``).

    ``--ref`` / ``--one``: pick a single row, print its mapping, insert it and
    read the stored rows back for that reference.

    Returns:
        0 on success; 1 when the requested reference is not among the loaded
        rows, or when ``--one`` is used and no candidate rows were loaded.
    """
    args = _parse_args()
    _, rows = load_rows(args.inp, include_unmatched=args.include_unmatched)
    print(f"Loaded {len(rows)} candidate rows from {args.inp}")

    if args.all:
        kept, dropped = dedupe_by_uprn(rows)
        if dropped:
            pd.DataFrame(dropped).to_csv(args.collisions, index=False)
            print(
                f"{len(dropped)} duplicate-UPRN rows dropped -> {args.collisions} "
                f"({len(kept)} unique to insert)"
            )
        deleted, inserted = clean_reload(kept, args.portfolio, reset=args.reset)
        if args.reset:
            print(f"Deleted {deleted} existing properties in portfolio {args.portfolio}.")
        print(f"Inserted {inserted} properties into portfolio {args.portfolio}.")
        return 0

    # Single-row paths: pick the row, show the mapping, insert, read back.
    if args.ref:
        match = [r for r in rows if r["Internal Reference"] == args.ref]
        if not match:
            print(f"No resolved row with Organisation Reference {args.ref!r}.")
            return 1
        row = match[0]
    else:  # --one: deterministic "random" pick via seed
        if not rows:
            # Without this guard the modulo below raises ZeroDivisionError.
            print("No candidate rows to pick from.")
            return 1
        idx = (args.seed * 7919) % len(rows)
        row = rows[idx]

    ref = row["Internal Reference"]
    insert = BulkUploadFinaliserOrchestrator._row_to_insert(row, args.portfolio)
    _show(row, insert)

    inserted = insert_rows([row], args.portfolio)
    print(
        f"\ninsert_all -> {inserted} new row(s) "
        f"(0 means it already existed; ON CONFLICT DO NOTHING)."
    )

    print(f"\nproperty rows for portfolio {args.portfolio}, ref {ref!r}:")
    for pr in fetch_by_ref(args.portfolio, ref):
        print(
            f" id={pr.id} uprn={pr.uprn} address={pr.address!r} "
            f"postcode={pr.postcode!r} status={pr.creation_status} "
            f"lexiscore={pr.lexiscore}"
        )
    return 0
|
||||
|
||||
|
||||
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
44
scripts/hyde/RESUME_AFTER_KHALIM.md
Normal file
44
scripts/hyde/RESUME_AFTER_KHALIM.md
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
# Resume prompt — finish the Hyde portfolio-796 property_overrides run (after Khalim review)
|
||||
|
||||
Paste the block below to continue. It tells the assistant to review the unknown-override
|
||||
decisions with me, verify them, confirm before writing, then run the remaining steps.
|
||||
|
||||
---
|
||||
|
||||
We paused the Hyde property-overrides bulk load to review the UNKNOWN classifications with
|
||||
Khalim. Pick it back up.
|
||||
|
||||
**Context (already done):**
|
||||
- Target is **portfolio 796** in DevAssessmentModelDB (NOT 795 — 795 is empty).
|
||||
- Script: `scripts/hyde/build_property_overrides.py`. Pass 1 (`classify`) is DONE — the
|
||||
`landlord_*_overrides` ledger is populated; re-running classify is free (cache hits).
|
||||
- The 19 unresolved descriptions are documented in `scripts/hyde/unknowns_review.md`, with
|
||||
proposed values already written to `overrides_edits.csv` (gitignored).
|
||||
- Env (DB creds + `OPENAI_API_KEY`) is in `/workspaces/home/github/Model/.env`; load it with
|
||||
python-dotenv and set `POSTGRES_DRIVER=psycopg2`. Writes are idempotent upserts (unique on
|
||||
`property_id, override_component, building_part`) — safe to re-run, never duplicates.
|
||||
|
||||
**Do this, in order:**
|
||||
1. **Ask me what Khalim decided** for the unknowns. The one real judgement call is the
|
||||
flat-roof reading: `Flat: As Built` (1,172 rows) + `Flat: Unknown` (194) → which of
|
||||
`Flat, no insulation (assumed)` / `Flat, insulated (assumed)` / `Flat, limited insulation
|
||||
(assumed)`. The `construction_age_band` bands (29,829 rows) are deterministic (band = first
|
||||
letter) — keep as-is unless I say otherwise. Confirm the other roof/wall proposals too.
|
||||
2. **Update `overrides_edits.csv`** (`corrected_value` column) to match Khalim's decisions.
|
||||
3. Run `validate --edits overrides_edits.csv` and fix anything it rejects.
|
||||
4. **Show me the final edits + the planned write counts, and WAIT for my explicit go-ahead
|
||||
before any `--apply`.** Do not write to the DB before I confirm.
|
||||
5. On my go-ahead:
|
||||
- `apply-edits --edits overrides_edits.csv --portfolio-id 796 --apply` (user corrections → ledger)
|
||||
- `write --excel scripts/hyde/hyde_property_overrides.xlsx --portfolio-id 796` (DRY RUN —
|
||||
report unmatched org_refs + unresolved across all 31,773 first)
|
||||
- then the same `write ... --apply`
|
||||
6. `verify --portfolio-id 796 --org-ref <a few org_refs>` to confirm property_overrides +
|
||||
overlays landed.
|
||||
7. Remind me about the deferred **age-classifier prompt-hint fix** for the production lambda
|
||||
(the live frontend will hit the same `"D: 1950-1966"` → UNKNOWN until that lands).
|
||||
|
||||
Every DB command loads env from `/workspaces/home/github/Model/.env`. Read-only checks
|
||||
(`verify`, dry-run `write`) are fine to run unprompted; anything `--apply` needs my confirm.
|
||||
|
||||
---
|
||||
437
scripts/hyde/build_property_overrides.py
Normal file
437
scripts/hyde/build_property_overrides.py
Normal file
|
|
@ -0,0 +1,437 @@
|
|||
"""Build ``property_overrides`` for a portfolio from the Hyde Excel, bypassing the
|
||||
frontend + lambdas, using the ``landlord_*_overrides`` tables as the durable
|
||||
classification ledger.
|
||||
|
||||
Why the ledger (not a throwaway cache): ``landlord_*_overrides`` stores
|
||||
``(portfolio_id, description) -> value`` with a ``source`` (classifier|user).
|
||||
* Re-runs classify only descriptions NOT already stored -> saves ChatGPT calls.
|
||||
* Human corrections are stored as ``source=user`` and the classifier is
|
||||
forbidden from overwriting them (ADR-0003) -> edits are permanent.
|
||||
Then we resolve the vocab + match each row to a ``property.id`` by **org_ref**
|
||||
(Excel "Organisation Reference" -> property.landlord_property_id) and upsert
|
||||
``property_overrides`` (the fact layer the SAP overlay reads).
|
||||
|
||||
Subcommands:
|
||||
list-values print each component's valid override values (reference)
|
||||
classify --excel f --portfolio-id 795
|
||||
PASS 1: classify cache-misses via ChatGPT,
|
||||
upsert to landlord tables, write
|
||||
overrides_unknowns.csv (with allowed_values)
|
||||
validate --edits overrides_edits.csv
|
||||
check a hand-edited file: every corrected_value
|
||||
must be a valid enum value (suggests fixes)
|
||||
apply-edits --edits overrides_edits.csv --portfolio-id 795 [--apply]
|
||||
upsert validated corrections as source=user
|
||||
write --excel f --portfolio-id 795 [--apply]
|
||||
PASS 2: build + upsert property_overrides from vocab
|
||||
|
||||
Env: POSTGRES_* (PostgresConfig.from_env) and OPENAI_API_KEY (ChatGPT).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import difflib
|
||||
import logging
|
||||
import os
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from enum import Enum
|
||||
from typing import Any, Optional
|
||||
|
||||
import pandas as pd # pyright: ignore[reportMissingTypeStubs]
|
||||
from sqlalchemy import Table, text
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from sqlmodel import SQLModel
|
||||
|
||||
from domain.epc.property_overrides.built_form_type import BuiltFormType
|
||||
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
|
||||
from domain.epc.property_overrides.glazing_type import GlazingType
|
||||
from domain.epc.property_overrides.main_fuel_type import MainFuelType
|
||||
from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType
|
||||
from domain.epc.property_overrides.property_type import PropertyType
|
||||
from domain.epc.property_overrides.roof_type import RoofType
|
||||
from domain.epc.property_overrides.wall_type import WallType
|
||||
from domain.epc.property_overrides.wall_type_construction_dates import (
|
||||
wall_type_construction_date_prompt_hint,
|
||||
)
|
||||
from domain.epc.property_overrides.water_heating_type import WaterHeatingType
|
||||
from infrastructure.chatgpt.chatgpt import ChatGPT
|
||||
from infrastructure.chatgpt.chatgpt_column_classifier import ChatGptColumnClassifier
|
||||
from infrastructure.landlord_overrides.landlord_override_reader_postgres_repository import (
|
||||
LandlordOverrideReaderPostgresRepository,
|
||||
)
|
||||
from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import (
|
||||
LandlordOverridesRepository,
|
||||
)
|
||||
from infrastructure.postgres.config import PostgresConfig
|
||||
from infrastructure.postgres.engine import commit_scope, make_engine, make_session
|
||||
from infrastructure.postgres.landlord_built_form_type_override_table import (
|
||||
LandlordBuiltFormTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_construction_age_band_override_table import (
|
||||
LandlordConstructionAgeBandOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_glazing_override_table import (
|
||||
LandlordGlazingOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_main_fuel_override_table import (
|
||||
LandlordMainFuelOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_main_heating_system_override_table import (
|
||||
LandlordMainHeatingSystemOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_override_enums import OverrideSource
|
||||
from infrastructure.postgres.landlord_property_type_override_table import (
|
||||
LandlordPropertyTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_roof_type_override_table import (
|
||||
LandlordRoofTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_wall_type_override_table import (
|
||||
LandlordWallTypeOverrideRow,
|
||||
)
|
||||
from infrastructure.postgres.landlord_water_heating_override_table import (
|
||||
LandlordWaterHeatingOverrideRow,
|
||||
)
|
||||
from repositories.property.landlord_override_overlays import overlays_from
|
||||
from repositories.property.property_override_postgres_repository import (
|
||||
PropertyOverridePostgresRepository,
|
||||
)
|
||||
from repositories.property.property_override_repository import PropertyOverrideInsert
|
||||
from repositories.property.property_overrides_postgres_reader import (
|
||||
PropertyOverridesPostgresReader,
|
||||
)
|
||||
|
||||
# Message-only format: log lines carry no level or timestamp prefix.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("build_property_overrides")

# Excel column holding each property's organisation reference (org_ref).
ORG_REF_COLUMN = "Organisation Reference"
# CSV written by `classify` listing the descriptions still UNKNOWN, for review.
UNKNOWNS_PATH = "overrides_unknowns.csv"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ComponentSpec:
    """Static description of one override component and where its data lives."""

    # Component name, used as the vocab key and as override_component.
    component: str
    # Enum of values for this component, and its UNKNOWN member.
    enum_cls: type[Enum]
    unknown: Enum
    # Row class of the landlord override table for this component.
    row_type: type[SQLModel]
    # Excel column the descriptions are read from.
    excel_header: str
    per_building_part: bool  # comma = building parts (wall/roof/age) vs whole-dwelling
    # Optional extra text handed to the classifier.
    extra_instructions: Optional[str] = None

    def allowed_values(self) -> list[str]:
        """Return the sorted enum values a human may choose (UNKNOWN is left out)."""
        choices = [member.value for member in self.enum_cls if member is not self.unknown]
        choices.sort()
        return choices
|
||||
|
||||
|
||||
def _component_specs() -> list[ComponentSpec]:
    """Return the spec of every supported override component, in processing order."""
    # (component, enum, landlord row type, Excel header, per-building-part?, classifier hint)
    table = [
        ("property_type", PropertyType, LandlordPropertyTypeOverrideRow, "Property Type", False, None),
        ("built_form_type", BuiltFormType, LandlordBuiltFormTypeOverrideRow, "Property Type", False, None),
        ("wall_type", WallType, LandlordWallTypeOverrideRow, "Walls", True,
         wall_type_construction_date_prompt_hint()),
        ("roof_type", RoofType, LandlordRoofTypeOverrideRow, "Roofs", True, None),
        ("construction_age_band", ConstructionAgeBand, LandlordConstructionAgeBandOverrideRow, "Age", True, None),
        ("main_fuel", MainFuelType, LandlordMainFuelOverrideRow, "Main Fuel", False, None),
        ("glazing", GlazingType, LandlordGlazingOverrideRow, "Glazing", False, None),
        ("water_heating", WaterHeatingType, LandlordWaterHeatingOverrideRow, "Hot Water", False, None),
        ("main_heating_system", MainHeatingSystemType, LandlordMainHeatingSystemOverrideRow, "Heating", False, None),
    ]
    return [
        ComponentSpec(
            component=name,
            enum_cls=enum_cls,
            unknown=enum_cls.UNKNOWN,
            row_type=row_type,
            excel_header=header,
            per_building_part=per_part,
            extra_instructions=hint,
        )
        for name, enum_cls, row_type, header, per_part, hint in table
    ]
|
||||
|
||||
|
||||
def _specs_by_component() -> dict[str, ComponentSpec]:
    """Index the component specs by their component name."""
    lookup: dict[str, ComponentSpec] = {}
    for spec in _component_specs():
        lookup[spec.component] = spec
    return lookup
|
||||
|
||||
|
||||
def _norm(s: Any) -> str:
|
||||
"""Vocab key normalisation — mirrors the orchestrator (strip + lower)."""
|
||||
return str(s or "").strip().lower()
|
||||
|
||||
|
||||
def _split_entries(cell: Any, per_building_part: bool) -> list[str]:
|
||||
raw = "" if cell is None else str(cell)
|
||||
if not raw.strip():
|
||||
return []
|
||||
if not per_building_part:
|
||||
return [raw.strip()]
|
||||
return [part.strip() for part in raw.split(",") if part.strip()]
|
||||
|
||||
|
||||
def _load_rows(excel: str, sheet: str) -> list[dict[str, Any]]:
    """Read one sheet of the workbook and return it as a list of per-row dicts."""
    frame = pd.read_excel(excel, sheet_name=sheet)
    return frame.to_dict(orient="records")  # type: ignore[return-value]
|
||||
|
||||
|
||||
def _filter_rows(rows: list[dict[str, Any]], org_ref: Optional[str],
|
||||
limit: Optional[int]) -> list[dict[str, Any]]:
|
||||
"""Narrow to one property (--org-ref) or the first N rows (--limit) for a
|
||||
cheap smoke test before the full run."""
|
||||
if org_ref:
|
||||
rows = [r for r in rows if str(r.get(ORG_REF_COLUMN, "")).strip() == org_ref]
|
||||
if limit:
|
||||
rows = rows[:limit]
|
||||
return rows
|
||||
|
||||
|
||||
def _distinct_entries(rows: list[dict[str, Any]], spec: ComponentSpec) -> Counter[str]:
    """Count how often each description entry occurs in the component's Excel column."""
    return Counter(
        entry
        for record in rows
        for entry in _split_entries(record.get(spec.excel_header), spec.per_building_part)
    )
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
def list_values(_: argparse.Namespace) -> None:
    """Print, per component, a heading followed by each value allowed in an edit."""
    for spec in _component_specs():
        heading = f"\n## {spec.component} (Excel: {spec.excel_header})"
        value_lines = [f" {value}" for value in spec.allowed_values()]
        print(heading, *value_lines, sep="\n")
|
||||
|
||||
|
||||
def validate(args: argparse.Namespace) -> None:
    """Verify that every non-empty corrected_value in the edits CSV is an allowed value.

    Rows with an empty ``corrected_value`` are ignored. Each problem is logged
    with its CSV line number (the header is line 1).

    Raises:
        SystemExit: if at least one row names an unknown component or an
            invalid value.
    """
    specs = _specs_by_component()
    failures = 0
    with open(args.edits, newline="") as handle:
        for line_no, record in enumerate(csv.DictReader(handle), start=2):
            corrected = (record.get("corrected_value") or "").strip()
            if not corrected:
                continue
            component = (record.get("component") or "").strip()
            spec = specs.get(component)
            if spec is None:
                logger.error("row %d: unknown component %r", line_no, component)
                failures += 1
                continue
            if corrected in spec.allowed_values():
                continue
            suggestions = difflib.get_close_matches(corrected, spec.allowed_values(), n=2)
            if suggestions:
                advice = f" Did you mean: {suggestions}?"
            else:
                advice = " Run 'list-values' for the allowed set."
            logger.error("row %d [%s]: %r is not a valid value.%s",
                         line_no, component, corrected, advice)
            failures += 1
    if failures:
        raise SystemExit(f"{failures} invalid corrected_value(s) — fix them before apply-edits.")
    logger.info("All corrected values are valid enum values. ✓")
|
||||
|
||||
|
||||
def _db_session() -> Any:
    """Open a session on an engine configured from the process environment."""
    config = PostgresConfig.from_env(os.environ)
    engine = make_engine(config)
    return make_session(engine)
|
||||
|
||||
|
||||
def classify(args: argparse.Namespace) -> None:
    """PASS 1: classify descriptions not yet in the ledger and report the UNKNOWNs.

    For each component, the distinct Excel entries whose normalised form is not
    already in the portfolio's stored vocab are sent to the ChatGPT classifier,
    and the results are upserted into that component's landlord override table.
    Every entry whose value is missing or UNKNOWN (freshly classified or
    cached) is then written to ``UNKNOWNS_PATH`` for human review.
    """
    rows = _filter_rows(_load_rows(args.excel, args.sheet), args.org_ref, args.limit)
    logger.info("Classifying over %d row(s).", len(rows))
    chat_gpt = ChatGPT()
    session = _db_session()
    reader = LandlordOverrideReaderPostgresRepository(session)
    try:
        vocab = reader.load_for_portfolio(args.portfolio_id)  # {component: {desc: value}}
        # (component, description, occurrence count, allowed values joined by " | ")
        unknown_rows: list[tuple[str, str, int, str]] = []

        for spec in _component_specs():
            counts = _distinct_entries(rows, spec)
            known = vocab.get(spec.component, {})  # already-classified (cache)
            # Only entries with no stored value under their normalised key go to the classifier.
            to_classify = {d for d in counts if _norm(d) not in known}
            logger.info("%-22s %4d distinct | %4d cached | %4d to classify",
                        spec.component, len(counts), len(counts) - len(to_classify), len(to_classify))

            resolved: dict[str, Enum] = {}
            if to_classify:
                classifier: ChatGptColumnClassifier[Any] = ChatGptColumnClassifier(
                    chat_gpt, spec.enum_cls, spec.unknown, extra_instructions=spec.extra_instructions)
                resolved = classifier.classify(to_classify)
                repo: LandlordOverridesRepository[Any] = LandlordOverridesRepository(session, spec.row_type)
                # One commit per component.
                with commit_scope(session):
                    # store keyed on the normalised description (matches the reader/finaliser lookup)
                    repo.upsert_all(args.portfolio_id, {_norm(d): m for d, m in resolved.items()})

            # collect UNKNOWNs (freshly classified + anything cached as UNKNOWN) for review
            unk = spec.unknown.value
            for desc, n in counts.items():
                # Prefer this run's result; otherwise fall back to the stored value.
                v = resolved.get(desc).value if desc in resolved and resolved[desc] else known.get(_norm(desc))  # type: ignore[union-attr]
                if v is None or v == unk:
                    allowed = " | ".join(spec.allowed_values())
                    unknown_rows.append((spec.component, desc, n, allowed))

        with open(UNKNOWNS_PATH, "w", newline="") as f:
            w = csv.writer(f)
            w.writerow(["component", "description", "count", "corrected_value", "allowed_values"])
            # Most frequent descriptions first; corrected_value is left blank for the reviewer.
            for comp, desc, n, allowed in sorted(unknown_rows, key=lambda r: (-r[2])):
                w.writerow([comp, desc, n, "", allowed])
        logger.info("\nWrote %s — fill 'corrected_value' (must match 'allowed_values'), "
                    "then: validate -> apply-edits -> write.", UNKNOWNS_PATH)
    finally:
        session.close()
|
||||
|
||||
|
||||
def _upsert_user_corrections(session: Any, portfolio_id: int,
                             by_component: dict[str, dict[str, str]]) -> int:
    """Upsert validated human corrections as source=user (always wins on conflict).

    Args:
        session: Open DB session; the caller owns the transaction.
        portfolio_id: Portfolio whose ledger is updated.
        by_component: ``{component: {description: corrected value}}``.
            Descriptions are normalised with ``_norm`` before being stored.

    Returns:
        The number of ledger rows sent, counted after collapsing descriptions
        that normalise to the same key.
    """
    specs = _specs_by_component()
    n = 0
    now = datetime.now(timezone.utc)
    for comp, mapping in by_component.items():
        spec = specs[comp]
        table: Table = getattr(spec.row_type, "__table__")
        # Collapse descriptions that differ only in case/whitespace (last one
        # wins). Two rows with the same (portfolio_id, description) in a single
        # INSERT ... ON CONFLICT DO UPDATE make Postgres reject the statement.
        by_key = {_norm(d): v for d, v in mapping.items()}
        rows = [{"portfolio_id": portfolio_id, "description": key, "value": v,
                 "source": OverrideSource.USER, "created_at": now, "updated_at": now}
                for key, v in by_key.items()]
        if not rows:
            continue
        stmt = pg_insert(table).values(rows)
        stmt = stmt.on_conflict_do_update(
            index_elements=["portfolio_id", "description"],
            set_={"value": stmt.excluded.value, "source": stmt.excluded.source,
                  "updated_at": stmt.excluded.updated_at})
        session.execute(stmt)
        n += len(rows)
    return n
|
||||
|
||||
|
||||
def apply_edits(args: argparse.Namespace) -> None:
    """Validate the edits CSV, then upsert its corrections as source=user.

    Without ``--apply`` this is a dry run: the corrections are counted and
    reported, and no DB session is opened.

    Raises:
        SystemExit: from ``validate`` when the CSV contains invalid rows.
    """
    validate(args)  # fail before touching the DB
    specs = _specs_by_component()
    by_component: dict[str, dict[str, str]] = {}
    with open(args.edits, newline="") as f:
        for r in csv.DictReader(f):
            val = (r.get("corrected_value") or "").strip()
            # Read the component the same way validate() does, so a row that
            # passed validation is never silently skipped over stray whitespace.
            comp = (r.get("component") or "").strip()
            if val and comp in specs:
                by_component.setdefault(comp, {})[r["description"]] = val

    if not args.apply:
        total = sum(len(m) for m in by_component.values())
        logger.info("DRY RUN — %d user corrections ready. Re-run with --apply.", total)
        return

    session = _db_session()
    try:
        with commit_scope(session):
            n = _upsert_user_corrections(session, args.portfolio_id, by_component)
        logger.info("Upserted %d user corrections (source=user).", n)
    finally:
        session.close()
|
||||
|
||||
|
||||
def _org_ref_to_property_id(session: Any, portfolio_id: int) -> dict[str, int]:
    """Map each landlord property reference in the portfolio to its property id.

    The query excludes rows whose ``landlord_property_id`` is NULL. Keys are the
    stringified, whitespace-stripped references; values are cast to ``int``.
    If the same reference appears on several rows, the last row returned wins.
    """
    query = text("SELECT landlord_property_id, id FROM property "
                 "WHERE portfolio_id = :pid AND landlord_property_id IS NOT NULL")
    ids_by_ref: dict[str, int] = {}
    for landlord_ref, property_id in session.execute(query, {"pid": portfolio_id}):
        ids_by_ref[str(landlord_ref).strip()] = int(property_id)
    return ids_by_ref
|
||||
|
||||
|
||||
def write(args: argparse.Namespace) -> None:
    """Resolve spreadsheet descriptions to override values and upsert them.

    Loads the (optionally filtered) spreadsheet rows, maps each row to a
    property id via its org_ref, and for every component spec looks up each
    description entry in the portfolio's stored vocabulary. Resolved entries
    become ``PropertyOverrideInsert`` rows; they are only written when
    ``args.apply`` is set, otherwise the first ten are logged as a dry run.
    """
    rows = _filter_rows(_load_rows(args.excel, args.sheet), args.org_ref, args.limit)
    logger.info("Writing over %d row(s).", len(rows))
    session = _db_session()
    reader = LandlordOverrideReaderPostgresRepository(session)
    try:
        # vocab is indexed by component, then by normalised description (see lookups below).
        vocab = reader.load_for_portfolio(args.portfolio_id)
        org_ref_map = _org_ref_to_property_id(session, args.portfolio_id)
        logger.info("Portfolio %d: %d properties with org_ref.", args.portfolio_id, len(org_ref_map))

        inserts: list[PropertyOverrideInsert] = []
        # Counts of rows skipped because the org_ref matched no property.
        unmatched: Counter[str] = Counter()
        # Counts of "<component>: <entry>" pairs with no usable vocabulary value.
        unresolved: Counter[str] = Counter()
        for row in rows:
            org_ref = str(row.get(ORG_REF_COLUMN, "")).strip()
            property_id = org_ref_map.get(org_ref)
            if property_id is None:
                unmatched[org_ref] += 1
                continue
            for spec in _component_specs():
                comp_vocab = vocab.get(spec.component, {})
                # The position of each split entry is used as the building_part index.
                for building_part, entry in enumerate(
                        _split_entries(row.get(spec.excel_header), spec.per_building_part)):
                    value = comp_vocab.get(_norm(entry))
                    # Missing and explicitly-unknown values are both left for human review.
                    if not value or value == spec.unknown.value:
                        unresolved[f"{spec.component}: {entry}"] += 1
                        continue
                    inserts.append(PropertyOverrideInsert(
                        property_id=property_id, portfolio_id=args.portfolio_id,
                        building_part=building_part, override_component=spec.component,
                        override_value=value, original_spreadsheet_description=entry))

        logger.info("Built %d rows | %d unmatched org_refs | %d unresolved",
                    len(inserts), sum(unmatched.values()), sum(unresolved.values()))
        if unresolved:
            logger.info("Top unresolved (need apply-edits): %s", unresolved.most_common(10))
        if not args.apply:
            # Dry run: show a sample of what would be written, then stop.
            logger.info("DRY RUN — not writing. Re-run with --apply.")
            for ins in inserts[:10]:
                logger.info(" %s", ins)
            return
        with commit_scope(session):
            affected = PropertyOverridePostgresRepository(session).upsert_all(inserts)
        logger.info("Upserted %d property_overrides.", affected)
    finally:
        session.close()
|
||||
|
||||
|
||||
def verify(args: argparse.Namespace) -> None:
    """For one property (by org_ref): show the persisted property_overrides rows
    and the EpcSimulation overlays they produce — the end-to-end proof that the
    chain reaches the SAP overlay surface.

    Raises:
        SystemExit: If ``args.org_ref`` matches no property in
            ``args.portfolio_id``.
    """
    session = _db_session()
    try:
        org_ref_map = _org_ref_to_property_id(session, args.portfolio_id)
        # The map's keys are whitespace-stripped strings, so normalise the CLI
        # value the same way; otherwise a padded --org-ref never matches.
        org_ref = str(args.org_ref).strip()
        property_id = org_ref_map.get(org_ref)
        if property_id is None:
            raise SystemExit(f"org_ref {args.org_ref!r} not found in portfolio {args.portfolio_id}.")
        reader = PropertyOverridesPostgresReader(lambda: session)
        resolved = reader.overrides_for(property_id)
        logger.info("property_id %d — %d property_overrides rows:", property_id, len(resolved.rows))
        for r in resolved.rows:
            logger.info(" part %d | %-22s = %s", r.building_part, r.override_component, r.override_value)
        overlays = overlays_from(resolved)
        logger.info("\n-> %d EpcSimulation overlay(s) produced (what the SAP calc applies):", len(overlays))
        for o in overlays:
            logger.info(" %s", o)
    finally:
        session.close()
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and dispatch to the selected sub-command.

    Sub-commands: list-values, validate, classify, apply-edits, write, verify.
    Each sub-parser stores its handler under ``func``; the handler is then
    called with the parsed namespace.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    commands = parser.add_subparsers(dest="cmd", required=True)

    def add_spreadsheet_options(command: argparse.ArgumentParser) -> None:
        # Options shared by the two sub-commands that read the landlord workbook.
        command.add_argument("--excel", required=True)
        command.add_argument("--sheet", default="AddressProfilingResults")
        command.add_argument("--portfolio-id", type=int, required=True)
        command.add_argument("--org-ref", default=None, help="smoke test: only this property's org_ref")
        command.add_argument("--limit", type=int, default=None, help="smoke test: first N rows")

    commands.add_parser("list-values").set_defaults(func=list_values)

    validate_cmd = commands.add_parser("validate")
    validate_cmd.add_argument("--edits", required=True)
    validate_cmd.set_defaults(func=validate)

    classify_cmd = commands.add_parser("classify")
    add_spreadsheet_options(classify_cmd)
    classify_cmd.set_defaults(func=classify)

    apply_edits_cmd = commands.add_parser("apply-edits")
    apply_edits_cmd.add_argument("--edits", required=True)
    apply_edits_cmd.add_argument("--portfolio-id", type=int, required=True)
    apply_edits_cmd.add_argument("--apply", action="store_true")
    apply_edits_cmd.set_defaults(func=apply_edits)

    write_cmd = commands.add_parser("write")
    add_spreadsheet_options(write_cmd)
    write_cmd.add_argument("--apply", action="store_true")
    write_cmd.set_defaults(func=write)

    verify_cmd = commands.add_parser("verify")
    verify_cmd.add_argument("--portfolio-id", type=int, required=True)
    verify_cmd.add_argument("--org-ref", required=True)
    verify_cmd.set_defaults(func=verify)

    namespace = parser.parse_args()
    namespace.func(namespace)
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
53
scripts/hyde/unknowns_review.md
Normal file
53
scripts/hyde/unknowns_review.md
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
# Hyde portfolio 796 — UNKNOWN overrides for review
|
||||
|
||||
After ChatGPT classification, **19 distinct descriptions** did not auto-resolve (out of ~440 distinct descriptions across all components). They are grouped below by component, each with a **proposed value** (which must be one of the allowed enum values) and the number of rows it affects. Nothing is written to the DB until these are confirmed.
|
||||
|
||||
## 1. construction_age_band — 29,829 rows (DETERMINISTIC, no judgement)
|
||||
|
||||
The classifier didn't extract the band letter in batch, but the band IS the leading letter, so these are mapped mechanically (`"D: 1950-1966"` → `D`). Just confirm the approach.
|
||||
|
||||
| description | → band | rows |
|
||||
|---|---|---|
|
||||
| D: 1950-1966 | `D` | 4,978 |
|
||||
| K: 2007-2011 | `K` | 4,201 |
|
||||
| I: 1996-2002 | `I` | 3,708 |
|
||||
| B: 1900-1929 | `B` | 3,222 |
|
||||
| H: 1991-1995 | `H` | 2,747 |
|
||||
| E: 1967-1975 | `E` | 2,479 |
|
||||
| J: 2003-2006 | `J` | 2,221 |
|
||||
| F: 1976-1982 | `F` | 2,071 |
|
||||
| C: 1930-1949 | `C` | 1,840 |
|
||||
| G: 1983-1990 | `G` | 1,615 |
|
||||
| A: pre-1900 | `A` | 615 |
|
||||
| M: 2023 onwards | `M` | 132 |
|
||||
|
||||
## 2. roof_type (flat roofs) — 1,473 rows (NEEDS KHALIM'S CALL)
|
||||
|
||||
Flat-roof insulation drives the SAP roof U-value. **`Flat: As Built` (1,172) + `Flat: Unknown` (194) are the load-bearing decision** — proposed conservatively as *no insulation (assumed)*.
|
||||
|
||||
| description | proposed value | rows | alt options |
|
||||
|---|---|---|---|
|
||||
| Flat: As Built | `Flat, no insulation (assumed)` | 1,172 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
|
||||
| Flat: Unknown | `Flat, no insulation (assumed)` | 194 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
|
||||
| Flat: 150mm | `Flat, insulated` | 59 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
|
||||
| Flat: 100mm | `Flat, insulated` | 32 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
|
||||
| Flat: 50mm | `Flat, limited insulation` | 13 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
|
||||
| SameDwellingAbove | `(same dwelling above)` | 3 | Flat, insulated (assumed) / Flat, limited insulation (assumed) / Flat, no insulation (assumed) |
|
||||
|
||||
## 3. wall_type — 7 rows
|
||||
|
||||
| description | proposed value | rows |
|
||||
|---|---|---|
|
||||
| TimberFrame: Internal | `Timber frame, with additional insulation` | 7 |
|
||||
|
||||
## How to apply after review
|
||||
|
||||
Edit the `corrected_value` column of `overrides_edits.csv`, then:
|
||||
|
||||
```
|
||||
python scripts/hyde/build_property_overrides.py validate --edits overrides_edits.csv
|
||||
python scripts/hyde/build_property_overrides.py apply-edits --edits overrides_edits.csv --portfolio-id 796 --apply
|
||||
python scripts/hyde/build_property_overrides.py write --excel scripts/hyde/hyde_property_overrides.xlsx --portfolio-id 796 --apply
|
||||
```
|
||||
|
||||
> Note: a proper fix for the age classifier (a prompt hint so the production lambda extracts the band letter) is a separate follow-up; these script edits handle this run.
|
||||
159
scripts/hyde_epc_schema_versions.py
Normal file
159
scripts/hyde_epc_schema_versions.py
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
"""Tally the EPC schema versions across the hyde list (manipulation_filled UPRNs).
|
||||
|
||||
For every resolved UPRN we look up its EPC certificate's ``schemaType`` (e.g.
|
||||
``RdSAP-Schema-21.0.1``, ``RdSAP-Schema-17.1``, ``SAP-Schema-16.2``). The
|
||||
gov EPC ``/api/domestic/search`` endpoint returns ``schemaType`` per row, so one
|
||||
search-per-postcode covers every UPRN in that postcode — far cheaper than a
|
||||
certificate fetch per UPRN. The latest cert (max registrationDate) wins per UPRN.
|
||||
|
||||
Outputs: a per-schema-version tally with one example UPRN each, plus a CSV
|
||||
mapping every UPRN -> schema version.
|
||||
|
||||
python -m scripts.hyde_epc_schema_versions
|
||||
python -m scripts.hyde_epc_schema_versions --workers 8 --out scripts/hyde_schema_versions.csv
|
||||
|
||||
Reads OPEN_EPC_API_TOKEN from backend/.env. Run from the worktree root.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from collections import Counter, defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import httpx
|
||||
from dotenv import load_dotenv
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
|
||||
|
||||
from scripts.fill_domna_addresses import clean_postcode # noqa: E402
|
||||
from scripts.finalise_to_property_table import load_rows # noqa: E402
|
||||
|
||||
_BASE = "https://api.get-energy-performance-data.communities.gov.uk"
|
||||
_SEARCH = f"{_BASE}/api/domestic/search"
|
||||
NOT_IN_EPC = "NOT_IN_EPC"
|
||||
|
||||
_DEFAULT_IN = _REPO_ROOT / "scripts" / "manipulation_filled.xlsx"
|
||||
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "hyde_schema_versions.csv"
|
||||
|
||||
|
||||
def _retry_after_seconds(value: Optional[str], default: float = 2.0) -> float:
    """Convert a ``Retry-After`` header value into a non-negative delay in seconds.

    The header may carry either a number of seconds or an HTTP-date. A missing
    or unparseable value falls back to ``default``; a date in the past gives 0.
    """
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    if not value:
        return default
    try:
        return max(0.0, float(value))
    except ValueError:
        pass  # not a plain number — try the HTTP-date form next
    try:
        when = parsedate_to_datetime(value)
    except (TypeError, ValueError):
        return default
    if when.tzinfo is None:
        when = when.replace(tzinfo=timezone.utc)
    return max(0.0, (when - datetime.now(timezone.utc)).total_seconds())


def search_postcode(
    client: httpx.Client, postcode: str, headers: dict[str, str]
) -> list[dict[str, Any]]:
    """Return the search rows for a postcode, retrying on rate-limit (429).

    Args:
        client: HTTP client used for the GET request.
        postcode: Postcode sent as the ``postcode`` query parameter.
        headers: Request headers (carries the API authorisation).

    Returns:
        The ``data`` list from the JSON response. An empty list is returned for
        a 400 or 404 response, for a response whose ``data`` is missing or
        null, and when every attempt was rate-limited.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for any other error status.
    """
    max_attempts = 5
    for attempt in range(max_attempts):
        resp = client.get(_SEARCH, params={"postcode": postcode}, headers=headers, timeout=30)
        if resp.status_code == 429:
            if attempt + 1 < max_attempts:
                # Back off linearly: the server's hint (capped at 10s) times the attempt number.
                delay = _retry_after_seconds(resp.headers.get("Retry-After"))
                time.sleep(min(delay, 10) * (attempt + 1))
            continue
        # 400 = malformed postcode (data-entry typo), 404 = no certs — skip both.
        if resp.status_code in (400, 404):
            return []
        resp.raise_for_status()
        # Guard against an explicit null so callers can always iterate the result.
        return resp.json().get("data") or []
    # Surface the give-up: an empty result here is indistinguishable from "no certificates".
    print(f" rate-limited on postcode {postcode!r}; gave up after {max_attempts} attempts")
    return []
|
||||
|
||||
|
||||
def build_uprn_schema_map(
    postcodes: list[str], token: str, workers: int
) -> dict[int, tuple[str, str]]:
    """Map UPRN -> (schemaType, registrationDate) for the latest cert per UPRN.

    One search per postcode (concurrent); later we look our UPRNs up in here.

    Args:
        postcodes: Postcodes to search, one request each.
        token: Bearer token for the EPC API.
        workers: Size of the thread pool used for the searches.

    Returns:
        ``{uprn: (schemaType, registrationDate)}``. Rows with no schema, or with
        a UPRN that is missing, blank or not an integer, are skipped.
    """
    headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
    by_uprn: dict[int, tuple[str, str]] = {}
    done = 0
    total = len(postcodes)

    def fetch(pc: str) -> list[dict[str, Any]]:
        with httpx.Client() as client:
            return search_postcode(client, pc, headers)

    with ThreadPoolExecutor(max_workers=workers) as pool:
        # pool.map yields results in input order; the merge below runs in this thread only.
        for rows in pool.map(fetch, postcodes):
            for row in rows:
                schema = row.get("schemaType")
                if not schema:
                    continue
                try:
                    uprn = int(row.get("uprn"))
                except (TypeError, ValueError):
                    # A certificate without a usable UPRN cannot be matched to a
                    # property, and must not abort the whole run.
                    continue
                reg = str(row.get("registrationDate") or "")
                prev = by_uprn.get(uprn)
                # Keep the latest-registered cert's schema for this UPRN.
                # Dates are compared as strings, so this assumes a sortable
                # (ISO-like) format — TODO confirm against the API.
                if prev is None or reg > prev[1]:
                    by_uprn[uprn] = (str(schema), reg)
            done += 1
            if done % 250 == 0:
                print(f" searched {done}/{total} postcodes, {len(by_uprn)} uprns seen")
    return by_uprn
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
    """Parse the command line: ``--in`` (stored as ``inp``), ``--out`` and ``--workers``."""
    cli = argparse.ArgumentParser(description=__doc__)
    option_table = (
        ("--in", {"dest": "inp", "type": Path, "default": _DEFAULT_IN}),
        ("--out", {"type": Path, "default": _DEFAULT_OUT}),
        ("--workers", {"type": int, "default": 8}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Tally EPC schema versions for the hyde UPRNs and write the per-UPRN CSV.

    Returns 2 when OPEN_EPC_API_TOKEN is not set, otherwise 0 after printing
    the tally and writing the mapping to ``args.out``.
    """
    args = _parse_args()
    load_dotenv(_REPO_ROOT / "backend" / ".env")
    token = os.environ.get("OPEN_EPC_API_TOKEN")
    if not token:
        print("OPEN_EPC_API_TOKEN not set (backend/.env)")
        return 2

    _, sheet_rows = load_rows(args.inp, include_unmatched=False)
    # (uprn, postcode_clean, address) for every row that carries a UPRN.
    pairs: list[tuple[int, str, str]] = [
        (int(r["address2uprn_uprn"]), clean_postcode(r["postcode"]), r["address2uprn_address"])
        for r in sheet_rows
        if r["address2uprn_uprn"]
    ]
    postcodes = sorted({pc for _, pc, _ in pairs if pc})
    print(f"{len(pairs)} UPRNs across {len(postcodes)} unique postcodes")

    by_uprn = build_uprn_schema_map(postcodes, token, args.workers)
    print(f"EPC search returned schema for {len(by_uprn)} distinct UPRNs")

    # Resolve each hyde UPRN to its schema version; only the first row per UPRN counts.
    tally: Counter[str] = Counter()
    example: dict[str, tuple[int, str]] = {}
    out_lines: list[tuple[int, str, str, str]] = []  # uprn, schema, postcode, address
    seen: set[int] = set()
    for uprn, pc, address in pairs:
        if uprn not in seen:
            seen.add(uprn)
            found = by_uprn.get(uprn, (NOT_IN_EPC, ""))
            schema = found[0]
            tally[schema] += 1
            example.setdefault(schema, (uprn, address))
            out_lines.append((uprn, schema, pc, address))

    # Write the full per-UPRN mapping.
    import csv

    with args.out.open("w", newline="", encoding="utf-8") as out_file:
        sheet = csv.writer(out_file)
        sheet.writerow(["uprn", "schema_version", "postcode", "matched_address"])
        sheet.writerows(out_lines)

    print(f"\nSchema versions across {len(seen)} distinct UPRNs:\n")
    print(f" {'schema version':<26} {'count':>7} example UPRN")
    print(f" {'-'*26} {'-'*7} {'-'*12}")
    for schema, count in tally.most_common():
        ex_uprn, ex_addr = example[schema]
        print(f" {schema:<26} {count:>7} {ex_uprn} ({ex_addr})")
    print(f"\nFull mapping -> {args.out}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
169
scripts/lisasrequest/compare_to_ara.py
Normal file
169
scripts/lisasrequest/compare_to_ara.py
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
"""Compare our step-1 UPRN resolution against the old "Ara output" data.
|
||||
|
||||
The Ara data lives in scripts/lisasrequest/Durkan data.xlsx, sheet "Ara output",
|
||||
and carries UPRNs from our previous dataset. It is NOT treated as ground truth —
|
||||
this just lines it up against what we found / didn't find so a human can eyeball
|
||||
the differences. (We read the xlsx, not the CSV export: the CSV mangled half the
|
||||
UPRNs to Excel scientific notation, e.g. ``1.00023E+11``; the xlsx keeps them
|
||||
intact, so every comparison below is exact.)
|
||||
|
||||
Join key is (postcode, leading number, first street word), since the UPRN is the
|
||||
thing under comparison and Ara's address strings differ from the landlord input.
|
||||
|
||||
Each of our rows lands in one comparison bucket:
|
||||
match both found a UPRN and they are equal.
|
||||
differ both found a UPRN and they differ.
|
||||
we_only we resolved a UPRN, Ara had none for this address.
|
||||
ara_only we did NOT resolve, but Ara had a UPRN <- recovery candidates.
|
||||
both_missing neither resolved a UPRN.
|
||||
no_ara_record the Ara sheet had no row matching this address at all.
|
||||
|
||||
python -m scripts.lisasrequest.compare_to_ara
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter, OrderedDict
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
|
||||
ADDRESS_COL = "address"
|
||||
POSTCODE_COL = "postcode"
|
||||
OUR_UPRN_COL = "domna_address_uprn"
|
||||
OUR_SOURCE_COL = "domna_source"
|
||||
|
||||
ARA_UPRN_COL = "EPC_B.uprn"
|
||||
ARA_ADDRESS_COL = "EPC_B.address"
|
||||
ARA_POSTCODE_COL = "EPC_B.postcode"
|
||||
ARA_SHEET = "Ara output"
|
||||
|
||||
_OUR_IN = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_domna_filled.csv"
|
||||
_ARA_IN = _REPO_ROOT / "scripts" / "lisasrequest" / "Durkan data.xlsx"
|
||||
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_vs_ara.csv"
|
||||
|
||||
Key = tuple[str, str, str]
|
||||
|
||||
|
||||
def norm_key(address: str, postcode: str) -> Key:
    """Build the join key: (postcode without spaces, first number, first word).

    Everything is upper-cased. The number is the first run of digits with an
    optional single trailing letter; the word is the first run of letters that
    is not exactly ``FLAT``. Either part is ``""`` when the address has none.
    """
    shouted = address.upper()
    first_number = re.search(r"\d+[A-Z]?", shouted)
    first_word = next(
        (word for word in re.findall(r"[A-Z]+", shouted) if word != "FLAT"), ""
    )
    return (
        postcode.upper().replace(" ", ""),
        first_number.group(0) if first_number else "",
        first_word,
    )
|
||||
|
||||
|
||||
def load_ara(path: Path) -> tuple[dict[Key, dict[str, str]], int]:
    """Load the Ara-output sheet and index its rows by join key.

    Cells are read as strings (so UPRNs keep their full value) and blanks become
    ``""``. Rows without an address are skipped. When several rows share a join
    key the first one is kept and the rest are counted as duplicates.

    Returns:
        ``(index, duplicates)`` — the key-to-row index and the number of rows
        dropped as duplicates.
    """
    sheet = pd.read_excel(path, sheet_name=ARA_SHEET, dtype=str)
    records: list[dict[str, str]] = sheet.fillna("").to_dict(orient="records")
    by_key: dict[Key, dict[str, str]] = OrderedDict()
    duplicate_count = 0
    for record in records:
        ara_address = str(record.get(ARA_ADDRESS_COL) or "").strip()
        if not ara_address:
            continue
        # Prefer the Ara postcode column; fall back to the plain postcode column.
        raw_postcode = record.get(ARA_POSTCODE_COL) or record.get(POSTCODE_COL) or ""
        join_key = norm_key(ara_address, str(raw_postcode).strip())
        if join_key not in by_key:
            by_key[join_key] = record
        else:
            duplicate_count += 1
    return by_key, duplicate_count
|
||||
|
||||
|
||||
def classify(
    our_uprn: str, our_found: bool, ara: Optional[dict[str, str]]
) -> tuple[str, str, str]:
    """Bucket one of our rows against its Ara record.

    Returns ``(comparison, ara_uprn, ara_address)``. ``comparison`` is
    ``no_ara_record`` when ``ara`` is None; otherwise one of ``match``,
    ``differ``, ``we_only``, ``ara_only`` or ``both_missing`` depending on which
    side has a UPRN and whether the two are equal as strings.
    """
    if ara is None:
        return ("no_ara_record", "", "")

    ara_uprn = (ara.get(ARA_UPRN_COL) or "").strip()
    ara_address = (ara.get(ARA_ADDRESS_COL) or "").strip()

    if not ara_uprn:
        verdict = "we_only" if our_found else "both_missing"
    elif not our_found:
        verdict = "ara_only"
    elif our_uprn == ara_uprn:
        verdict = "match"
    else:
        verdict = "differ"
    return (verdict, ara_uprn, ara_address)
|
||||
|
||||
|
||||
def compare(
    our_rows: list[dict[str, str]], ara_index: dict[Key, dict[str, str]]
) -> list[dict[str, str]]:
    """Line each of our rows up against the Ara index and label the outcome.

    A row counts as resolved on our side when it has a UPRN and its source is
    not ``not_found``. The Ara record is looked up by the row's join key. One
    output dict is produced per input row, in input order.
    """

    def cell(record: dict[str, str], column: str) -> str:
        # Missing and None cells read as "", everything else is stripped.
        return (record.get(column) or "").strip()

    compared: list[dict[str, str]] = []
    for record in our_rows:
        address = cell(record, ADDRESS_COL)
        postcode = cell(record, POSTCODE_COL)
        our_uprn = cell(record, OUR_UPRN_COL)
        our_source = cell(record, OUR_SOURCE_COL)
        resolved = our_source != "not_found" and bool(our_uprn)

        ara_record = ara_index.get(norm_key(address, postcode))
        comparison, ara_uprn, ara_address = classify(our_uprn, resolved, ara_record)
        entry = {
            "address": address,
            "postcode": postcode,
            "our_uprn": our_uprn,
            "our_source": our_source,
            "ara_uprn": ara_uprn,
            "ara_address": ara_address,
            "comparison": comparison,
        }
        compared.append(entry)
    return compared
|
||||
|
||||
|
||||
def main() -> int:
    """Compare our resolved UPRNs with the Ara sheet and write the comparison CSV.

    Reads our rows from ``--ours`` and the Ara index from ``--ara``, writes one
    comparison row per input row to ``--out``, prints the per-bucket counts and
    returns 0. An input with no data rows yields a header-only CSV.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--ours", type=Path, default=_OUR_IN)
    parser.add_argument("--ara", type=Path, default=_ARA_IN)
    parser.add_argument("--out", type=Path, default=_DEFAULT_OUT)
    args = parser.parse_args()

    with args.ours.open(newline="", encoding="utf-8-sig") as fh:
        our_rows = [dict(r) for r in csv.DictReader(fh)]
    ara_index, dupes = load_ara(args.ara)
    print(f"Loaded {len(our_rows)} of our rows; {len(ara_index)} Ara keys "
          f"({dupes} duplicate Ara rows ignored).")

    result = compare(our_rows, ara_index)
    # Take the header from the first result row when there is one; with no rows
    # fall back to the fixed column list so result[0] is never indexed on an
    # empty list.
    if result:
        fieldnames = list(result[0].keys())
    else:
        fieldnames = [
            "address",
            "postcode",
            "our_uprn",
            "our_source",
            "ara_uprn",
            "ara_address",
            "comparison",
        ]
    with args.out.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(result)

    counts = Counter(r["comparison"] for r in result)
    print(f"\nComparison of {len(result)} rows -> {args.out}")
    for name in (
        "match",
        "differ",
        "we_only",
        "ara_only",
        "both_missing",
        "no_ara_record",
    ):
        print(f" {name}: {counts.get(name, 0)}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
142
scripts/lisasrequest/durkan_805_schema_check.py
Normal file
142
scripts/lisasrequest/durkan_805_schema_check.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
"""EPC SAP-schema check for portfolio 805, and whether each is mapper-supported.
|
||||
|
||||
For every UPRN currently in the ``property`` table for portfolio 805, look up its
|
||||
latest EPC certificate's ``schemaType`` (one /api/domestic/search per postcode,
|
||||
reusing scripts.hyde_epc_schema_versions) and check it against the schemas the
|
||||
EpcPropertyData mapper actually handles
|
||||
(``EpcPropertyDataMapper.from_api_response``, datatypes/epc/domain/mapper.py).
|
||||
|
||||
Prints a per-schema tally with a supported? flag and an example UPRN, and writes
|
||||
the full per-UPRN mapping to durkan_805_schema_check.csv.
|
||||
|
||||
python -m scripts.lisasrequest.durkan_805_schema_check
|
||||
python -m scripts.lisasrequest.durkan_805_schema_check --portfolio 805 --workers 8
|
||||
|
||||
Reads OPEN_EPC_API_TOKEN from backend/.env and POSTGRES_* from the root .env.
|
||||
Run from the worktree root.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from sqlmodel import select
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
|
||||
|
||||
from infrastructure.postgres.config import PostgresConfig # noqa: E402
|
||||
from infrastructure.postgres.engine import make_engine, make_session # noqa: E402
|
||||
from infrastructure.postgres.property_table import PropertyRow # noqa: E402
|
||||
from scripts.fill_domna_addresses import clean_postcode # noqa: E402
|
||||
from scripts.hyde_epc_schema_versions import ( # noqa: E402
|
||||
NOT_IN_EPC,
|
||||
build_uprn_schema_map,
|
||||
)
|
||||
|
||||
# Schemas EpcPropertyDataMapper.from_api_response dispatches on (everything else
# raises "Unsupported EPC schema"). Keep in sync with mapper.py:2539-2603.
# NOTE(review): this is a hand-maintained copy; membership is tested with an
# exact string match against the API's schemaType value.
SUPPORTED_SCHEMAS = frozenset(
    {
        "RdSAP-Schema-17.0",
        "RdSAP-Schema-17.1",
        "RdSAP-Schema-18.0",
        "RdSAP-Schema-19.0",
        "RdSAP-Schema-20.0.0",
        "RdSAP-Schema-21.0.0",
        "RdSAP-Schema-21.0.1",
        "SAP-Schema-16.0",
        "SAP-Schema-16.2",
        "SAP-Schema-16.3",
        "SAP-Schema-17.0",
        "SAP-Schema-17.1",
        "SAP-Schema-18.0.0",
    }
)
|
||||
|
||||
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_805_schema_check.csv"
|
||||
|
||||
|
||||
def load_portfolio_uprns(portfolio_id: int) -> list[tuple[int, str]]:
    """Fetch (uprn, postcode) for every property in the portfolio that has a UPRN.

    Loads the root ``.env``, opens a session from the environment's Postgres
    settings, and always closes it. A NULL postcode is returned as ``""``.
    """
    load_dotenv(_REPO_ROOT / ".env")
    pg_config = PostgresConfig.from_env(os.environ)
    db = make_session(make_engine(pg_config))
    try:
        query = select(PropertyRow.uprn, PropertyRow.postcode).where(
            PropertyRow.portfolio_id == portfolio_id
        )
        return [
            (int(uprn), str(postcode or ""))
            for uprn, postcode in db.exec(query).all()
            if uprn is not None
        ]
    finally:
        db.close()
|
||||
|
||||
|
||||
def main() -> int:
    """Check every UPRN in a portfolio against the mapper-supported EPC schemas.

    Looks up the latest certificate schema per UPRN (one search per postcode),
    writes the per-UPRN result to ``--out`` and prints a per-schema tally.
    Returns 2 when OPEN_EPC_API_TOKEN is not set, otherwise 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--portfolio", type=int, default=805)
    parser.add_argument("--out", type=Path, default=_DEFAULT_OUT)
    parser.add_argument("--workers", type=int, default=8)
    args = parser.parse_args()

    load_dotenv(_REPO_ROOT / "backend" / ".env")
    token = os.environ.get("OPEN_EPC_API_TOKEN")
    if not token:
        print("OPEN_EPC_API_TOKEN not set (backend/.env)")
        return 2

    pairs = load_portfolio_uprns(args.portfolio)
    # Filter on the CLEANED postcode: a raw value that cleans to nothing must
    # not be searched for, and a None among strings would make sorted() fail.
    cleaned_postcodes = (clean_postcode(pc) for _, pc in pairs if pc)
    postcodes = sorted({pc for pc in cleaned_postcodes if pc})
    print(
        f"Portfolio {args.portfolio}: {len(pairs)} UPRNs across "
        f"{len(postcodes)} unique postcodes"
    )

    by_uprn = build_uprn_schema_map(postcodes, token, args.workers)
    print(f"EPC search returned a schema for {len(by_uprn)} distinct UPRNs")

    tally: Counter[str] = Counter()
    example: dict[str, int] = {}
    rows_out: list[tuple[int, str, str, str]] = []  # uprn, schema, supported, postcode
    seen: set[int] = set()
    for uprn, pc in pairs:
        # Only the first row per UPRN is counted.
        if uprn in seen:
            continue
        seen.add(uprn)
        schema = by_uprn.get(uprn, (NOT_IN_EPC, ""))[0]
        supported = "yes" if schema in SUPPORTED_SCHEMAS else "no"
        tally[schema] += 1
        example.setdefault(schema, uprn)
        rows_out.append((uprn, schema, supported, clean_postcode(pc)))

    with args.out.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(["uprn", "schema_version", "mapper_supported", "postcode"])
        writer.writerows(rows_out)

    supported_count = sum(c for s, c in tally.items() if s in SUPPORTED_SCHEMAS)
    print(f"\nSchema versions across {len(seen)} distinct UPRNs in portfolio "
          f"{args.portfolio}:\n")
    print(f" {'schema version':<26} {'count':>5} {'supported?':<10} example UPRN")
    print(f" {'-' * 26} {'-' * 5} {'-' * 10} {'-' * 12}")
    for schema, count in tally.most_common():
        # Upper-case "NO" in the console table; the CSV above uses lower-case "no".
        supported = "yes" if schema in SUPPORTED_SCHEMAS else "NO"
        print(f" {schema:<26} {count:>5} {supported:<10} {example[schema]}")
    print(
        f"\nMapper-supported: {supported_count}/{len(seen)} UPRNs. "
        f"Full mapping -> {args.out}"
    )
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
200
scripts/lisasrequest/fill_domna_address.py
Normal file
200
scripts/lisasrequest/fill_domna_address.py
Normal file
|
|
@ -0,0 +1,200 @@
|
|||
"""Step 1 (Durkan portfolio): resolve a UPRN per CSV row via EPC then OS.
|
||||
|
||||
Input: scripts/lisasrequest/260611_Sample_Seed_Portfolio_Durkan_split_addresses(Split Addresses).csv
|
||||
columns include ``address`` and ``postcode``.
|
||||
|
||||
Every row carries an address and none carry a UPRN, so there is a single case:
|
||||
|
||||
* resolve a UPRN from ``address`` + ``postcode`` via the EPC API (relaxed
|
||||
address variants, threshold 0.7), then Ordnance Survey Places as a fallback
|
||||
(threshold 0.6).
|
||||
* not resolvable -> domna_source = "not_found"; uprn/address/score left empty.
|
||||
|
||||
Writes a NEW CSV = every original column, in order, plus four DOMNA columns:
|
||||
|
||||
domna_address_found the canonical address EPC/OS returned (matched string)
|
||||
domna_address_uprn the resolved UPRN ("" when unresolved)
|
||||
domna_lexiscore the match score in [0, 1] ("" when unresolved)
|
||||
domna_source epc / ordnance_survey / not_found
|
||||
|
||||
This is the human-review file; step 2 (resolve_uprns_for_finaliser) reshapes it
|
||||
into the finaliser columns without re-hitting the APIs.
|
||||
|
||||
python -m scripts.lisasrequest.fill_domna_address
|
||||
python -m scripts.lisasrequest.fill_domna_address --limit 20 # smoke test
|
||||
|
||||
Resolution reuses the relaxed matchers from scripts.fill_domna_addresses. Keys
|
||||
come from backend/.env (OPEN_EPC_API_TOKEN, ORDNANCE_SURVEY_API_KEY). Run from
|
||||
the worktree root (import trap).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
|
||||
|
||||
from scripts.fill_domna_addresses import ( # noqa: E402
|
||||
Hit,
|
||||
resolve_epc_relaxed,
|
||||
resolve_os_relaxed,
|
||||
)
|
||||
from scripts.resolve_uprns_for_finaliser import clean_postcode, load_keys # noqa: E402
|
||||
|
||||
ADDRESS_COL = "address"
|
||||
POSTCODE_COL = "postcode"
|
||||
FOUND_ADDRESS_COL = "domna_address_found"
|
||||
FOUND_UPRN_COL = "domna_address_uprn"
|
||||
LEXISCORE_COL = "domna_lexiscore"
|
||||
SOURCE_COL = "domna_source"
|
||||
NOT_FOUND = "not_found"
|
||||
_RESULT_COLS = (FOUND_ADDRESS_COL, FOUND_UPRN_COL, LEXISCORE_COL, SOURCE_COL)
|
||||
|
||||
_CSV_NAME = "260611_Sample_Seed_Portfolio_Durkan_split_addresses(Split Addresses).csv"
|
||||
_DEFAULT_IN = _REPO_ROOT / "scripts" / "lisasrequest" / _CSV_NAME
|
||||
_DEFAULT_OUT = _REPO_ROOT / "scripts" / "lisasrequest" / "durkan_domna_filled.csv"
|
||||
|
||||
|
||||
def read_rows(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Load a CSV file as ``(rows, fieldnames)``, keeping the header's column order.

    The file is decoded as UTF-8 with an optional byte-order mark. Each row is a
    plain dict keyed by column name; a file with no header yields ``([], [])``.
    """
    with path.open(newline="", encoding="utf-8-sig") as handle:
        parsed = csv.DictReader(handle)
        header = list(parsed.fieldnames) if parsed.fieldnames else []
        records = list(map(dict, parsed))
    return records, header
|
||||
|
||||
|
||||
def resolve_one(
    address: str,
    postcode_raw: str,
    *,
    epc_token: Optional[str],
    os_api_key: Optional[str],
    epc_cache: dict[str, pd.DataFrame],
    os_cache: dict[str, pd.DataFrame],
) -> Optional[Hit]:
    """Look up one row's UPRN, trying EPC first and OS Places second.

    Returns None when the address or the cleaned postcode is empty, or when
    neither source produces a hit. Each source is only tried when its
    credential is truthy. A failure in either lookup is printed and treated as
    "no hit" rather than raised.
    """
    postcode_clean = clean_postcode(postcode_raw)
    if not (address and postcode_clean):
        return None

    match: Optional[Hit] = None
    if epc_token:
        try:
            match = resolve_epc_relaxed(address, postcode_clean, epc_cache)
        except Exception as exc:
            print(f" EPC failed {address!r} / {postcode_clean}: {exc}")

    # OS Places is only a fallback: skip it when EPC already matched or no key is set.
    if match is not None or not os_api_key:
        return match
    try:
        return resolve_os_relaxed(address, postcode_clean, os_api_key, os_cache)
    except Exception as exc:
        print(f" OS failed {address!r} / {postcode_clean}: {exc}")
        return None
|
||||
|
||||
|
||||
def fill(
    rows: list[dict[str, str]],
    *,
    epc_token: Optional[str],
    os_api_key: Optional[str],
) -> tuple[int, int, int]:
    """Write the four DOMNA result columns onto every row, mutating ``rows``.

    Each row is resolved with ``resolve_one``; both lookup caches are shared
    across the whole run. A progress line is printed per row.

    Returns ``(epc_hits, os_hits, not_found)``. Any hit whose source is not
    ``"epc"`` is counted as an OS hit.
    """
    epc_cache: dict[str, pd.DataFrame] = {}
    os_cache: dict[str, pd.DataFrame] = {}
    counts = {"epc": 0, "os": 0, "none": 0}
    row_count = len(rows)

    for position, row in enumerate(rows, start=1):
        address = str(row.get(ADDRESS_COL, "") or "").strip()
        postcode_raw = str(row.get(POSTCODE_COL, "") or "").strip()
        hit = resolve_one(
            address,
            postcode_raw,
            epc_token=epc_token,
            os_api_key=os_api_key,
            epc_cache=epc_cache,
            os_cache=os_cache,
        )
        if hit is not None:
            uprn, matched, score, source = hit
            row[FOUND_ADDRESS_COL] = matched
            row[FOUND_UPRN_COL] = uprn
            row[LEXISCORE_COL] = str(round(score, 4))
            row[SOURCE_COL] = source
            counts["epc" if source == "epc" else "os"] += 1
        else:
            for column in (FOUND_ADDRESS_COL, FOUND_UPRN_COL, LEXISCORE_COL):
                row[column] = ""
            row[SOURCE_COL] = NOT_FOUND
            counts["none"] += 1
        print(
            f"[{position}/{row_count}] {address!r} -> "
            f"{row[FOUND_UPRN_COL] or '(no match)'} ({row[SOURCE_COL]})"
        )

    return counts["epc"], counts["os"], counts["none"]
|
||||
|
||||
|
||||
def write_rows(rows: list[dict[str, str]], path: Path, fieldnames: list[str]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV.

    The header is ``fieldnames`` in the given order, followed by any DOMNA
    result column not already present. Row keys outside that header are
    silently dropped.
    """
    missing = [column for column in _RESULT_COLS if column not in fieldnames]
    header = [*fieldnames, *missing]
    with path.open("w", newline="", encoding="utf-8") as handle:
        sink = csv.DictWriter(handle, fieldnames=header, extrasaction="ignore")
        sink.writeheader()
        sink.writerows(rows)
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
    """Parse the command line: ``--in``, ``--out`` and the optional ``--limit``."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    cli.add_argument("--out", type=Path, default=_DEFAULT_OUT)
    cli.add_argument("--limit", type=int, default=None, help="process first N rows")
    namespace = cli.parse_args()
    return namespace
|
||||
|
||||
|
||||
def main() -> int:
    """Run step 1 end to end: load keys, read the CSV, fill it, write it out.

    A missing key only prints a warning; the run continues with that source
    disabled. Always returns 0.
    """
    args = _parse_args()
    epc_token, os_api_key = load_keys()
    warnings = (
        (epc_token, "OPEN_EPC_API_TOKEN not set (backend/.env) — EPC resolution disabled"),
        (os_api_key, "ORDNANCE_SURVEY_API_KEY not set (backend/.env) — OS fallback disabled"),
    )
    for key, message in warnings:
        if not key:
            print(message)

    rows, fieldnames = read_rows(args.inp)
    if args.limit is not None:
        rows = rows[: args.limit]
    print(f"Loaded {len(rows)} rows from {args.inp}")

    epc_hits, os_hits, not_found = fill(
        rows, epc_token=epc_token, os_api_key=os_api_key
    )
    write_rows(rows, args.out, fieldnames)

    print(
        f"\nResolved {epc_hits + os_hits}/{len(rows)} "
        f"(epc={epc_hits}, ordnance_survey={os_hits}); {not_found} not found."
    )
    print(f"Wrote filled CSV -> {args.out}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
111
scripts/lisasrequest/finalise_to_property_table.py
Normal file
111
scripts/lisasrequest/finalise_to_property_table.py
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
"""Step 3 (Durkan portfolio): insert the reshaped rows into the ``property`` table.
|
||||
|
||||
Reads durkan_finaliser_input.csv (step 2) and, per row, maps it with the real
|
||||
finaliser mapper (``BulkUploadFinaliserOrchestrator._row_to_insert``) and inserts
|
||||
via the same ``PropertyPostgresRepository.insert_all`` the Lambda uses — so a row
|
||||
written here is identical to one the production finaliser would write. Insert is
|
||||
ON CONFLICT (portfolio_id, uprn) DO NOTHING, so re-running is safe.
|
||||
|
||||
DRY RUN BY DEFAULT — it dedupes, reports, and writes the collisions file but does
|
||||
NOT touch the database. Add --commit to actually insert.
|
||||
|
||||
# preview only (no DB writes): dedupe + mapping report
|
||||
python -m scripts.lisasrequest.finalise_to_property_table --portfolio 805
|
||||
|
||||
# actually insert
|
||||
python -m scripts.lisasrequest.finalise_to_property_table --portfolio 805 --commit
|
||||
|
||||
Postgres target comes from the root .env (POSTGRES_*). Run from the worktree root.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
|
||||
|
||||
from scripts.finalise_to_property_table import ( # noqa: E402
|
||||
dedupe_by_uprn,
|
||||
insert_rows,
|
||||
)
|
||||
|
||||
# Default CSV locations, both under scripts/lisasrequest/.
_DEFAULT_IN = _REPO_ROOT.joinpath("scripts", "lisasrequest", "durkan_finaliser_input.csv")
_DEFAULT_COLLISIONS = _REPO_ROOT.joinpath(
    "scripts", "lisasrequest", "durkan_finaliser_collisions.csv"
)

# Columns of the finaliser input CSV that the preview prints.
UPRN_COL = "address2uprn_uprn"
MATCHED_ADDRESS_COL = "address2uprn_address"
POSTCODE_COL = "postcode"
LEXISCORE_COL = "address2uprn_lexiscore"
|
||||
|
||||
|
||||
def read_rows(path: Path) -> list[dict[str, str]]:
    """Return every data row of the CSV at ``path`` as a plain dict.

    The file is decoded as ``utf-8-sig`` so a leading byte-order mark is dropped.
    """
    with path.open(newline="", encoding="utf-8-sig") as handle:
        records = list(map(dict, csv.DictReader(handle)))
    return records
|
||||
|
||||
|
||||
def _preview(rows: list[dict[str, str]]) -> None:
    """Print up to three sample rows: UPRN, matched address, postcode, lexiscore.

    Purely informational: it reads the raw CSV fields only and calls neither
    the database nor the finaliser mapper.
    """
    print("\nSample rows to insert (uprn | matched address | postcode | lexiscore):")
    for sample in rows[:3]:
        uprn = sample.get(UPRN_COL)
        matched = sample.get(MATCHED_ADDRESS_COL)
        postcode = sample.get(POSTCODE_COL)
        lexiscore = sample.get(LEXISCORE_COL)
        print(f" {uprn} | {matched!r} | {postcode!r} | {lexiscore}")
|
||||
|
||||
|
||||
def main() -> int:
    """Dedupe the finaliser input, preview it, and insert only with ``--commit``.

    Rows dropped by ``dedupe_by_uprn`` are written to the collisions CSV in
    both modes. Without ``--commit`` nothing is inserted. Always returns 0.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    cli.add_argument("--portfolio", type=int, required=True)
    cli.add_argument(
        "--commit",
        action="store_true",
        help="actually insert into property (default is a dry-run preview)",
    )
    cli.add_argument("--collisions", type=Path, default=_DEFAULT_COLLISIONS)
    args = cli.parse_args()

    rows = read_rows(args.inp)
    print(f"Loaded {len(rows)} finaliser rows from {args.inp}")

    kept, dropped = dedupe_by_uprn(rows)
    if not dropped:
        print(f"No duplicate-UPRN collisions; {len(kept)} unique rows to insert.")
    else:
        # The header is taken from the first dropped row.
        with args.collisions.open("w", newline="", encoding="utf-8") as handle:
            collisions = csv.DictWriter(handle, fieldnames=list(dropped[0].keys()))
            collisions.writeheader()
            collisions.writerows(dropped)
        print(
            f"{len(dropped)} duplicate-UPRN rows dropped -> {args.collisions} "
            f"({len(kept)} unique to insert)"
        )

    _preview(kept)

    if args.commit:
        inserted = insert_rows(kept, args.portfolio)
        print(
            f"\nInserted {inserted} new properties into portfolio {args.portfolio} "
            f"({len(kept) - inserted} already existed; ON CONFLICT DO NOTHING)."
        )
    else:
        print(
            f"\nDRY RUN — nothing written. {len(kept)} rows would be inserted into "
            f"portfolio {args.portfolio}. Re-run with --commit to write."
        )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
212
scripts/lisasrequest/resolve_uprns_for_finaliser.py
Normal file
212
scripts/lisasrequest/resolve_uprns_for_finaliser.py
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
"""Step 2 (Durkan portfolio): split step-1 matches, reshape the confident ones.
|
||||
|
||||
Reads durkan_domna_filled.csv (step 1) and SPLITS it in two — no re-resolution,
|
||||
just column work:
|
||||
|
||||
* Rows we cannot confidently insert are held back to a client-clarification CSV
|
||||
(durkan_client_clarification.csv) for Khalim to take to the client. Reasons:
|
||||
not_found_no_match no UPRN was resolved.
|
||||
no_flat_level_uprn a block of flats all collapsed onto one building
|
||||
UPRN — OS/EPC carry no flat-level records, so we
|
||||
can't tell the flats apart.
|
||||
unit_number_mismatch the matched house number differs from the input
|
||||
(e.g. "9 ..." matched "9A ..."), so the property is
|
||||
ambiguous.
|
||||
* Every remaining row is reshaped into the columns the finaliser reads
|
||||
(bulk_upload_finaliser_orchestrator), written to durkan_finaliser_input.csv
|
||||
ready for step 3:
|
||||
Address 1/2/3 | postcode | Internal Reference | address2uprn_uprn
|
||||
| address2uprn_address | address2uprn_lexiscore
|
||||
Internal Reference is left blank (landlord_property_id null, by decision).
|
||||
|
||||
python -m scripts.lisasrequest.resolve_uprns_for_finaliser
|
||||
|
||||
This stage hits no APIs. The held rows are not lost — once the client confirms
|
||||
them they can be appended to the finaliser input by hand.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
|
||||
|
||||
from scripts.lisasrequest.fill_domna_address import ( # noqa: E402
|
||||
ADDRESS_COL,
|
||||
FOUND_ADDRESS_COL,
|
||||
FOUND_UPRN_COL,
|
||||
LEXISCORE_COL,
|
||||
POSTCODE_COL,
|
||||
SOURCE_COL,
|
||||
)
|
||||
from scripts.lisasrequest.review_flags import address_numbers, input_unit # noqa: E402
|
||||
|
||||
# Column names of the finaliser input CSV. They mirror the names used by
# bulk_upload_finaliser_orchestrator (ADDRESS_COLS / POSTCODE_COL /
# INTERNAL_REF_COL / UPRN_COL / MATCHED_ADDRESS_COL / LEXISCORE_COL) and are
# hard-coded here so this step stays a light, stdlib-only reshape; step 3
# imports the real orchestrator and will fail loudly if these ever drift.
FIN_ADDRESS_1 = "Address 1"
FIN_ADDRESS_2 = "Address 2"
FIN_ADDRESS_3 = "Address 3"
FIN_POSTCODE = "postcode"
FIN_INTERNAL_REF = "Internal Reference"
FIN_UPRN = "address2uprn_uprn"
FIN_MATCHED_ADDRESS = "address2uprn_address"
FIN_LEXISCORE = "address2uprn_lexiscore"
_FINALISER_COLS = [
    FIN_ADDRESS_1, FIN_ADDRESS_2, FIN_ADDRESS_3,
    FIN_POSTCODE,
    FIN_INTERNAL_REF,
    FIN_UPRN, FIN_MATCHED_ADDRESS, FIN_LEXISCORE,
]

# Columns of the client-clarification report (kept human-readable).
CONTEXT_COLS = ["address", "postcode", "No.", "Address Block"]
DOMNA_COLS = [FOUND_ADDRESS_COL, FOUND_UPRN_COL, LEXISCORE_COL, SOURCE_COL]
REASON_COL = "clarification_reason"
ACTION_COL = "action_needed"
_CLARIFY_COLS = [*CONTEXT_COLS, *DOMNA_COLS, REASON_COL, ACTION_COL]

# Sort rank of each reason in the report (lower sorts first); also the order
# in which the per-reason counts are printed.
_REASON_ORDER = {
    reason: rank
    for rank, reason in enumerate(
        ("not_found_no_match", "no_flat_level_uprn", "unit_number_mismatch")
    )
}

# Text written to ACTION_COL for each reason.
_REASON_ACTION = {
    "not_found_no_match": (
        "No UPRN found for this address — please confirm the "
        "exact address or provide the UPRN."
    ),
    "no_flat_level_uprn": (
        "Address registers hold only the building, not the "
        "individual flats — please provide a UPRN per flat, or confirm a "
        "building-level record is acceptable."
    ),
    "unit_number_mismatch": (
        "Closest match has a different unit number (see "
        "domna_address_found) — please confirm the correct property / UPRN."
    ),
}

# Default CSV locations, all under scripts/lisasrequest/.
_DEFAULT_IN = _REPO_ROOT.joinpath("scripts", "lisasrequest", "durkan_domna_filled.csv")
_DEFAULT_FINALISER = _REPO_ROOT.joinpath(
    "scripts", "lisasrequest", "durkan_finaliser_input.csv"
)
_DEFAULT_CLARIFY = _REPO_ROOT.joinpath(
    "scripts", "lisasrequest", "durkan_client_clarification.csv"
)
|
||||
|
||||
|
||||
def read_rows(path: Path) -> list[dict[str, str]]:
    """Read the CSV at ``path`` (BOM-tolerant UTF-8) into a list of plain dicts."""
    loaded: list[dict[str, str]] = []
    with path.open(newline="", encoding="utf-8-sig") as handle:
        for record in csv.DictReader(handle):
            loaded.append(dict(record))
    return loaded
|
||||
|
||||
|
||||
def clarification_reason(
    row: dict[str, str], uprn_counts: Counter[str]
) -> Optional[str]:
    """Return the reason a row must be held back, or ``None`` if it can go on.

    * ``"not_found_no_match"`` - source is ``"not_found"`` or the UPRN is empty.
    * ``"no_flat_level_uprn"`` - the UPRN is shared by more than one row
      (whether or not the unit number matched).
    * ``"unit_number_mismatch"`` - the UPRN is unique but the input's unit
      number is absent from the matched address.
    """
    uprn = row.get(FOUND_UPRN_COL, "")
    if not uprn or row.get(SOURCE_COL) == "not_found":
        return "not_found_no_match"

    unit = input_unit(row.get(ADDRESS_COL, ""))
    # The matched address is only inspected when the input carries a unit.
    unit_missing = bool(unit) and unit not in address_numbers(
        row.get(FOUND_ADDRESS_COL, "")
    )
    shared_uprn = uprn_counts[uprn] > 1

    if shared_uprn:
        # A shared UPRN collides at finalise even when the unit number matches.
        return "no_flat_level_uprn"
    return "unit_number_mismatch" if unit_missing else None
|
||||
|
||||
|
||||
def to_finaliser_row(row: dict[str, str]) -> dict[str, str]:
    """Map a step-1 row onto the finaliser's input columns.

    The whole input address goes into "Address 1"; "Address 2", "Address 3"
    and "Internal Reference" are always blank.
    """
    reshaped = {FIN_ADDRESS_1: row.get(ADDRESS_COL, "")}
    reshaped[FIN_ADDRESS_2] = reshaped[FIN_ADDRESS_3] = ""
    reshaped[FIN_POSTCODE] = row.get(POSTCODE_COL, "")
    reshaped[FIN_INTERNAL_REF] = ""  # landlord_property_id null, by decision
    reshaped[FIN_UPRN] = row.get(FOUND_UPRN_COL, "")
    reshaped[FIN_MATCHED_ADDRESS] = row.get(FOUND_ADDRESS_COL, "")
    reshaped[FIN_LEXISCORE] = row.get(LEXISCORE_COL, "")
    return reshaped
|
||||
|
||||
|
||||
def to_clarify_row(row: dict[str, str], reason: str) -> dict[str, str]:
    """Build one client-clarification report row.

    Copies the context and DOMNA columns (blank when absent), then adds the
    reason code and its action text. Raises ``KeyError`` for a reason that
    has no entry in ``_REASON_ACTION``.
    """
    report: dict[str, str] = {}
    for column in (*CONTEXT_COLS, *DOMNA_COLS):
        report[column] = row.get(column, "")
    report.update({REASON_COL: reason, ACTION_COL: _REASON_ACTION[reason]})
    return report
|
||||
|
||||
|
||||
def split(
    rows: list[dict[str, str]],
    *,
    accept_unit_mismatch: bool = False,
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Partition step-1 rows into ``(finaliser_rows, clarification_rows)``.

    A row goes to the finaliser when ``clarification_reason`` returns ``None``
    or, with ``accept_unit_mismatch`` set, ``"unit_number_mismatch"`` (a
    near-miss such as 9 -> 9A that the client has already confirmed). Every
    other row is held back; held rows are sorted by ``_REASON_ORDER``.
    """
    uprn_counts: Counter[str] = Counter(
        uprn for uprn in (r.get(FOUND_UPRN_COL, "") for r in rows) if uprn
    )
    # Reasons that do not hold a row back.
    passable: set[Optional[str]] = {None}
    if accept_unit_mismatch:
        passable.add("unit_number_mismatch")

    finaliser: list[dict[str, str]] = []
    clarify: list[dict[str, str]] = []
    for row in rows:
        reason = clarification_reason(row, uprn_counts)
        if reason in passable:
            finaliser.append(to_finaliser_row(row))
        else:
            clarify.append(to_clarify_row(row, reason))

    clarify.sort(key=lambda held: _REASON_ORDER.get(held[REASON_COL], 9))
    return finaliser, clarify
|
||||
|
||||
|
||||
def write_csv(rows: list[dict[str, str]], path: Path, fieldnames: list[str]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV with exactly ``fieldnames`` as header.

    Row keys not listed in ``fieldnames`` are ignored; missing keys are
    written as empty cells.
    """
    with path.open("w", newline="", encoding="utf-8") as handle:
        sink = csv.DictWriter(handle, fieldnames=fieldnames, extrasaction="ignore")
        sink.writeheader()
        for record in rows:
            sink.writerow(record)
|
||||
|
||||
|
||||
def main() -> int:
    """Split the step-1 CSV into finaliser input and client-clarification CSVs.

    Both output files are always written, then a per-reason summary is
    printed. Always returns 0.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    cli.add_argument("--finaliser-out", type=Path, default=_DEFAULT_FINALISER)
    cli.add_argument("--clarify-out", type=Path, default=_DEFAULT_CLARIFY)
    cli.add_argument(
        "--accept-unit-mismatch",
        action="store_true",
        help="reshape unit_number_mismatch rows (e.g. 9->9A) into the finaliser "
        "input instead of holding them for the client",
    )
    args = cli.parse_args()

    rows = read_rows(args.inp)
    finaliser, clarify = split(rows, accept_unit_mismatch=args.accept_unit_mismatch)

    for records, target, columns in (
        (finaliser, args.finaliser_out, _FINALISER_COLS),
        (clarify, args.clarify_out, _CLARIFY_COLS),
    ):
        write_csv(records, target, columns)

    per_reason = Counter(held[REASON_COL] for held in clarify)
    print(f"Read {len(rows)} step-1 rows.")
    print(f" -> {len(finaliser)} confident rows reshaped -> {args.finaliser_out}")
    print(f" -> {len(clarify)} held for client -> {args.clarify_out}")
    for reason in _REASON_ORDER:
        print(f" {reason}: {per_reason.get(reason, 0)}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
135
scripts/lisasrequest/review_flags.py
Normal file
135
scripts/lisasrequest/review_flags.py
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
"""Flag step-1 matches that need a human eye, for review before finalising.
|
||||
|
||||
Reads durkan_domna_filled.csv (the step-1 output) and writes a review CSV of
|
||||
only the rows carrying at least one flag, ordered by each row's most serious flag:
|
||||
|
||||
not_found no UPRN resolved at all.
|
||||
unit_not_in_match the input flat/house number does NOT appear in the matched
|
||||
address — the high-precision "wrong property" signal. Two
|
||||
shapes: a near-miss ("9 VANBRUGH" matched "9A, VANBRUGH")
|
||||
or a flat collapsing onto its building ("FLAT 1, 20 WARWICK"
|
||||
matched "20, WARWICK ROAD").
|
||||
dup_uprn the same UPRN was resolved for >1 input row — typically a
|
||||
block of flats all collapsing onto the building UPRN; all
|
||||
but one will be dropped at finalise.
|
||||
low_score lexiscore < 0.70 (a weak match, just over the OS bar). NOTE:
|
||||
on its own this is noisy — truncated EPC addresses and extra
|
||||
locality tokens push correct matches below 0.70. Treat it as
|
||||
informational unless paired with one of the flags above.
|
||||
|
||||
python -m scripts.lisasrequest.review_flags
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
|
||||
# Columns of the step-1 output CSV that the review reads.
ADDRESS_COL = "address"
POSTCODE_COL = "postcode"
FOUND_ADDRESS_COL = "domna_address_found"
FOUND_UPRN_COL = "domna_address_uprn"
LEXISCORE_COL = "domna_lexiscore"
SOURCE_COL = "domna_source"

# A lexiscore strictly below this earns the low_score flag.
LOW_SCORE = 0.70

# Default CSV locations, both under scripts/lisasrequest/.
_DEFAULT_IN = _REPO_ROOT.joinpath("scripts", "lisasrequest", "durkan_domna_filled.csv")
_DEFAULT_OUT = _REPO_ROOT.joinpath("scripts", "lisasrequest", "durkan_review_flags.csv")

# Header of the review CSV; the trailing "flags" column is computed here,
# the rest are copied from the input row.
_REVIEW_COLS = [
    ADDRESS_COL, POSTCODE_COL,
    FOUND_ADDRESS_COL, FOUND_UPRN_COL, LEXISCORE_COL, SOURCE_COL,
    "flags",
]
|
||||
|
||||
|
||||
def input_unit(address: str) -> str:
    """Return the upper-cased unit number that identifies an input address.

    The number following the word FLAT wins when present; otherwise the number
    at the very start of the address is used. Returns "" when neither exists.
    A single trailing letter is kept (e.g. "9A").
    """
    text = address.upper()
    attempts = (
        (re.search, r"\bFLAT\s+(\d+[A-Z]?)"),
        (re.match, r"\s*(\d+[A-Z]?)\b"),
    )
    for finder, pattern in attempts:
        found = finder(pattern, text)
        if found:
            return found.group(1)
    return ""
|
||||
|
||||
|
||||
def address_numbers(address: str) -> set[str]:
    """Collect every standalone number token of an address, upper-cased.

    A token is a run of digits with an optional single trailing letter,
    bounded by word boundaries - e.g. {"3", "20"} or {"9A"}.
    """
    tokens = re.finditer(r"\b\d+[A-Z]?\b", address.upper())
    return {token.group(0) for token in tokens}
|
||||
|
||||
|
||||
def _score(value: str) -> float:
    """Parse a lexiscore cell into a float, treating bad input as 0.0.

    Returns 0.0 when ``value`` is ``None``, empty, or not a number, and also
    when it parses to NaN or +/-infinity. The non-finite case matters because
    ``float("nan") < LOW_SCORE`` is False, so a NaN score would otherwise slip
    past the low_score check instead of being flagged.
    """
    import math  # local import: the module header does not import math

    try:
        parsed = float(value)
    except (TypeError, ValueError):
        return 0.0
    return parsed if math.isfinite(parsed) else 0.0
|
||||
|
||||
|
||||
def flag_rows(rows: list[dict[str, str]]) -> list[dict[str, str]]:
    """Return the rows that carry at least one review flag.

    Each returned dict holds the review columns plus ``flags``, a ';'-joined
    list. A row without a UPRN (or with source "not_found") gets only
    ``not_found``; otherwise ``unit_not_in_match``, ``dup_uprn`` and
    ``low_score`` are tested independently, in that order. The result is
    sorted by the first flag of each row.
    """
    uprn_counts = Counter(
        uprn for uprn in (r.get(FOUND_UPRN_COL, "") for r in rows) if uprn
    )

    def flags_for(row: dict[str, str]) -> list[str]:
        uprn = row.get(FOUND_UPRN_COL, "")
        if row.get(SOURCE_COL, "") == "not_found" or not uprn:
            return ["not_found"]
        found: list[str] = []
        unit = input_unit(row.get(ADDRESS_COL, ""))
        if unit and unit not in address_numbers(row.get(FOUND_ADDRESS_COL, "")):
            found.append("unit_not_in_match")
        if uprn_counts[uprn] > 1:
            found.append("dup_uprn")
        if _score(row.get(LEXISCORE_COL, "")) < LOW_SCORE:
            found.append("low_score")
        return found

    copied_cols = _REVIEW_COLS[:-1]  # everything except the computed "flags"
    flagged: list[dict[str, str]] = []
    for row in rows:
        flags = flags_for(row)
        if not flags:
            continue
        entry = {column: row.get(column, "") for column in copied_cols}
        entry["flags"] = ";".join(flags)
        flagged.append(entry)

    # not_found first, then mismatches, then dup/low.
    order = {"not_found": 0, "unit_not_in_match": 1, "dup_uprn": 2, "low_score": 3}
    flagged.sort(key=lambda entry: order.get(entry["flags"].split(";")[0], 9))
    return flagged
|
||||
|
||||
|
||||
def main() -> int:
    """Read the step-1 CSV, write the flagged rows, and print per-flag counts.

    Always returns 0.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--in", dest="inp", type=Path, default=_DEFAULT_IN)
    cli.add_argument("--out", type=Path, default=_DEFAULT_OUT)
    args = cli.parse_args()

    with args.inp.open(newline="", encoding="utf-8-sig") as source:
        rows = [dict(record) for record in csv.DictReader(source)]

    flagged = flag_rows(rows)
    with args.out.open("w", newline="", encoding="utf-8") as target:
        sink = csv.DictWriter(target, fieldnames=_REVIEW_COLS, extrasaction="ignore")
        sink.writeheader()
        sink.writerows(flagged)

    # A row with several flags counts once under each of them.
    per_flag = Counter()
    for entry in flagged:
        per_flag.update(entry["flags"].split(";"))

    print(f"{len(flagged)}/{len(rows)} rows flagged for review -> {args.out}")
    for name in ("not_found", "unit_not_in_match", "dup_uprn", "low_score"):
        print(f" {name}: {per_flag.get(name, 0)}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
328
scripts/resolve_uprns_for_finaliser.py
Normal file
328
scripts/resolve_uprns_for_finaliser.py
Normal file
|
|
@ -0,0 +1,328 @@
|
|||
"""Resolve a CSV of addresses to UPRNs, ready to feed the bulk-upload finaliser.
|
||||
|
||||
Takes a CSV with `Address 1/2/3` + `postcode` columns and, per row, resolves a
|
||||
UPRN by trying — in order — the new EPC API (address2uprn), the historic EPC S3
|
||||
dataset, then the Ordnance Survey Places API as a fallback. Whichever source
|
||||
wins, the result is written into the SAME three columns the finaliser reads
|
||||
(`bulk_upload_finaliser_orchestrator`):
|
||||
|
||||
address2uprn_uprn UPRN integer (empty when unresolved)
|
||||
address2uprn_address the matched address
|
||||
address2uprn_lexiscore the match score in [0, 1]
|
||||
|
||||
A `resolution_source` diagnostic column (epc / epc_historic / ordnance_survey /
|
||||
none) is appended too — the finaliser ignores unknown columns. All original
|
||||
columns are preserved in their original order, so the output CSV drops straight
|
||||
into the finaliser.
|
||||
|
||||
python -m scripts.resolve_uprns_for_finaliser input.csv -o resolved.csv
|
||||
|
||||
# OS-only / EPC-only, custom postcode column, custom OS score threshold
|
||||
python -m scripts.resolve_uprns_for_finaliser in.csv -o out.csv --no-epc
|
||||
python -m scripts.resolve_uprns_for_finaliser in.csv -o out.csv --postcode-col Postcode --os-threshold 0.6
|
||||
|
||||
Keys are read from backend/.env: OPEN_EPC_API_TOKEN (EPC) and
|
||||
ORDNANCE_SURVEY_API_KEY (OS Places). Run from the worktree root (import trap).
|
||||
|
||||
The module-level functions (`load_keys`, `read_rows`, `resolve_row`, `process`,
|
||||
`write_rows`) are written to be driven line-by-line from a REPL as well as via
|
||||
the CLI.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
from dotenv import load_dotenv
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
|
||||
|
||||
from backend.address2UPRN.main import ( # noqa: E402
|
||||
get_epc_data_with_postcode,
|
||||
get_uprn_from_historic_epc,
|
||||
get_uprn_with_epc_df,
|
||||
)
|
||||
from backend.ordnanceSurvey.helpers import ( # noqa: E402
|
||||
lookup_os_places,
|
||||
os_places_results_to_dataframe,
|
||||
)
|
||||
from backend.utils.addressMatch import AddressMatch # noqa: E402
|
||||
|
||||
# Result columns. The first three are the ones the finaliser reads
# (bulk_upload_finaliser_orchestrator); SOURCE_COL is a diagnostic extra.
UPRN_COL = "address2uprn_uprn"
MATCHED_ADDRESS_COL = "address2uprn_address"
LEXISCORE_COL = "address2uprn_lexiscore"
SOURCE_COL = "resolution_source"
_RESULT_COLS = (
    UPRN_COL,
    MATCHED_ADDRESS_COL,
    LEXISCORE_COL,
    SOURCE_COL,
)

# Shape of a resolved hit: (uprn, matched_address, lexiscore, source).
Resolution = tuple[str, str, float, str]
|
||||
|
||||
|
||||
def load_keys() -> tuple[Optional[str], Optional[str]]:
    """Return ``(epc_token, os_api_key)`` after loading backend/.env.

    Each value is read from the process environment and is ``None`` when the
    variable is unset.
    """
    load_dotenv(_REPO_ROOT / "backend" / ".env")
    epc_token, os_api_key = (
        os.environ.get(name)
        for name in ("OPEN_EPC_API_TOKEN", "ORDNANCE_SURVEY_API_KEY")
    )
    return epc_token, os_api_key
|
||||
|
||||
|
||||
def read_rows(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Load a CSV into ``(rows, fieldnames)``, keeping the header order.

    Decoded as ``utf-8-sig`` so a leading byte-order mark is dropped;
    ``fieldnames`` is an empty list when the file has no header line.
    """
    with path.open(newline="", encoding="utf-8-sig") as handle:
        parsed = csv.DictReader(handle)
        header = [*(parsed.fieldnames or ())]
        records = list(map(dict, parsed))
    return records, header
|
||||
|
||||
|
||||
def clean_postcode(postcode: str) -> str:
    """Normalise a postcode to the no-space upper-case form (e.g. ``E84SQ``).

    Every whitespace character is removed, not only ASCII spaces: tabs,
    newlines and non-breaking spaces (common in spreadsheet exports) would
    otherwise survive inside the postcode and defeat both the lookup and the
    per-postcode caches keyed on this value. Returns "" for blank input.
    """
    # str.split() with no argument splits on any Unicode whitespace run.
    return "".join(postcode.split()).upper()
|
||||
|
||||
|
||||
def build_address(row: dict[str, str]) -> str:
    """Join the stripped "Address 1/2/3" cells with single spaces.

    Missing or empty cells contribute an empty part, so a blank middle cell
    leaves two consecutive spaces; only the ends of the result are trimmed.
    """
    parts: list[str] = []
    for column in ("Address 1", "Address 2", "Address 3"):
        cell = row.get(column, "") or ""
        parts.append(str(cell).strip())
    return " ".join(parts).strip()
|
||||
|
||||
|
||||
def resolve_epc(
    address: str, postcode_clean: str, epc_cache: dict[str, pd.DataFrame]
) -> Optional[Resolution]:
    """Look the address up in the new EPC API, then in the historic EPC data.

    ``epc_cache`` maps a cleaned postcode to its EPC frame and is mutated, so
    sharing one dict across rows avoids refetching a postcode. The historic
    lookup is tried only when the new API gives no tuple result. Returns
    ``None`` when both fail.
    """

    def as_resolution(triple: tuple, source: str) -> Resolution:
        uprn, matched, score = triple
        return str(uprn), str(matched), float(score), source

    frame = epc_cache.get(postcode_clean)
    if frame is None:
        frame = epc_cache[postcode_clean] = get_epc_data_with_postcode(
            postcode=postcode_clean
        )

    current = get_uprn_with_epc_df(
        user_inputed_address=address, epc_df=frame, verbose=True
    )
    if isinstance(current, tuple):
        return as_resolution(current, "epc")

    historic = get_uprn_from_historic_epc(
        user_inputed_address=address, postcode=postcode_clean
    )
    if historic is None:
        return None
    return as_resolution(historic, "epc_historic")
|
||||
|
||||
|
||||
def resolve_os(
    address: str,
    postcode_clean: str,
    os_api_key: str,
    os_cache: dict[str, pd.DataFrame],
    threshold: float,
) -> Optional[Resolution]:
    """Pick the best-scoring OS Places address for a postcode, if good enough.

    ``os_cache`` maps a cleaned postcode to its OS Places frame and is
    mutated; a response without status 200 or without "data" is cached as an
    empty frame. A candidate must score at least ``threshold``; on equal
    scores the earliest record wins. Returns ``None`` when nothing qualifies.
    """
    places_df = os_cache.get(postcode_clean)
    if places_df is None:
        response = lookup_os_places(postcode_clean, os_api_key)
        usable = response.get("status") == 200 and "data" in response
        if usable:
            places_df = os_places_results_to_dataframe(response["data"])
        else:
            places_df = pd.DataFrame()
        os_cache[postcode_clean] = places_df

    if places_df.empty or "ADDRESS" not in places_df.columns:
        return None

    # Plain records sidestep pandas' partially-unknown indexing types.
    winner: Optional[Resolution] = None
    for record in places_df.to_dict(orient="records"):
        candidate = str(record.get("ADDRESS", ""))
        score = AddressMatch.score(address, candidate)
        if not score >= threshold:
            continue
        if winner is not None and not score > winner[2]:
            continue
        winner = (str(record.get("UPRN", "")), candidate, score, "ordnance_survey")
    return winner
|
||||
|
||||
|
||||
def resolve_row(
    row: dict[str, str],
    *,
    epc_token: Optional[str],
    os_api_key: Optional[str],
    epc_cache: dict[str, pd.DataFrame],
    os_cache: dict[str, pd.DataFrame],
    postcode_col: str,
    use_epc: bool,
    use_os: bool,
    os_threshold: float,
    validate_postcode: bool,
) -> dict[str, str]:
    """Fill the result columns of ``row`` in place and return the same dict.

    EPC (new API, then historic) is tried first and OS Places second; each is
    skipped when disabled or when its key is missing. A lookup that raises is
    printed and treated as a miss. With no match - or an empty address, empty
    postcode, or a postcode failing validation - the three finaliser columns
    are blanked and ``resolution_source`` is set to "none".
    """
    address = build_address(row)
    postcode_clean = clean_postcode(str(row.get(postcode_col, "") or ""))

    def write(res: Optional[Resolution]) -> dict[str, str]:
        if res is not None:
            uprn, matched, score, source = res
            filled = (uprn, matched, str(score), source)
        else:
            filled = ("", "", "", "none")
        columns = (UPRN_COL, MATCHED_ADDRESS_COL, LEXISCORE_COL, SOURCE_COL)
        for column, value in zip(columns, filled):
            row[column] = value
        return row

    unusable = not address or not postcode_clean
    if unusable or (
        validate_postcode and not AddressMatch.is_valid_postcode(postcode_clean)
    ):
        return write(None)

    if use_epc and epc_token:
        try:
            epc_hit = resolve_epc(address, postcode_clean, epc_cache)
            if epc_hit is not None:
                return write(epc_hit)
        except Exception as err:  # keep going on a per-row API/lookup failure
            print(f" EPC lookup failed for {address!r} / {postcode_clean}: {err}")

    if use_os and os_api_key:
        try:
            os_hit = resolve_os(
                address, postcode_clean, os_api_key, os_cache, os_threshold
            )
            if os_hit is not None:
                return write(os_hit)
        except Exception as err:
            print(f" OS lookup failed for {address!r} / {postcode_clean}: {err}")

    return write(None)
|
||||
|
||||
|
||||
def process(
    rows: list[dict[str, str]],
    *,
    epc_token: Optional[str],
    os_api_key: Optional[str],
    postcode_col: str = "postcode",
    use_epc: bool = True,
    use_os: bool = True,
    os_threshold: float = 0.5,
    validate_postcode: bool = True,
) -> list[dict[str, str]]:
    """Resolve every row in place and return the same list.

    Both lookup caches live for the whole call. One progress line is printed
    per row so REPL and CLI runs show where they are.
    """
    options = dict(
        epc_token=epc_token,
        os_api_key=os_api_key,
        epc_cache={},
        os_cache={},
        postcode_col=postcode_col,
        use_epc=use_epc,
        use_os=use_os,
        os_threshold=os_threshold,
        validate_postcode=validate_postcode,
    )
    for position, row in enumerate(rows, start=1):
        resolve_row(row, **options)
        print(
            f"[{position}/{len(rows)}] {build_address(row)!r} -> "
            f"{row[UPRN_COL] or '(no match)'} ({row[SOURCE_COL]})"
        )
    return rows
|
||||
|
||||
|
||||
def write_rows(rows: list[dict[str, str]], path: Path, fieldnames: list[str]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV.

    The header is the input ``fieldnames`` in their original order, followed by
    any result column from ``_RESULT_COLS`` that is not already present. Keys in
    a row that are not part of the header are dropped (``extrasaction="ignore"``).
    """
    header = [*fieldnames]
    # dict.fromkeys keeps first-seen order and collapses repeats, so each
    # missing result column is appended exactly once.
    header += [name for name in dict.fromkeys(_RESULT_COLS) if name not in fieldnames]

    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=header, extrasaction="ignore")
        out.writeheader()
        out.writerows(rows)
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    Positional ``input`` and ``-o/--out`` are paths; the ``--no-*`` switches are
    plain boolean flags; ``--os-threshold`` is a float and ``--limit`` an int.
    The parser description is taken from the module docstring.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    add = cli.add_argument

    add("input", type=Path, help="input CSV (Address 1/2/3 + postcode)")
    add("-o", "--out", type=Path, required=True, help="output CSV for the finaliser")
    add("--postcode-col", default="postcode", help="postcode column name")
    add("--no-epc", action="store_true", help="skip EPC resolution")
    add("--no-os", action="store_true", help="skip Ordnance Survey fallback")
    add(
        "--os-threshold",
        type=float,
        default=0.5,
        help="min OS match score (default 0.5)",
    )
    add(
        "--no-validate-postcode",
        action="store_true",
        help="skip the postcodes.io validity check (one HTTP call per postcode)",
    )
    add("--limit", type=int, default=None, help="process only the first N rows")

    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Run the CLI: parse arguments, resolve rows, and write the output CSV.

    A resolver is switched off when its ``--no-*`` flag is given or when its
    API key is missing (a notice is printed in the latter case).

    Returns:
        0 after the output file has been written; 2 when there is nothing
        sensible to do (negative ``--limit``, or no resolver left enabled).
    """
    args = _parse_args()

    # A negative limit would make ``rows[: args.limit]`` drop the LAST N rows
    # rather than keep the first N, silently processing the wrong subset.
    if args.limit is not None and args.limit < 0:
        print(f"--limit must be >= 0 (got {args.limit}). Nothing to do.")
        return 2

    epc_token, os_api_key = load_keys()

    use_epc = not args.no_epc
    use_os = not args.no_os
    if use_epc and not epc_token:
        print("OPEN_EPC_API_TOKEN not set (backend/.env) — EPC resolution disabled")
        use_epc = False
    if use_os and not os_api_key:
        print("ORDNANCE_SURVEY_API_KEY not set (backend/.env) — OS fallback disabled")
        use_os = False
    if not use_epc and not use_os:
        print("No resolver enabled (missing keys or both --no-* flags). Nothing to do.")
        return 2

    rows, fieldnames = read_rows(args.input)
    if args.limit is not None:
        rows = rows[: args.limit]
    print(f"Loaded {len(rows)} rows from {args.input}")

    # process() returns the same list; the result columns are read back from
    # ``rows`` below.
    process(
        rows,
        epc_token=epc_token,
        os_api_key=os_api_key,
        postcode_col=args.postcode_col,
        use_epc=use_epc,
        use_os=use_os,
        os_threshold=args.os_threshold,
        validate_postcode=not args.no_validate_postcode,
    )

    write_rows(rows, args.out, fieldnames)
    matched = sum(1 for r in rows if r.get(UPRN_COL))
    print(f"\nResolved {matched}/{len(rows)} rows. Wrote {args.out}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
"""The landlord-description-overrides handler's column wiring (`_build_columns`).
|
||||
|
||||
A `column_mapping` entry of ``{category -> source header}`` must produce a
|
||||
ClassifiableColumn that reads the named header and classifies into the
|
||||
category's enum. This pins the main_fuel category onto the wiring.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import cast
|
||||
|
||||
from applications.landlord_description_overrides.handler import _build_columns # pyright: ignore[reportPrivateUsage]
|
||||
from infrastructure.chatgpt.chatgpt import ChatGPT
|
||||
|
||||
|
||||
def test_build_columns_wires_a_main_fuel_classifier_column() -> None:
    """A ``main_fuel`` mapping entry yields one column reading its source header."""
    # Arrange — the factory only stores the injected collaborators, so stand-ins
    # replace the (I/O-bound) ChatGPT client and the DB session.
    stub_client = cast(ChatGPT, object())
    mapping = {"main_fuel": "Main Fuel"}

    # Act
    built = _build_columns(mapping, stub_client, None)

    # Assert — one column, named main_fuel, reading the "Main Fuel" header.
    assert len(built) == 1
    only_column = built[0]
    assert only_column.name == "main_fuel"
    assert only_column.source_column == "Main Fuel"
|
||||
|
||||
|
||||
def test_build_columns_wires_a_glazing_classifier_column() -> None:
    """A ``glazing`` mapping entry yields one column reading its source header."""
    # Arrange
    stub_client = cast(ChatGPT, object())
    mapping = {"glazing": "Glazing"}

    # Act
    built = _build_columns(mapping, stub_client, None)

    # Assert — one column, named glazing, reading the "Glazing" header.
    assert len(built) == 1
    only_column = built[0]
    assert only_column.name == "glazing"
    assert only_column.source_column == "Glazing"
|
||||
|
||||
|
||||
def test_build_columns_wires_a_construction_age_band_classifier_column() -> None:
    """A ``construction_age_band`` entry yields one column reading its header."""
    # Arrange
    stub_client = cast(ChatGPT, object())
    mapping = {"construction_age_band": "Age"}

    # Act
    built = _build_columns(mapping, stub_client, None)

    # Assert — one column, named construction_age_band, reading the "Age" header.
    assert len(built) == 1
    only_column = built[0]
    assert only_column.name == "construction_age_band"
    assert only_column.source_column == "Age"
|
||||
|
||||
|
||||
def test_build_columns_wires_a_water_heating_classifier_column() -> None:
    """A ``water_heating`` mapping entry yields one column reading its header."""
    # Arrange
    stub_client = cast(ChatGPT, object())
    mapping = {"water_heating": "Hot Water"}

    # Act
    built = _build_columns(mapping, stub_client, None)

    # Assert — one column, named water_heating, reading the "Hot Water" header.
    assert len(built) == 1
    only_column = built[0]
    assert only_column.name == "water_heating"
    assert only_column.source_column == "Hot Water"
|
||||
|
||||
|
||||
def test_build_columns_wires_a_main_heating_system_classifier_column() -> None:
    """A ``main_heating_system`` entry yields one column reading its header."""
    # Arrange
    stub_client = cast(ChatGPT, object())
    mapping = {"main_heating_system": "Heating"}

    # Act
    built = _build_columns(mapping, stub_client, None)

    # Assert — one column, named main_heating_system, reading the "Heating" header.
    assert len(built) == 1
    only_column = built[0]
    assert only_column.name == "main_heating_system"
    assert only_column.source_column == "Heating"
|
||||
109
tests/domain/epc/test_construction_age_band_overlay.py
Normal file
109
tests/domain/epc/test_construction_age_band_overlay.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
"""The Landlord-Override construction-age-band → fabric Simulation Overlay.
|
||||
|
||||
An age-band value resolves to the RdSAP letter code the calculator's U-value
|
||||
cascades read from `SapBuildingPart.construction_age_band`; the overlay targets
|
||||
the override's building part.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from datatypes.epc.domain.epc_property_data import (
|
||||
BuildingPartIdentifier,
|
||||
EpcPropertyData,
|
||||
SapBuildingPart,
|
||||
)
|
||||
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
|
||||
from domain.epc.property_overlays.construction_age_band_overlay import (
|
||||
age_band_overlay_for,
|
||||
)
|
||||
from domain.modelling.scoring.overlay_applicator import apply_simulations
|
||||
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
|
||||
build_epc,
|
||||
)
|
||||
|
||||
|
||||
def _part(
    epc: EpcPropertyData, identifier: BuildingPartIdentifier
) -> SapBuildingPart:
    """Return the first building part of ``epc`` whose identifier is ``identifier``.

    Matching is by identity (``is``). Raises ``StopIteration`` when no part
    matches.
    """
    for candidate in epc.sap_building_parts:
        if candidate.identifier is identifier:
            return candidate
    raise StopIteration
|
||||
|
||||
|
||||
def test_age_band_overlays_the_main_building_part() -> None:
    """Band "B" on building part 0 is carried by the MAIN part's overlay."""
    # Act — band B (1900-1929) on the main building part.
    result = age_band_overlay_for("B", 0)

    # Assert
    assert result is not None
    main_overlay = result.building_parts[BuildingPartIdentifier.MAIN]
    assert main_overlay.construction_age_band == "B"
|
||||
|
||||
|
||||
def test_age_band_overlay_targets_the_extension_building_part() -> None:
    """Building part 1 maps onto the EXTENSION_1 overlay entry."""
    # Act — building_part 1 is the first extension.
    result = age_band_overlay_for("L", 1)

    # Assert
    assert result is not None
    extension = BuildingPartIdentifier.EXTENSION_1
    assert extension in result.building_parts
    assert result.building_parts[extension].construction_age_band == "L"
|
||||
|
||||
|
||||
def test_lowercase_age_band_is_normalised_to_its_letter_code() -> None:
    """A lower-case band letter comes back upper-cased on the overlay."""
    # Act
    result = age_band_overlay_for("d", 0)

    # Assert — the calculator upper-cases the band; the overlay stores it upper.
    assert result is not None
    main_overlay = result.building_parts[BuildingPartIdentifier.MAIN]
    assert main_overlay.construction_age_band == "D"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("age_band_value", ["Z", "", "1900-1929", "Unknown"])
def test_unrecognised_age_band_produces_no_overlay(age_band_value: str) -> None:
    """Values that are not a recognised band letter yield no overlay at all."""
    # Act + Assert
    assert age_band_overlay_for(age_band_value, 0) is None
|
||||
|
||||
|
||||
def test_age_band_override_re_dates_the_main_part_only() -> None:
    """Applying a MAIN-part age-band overlay leaves the extension's band alone."""
    # Arrange — baseline main + extension are both band B; the landlord corrects
    # the main building's age band to F (1976-1982).
    starting_epc = build_epc()
    correction = age_band_overlay_for("F", 0)
    assert correction is not None

    # Act
    updated = apply_simulations(starting_epc, [correction])

    # Assert — the main part is re-dated (its U-value cascade now keys on F); the
    # extension is left untouched.
    main_part = _part(updated, BuildingPartIdentifier.MAIN)
    assert main_part.construction_age_band == "F"
    extension_part = _part(updated, BuildingPartIdentifier.EXTENSION_1)
    assert extension_part.construction_age_band == "B"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "member",
    [band for band in ConstructionAgeBand if band is not ConstructionAgeBand.UNKNOWN],
)
def test_every_resolvable_age_band_value_decodes_to_an_overlay(
    member: ConstructionAgeBand,
) -> None:
    """Every non-UNKNOWN ``ConstructionAgeBand`` value must produce an overlay.

    A classifier emits a ConstructionAgeBand value; if the overlay can't decode
    it the override silently no-ops.
    """
    # Act + Assert
    assert age_band_overlay_for(member.value, 0) is not None
|
||||
90
tests/domain/epc/test_glazing_overlay.py
Normal file
90
tests/domain/epc/test_glazing_overlay.py
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
"""The Landlord-Override glazing → glazing Simulation Overlay mapping.
|
||||
|
||||
A glazing value resolves to the SAP10 `glazing_type` code the calculator's
|
||||
Table-24 cascade reads; the overlay is whole-dwelling (expanded across every
|
||||
window by `_fold_glazing`).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from domain.epc.property_overrides.glazing_type import GlazingType
|
||||
from domain.epc.property_overlays.glazing_overlay import glazing_overlay_for
|
||||
from domain.modelling.scoring.overlay_applicator import apply_simulations
|
||||
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
|
||||
build_epc,
|
||||
)
|
||||
|
||||
|
||||
def test_double_glazing_post_2002_overlays_its_glazing_code() -> None:
    """"Double glazing, 2002 or later" resolves to glazing_type code 2."""
    # Act
    result = glazing_overlay_for("Double glazing, 2002 or later", 0)

    # Assert — double glazing 2002-2021 is SAP10 glazing_type code 2.
    assert result is not None
    glazing = result.glazing
    assert glazing is not None
    assert glazing.glazing_type == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("glazing_value", "code"),
    [
        ("Single glazing", 1),
        ("Double glazing, pre-2002", 3),
        ("Triple glazing, 2002 or later", 9),
        ("Triple glazing, pre-2002", 6),
    ],
)
def test_glazing_types_decode_to_their_sap_codes(
    glazing_value: str, code: int
) -> None:
    """Each listed glazing description resolves to the paired glazing_type code."""
    # Act
    result = glazing_overlay_for(glazing_value, 0)

    # Assert
    assert result is not None
    glazing = result.glazing
    assert glazing is not None
    assert glazing.glazing_type == code
|
||||
|
||||
|
||||
@pytest.mark.parametrize("glazing_value", ["Unknown", ""])
def test_unresolvable_glazing_produces_no_overlay(glazing_value: str) -> None:
    """"Unknown" and the empty string yield no glazing overlay."""
    # Act + Assert
    assert glazing_overlay_for(glazing_value, 0) is None
|
||||
|
||||
|
||||
def test_glazing_override_remaps_every_window_and_clears_lodged_u() -> None:
    """A whole-dwelling glazing overlay rewrites every window of the EPC."""
    # Arrange — baseline windows are double glazed (code 2, lodged U 2.8); the
    # landlord corrects the whole dwelling to single glazing.
    starting_epc = build_epc()
    assert len(starting_epc.sap_windows) > 1
    correction = glazing_overlay_for("Single glazing", 0)
    assert correction is not None

    # Act
    updated = apply_simulations(starting_epc, [correction])

    # Assert — every window flips to single (code 1) and its lodged transmission
    # U is cleared so the Table-24 cascade re-derives U from the new type.
    assert all(window.glazing_type == 1 for window in updated.sap_windows)
    assert all(
        window.window_transmission_details is None for window in updated.sap_windows
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "member", [kind for kind in GlazingType if kind is not GlazingType.UNKNOWN]
)
def test_every_resolvable_glazing_value_decodes_to_a_code(
    member: GlazingType,
) -> None:
    """Every non-UNKNOWN ``GlazingType`` value must produce an overlay.

    A classifier emits a GlazingType value; if the overlay can't decode it the
    override silently no-ops.
    """
    # Act + Assert
    assert glazing_overlay_for(member.value, 0) is not None
|
||||
115
tests/domain/epc/test_main_fuel_overlay.py
Normal file
115
tests/domain/epc/test_main_fuel_overlay.py
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
"""The Landlord-Override main-fuel → heating Simulation Overlay mapping.
|
||||
|
||||
A main-fuel value resolves to the RdSAP `main_fuel_type` int code the calculator
|
||||
reads from the dwelling's primary heating system; the overlay is whole-dwelling.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from domain.epc.property_overrides.main_fuel_type import MainFuelType
|
||||
from domain.epc.property_overlays.main_fuel_overlay import fuel_overlay_for
|
||||
from domain.modelling.scoring.overlay_applicator import apply_simulations
|
||||
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
|
||||
build_epc,
|
||||
)
|
||||
|
||||
|
||||
def test_mains_gas_overlays_the_primary_fuel() -> None:
    """"mains gas" resolves to main_fuel_type code 26 on the heating overlay."""
    # Act
    result = fuel_overlay_for("mains gas", 0)

    # Assert — mains gas (not community) is RdSAP main_fuel code 26.
    assert result is not None
    heating = result.heating
    assert heating is not None
    assert heating.main_fuel_type == 26
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("main_fuel_value", "code"),
    [
        ("electricity", 29),
        ("LPG (bulk)", 27),
        ("oil", 28),
        ("house coal", 33),
    ],
)
def test_fuels_decode_to_their_modern_not_community_codes(
    main_fuel_value: str, code: int
) -> None:
    """Each listed fuel description resolves to the paired main_fuel_type code."""
    # Act
    result = fuel_overlay_for(main_fuel_value, 0)

    # Assert
    assert result is not None
    heating = result.heating
    assert heating is not None
    assert heating.main_fuel_type == code
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("main_fuel_value", "code"),
    [
        ("bottled LPG", 3),
        ("LPG special condition", 17),
        ("electricity (community)", 25),
        ("biomass (community)", 31),
        ("dual fuel (mineral and wood)", 10),
        ("smokeless coal", 15),
    ],
)
def test_more_fuels_decode_to_their_codes(main_fuel_value: str, code: int) -> None:
    """Further fuel descriptions resolve to their paired main_fuel_type codes."""
    # Act
    result = fuel_overlay_for(main_fuel_value, 0)

    # Assert
    assert result is not None
    heating = result.heating
    assert heating is not None
    assert heating.main_fuel_type == code
|
||||
|
||||
|
||||
def test_community_mains_gas_is_a_distinct_fuel_code() -> None:
    """"mains gas (community)" resolves to code 20, not the non-community 26."""
    # Act
    result = fuel_overlay_for("mains gas (community)", 0)

    # Assert — community mains gas is code 20, distinct from 26 (not community).
    assert result is not None
    heating = result.heating
    assert heating is not None
    assert heating.main_fuel_type == 20
|
||||
|
||||
|
||||
@pytest.mark.parametrize("main_fuel_value", ["Unknown", "", "no heating or hot water"])
def test_unresolvable_fuel_produces_no_overlay(main_fuel_value: str) -> None:
    """Fuel values with no code mapping yield no overlay."""
    # Act + Assert
    assert fuel_overlay_for(main_fuel_value, 0) is None
|
||||
|
||||
|
||||
def test_fuel_override_remaps_the_primary_systems_fuel_on_the_epc() -> None:
    """Applying an "electricity" fuel overlay rewrites the first main system's fuel."""
    # Arrange — a landlord correction that the dwelling runs on electricity.
    starting_epc = build_epc()
    correction = fuel_overlay_for("electricity", 0)
    assert correction is not None

    # Act
    updated = apply_simulations(starting_epc, [correction])

    # Assert — the calculator reads the primary fuel from main_heating_details[0].
    primary_system = updated.sap_heating.main_heating_details[0]
    assert primary_system.main_fuel_type == 29
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "member", [fuel for fuel in MainFuelType if fuel is not MainFuelType.UNKNOWN]
)
def test_every_resolvable_fuel_value_decodes_to_a_code(member: MainFuelType) -> None:
    """Every non-UNKNOWN ``MainFuelType`` value must produce an overlay.

    A classifier emits a MainFuelType value; if the overlay can't decode it the
    override silently no-ops.
    """
    # Act + Assert
    assert fuel_overlay_for(member.value, 0) is not None
|
||||
|
||||
140
tests/domain/epc/test_main_heating_system_overlay.py
Normal file
140
tests/domain/epc/test_main_heating_system_overlay.py
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
"""The Landlord-Override main-heating-system → heating Simulation Overlay mapping.
|
||||
|
||||
A main-heating-system value resolves to the SAP `sap_main_heating_code` the
|
||||
calculator reads from the primary system; the overlay is whole-dwelling.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType
|
||||
from domain.epc.property_overlays.main_fuel_overlay import fuel_overlay_for
|
||||
from domain.epc.property_overlays.main_heating_system_overlay import (
|
||||
main_heating_overlay_for,
|
||||
)
|
||||
from domain.epc.property_overlays.water_heating_overlay import (
|
||||
water_heating_overlay_for,
|
||||
)
|
||||
from domain.modelling.scoring.overlay_applicator import apply_simulations
|
||||
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
|
||||
build_epc,
|
||||
)
|
||||
|
||||
|
||||
def test_gas_combi_overlays_the_primary_heating_code() -> None:
    """"Gas boiler, combi" resolves to sap_main_heating_code 104."""
    # Act
    result = main_heating_overlay_for("Gas boiler, combi", 0)

    # Assert — condensing combi is SAP Table 4b code 104.
    assert result is not None
    heating = result.heating
    assert heating is not None
    assert heating.sap_main_heating_code == 104
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("main_heating_value", "code"),
    [
        ("Gas boiler, regular", 102),
        ("Gas CPSU", 120),
        ("Electric storage heaters, fan", 404),
        ("Direct-acting electric", 191),
    ],
)
def test_heating_archetypes_decode_to_their_sap_codes(
    main_heating_value: str, code: int
) -> None:
    """Each listed heating archetype resolves to the paired sap_main_heating_code."""
    # Act
    result = main_heating_overlay_for(main_heating_value, 0)

    # Assert
    assert result is not None
    heating = result.heating
    assert heating is not None
    assert heating.sap_main_heating_code == code
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("main_heating_value", "code"),
    [
        ("Electric storage heaters, old", 401),
        ("Electric storage heaters, slimline", 402),
        ("Electric storage heaters, convector", 403),
    ],
)
def test_storage_heater_subtypes_decode_to_their_codes(
    main_heating_value: str, code: int
) -> None:
    """Each storage-heater subtype resolves to its own sap_main_heating_code."""
    # Act
    result = main_heating_overlay_for(main_heating_value, 0)

    # Assert
    assert result is not None
    heating = result.heating
    assert heating is not None
    assert heating.sap_main_heating_code == code
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "main_heating_value",
    ["Unknown", "", "Air source heat pump", "Community heating"],
)
def test_unresolvable_or_unmodelled_heating_produces_no_overlay(
    main_heating_value: str,
) -> None:
    """Unknown, empty and not-yet-modelled heating values yield no overlay.

    Heat pumps (main_heating_index_number) and community heating (community
    codes) don't map to a Table 4b sap_main_heating_code yet.
    """
    # Act + Assert
    assert main_heating_overlay_for(main_heating_value, 0) is None
|
||||
|
||||
|
||||
def test_main_heating_override_remaps_the_primary_system_code() -> None:
    """Applying a "Gas boiler, regular" overlay rewrites the first system's code."""
    # Arrange
    starting_epc = build_epc()
    correction = main_heating_overlay_for("Gas boiler, regular", 0)
    assert correction is not None

    # Act
    updated = apply_simulations(starting_epc, [correction])

    # Assert — the calculator reads the code off main_heating_details[0].
    primary_system = updated.sap_heating.main_heating_details[0]
    assert primary_system.sap_main_heating_code == 102
|
||||
|
||||
|
||||
def test_the_three_heating_overrides_compose_without_conflict() -> None:
    """Fuel, water-heating and main-heating overlays all land when applied together."""
    # Arrange — main_fuel, water_heating and main_heating_system all fold onto one
    # HeatingOverlay surface but set DISJOINT fields, so they compose (the
    # field-disjoint design that makes precedence moot for these three).
    starting_epc = build_epc()
    candidates = [
        fuel_overlay_for("electricity", 0),
        water_heating_overlay_for("Electric immersion, electricity", 0),
        main_heating_overlay_for("Electric storage heaters, fan", 0),
    ]
    assert all(candidate is not None for candidate in candidates)

    # Act — the filter only narrows the Optional type; nothing is dropped here.
    resolved = [candidate for candidate in candidates if candidate is not None]
    updated = apply_simulations(starting_epc, resolved)

    # Assert — each override landed on its own field.
    heating = updated.sap_heating
    primary_system = heating.main_heating_details[0]
    assert primary_system.main_fuel_type == 29
    assert primary_system.sap_main_heating_code == 404
    assert heating.water_heating_code == 903
    assert heating.water_heating_fuel == 29
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "member",
    [
        system
        for system in MainHeatingSystemType
        if system is not MainHeatingSystemType.UNKNOWN
    ],
)
def test_every_resolvable_main_heating_value_decodes(
    member: MainHeatingSystemType,
) -> None:
    """Every non-UNKNOWN ``MainHeatingSystemType`` value must produce an overlay."""
    # Act + Assert
    assert main_heating_overlay_for(member.value, 0) is not None
|
||||
|
|
@ -12,12 +12,12 @@ from typing import Optional
|
|||
|
||||
import pytest
|
||||
|
||||
from domain.epc.built_form_type import BuiltFormType
|
||||
from domain.epc.override_code_mapping import (
|
||||
from domain.epc.property_overrides.built_form_type import BuiltFormType
|
||||
from domain.epc.property_overrides.override_code_mapping import (
|
||||
built_form_to_code,
|
||||
property_type_to_code,
|
||||
)
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.property_overrides.property_type import PropertyType
|
||||
|
||||
|
||||
def test_house_maps_to_gov_code_zero() -> None:
|
||||
|
|
|
|||
111
tests/domain/epc/test_water_heating_overlay.py
Normal file
111
tests/domain/epc/test_water_heating_overlay.py
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
"""The Landlord-Override water-heating → heating Simulation Overlay mapping.
|
||||
|
||||
A water-heating value resolves to the SAP `water_heating_code` (system) and
|
||||
`water_heating_fuel` the calculator reads; the overlay is whole-dwelling.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from domain.epc.property_overlays.water_heating_overlay import (
|
||||
water_heating_overlay_for,
|
||||
)
|
||||
from domain.epc.property_overrides.water_heating_type import WaterHeatingType
|
||||
from domain.modelling.scoring.overlay_applicator import apply_simulations
|
||||
from tests.domain.sap10_calculator.worksheet._elmhurst_worksheet_000490 import (
|
||||
build_epc,
|
||||
)
|
||||
|
||||
|
||||
def test_from_main_system_mains_gas_overlays_water_heating() -> None:
    """"From main system, mains gas" resolves to code 901 with fuel 26."""
    # Act
    result = water_heating_overlay_for("From main system, mains gas", 0)

    # Assert — "from main system" is water_heating_code 901, mains gas fuel 26.
    assert result is not None
    heating = result.heating
    assert heating is not None
    assert heating.water_heating_code == 901
    assert heating.water_heating_fuel == 26
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("water_heating_value", "code", "fuel"),
    [
        ("From main system, electricity", 901, 29),
        ("Electric immersion, electricity", 903, 29),
    ],
)
def test_water_heating_systems_decode_to_their_codes(
    water_heating_value: str, code: int, fuel: int
) -> None:
    """Each listed description resolves to the paired system code and fuel code."""
    # Act
    result = water_heating_overlay_for(water_heating_value, 0)

    # Assert
    assert result is not None
    heating = result.heating
    assert heating is not None
    assert heating.water_heating_code == code
    assert heating.water_heating_fuel == fuel
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("water_heating_value", "code", "fuel"),
    [
        ("From main system, oil", 901, 28),
        ("From main system, LPG (bulk)", 901, 27),
        ("From main system, bottled LPG", 901, 3),
        ("From main system, house coal", 901, 33),
        # "boiler/circulator for water heating only" is SAP Table 4a code 911.
        ("Gas boiler/circulator, mains gas", 911, 26),
    ],
)
def test_more_water_heating_combos_decode_to_their_codes(
    water_heating_value: str, code: int, fuel: int
) -> None:
    """Further system/fuel combinations resolve to their paired codes."""
    # Act
    result = water_heating_overlay_for(water_heating_value, 0)

    # Assert
    assert result is not None
    heating = result.heating
    assert heating is not None
    assert heating.water_heating_code == code
    assert heating.water_heating_fuel == fuel
|
||||
|
||||
|
||||
@pytest.mark.parametrize("water_heating_value", ["Unknown", ""])
def test_unresolvable_water_heating_produces_no_overlay(
    water_heating_value: str,
) -> None:
    """"Unknown" and the empty string yield no water-heating overlay."""
    # Act + Assert
    assert water_heating_overlay_for(water_heating_value, 0) is None
|
||||
|
||||
|
||||
def test_water_heating_override_remaps_the_hot_water_arrangement() -> None:
    """Applying an electric-immersion overlay rewrites the EPC's hot-water fields."""
    # Arrange — landlord correction: HW is a separate electric immersion.
    starting_epc = build_epc()
    correction = water_heating_overlay_for("Electric immersion, electricity", 0)
    assert correction is not None

    # Act
    updated = apply_simulations(starting_epc, [correction])

    # Assert — the calculator reads these off sap_heating.
    heating = updated.sap_heating
    assert heating.water_heating_code == 903
    assert heating.water_heating_fuel == 29
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "member",
    [kind for kind in WaterHeatingType if kind is not WaterHeatingType.UNKNOWN],
)
def test_every_resolvable_water_heating_value_decodes(
    member: WaterHeatingType,
) -> None:
    """Every non-UNKNOWN ``WaterHeatingType`` value must produce an overlay."""
    # Act + Assert
    assert water_heating_overlay_for(member.value, 0) is not None
|
||||
|
|
@ -5,8 +5,8 @@ from typing import Optional
|
|||
import pytest
|
||||
|
||||
from domain.data_transformation.column_classifier import ClassificationError
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.wall_type import WallType
|
||||
from domain.epc.property_overrides.property_type import PropertyType
|
||||
from domain.epc.property_overrides.wall_type import WallType
|
||||
from infrastructure.chatgpt.chatgpt import ChatGPT
|
||||
from infrastructure.chatgpt.chatgpt_column_classifier import (
|
||||
ChatGptColumnClassifier,
|
||||
|
|
|
|||
|
|
@ -4,9 +4,9 @@ from enum import Enum
|
|||
from typing import Any, Optional
|
||||
|
||||
from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress
|
||||
from domain.epc.built_form_type import BuiltFormType
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.wall_type import WallType
|
||||
from domain.epc.property_overrides.built_form_type import BuiltFormType
|
||||
from domain.epc.property_overrides.property_type import PropertyType
|
||||
from domain.epc.property_overrides.wall_type import WallType
|
||||
from domain.postcode import Postcode
|
||||
from domain.data_transformation.column_classifier import ColumnClassifier
|
||||
from orchestration.classifiable_column import ClassifiableColumn
|
||||
|
|
|
|||
|
|
@ -25,8 +25,8 @@ import pytest
|
|||
from sqlalchemy import Engine, Table
|
||||
from sqlmodel import Session, SQLModel, select
|
||||
|
||||
from domain.epc.property_type import PropertyType
|
||||
from domain.epc.wall_type import WallType
|
||||
from domain.epc.property_overrides.property_type import PropertyType
|
||||
from domain.epc.property_overrides.wall_type import WallType
|
||||
from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import (
|
||||
LandlordOverridesRepository,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -47,6 +47,88 @@ def test_each_resolvable_component_produces_an_overlay() -> None:
|
|||
assert len(overlays) == 4
|
||||
|
||||
|
||||
def test_main_fuel_row_produces_a_heating_fuel_overlay() -> None:
    """A single ``main_fuel`` row becomes one overlay carrying the fuel code."""
    # Arrange
    fuel_row = ResolvedPropertyOverride("main_fuel", 0, "mains gas")
    overrides = ResolvedPropertyOverrides(rows=(fuel_row,))

    # Act
    produced = overlays_from(overrides)

    # Assert
    assert len(produced) == 1
    heating = produced[0].heating
    assert heating is not None
    assert heating.main_fuel_type == 26
|
||||
|
||||
|
||||
def test_glazing_row_produces_a_glazing_overlay() -> None:
    """A single ``glazing`` row becomes one overlay carrying the glazing code."""
    # Arrange
    glazing_row = ResolvedPropertyOverride(
        "glazing", 0, "Double glazing, 2002 or later"
    )
    overrides = ResolvedPropertyOverrides(rows=(glazing_row,))

    # Act
    produced = overlays_from(overrides)

    # Assert
    assert len(produced) == 1
    glazing = produced[0].glazing
    assert glazing is not None
    assert glazing.glazing_type == 2
|
||||
|
||||
|
||||
def test_construction_age_band_row_produces_a_building_part_overlay() -> None:
    """A single ``construction_age_band`` row becomes one MAIN-part overlay."""
    # Arrange
    age_band_row = ResolvedPropertyOverride("construction_age_band", 0, "B")
    overrides = ResolvedPropertyOverrides(rows=(age_band_row,))

    # Act
    produced = overlays_from(overrides)

    # Assert
    assert len(produced) == 1
    main_overlay = produced[0].building_parts[BuildingPartIdentifier.MAIN]
    assert main_overlay.construction_age_band == "B"
|
||||
|
||||
|
||||
def test_water_heating_row_produces_a_heating_overlay() -> None:
    """A single ``water_heating`` row becomes one overlay with system and fuel codes."""
    # Arrange
    water_row = ResolvedPropertyOverride(
        "water_heating", 0, "From main system, mains gas"
    )
    overrides = ResolvedPropertyOverrides(rows=(water_row,))

    # Act
    produced = overlays_from(overrides)

    # Assert
    assert len(produced) == 1
    heating = produced[0].heating
    assert heating is not None
    assert heating.water_heating_code == 901
    assert heating.water_heating_fuel == 26
|
||||
|
||||
|
||||
def test_main_heating_system_row_produces_a_heating_overlay() -> None:
    """One resolved ``main_heating_system`` row yields one heating overlay.

    For "Gas boiler, combi" the heating part must expose
    ``sap_main_heating_code == 104``.
    """
    # Arrange
    system_row = ResolvedPropertyOverride(
        "main_heating_system", 0, "Gas boiler, combi"
    )
    resolved = ResolvedPropertyOverrides(rows=(system_row,))

    # Act
    produced = overlays_from(resolved)

    # Assert
    assert len(produced) == 1
    heating = produced[0].heating
    assert heating is not None
    assert heating.sap_main_heating_code == 104
|
||||
|
||||
|
||||
def test_unresolvable_rows_are_skipped() -> None:
|
||||
# Arrange — an "Unknown" property type and an unmapped wall material.
|
||||
overrides = ResolvedPropertyOverrides(
|
||||
|
|
|
|||
|
|
@ -0,0 +1,35 @@
|
|||
"""Every override component must be wired through the WHOLE chain.
|
||||
|
||||
The finaliser reader (`_ROW_TYPES`, component -> landlord table) and the overlay
|
||||
registry (`_COMPONENT_OVERLAYS`, component -> overlay mapper) must cover exactly
|
||||
the same set of components. If a component is classified + stored but has no
|
||||
reader entry, the finaliser silently never writes its `property_overrides` rows;
|
||||
if it has no overlay entry, the row never reaches the calculator. This guard
|
||||
keeps the two registries in lock-step (it would have caught the missing
|
||||
main_fuel / glazing / construction_age_band reader entries).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import cast
|
||||
|
||||
from infrastructure.landlord_overrides.landlord_override_reader_postgres_repository import (
|
||||
_ROW_TYPES, # pyright: ignore[reportPrivateUsage]
|
||||
)
|
||||
from infrastructure.postgres.property_override_table import override_component_sa_enum
|
||||
from repositories.property.landlord_override_overlays import (
|
||||
_COMPONENT_OVERLAYS, # pyright: ignore[reportPrivateUsage]
|
||||
)
|
||||
|
||||
|
||||
def test_reader_and_overlay_registries_cover_the_same_components() -> None:
    """The reader registry and the overlay registry must list the same components."""
    # Arrange
    reader_components = set(_ROW_TYPES)
    overlay_components = set(_COMPONENT_OVERLAYS)

    # Assert
    assert reader_components == overlay_components
|
||||
|
||||
|
||||
def test_override_component_pgenum_covers_every_component() -> None:
    """The ``override_component`` pgEnum mirror must list every overlay component.

    If a component is missing from the enum, writing or reading a row for it
    through the enum throws a LookupError against Postgres (caught live on the
    Hyde portfolio-796 run).
    """
    # `enums` is fetched via getattr and cast so the value is typed as list[str].
    declared = set(cast(list[str], getattr(override_component_sa_enum, "enums")))
    expected = set(_COMPONENT_OVERLAYS)

    assert declared == expected
|
||||
95
tests/scripts/test_build_property_overrides_smoke.py
Normal file
95
tests/scripts/test_build_property_overrides_smoke.py
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
"""End-to-end smoke of the Hyde override script for ONE property, against a real
|
||||
(ephemeral) Postgres. Seeds the landlord vocab (simulating post-classify, so no
|
||||
ChatGPT) + a minimal ``property`` row, then runs the script's real
|
||||
``write`` + ``verify`` paths and asserts property_overrides + overlays land.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy import Engine, text
|
||||
from sqlmodel import Session
|
||||
|
||||
import scripts.hyde.build_property_overrides as b
|
||||
from domain.epc.property_overrides.built_form_type import BuiltFormType
|
||||
from domain.epc.property_overrides.construction_age_band import ConstructionAgeBand
|
||||
from domain.epc.property_overrides.glazing_type import GlazingType
|
||||
from domain.epc.property_overrides.main_fuel_type import MainFuelType
|
||||
from domain.epc.property_overrides.main_heating_system_type import MainHeatingSystemType
|
||||
from domain.epc.property_overrides.property_type import PropertyType
|
||||
from domain.epc.property_overrides.roof_type import RoofType
|
||||
from domain.epc.property_overrides.wall_type import WallType
|
||||
from domain.epc.property_overrides.water_heating_type import WaterHeatingType
|
||||
from infrastructure.landlord_overrides.landlord_overrides_postgres_repository import (
|
||||
LandlordOverridesRepository,
|
||||
)
|
||||
from repositories.property.landlord_override_overlays import overlays_from
|
||||
from repositories.property.property_overrides_postgres_reader import (
|
||||
PropertyOverridesPostgresReader,
|
||||
)
|
||||
|
||||
PORTFOLIO = 795
ORG_REF = "55180004001"
EXCEL = "scripts/hyde/hyde_property_overrides.xlsx"

# Stand-in for the classifier: for each override component, the raw Excel entry
# for this property paired with the enum member ChatGPT WOULD resolve it to.
# The test seeds these 9 pairs into the landlord ledger.
SEED = {
    "property_type": ("House: MidTerrace", PropertyType.HOUSE),
    "built_form_type": ("House: MidTerrace", BuiltFormType.MID_TERRACE),
    "wall_type": (
        "TimberFrame: AsBuilt",
        WallType.TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED,
    ),
    "roof_type": (
        "PitchedNormalLoftAccess: 300mm",
        RoofType.PITCHED_LOFT_300MM,
    ),
    "construction_age_band": (
        "L: 2012-2022",
        ConstructionAgeBand.L_2012_2022,
    ),
    "main_fuel": ("Gas: Mains Gas", MainFuelType.MAINS_GAS),
    "glazing": (
        "100% Double glazing 2002 or later",
        GlazingType.DOUBLE_POST_2002,
    ),
    "water_heating": (
        "From main heating system: Mains Gas",
        WaterHeatingType.FROM_MAIN_MAINS_GAS,
    ),
    "main_heating_system": (
        "Boiler: C rated Combi",
        MainHeatingSystemType.GAS_COMBI,
    ),
}
|
||||
|
||||
|
||||
def test_one_property_end_to_end(db_engine: Engine, monkeypatch: Any) -> None:
    """Run the script's write() and verify() for one property against ``db_engine``.

    Creates a minimal ``property`` table with one row, seeds the landlord
    ledger from ``SEED``, then checks that write() stores nine
    ``property_overrides`` rows and that those rows turn into nine overlays.
    """

    def open_session() -> Session:
        # Session factory bound to the test engine; also handed to the script.
        return Session(db_engine)

    component_specs = b._specs_by_component()  # pyright: ignore[reportPrivateUsage]

    # Minimal FE-owned `property` table + the one row we'll match by org_ref.
    create_property_table = text(
        "CREATE TABLE property (id bigint PRIMARY KEY, portfolio_id bigint, "
        "landlord_property_id text)"
    )
    insert_property_row = text("INSERT INTO property VALUES (1, :p, :ref)")

    with open_session() as session:
        session.execute(create_property_table)  # pyright: ignore[reportDeprecated]
        session.execute(  # pyright: ignore[reportDeprecated]
            insert_property_row, {"p": PORTFOLIO, "ref": ORG_REF}
        )
        # Seed the classifier ledger (keyed on normalised description).
        for component, (raw_entry, enum_member) in SEED.items():
            ledger: LandlordOverridesRepository[Any] = LandlordOverridesRepository(
                session, component_specs[component].row_type
            )
            normalised = b._norm(raw_entry)  # pyright: ignore[reportPrivateUsage]
            ledger.upsert_all(PORTFOLIO, {normalised: enum_member})
        session.commit()

    # Point the script at the ephemeral engine.
    monkeypatch.setattr(b, "_db_session", open_session)

    # --- run the real write() for this one property ---
    write_args = argparse.Namespace(
        excel=EXCEL,
        sheet="AddressProfilingResults",
        portfolio_id=PORTFOLIO,
        org_ref=ORG_REF,
        limit=None,
        apply=True,
    )
    b.write(write_args)

    select_overrides = text(
        "SELECT override_component, building_part, override_value "
        "FROM property_overrides WHERE property_id = 1 ORDER BY override_component"
    )
    with open_session() as session:
        stored_rows = list(
            session.execute(select_overrides)  # pyright: ignore[reportDeprecated]
        )
    value_by_component = {component: value for component, _, value in stored_rows}

    # Every seeded component produced a property_overrides row with the
    # resolved value; spot-check the five newer components first.
    expected_values = {
        "main_fuel": "mains gas",
        "glazing": "Double glazing, 2002 or later",
        "construction_age_band": "L",
        "main_heating_system": "Gas boiler, combi",
        "water_heating": "From main system, mains gas",
    }
    for component, expected in expected_values.items():
        assert value_by_component[component] == expected
    assert len(stored_rows) == 9  # all 9 components

    # --- the overrides reach the SAP overlay surface ---
    verify_args = argparse.Namespace(portfolio_id=PORTFOLIO, org_ref=ORG_REF)
    b.verify(verify_args)  # exercises verify()

    reader = PropertyOverridesPostgresReader(open_session)
    produced = overlays_from(reader.overrides_for(1))
    assert len(produced) == 9
    assert any(
        overlay.heating is not None and overlay.heating.main_fuel_type == 26
        for overlay in produced
    )
    assert any(
        overlay.glazing is not None and overlay.glazing.glazing_type == 2
        for overlay in produced
    )
|
||||
Loading…
Add table
Reference in a new issue