Model/backend/documents_parser/tests/test_elmhurst_extractor.py
Khalim Conn-Kowlessar ec9ef0e8bb fix(extractor): drop windows-table header remnant from first window glazing type
Summary PDFs preprocessed from `pdftotext -layout` wrap the windows-table
header across several lines. The third header line's tail ("U value / g
value / Draught Proofed / Permanent Shutters") tokenises to "value value
Proofed Shutters" and lands directly above the FIRST window's data row.

Because the first window in a building part has `before_start = 0`, its
prefix block reaches back into that header remnant. The remnant is
neither an orientation nor a building-part fragment, so it survived the
pops in `_compose_window_descriptors` and leaked into glazing_type as
"value value Proofed Shutters Double between 2002 and 2021" (windows 2-3,
whose prefix starts after the previous window's manufacturer line, were
clean).

Fix: the glazing-type phrase always starts with a glazing-start word
(Single/Double/Triple/Secondary), so trim any prefix fragments preceding
that word before joining the glazing type. Orientation/bp pops still run
on the full prefix, so they are unaffected.

Reproduced from `sap worksheets/Recommendations Elmhurst Files/
cavity_wall_insulation - main wall/before/Summary_001431.pdf`. Added a
regression test driving the real `_extract_windows_from_layout` path with
the verbatim tokenised header+rows. 2306 passed (+4), pyright net-zero.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-02 22:54:49 +00:00

624 lines
23 KiB
Python

import json
import os
from datetime import date
import pytest
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
from datatypes.epc.surveys.elmhurst_site_notes import (
BathsAndShowers,
BuildingPartDimensions,
ElmhurstSiteNotes,
FloorDetails,
FloorDimension,
Lighting,
MainHeating,
Meters,
PropertyDetails,
Renewables,
RoofDetails,
Shower,
SurveyorInfo,
VentilationAndCooling,
WallDetails,
WaterHeating,
Window,
)
# Both text fixtures live in the "fixtures" directory next to this test module.
_FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
FIXTURE_PATH = os.path.join(_FIXTURES_DIR, "elmhurst_site_notes_1_text.json")
FIXTURE_PATH_2 = os.path.join(_FIXTURES_DIR, "elmhurst_site_notes_2_text.json")
@pytest.fixture(scope="module")
def result() -> ElmhurstSiteNotes:
    """Parse the first site-notes fixture once per test module.

    Loads the JSON stored at ``FIXTURE_PATH``, hands the decoded value to
    ``ElmhurstSiteNotesExtractor`` and returns whatever ``extract()`` yields.
    """
    # JSON interchange text is UTF-8; pin the codec so the read does not
    # depend on the platform's locale default.
    with open(FIXTURE_PATH, encoding="utf-8") as f:
        pages = json.load(f)
    return ElmhurstSiteNotesExtractor(pages).extract()
@pytest.fixture(scope="module")
def result2() -> ElmhurstSiteNotes:
    """Parse the second site-notes fixture once per test module.

    Loads the JSON stored at ``FIXTURE_PATH_2``, hands the decoded value to
    ``ElmhurstSiteNotesExtractor`` and returns whatever ``extract()`` yields.
    """
    # JSON interchange text is UTF-8; pin the codec so the read does not
    # depend on the platform's locale default.
    with open(FIXTURE_PATH_2, encoding="utf-8") as f:
        pages = json.load(f)
    return ElmhurstSiteNotesExtractor(pages).extract()
class TestSurveyorInfo:
    """Expected ``surveyor_info`` values for the ``result`` fixture."""

    def test_surveyor_code(self, result: ElmhurstSiteNotes) -> None:
        surveyor = result.surveyor_info
        assert surveyor.surveyor_code == "P960-0001"

    def test_name(self, result: ElmhurstSiteNotes) -> None:
        surveyor = result.surveyor_info
        assert surveyor.name == "Richard Matthew Ratcliff"

    def test_title(self, result: ElmhurstSiteNotes) -> None:
        surveyor = result.surveyor_info
        assert surveyor.title == "Mr."

    def test_tel_number(self, result: ElmhurstSiteNotes) -> None:
        surveyor = result.surveyor_info
        assert surveyor.tel_number == "07760 443 469"

    def test_survey_reference(self, result: ElmhurstSiteNotes) -> None:
        surveyor = result.surveyor_info
        assert surveyor.survey_reference == "001573"

    def test_my_reference_none(self, result: ElmhurstSiteNotes) -> None:
        surveyor = result.surveyor_info
        assert surveyor.my_reference is None
class TestPropertyDetails:
    """Expected ``property_details`` values for the ``result`` fixture."""

    def test_rdsap_version(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.rdsap_version == "RdSAP10"

    def test_reference_number(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.reference_number == "P960-0001-001573"

    def test_lodgement_required(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.lodgement_required is False

    def test_regs_region(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.regs_region == "England"

    def test_epc_language(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.epc_language == "English"

    def test_uprn_none(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.uprn is None

    def test_postcode(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.postcode == "BB10 1XX"

    def test_region(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.region == "West Pennines"

    def test_house_name_none(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.house_name is None

    def test_house_number(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.house_number == "19"

    def test_street(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.street == "Queens Road"

    def test_locality_none(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.locality is None

    def test_town(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.town == "BURNLEY"

    def test_county_none(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.county is None

    def test_tenure(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.tenure == "Rented (social)"

    def test_transaction_type(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.transaction_type == "Grant scheme"

    def test_inspection_date(self, result: ElmhurstSiteNotes) -> None:
        expected = date(2026, 3, 6)
        assert result.property_details.inspection_date == expected

    def test_process_date(self, result: ElmhurstSiteNotes) -> None:
        expected = date(2026, 3, 6)
        assert result.property_details.process_date == expected

    def test_epc_exists(self, result: ElmhurstSiteNotes) -> None:
        details = result.property_details
        assert details.epc_exists is False
class TestPropertyDescription:
    """Expected top-level property description values for ``result``."""

    def test_property_type(self, result: ElmhurstSiteNotes) -> None:
        actual = result.property_type
        assert actual == "B Bungalow"

    def test_attachment(self, result: ElmhurstSiteNotes) -> None:
        actual = result.attachment
        assert actual == "E End-Terrace"

    def test_number_of_storeys(self, result: ElmhurstSiteNotes) -> None:
        actual = result.number_of_storeys
        assert actual == 1

    def test_habitable_rooms(self, result: ElmhurstSiteNotes) -> None:
        actual = result.habitable_rooms
        assert actual == 2

    def test_heated_habitable_rooms(self, result: ElmhurstSiteNotes) -> None:
        actual = result.heated_habitable_rooms
        assert actual == 2

    def test_construction_age_band(self, result: ElmhurstSiteNotes) -> None:
        actual = result.construction_age_band
        assert actual == "D 1950-1966"

    def test_has_conservatory(self, result: ElmhurstSiteNotes) -> None:
        actual = result.has_conservatory
        assert actual is False
class TestDimensions:
    """Expected ``dimensions`` values (single floor) for ``result``."""

    def test_dimension_type(self, result: ElmhurstSiteNotes) -> None:
        dims = result.dimensions
        assert dims.dimension_type == "Internal"

    def test_floor_count(self, result: ElmhurstSiteNotes) -> None:
        floors = result.dimensions.floors
        assert len(floors) == 1

    def test_floor_name(self, result: ElmhurstSiteNotes) -> None:
        lowest = result.dimensions.floors[0]
        assert lowest.name == "Lowest Floor"

    def test_floor_area(self, result: ElmhurstSiteNotes) -> None:
        lowest = result.dimensions.floors[0]
        assert lowest.area_m2 == 44.89

    def test_floor_room_height(self, result: ElmhurstSiteNotes) -> None:
        lowest = result.dimensions.floors[0]
        assert lowest.room_height_m == 2.24

    def test_floor_heat_loss_perimeter(self, result: ElmhurstSiteNotes) -> None:
        lowest = result.dimensions.floors[0]
        assert lowest.heat_loss_perimeter_m == 20.10

    def test_floor_party_wall_length(self, result: ElmhurstSiteNotes) -> None:
        lowest = result.dimensions.floors[0]
        assert lowest.party_wall_length_m == 6.70
class TestWalls:
    """Expected ``walls`` values for the ``result`` fixture."""

    def test_wall_type(self, result: ElmhurstSiteNotes) -> None:
        walls = result.walls
        assert walls.wall_type == "CA Cavity"

    def test_insulation(self, result: ElmhurstSiteNotes) -> None:
        walls = result.walls
        assert walls.insulation == "F Filled Cavity"

    def test_thickness_unknown(self, result: ElmhurstSiteNotes) -> None:
        walls = result.walls
        assert walls.thickness_unknown is False

    def test_thickness_mm(self, result: ElmhurstSiteNotes) -> None:
        walls = result.walls
        assert walls.thickness_mm == 300

    def test_u_value_known(self, result: ElmhurstSiteNotes) -> None:
        walls = result.walls
        assert walls.u_value_known is False

    def test_party_wall_type(self, result: ElmhurstSiteNotes) -> None:
        walls = result.walls
        assert walls.party_wall_type == "U Unable to determine"
class TestRoof:
    """Expected ``roof`` values for the ``result`` fixture."""

    def test_roof_type(self, result: ElmhurstSiteNotes) -> None:
        expected = "PA Pitched (slates/tiles), access to loft"
        assert result.roof.roof_type == expected

    def test_insulation(self, result: ElmhurstSiteNotes) -> None:
        roof = result.roof
        assert roof.insulation == "J Joists"

    def test_insulation_thickness_mm(self, result: ElmhurstSiteNotes) -> None:
        roof = result.roof
        assert roof.insulation_thickness_mm == 270

    def test_u_value_known(self, result: ElmhurstSiteNotes) -> None:
        roof = result.roof
        assert roof.u_value_known is False
class TestFloor:
    """Expected ``floor`` values for the ``result`` fixture."""

    def test_location(self, result: ElmhurstSiteNotes) -> None:
        floor = result.floor
        assert floor.location == "G Ground floor"

    def test_floor_type(self, result: ElmhurstSiteNotes) -> None:
        floor = result.floor
        assert floor.floor_type == "N Suspended, not timber"

    def test_insulation(self, result: ElmhurstSiteNotes) -> None:
        floor = result.floor
        assert floor.insulation == "A As built"

    def test_default_u_value(self, result: ElmhurstSiteNotes) -> None:
        floor = result.floor
        assert floor.default_u_value == 0.69

    def test_u_value_known(self, result: ElmhurstSiteNotes) -> None:
        floor = result.floor
        assert floor.u_value_known is False
class TestDoors:
    """Expected door counts for the ``result`` fixture."""

    def test_door_count(self, result: ElmhurstSiteNotes) -> None:
        doors = result.door_count
        assert doors == 0

    def test_insulated_door_count(self, result: ElmhurstSiteNotes) -> None:
        insulated = result.insulated_door_count
        assert insulated == 0
class TestWindows:
    """Expected ``windows`` entries for the ``result`` fixture."""

    def test_window_count(self, result: ElmhurstSiteNotes) -> None:
        all_windows = result.windows
        assert len(all_windows) == 4

    def test_draught_proofing_percent(self, result: ElmhurstSiteNotes) -> None:
        percent = result.draught_proofing_percent
        assert percent == 100

    def test_first_window_dimensions(self, result: ElmhurstSiteNotes) -> None:
        first = result.windows[0]
        assert first.width_m == 1.30
        assert first.height_m == 1.10
        assert first.area_m2 == 1.43

    def test_first_window_glazing(self, result: ElmhurstSiteNotes) -> None:
        first = result.windows[0]
        assert first.glazing_type == "Double post or during 2022"
        assert first.frame_factor == 0.70

    def test_first_window_location(self, result: ElmhurstSiteNotes) -> None:
        first = result.windows[0]
        assert first.building_part == "Main"
        assert first.location == "External wall"
        assert first.orientation == "North"

    def test_first_window_performance(self, result: ElmhurstSiteNotes) -> None:
        first = result.windows[0]
        assert first.data_source == "Manufacturer"
        assert first.u_value == 1.40
        assert first.g_value == 0.72
        assert first.draught_proofed is True
        # The string "None", not the None singleton.
        assert first.permanent_shutters == "None"

    def test_third_window_orientation(self, result: ElmhurstSiteNotes) -> None:
        third = result.windows[2]
        assert third.orientation == "South"

    def test_fourth_window_dimensions(self, result: ElmhurstSiteNotes) -> None:
        fourth = result.windows[3]
        assert fourth.width_m == 0.70
        assert fourth.height_m == 1.30
        assert fourth.area_m2 == 0.91
class TestVentilation:
    """Expected ``ventilation`` values for the ``result`` fixture."""

    def test_open_chimneys(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.open_chimneys_count == 0

    def test_open_flues(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.open_flues_count == 0

    def test_open_chimneys_closed_fire(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.open_chimneys_closed_fire_count == 0

    def test_solid_fuel_boiler_flues(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.solid_fuel_boiler_flues_count == 0

    def test_other_heater_flues(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.other_heater_flues_count == 0

    def test_blocked_chimneys(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.blocked_chimneys_count == 0

    def test_extract_fans(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.extract_fans_count == 2

    def test_passive_vents(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.passive_vents_count == 0

    def test_flueless_gas_fires(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.flueless_gas_fires_count == 0

    def test_fixed_space_cooling(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.fixed_space_cooling is False

    def test_draught_lobby(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.draught_lobby == "Not present"

    def test_mechanical_ventilation(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.mechanical_ventilation is False

    def test_pressure_test_method(self, result: ElmhurstSiteNotes) -> None:
        vent = result.ventilation
        assert vent.pressure_test_method == "Not available"
class TestLighting:
    """Expected ``lighting`` values for ``result`` (LED/CFL split known)."""

    def test_total_bulbs(self, result: ElmhurstSiteNotes) -> None:
        lights = result.lighting
        assert lights.total_bulbs == 8

    def test_led_cfl_count_known(self, result: ElmhurstSiteNotes) -> None:
        lights = result.lighting
        assert lights.led_cfl_count_known is True

    def test_led_count(self, result: ElmhurstSiteNotes) -> None:
        lights = result.lighting
        assert lights.led_count == 4

    def test_cfl_count(self, result: ElmhurstSiteNotes) -> None:
        lights = result.lighting
        assert lights.cfl_count == 4

    def test_incandescent_count(self, result: ElmhurstSiteNotes) -> None:
        lights = result.lighting
        assert lights.incandescent_count == 0
class TestMainHeating:
    """Expected ``main_heating`` values for the ``result`` fixture."""

    def test_pcdf_boiler_reference(self, result: ElmhurstSiteNotes) -> None:
        expected = "17742 Potterton, Promax 33 Combi ErP, 88.30%"
        assert result.main_heating.pcdf_boiler_reference == expected

    def test_heat_emitter(self, result: ElmhurstSiteNotes) -> None:
        heating = result.main_heating
        assert heating.heat_emitter == "Radiators"

    def test_heat_pump_age(self, result: ElmhurstSiteNotes) -> None:
        heating = result.main_heating
        assert heating.heat_pump_age == "Unknown"

    def test_fuel_type(self, result: ElmhurstSiteNotes) -> None:
        heating = result.main_heating
        assert heating.fuel_type == "Mains gas"

    def test_flue_type(self, result: ElmhurstSiteNotes) -> None:
        heating = result.main_heating
        assert heating.flue_type == "Balanced"

    def test_fan_assisted_flue(self, result: ElmhurstSiteNotes) -> None:
        heating = result.main_heating
        assert heating.fan_assisted_flue is True

    def test_design_flow_temperature(self, result: ElmhurstSiteNotes) -> None:
        heating = result.main_heating
        assert heating.design_flow_temperature == "Unknown"

    def test_heating_controls_ees(self, result: ElmhurstSiteNotes) -> None:
        heating = result.main_heating
        assert heating.heating_controls_ees == "CBE"

    def test_heating_controls_sap(self, result: ElmhurstSiteNotes) -> None:
        expected = "SAP code 2106, Programmer, room thermostat and TRVs"
        assert result.main_heating.heating_controls_sap == expected

    def test_percentage_of_heat(self, result: ElmhurstSiteNotes) -> None:
        heating = result.main_heating
        assert heating.percentage_of_heat == 100
class TestMeters:
    """Expected ``meters`` values for the ``result`` fixture."""

    def test_electricity_meter_type(self, result: ElmhurstSiteNotes) -> None:
        meters = result.meters
        assert meters.electricity_meter_type == "Single"

    def test_main_gas(self, result: ElmhurstSiteNotes) -> None:
        meters = result.meters
        assert meters.main_gas is True

    def test_electricity_smart_meter(self, result: ElmhurstSiteNotes) -> None:
        meters = result.meters
        assert meters.electricity_smart_meter is False

    def test_gas_smart_meter(self, result: ElmhurstSiteNotes) -> None:
        meters = result.meters
        assert meters.gas_smart_meter is False
class TestWaterHeating:
    """Expected ``water_heating`` values for the ``result`` fixture."""

    def test_water_heating_code(self, result: ElmhurstSiteNotes) -> None:
        hot_water = result.water_heating
        assert hot_water.water_heating_code == "HWP"

    def test_water_heating_sap_code(self, result: ElmhurstSiteNotes) -> None:
        hot_water = result.water_heating
        assert hot_water.water_heating_sap_code == 901

    def test_water_heating_fuel_type(self, result: ElmhurstSiteNotes) -> None:
        hot_water = result.water_heating
        assert hot_water.water_heating_fuel_type == "Mains gas"

    def test_hot_water_cylinder_present(self, result: ElmhurstSiteNotes) -> None:
        hot_water = result.water_heating
        assert hot_water.hot_water_cylinder_present is False
class TestBathsAndShowers:
    """Expected ``baths_and_showers`` values for the ``result`` fixture."""

    def test_number_of_baths(self, result: ElmhurstSiteNotes) -> None:
        bathing = result.baths_and_showers
        assert bathing.number_of_baths == 0

    def test_number_of_baths_connected(self, result: ElmhurstSiteNotes) -> None:
        bathing = result.baths_and_showers
        assert bathing.number_of_baths_connected == 0

    def test_shower_count(self, result: ElmhurstSiteNotes) -> None:
        showers = result.baths_and_showers.showers
        assert len(showers) == 1

    def test_shower_number(self, result: ElmhurstSiteNotes) -> None:
        first_shower = result.baths_and_showers.showers[0]
        assert first_shower.shower_number == 1

    def test_shower_outlet_type(self, result: ElmhurstSiteNotes) -> None:
        first_shower = result.baths_and_showers.showers[0]
        assert first_shower.outlet_type == "Electric shower"

    def test_shower_connected(self, result: ElmhurstSiteNotes) -> None:
        first_shower = result.baths_and_showers.showers[0]
        # The string "None", not the None singleton.
        assert first_shower.connected == "None"
class TestRenewables:
    """Expected ``renewables`` values for the ``result`` fixture."""

    def test_solar_water_heating(self, result: ElmhurstSiteNotes) -> None:
        renewables = result.renewables
        assert renewables.solar_water_heating is False

    def test_wwhrs_present(self, result: ElmhurstSiteNotes) -> None:
        renewables = result.renewables
        assert renewables.wwhrs_present is False

    def test_flue_gas_heat_recovery_present(self, result: ElmhurstSiteNotes) -> None:
        renewables = result.renewables
        assert renewables.flue_gas_heat_recovery_present is False

    def test_photovoltaic_panel(self, result: ElmhurstSiteNotes) -> None:
        renewables = result.renewables
        # The string "None", not the None singleton.
        assert renewables.photovoltaic_panel == "None"

    def test_export_capable_meter(self, result: ElmhurstSiteNotes) -> None:
        renewables = result.renewables
        assert renewables.export_capable_meter is False

    def test_wind_turbine_present(self, result: ElmhurstSiteNotes) -> None:
        renewables = result.renewables
        assert renewables.wind_turbine_present is False

    def test_wind_turbines_terrain_type(self, result: ElmhurstSiteNotes) -> None:
        renewables = result.renewables
        assert renewables.wind_turbines_terrain_type == "Suburban"

    def test_hydro_electricity_generated_kwh(self, result: ElmhurstSiteNotes) -> None:
        renewables = result.renewables
        assert renewables.hydro_electricity_generated_kwh == 0.0
class TestEnergyPerformance:
    """Expected rating and emission figures for the ``result`` fixture."""

    def test_current_sap_rating(self, result: ElmhurstSiteNotes) -> None:
        rating = result.current_sap_rating
        assert rating == 69

    def test_potential_sap_rating(self, result: ElmhurstSiteNotes) -> None:
        rating = result.potential_sap_rating
        assert rating == 77

    def test_current_ei_rating(self, result: ElmhurstSiteNotes) -> None:
        rating = result.current_ei_rating
        assert rating == 76

    def test_potential_ei_rating(self, result: ElmhurstSiteNotes) -> None:
        rating = result.potential_ei_rating
        assert rating == 81

    def test_co2_emissions_current_t(self, result: ElmhurstSiteNotes) -> None:
        emissions = result.co2_emissions_current_t
        assert emissions == 1.683
class TestWindowsWithFrameDetails:
    """Expected ``windows`` entries for ``result2``, including frame fields."""

    def test_window_count(self, result2: ElmhurstSiteNotes) -> None:
        all_windows = result2.windows
        assert len(all_windows) == 8

    def test_draught_proofing_percent(self, result2: ElmhurstSiteNotes) -> None:
        percent = result2.draught_proofing_percent
        assert percent == 90

    def test_first_window_glazing_type_excludes_frame_type(
        self, result2: ElmhurstSiteNotes
    ) -> None:
        first = result2.windows[0]
        assert first.glazing_type == "Double with unknown install date"

    def test_first_window_frame_type(self, result2: ElmhurstSiteNotes) -> None:
        first = result2.windows[0]
        assert first.frame_type == "PVC"

    def test_first_window_frame_factor(self, result2: ElmhurstSiteNotes) -> None:
        first = result2.windows[0]
        assert first.frame_factor == 0.70

    def test_first_window_glazing_gap(self, result2: ElmhurstSiteNotes) -> None:
        first = result2.windows[0]
        assert first.glazing_gap == "16 mm or more"

    def test_first_window_location(self, result2: ElmhurstSiteNotes) -> None:
        first = result2.windows[0]
        assert first.building_part == "Main"
        assert first.location == "External wall"
        assert first.orientation == "East"

    def test_first_window_performance(self, result2: ElmhurstSiteNotes) -> None:
        first = result2.windows[0]
        assert first.data_source == "Manufacturer"
        assert first.u_value == 2.70
        assert first.g_value == 0.76
        assert first.draught_proofed is True
        # The string "None", not the None singleton.
        assert first.permanent_shutters == "None"

    def test_fourth_window_orientation(self, result2: ElmhurstSiteNotes) -> None:
        fourth = result2.windows[3]
        assert fourth.orientation == "South"
class TestLightingLedCflUnknown:
    """Expected ``lighting`` values for ``result2`` (LED/CFL split unknown)."""

    def test_total_bulbs(self, result2: ElmhurstSiteNotes) -> None:
        lights = result2.lighting
        assert lights.total_bulbs == 10

    def test_led_cfl_count_known_false(self, result2: ElmhurstSiteNotes) -> None:
        lights = result2.lighting
        assert lights.led_cfl_count_known is False

    def test_low_energy_count(self, result2: ElmhurstSiteNotes) -> None:
        lights = result2.lighting
        assert lights.low_energy_count == 5

    def test_incandescent_count(self, result2: ElmhurstSiteNotes) -> None:
        lights = result2.lighting
        assert lights.incandescent_count == 5

    def test_led_count_zero_when_unknown(self, result2: ElmhurstSiteNotes) -> None:
        lights = result2.lighting
        assert lights.led_count == 0

    def test_cfl_count_zero_when_unknown(self, result2: ElmhurstSiteNotes) -> None:
        lights = result2.lighting
        assert lights.cfl_count == 0
class TestWindowsLayoutHeaderRemnant:
    """Regression: windows-table header text must not leak into glazing type.

    In Summary PDFs preprocessed from `pdftotext -layout` the windows table
    header wraps over several lines. The tail of its third line
    ("U value / g value / Draught Proofed / Permanent Shutters") tokenises
    to "value value Proofed Shutters" and lands immediately above the FIRST
    window's data row. The first window in a building part has
    `before_start = 0`, so its prefix block stretches back into that
    remnant; being neither an orientation nor a building-part fragment, the
    remnant used to end up in `glazing_type` as
    "value value Proofed Shutters Double between 2002 and 2021".

    Reproduced from `sap worksheets/Recommendations Elmhurst Files/
    cavity_wall_insulation - main wall/before/Summary_001431.pdf` (3
    Manufacturer-data-source windows; only window 0 was corrupted).
    """

    # One page holding the tokenised windows section, captured verbatim from
    # the Summary PDF named in the docstring. The header remnant
    # "value value Proofed Shutters" comes right before window 0's wrapped
    # glazing cell ("Double between 2002" / "and 2021").
    _WINDOWS_PAGE = "\n".join((
        "11.0 Windows:",
        "Frame Frame Glazing",
        "Building",
        "U",
        "g Draught Permanent",
        "W",
        "H",
        "Area Glazing Type",
        "Location",
        "Orient. Data-Source",
        "Type Factor Gap",
        "Part",
        "value value Proofed Shutters",
        "Double between 2002",
        "North",
        "0.97 1.00 0.97",
        "PVC",
        "0.70",
        "Main",
        "External wall",
        "Manufacturer 2.00",
        "0.72",
        "Yes",
        "None",
        "and 2021",
        "West",
        "Double between 2002",
        "South",
        "2.66 1.00 2.66",
        "PVC",
        "0.70",
        "Main",
        "External wall",
        "Manufacturer 2.00",
        "0.72",
        "Yes",
        "None",
        "and 2021",
        "East",
        "Double between 2002",
        "South",
        "2.66 1.00 2.66",
        "PVC",
        "0.70",
        "Main",
        "External wall",
        "Manufacturer 2.00",
        "0.72",
        "Yes",
        "None",
        "and 2021",
        "East",
        "12.0 Ventilation",
    ))

    @pytest.fixture(scope="class")
    def windows(self) -> list[Window]:
        """Run window extraction over the single synthetic page."""
        extractor = ElmhurstSiteNotesExtractor([self._WINDOWS_PAGE])
        return extractor._extract_windows()

    def test_window_count(self, windows: list[Window]) -> None:
        # Arrange / Act / Assert
        extracted = len(windows)
        assert extracted == 3

    def test_first_window_glazing_type_excludes_header_remnant(
        self, windows: list[Window]
    ) -> None:
        # Arrange / Act / Assert: no "value value Proofed Shutters" leak.
        first = windows[0]
        assert first.glazing_type == "Double between 2002 and 2021"

    def test_all_windows_share_clean_glazing_type(
        self, windows: list[Window]
    ) -> None:
        # Arrange / Act / Assert: windows 1 and 2 were clean before the
        # fix, so all three must now report the same glazing type.
        glazing_types = [window.glazing_type for window in windows]
        assert glazing_types == ["Double between 2002 and 2021"] * 3

    def test_first_window_orientation_unaffected(
        self, windows: list[Window]
    ) -> None:
        # Arrange / Act / Assert: trimming the glazing prefix must leave
        # orientation extraction alone (North + West fragments).
        first = windows[0]
        assert first.orientation == "North-West"