Merge pull request #679 from Hestia-Homes/feature/condition-data

Condition Data - parse LBWF houses data to objects
This commit is contained in:
Daniel Roth 2026-01-20 12:15:35 +00:00 committed by GitHub
commit 11b482838e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 539 additions and 9 deletions

View file

@ -1,4 +1,5 @@
FROM python:3.12-bullseye
FROM python:3.11.10-bullseye
ARG USER=vscode
ARG DEBIAN_FRONTEND=noninteractive
@ -24,12 +25,17 @@ RUN useradd -m -s /usr/bin/bash ${USER} \
&& echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \
&& chmod 0440 /etc/sudoers.d/${USER}
# 4) Python deps
ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
# Model
# # 4) Python deps - if you want to run asset list
# ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
# ADD asset_list/requirements.txt requirements.txt
# FASTAPI backend
ADD .devcontainer/requirements.txt requirements.txt
# RUN pip install -r requirements.txt
#
ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
ADD backend/engine/requirements.txt requirements1.txt
ADD backend/app/requirements/requirements.txt requirements2.txt
ADD .devcontainer/requirements.txt requirements3.txt
RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt
RUN pip install -r requirements.txt
# 5) Workdir
@ -37,4 +43,4 @@ WORKDIR /workspaces/model
# 6) Make Python find your package
# Add project root to PYTHONPATH for all processes
ENV PYTHONPATH=/workspaces/model:${PYTHONPATH}
ENV PYTHONPATH=/workspaces/model:${PYTHONPATH}

View file

@ -27,5 +27,8 @@
"ms-python.vscode-python-envs"
]
}
},
"containerEnv": {
"PYTHONFLAGS": "-Xfrozen_modules=off"
}
}

View file

@ -14,4 +14,7 @@ openpyxl==3.1.2
pytz
uvicorn[standard]
sqlmodel
# Testing
pytest==9.0.2
pytest-cov==7.0.0
ipykernel>=6.25,<7

2
.gitignore vendored
View file

@ -242,6 +242,8 @@ fabric.properties
local_data/*
/local_data/*
etl/epc/local_data/*
/backend/condition/sample_data/lbwf/*
/backend/condition/sample_data/peabody/*
*.DS_Store
infrastructure/terraform/.terraform*

15
.vscode/launch.json vendored Normal file
View file

@ -0,0 +1,15 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

View file

@ -9,6 +9,9 @@
"path": "/bin/bash"
}
},
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.pytestArgs": ["-s", "-q", "--no-cov"]
// Hot reload setting that needs to be in user settings
// "jupyter.runStartupCommands": [

View file

View file

@ -0,0 +1,12 @@
from enum import Enum
class FileType(Enum):
    """Enumerates the source file formats recognised for condition data."""

    # Only one format is defined so far; its value is the lower-case tag "lbwf".
    LBWF = "lbwf"
def detect_file_type(filepath: str) -> FileType:
    """Infer the file type from a path.

    The check is a case-insensitive substring test over the whole path, so
    the marker may appear in a directory name or in the file name.

    Raises:
        ValueError: if the path contains no recognised marker.
    """
    if "lbwf" not in filepath.lower():
        raise ValueError("Unrecognised file path")
    return FileType.LBWF

View file

@ -0,0 +1,16 @@
from typing import Mapping, Any
from io import BytesIO
from utils.logger import setup_logger
from backend.condition.processor import process_file
logger = setup_logger()
def handler(event: Mapping[str, Any], context: Any) -> None:
    """Forward an event to the condition-data processor.

    Reads ``source_key`` from ``event`` (falling back to ``"unknown-source"``)
    and passes it to ``process_file`` together with an empty byte stream.
    ``context`` is accepted but not used.
    """
    # Temporary stub for PoC wiring: no real payload is read yet.
    process_file(BytesIO(b""), event.get("source_key", "unknown-source"))

View file

@ -0,0 +1,25 @@
from pathlib import Path
from backend.condition.processor import process_file
def main() -> None:
    """Run the processor against the local LBWF sample workbook."""
    try:
        # Works in scripts / debugger / pytest
        root_dir = Path(__file__).resolve().parents[1]
    except NameError:
        # __file__ is not defined in notebooks
        root_dir = Path.cwd()

    # TODO: get this from s3 as part of devcontainer init
    lbwf_path: Path = (
        root_dir
        / "condition"
        / "sample_data"
        / "lbwf"
        / "LBWF - Example Asset Data September 2025.xlsx"
    )

    with lbwf_path.open("rb") as workbook_file:
        process_file(file_stream=workbook_file, source_key=lbwf_path.as_posix())


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,9 @@
from backend.condition.file_type import FileType
from backend.condition.parsing.parser import Parser
from backend.condition.parsing.lbwf_parser import LbwfParser
def select_parser(file_type: FileType) -> Parser:
    """Return a new parser instance for ``file_type``.

    Raises:
        ValueError: if no parser is registered for the given type.
    """
    if file_type is not FileType.LBWF:
        raise ValueError("Unrecognised file type, unable to instantiate Parser")
    return LbwfParser()

View file

@ -0,0 +1,180 @@
from typing import BinaryIO, Any, Dict, Iterator, List, Tuple
from openpyxl import Workbook, load_workbook
from collections import defaultdict
from backend.condition.parsing.parser import Parser
from backend.condition.parsing.records.lbwf.lbwf_asset_condition import LbwfAssetCondition
from backend.condition.parsing.records.lbwf.lbwf_house import LbwfHouse
from backend.condition.utils.date_utils import normalise_date
from utils.logger import setup_logger
# Call the factory: the parser uses logger.error(...), which needs the logger
# instance rather than the setup function itself.
logger = setup_logger()
class LbwfParser(Parser):
def parse(self, file_stream: BinaryIO) -> Any:
    """Parse an LBWF workbook stream into house records.

    Loads the workbook, builds the address -> UPRN lookup, parses the asset
    and house sheets, and attaches each asset to the house whose reference
    matches. Returns the list of houses.
    """
    workbook: Workbook = load_workbook(file_stream)
    uprn_by_address: Dict[str, int | None] = self._generate_address_to_uprn_dict(workbook)

    asset_records = self._parse_assets(workbook)
    house_records = self._parse_houses(workbook, uprn_by_address)

    # Mutates the house records in place; nothing is returned.
    self._merge_assets_into_houses(asset_records, house_records)
    return house_records
@staticmethod
def _parse_assets(wb: Workbook) -> List[LbwfAssetCondition]:
    """Read the "Houses Asset Data" sheet into asset records.

    The first row is treated as the header row. Rows that fail to map are
    logged and skipped so that one bad row does not abort the whole file.

    Raises:
        KeyError: if the workbook has no "Houses Asset Data" sheet.
        ValueError: if the sheet has no rows at all (no header row).
    """
    assets_sheet = wb["Houses Asset Data"]
    asset_rows = assets_sheet.iter_rows(values_only=True)

    # An empty sheet yields no rows; fail with a clear error instead of
    # leaking a bare StopIteration from next().
    asset_headers = next(asset_rows, None)
    if asset_headers is None:
        raise ValueError("Sheet 'Houses Asset Data' is empty: expected a header row")

    asset_header_indexes = LbwfParser._get_column_indexes_by_name(asset_headers)

    assets: List[LbwfAssetCondition] = []
    for row in asset_rows:
        try:
            assets.append(
                LbwfParser._map_row_to_asset_record(row, asset_header_indexes)
            )
        except Exception as e:
            # Deliberately best-effort: report the row and carry on.
            logger.error(f"Error mapping LBWF row to asset record: {e}")
            continue
    return assets
@staticmethod
def _parse_houses(
    wb: Workbook,
    address_to_uprn_map: Dict[str, int | None],
) -> List[LbwfHouse]:
    """Read the "Houses" sheet into house records.

    The first row is treated as the header row. Each following row is mapped
    to an ``LbwfHouse``; rows that fail to map are logged and skipped.

    Args:
        wb: the loaded workbook.
        address_to_uprn_map: lookup used to resolve each house's UPRN.

    Raises:
        KeyError: if the workbook has no "Houses" sheet.
        ValueError: if the sheet has no rows at all (no header row).
    """
    houses_sheet = wb["Houses"]
    house_rows = houses_sheet.iter_rows(values_only=True)

    # An empty sheet yields no rows; fail with a clear error instead of
    # leaking a bare StopIteration from next().
    house_headers = next(house_rows, None)
    if house_headers is None:
        raise ValueError("Sheet 'Houses' is empty: expected a header row")

    house_header_indexes = LbwfParser._get_column_indexes_by_name(house_headers)

    houses: List[LbwfHouse] = []
    for row in house_rows:
        try:
            houses.append(
                LbwfParser._map_row_to_house_record(
                    row,
                    house_header_indexes,
                    address_to_uprn_map,
                )
            )
        except Exception as e:
            # Deliberately best-effort: report the row and carry on.
            logger.error(f"Error mapping LBWF row to house record: {e}")
            continue
    return houses
@staticmethod
def _merge_assets_into_houses(
    assets: List[LbwfAssetCondition],
    houses: List[LbwfHouse],
) -> None:
    """Attach each asset to the house whose ``reference`` equals its ``prop_ref``.

    Houses are updated in place. A house with no matching assets gets an
    empty list. Assets whose ``prop_ref`` matches no house are dropped.
    """
    grouped: Dict[int, List[LbwfAssetCondition]] = {}
    for record in assets:
        grouped.setdefault(record.prop_ref, []).append(record)

    for home in houses:
        home.assets = grouped.get(home.reference, [])
@staticmethod
def _map_row_to_house_record(
    row: Any | Tuple[object | None, ...],
    header_indexes: Dict[str, int],
    address_to_uprn_map: Dict[str, int | None],
) -> LbwfHouse:
    """Build an ``LbwfHouse`` from one row of the "Houses" sheet.

    Cell values are taken as-is; only the UPRN is derived, by looking the
    address up in ``address_to_uprn_map``. ``assets`` starts empty.

    Raises:
        KeyError: if an expected header is absent from ``header_indexes``.
    """

    def cell(header: str) -> Any:
        return row[header_indexes[header]]

    address: str = cell("Address")
    uprn = LbwfParser._get_uprn_from_address(address, address_to_uprn_map)

    # Header names are used verbatim, including the trailing space in "EPC "
    # and the "HOSUE" spelling (presumably mirroring the source workbook).
    return LbwfHouse(
        uprn=uprn,
        reference=cell("Reference"),
        address=address,
        epc=cell("EPC "),
        shdf=cell("SHDF"),
        house=cell("HOSUE"),
        fail_decency=cell("Fail Decency"),
        assets=[],
    )
@staticmethod
def _map_row_to_asset_record(
    row: Any | Tuple[object | None, ...],
    header_indexes: Dict[str, int],
) -> LbwfAssetCondition:
    """Build an ``LbwfAssetCondition`` from one row of the asset sheet.

    Every field is copied straight from the named column except
    ``install_date``, which goes through ``normalise_date``.

    Raises:
        KeyError: if an expected header is absent from ``header_indexes``.
        ValueError: if the install date cannot be normalised.
    """

    def cell(header: str) -> Any:
        return row[header_indexes[header]]

    return LbwfAssetCondition(
        prop_ref=cell("PROP REF"),
        domna=cell("Domna"),
        address=cell("ADDRESS"),
        ownership=cell("OWNERSHIP"),
        prop_status=cell("PROP STATUS"),
        prop_type=cell("PROP TYPE"),
        prop_sub_type=cell("PROP SUB TYPE"),
        element_group=cell("ELEMENT GROUP"),
        element_code=cell("ELEMENT CODE"),
        element_code_description=cell("ELEMENT CODE DESCRIPTION"),
        attribute_code=cell("ATTRIBUTE CODE"),
        attribute_code_description=cell("ATTRIBUTE CODE DESCRIPTION"),
        element_date_value=cell("ELEMENT DATE VALUE"),
        # The column is headed "NUMERIC" while the field is "numerical".
        element_numerical_value=cell("ELEMENT NUMERIC VALUE"),
        element_text_value=cell("ELEMENT TEXT VALUE"),
        quantity=cell("QUANTITY"),
        install_date=normalise_date(cell("INSTALL DATE")),
        remaining_life=cell("REMAINING LIFE"),
        element_comments=cell("ELEMENT COMMENTS"),
    )
@staticmethod
def _generate_address_to_uprn_dict(wb: Workbook) -> Dict[str, int | None]:
    """Build an address -> UPRN lookup from the "All Energy Breakdown " sheet.

    The first row is treated as the header row and must contain "Address"
    and "UPRN" columns. Rows whose address is not a string are skipped.
    If an address occurs more than once, the last row wins.

    Raises:
        KeyError: if the sheet or a required column is missing.
        ValueError: if the sheet has no header row, or a UPRN cell holds
            something other than an int or an empty cell.
    """
    sheet_name = "All Energy Breakdown "  # trailing space is part of the sheet title
    sheet = wb[sheet_name]
    rows: Iterator[Tuple[object | None, ...]] = sheet.iter_rows(values_only=True)

    # An empty sheet yields no rows; fail with a clear error instead of
    # leaking a bare StopIteration from next().
    headers = next(rows, None)
    if headers is None:
        raise ValueError(f"Sheet {sheet_name!r} is empty: expected a header row")

    header_indexes: Dict[str, int] = LbwfParser._get_column_indexes_by_name(headers)
    address_idx = header_indexes["Address"]
    uprn_idx = header_indexes["UPRN"]

    mapping: Dict[str, int | None] = {}
    for row in rows:
        address = row[address_idx]
        uprn = row[uprn_idx]
        if not isinstance(address, str):
            continue
        if uprn is not None and not isinstance(uprn, int):
            raise ValueError(f"Unexpected UPRN value: {uprn!r}")
        mapping[address] = uprn
    return mapping
def _get_column_indexes_by_name(
headers: Tuple[object | None, ...]
) -> Dict[str, int]:
index: Dict[str, int] = {}
for i, header in enumerate(headers):
if isinstance(header, str):
index[header] = i
return index
def _get_uprn_from_address(address: str, address_to_uprn_map: Dict[str, int]) -> int | None:
pseudo_name = address.split(",")[0]
if pseudo_name.lower() in (k.lower() for k in address_to_uprn_map.keys()):
return address_to_uprn_map[pseudo_name.upper()]
return None

View file

@ -0,0 +1,8 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Any
class Parser(ABC):
    """Abstract base for parsers that turn a binary file stream into records."""

    @abstractmethod
    def parse(self, file_stream: BinaryIO) -> Any:
        """Parse ``file_stream`` and return the extracted records."""

View file

@ -0,0 +1,26 @@
from dataclasses import dataclass
from datetime import date
@dataclass
class LbwfAssetCondition:
    """One asset/element record parsed from the LBWF "Houses Asset Data" sheet.

    The parser copies cell values in without conversion, apart from
    ``install_date``, so the annotations describe the expected cell types.
    """

    # Property reference; assets are attached to the house whose `reference` equals this.
    prop_ref: int
    domna: int
    address: str
    ownership: str
    prop_status: str
    prop_type: str # TODO: make this enum?
    prop_sub_type: str # TODO: make this enum?
    element_group: str
    element_code: str
    element_code_description: str
    attribute_code: str
    attribute_code_description: str
    # The remaining fields are optional and default to None.
    element_date_value: str | None = None
    # Read from the "ELEMENT NUMERIC VALUE" column.
    element_numerical_value: int | None = None
    element_text_value: str | None = None
    quantity: int | None = None
    # Passed through normalise_date() by the parser.
    install_date: date | None = None
    remaining_life: int | None = None
    element_comments: str | None = None

View file

@ -0,0 +1,15 @@
from dataclasses import dataclass
from typing import List
from backend.condition.parsing.records.lbwf.lbwf_asset_condition import LbwfAssetCondition
@dataclass
class LbwfHouse:
    """One house parsed from the LBWF "Houses" sheet, plus its asset records."""

    # None when the address has no match in the address -> UPRN lookup.
    uprn: int | None
    # Matched against LbwfAssetCondition.prop_ref when assets are merged in.
    reference: int
    address: str
    epc: str # TODO: make enum
    # NOTE(review): the parser stores the raw cell value without converting it
    # to bool - confirm what the sheet actually holds.
    shdf: bool
    house: str
    fail_decency: int
    # Starts empty; filled when assets are merged into houses.
    assets: List[LbwfAssetCondition]

View file

@ -0,0 +1,18 @@
from typing import Any, BinaryIO, List
from backend.condition.parsing.parser import Parser
from utils.logger import setup_logger
from backend.condition.file_type import FileType, detect_file_type
from backend.condition.parsing.factory import select_parser
def process_file(file_stream: BinaryIO, source_key: str) -> None:
    """Parse a condition-data file and print the resulting records.

    The parser is chosen from ``source_key`` alone; ``file_stream`` is handed
    to that parser unchanged.

    Raises:
        ValueError: if ``source_key`` does not identify a supported file type.
    """
    print(f"[processor] Received file: {source_key}")

    # Instantiation: the source key decides which parser handles the stream.
    parser: Parser = select_parser(detect_file_type(source_key))

    # Orchestration
    parsed: List[Any] = parser.parse(file_stream)
    print(parsed)  # temp

View file

@ -0,0 +1,134 @@
from typing import Any
import pytest
from io import BytesIO
from openpyxl import Workbook
from datetime import datetime
from backend.condition.parsing.lbwf_parser import LbwfParser
from backend.condition.parsing.records.lbwf.lbwf_asset_condition import LbwfAssetCondition
from backend.condition.parsing.records.lbwf.lbwf_house import LbwfHouse
@pytest.fixture
def lbwf_homes_xlsx_bytes() -> BytesIO:
    """Build an in-memory LBWF-style workbook: two assets, two houses, two UPRN rows."""
    asset_header = [
        "PROP REF",
        "Domna",
        "ADDRESS",
        "OWNERSHIP",
        "PROP STATUS",
        "PROP TYPE",
        "PROP SUB TYPE",
        "ELEMENT GROUP",
        "ELEMENT CODE",
        "ELEMENT CODE DESCRIPTION",
        "ATTRIBUTE CODE",
        "ATTRIBUTE CODE DESCRIPTION",
        "ELEMENT DATE VALUE",
        "ELEMENT NUMERIC VALUE",
        "ELEMENT TEXT VALUE",
        "QUANTITY",
        "INSTALL DATE",
        "REMAINING LIFE",
        "ELEMENT COMMENTS",
    ]
    asset_rows = [
        [
            12345, 12345, "123 Fake Street, London, A10 1AB", "LBWF_OWNED",
            "OCCP", "HOU", "TERRACED", "ASSETS", "AHR_CAT",
            "Accessible Housing Register Category", "F", "General Needs",
            None, None, None, 1, None, None, None,
        ],
        [
            54321, 54321, "100 Random Road, London, A10 1AB", "LBWF_OWNED",
            "OCCP", "HOU", "EOT", "ASSETS", "INTSMKDET",
            "Smoke Detectors in Property", "HARDWRDMNS",
            "Hard Wired Mains Smoke Alarm in Property",
            None, None, None, 2, datetime(2019, 4, 1), 4,
            "Source of Data = Joe Bloggs",
        ],
    ]
    house_rows = [
        ["Reference", "Address", "EPC ", "SHDF", "HOSUE", "Fail Decency"],
        [12345, "123 Fake Street, London, A10 1AB", "E", "NO", "HOUSE", 2025],
        [54321, "100 Random Road, London, A10 1AB", "F", "NO", "HOUSE", 2025],
    ]
    energy_rows = [
        ["UPRN", "Organisation Reference", "Alternate Organisation Reference", "Address", "Postcode"],
        [1, 200, None, "123 FAKE STREET", "A10 1AB"],
        [2, 100, 101, "100 RANDOM ROAD", "A10 1AB"],
    ]

    wb = Workbook()

    # The default sheet becomes the asset sheet; the other two are appended after it.
    asset_sheet = wb.active
    asset_sheet.title = "Houses Asset Data"
    for values in [asset_header, *asset_rows]:
        asset_sheet.append(values)

    house_sheet = wb.create_sheet("Houses")
    for values in house_rows:
        house_sheet.append(values)

    # Trailing space is intentional; matches source
    energy_sheet = wb.create_sheet("All Energy Breakdown ")
    for values in energy_rows:
        energy_sheet.append(values)

    # Serialise to bytes and rewind so the parser reads from the start.
    buffer = BytesIO()
    wb.save(buffer)
    buffer.seek(0)
    return buffer
def test_lbwf_parser_passes_houses(lbwf_homes_xlsx_bytes):
    """The parser returns both houses, each with its UPRN and a single asset."""
    # arrange
    parser = LbwfParser()

    # act
    houses: Any = parser.parse(lbwf_homes_xlsx_bytes)

    # assert
    # TODO: Improve these asserts
    assert len(houses) == 2
    for position, expected_uprn in enumerate((1, 2)):
        house = houses[position]
        assert isinstance(house, LbwfHouse)
        assert house.uprn == expected_uprn
        assert len(house.assets) == 1
        assert isinstance(house.assets[0], LbwfAssetCondition)

View file

@ -0,0 +1,15 @@
import pytest
from backend.condition.parsing.factory import select_parser
from backend.condition.file_type import FileType
def test_selects_lbwf_parser():
    """The factory returns an ``LbwfParser`` for ``FileType.LBWF``."""
    # act
    parser = select_parser(FileType.LBWF)

    # assert (compared by class name; the parser class is not imported here)
    assert "LbwfParser" == parser.__class__.__name__

View file

@ -0,0 +1,22 @@
import pytest
from backend.condition.file_type import FileType, detect_file_type
def test_detects_lbwf_file_type():
    """A path containing "lbwf" is detected as ``FileType.LBWF``."""
    # act
    detected: FileType = detect_file_type("uploads/lbwf/Exaple Asset Data.xlsx")

    # assert
    assert FileType.LBWF == detected
def test_unknown_filepath_raises_value_error():
    """A path without a recognised marker is rejected with ``ValueError``."""
    # arrange
    unrecognised_path = "unknown/Example Asset Data.xlsx"

    # act + assert
    with pytest.raises(ValueError):
        detect_file_type(unrecognised_path)

View file

@ -0,0 +1,18 @@
from datetime import datetime, date
from typing import Any
def normalise_date(value: Any, allow_none: bool = True) -> date | None:
if value is None and allow_none:
return None
if isinstance(value, datetime):
return value.date()
if isinstance(value, str):
try:
return datetime.strptime(value.strip(), "%d/%m/%Y").date()
except ValueError as exc:
raise ValueError(f"Invalid date string: {value!r}") from exc
raise ValueError(f"Unexpected date value: {value!r}")

View file

@ -1,4 +1,4 @@
[pytest]
pythonpath = .
addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial
testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests
testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests