Merge pull request #679 from Hestia-Homes/feature/condition-data

Condition Data - parse LBWF houses data to objects
2026-07-27 23:35:01 +00:00 · 2026-01-20 12:15:35 +00:00 · 2026-01-20 12:15:35 +00:00 · 11b482838e
commit 11b482838e
parent 941be42b83 07cab931e5
21 changed files with 539 additions and 9 deletions
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@ -1,4 +1,5 @@
-FROM python:3.12-bullseye
+FROM python:3.11.10-bullseye
+

 ARG USER=vscode
 ARG DEBIAN_FRONTEND=noninteractive
@ -24,12 +25,17 @@ RUN useradd -m -s /usr/bin/bash ${USER} \
 && echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \
 && chmod 0440 /etc/sudoers.d/${USER}

-# 4) Python deps
-ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
-# Model
+# # 4) Python deps - if you want to run asset list
+# ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
 # ADD asset_list/requirements.txt requirements.txt
-# FASTAPI backend
-ADD .devcontainer/requirements.txt requirements.txt
+# RUN pip install -r requirements.txt
+
+# 4) Python deps: engine + app + devcontainer requirements
+ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
+ADD backend/engine/requirements.txt requirements1.txt
+ADD backend/app/requirements/requirements.txt requirements2.txt
+ADD .devcontainer/requirements.txt requirements3.txt
+# Reference the three files with "-r" includes instead of concatenating them:
+# "cat" fuses the last line of one file with the first line of the next when
+# a file has no trailing newline, which produces an invalid requirement.
+RUN printf '%s\n' '-r requirements1.txt' '-r requirements2.txt' '-r requirements3.txt' > requirements.txt
 RUN pip install -r requirements.txt

 # 5) Workdir
@ -37,4 +43,4 @@ WORKDIR /workspaces/model

 # 6) Make Python find your package
 # Add project root to PYTHONPATH for all processes
-ENV PYTHONPATH=/workspaces/model:${PYTHONPATH}
+ENV PYTHONPATH=/workspaces/model:${PYTHONPATH}
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@ -27,5 +27,8 @@
        "ms-python.vscode-python-envs"
      ]
    }
+  },
+  "containerEnv": {
+    "PYTHONFLAGS": "-Xfrozen_modules=off"
  }
 }
--- a/.devcontainer/requirements.txt
+++ b/.devcontainer/requirements.txt
@ -14,4 +14,7 @@ openpyxl==3.1.2
 pytz
 uvicorn[standard]
 sqlmodel
-
+# Testing
+pytest==9.0.2
+pytest-cov==7.0.0
+ipykernel>=6.25,<7
--- a/.gitignore
+++ b/.gitignore
@ -242,6 +242,8 @@ fabric.properties
 local_data/*
 /local_data/*
 etl/epc/local_data/*
+/backend/condition/sample_data/lbwf/*
+/backend/condition/sample_data/peabody/*

 *.DS_Store
 infrastructure/terraform/.terraform*
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -9,6 +9,9 @@
            "path": "/bin/bash"
        }
    },
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true,
+    "python.testing.pytestArgs": ["-s", "-q", "--no-cov"]

    // Hot reload setting that needs to be in user settings
    // "jupyter.runStartupCommands": [
--- a/backend/condition/init.py
+++ b/backend/condition/init.py
--- a/backend/condition/file_type.py
+++ b/backend/condition/file_type.py
@ -0,0 +1,12 @@
+from enum import Enum
+
+class FileType(Enum):
+    """Source-file kinds the condition pipeline recognises (currently only LBWF)."""
+
+    LBWF = "lbwf"
+
+def detect_file_type(filepath: str) -> FileType:
+    """Infer the ``FileType`` from a file path or object key.
+
+    The check is a case-insensitive substring match, so the marker may
+    appear anywhere in ``filepath`` (directory or file name).
+
+    Raises:
+        ValueError: if ``filepath`` contains no known marker.
+    """
+    if "lbwf" not in filepath.lower():
+        raise ValueError("Unrecognised file path")
+
+    return FileType.LBWF
--- a/backend/condition/handler.py
+++ b/backend/condition/handler.py
@ -0,0 +1,16 @@
+from typing import Mapping, Any
+from io import BytesIO
+
+from utils.logger import setup_logger
+from backend.condition.processor import process_file
+
+
+logger = setup_logger()
+
+def handler(event: Mapping[str, Any], context: Any) -> None:
+    """Forward an event's ``source_key`` to ``process_file``.
+
+    Temporary PoC wiring: an empty in-memory stream stands in for the real
+    file, and ``"unknown-source"`` is used when the event has no
+    ``source_key``. ``context`` is not used.
+    """
+    key = event.get("source_key", "unknown-source")
+
+    process_file(BytesIO(b""), key)
--- a/backend/condition/local_runner.py
+++ b/backend/condition/local_runner.py
@ -0,0 +1,25 @@
+from pathlib import Path
+
+from backend.condition.processor import process_file
+
+def main() -> None:
+    """Run ``process_file`` against the local LBWF sample workbook."""
+    try:
+        # Works in scripts / debugger / pytest
+        base_dir = Path(__file__).resolve().parents[1]
+    except NameError:
+        # __file__ is not defined in notebooks
+        base_dir = Path.cwd()
+
+    # TODO: get this from s3 as part of devcontainer init
+    workbook_path: Path = (
+        base_dir
+        / "condition"
+        / "sample_data"
+        / "lbwf"
+        / "LBWF - Example Asset Data September 2025.xlsx"
+    )
+
+    with workbook_path.open("rb") as stream:
+        process_file(file_stream=stream, source_key=workbook_path.as_posix())
+
+# Run the sample-data import only when this module is executed as a script.
+if __name__ == "__main__":
+    main()
+
--- a/backend/condition/parsing/factory.py
+++ b/backend/condition/parsing/factory.py
@ -0,0 +1,9 @@
+from backend.condition.file_type import FileType
+from backend.condition.parsing.parser import Parser
+from backend.condition.parsing.lbwf_parser import LbwfParser
+
+def select_parser(file_type: FileType) -> Parser:
+    """Return a new ``Parser`` instance for ``file_type``.
+
+    Raises:
+        ValueError: if no parser exists for ``file_type``.
+    """
+    if file_type is not FileType.LBWF:
+        raise ValueError("Unrecognised file type, unable to instantiate Parser")
+
+    return LbwfParser()
--- a/backend/condition/parsing/lbwf_parser.py
+++ b/backend/condition/parsing/lbwf_parser.py
@ -0,0 +1,180 @@
+from typing import BinaryIO, Any, Dict, Iterator, List, Tuple
+from openpyxl import Workbook, load_workbook
+from collections import defaultdict
+
+from backend.condition.parsing.parser import Parser
+from backend.condition.parsing.records.lbwf.lbwf_asset_condition import LbwfAssetCondition
+from backend.condition.parsing.records.lbwf.lbwf_house import LbwfHouse
+from backend.condition.utils.date_utils import normalise_date
+from utils.logger import setup_logger
+
+# Call the factory: binding the function object itself would make every
+# logger.error(...) in this module raise AttributeError.
+logger = setup_logger()
+
+class LbwfParser(Parser):
+    """Parse an LBWF asset workbook into ``LbwfHouse`` records.
+
+    Three sheets are read: ``"Houses"`` (one record per row),
+    ``"Houses Asset Data"`` (one asset condition per row) and
+    ``"All Energy Breakdown "`` (address -> UPRN lookup; the trailing space
+    is part of the sheet name). Columns are located by header text, so the
+    column order in the workbook does not matter.
+    """
+
+    def parse(self, file_stream: BinaryIO) -> Any:
+        """Read the workbook and return its houses with assets attached.
+
+        Args:
+            file_stream: Binary stream containing the ``.xlsx`` workbook.
+
+        Returns:
+            A list of ``LbwfHouse``; each house's ``assets`` holds the asset
+            rows whose ``PROP REF`` equals the house ``Reference``.
+
+        Raises:
+            KeyError: if a required sheet, or the ``Address`` / ``UPRN``
+                column of the lookup sheet, is missing.
+            ValueError: if a UPRN cell is neither empty nor an integer.
+
+        Rows that cannot be mapped to a record are logged and skipped.
+        """
+        wb: Workbook = load_workbook(file_stream)
+        address_to_uprn_map = self._generate_address_to_uprn_dict(wb)
+
+        assets = self._parse_assets(wb)
+        houses = self._parse_houses(wb, address_to_uprn_map)
+
+        self._merge_assets_into_houses(assets, houses)
+
+        return houses
+
+    @staticmethod
+    def _read_sheet(
+        wb: Workbook,
+        sheet_name: str,
+    ) -> Tuple[Dict[str, int], Iterator[Tuple[object | None, ...]]]:
+        """Return ``(header name -> column index, remaining data rows)``."""
+        rows = wb[sheet_name].iter_rows(values_only=True)
+
+        # A completely empty sheet yields no rows at all; treat that as
+        # "no headers" instead of letting StopIteration escape.
+        headers = next(rows, ())
+
+        return LbwfParser._get_column_indexes_by_name(headers), rows
+
+    @staticmethod
+    def _is_blank_row(row: Tuple[object | None, ...]) -> bool:
+        """Return True when every cell in ``row`` is empty."""
+        return all(cell is None for cell in row)
+
+    @staticmethod
+    def _parse_assets(wb: Workbook) -> List[LbwfAssetCondition]:
+        """Map every non-blank row of "Houses Asset Data" to an asset record."""
+        header_indexes, rows = LbwfParser._read_sheet(wb, "Houses Asset Data")
+
+        assets: List[LbwfAssetCondition] = []
+        for row in rows:
+            # Blank rows would otherwise become records with every field None.
+            if LbwfParser._is_blank_row(row):
+                continue
+
+            try:
+                assets.append(
+                    LbwfParser._map_row_to_asset_record(row, header_indexes)
+                )
+            except Exception as e:
+                # Best effort: one bad row must not abort the whole file.
+                logger.error(f"Error mapping LBWF row to asset record: {e}")
+
+        return assets
+
+    @staticmethod
+    def _parse_houses(
+        wb: Workbook,
+        address_to_uprn_map: Dict[str, int | None],
+    ) -> List[LbwfHouse]:
+        """Map every non-blank row of "Houses" to a house record."""
+        header_indexes, rows = LbwfParser._read_sheet(wb, "Houses")
+
+        houses: List[LbwfHouse] = []
+        for row in rows:
+            if LbwfParser._is_blank_row(row):
+                continue
+
+            try:
+                houses.append(
+                    LbwfParser._map_row_to_house_record(
+                        row,
+                        header_indexes,
+                        address_to_uprn_map,
+                    )
+                )
+            except Exception as e:
+                # Best effort: one bad row must not abort the whole file.
+                logger.error(f"Error mapping LBWF row to house record: {e}")
+
+        return houses
+
+    @staticmethod
+    def _merge_assets_into_houses(
+        assets: List[LbwfAssetCondition],
+        houses: List[LbwfHouse],
+    ) -> None:
+        """Attach to each house, in place, the assets sharing its reference."""
+        assets_by_ref: Dict[int, List[LbwfAssetCondition]] = defaultdict(list)
+        for asset in assets:
+            assets_by_ref[asset.prop_ref].append(asset)
+
+        for house in houses:
+            # .get() avoids inserting empty entries into the defaultdict.
+            house.assets = assets_by_ref.get(house.reference, [])
+
+    @staticmethod
+    def _map_row_to_house_record(
+        row: Any | Tuple[object | None, ...],
+        header_indexes: Dict[str, int],
+        address_to_uprn_map: Dict[str, int | None],
+    ) -> LbwfHouse:
+        """Build an ``LbwfHouse`` from one "Houses" row.
+
+        Raises:
+            KeyError: if an expected column header is missing.
+        """
+        address: str = row[header_indexes["Address"]]
+
+        # "EPC " (trailing space) and "HOSUE" are looked up verbatim; the
+        # test workbook uses these exact header spellings.
+        return LbwfHouse(
+            uprn=LbwfParser._get_uprn_from_address(address, address_to_uprn_map),
+            reference=row[header_indexes["Reference"]],
+            address=address,
+            epc=row[header_indexes["EPC "]],
+            shdf=row[header_indexes["SHDF"]],
+            house=row[header_indexes["HOSUE"]],
+            fail_decency=row[header_indexes["Fail Decency"]],
+            assets=[],
+        )
+
+    @staticmethod
+    def _map_row_to_asset_record(
+        row: Any | Tuple[object | None, ...],
+        header_indexes: Dict[str, int],
+    ) -> LbwfAssetCondition:
+        """Build an ``LbwfAssetCondition`` from one "Houses Asset Data" row.
+
+        Raises:
+            KeyError: if an expected column header is missing.
+            ValueError: if ``INSTALL DATE`` cannot be normalised to a date.
+        """
+        return LbwfAssetCondition(
+            prop_ref=row[header_indexes["PROP REF"]],
+            domna=row[header_indexes["Domna"]],
+            address=row[header_indexes["ADDRESS"]],
+            ownership=row[header_indexes["OWNERSHIP"]],
+            prop_status=row[header_indexes["PROP STATUS"]],
+            prop_type=row[header_indexes["PROP TYPE"]],
+            prop_sub_type=row[header_indexes["PROP SUB TYPE"]],
+            element_group=row[header_indexes["ELEMENT GROUP"]],
+            element_code=row[header_indexes["ELEMENT CODE"]],
+            element_code_description=row[header_indexes["ELEMENT CODE DESCRIPTION"]],
+            attribute_code=row[header_indexes["ATTRIBUTE CODE"]],
+            attribute_code_description=row[header_indexes["ATTRIBUTE CODE DESCRIPTION"]],
+            element_date_value=row[header_indexes["ELEMENT DATE VALUE"]],
+            element_numerical_value=row[header_indexes["ELEMENT NUMERIC VALUE"]],
+            element_text_value=row[header_indexes["ELEMENT TEXT VALUE"]],
+            quantity=row[header_indexes["QUANTITY"]],
+            install_date=normalise_date(row[header_indexes["INSTALL DATE"]]),
+            remaining_life=row[header_indexes["REMAINING LIFE"]],
+            element_comments=row[header_indexes["ELEMENT COMMENTS"]],
+        )
+
+    @staticmethod
+    def _generate_address_to_uprn_dict(wb: Workbook) -> Dict[str, int | None]:
+        """Build the address -> UPRN lookup from "All Energy Breakdown ".
+
+        Rows whose address cell is not a string are ignored. When the same
+        address occurs more than once, the last row wins.
+
+        Raises:
+            KeyError: if the sheet or its ``Address`` / ``UPRN`` column is
+                missing.
+            ValueError: if a UPRN cell is neither empty nor an ``int``.
+        """
+        header_indexes, rows = LbwfParser._read_sheet(wb, "All Energy Breakdown ")
+
+        address_idx = header_indexes["Address"]
+        uprn_idx = header_indexes["UPRN"]
+
+        mapping: Dict[str, int | None] = {}
+
+        for row in rows:
+            address = row[address_idx]
+            uprn = row[uprn_idx]
+
+            if not isinstance(address, str):
+                continue
+
+            if uprn is not None and not isinstance(uprn, int):
+                raise ValueError(f"Unexpected UPRN value: {uprn!r}")
+
+            mapping[address] = uprn
+
+        return mapping
+
+    @staticmethod
+    def _get_column_indexes_by_name(
+        headers: Tuple[object | None, ...]
+    ) -> Dict[str, int]:
+        """Map each string header to its column index; other cells are ignored."""
+        return {
+            header: i
+            for i, header in enumerate(headers)
+            if isinstance(header, str)
+        }
+
+    @staticmethod
+    def _get_uprn_from_address(
+        address: str,
+        address_to_uprn_map: Dict[str, int | None],
+    ) -> int | None:
+        """Look up the UPRN for the first comma-separated part of ``address``.
+
+        The match ignores case and surrounding whitespace.
+
+        Returns:
+            The mapped UPRN, or ``None`` when no key matches (or the matching
+            row had an empty UPRN cell).
+        """
+        pseudo_name = address.split(",")[0].strip()
+
+        # Fast path: exact key, then the upper-case form.
+        for candidate in (pseudo_name, pseudo_name.upper()):
+            if candidate in address_to_uprn_map:
+                return address_to_uprn_map[candidate]
+
+        # Slow path: return the value of the key that actually matched, so a
+        # mixed-case key no longer raises KeyError.
+        wanted = pseudo_name.casefold()
+        for key, uprn in address_to_uprn_map.items():
+            if key.strip().casefold() == wanted:
+                return uprn
+
+        return None
+        
--- a/backend/condition/parsing/parser.py
+++ b/backend/condition/parsing/parser.py
@ -0,0 +1,8 @@
+from abc import ABC, abstractmethod
+from typing import BinaryIO, Any
+
+class Parser(ABC):
+    """Abstract base class for source-specific file parsers."""
+
+    @abstractmethod
+    def parse(self, file_stream: BinaryIO) -> Any:
+        """Parse ``file_stream`` and return the parsed records.
+
+        The return type is ``Any``; each concrete parser decides what it
+        returns.
+        """
+        pass
--- a/backend/condition/parsing/records/lbwf/lbwf_asset_condition.py
+++ b/backend/condition/parsing/records/lbwf/lbwf_asset_condition.py
@ -0,0 +1,26 @@
+from dataclasses import dataclass
+from datetime import date
+
+
+@dataclass
+class LbwfAssetCondition:
+    """One row of the LBWF "Houses Asset Data" sheet.
+
+    Field names mirror the sheet's column headers in snake_case. Values are
+    stored as read from the workbook, except ``install_date``, which the
+    parser normalises to a ``date``.
+    """
+
+    # Property reference; the parser matches it against LbwfHouse.reference.
+    prop_ref: int
+    domna: int
+    address: str
+    ownership: str
+    prop_status: str
+    prop_type: str # TODO: make this enum?
+    prop_sub_type: str  # TODO: make this enum?
+    element_group: str
+    element_code: str
+    element_code_description: str
+    attribute_code: str
+    attribute_code_description: str
+    # The remaining columns are optional and default to None.
+    element_date_value: str | None = None
+    element_numerical_value: int | None = None
+    element_text_value: str | None = None
+    quantity: int | None = None
+    install_date: date | None = None
+    remaining_life: int | None = None
+    element_comments: str | None = None
+
--- a/backend/condition/parsing/records/lbwf/lbwf_house.py
+++ b/backend/condition/parsing/records/lbwf/lbwf_house.py
@ -0,0 +1,15 @@
+from dataclasses import dataclass
+from typing import List
+
+from backend.condition.parsing.records.lbwf.lbwf_asset_condition import LbwfAssetCondition
+
+@dataclass
+class LbwfHouse:
+    """One row of the LBWF "Houses" sheet plus its matched asset conditions."""
+
+    # None when the parser finds no UPRN for the house's address.
+    uprn: int | None
+    # Matched against LbwfAssetCondition.prop_ref when assets are attached.
+    reference: int
+    address: str
+    epc: str # TODO: make enum
+    # NOTE(review): the parser stores the raw cell value and the test workbook
+    # holds the string "NO" here - confirm whether this should become a bool.
+    shdf: bool
+    house: str
+    fail_decency: int
+    assets: List[LbwfAssetCondition]
--- a/backend/condition/processor.py
+++ b/backend/condition/processor.py
@ -0,0 +1,18 @@
+from typing import Any, BinaryIO, List
+
+from backend.condition.parsing.parser import Parser
+from utils.logger import setup_logger
+from backend.condition.file_type import FileType, detect_file_type
+from backend.condition.parsing.factory import select_parser
+
+def process_file(file_stream: BinaryIO, source_key: str) -> None:
+    """Parse ``file_stream`` with the parser selected from ``source_key``.
+
+    The file type is detected from ``source_key``, the matching parser is
+    instantiated, and the parsed records are printed (temporary output).
+    """
+    print(f"[processor] Received file: {source_key}")
+
+    # Instantiation
+    parser: Parser = select_parser(detect_file_type(source_key))
+
+    # Orchestration
+    parsed: List[Any] = parser.parse(file_stream)
+
+    print(parsed) # temp
--- a/backend/condition/tests/parsing/test_lbwf_parser.py
+++ b/backend/condition/tests/parsing/test_lbwf_parser.py
@ -0,0 +1,134 @@
+from typing import Any
+import pytest
+from io import BytesIO
+from openpyxl import Workbook
+from datetime import datetime
+
+from backend.condition.parsing.lbwf_parser import LbwfParser
+from backend.condition.parsing.records.lbwf.lbwf_asset_condition import LbwfAssetCondition
+from backend.condition.parsing.records.lbwf.lbwf_house import LbwfHouse
+
+@pytest.fixture
+def lbwf_homes_xlsx_bytes() -> BytesIO:
+    """Build an in-memory LBWF workbook with two houses and one asset each.
+
+    The workbook has the three sheets the parser reads: "Houses Asset Data",
+    "Houses" and "All Energy Breakdown ". The returned stream is rewound to
+    position 0 so it can be read immediately.
+    """
+    wb = Workbook()
+    # Sheet 1: asset rows. Values are positional and must line up with the
+    # header row below.
+    houses_asset_data = wb.active
+    houses_asset_data.title = "Houses Asset Data"
+    houses_asset_data.append([
+        "PROP REF",
+        "Domna",
+        "ADDRESS",
+        "OWNERSHIP",
+        "PROP STATUS",
+        "PROP TYPE",
+        "PROP SUB TYPE",
+        "ELEMENT GROUP",
+        "ELEMENT CODE",
+        "ELEMENT CODE DESCRIPTION",
+        "ATTRIBUTE CODE",
+        "ATTRIBUTE CODE DESCRIPTION",
+        "ELEMENT DATE VALUE",
+        "ELEMENT NUMERIC VALUE",
+        "ELEMENT TEXT VALUE",
+        "QUANTITY",
+        "INSTALL DATE",
+        "REMAINING LIFE",
+        "ELEMENT COMMENTS"
+        ]
+    )
+    # Asset for house 12345: all optional columns except QUANTITY are empty.
+    houses_asset_data.append([
+        12345,
+        12345,
+        "123 Fake Street, London, A10 1AB",
+        "LBWF_OWNED",
+        "OCCP",
+        "HOU",
+        "TERRACED",
+        "ASSETS",
+        "AHR_CAT",
+        "Accessible Housing Register Category",
+        "F",
+        "General Needs",
+        None,
+        None,
+        None,
+        1,
+        None,
+        None,
+        None,
+    ])
+    # Asset for house 54321: has an install date, remaining life and comment.
+    houses_asset_data.append([
+        54321,
+        54321,
+        "100 Random Road, London, A10 1AB",
+        "LBWF_OWNED",
+        "OCCP",
+        "HOU",
+        "EOT",
+        "ASSETS",
+        "INTSMKDET",
+        "Smoke Detectors in Property",
+        "HARDWRDMNS",
+        "Hard Wired Mains Smoke Alarm in Property",
+        None,
+        None,
+        None,
+        2,
+        datetime(2019,4,1),
+        4,
+        "Source of Data = Joe Bloggs",
+    ])
+    
+    # Sheet 2: houses. The header spellings "EPC " and "HOSUE" are the exact
+    # keys the parser looks up.
+    houses = wb.create_sheet("Houses")
+    houses.append(["Reference", "Address", "EPC ", "SHDF", "HOSUE", "Fail Decency"])
+    houses.append([12345, "123 Fake Street, London, A10 1AB", "E", "NO", "HOUSE", 2025])
+    houses.append([54321, "100 Random Road, London, A10 1AB", "F", "NO", "HOUSE", 2025])
+
+    # Sheet 3: address -> UPRN lookup. Addresses are upper-case and hold only
+    # the first comma-separated part of the house address.
+    all_energy_breakdown = wb.create_sheet("All Energy Breakdown ") # Trailing space is intentional; matches source
+    all_energy_breakdown.append([
+        "UPRN",
+        "Organisation Reference",
+        "Alternate Organisation Reference",
+        "Address",
+        "Postcode"
+    ])
+    all_energy_breakdown.append([
+        1,
+        200,
+        None,
+        "123 FAKE STREET",
+        "A10 1AB"
+    ])
+    all_energy_breakdown.append([
+        2,
+        100,
+        101,
+        "100 RANDOM ROAD",
+        "A10 1AB"
+    ])
+
+    # Serialise to memory and rewind so the parser reads from the start.
+    stream = BytesIO()
+    wb.save(stream)
+    stream.seek(0)
+
+    return stream
+
+def test_lbwf_parser_passes_houses(lbwf_homes_xlsx_bytes):
+    """Each house gets its UPRN and exactly the asset sharing its reference."""
+    # arrange
+    parser = LbwfParser()
+
+    # act
+    result: Any = parser.parse(lbwf_homes_xlsx_bytes)
+
+    # assert
+    assert len(result) == 2
+    first, second = result
+
+    assert isinstance(first, LbwfHouse)
+    assert first.reference == 12345
+    assert first.address == "123 Fake Street, London, A10 1AB"
+    assert first.epc == "E"
+    assert first.uprn == 1  # resolved through the energy-breakdown sheet
+    assert len(first.assets) == 1
+    assert isinstance(first.assets[0], LbwfAssetCondition)
+    # The attached asset must be the one for this house, not just any asset.
+    assert first.assets[0].prop_ref == first.reference
+    assert first.assets[0].element_code == "AHR_CAT"
+    assert first.assets[0].quantity == 1
+    assert first.assets[0].install_date is None
+
+    assert isinstance(second, LbwfHouse)
+    assert second.reference == 54321
+    assert second.address == "100 Random Road, London, A10 1AB"
+    assert second.epc == "F"
+    assert second.uprn == 2
+    assert len(second.assets) == 1
+    assert isinstance(second.assets[0], LbwfAssetCondition)
+    assert second.assets[0].prop_ref == second.reference
+    assert second.assets[0].element_code == "INTSMKDET"
+    assert second.assets[0].quantity == 2
+    # The datetime cell is normalised to a plain date by the parser.
+    assert second.assets[0].install_date == datetime(2019, 4, 1).date()
+    assert second.assets[0].remaining_life == 4
+    assert second.assets[0].element_comments == "Source of Data = Joe Bloggs"
--- a/backend/condition/tests/parsing/test_parsing_factory.py
+++ b/backend/condition/tests/parsing/test_parsing_factory.py
@ -0,0 +1,15 @@
+import pytest
+
+from backend.condition.parsing.factory import select_parser
+from backend.condition.file_type import FileType
+
+def test_selects_lbwf_parser():
+    """select_parser returns an LbwfParser for FileType.LBWF."""
+    # arrange + act
+    parser = select_parser(FileType.LBWF)
+
+    # assert
+    assert "LbwfParser" == type(parser).__name__
--- a/backend/condition/tests/test_detect_file_type.py
+++ b/backend/condition/tests/test_detect_file_type.py
@ -0,0 +1,22 @@
+import pytest
+
+from backend.condition.file_type import FileType, detect_file_type
+
+def test_detects_lbwf_file_type():
+    """A path containing "lbwf" is detected as FileType.LBWF."""
+    # arrange + act
+    detected: FileType = detect_file_type("uploads/lbwf/Exaple Asset Data.xlsx")
+
+    # assert
+    assert FileType.LBWF == detected
+
+def test_unknown_filepath_raises_value_error():
+    """A path without a known marker makes detect_file_type raise ValueError."""
+    # arrange
+    unknown_path = "unknown/Example Asset Data.xlsx"
+
+    # act + assert
+    with pytest.raises(ValueError):
+        detect_file_type(unknown_path)
--- a/backend/condition/utils/date_utils.py
+++ b/backend/condition/utils/date_utils.py
@ -0,0 +1,18 @@
+from datetime import datetime, date
+from typing import Any
+
+
+def normalise_date(value: Any, allow_none: bool = True) -> date | None:
+    """Coerce a spreadsheet cell value to a ``date``.
+
+    Accepted inputs:
+      * ``None``     -> ``None`` (only when ``allow_none`` is true)
+      * ``datetime`` -> its date part
+      * ``date``     -> returned unchanged
+      * ``str``      -> parsed as ``DD/MM/YYYY`` (surrounding whitespace ignored)
+
+    Raises:
+        ValueError: if a string does not match ``DD/MM/YYYY``, if ``value``
+            is ``None`` while ``allow_none`` is false, or if ``value`` has
+            any other type.
+    """
+    if value is None and allow_none:
+        return None
+
+    # datetime is a subclass of date, so it has to be tested first to strip
+    # the time component.
+    if isinstance(value, datetime):
+        return value.date()
+
+    # A plain date is already the target type.
+    if isinstance(value, date):
+        return value
+
+    if isinstance(value, str):
+        try:
+            return datetime.strptime(value.strip(), "%d/%m/%Y").date()
+        except ValueError as exc:
+            raise ValueError(f"Invalid date string: {value!r}") from exc
+
+    raise ValueError(f"Unexpected date value: {value!r}")
--- a/pytest.ini
+++ b/pytest.ini
@ -1,4 +1,4 @@
 [pytest]
 pythonpath = .
 addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial
-testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests
+testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests