diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 56c366f4..ccfb55b6 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.12-bullseye +FROM python:3.11.10-bullseye + ARG USER=vscode ARG DEBIAN_FRONTEND=noninteractive @@ -24,12 +25,17 @@ RUN useradd -m -s /usr/bin/bash ${USER} \ && echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \ && chmod 0440 /etc/sudoers.d/${USER} -# 4) Python deps -ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 -# Model +# # 4) Python deps - if you want to run assest list +# ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 # ADD asset_list/requirements.txt requirements.txt -# FASTAPI backend -ADD .devcontainer/requirements.txt requirements.txt +# RUN pip install -r requirements.txt + +# +ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 +ADD backend/engine/requirements.txt requirements1.txt +ADD backend/app/requirements/requirements.txt requirements2.txt +ADD .devcontainer/requirements.txt requirements3.txt +RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt RUN pip install -r requirements.txt # 5) Workdir @@ -37,4 +43,4 @@ WORKDIR /workspaces/model # 6) Make Python find your package # Add project root to PYTHONPATH for all processes -ENV PYTHONPATH=/workspaces/model:${PYTHONPATH} +ENV PYTHONPATH=/workspaces/model:${PYTHONPATH} \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 91a76c3d..761786cd 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -27,5 +27,8 @@ "ms-python.vscode-python-envs" ] } + }, + "containerEnv": { + "PYTHONFLAGS": "-Xfrozen_modules=off" } } diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt index d8c51f19..300b86b0 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/requirements.txt @@ -14,4 +14,7 @@ openpyxl==3.1.2 pytz uvicorn[standard] sqlmodel - +# Testing 
class FileType(Enum):
    """Source formats the condition pipeline knows how to parse.

    Each member's value is the lower-case marker that ``detect_file_type``
    looks for inside an upload path.
    """

    LBWF = "lbwf"


def detect_file_type(filepath: str) -> FileType:
    """Infer the ``FileType`` of an upload from its path.

    The path is lower-cased and each ``FileType`` value is tested as a
    substring, in declaration order; the first member that matches wins.
    Iterating the enum (rather than hard-coding one ``if`` per member) means
    a newly added member is detected without touching this function.

    Args:
        filepath: Path or object key of the uploaded file.

    Returns:
        The first ``FileType`` whose value occurs in the lower-cased path.

    Raises:
        ValueError: If no ``FileType`` value occurs in the path.
    """
    path = filepath.lower()

    for file_type in FileType:
        if file_type.value in path:
            return file_type

    # Include the offending path so a failed upload can be traced from logs.
    raise ValueError(f"Unrecognised file path: {filepath!r}")
def main() -> None:
    """Feed the local LBWF sample workbook through ``process_file``.

    The sample is looked up under ``<base>/condition/sample_data/lbwf``, where
    ``<base>`` is the directory one level above this file's own directory, or
    the current working directory when ``__file__`` is not defined.
    """
    try:
        # Available when run as a script, under a debugger, or from pytest.
        base_dir = Path(__file__).resolve().parents[1]
    except NameError:
        # Notebooks do not define __file__.
        base_dir = Path.cwd()

    # TODO: get this from s3 as part of devcontainer init
    workbook_path = base_dir.joinpath(
        "condition",
        "sample_data",
        "lbwf",
        "LBWF - Example Asset Data September 2025.xlsx",
    )
    source_key = workbook_path.as_posix()

    with workbook_path.open("rb") as stream:
        process_file(file_stream=stream, source_key=source_key)
# setup_logger must be *called*: binding the bare function left `logger`
# without an `.error` attribute, so the row-level error handlers below raised
# AttributeError instead of logging the bad row.
logger = setup_logger()


class LbwfParser(Parser):
    """Parse an LBWF asset workbook into ``LbwfHouse`` records.

    Three sheets are read: "Houses Asset Data" (one row per asset element),
    "Houses" (one row per property) and "All Energy Breakdown " (used only to
    map an address to its UPRN). Assets are attached to the house whose
    ``reference`` equals the asset's ``prop_ref``.
    """

    def parse(self, file_stream: BinaryIO) -> List[LbwfHouse]:
        """Load the workbook from ``file_stream`` and return its houses.

        Args:
            file_stream: Binary stream positioned at the start of an .xlsx file.

        Returns:
            One ``LbwfHouse`` per successfully mapped row of the "Houses"
            sheet, each with its matching assets attached.

        Raises:
            KeyError: If an expected sheet or header column is missing.
            ValueError: If a sheet has no header row, or a UPRN cell holds a
                value that is neither empty nor an ``int``.
        """
        wb: Workbook = load_workbook(file_stream)
        address_to_uprn_map = self._generate_address_to_uprn_dict(wb)

        assets = self._parse_assets(wb)
        houses = self._parse_houses(wb, address_to_uprn_map)

        self._merge_assets_into_houses(assets, houses)

        return houses

    @staticmethod
    def _read_sheet(
        wb: Workbook,
        sheet_name: str,
    ) -> Tuple[Dict[str, int], Iterator[Tuple[Any, ...]]]:
        """Return (header name -> column index, iterator over the data rows)."""
        rows = wb[sheet_name].iter_rows(values_only=True)

        # A sheet with no rows would otherwise leak a bare StopIteration.
        headers = next(rows, None)
        if headers is None:
            raise ValueError(f"Sheet {sheet_name!r} is empty; expected a header row")

        return LbwfParser._get_column_indexes_by_name(headers), rows

    @staticmethod
    def _parse_assets(wb: Workbook) -> List[LbwfAssetCondition]:
        """Map every row of "Houses Asset Data" to an ``LbwfAssetCondition``."""
        header_indexes, rows = LbwfParser._read_sheet(wb, "Houses Asset Data")

        assets: List[LbwfAssetCondition] = []
        for row in rows:
            # Deliberate best-effort: a row that cannot be mapped is logged
            # and skipped so one bad row does not abort the whole file.
            try:
                assets.append(
                    LbwfParser._map_row_to_asset_record(row, header_indexes)
                )
            except Exception as e:
                logger.error(f"Error mapping LBWF row to asset record: {e}")

        return assets

    @staticmethod
    def _parse_houses(
        wb: Workbook,
        address_to_uprn_map: Dict[str, int | None],
    ) -> List[LbwfHouse]:
        """Map every row of "Houses" to an ``LbwfHouse`` (assets left empty)."""
        header_indexes, rows = LbwfParser._read_sheet(wb, "Houses")

        houses: List[LbwfHouse] = []
        for row in rows:
            # Same best-effort policy as _parse_assets; this also drops rows
            # whose address cell is not a string (e.g. blank rows).
            try:
                houses.append(
                    LbwfParser._map_row_to_house_record(
                        row,
                        header_indexes,
                        address_to_uprn_map,
                    )
                )
            except Exception as e:
                logger.error(f"Error mapping LBWF row to house record: {e}")

        return houses

    @staticmethod
    def _merge_assets_into_houses(
        assets: List[LbwfAssetCondition],
        houses: List[LbwfHouse],
    ) -> None:
        """Attach to each house, in place, the assets sharing its reference."""
        assets_by_ref: Dict[Any, List[LbwfAssetCondition]] = defaultdict(list)
        for asset in assets:
            assets_by_ref[asset.prop_ref].append(asset)

        for house in houses:
            # .get() (not indexing) so a house without assets does not insert
            # an empty entry into the defaultdict.
            house.assets = assets_by_ref.get(house.reference, [])

    @staticmethod
    def _map_row_to_house_record(
        row: Tuple[Any, ...],
        header_indexes: Dict[str, int],
        address_to_uprn_map: Dict[str, int | None],
    ) -> LbwfHouse:
        """Build an ``LbwfHouse`` from one "Houses" row.

        Raises:
            KeyError: If an expected header is absent from ``header_indexes``.
        """
        address: str = row[header_indexes["Address"]]

        return LbwfHouse(
            uprn=LbwfParser._get_uprn_from_address(address, address_to_uprn_map),
            reference=row[header_indexes["Reference"]],
            address=address,
            # NOTE(review): the trailing space in "EPC " and the spelling
            # "HOSUE" presumably mirror the source workbook's headers (the
            # test fixture uses the same spellings) - confirm before "fixing".
            epc=row[header_indexes["EPC "]],
            shdf=row[header_indexes["SHDF"]],
            house=row[header_indexes["HOSUE"]],
            fail_decency=row[header_indexes["Fail Decency"]],
            assets=[],
        )

    @staticmethod
    def _map_row_to_asset_record(
        row: Tuple[Any, ...],
        header_indexes: Dict[str, int],
    ) -> LbwfAssetCondition:
        """Build an ``LbwfAssetCondition`` from one "Houses Asset Data" row.

        Raises:
            KeyError: If an expected header is absent from ``header_indexes``.
            ValueError: If "INSTALL DATE" cannot be normalised to a date.
        """
        return LbwfAssetCondition(
            prop_ref=row[header_indexes["PROP REF"]],
            domna=row[header_indexes["Domna"]],
            address=row[header_indexes["ADDRESS"]],
            ownership=row[header_indexes["OWNERSHIP"]],
            prop_status=row[header_indexes["PROP STATUS"]],
            prop_type=row[header_indexes["PROP TYPE"]],
            prop_sub_type=row[header_indexes["PROP SUB TYPE"]],
            element_group=row[header_indexes["ELEMENT GROUP"]],
            element_code=row[header_indexes["ELEMENT CODE"]],
            element_code_description=row[header_indexes["ELEMENT CODE DESCRIPTION"]],
            attribute_code=row[header_indexes["ATTRIBUTE CODE"]],
            attribute_code_description=row[header_indexes["ATTRIBUTE CODE DESCRIPTION"]],
            element_date_value=row[header_indexes["ELEMENT DATE VALUE"]],
            element_numerical_value=row[header_indexes["ELEMENT NUMERIC VALUE"]],
            element_text_value=row[header_indexes["ELEMENT TEXT VALUE"]],
            quantity=row[header_indexes["QUANTITY"]],
            install_date=normalise_date(row[header_indexes["INSTALL DATE"]]),
            remaining_life=row[header_indexes["REMAINING LIFE"]],
            element_comments=row[header_indexes["ELEMENT COMMENTS"]],
        )

    @staticmethod
    def _generate_address_to_uprn_dict(wb: Workbook) -> Dict[str, int | None]:
        """Build an address -> UPRN map from the "All Energy Breakdown " sheet.

        Rows whose address cell is not a string are skipped. An empty UPRN
        cell is stored as ``None``.

        Raises:
            ValueError: If a UPRN cell holds a value that is neither ``None``
                nor an ``int``.
        """
        # The trailing space is part of the sheet name.
        header_indexes, rows = LbwfParser._read_sheet(wb, "All Energy Breakdown ")

        address_idx = header_indexes["Address"]
        uprn_idx = header_indexes["UPRN"]

        mapping: Dict[str, int | None] = {}

        for row in rows:
            address = row[address_idx]
            uprn = row[uprn_idx]

            if not isinstance(address, str):
                continue

            if uprn is not None and not isinstance(uprn, int):
                raise ValueError(f"Unexpected UPRN value: {uprn!r}")

            mapping[address] = uprn

        return mapping

    @staticmethod
    def _get_column_indexes_by_name(
        headers: Tuple[object | None, ...]
    ) -> Dict[str, int]:
        """Map each string header to its column index; non-strings are ignored.

        When a header text occurs more than once, the last occurrence wins.
        """
        return {
            header: i
            for i, header in enumerate(headers)
            if isinstance(header, str)
        }

    @staticmethod
    def _get_uprn_from_address(
        address: str,
        address_to_uprn_map: Dict[str, int | None],
    ) -> int | None:
        """Look up the UPRN for ``address`` by its first comma-separated part.

        The comparison is case-insensitive. Returns ``None`` when no key
        matches (or when the matching entry itself holds ``None``).
        """
        pseudo_name = address.split(",")[0]

        # Fast path: the upper-cased key, as used by the original lookup.
        upper_name = pseudo_name.upper()
        if upper_name in address_to_uprn_map:
            return address_to_uprn_map[upper_name]

        # Fallback for keys that are not fully upper-case. The previous code
        # detected such keys case-insensitively but then indexed with
        # .upper(), raising KeyError.
        wanted = pseudo_name.lower()
        for known_address, uprn in address_to_uprn_map.items():
            if known_address.lower() == wanted:
                return uprn

        return None
@dataclass
class LbwfHouse:
    """One property row from the LBWF "Houses" sheet, plus its asset records.

    Field order is the constructor's positional order; the parser in this
    change populates every field by keyword from the raw cell values.
    """

    # UPRN resolved from the address; None when the address has no match
    # (the parser's lookup is annotated to return ``int | None``).
    uprn: int | None
    # Value of the "Reference" column; assets are matched to a house by
    # comparing their prop_ref with this value.
    reference: int
    address: str
    epc: str # TODO: make enum
    # NOTE(review): the parser stores the raw "SHDF" cell and the test fixture
    # in this change uses the string "NO" - confirm whether this is really bool.
    shdf: bool
    house: str
    fail_decency: int
    # Asset records attached after parsing; the parser initialises this to [].
    assets: List[LbwfAssetCondition]
@pytest.fixture
def lbwf_homes_xlsx_bytes() -> BytesIO:
    """Return an in-memory .xlsx with the three sheets the LBWF parser reads.

    Sheets are created in this order: "Houses Asset Data" (two asset rows),
    "Houses" (two house rows) and "All Energy Breakdown " (two UPRN rows).
    The returned stream is rewound to position 0.
    """
    london_address_1 = "123 Fake Street, London, A10 1AB"
    london_address_2 = "100 Random Road, London, A10 1AB"

    asset_rows = [
        [
            "PROP REF",
            "Domna",
            "ADDRESS",
            "OWNERSHIP",
            "PROP STATUS",
            "PROP TYPE",
            "PROP SUB TYPE",
            "ELEMENT GROUP",
            "ELEMENT CODE",
            "ELEMENT CODE DESCRIPTION",
            "ATTRIBUTE CODE",
            "ATTRIBUTE CODE DESCRIPTION",
            "ELEMENT DATE VALUE",
            "ELEMENT NUMERIC VALUE",
            "ELEMENT TEXT VALUE",
            "QUANTITY",
            "INSTALL DATE",
            "REMAINING LIFE",
            "ELEMENT COMMENTS",
        ],
        [
            12345,
            12345,
            london_address_1,
            "LBWF_OWNED",
            "OCCP",
            "HOU",
            "TERRACED",
            "ASSETS",
            "AHR_CAT",
            "Accessible Housing Register Category",
            "F",
            "General Needs",
            None,
            None,
            None,
            1,
            None,
            None,
            None,
        ],
        [
            54321,
            54321,
            london_address_2,
            "LBWF_OWNED",
            "OCCP",
            "HOU",
            "EOT",
            "ASSETS",
            "INTSMKDET",
            "Smoke Detectors in Property",
            "HARDWRDMNS",
            "Hard Wired Mains Smoke Alarm in Property",
            None,
            None,
            None,
            2,
            datetime(2019, 4, 1),
            4,
            "Source of Data = Joe Bloggs",
        ],
    ]

    house_rows = [
        ["Reference", "Address", "EPC ", "SHDF", "HOSUE", "Fail Decency"],
        [12345, london_address_1, "E", "NO", "HOUSE", 2025],
        [54321, london_address_2, "F", "NO", "HOUSE", 2025],
    ]

    energy_rows = [
        [
            "UPRN",
            "Organisation Reference",
            "Alternate Organisation Reference",
            "Address",
            "Postcode",
        ],
        [1, 200, None, "123 FAKE STREET", "A10 1AB"],
        [2, 100, 101, "100 RANDOM ROAD", "A10 1AB"],
    ]

    workbook = Workbook()

    # The default sheet becomes the asset sheet; the others are appended.
    asset_sheet = workbook.active
    asset_sheet.title = "Houses Asset Data"
    for cells in asset_rows:
        asset_sheet.append(cells)

    house_sheet = workbook.create_sheet("Houses")
    for cells in house_rows:
        house_sheet.append(cells)

    # Trailing space is intentional; matches source
    energy_sheet = workbook.create_sheet("All Energy Breakdown ")
    for cells in energy_rows:
        energy_sheet.append(cells)

    buffer = BytesIO()
    workbook.save(buffer)
    buffer.seek(0)

    return buffer
backend.condition.file_type import FileType, detect_file_type + +def test_detects_lbwf_file_type(): + # arrange + file_path_str = "uploads/lbwf/Exaple Asset Data.xlsx" + expected_file_type = FileType.LBWF + + # act + actual_file_type: FileType = detect_file_type(file_path_str) + + # assert + assert expected_file_type == actual_file_type + +def test_unknown_filepath_raises_value_error(): + # arrange + file_path_str = "unknown/Example Asset Data.xlsx" + + # act + assert + with pytest.raises(ValueError): + detect_file_type(file_path_str) \ No newline at end of file diff --git a/backend/condition/utils/date_utils.py b/backend/condition/utils/date_utils.py new file mode 100644 index 00000000..713d151c --- /dev/null +++ b/backend/condition/utils/date_utils.py @@ -0,0 +1,18 @@ +from datetime import datetime, date +from typing import Any + + +def normalise_date(value: Any, allow_none: bool = True) -> date | None: + if value is None and allow_none: + return None + + if isinstance(value, datetime): + return value.date() + + if isinstance(value, str): + try: + return datetime.strptime(value.strip(), "%d/%m/%Y").date() + except ValueError as exc: + raise ValueError(f"Invalid date string: {value!r}") from exc + + raise ValueError(f"Unexpected date value: {value!r}") \ No newline at end of file diff --git a/pytest.ini b/pytest.ini index 84c686b1..1422657b 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] pythonpath = . addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests