Merge branch 'main' of https://github.com/Hestia-Homes/Model into debugging-ara-runs

This commit is contained in:
Khalim Conn-Kowlessar 2026-02-10 14:31:19 +00:00
commit 50256d4cff
44 changed files with 754 additions and 189 deletions

View file

@ -27,8 +27,9 @@ RUN useradd -m -s /usr/bin/bash ${USER} \
# # 4) Python deps - if you want to run assest list
ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
ADD asset_list/requirements.txt requirements.txt
RUN pip install -r requirements.txt
ADD .devcontainer/asset_list/requirements.txt requirements2.txt
ADD asset_list/requirements.txt requirements1.txt
RUN cat requirements1.txt requirements2.txt >> requirements.txt
RUN pip install -r requirements.txt
# 5) Workdir

View file

@ -15,10 +15,9 @@ uvicorn[standard]
pytest==9.0.2
pytest-cov==7.0.0
ipykernel>=6.25,<7
pydantic-settings<2
pyyaml>=6.0.1
pydantic>=1.10.7,<2
sqlmodel
# Formatting
black==26.1.0
dotenv
pydantic-settings

View file

@ -13,6 +13,9 @@ on:
required: false
default: "."
type: string
build_args:
required: false
type: string
outputs:
image_digest:
@ -29,11 +32,22 @@ on:
required: true
AWS_REGION:
required: true
DEV_DB_HOST:
required: false
DEV_DB_PORT:
required: false
DEV_DB_NAME:
required: false
jobs:
build:
runs-on: ubuntu-latest
env:
DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }}
DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }}
DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
outputs:
image_digest: ${{ steps.digest.outputs.image_digest }}
ecr_repo_url: ${{ steps.repo.outputs.ecr_repo_url }}
@ -64,7 +78,22 @@ jobs:
- name: Build & push image
run: |
IMAGE_URI="${{ steps.repo.outputs.ecr_repo_url }}:${GITHUB_SHA}"
docker build -f ${{ inputs.dockerfile_path }} -t $IMAGE_URI ${{ inputs.build_context }}
# Writes build args and removes line breaks
BUILD_ARGS=""
while IFS= read -r line; do
# skip empty lines
[ -n "$line" ] || continue
temp=$(eval echo "$line")
BUILD_ARGS="$BUILD_ARGS --build-arg $temp"
done <<< "${{ inputs.build_args }}"
docker build \
-f ${{ inputs.dockerfile_path }} \
$BUILD_ARGS \
-t $IMAGE_URI \
${{ inputs.build_context }}
docker push $IMAGE_URI
- name: Resolve image digest

View file

@ -16,6 +16,7 @@ jobs:
id: set-stage
shell: bash
run: |
env
BRANCH="${GITHUB_REF_NAME}"
if [[ "$BRANCH" == "prod" ]]; then
@ -73,8 +74,8 @@ jobs:
uses: ./.github/workflows/_build_image.yml
with:
ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }}
dockerfile_path: backend/address2UPRN/Dockerfile
build_context: backend/address2UPRN
dockerfile_path: backend/address2UPRN/handler/Dockerfile
build_context: .
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
@ -96,3 +97,76 @@ jobs:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
# ============================================================
# 2⃣ Build Postcode Splitter image and Push
# ============================================================
postcodeSplitter_image:
needs: [determine_stage, shared_terraform]
uses: ./.github/workflows/_build_image.yml
with:
ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }}
dockerfile_path: backend/postcode_splitter/handler/Dockerfile
build_context: .
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
# ============================================================
# 3⃣ Deploy Postcode Splitter Lambda
# ============================================================
postcodeSplitter_lambda:
needs: [postcodeSplitter_image, determine_stage]
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: postcodeSplitter
lambda_path: infrastructure/terraform/lambda/postcodeSplitter
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.postcodeSplitter_image.outputs.image_digest }}
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
# ============================================================
# Condition ETL image and Push
# ============================================================
condition_etl_image:
needs: [determine_stage, shared_terraform]
uses: ./.github/workflows/_build_image.yml
with:
ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }}
dockerfile_path: backend/condition/handler/Dockerfile
build_context: .
build_args: |
DEV_DB_HOST=$DEV_DB_HOST
DEV_DB_PORT=$DEV_DB_PORT
DEV_DB_NAME=$DEV_DB_NAME
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }}
DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }}
DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
# ============================================================
# Deploy Condition ETL Lambda
# ============================================================
condition_etl_lambda:
needs: [condition_etl_image, determine_stage]
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: condition-etl
lambda_path: infrastructure/terraform/lambda/condition-etl
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.condition_etl_image.outputs.image_digest }}
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}

View file

@ -6,17 +6,17 @@ DB_PASSWORD=makingwarmerhomes
#not used
GOOGLE_SOLAR_API_KEY="test"
SAP_PREDICTIONS_BUCKET="test"
CARBON_PREDICTIONS_BUCKET="test"
HEAT_PREDICTIONS_BUCKET="test"
HEATING_KWH_PREDICTIONS_BUCKET="test"
HOTWATER_KWH_PREDICTIONS_BUCKET="test"
API_KEY="test"
ENVIRONMENT="test"
SECRET_KEY="test"
PLAN_TRIGGER_BUCKET="test"
DATA_BUCKET="test"
EPC_AUTH_TOKEN="test"
ENGINE_SQS_URL="test"
ENERGY_ASSESSMENTS_BUCKET="test"
GOOGLE_SOLAR_API_KEY=test
SAP_PREDICTIONS_BUCKET=test
CARBON_PREDICTIONS_BUCKET=test
HEAT_PREDICTIONS_BUCKET=test
HEATING_KWH_PREDICTIONS_BUCKET=test
HOTWATER_KWH_PREDICTIONS_BUCKET=test
API_KEY=test
ENVIRONMENT=test
SECRET_KEY=test
PLAN_TRIGGER_BUCKET=test
DATA_BUCKET=test
EPC_AUTH_TOKEN=test
ENGINE_SQS_URL=test
ENERGY_ASSESSMENTS_BUCKET=test

View file

@ -1,7 +0,0 @@
FROM public.ecr.aws/lambda/python:3.10
# Copy function code
COPY main.py .
# Set the handler
CMD ["main.handler"]

View file

@ -0,0 +1,23 @@
FROM public.ecr.aws/lambda/python:3.10
# Set working directory (Lambda task root)
WORKDIR /var/task
# -----------------------------
# Copy requirements FIRST (for Docker layer caching)
# -----------------------------
COPY backend/address2UPRN/handler/requirements.txt .
# Install dependencies into Lambda runtime
RUN pip install --no-cache-dir -r requirements.txt
# -----------------------------
# Copy application code
# -----------------------------
COPY utils/ utils/
COPY backend/address2UPRN/main.py .
# -----------------------------
# Lambda handler
# -----------------------------
CMD ["main.handler"]

View file

@ -0,0 +1,3 @@
epc-api-python==1.0.2
tqdm
pandas

View file

@ -212,6 +212,8 @@ def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3):
method="get",
params={"postcode": postcode},
)
if not search_resp or "rows" not in search_resp:
return pd.DataFrame()
results_df = pd.DataFrame(search_resp["rows"], columns=search_resp["column-names"])
@ -298,7 +300,7 @@ def get_uprn_candidates(
)
def get_uprn(user_inputed_address: str, postcode: str):
def get_uprn(user_inputed_address: str, postcode: str, return_address=False):
"""
Return uprn (str)
Return False if failed to find a sensible matching epc
@ -337,6 +339,8 @@ def get_uprn(user_inputed_address: str, postcode: str):
if found_uprn == "":
return None
if return_address:
return found_uprn, address
return found_uprn

View file

@ -1,17 +1,24 @@
import pandas as pd
from tqdm import tqdm
from backend.address2UPRN.main import get_uprn
# Enable tqdm for pandas
tqdm.pandas()
df = pd.read_excel("address2.xlsx")
# use Address 1
junte_df = pd.read_excel("hackney_uprn_failures.xlsx")
def extract_uprn(row):
print(row["User Input"], row["Postcode"])
result = get_uprn(row["User Input"], row["Postcode"], return_address=True)
if result is None:
return pd.Series([None, None])
uprn, found_address = result
return pd.Series([uprn, found_address])
# use domna_address_1
khalim_df = pd.read_excel("khalim_standard.xlsx")
combined_df = junte_df.merge(khalim_df, how="left", left_on="Address 1", right_on='domna_address_1')
# Find the row in khalim_df that does not app
result = combined_df[~pd.isnull(combined_df["epc_os_uprn"])]
df[["juntes uprn", "junte found address"]] = df.progress_apply(extract_uprn, axis=1)
df.to_excel("outputs2.xlsx", index=False)

View file

@ -0,0 +1,33 @@
from enum import Enum
from typing import Optional
from pydantic import BaseModel
class ConditionFileType(Enum):
LBWF = "LBWF"
Peabody = "Peabody"
# TODO: make these asset management systems rather than client names
class ConditionTriggerRequest(BaseModel):
file_type: ConditionFileType
trigger_file_bucket: str # TODO: get this from settings
trigger_file_key: str
uprn_lookup_file_bucket: Optional[str] = None # TODO: get this from settings
uprn_lookup_file_key: Optional[str] = None
# {
# "file_type": "Peabody",
# "trigger_file_bucket": "condition-data-dev",
# "trigger_file_key": "input/peabody/2026_01_06 - Peabody - Stock Condition Data - Survey Records - D Lower.xlsx",
# "uprn_lookup_file_bucket": "condition-data-dev",
# "uprn_lookup_file_key": "input/peabody/uprn-lookup/PeabodyPropertymatched_Dec25_propref_UPRN.csv"
# }
# {
# "file_type": "LBWF",
# "trigger_file_bucket": "condition-data-dev",
# "trigger_file_key": "input/lbwf/LBWF - Example Asset Data September 2025.xlsx",
# }

View file

@ -1,4 +1,4 @@
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, Optional, Tuple
from datetime import date
from backend.condition.domain.aspect_condition import AspectCondition

View file

@ -1,16 +0,0 @@
from enum import Enum
class FileType(Enum):
LBWF = "lbwf"
Peabody = "peabody"
def detect_file_type(filepath: str) -> FileType:
path = filepath.lower()
if "lbwf" in path:
return FileType.LBWF
if "peabody" in path:
return FileType.Peabody
raise ValueError("Unrecognised file path")

View file

@ -1,16 +0,0 @@
from typing import Mapping, Any
from io import BytesIO
from utils.logger import setup_logger
from backend.condition.processor import process_file
logger = setup_logger()
def handler(event: Mapping[str, Any], context: Any) -> None:
# Temporary stub for PoC wiring
dummy_stream = BytesIO(b"")
source_key = event.get("source_key", "unknown-source")
process_file(dummy_stream, source_key)

View file

@ -0,0 +1,48 @@
FROM public.ecr.aws/lambda/python:3.11
# For local running:
# FROM python:3.11.10-bullseye
ARG DEV_DB_HOST
ARG DEV_DB_PORT
ARG DEV_DB_NAME
# Set working directory (Lambda task root)
WORKDIR /var/task
# Environment
ENV DB_HOST=${DEV_DB_HOST}
ENV DB_PORT=${DEV_DB_PORT}
ENV DB_NAME=${DEV_DB_NAME}
COPY backend/.env.local backend/.env.local
# -----------------------------
# Copy requirements FIRST (for Docker layer caching)
# -----------------------------
COPY backend/condition/handler/requirements.txt .
# Install dependencies into Lambda runtime
RUN pip install --no-cache-dir -r requirements.txt
# -----------------------------
# Copy application code
# -----------------------------
COPY utils/ utils/
COPY backend/condition/ backend/condition/
COPY backend/app/db/models/condition.py backend/app/db/models/condition.py
COPY backend/app/db/connection.py backend/app/db/connection.py
COPY backend/app/config.py backend/app/config.py
COPY backend/__init__.py backend/__init__.py
COPY backend/app/__init__.py backend/app/__init__.py
COPY backend/app/db/__init__.py backend/app/db/__init__.py
# -----------------------------
# Lambda handler
# -----------------------------
CMD ["backend/condition/handler/handler.handler"]
# For local running
# CMD ["python", "-m", "backend.condition.handler.handler"]

View file

@ -0,0 +1,51 @@
import json
from typing import Mapping, Any
from io import BytesIO
from backend.condition.condition_trigger_request import ConditionTriggerRequest
from backend.condition.lookups.uprn_lookup_s3 import UprnLookupS3
from backend.condition.processor import process_file
from utils.logger import setup_logger
from utils.s3 import read_io_from_s3
logger = setup_logger()
def handler(event: Mapping[str, Any], context: Any) -> None:
for record in event.get("Records", []):
try:
body_dict = json.loads(record["body"])
logger.debug("Validating request body")
payload = ConditionTriggerRequest.model_validate(body_dict)
logger.debug("Successfully validated request body")
if payload.uprn_lookup_file_bucket and payload.uprn_lookup_file_key:
logger.debug("Getting UPRN lookup file from s3")
uprn_lookup = UprnLookupS3(
bucket=payload.uprn_lookup_file_bucket,
key=payload.uprn_lookup_file_key,
) # TODO: replace with postgres implementation
logger.debug("Successfully got UPRN lookup file from s3")
else:
uprn_lookup = None
logger.debug("Getting conditions data from s3")
file_bytes: BytesIO = read_io_from_s3(
bucket_name=payload.trigger_file_bucket,
file_key=payload.trigger_file_key,
)
logger.debug(
"Successfully got conditions data from s3. Moving on to process file..."
)
process_file(
file_stream=file_bytes,
file_type=payload.file_type,
uprn_lookup=uprn_lookup,
)
except Exception as e:
logger.error(f"Failed to process record: {e}")

View file

@ -0,0 +1,9 @@
openpyxl
sqlmodel
pydantic-settings
psycopg2-binary==2.9.10
# pandas isn't used, but needed for importing from utils.s3
pandas==2.2.2
numpy==1.26.4
openpyxl

View file

@ -1,5 +1,7 @@
from pathlib import Path
from backend.condition.condition_trigger_request import ConditionFileType
from backend.condition.lookups.uprn_lookup_csv import UprnLookupLocal
from backend.condition.processor import process_file
@ -20,15 +22,27 @@ def main() -> None:
/ "peabody"
/ "2026_01_06 - Peabody - Stock Condition Data - Survey Records - D Lower.xlsx"
)
filepaths = [lbwf_path, peabody_path]
# filepaths = [lbwf_path]
peabody_uprn_lookup_path: Path = (
path / "peabody" / "PeabodyPropertymatched_Dec25_propref_UPRN.csv"
)
# filepaths = [lbwf_path, peabody_path]
filepaths = [lbwf_path]
# filepaths = [peabody_path]
uprn_lookup = UprnLookupLocal(csv_path=peabody_uprn_lookup_path.as_posix())
def get_file_type(file_path: str) -> ConditionFileType:
if "peabody" in file_path:
return ConditionFileType.Peabody
if "lbwf" in file_path:
return ConditionFileType.LBWF
for fp in filepaths:
with fp.open("rb") as f:
process_file(
file_stream=f,
source_key=fp.as_posix(),
file_type=get_file_type(fp.as_posix()),
uprn_lookup=uprn_lookup,
)

View file

@ -0,0 +1,8 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Dict
class UprnLookup(ABC):
@abstractmethod
def get_property_ref_to_uprn_lookup(self) -> Dict[str, int]:
pass

View file

@ -0,0 +1,23 @@
import csv
from io import TextIOWrapper
from typing import BinaryIO, Dict, TextIO
from backend.condition.lookups.uprn_lookup import UprnLookup
class UprnLookupLocal(UprnLookup):
def __init__(self, csv_path: str):
self.csv_path = csv_path
def get_property_ref_to_uprn_lookup(self) -> Dict[str, int]:
with open(self.csv_path, "rb") as f:
return self.parse_csv(f)
def parse_csv(self, file_stream: BinaryIO) -> Dict[str, int]:
text_stream: TextIO = TextIOWrapper(file_stream, encoding="utf-8")
mapping: Dict[str, int] = {}
reader = csv.DictReader(text_stream)
for row in reader:
if not row["reference"] or not row["out_uprn"]:
continue
mapping[row["reference"].strip()] = int(row["out_uprn"].strip())
return mapping

View file

@ -0,0 +1,29 @@
import csv
from io import BytesIO, TextIOWrapper
from typing import BinaryIO, Dict, TextIO
from backend.condition.lookups.uprn_lookup import UprnLookup
from utils.s3 import read_io_from_s3
class UprnLookupS3(UprnLookup):
def __init__(self, bucket: str = "", key: str = ""):
self.bucket = bucket
self.key = key
def get_property_ref_to_uprn_lookup(self) -> Dict[str, int]:
file_bytes: BytesIO = read_io_from_s3(
bucket_name=self.bucket, file_key=self.key
)
return self._parse_csv_bytes(file_bytes)
def _parse_csv_bytes(self, file_stream: BinaryIO) -> Dict[str, int]:
text_stream: TextIO = TextIOWrapper(file_stream, encoding="utf-8")
mapping: Dict[str, int] = {}
reader = csv.DictReader(text_stream)
for row in reader:
if not row["reference"] or not row["out_uprn"]:
continue
mapping[row["reference"].strip()] = int(row["out_uprn"].strip())
return mapping

View file

@ -1,27 +1,35 @@
from typing import Optional
from backend.condition.condition_trigger_request import ConditionFileType
from backend.condition.domain.mapping.lbwf.lbwf_mapper import LbwfMapper
from backend.condition.domain.mapping.mapper import Mapper
from backend.condition.domain.mapping.peabody.peabody_mapper import PeabodyMapper
from backend.condition.file_type import FileType
from backend.condition.lookups.uprn_lookup import UprnLookup
from backend.condition.parsing.parser import Parser
from backend.condition.parsing.lbwf_parser import LbwfParser
from backend.condition.parsing.peabody_parser import PeabodyParser
def select_parser(file_type: FileType) -> Parser:
if file_type is FileType.LBWF:
def select_parser(
file_type: ConditionFileType, uprn_lookup: Optional[UprnLookup] = None
) -> Parser:
if file_type is ConditionFileType.LBWF:
return LbwfParser()
if file_type is FileType.Peabody:
return PeabodyParser()
if file_type is ConditionFileType.Peabody:
if not uprn_lookup:
raise ValueError(
"Cannot instantiate Peabody Parser without UPRN lookup being provided"
)
return PeabodyParser(uprn_lookup=uprn_lookup)
raise ValueError("Unrecognised file type, unable to instantiate Parser")
def select_mapper(file_type: FileType) -> Mapper:
if file_type is FileType.LBWF:
def select_mapper(file_type: ConditionFileType) -> Mapper:
if file_type is ConditionFileType.LBWF:
return LbwfMapper()
if file_type is FileType.Peabody:
if file_type is ConditionFileType.Peabody:
return PeabodyMapper()
raise ValueError("Unrecognised file type, unable to instantiate Mapper")

View file

@ -18,7 +18,6 @@ class LbwfParser(Parser):
def parse(
self,
file_stream: BinaryIO,
location_ref_to_uprn_map: Optional[Dict[str, int]] = None,
) -> Any:
wb: Workbook = load_workbook(file_stream)
address_to_uprn_map: Dict[str, int] = LbwfParser._generate_address_to_uprn_dict(

View file

@ -8,6 +8,5 @@ class Parser(ABC):
def parse(
self,
file_stream: BinaryIO,
location_ref_to_uprn_map: Optional[Dict[str, int]] = None,
) -> Any:
pass

View file

@ -4,6 +4,7 @@ from typing import Any, BinaryIO, Dict, List, Optional, Tuple, DefaultDict
from openpyxl import Workbook, load_workbook
from collections import defaultdict
from backend.condition.lookups.uprn_lookup import UprnLookup
from backend.condition.parsing.parser import Parser
from backend.condition.parsing.records.peabody.peabody_asset_condition import (
PeabodyAssetCondition,
@ -15,42 +16,29 @@ logger = setup_logger()
class PeabodyParser(Parser):
def __init__(self, uprn_lookup: UprnLookup):
self.uprn_lookup: UprnLookup = uprn_lookup # TODO: move this to the ABC?
def parse(
self,
file_stream: BinaryIO,
location_ref_to_uprn_map: Optional[Dict[str, int]] = None,
) -> Any:
wb: Workbook = load_workbook(file_stream)
if location_ref_to_uprn_map is None:
location_ref_to_uprn_map: Dict[str, int] = (
PeabodyParser._build_location_ref_to_uprn_map()
)
file_stream.seek(0)
logger.debug("[PeabodyParser] Loading workbook...")
wb: Workbook = load_workbook(file_stream, read_only=True, data_only=True)
logger.debug("[PeabodyParser] Successfully loaded workbook. Parsing assets...")
assets = PeabodyParser._parse_assets(wb)
logger.debug(
"[PeabodyParser] Successfully parsed assets. Parsing UPRN lookup..."
)
location_ref_to_uprn_map = self.uprn_lookup.get_property_ref_to_uprn_lookup()
logger.debug("[PeabodyParser] Successfully parsed UPRN lookup")
return PeabodyParser._group_assets_into_properties(
assets=assets,
location_ref_to_uprn_map=location_ref_to_uprn_map,
)
@staticmethod
def _build_location_ref_to_uprn_map() -> Dict[str, int]:
location_ref_to_uprn_filepath: Path = (
Path(__file__).resolve().parents[1]
/ "sample_data"
/ "peabody"
/ "PeabodyPropertymatched_Dec25_propref_UPRN.csv"
)
location_ref_to_uprn_map: Dict[str, int] = {}
with location_ref_to_uprn_filepath.open(newline="") as f:
reader: Any = csv.DictReader(f)
for row in reader:
location_ref_to_uprn_map[row["reference"]] = int(row["out_uprn"])
return location_ref_to_uprn_map
@staticmethod
def _parse_assets(wb: Workbook) -> List[PeabodyAssetCondition]:
assets_sheet = wb["Survey Records - D & Lower"]
@ -67,7 +55,7 @@ class PeabodyParser(Parser):
)
if not asset.is_block_level:
# Block-level condition surveys are out of scope for now
# until we have a wider think on how to handle block
# until we have a wider think on how to handle blocks
assets.append(asset) # TODO: handle block-level assets
except Exception as e:
@ -92,13 +80,14 @@ class PeabodyParser(Parser):
assets_by_location_reference[asset.lo_reference].append(asset)
properties: List[PeabodyProperty] = []
failed_mappings_count = 0
for location_ref, grouped_assets in assets_by_location_reference.items():
uprn = location_ref_to_uprn_map.get(location_ref)
if uprn is None:
logger.warning(f"No UPRN found for Location Reference: {location_ref}")
failed_mappings_count += 1
continue
properties.append(
@ -108,6 +97,7 @@ class PeabodyParser(Parser):
)
)
logger.warning(f"No UPRN found for {failed_mappings_count} Location References")
return properties
@staticmethod

View file

@ -19,18 +19,19 @@ class ConditionPostgres:
def bulk_insert_surveys(
self, surveys: List[PropertyConditionSurvey], batch_size: Optional[int] = 100
) -> None:
logger.info(
f"Preparing to load {len(surveys)} property surveys to Postgres. Mapping to SQLModel objects..."
logger.debug(
f"[ConditionPostgres] Preparing to load {len(surveys)} property surveys to Postgres. Mapping to SQLModel objects..."
)
survey_models: List[PropertyConditionSurveyModel] = [
ConditionPostgres.map_survey_to_model(s) for s in surveys
]
total: int = len(survey_models)
logger.info(
f"Finished mapping {total} surveys. Writing to database in batches of {batch_size}..."
logger.debug(
f"[ConditionPostgres] Finished mapping {total} surveys. Writing to database in batches of {batch_size}..."
)
with db_session() as session:
logger.info("[ConditionPostgres] Successfully made connection to database")
for start in range(0, total, batch_size):
end = min(start + batch_size, total)
batch = survey_models[start:end]

View file

@ -1,26 +1,31 @@
from typing import Any, BinaryIO, List
from typing import Any, BinaryIO, List, Optional
from datetime import datetime
from backend.condition.condition_trigger_request import ConditionFileType
from backend.condition.lookups.uprn_lookup import UprnLookup
from utils.logger import setup_logger
from backend.condition.domain.mapping.mapper import Mapper
from backend.condition.domain.property_condition_survey import PropertyConditionSurvey
from backend.condition.parsing.parser import Parser
from backend.condition.persistence.condition_postgres import ConditionPostgres
from backend.condition.file_type import FileType, detect_file_type
from backend.condition.parsing.factory import select_parser, select_mapper
logger = setup_logger()
def process_file(file_stream: BinaryIO, source_key: str) -> None:
logger.info(f"[processor] Received file: {source_key}")
def process_file(
file_stream: BinaryIO,
file_type: ConditionFileType,
uprn_lookup: Optional[UprnLookup],
) -> None:
# Instantiation
file_type: FileType = detect_file_type(source_key)
parser: Parser = select_parser(file_type)
logger.debug(f"[processor] Instantiating classes...")
parser: Parser = select_parser(file_type, uprn_lookup)
mapper: Mapper = select_mapper(file_type)
persistence = ConditionPostgres()
logger.debug(f"[processor] Finished instantiating classes. Calling Parser...")
# Orchestration
raw_properties: List[Any] = parser.parse(file_stream)

View file

@ -0,0 +1,34 @@
import pytest
from typing import Dict
from tempfile import NamedTemporaryFile
from backend.condition.lookups.uprn_lookup_csv import UprnLookupLocal
@pytest.fixture
def prop_ref_uprn_csv_file() -> str:
csv_content = """reference,out_uprn
ABC123,10000000001
DEF456,10000000002
GHI789,10000000003
"""
with NamedTemporaryFile(mode="w+", delete=False, suffix=".csv") as tmp:
tmp.write(csv_content)
tmp.flush()
return tmp.name
def test_generate_prop_ref_uprn_from_csv_file(prop_ref_uprn_csv_file: str) -> None:
# arrange
uprn_lookup = UprnLookupLocal(prop_ref_uprn_csv_file)
expected_map: Dict[str, int] = {
"ABC123": 10000000001,
"DEF456": 10000000002,
"GHI789": 10000000003,
}
# act
actual_map: Dict[str, int] = uprn_lookup.get_property_ref_to_uprn_lookup()
# assert
assert actual_map == expected_map

View file

@ -1,11 +1,13 @@
import pytest
from backend.condition.condition_trigger_request import ConditionFileType
from backend.condition.lookups.uprn_lookup_csv import UprnLookupLocal
from backend.condition.parsing.factory import select_parser
from backend.condition.file_type import FileType
def test_selects_lbwf_parser():
# arrange
file_type = FileType.LBWF
file_type = ConditionFileType.LBWF
expected_class_name = "LbwfParser"
# act
@ -14,13 +16,15 @@ def test_selects_lbwf_parser():
# assert
assert expected_class_name == actual_class_name
def test_selects_peabody_parser():
# arrange
file_type = FileType.Peabody
file_type = ConditionFileType.Peabody
expected_class_name = "PeabodyParser"
uprn_lookup = UprnLookupLocal(csv_path="test")
# act
actual_class_name = select_parser(file_type).__class__.__name__
actual_class_name = select_parser(file_type, uprn_lookup).__class__.__name__
# assert
assert expected_class_name == actual_class_name
assert expected_class_name == actual_class_name

View file

@ -1,9 +1,11 @@
from tempfile import NamedTemporaryFile
import pytest
from typing import Any, Dict
from io import BytesIO
from openpyxl import Workbook
from datetime import datetime
from backend.condition.lookups.uprn_lookup_csv import UprnLookupLocal
from backend.condition.parsing.peabody_parser import PeabodyParser
from backend.condition.parsing.records.peabody.peabody_asset_condition import (
PeabodyAssetCondition,
@ -145,23 +147,28 @@ def peabody_assets_xlsx_bytes() -> BytesIO:
@pytest.fixture
def location_ref_to_uprn_map() -> Dict[str, int]:
return {
"B000RAND": 1,
"B000BLOCK": 2,
"B000FAKE": 3,
"B000MIS": 4,
}
def prop_ref_uprn_csv_file() -> str:
csv_content = """reference,out_uprn
B000RAND,1
B000BLOCK,2
B000FAKE,3
B000MIS,4
"""
with NamedTemporaryFile(mode="w+", delete=False, suffix=".csv") as tmp:
tmp.write(csv_content)
tmp.flush()
return tmp.name
def test_peabody_parser_parses_conditions(
peabody_assets_xlsx_bytes, location_ref_to_uprn_map
peabody_assets_xlsx_bytes, prop_ref_uprn_csv_file
):
# arrange
parser = PeabodyParser()
uprn_lookup = UprnLookupLocal(csv_path=prop_ref_uprn_csv_file)
parser = PeabodyParser(uprn_lookup=uprn_lookup)
# act
result: Any = parser.parse(peabody_assets_xlsx_bytes, location_ref_to_uprn_map)
result: Any = parser.parse(peabody_assets_xlsx_bytes)
# assert
assert len(result) == 3

View file

@ -1,22 +0,0 @@
import pytest
from backend.condition.file_type import FileType, detect_file_type
def test_detects_lbwf_file_type():
# arrange
file_path_str = "uploads/lbwf/Exaple Asset Data.xlsx"
expected_file_type = FileType.LBWF
# act
actual_file_type: FileType = detect_file_type(file_path_str)
# assert
assert expected_file_type == actual_file_type
def test_unknown_filepath_raises_value_error():
# arrange
file_path_str = "unknown/Example Asset Data.xlsx"
# act + assert
with pytest.raises(ValueError):
detect_file_type(file_path_str)

View file

@ -0,0 +1,9 @@
FROM public.ecr.aws/lambda/python:3.10
# Set working directory (Lambda task root)
WORKDIR /var/task
# -----------------------------
# Lambda handler
# -----------------------------
CMD ["main.handler"]

View file

@ -1,10 +1,12 @@
import pandas as pd
import requests
from backend.address2UPRN.main import resolve_uprns_for_postcode_group, get_epc_data_with_postcode
from backend.address2UPRN.main import (
resolve_uprns_for_postcode_group,
get_epc_data_with_postcode,
)
from tqdm import tqdm
def sanitise_postcode(postcode: str) -> str | None:
"""
Normalise postcode for grouping.
@ -51,11 +53,7 @@ def main():
# --- validate AFTER grouping (save API calls) ---
# Get unique, non-null postcodes
unique_postcodes = (
df["postcode_clean"]
.dropna()
.unique()
)
unique_postcodes = df["postcode_clean"].dropna().unique()
# Validate each postcode once, TODOadd a progress bar
postcode_validity = {
@ -66,7 +64,6 @@ def main():
# Map validity back onto dataframe
df["postcode_valid"] = df["postcode_clean"].map(postcode_validity)
results = []
for postcode, group_df in tqdm(
@ -98,17 +95,33 @@ def main():
results.append(tmp)
final_df = pd.concat(results, ignore_index=True)
a = final_df[[
"best_match_lexiscore","Address 1",
"best_match_address", "Postcode",
"UPRN", "best_match_uprn"
]] # add levi score to viewing
b = final_df[final_df["best_match_lexiscore"]>0] # add levi score to viewing
b = b[[
"best_match_lexiscore","Address 1",
"best_match_address", "Postcode",
"UPRN", "best_match_uprn"
]]
a = final_df[
[
"best_match_lexiscore",
"Address 1",
"best_match_address",
"Postcode",
"UPRN",
"best_match_uprn",
]
] # add levi score to viewing
b = final_df[final_df["best_match_lexiscore"] > 0] # add levi score to viewing
b = b[
[
"best_match_lexiscore",
"Address 1",
"best_match_address",
"Postcode",
"UPRN",
"best_match_uprn",
]
]
def handler(event, context):
print("hello Postcode splitter world")
return {"statusCode": 200, "body": "hello world"}
if __name__ == "__main__":
main()

View file

@ -0,0 +1,43 @@
data "aws_secretsmanager_secret_version" "db_credentials" {
secret_id = "${var.stage}/assessment_model/db_credentials"
}
data "terraform_remote_state" "shared" {
backend = "s3"
config = {
bucket = "assessment-model-terraform-state"
key = "env:/${var.stage}/terraform.tfstate" # TODO: dont hardcode this
region = "eu-west-2"
}
}
locals {
db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)
}
module "lambda" {
source = "../modules/lambda_with_sqs"
name = "condition-etl"
stage = var.stage
image_uri = local.image_uri
timeout = 180
environment = merge(
{
STAGE = var.stage
LOG_LEVEL = "info"
DB_USERNAME = local.db_credentials.db_assessment_model_username
DB_PASSWORD = local.db_credentials.db_assessment_model_password
},
)
}
resource "aws_iam_role_policy_attachment" "attach_condition_etl_s3_read" {
role = module.lambda.role_name
policy_arn = data.terraform_remote_state.shared.outputs.condition_etl_s3_read_arn
}

View file

@ -0,0 +1,16 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 4.16"
}
}
backend "s3" {
bucket = "condition-etl-terraform-state"
key = "terraform.tfstate"
region = "eu-west-2"
}
required_version = ">= 1.2.0"
}

View file

@ -0,0 +1,27 @@
variable "lambda_name" {
type = string
description = "Logical name of the lambda (e.g. address2uprn)"
}
variable "stage" {
description = "Deployment stage (e.g. dev, prod)"
type = string
}
variable "ecr_repo_url" {
type = string
description = "ECR repository URL (no tag, no digest)"
}
variable "image_digest" {
type = string
description = "Image digest (sha256:...)"
}
locals {
image_uri = "${var.ecr_repo_url}@${var.image_digest}"
}
output "resolved_image_uri" {
value = local.image_uri
}

View file

@ -6,6 +6,10 @@ module "role" {
name = "${var.name}-lambda-${var.stage}"
}
output "role_name" {
value = module.role.role_name
}
############################################
# SQS queue + DLQ
############################################

View file

@ -0,0 +1,14 @@
module "lambda" {
source = "../modules/lambda_with_sqs"
name = "postcode-splitter"
stage = var.stage
image_uri = local.image_uri
environment = {
STAGE = var.stage
LOG_LEVEL = "info"
}
}

View file

@ -0,0 +1,16 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 4.16"
}
}
backend "s3" {
bucket = "postcode-splitter-terraform-state"
key = "terraform.tfstate"
region = "eu-west-2"
}
required_version = ">= 1.2.0"
}

View file

@ -0,0 +1,26 @@
variable "lambda_name" {
type = string
description = "Logical name of the lambda (e.g. address2uprn)"
}
variable "stage" {
description = "Deployment stage (e.g. dev, prod)"
type = string
}
variable "ecr_repo_url" {
type = string
description = "ECR repository URL (no tag, no digest)"
}
variable "image_digest" {
type = string
description = "Image digest (sha256:...)"
}
locals {
image_uri = "${var.ecr_repo_url}@${var.image_digest}"
}
output "resolved_image_uri" {
value = local.image_uri
}

View file

@ -298,10 +298,6 @@ module "address2uprn_state_bucket" {
}
output "address2uprn_state_bucket_name" {
value = module.address2uprn_state_bucket.bucket_name
}
module "address2uprn_registry" {
source = "../modules/container_registry"
name = "address2uprn"
@ -309,6 +305,62 @@ module "address2uprn_registry" {
}
output "address2uprn_repository_url" {
value = module.address2uprn_registry.repository_url
################################################
# Condition ETL Lambda ECR
################################################
module "condition_etl_state_bucket" {
source = "../modules/tf_state_bucket"
bucket_name = "condition-etl-terraform-state"
}
module "condition_etl_registry" {
source = "../modules/container_registry"
name = "condition-etl"
stage = var.stage
}
################################################
# Postcode Splitter Lambda ECR
################################################
module "postcode_splitter_state_bucket" {
source = "../modules/tf_state_bucket"
bucket_name = "postcode-splitter-terraform-state"
}
module "postcode_splitter_registry" {
source = "../modules/container_registry"
name = "postcode_splitter"
stage = var.stage
}
################################################
# Conidition data S3 bucket
################################################
module "condition_data_bucket" {
source = "../modules/s3"
bucketname = "condition-data-${var.stage}"
allowed_origins = var.allowed_origins
}
resource "aws_iam_policy" "condition_etl_s3_read" {
name = "ConditionETLReadS3"
description = "Allow Lambda to read objects from condition-data-${var.stage}"
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Action = ["s3:GetObject"]
Resource = "arn:aws:s3:::condition-data-${var.stage}/*"
}
]
})
}
output "condition_etl_s3_read_arn" {
value = aws_iam_policy.condition_etl_s3_read.arn
}

View file

@ -2,6 +2,10 @@
This script prepares the data for the financial model
"""
from dotenv import load_dotenv
load_dotenv(".env.local")
import pandas as pd
import numpy as np
from backend.app.utils import sap_to_epc
@ -24,12 +28,12 @@ from sqlalchemy import func
# PORTFOLIO_ID = 206
# SCENARIOS = [389]
PORTFOLIO_ID = 502 # Peabody
PORTFOLIO_ID = 524
SCENARIOS = [
986,
1009,
]
scenario_names = {
986: "EPC C",
1009: "EPC C; Most Economic",
}