Merge pull request #1107 from Hestia-Homes/feature/rewrite_task_handler

Feature/rewrite task handler and postcode splitter
This commit is contained in:
Jun-te Kim 2026-05-20 15:56:02 +01:00 committed by GitHub
commit 9ad4f3359f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
201 changed files with 3429 additions and 120 deletions

View file

@ -6,7 +6,7 @@ backend/.idea/*
backend/.env
recommendations/tests/*
model_data/tests/*
infrastructure/*
deployment/*
data_collection/*
node_modules/*
conservation_areas/*

View file

@ -62,20 +62,20 @@ jobs:
- uses: hashicorp/setup-terraform@v3
- name: Terraform Init
working-directory: infrastructure/terraform/shared
working-directory: deployment/terraform/shared
run: terraform init -reconfigure
- name: Terraform Workspace
working-directory: infrastructure/terraform/shared
working-directory: deployment/terraform/shared
run: terraform workspace select ${STAGE} || terraform workspace new ${STAGE}
- name: Terraform Plan
working-directory: infrastructure/terraform/shared
working-directory: deployment/terraform/shared
run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan
- name: Terraform Apply
if: env.TERRAFORM_APPLY == 'true'
working-directory: infrastructure/terraform/shared
working-directory: deployment/terraform/shared
run: terraform apply -auto-approve tfplan
# ============================================================
@ -101,7 +101,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: ara_engine
lambda_path: infrastructure/terraform/lambda/engine
lambda_path: deployment/terraform/lambda/engine
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: engine-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.ara_engine_image.outputs.image_digest }}
@ -150,7 +150,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: address2uprn
lambda_path: infrastructure/terraform/lambda/address2UPRN
lambda_path: deployment/terraform/lambda/address2UPRN
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.address2uprn_image.outputs.image_digest }}
@ -191,7 +191,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: postcodeSplitter
lambda_path: infrastructure/terraform/lambda/postcodeSplitter
lambda_path: deployment/terraform/lambda/postcodeSplitter
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.postcodeSplitter_image.outputs.image_digest }}
@ -231,7 +231,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: bulk_address2uprn_combiner
lambda_path: infrastructure/terraform/lambda/bulk_address2uprn_combiner
lambda_path: deployment/terraform/lambda/bulk_address2uprn_combiner
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: bulk_address2uprn_combiner-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.bulk_address2uprn_combiner_image.outputs.image_digest }}
@ -271,7 +271,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: condition-etl
lambda_path: infrastructure/terraform/lambda/condition-etl
lambda_path: deployment/terraform/lambda/condition-etl
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.condition_etl_image.outputs.image_digest }}
@ -311,7 +311,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: categorisation
lambda_path: infrastructure/terraform/lambda/categorisation
lambda_path: deployment/terraform/lambda/categorisation
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: categorisation-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.categorisation_image.outputs.image_digest }}
@ -351,7 +351,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: ordnanceSurvey
lambda_path: infrastructure/terraform/lambda/ordnanceSurvey
lambda_path: deployment/terraform/lambda/ordnanceSurvey
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: ordnance-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.ordnanceSurvey_image.outputs.image_digest }}
@ -386,7 +386,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: pashub_to_ara
lambda_path: infrastructure/terraform/lambda/pashub_to_ara
lambda_path: deployment/terraform/lambda/pashub_to_ara
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: pashub_to_ara-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.pashub_to_ara_image.outputs.image_digest }}
@ -419,7 +419,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: ara_fast_api
lambda_path: infrastructure/terraform/lambda/fast-api
lambda_path: deployment/terraform/lambda/fast-api
stage: ${{ needs.determine_stage.outputs.stage }}
terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
secrets:
@ -458,17 +458,17 @@ jobs:
- uses: hashicorp/setup-terraform@v3
- name: Terraform Init
working-directory: infrastructure/terraform/cdn_certificate
working-directory: deployment/terraform/cdn_certificate
run: terraform init -reconfigure
- name: Terraform Workspace
working-directory: infrastructure/terraform/cdn_certificate
working-directory: deployment/terraform/cdn_certificate
run: |
terraform workspace select $STAGE \
|| terraform workspace new $STAGE
- name: Terraform Plan
working-directory: infrastructure/terraform/cdn_certificate
working-directory: deployment/terraform/cdn_certificate
run: |
terraform plan \
-var="stage=${STAGE}" \
@ -476,7 +476,7 @@ jobs:
- name: Terraform Apply
if: env.TERRAFORM_APPLY == 'true'
working-directory: infrastructure/terraform/cdn_certificate
working-directory: deployment/terraform/cdn_certificate
run: terraform apply -auto-approve tfplan
@ -503,17 +503,17 @@ jobs:
- uses: hashicorp/setup-terraform@v3
- name: Terraform Init
working-directory: infrastructure/terraform/cdn
working-directory: deployment/terraform/cdn
run: terraform init -reconfigure
- name: Terraform Workspace
working-directory: infrastructure/terraform/cdn
working-directory: deployment/terraform/cdn
run: |
terraform workspace select $STAGE \
|| terraform workspace new $STAGE
- name: Terraform Plan
working-directory: infrastructure/terraform/cdn
working-directory: deployment/terraform/cdn
run: |
terraform plan \
-var="stage=${STAGE}" \
@ -521,7 +521,7 @@ jobs:
- name: Terraform Apply
if: env.TERRAFORM_APPLY == 'true'
working-directory: infrastructure/terraform/cdn
working-directory: deployment/terraform/cdn
run: terraform apply -auto-approve tfplan
# ============================================================
@ -562,7 +562,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: magic_plan
lambda_path: infrastructure/terraform/lambda/magic_plan
lambda_path: deployment/terraform/lambda/magic_plan
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: magic-plan-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.magic_plan_image.outputs.image_digest }}
@ -585,7 +585,7 @@ jobs:
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: hubspot-etl-to-ara
lambda_path: infrastructure/terraform/lambda/hubspot_deal_etl
lambda_path: deployment/terraform/lambda/hubspot_deal_etl
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: hubspot-etl-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.hubspot_etl_image.outputs.image_digest }}

View file

@ -60,3 +60,15 @@ jobs:
-e DB_PASSWORD=test \
-e DB_PORT=5432 \
model-test pytest -vv -m 'not integration'
# The DDD rewrite (tests/) defines SQLModel table classes that map to the
# same physical tables as the legacy backend models. Both sets share the
# one global SQLModel.metadata, so they cannot be imported into the same
# pytest process. It runs as a separate invocation until the legacy
# models are retired. Its DB is spawned in-process by pytest-postgresql,
# so no DB service or env is required.
- name: Run DDD tests
run: |
docker run --rm \
--network host \
model-test pytest -vv tests/

1
.gitignore vendored
View file

@ -121,6 +121,7 @@ celerybeat.pid
# Environments
.env
.env.local
.venv
env/
venv/

View file

@ -1,29 +0,0 @@
<!-- BACKLOG.MD MCP GUIDELINES START -->
<CRITICAL_INSTRUCTION>
## BACKLOG WORKFLOW INSTRUCTIONS
This project uses Backlog.md MCP for all task and project management activities.
**CRITICAL GUIDANCE**
- If your client supports MCP resources, read `backlog://workflow/overview` to understand when and how to use Backlog for this project.
- If your client only supports tools or the above request fails, call `backlog.get_backlog_instructions()` to load the tool-oriented overview. Use the `instruction` selector when you need `task-creation`, `task-execution`, or `task-finalization`.
- **First time working here?** Read the overview resource IMMEDIATELY to learn the workflow
- **Already familiar?** You should have the overview cached ("## Backlog.md Overview (MCP)")
- **When to read it**: BEFORE creating tasks, or when you're unsure whether to track work
These guides cover:
- Decision framework for when to create tasks
- Search-first workflow to avoid duplicates
- Links to detailed guides for task creation, execution, and finalization
- MCP tools reference
You MUST read the overview resource to understand the complete workflow. The information is NOT summarized here.
</CRITICAL_INSTRUCTION>
<!-- BACKLOG.MD MCP GUIDELINES END -->

View file

@ -1,33 +1,4 @@
<!-- BACKLOG.MD MCP GUIDELINES START -->
<CRITICAL_INSTRUCTION>
## BACKLOG WORKFLOW INSTRUCTIONS
This project uses Backlog.md MCP for all task and project management activities.
**CRITICAL GUIDANCE**
- If your client supports MCP resources, read `backlog://workflow/overview` to understand when and how to use Backlog for this project.
- If your client only supports tools or the above request fails, call `backlog.get_backlog_instructions()` to load the tool-oriented overview. Use the `instruction` selector when you need `task-creation`, `task-execution`, or `task-finalization`.
- **First time working here?** Read the overview resource IMMEDIATELY to learn the workflow
- **Already familiar?** You should have the overview cached ("## Backlog.md Overview (MCP)")
- **When to read it**: BEFORE creating tasks, or when you're unsure whether to track work
These guides cover:
- Decision framework for when to create tasks
- Search-first workflow to avoid duplicates
- Links to detailed guides for task creation, execution, and finalization
- MCP tools reference
You MUST read the overview resource to understand the complete workflow. The information is NOT summarized here.
</CRITICAL_INSTRUCTION>
<!-- BACKLOG.MD MCP GUIDELINES END -->
## Available Skills
Five Claude Code skills are installed in this repo's dev container. Each maps to a phase of the feature lifecycle.

View file

@ -4,7 +4,7 @@ model_data/local_data/
backend/node_modules/
backend/.idea/
backend/.env
infrastructure/
deployment/
data_collection/
node_modules/
conservation_areas/

View file

@ -23,7 +23,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve
|------|------------|------------------|
| **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" |
| **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" |
| **User Address** | A free-text address string provided by a user or imported from a customer dataset, before any normalisation or matching. | "user input", "raw address", "user_inputed_address" |
| **User Address** | A structured dataclass (`domain.addresses.user_address.UserAddress`) capturing a customer-supplied address: a free-text `user_address` line, a canonical `postcode` (sanitised on construction), and an optional `internal_reference`. The bare string sense -- the raw free-text address line as it arrives from upstream ingestion, before being wrapped -- remains valid when discussing CSV columns, API payloads, or other upstream contexts; in domain code, prefer the dataclass. | "user input", "raw address", "user_inputed_address" |
| **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" |
## Address Matching
@ -72,7 +72,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve
## Flagged ambiguities
- **"address"** appears as both the raw **User Address** (free-text from customer data) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1".
- **"address"** appears as both the raw **User Address** (free-text from customer data, or the structured `UserAddress` dataclass that wraps it) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1". Within `domain/`, **User Address** specifically means the `UserAddress` dataclass; in upstream ingestion contexts (CSV columns, SQS payloads) it can still mean the raw string sense.
- **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments.
- **"user_inputed_address"** in `backend/address2UPRN/main.py` is a misspelling and a synonym for **User Address** — the canonical term. New code should use `user_address`.
- **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter.

0
applications/__init__.py Normal file
View file

View file

@ -0,0 +1,34 @@
FROM public.ecr.aws/lambda/python:3.11
# Postgres host/port/database are baked into the image at build time from
# the deploy workflow's --build-arg values (GitHub Actions DEV_DB_* secrets),
# mirroring backend/postcode_splitter/handler/Dockerfile. They map onto the
# POSTGRES_* names PostgresConfig.from_env reads. Username/password are NOT
# baked in -- Terraform injects those as Lambda env vars from Secrets Manager.
ARG DEV_DB_HOST
ARG DEV_DB_PORT
ARG DEV_DB_NAME
ENV POSTGRES_HOST=${DEV_DB_HOST}
ENV POSTGRES_PORT=${DEV_DB_PORT}
ENV POSTGRES_DATABASE=${DEV_DB_NAME}
WORKDIR /var/task
COPY applications/postcode_splitter/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the layered source the handler imports from. The new splitter pulls
# only DDD-shaped packages — no pandas, no legacy backend/.
COPY domain/ domain/
COPY infrastructure/ infrastructure/
COPY orchestration/ orchestration/
COPY repositories/ repositories/
COPY utilities/ utilities/
COPY applications/ applications/
# Place the handler at the Lambda task root so the runtime can resolve
# ``main.handler`` without an extra package prefix.
COPY applications/postcode_splitter/handler.py /var/task/main.py
CMD ["main.handler"]

View file

@ -0,0 +1,52 @@
from __future__ import annotations
import os
from typing import Any
import boto3
from applications.postcode_splitter.postcode_splitter_trigger_body import (
PostcodeSplitterTriggerBody,
)
from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
from infrastructure.csv_s3_client import CsvS3Client
from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator
from orchestration.task_orchestrator import TaskOrchestrator
from repositories.user_address.user_address_csv_s3_repository import (
UserAddressCsvS3Repository,
)
from utilities.aws_lambda.subtask_handler import subtask_handler
@subtask_handler()
def handler(
body: dict[str, Any], context: Any, task_orchestrator: TaskOrchestrator
) -> dict[str, list[str]]:
trigger = PostcodeSplitterTriggerBody.model_validate(body)
bucket = os.environ["S3_BUCKET_NAME"]
queue_url = os.environ["ADDRESS2UPRN_QUEUE_URL"]
# boto3.client is overloaded per-service in the installed stubs; cast
# to Any so the strict-mode checker treats it as opaque.
boto3_client: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
boto_s3: Any = boto3_client("s3")
boto_sqs: Any = boto3_client("sqs")
csv_client = CsvS3Client(boto_s3, bucket)
user_address_repo = UserAddressCsvS3Repository(csv_client, bucket)
queue_client = Address2UprnQueueClient(boto_sqs, queue_url)
splitter = PostcodeSplitterOrchestrator(
task_orchestrator=task_orchestrator,
user_address_repo=user_address_repo,
queue_client=queue_client,
)
child_ids = splitter.split_and_dispatch(
parent_task_id=trigger.task_id,
parent_subtask_id=trigger.sub_task_id,
input_s3_uri=trigger.s3_uri,
)
return {"child_subtask_ids": [str(cid) for cid in child_ids]}

View file

@ -0,0 +1,34 @@
# Local-test environment for the postcode_splitter Lambda.
#
# cp .env.local.example .env.local then fill in the values below.
#
# .env.local is gitignored. The container hits REAL AWS and a REAL Postgres,
# so every value here points at infrastructure that actually exists.
#
# NOTE: the new DDD code uses different env var names than the repo root
# .env. The mapping (root .env name -> var here) is given per section.
# Keep comments on their own lines — docker-compose's env_file parser folds a
# trailing "# ..." into the value.
# --- Postgres (orchestration/default_orchestrator -> PostgresConfig.from_env) ---
# POSTGRES_HOST <- DB_HOST, PORT <- DB_PORT, USERNAME <- DB_USERNAME,
# PASSWORD <- DB_PASSWORD, DATABASE <- DB_NAME.
POSTGRES_HOST=
POSTGRES_PORT=5432
POSTGRES_USERNAME=
POSTGRES_PASSWORD=
POSTGRES_DATABASE=
# POSTGRES_DRIVER=psycopg2 (optional; defaults to psycopg2)
# --- Handler config (applications/postcode_splitter/handler.py) ---
# S3_BUCKET_NAME: bucket holding the input address CSV (root .env: DATA_BUCKET).
# ADDRESS2UPRN_QUEUE_URL: SQS queue the splitter fans batches out to; not in
# the root .env (Terraform sets it in prod).
S3_BUCKET_NAME=
ADDRESS2UPRN_QUEUE_URL=
# --- AWS credentials for boto3 (S3 + SQS clients) ---
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_DEFAULT_REGION=eu-west-2
# AWS_SESSION_TOKEN= (only if using temporary/SSO credentials)

View file

@ -0,0 +1,9 @@
services:
postcode-splitter:
build:
context: ../../../
dockerfile: applications/postcode_splitter/Dockerfile
ports:
- "9001:8080"
env_file:
- .env.local

View file

@ -0,0 +1,28 @@
#!/usr/bin/env python3
import json
import requests
HOST = "localhost"
PORT = "9001"
LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations"
payload = {
"Records": [
{
"body": json.dumps(
{
"task_id": "f4b3332f-c0cc-481f-96a5-d39860a647cf",
"sub_task_id": "14c042de-40c4-473b-8cd8-72c983a94a8d",
"s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv",
}
)
}
]
}
response = requests.post(LAMBDA_URL, json=payload)
print("Status code:", response.status_code)
print("Response:")
print(response.text)

View file

@ -0,0 +1,12 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")"
if [ ! -f .env.local ]; then
cp .env.local.example .env.local
echo "Created .env.local from the template — fill it in, then re-run." >&2
exit 1
fi
docker compose build --no-cache
docker compose up --force-recreate

View file

@ -0,0 +1,11 @@
from uuid import UUID
from pydantic import BaseModel, ConfigDict
class PostcodeSplitterTriggerBody(BaseModel):
model_config = ConfigDict(extra="allow")
task_id: UUID
sub_task_id: UUID
s3_uri: str

View file

@ -0,0 +1,4 @@
boto3
pydantic
sqlmodel
psycopg2-binary

View file

@ -79,23 +79,23 @@ def app():
"""
data_folder = "/workspaces/model/asset_list"
data_filename = "input.xlsx"
sheet_name = "Handovers"
postcode_column = "POSTCODE"
address1_column = "Full Addres"
data_filename = "lincs_address_list.xlsx"
sheet_name = "Sheet1"
postcode_column = "Postcode"
address1_column = "Deal Name"
address1_method = None
fulladdress_column = "Full Addres"
fulladdress_column = "Deal Name"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = "domna_found_uprn"
landlord_property_type = "PROPERTY TYPE" # Good to include if landlord gave
landlord_built_form = "Type Description" # Good to include if landlord gave
landlord_os_uprn = None
landlord_property_type = None # Good to include if landlord gave
landlord_built_form = None # Good to include if landlord gave
landlord_wall_construction = None
landlord_roof_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "PROP REF"
landlord_property_id = "landlord_id"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None
@ -468,9 +468,3 @@ def app():
asset_list.duplicated_addresses.to_excel(
writer, sheet_name="Duplicate Properties", index=False
)
for key,value in dict.items():
lsakjfldsa

View file

@ -8,4 +8,5 @@ boto3==1.35.44
sqlmodel
sqlalchemy==2.0.36
psycopg2-binary==2.9.10
pydantic-settings==2.6.0
pydantic-settings==2.6.0
httpx

View file

@ -10,7 +10,7 @@
### 2. Add infrastructure prerequisites (shared stack)
- Add a new ECR repository in:
infrastructure/terraform/shared/main.tf
deployment/terraform/shared/main.tf
- Create a PR to deploy this to main then dev in order to deploy the shared stack

View file

@ -40,20 +40,6 @@ module "lambda" {
LOG_LEVEL = "info"
DB_USERNAME = local.db_credentials.db_assessment_model_username
DB_PASSWORD = local.db_credentials.db_assessment_model_password
GOOGLE_SOLAR_API_KEY = "test"
SAP_PREDICTIONS_BUCKET = "test"
CARBON_PREDICTIONS_BUCKET = "test"
HEAT_PREDICTIONS_BUCKET = "test"
HEATING_KWH_PREDICTIONS_BUCKET = "test"
HOTWATER_KWH_PREDICTIONS_BUCKET = "test"
API_KEY = "test"
ENVIRONMENT = "test"
SECRET_KEY = "test"
PLAN_TRIGGER_BUCKET = "test"
DATA_BUCKET = "test"
EPC_AUTH_TOKEN = "test"
ENGINE_SQS_URL = "test"
ENERGY_ASSESSMENTS_BUCKET = "test"
ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url
S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name
},

Some files were not shown because too many files have changed in this diff Show more