diff --git a/.devcontainer/asset_list/Dockerfile b/.devcontainer/asset_list/Dockerfile new file mode 100644 index 00000000..72a5de53 --- /dev/null +++ b/.devcontainer/asset_list/Dockerfile @@ -0,0 +1,40 @@ +FROM python:3.11.10-bullseye + + +ARG USER=vscode +ARG DEBIAN_FRONTEND=noninteractive + +# 1) Toolchain + utilities for building libpostal +RUN apt-get update && apt-get install -y --no-install-recommends \ + sudo jq vim curl git ca-certificates \ + build-essential pkg-config automake autoconf libtool \ + && rm -rf /var/lib/apt/lists/* + +# # 2) Build and install libpostal from source +RUN git clone --depth 1 https://github.com/openvenues/libpostal /tmp/libpostal \ + && cd /tmp/libpostal \ + && ./bootstrap.sh \ + && ./configure --datadir=/usr/local/share/libpostal \ + && make -j"$(nproc)" \ + && make install \ + && ldconfig \ + && rm -rf /tmp/libpostal + +# 3) Create the user and grant sudo privileges +RUN useradd -m -s /usr/bin/bash ${USER} \ + && echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \ + && chmod 0440 /etc/sudoers.d/${USER} + +# # 4) Python deps - if you want to run assest list +ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 +ADD .devcontainer/asset_list/requirements.txt requirements2.txt +ADD asset_list/requirements.txt requirements1.txt +RUN cat requirements1.txt requirements2.txt >> requirements.txt + +RUN pip install -r requirements.txt +# 5) Workdir +WORKDIR /workspaces/model + +# 6) Make Python find your package +# Add project root to PYTHONPATH for all processes +ENV PYTHONPATH=/workspaces/model:${PYTHONPATH} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/asset_list/devcontainer.json similarity index 95% rename from .devcontainer/devcontainer.json rename to .devcontainer/asset_list/devcontainer.json index 5e23ae0d..4834d559 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/asset_list/devcontainer.json @@ -1,7 +1,7 @@ { - "name": "Basic Python", + "name": "SAL ENV", "dockerComposeFile": 
"docker-compose.yml", - "service": "model", + "service": "model-sal", "remoteUser": "vscode", "workspaceFolder": "/workspaces/model", "postStartCommand": "bash .devcontainer/post-install.sh", diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/asset_list/docker-compose.yml similarity index 59% rename from .devcontainer/docker-compose.yml rename to .devcontainer/asset_list/docker-compose.yml index 7f60d34d..06e4124d 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/asset_list/docker-compose.yml @@ -1,14 +1,14 @@ version: '3.8' services: - model: + model-sal: user: "${UID}:${GID}" build: - context: .. - dockerfile: .devcontainer/Dockerfile + context: ../.. + dockerfile: .devcontainer/asset_list/Dockerfile command: sleep infinity volumes: - - ..:/workspaces/model + - ../../:/workspaces/model networks: - model-net diff --git a/.devcontainer/post-install.sh b/.devcontainer/asset_list/post-install.sh similarity index 98% rename from .devcontainer/post-install.sh rename to .devcontainer/asset_list/post-install.sh index dc6da006..48fbfde1 100644 --- a/.devcontainer/post-install.sh +++ b/.devcontainer/asset_list/post-install.sh @@ -11,4 +11,4 @@ if os.path.exists(env_path): print("✔ Loaded .env into Jupyter kernel") else: print("⚠ No .env file found to load") -EOF \ No newline at end of file +EOF diff --git a/.devcontainer/asset_list/requirements.txt b/.devcontainer/asset_list/requirements.txt new file mode 100644 index 00000000..fe536a81 --- /dev/null +++ b/.devcontainer/asset_list/requirements.txt @@ -0,0 +1,23 @@ +fastapi==0.115.2 +sqlalchemy==2.0.36 +psycopg2-binary==2.9.10 +python-jose==3.3.0 +cryptography==43.0.3 +mangum==0.19.0 +# AWS +boto3==1.35.44 +# Data +openpyxl==3.1.2 +# Basic +pytz +uvicorn[standard] +# Testing +pytest==9.0.2 +pytest-cov==7.0.0 +ipykernel>=6.25,<7 +pyyaml>=6.0.1 +sqlmodel +# Formatting +black==26.1.0 +dotenv +pydantic-settings \ No newline at end of file diff --git a/.devcontainer/Dockerfile 
b/.devcontainer/backend/Dockerfile similarity index 96% rename from .devcontainer/Dockerfile rename to .devcontainer/backend/Dockerfile index ccfb55b6..4c5d16f5 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -34,7 +34,7 @@ RUN useradd -m -s /usr/bin/bash ${USER} \ ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 ADD backend/engine/requirements.txt requirements1.txt ADD backend/app/requirements/requirements.txt requirements2.txt -ADD .devcontainer/requirements.txt requirements3.txt +ADD .devcontainer/backend/requirements.txt requirements3.txt RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt RUN pip install -r requirements.txt diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json new file mode 100644 index 00000000..c672b1bf --- /dev/null +++ b/.devcontainer/backend/devcontainer.json @@ -0,0 +1,40 @@ +{ + "name": "Backend Model Env", + "dockerComposeFile": "docker-compose.yml", + "service": "model-backend", + "remoteUser": "vscode", + "workspaceFolder": "/workspaces/model", + "postStartCommand": "bash .devcontainer/backend/post-install.sh", + "mounts": [ + "source=${localEnv:HOME},target=/workspaces/home,type=bind" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-toolsai.jupyter", + "mechatroner.rainbow-csv", + "ms-toolsai.datawrangler", + "lindacong.vscode-book-reader", + "4ops.terraform", + "fabiospampinato.vscode-todo-plus", + "jgclark.vscode-todo-highlight", + "corentinartaud.pdfpreview", + "ms-python.vscode-python-envs", + "ms-python.black-formatter", + "waderyan.gitblame" + ], + "settings": { + "files.defaultWorkspace": "/workspaces/model", + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true + }, + "python.formatting.provider": "none" + } + } + }, + "containerEnv": { + "PYTHONFLAGS": "-Xfrozen_modules=off" + } +} diff --git 
a/.devcontainer/backend/docker-compose.yml b/.devcontainer/backend/docker-compose.yml new file mode 100644 index 00000000..683b4489 --- /dev/null +++ b/.devcontainer/backend/docker-compose.yml @@ -0,0 +1,28 @@ +version: '3.8' + +services: + model-backend: + user: "${UID}:${GID}" + build: + context: ../.. + dockerfile: .devcontainer/backend/Dockerfile + command: sleep infinity + volumes: + - ../../:/workspaces/model + + + db: + image: postgres:17.4 + restart: unless-stopped + ports: + - 5432:5432 + environment: + - PGDATABASE=tech_team_local_db + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=makingwarmerhomes + volumes: + - postgres-data-two:/var/lib/postgresql/data + + +volumes: + postgres-data-two: \ No newline at end of file diff --git a/.devcontainer/backend/post-install.sh b/.devcontainer/backend/post-install.sh new file mode 100644 index 00000000..48fbfde1 --- /dev/null +++ b/.devcontainer/backend/post-install.sh @@ -0,0 +1,14 @@ +mkdir -p ~/.ipython/profile_default/startup + +cat << 'EOF' > ~/.ipython/profile_default/startup/00-load-env.py +from dotenv import load_dotenv +import os + +# Adjust path as needed +env_path = "/workspaces/model/backend/.env" +if os.path.exists(env_path): + load_dotenv(env_path) + print("✔ Loaded .env into Jupyter kernel") +else: + print("⚠ No .env file found to load") +EOF diff --git a/.devcontainer/requirements.txt b/.devcontainer/backend/requirements.txt similarity index 96% rename from .devcontainer/requirements.txt rename to .devcontainer/backend/requirements.txt index 5e7753a6..9562aa6a 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/backend/requirements.txt @@ -1,4 +1,4 @@ -# fastapi + fastapi==0.115.2 sqlalchemy==2.0.36 pydantic-settings==2.6.0 diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml new file mode 100644 index 00000000..408c0319 --- /dev/null +++ b/.github/workflows/_build_image.yml @@ -0,0 +1,107 @@ +name: Build Docker image + +on: + workflow_call: + inputs: + 
ecr_repo: + required: true + type: string + dockerfile_path: + required: true + type: string + build_context: + required: false + default: "." + type: string + build_args: + required: false + type: string + + outputs: + image_digest: + description: "Pushed image digest" + value: ${{ jobs.build.outputs.image_digest }} + ecr_repo_url: + description: "ECR repository URL" + value: ${{ jobs.build.outputs.ecr_repo_url }} + + secrets: + AWS_ACCESS_KEY_ID: + required: true + AWS_SECRET_ACCESS_KEY: + required: true + AWS_REGION: + required: true + DEV_DB_HOST: + required: false + DEV_DB_PORT: + required: false + DEV_DB_NAME: + required: false + +jobs: + build: + runs-on: ubuntu-latest + + env: + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} + + outputs: + image_digest: ${{ steps.digest.outputs.image_digest }} + ecr_repo_url: ${{ steps.repo.outputs.ecr_repo_url }} + + steps: + - uses: actions/checkout@v4 + + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + + - uses: aws-actions/amazon-ecr-login@v2 + + - name: Resolve ECR repo URL + id: repo + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) + + ECR_REPO_URL="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${{ inputs.ecr_repo }}" + + echo "Resolved ECR repo URL (local var):" + echo "$ECR_REPO_URL" + + echo "ecr_repo_url=$ECR_REPO_URL" >> "$GITHUB_OUTPUT" + + - name: Build & push image + run: | + IMAGE_URI="${{ steps.repo.outputs.ecr_repo_url }}:${GITHUB_SHA}" + + # Writes build args and removes line breaks + BUILD_ARGS="" + while IFS= read -r line; do + # skip empty lines + [ -n "$line" ] || continue + temp=$(eval echo "$line") + BUILD_ARGS="$BUILD_ARGS --build-arg $temp" + done <<< "${{ inputs.build_args }}" + + docker build \ + -f ${{ 
inputs.dockerfile_path }} \ + $BUILD_ARGS \ + -t $IMAGE_URI \ + ${{ inputs.build_context }} + + docker push $IMAGE_URI + + - name: Resolve image digest + id: digest + run: | + DIGEST=$(aws ecr describe-images \ + --repository-name ${{ inputs.ecr_repo }} \ + --image-ids imageTag=${GITHUB_SHA} \ + --query 'imageDetails[0].imageDigest' \ + --output text) + echo "image_digest=$DIGEST" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml new file mode 100644 index 00000000..bff106c5 --- /dev/null +++ b/.github/workflows/_deploy_lambda.yml @@ -0,0 +1,91 @@ +name: Deploy Lambda (Terraform) + +on: + workflow_call: + inputs: + lambda_name: + required: true + type: string + + lambda_path: + required: true + type: string + + stage: + required: true + type: string + + ecr_repo: + required: true + type: string + + image_digest: + required: true + type: string + + secrets: + AWS_ACCESS_KEY_ID: + required: true + AWS_SECRET_ACCESS_KEY: + required: true + AWS_REGION: + required: true + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Debug inputs + run: | + echo "lambda_name=${{ inputs.lambda_name }}" + echo "lambda_path=${{ inputs.lambda_path }}" + echo "stage=${{ inputs.stage }}" + echo "ecr_repo_url=${{ inputs.ecr_repo_url }}" + echo "image_digest=${{ inputs.image_digest }}" + + + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + + - uses: hashicorp/setup-terraform@v3 + + - uses: aws-actions/amazon-ecr-login@v2 + + - name: Resolve ECR repo URL + id: repo + env: + AWS_REGION: ${{ secrets.AWS_REGION }} + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) + ECR_REPO_URL="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${{ inputs.ecr_repo }}" + echo "ecr_repo_url=$ECR_REPO_URL" >> 
"$GITHUB_OUTPUT" + + - name: Terraform Init + working-directory: ${{ inputs.lambda_path }} + run: terraform init -reconfigure + + - name: Terraform Workspace + working-directory: ${{ inputs.lambda_path }} + run: | + terraform workspace select ${{ inputs.stage }} \ + || terraform workspace new ${{ inputs.stage }} + + - name: Terraform Plan + working-directory: ${{ inputs.lambda_path }} + run: | + terraform plan \ + -var="stage=${{ inputs.stage }}" \ + -var="lambda_name=${{ inputs.lambda_name }}" \ + -var="ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}" \ + -var="image_digest=${{ inputs.image_digest }}" \ + -out=lambdaplan + + - name: Terraform Apply + working-directory: ${{ inputs.lambda_path }} + run: terraform apply -auto-approve lambdaplan diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index a7aef225..4ac08e41 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -1,80 +1,172 @@ -name: Deploy terraform stack +name: Deploy infrastructure on: push: branches: - - dev - - prod + - "**" jobs: - deploy: + determine_stage: runs-on: ubuntu-latest + outputs: + stage: ${{ steps.set-stage.outputs.stage }} + steps: - - name: Checkout - uses: actions/checkout@v2 - - - name: Setup AWS credentials file + - name: Determine stage from branch + id: set-stage + shell: bash run: | - mkdir -p ~/.aws - echo "[DevAdmin]" > ~/.aws/credentials - echo "aws_access_key_id = ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}" >> ~/.aws/credentials - echo "aws_secret_access_key = ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}" >> ~/.aws/credentials - echo "[ProdAdmin]" >> ~/.aws/credentials - echo "aws_access_key_id = ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}" >> ~/.aws/credentials - echo "aws_secret_access_key = ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}" >> ~/.aws/credentials + env + BRANCH="${GITHUB_REF_NAME}" - - name: Setup AWS config file - run: | - echo "[profile DevAdmin]" > ~/.aws/config - echo "region = 
eu-west-2" >> ~/.aws/config - echo "[profile ProdAdmin]" >> ~/.aws/config - echo "region = eu-west-2" >> ~/.aws/config + if [[ "$BRANCH" == "prod" ]]; then + echo "stage=prod" >> "$GITHUB_OUTPUT" - - name: Setup Terraform - uses: hashicorp/setup-terraform@v1 - with: - terraform_version: 1.5.2 + elif [[ "$BRANCH" == "dev" ]]; then + echo "stage=dev" >> "$GITHUB_OUTPUT" - - name: Configure AWS credentials (DevAdmin) - uses: aws-actions/configure-aws-credentials@v1 + else + echo "stage=dev" >> "$GITHUB_OUTPUT" + fi + + # ============================================================ + # 1️⃣ Shared Terraform (infra) + # ============================================================ + shared_terraform: + needs: determine_stage + runs-on: ubuntu-latest + env: + STAGE: ${{ needs.determine_stage.outputs.stage }} + + steps: + - uses: actions/checkout@v4 + + - uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} - aws-region: eu-west-2 - env: - AWS_PROFILE: "DevAdmin" + aws-region: ${{ secrets.DEV_AWS_REGION }} + + - uses: hashicorp/setup-terraform@v3 - name: Terraform Init - run: cd infrastructure/terraform && terraform init + working-directory: infrastructure/terraform/shared + run: terraform init -reconfigure - name: Terraform Workspace - run: | - BRANCH_NAME=$(echo "${{ github.ref }}" | sed -e "s/^refs\/heads\///") - cd infrastructure/terraform - terraform workspace select ${BRANCH_NAME} || terraform workspace new ${BRANCH_NAME} + working-directory: infrastructure/terraform/shared + run: terraform workspace select ${STAGE} || terraform workspace new ${STAGE} - name: Terraform Plan - run: | - BRANCH_NAME=$(echo "${{ github.ref }}" | sed -e "s/^refs\/heads\///") - cd infrastructure/terraform && terraform plan -var-file=${BRANCH_NAME}.tfvars + working-directory: infrastructure/terraform/shared + run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan - - 
name: Deploy to Dev - if: github.ref == 'refs/heads/dev' - run: cd infrastructure/terraform && terraform apply -var-file=dev.tfvars -auto-approve - env: - name: dev + - name: Terraform Apply + if: env.STAGE == 'prod' + working-directory: infrastructure/terraform/shared + run: terraform apply -auto-approve tfplan - - name: Configure AWS credentials (ProdAdmin) - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }} - aws-region: eu-west-2 - env: - AWS_PROFILE: "ProdAdmin" + # ============================================================ + # 2️⃣ Build Address 2 UPRN image and Push + # ============================================================ + address2uprn_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: backend/address2UPRN/handler/Dockerfile + build_context: . 
+ secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + # ============================================================ + # 3️⃣ Deploy Address 2 UPRN Lambda + # ============================================================ + address2uprn_lambda: + needs: [address2uprn_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: address2uprn + lambda_path: infrastructure/terraform/lambda/address2UPRN + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.address2uprn_image.outputs.image_digest }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + + # ============================================================ + # 2️⃣ Build Postcode Splitter image and Push + # ============================================================ + postcodeSplitter_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: backend/postcode_splitter/handler/Dockerfile + build_context: . 
+ secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + # ============================================================ + # 3️⃣ Deploy Postcode Splitter Lambda + # ============================================================ + postcodeSplitter_lambda: + needs: [postcodeSplitter_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: postcodeSplitter + lambda_path: infrastructure/terraform/lambda/postcodeSplitter + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.postcodeSplitter_image.outputs.image_digest }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + # ============================================================ + # Condition ETL image and Push + # ============================================================ + condition_etl_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: backend/condition/handler/Dockerfile + build_context: . 
+ build_args: | + DEV_DB_HOST=$DEV_DB_HOST + DEV_DB_PORT=$DEV_DB_PORT + DEV_DB_NAME=$DEV_DB_NAME + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} + + # ============================================================ + # Deploy Condition ETL Lambda + # ============================================================ + condition_etl_lambda: + needs: [condition_etl_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: condition-etl + lambda_path: infrastructure/terraform/lambda/condition-etl + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.condition_etl_image.outputs.image_digest }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} - - name: Deploy to Prod - if: github.ref == 'refs/heads/prod' - run: cd infrastructure/terraform && terraform apply -var-file=prod.tfvars -auto-approve - env: - name: prod diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 95155c86..14d5a06f 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -2,6 +2,12 @@ name: Run unit tests on: pull_request: + branches: + - "**" + push: + branches: + - "**" + jobs: test: diff --git a/.vscode/settings.json b/.vscode/settings.json index 88c2ae2d..3d4c6b42 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -9,9 +9,12 @@ "path": "/bin/bash" } }, +<<<<<<< HEAD +======= "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "python.testing.pytestArgs": ["-s", "-q", "--no-cov"] +>>>>>>> 
11b482838efcf46f376fd3ecbf2c1bb0be6d097d // Hot reload setting that needs to be in user settings // "jupyter.runStartupCommands": [ diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 940c723a..ea4d8b34 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -34,7 +34,8 @@ from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes logger = setup_logger() # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "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") + class DataRemapper: @@ -1159,13 +1160,17 @@ class AssetList: ), axis=1 ) + + col = self.EPC_API_DATA_NAMES["roof-description"] self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply( - lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[ + lambda x: RoofAttributes(description=x[col]).process()[ "insulation_thickness"] if not pd.isnull( - x[self.EPC_API_DATA_NAMES["roof-description"]]) else None, + x[col]) else None, axis=1 ) + + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = ( self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].str.replace("+", "") ) diff --git a/asset_list/DataMapper.py b/asset_list/DataMapper.py index ac1b8db3..0751a7cf 100644 --- a/asset_list/DataMapper.py +++ b/asset_list/DataMapper.py @@ -1,5 +1,5 @@ # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "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") class DataRemapper: diff --git 
a/asset_list/__init__.py b/asset_list/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/asset_list/app.py b/asset_list/app.py index 01906c5f..30172121 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -14,22 +14,32 @@ from dotenv import load_dotenv from backend.SearchEpc import SearchEpc load_dotenv(dotenv_path="backend/.env") -EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +EPC_AUTH_TOKEN = os.getenv( + "EPC_AUTH_TOKEN", +) -def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"): +def extract_address1( + asset_list, full_address_col, postcode_col, method="first_two_words" +): if method == "first_two_words": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + asset_list["address1_extracted"] = ( + asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + ) return asset_list if method == "first_word": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + asset_list["address1_extracted"] = ( + asset_list[full_address_col].str.split(" ").str[0] + ) return asset_list if method == "house_number_extraction": asset_list["address1_extracted"] = asset_list.apply( - lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), - axis=1 + lambda x: SearchEpc.get_house_number( + address=x[full_address_col], postcode=x[postcode_col] + ), + axis=1, ) return asset_list @@ -59,24 +69,24 @@ def app(): Property UPRN """ - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hackney" - data_filename = "Domna SHF Wave 3 (3).xlsx" - sheet_name = "Domna Wave 3" - postcode_column = 'Postcode' - address1_column = "Address 1" - address1_method = None - fulladdress_column = None - address_cols_to_concat = ["Address 1"] + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Aspire" + data_filename = "ASPIRE ASSET LIST.xlsx" + sheet_name = "Asset List" + postcode_column 
= "Postcode" + address1_column = None + address1_method = "house_number_extraction" + fulladdress_column = "Address" + address_cols_to_concat = [] missing_postcodes_method = None - landlord_year_built = "Construction Years" - landlord_os_uprn = "UPRN" - landlord_property_type = "Type" - landlord_built_form = "Attachment" - landlord_wall_construction = "Wall type" + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_built_form = None + landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "Row ID" + landlord_property_id = "LLUPRN" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -93,25 +103,27 @@ def app(): landlord_block_reference = None # Peabody data for cleaning - data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " - "Project/data_validation") + data_folder = ( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " + "Project/data_validation" + ) data_filename = "to_standardise_uprns.xlsx" sheet_name = "Sheet1" - postcode_column = 'Postcode' - address1_column = "Address 1" - address1_method = None - fulladdress_column = None - address_cols_to_concat = ["Address 1", "Address 2", "Address 3"] + postcode_column = "Postcode" + address1_column = None + address1_method = "house_number_extraction" + fulladdress_column = "Address" + address_cols_to_concat = None missing_postcodes_method = None landlord_year_built = None landlord_os_uprn = None - landlord_property_type = "Type" - landlord_built_form = "Attachment" + landlord_property_type = None + landlord_built_form = None landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "Org Ref" + landlord_property_id = "LLUPRN" landlord_sap = None outcomes_filename = None 
outcomes_sheetname = None @@ -127,40 +139,6 @@ def app(): asset_list_header = 0 landlord_block_reference = None - # Lambeth: - # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lambeth/December 10th" - # data_filename = "lambeth_sw2_leigham court estate.xlsx" - # sheet_name = "Sheet1" - # postcode_column = 'Postcode' - # address1_column = "Address" - # address1_method = None - # fulladdress_column = None - # address_cols_to_concat = ["Address"] - # missing_postcodes_method = None - # landlord_year_built = None - # landlord_os_uprn = None - # landlord_property_type = None - # landlord_built_form = None - # landlord_wall_construction = None - # landlord_roof_construction = None - # landlord_heating_system = None - # landlord_existing_pv = None - # landlord_property_id = "row_id" - # landlord_sap = None - # outcomes_filename = None - # outcomes_sheetname = None - # outcomes_postcode = None - # outcomes_houseno = None - # outcomes_id = None - # outcomes_address = None - # master_filepaths = [] - # master_id_colnames = [] - # master_to_asset_list_filepath = None - # phase = False - # ecosurv_landlords = None - # asset_list_header = 0 - # landlord_block_reference = None - # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -185,49 +163,62 @@ def app(): landlord_existing_pv=landlord_existing_pv, landlord_sap=landlord_sap, landlord_block_reference=landlord_block_reference, - phase=phase + phase=phase, ) asset_list.init_standardise() # We produce the new maps, which can be saved for future useage new_property_type_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_property_type] if - asset_list.landlord_property_type else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_property_type] + if asset_list.landlord_property_type + else {} ).items() if k not in PROPERTY_MAPPING } new_built_form_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_built_form] 
if - asset_list.landlord_built_form else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_built_form] + if asset_list.landlord_built_form + else {} ).items() if k not in BUILT_FORM_MAPPINGS } new_wall_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_wall_construction] if - asset_list.landlord_wall_construction else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_wall_construction] + if asset_list.landlord_wall_construction + else {} ).items() if k not in WALL_CONSTRUCTION_MAPPINGS } new_heating_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_heating_system] if - asset_list.landlord_heating_system else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_heating_system] + if asset_list.landlord_heating_system + else {} ).items() if k not in HEATING_MAPPINGS } new_existing_pv_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_existing_pv] if asset_list.landlord_existing_pv else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_existing_pv] + if asset_list.landlord_existing_pv + else {} ).items() if k not in EXISTING_PV_MAPPINGS } new_roof_construction_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_roof_construction] if - asset_list.landlord_roof_construction else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_roof_construction] + if asset_list.landlord_roof_construction + else {} ).items() if k not in ROOF_CONSTRUCTION_MAPPINGS } @@ -241,7 +232,7 @@ def app(): outcomes_address=outcomes_address, outcomes_postcode=outcomes_postcode, outcomes_houseno=outcomes_houseno, - outcomes_id=outcomes_id + outcomes_id=outcomes_id, ) asset_list.flag_survey_master( @@ -275,14 +266,16 @@ def app(): skip = max(chunk_indexes) if any(x in folder_contents for x in downloaded_files): - skip = max([i for i in chunk_indexes if 
filename.format(i=i) in folder_contents]) + skip = max( + [i for i in chunk_indexes if filename.format(i=i) in folder_contents] + ) for i in range(0, len(asset_list.standardised_asset_list), chunk_size): print(f"Processing chunk {i} to {i + chunk_size}") if skip is not None and not force_retrieve_data: if i <= skip: continue - chunk = asset_list.standardised_asset_list[i:i + chunk_size] + chunk = asset_list.standardised_asset_list[i: i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, row_id_name=asset_list.DOMNA_PROPERTY_ID, @@ -294,7 +287,7 @@ def app(): built_form_column=AssetList.STANDARD_BUILT_FORM, manual_uprn_map=manual_uprn_map, epc_api_only=epc_api_only, - epc_auth_token=EPC_AUTH_TOKEN + epc_auth_token=EPC_AUTH_TOKEN, ) # We now retrieve any failed properties @@ -317,7 +310,9 @@ def app(): # Append the failed data to the main data # Store the chunk locally as a csv - pd.DataFrame(epc_data_chunk).to_csv(os.path.join(data_folder, f"Chunks/Chunk {i}.csv"), index=False) + pd.DataFrame(epc_data_chunk).to_csv( + os.path.join(data_folder, f"Chunks/Chunk {i}.csv"), index=False + ) # Store the errors and no-data locally with open(os.path.join(data_folder, f"Chunks/Chunk {i} errors.json"), "w") as f: json.dump(errors_chunk, f) @@ -348,7 +343,9 @@ def app(): unique_recommendations = set() for _, row in recommendations_df.iterrows(): - unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + unique_recommendations.update( + [rec["improvement-summary-text"] for rec in row["recommendations"]] + ) columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations) transformed_data = [] @@ -368,20 +365,24 @@ def app(): transformed_df = pd.DataFrame(transformed_data) for col in [ "Floor insulation (solid floor)", - "Floor insulation", "Floor insulation (suspended floor)" + "Floor insulation", + "Floor insulation (suspended floor)", ]: if col not in transformed_df.columns: transformed_df[col] = 
False transformed_df = transformed_df[ [ - asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)", - "Floor insulation", "Floor insulation (suspended floor)" + asset_list.DOMNA_PROPERTY_ID, + "Floor insulation (solid floor)", + "Floor insulation", + "Floor insulation (suspended floor)", ] ] transformed_df["epc_has_floor_recommendation"] = ( - transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] | - transformed_df["Floor insulation (suspended floor)"] + transformed_df["Floor insulation (solid floor)"] + | transformed_df["Floor insulation"] + | transformed_df["Floor insulation (suspended floor)"] ) # Get the find my epc data @@ -394,21 +395,20 @@ def app(): find_my_epc_data.append( { asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID], - **x["find_my_epc_data"] + **x["find_my_epc_data"], } ) else: find_my_epc_data.append( - { - asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID] - } + {asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID]} ) find_my_epc_data = pd.DataFrame(find_my_epc_data) find_my_epc_data = find_my_epc_data.merge( transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]], - how="left", on=asset_list.DOMNA_PROPERTY_ID + how="left", + on=asset_list.DOMNA_PROPERTY_ID, ) # We check if we get the solar pv column: @@ -418,27 +418,33 @@ def app(): # Retrieve just the data we need epc_df = epc_df[ [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) - ].rename( - columns=asset_list.EPC_API_DATA_NAMES - ) + ].rename(columns=asset_list.EPC_API_DATA_NAMES) # Look for columns not in the find my EPC data, which will have happened if we didn't # retrieve it in the first place - missed_find_epc_cols = [c for c in list(asset_list.FIND_EPC_DATA_NAMES.keys()) if c not in find_my_epc_data.columns] + missed_find_epc_cols = [ + c + for c in list(asset_list.FIND_EPC_DATA_NAMES.keys()) + if c not in find_my_epc_data.columns + ] if missed_find_epc_cols: for c 
in missed_find_epc_cols: find_my_epc_data[c] = None epc_df = epc_df.merge( find_my_epc_data[ - [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) - ] - .rename(columns=asset_list.FIND_EPC_DATA_NAMES), + [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + + list(asset_list.FIND_EPC_DATA_NAMES.keys()) + ].rename(columns=asset_list.FIND_EPC_DATA_NAMES), how="left", - on=asset_list.DOMNA_PROPERTY_ID + on=asset_list.DOMNA_PROPERTY_ID, ) asset_list.merge_data(epc_df) + # asset_list.standardised_asset_list = asset_list.standardised_asset_list[ + # asset_list.standardised_asset_list["domna_full_address"] + # != "120 Airdrie Crescent, Burnley, Lancashire" + # ] asset_list.extract_attributes() asset_list.identify_worktypes() @@ -448,7 +454,10 @@ def app(): asset_list.get_work_figures() # Store as an excel - filename = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx" + filename = ( + os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + + " - Standardised.xlsx" + ) # Store the data in two tabs. 
One for the asset list with the EPC data and the second with the flat data # Determine inspections priority @@ -472,26 +481,42 @@ def app(): # ) with pd.ExcelWriter(filename) as writer: - asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) + asset_list.standardised_asset_list.to_excel( + writer, sheet_name="Standardised Asset List", index=False + ) if asset_list.block_analysis_df is not None: - asset_list.block_analysis_df.to_excel(writer, sheet_name="Block Analysis", index=False) + asset_list.block_analysis_df.to_excel( + writer, sheet_name="Block Analysis", index=False + ) # If we have outcomes, we add a tab with the outcomes if not asset_list.outcomes_for_output.empty: - asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False) + asset_list.outcomes_for_output.to_excel( + writer, sheet_name="Outcomes", index=False + ) if not asset_list.unmatched_submissions.empty: - asset_list.unmatched_submissions.to_excel(writer, sheet_name="Unmatched Submissions", index=False) + asset_list.unmatched_submissions.to_excel( + writer, sheet_name="Unmatched Submissions", index=False + ) if not asset_list.outcomes_no_match.empty: - asset_list.outcomes_no_match.to_excel(writer, sheet_name="Unmatched Outcomes", index=False) + asset_list.outcomes_no_match.to_excel( + writer, sheet_name="Unmatched Outcomes", index=False + ) if not asset_list.ecosurv_no_match.empty: - asset_list.ecosurv_no_match.to_excel(writer, sheet_name="Unmatched Ecosurv", index=False) + asset_list.ecosurv_no_match.to_excel( + writer, sheet_name="Unmatched Ecosurv", index=False + ) if not asset_list.geographical_areas.empty: - asset_list.geographical_areas.to_excel(writer, sheet_name="Geographical Areas", index=False) + asset_list.geographical_areas.to_excel( + writer, sheet_name="Geographical Areas", index=False + ) # Store dupes if asset_list.duplicated_addresses is not None: if not asset_list.duplicated_addresses.empty: - 
asset_list.duplicated_addresses.to_excel(writer, sheet_name="Duplicate Properties", index=False) + asset_list.duplicated_addresses.to_excel( + writer, sheet_name="Duplicate Properties", index=False + ) diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index a9defdef..d6466539 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -520,4 +520,14 @@ BUILT_FORM_MAPPINGS = { '2.EXT.WALL FLAT': 'mid-terrace', '2 EXT. WALL FLAT': 'mid-terrace', + 'Maisonette: Detached: Ground Floor': 'detached', + 'Maisonette: Enclosed End Terrace: Top Floor': 'enclosed end-terrace', + 'Flat: End Terrace: Basement': 'end-terrace', + 'Flat: Mid Terrace: Basement': 'mid-terrace', + 'Flat: Enclosed Mid Terrace: Basement': 'enclosed mid-terrace', + 'House: Semi Detached: Top Floor': 'semi-detached', + 'House: End Terrace: Ground Floor': 'end-terrace', + 'Maisonette: Enclosed End Terrace: Mid Floor': 'enclosed end-terrace', + 'Bungalow: EnclosedEndTerrace': 'enclosed end-terrace' + } diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py index e67fafb4..defce35f 100644 --- a/asset_list/mappings/exising_pv.py +++ b/asset_list/mappings/exising_pv.py @@ -17,5 +17,10 @@ EXISTING_PV_MAPPINGS = { 'PV: 10% roof area, PV: 2kWp array': 'already has PV', 'PV: 50% roof area': 'already has PV', 'Solar PV': 'already has PV', - 'SOLAR PV': 'already has PV' + 'SOLAR PV': 'already has PV', + + 'PV: 40% roof area, PV: 2kWp array': 'already has PV', + 'PV: 33% roof area, PV: 2kWp array': 'already has PV', + 'PV: 30% roof area': 'already has PV' + } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index ffd1b198..272d6279 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -494,6 +494,10 @@ HEATING_MAPPINGS = { 'Gas (including LPG) room heaters: Gas fire, open flue, 1980 or later (open fronted), sitting proud of, ' 'and sealed 
to, fireplace opening': 'room heaters', 'Boiler: A rated Regular Boiler, System 2: Boiler: C rated Regular Boiler': 'boiler - other fuel', - 'Boiler: G rated Combi': 'gas condensing combi' + 'Boiler: G rated Combi': 'gas condensing combi', + + 'Boiler: A rated Combi, System 2: Boiler: A rated Combi': 'gas combi boiler', + 'System 2: Boiler: A rated Regular Boiler, Boiler: A rated Regular Boiler': 'gas boiler, radiators', + 'Boiler: A rated Combi, System 2: Boiler: C rated Combi': 'gas combi boiler' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index 1f251598..177a7549 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -427,6 +427,23 @@ PROPERTY_MAPPING = { 'End Terrace': 'unknown', 'Detached': 'unknown', 'Mid-terrace': 'unknown', - 'MID - TERRACE': 'unknown' + 'MID - TERRACE': 'unknown', + 'COMOFF': 'unknown', + 'LOTS': 'unknown', + + 'Maisonette: Detached: Ground Floor': 'maisonette', + 'Maisonette: Enclosed End Terrace: Top Floor': 'maisonette', + 'Flat: End Terrace: Basement': 'flat', + 'Bungalow: EnclosedEndTerrace': 'bungalow', + 'Flat: Mid Terrace: Basement': 'flat', + 'House: Semi Detached: Top Floor': 'house', + 'House: End Terrace: Ground Floor': 'house', + 'Maisonette: Enclosed End Terrace: Mid Floor': 'maisonette', + 'Flat: Enclosed Mid Terrace: Basement': 'flat', + + 'Warden Bungalow': 'bungalow', + 'Warden Flat': 'flat', + 'Upper Floor Flat': 'flat', + 'Extracare Scheme': 'other' } diff --git a/asset_list/mappings/roof.py b/asset_list/mappings/roof.py index 0857b046..cf829a5f 100644 --- a/asset_list/mappings/roof.py +++ b/asset_list/mappings/roof.py @@ -301,4 +301,13 @@ ROOF_CONSTRUCTION_MAPPINGS = { 'PitchedWithSlopingCeiling: As Built': 'pitched insulated', 'PitchedNormalLoftAccess: As Built': 'pitched unknown insulation', + 'Flat: 150mm, Flat: Unknown': 'flat insulated', + 'AnotherDwellingAbove: Unknown, Flat: Unknown': 'another dwelling above', + 
'AnotherDwellingAbove, AnotherDwellingAbove: Unknown': 'another dwelling above', + 'PitchedNormalNoLoftAccess: Unknown, PitchedWithSlopingCeiling: As Built': 'pitched unknown access to loft', + 'Flat: No Insulation': 'flat uninsulated', + 'AnotherDwellingAbove: Unknown, PitchedNormalLoftAccess: 250mm': 'another dwelling above', + 'PitchedNormalLoftAccess: 175mm': 'pitched insulated', + 'AnotherDwellingAbove: 300mm': 'another dwelling above' + } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 418ae9f8..1bb02a9a 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -354,6 +354,15 @@ WALL_CONSTRUCTION_MAPPINGS = { 'System built Internal': 'insulated system built', 'Cavity: AsBuilt (1976-1982), TimberFrame: AsBuilt': 'cavity unknown insulation', - 'Cavity: FilledCavityPlusExternal': 'filled cavity' + 'Cavity: FilledCavityPlusExternal': 'filled cavity', + + 'Cavity, Filled Cavity': 'filled cavity', + 'Solid Brick, As Built': 'solid brick unknown insulation', + 'Cavity, As Built': 'cavity unknown insulation', + 'Sandstone, As Built': 'sandstone or limestone unknown insulation', + 'Timber Frame, As Built': 'timber frame unknown insulation', + 'Solid Brick, Internal Insulation': 'insulated solid brick', + 'Granite or Whinstone, As Built': 'granite or whinstone unknown insulation', + 'Solid Brick, External': 'insulated solid brick' } diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index b68706be..dc7e572e 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -1,7 +1,6 @@ postal pandas usaddress -pydantic-settings==2.6.0 epc-api-python==1.0.2 thefuzz boto3 @@ -10,6 +9,5 @@ openai>=1.3.5 tiktoken msgpack beautifulsoup4 -pydantic>=1.10.7 typing-extensions>=4.5.0 -requests>=2.28.2 +requests>=2.28.2 \ No newline at end of file diff --git a/backend/.env.test b/backend/.env.test new file mode 100644 index 00000000..5b77f243 --- /dev/null +++ b/backend/.env.test @@ -0,0 +1,22 @@ 
+DB_HOST=db +DB_PORT=5432 +DB_NAME=tech_team_local_db +DB_USERNAME=postgres +DB_PASSWORD=makingwarmerhomes + + +#not used +GOOGLE_SOLAR_API_KEY=test +SAP_PREDICTIONS_BUCKET=test +CARBON_PREDICTIONS_BUCKET=test +HEAT_PREDICTIONS_BUCKET=test +HEATING_KWH_PREDICTIONS_BUCKET=test +HOTWATER_KWH_PREDICTIONS_BUCKET=test +API_KEY=test +ENVIRONMENT=test +SECRET_KEY=test +PLAN_TRIGGER_BUCKET=test +DATA_BUCKET=test +EPC_AUTH_TOKEN=test +ENGINE_SQS_URL=test +ENERGY_ASSESSMENTS_BUCKET=test \ No newline at end of file diff --git a/backend/Property.py b/backend/Property.py index 14f7e03f..6a84fc09 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -1256,7 +1256,8 @@ class Property: "biodiesel": "Smokeless Fuel", "b30d": "B30K Biofuel", "coal": "Coal", - "oil": "Oil" + "oil": "Oil", + "unknown": None # Handle - anything post 2020 is electricity else gas } self.heating_energy_source = list({ @@ -1326,7 +1327,16 @@ class Property: if self.heating_energy_source == "Varied (Community Scheme)": if self.main_fuel["fuel_type"] in fuel_map: # We assume when None as it's unknown - self.heating_energy_source = fuel_map[self.main_fuel["fuel_type"]] + mapped_to = fuel_map[self.main_fuel["fuel_type"]] + if mapped_to is None and self.main_fuel["fuel_type"] == "unknown": + # Handle logic based on age band + if self.year_built >= 2020: + self.heating_energy_source = "Electricity" + else: + self.heating_energy_source = "Natural Gas (Community Scheme)" + + else: + self.heating_energy_source = mapped_to else: raise NotImplementedError(f"Unhandled fuel {self.main_fuel['fuel_type']}") diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md new file mode 100644 index 00000000..b4876340 --- /dev/null +++ b/backend/address2UPRN/README.md @@ -0,0 +1,20 @@ +We have a list of addresses as input. 
+ +It'll come in batches that share the same postcode, and from there we want to convert each address into a UPRN. + +If this Lambda function can do that, we'll be speeding ahead. + + +Energy Performance Information: https://epc.opendatacommunities.org/ + +Guidance page: https://epc.opendatacommunities.org/docs/guidance#field_domestic_LMK_KEY + +Example of Khalim's earlier code, which he wrote some tests for: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/tests/test_search_epc.py#L11 + + +Example of EPC search: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/SearchEpc.py#L118 + + + +Khalim has made a Python package to help scrape data: https://github.com/KhalimCK/epc-api-python + diff --git a/backend/address2UPRN/__init__.py b/backend/address2UPRN/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile new file mode 100644 index 00000000..5a09bd44 --- /dev/null +++ b/backend/address2UPRN/handler/Dockerfile @@ -0,0 +1,23 @@ +FROM public.ecr.aws/lambda/python:3.10 + +# Set working directory (Lambda task root) +WORKDIR /var/task + +# ----------------------------- +# Copy requirements FIRST (for Docker layer caching) +# ----------------------------- +COPY backend/address2UPRN/handler/requirements.txt . + +# Install dependencies into Lambda runtime +RUN pip install --no-cache-dir -r requirements.txt + +# ----------------------------- +# Copy application code +# ----------------------------- +COPY utils/ utils/ +COPY backend/address2UPRN/main.py . 
+ +# ----------------------------- +# Lambda handler +# ----------------------------- +CMD ["main.handler"] diff --git a/backend/address2UPRN/handler/requirements.txt b/backend/address2UPRN/handler/requirements.txt new file mode 100644 index 00000000..bc753841 --- /dev/null +++ b/backend/address2UPRN/handler/requirements.txt @@ -0,0 +1,3 @@ +epc-api-python==1.0.2 +tqdm +pandas \ No newline at end of file diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py new file mode 100644 index 00000000..ba386e0a --- /dev/null +++ b/backend/address2UPRN/main.py @@ -0,0 +1,571 @@ +from epc_api.client import EpcClient +import os +from urllib.parse import urlencode +import pandas as pd +from difflib import SequenceMatcher +from tqdm import tqdm +from utils.logger import setup_logger + +logger = setup_logger() + +import re + +EPC_AUTH_TOKEN = os.getenv( + "EPC_AUTH_TOKEN", +) + +if EPC_AUTH_TOKEN is None: + raise RuntimeError("EPC_AUTH_TOKEN not defined in env") + +import re +from difflib import SequenceMatcher +from typing import Set + + +def levenshtein(a: str, b: str) -> float: + """ + Address similarity score in [0, 1]. + + Strategy: + - Normalise + - Strongly penalise mismatched house/flat numbers + - Combine token overlap + character similarity + """ + + def extract_number_sequence(s: str) -> list[str]: + return re.findall(r"\d+[a-z]?", s) + + def extract_numbers(s: str) -> Set[str]: + return set(extract_number_sequence(s)) + + def tokenise(s: str) -> Set[str]: + return set(s.split()) + + def extract_building_number(s: str) -> str | None: + """ + Extract the main building number (NOT flat/unit). 
+ Assumes formats like: + - '42 moreton road' + - 'flat 3 42 moreton road' + """ + tokens = s.split() + + # remove flat/unit context + cleaned = [] + skip_next = False + for t in tokens: + if t in ("flat", "apt", "apartment", "unit"): + skip_next = True + continue + if skip_next: + skip_next = False + continue + cleaned.append(t) + + # first remaining number is building number + for t in cleaned: + if re.fullmatch(r"\d+[a-z]?", t): + return t + + return None + + a_norm = normalise_address(a) + b_norm = normalise_address(b) + + # --- hard signal: numbers --- + nums_a = extract_numbers(a_norm) + nums_b = extract_numbers(b_norm) + + if nums_a and not nums_b: + return 0.0 + + # No shared numbers at all → impossible match + if nums_a and nums_b and nums_a.isdisjoint(nums_b): + return 0.0 + + # 🔒 HARD GUARD: building number must match + bld_a = extract_building_number(a_norm) + bld_b = extract_building_number(b_norm) + + if bld_a and bld_b and bld_a != bld_b: + return 0.0 + + # --- order-sensitive flat/building guard --- + seq_a = extract_number_sequence(a_norm) + seq_b = extract_number_sequence(b_norm) + + has_flat_token_user = any( + tok in a_norm for tok in ("flat", "apt", "apartment", "unit") + ) + has_flat_token_epc = "flat" in b_norm + + if ( + len(seq_a) == 2 + and len(seq_b) >= 2 + and has_flat_token_epc + and not has_flat_token_user + and seq_a != seq_b[:2] + ): + return 0.0 + + # --- token similarity (order-independent) --- + toks_a = tokenise(a_norm) + toks_b = tokenise(b_norm) + + if not toks_a or not toks_b: + token_score = 0.0 + else: + token_score = len(toks_a & toks_b) / len(toks_a | toks_b) + + # --- character similarity (soft signal) --- + char_score = SequenceMatcher(None, a_norm, b_norm).ratio() + + # --- weighted blend --- + return round( + 0.65 * token_score + 0.35 * char_score, + 4, + ) + + +def normalise_address(s: str) -> str: + """ + Canonical UK-focused address normalisation. 
+ + - Lowercases + - Removes punctuation (keeps / for flats) + - Normalises whitespace + - Applies synonym compression at token level + """ + + if not s: + return "" + + ADDRESS_SYNONYMS = { + # street types + "rd": "road", + "rd.": "road", + "st": "street", + "st.": "street", + "ave": "avenue", + "ave.": "avenue", + "ln": "lane", + "ln.": "lane", + "cres": "crescent", + "ct": "court", + "dr": "drive", + # flats / units + "apt": "flat", + "apartment": "flat", + "unit": "flat", + "ste": "suite", + # numbering noise + "no": "", + "no.": "", + } + # 1. lowercase + s = s.lower() + + # 1.5 split digit-letter suffixes + s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s) + + # 2. remove punctuation except / + s = re.sub(r"[^\w\s/]", " ", s) + + # 3. normalise whitespace + s = re.sub(r"\s+", " ", s).strip() + + # 4. tokenise + synonym normalisation + tokens = [] + for tok in s.split(): + replacement = ADDRESS_SYNONYMS.get(tok, tok) + if replacement: + tokens.append(replacement) + + return " ".join(tokens) + + +def score_addresses( + df: pd.DataFrame, + user_address: str, + column: str = "address", +) -> pd.Series: + if column not in df.columns: + raise ValueError(f"Missing column: {column}") + + return df[column].apply(lambda x: levenshtein(user_address, x)) + + +def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): + """ + Recursively fetch EPC data by postcode. + If results hit the size limit, retry with double size up to max_attempts. + """ + client = EpcClient(auth_token=EPC_AUTH_TOKEN) + + url = os.path.join(client.domestic.host, "search") + + if size: + url += "?" 
+ urlencode({"size": size}) + + search_resp = client.domestic.call( + url=url, + method="get", + params={"postcode": postcode}, + ) + if not search_resp or "rows" not in search_resp: + return pd.DataFrame() + + results_df = pd.DataFrame(search_resp["rows"], columns=search_resp["column-names"]) + + row_count = len(results_df) + + # If we hit the size limit, there *may* be more results + if row_count == size: + print( + f"⚠️ Warning: hit size limit ({size}) for postcode '{postcode}'. " + f"Attempt {attempt}/{max_attempts}." + ) + + if attempt < max_attempts: + print(f"🔁 Retrying with size={size * 2}") + return get_epc_data_with_postcode( + postcode=postcode, + size=size * 2, + attempt=attempt + 1, + max_attempts=max_attempts, + ) + else: + print( + "🚨 Max attempts reached. Results may be truncated. " + "(Please do a manual review by the tech team.)" + ) + + return results_df + + +def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: + """ + Returns True if all non-null UPRNs in df match the given uprn. + Returns False otherwise. + """ + + if column not in df.columns: + return False + + # Drop nulls and normalise to string + uprns = df[column].dropna().astype(str).str.strip().unique() + + # No valid UPRNs to compare + if len(uprns) == 0: + return False + + # Exactly one unique UPRN and it matches + return len(uprns) == 1 and uprns[0] == str(uprn) + + +def get_uprn_candidates( + df: pd.DataFrame, + user_address: str, + address_column: str = "address", + uprn_column: str = "uprn", +) -> pd.DataFrame: + """ + Annotate EPC results with lexicographical similarity scores and ranks. + + Returns a DataFrame sorted by descending lexiscore. + DOES NOT choose or return a UPRN. 
+ """ + + if address_column not in df.columns: + raise ValueError(f"Missing column: {address_column}") + + if uprn_column not in df.columns: + raise ValueError(f"Missing column: {uprn_column}") + + out = df.copy() + + user_norm = normalise_address(user_address) + + out["lexiscore"] = out[address_column].apply(lambda x: levenshtein(user_norm, x)) + + # Normalise UPRN to string + out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) + + # Rank: 1 = best match + out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) + + return out.sort_values( + ["lexirank", "lexiscore"], + ascending=[True, False], + ) + + +def get_uprn(user_inputed_address: str, postcode: str, return_address=False): + """ + Return uprn (str) + Return False if failed to find a sensible matching epc + Return Nons when epc found but no UPRN + """ + df = get_epc_data_with_postcode(postcode=postcode) + + if df.empty: + return None + + scored_df = get_uprn_candidates( + df, + user_address=user_inputed_address, + ) + + # Best score + best_score = scored_df.iloc[0]["lexiscore"] + + if best_score <= 0: + return None + + # All rank-1 rows (possible draw) + top_rank_df = scored_df[scored_df["lexirank"] == 1] + + # If rank-1 rows do not agree on a single UPRN → ambiguous + if not df_has_single_uprn(top_rank_df, uprn=top_rank_df.iloc[0]["uprn"]): + return None + + address = top_rank_df["address"].values[0] + lexiscore = float(top_rank_df["lexiscore"].values[0]) + + logger.info(f"Address found to be: {address}, with lexiscore {lexiscore}") + # Safe to return the agreed UPRN + found_uprn = top_rank_df.iloc[0]["uprn"] + + if found_uprn == "": + return None + + if return_address: + return found_uprn, address + return found_uprn + + +def resolve_uprns_for_postcode_group( + group_df: pd.DataFrame, + epc_df: pd.DataFrame, + address_col: str = "Address 1", +) -> pd.DataFrame: + """ + Given: + - group_df: rows sharing the same postcode + - epc_df: EPC search 
results for that postcode + + Returns: + group_df + found_uprn + diagnostics + """ + + results = [] + + for _, row in group_df.iterrows(): + user_address = str(row[address_col]).strip() + + scored_df = get_uprn_candidates( + epc_df, + user_address=user_address, + ) + + if scored_df.empty: + results.append( + { + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": None, + "status": "no_epc_candidates", + } + ) + continue + + best_score = scored_df.iloc[0]["lexiscore"] + + if best_score <= 0: + results.append( + { + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": best_score, + "status": "zero_score", + } + ) + continue + + top_rank_df = scored_df[scored_df["lexirank"] == 1] + + if not df_has_single_uprn(top_rank_df, top_rank_df.iloc[0]["uprn"]): + results.append( + { + "found_uprn": None, + "best_match_uprn": top_rank_df.iloc[0]["uprn"], + "best_match_address": top_rank_df.iloc[0]["address"], + "best_match_lexiscore": best_score, + "status": "ambiguous", + } + ) + continue + + results.append( + { + "found_uprn": str(top_rank_df.iloc[0]["uprn"]), + "best_match_uprn": str(top_rank_df.iloc[0]["uprn"]), + "best_match_address": top_rank_df.iloc[0]["address"], + "best_match_lexiscore": best_score, + "status": "matched", + } + ) + + return pd.concat( + [group_df.reset_index(drop=True), pd.DataFrame(results)], + axis=1, + ) + + +def test(a, b): + assert a == b, f"erorr: {a}{type(a)} != {b}: {type(b)}" + + +def run_all_test(): + # Basic usage with different post codes styles + test(get_epc_data_with_postcode("b93 8sy").shape[0], 63) + test(get_epc_data_with_postcode("B938sy").shape[0], 63) + test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63) + test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63) + + test(get_uprn("68", "b93 8sy"), "100070989938") + test(get_uprn("68 Glendon Way", "b93 8sy"), "100070989938") + test(get_uprn("Flat A, 28, Nelgarde Road", "se6 
4tf"), "100023278633") + test(get_uprn("28 A", "se6 4tf"), "100023278633") + test(get_uprn("28A", "se6 4tf"), "100023278633") + test(get_uprn("6 Aitken Close", "E8 4SQ"), False) + + # unique case + test(get_uprn("Flat 5, 1, Semley Gate", "e9 5nh"), "10008238198") + test(get_uprn("5 , 1 Semley Gate", "e9 5nh"), "10008238198") + test(get_uprn("5 Semley Gate", "e9 5nh"), "10008238198") + test(get_uprn("1, 5 Semley Gate", "e9 5nh"), False) + test( + get_uprn("1 Semley Gate", "e9 5nh"), "10008238188" + ) # this one return "flat 1, in 1 semley gate" + test( + get_uprn("48 Oswald Street", "E5 0BT"), False + ) # this one return "flat 1, in 1 semley gate" + test( + get_uprn("42 Oswald Street", "E5 0BT"), False + ) # this one return "flat 1, in 1 semley gate" + test( + get_uprn("46 Oswald Street", "E5 0BT"), False + ) # this one return "flat 1, in 1 semley gate" + get_uprn_candidates(get_epc_data_with_postcode("e5 0bt"), "48 Oswald Street") + get_uprn_candidates( + get_epc_data_with_postcode("Cr2 7dl"), + "FLAT 3; 42 MORETON ROAD, SOUTH CROYDON, SURREY", + ) + + +if __name__ == "__main__": + INPUT_FILE = "hackney.xlsx" + + ADDRESS_COL = "Address 1" + POSTCODE_COL = "Postcode" + UPRN_COL = "UPRN" + + df = pd.read_excel(INPUT_FILE) + + failures = [] + + for _, row in tqdm( + df.iterrows(), + total=len(df), + desc="Auditing UPRNs", + ): + input_address = str(row[ADDRESS_COL]).strip() + postcode = str(row[POSTCODE_COL]).strip() + + expected_uprn = None if pd.isna(row[UPRN_COL]) else str(int(row[UPRN_COL])) + + try: + epc_df = get_epc_data_with_postcode(postcode) + + if epc_df.empty: + failures.append( + { + **row.to_dict(), + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": None, + "status": "no_epc_results", + } + ) + continue + + scored_df = get_uprn_candidates( + epc_df, + user_address=input_address, + ) + + best_row = scored_df.iloc[0] + + best_match_uprn = str(best_row["uprn"]) + best_match_address = best_row["address"] 
+ best_match_lexiscore = round(float(best_row["lexiscore"]), 4) + + found_uprn = get_uprn(input_address, postcode) + + except Exception as e: + failures.append( + { + **row.to_dict(), + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": None, + "status": "exception", + "error": str(e), + } + ) + continue + + found_uprn_norm = None if not found_uprn else str(found_uprn) + + if found_uprn_norm != expected_uprn: + failures.append( + { + **row.to_dict(), + "found_uprn": found_uprn_norm, + "best_match_uprn": best_match_uprn, + "best_match_address": best_match_address, + "best_match_lexiscore": best_match_lexiscore, + "status": ("no_match" if found_uprn_norm is None else "mismatch"), + } + ) + + failures_df = pd.DataFrame(failures) + + print("===================================") + print(f"Total rows : {len(df)}") + print(f"Failures : {len(failures_df)}") + print("===================================") + + failures_df.to_excel( + "hackney_uprn_failures.xlsx", + index=False, + ) + + +def handler(event, context): + print("hello world") + return {"statusCode": 200, "body": "hello world"} + + +# TO do function dispatcher, + +# get_uprn_candidates(get_epc_data_with_postcode("E9 5NH"),"Flat 1, 5 Semley Gate" and Flat 5, 1 Semley Gate) +# fix that +# Look again at flat 1 +# pandas reader the seperate postcode_splitter +# dump into s3 diff --git a/backend/address2UPRN/script.py b/backend/address2UPRN/script.py new file mode 100644 index 00000000..a71b5827 --- /dev/null +++ b/backend/address2UPRN/script.py @@ -0,0 +1,24 @@ +import pandas as pd +from tqdm import tqdm +from backend.address2UPRN.main import get_uprn + +# Enable tqdm for pandas +tqdm.pandas() + +df = pd.read_excel("address2.xlsx") + + +def extract_uprn(row): + print(row["User Input"], row["Postcode"]) + result = get_uprn(row["User Input"], row["Postcode"], return_address=True) + + if result is None: + return pd.Series([None, None]) + + uprn, found_address = result + 
return pd.Series([uprn, found_address]) + + +df[["juntes uprn", "junte found address"]] = df.progress_apply(extract_uprn, axis=1) + +df.to_excel("outputs2.xlsx", index=False) diff --git a/backend/address2UPRN/tests/test_csv.py b/backend/address2UPRN/tests/test_csv.py new file mode 100644 index 00000000..70e7a9f9 --- /dev/null +++ b/backend/address2UPRN/tests/test_csv.py @@ -0,0 +1,40 @@ +# tests/test_address_to_uprn_csv.py + +import csv +import pytest +from pathlib import Path +from backend.address2UPRN.main import get_uprn + +FIXTURE_PATH = Path(__file__).parent / "test_data.csv" + + +def load_test_cases(): + with open(FIXTURE_PATH, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + return [ + pytest.param( + row["User Input"], + row["Postcode"], + row["Manual UPRN Code"], + id=f'{row["User Input"]} [{row["Postcode"]}]', + ) + for row in reader + ] + + +@pytest.mark.parametrize( + "user_input,postcode,expected_uprn", + load_test_cases(), +) +def test_uprn_resolution_matches_manual( + user_input: str, + postcode: str, + expected_uprn: str, +): + from utils.logger import setup_logger + + uprn = get_uprn(user_input, postcode) + if uprn: + assert uprn == expected_uprn + else: + assert str(uprn) == expected_uprn diff --git a/backend/address2UPRN/tests/test_data.csv b/backend/address2UPRN/tests/test_data.csv new file mode 100644 index 00000000..ee23813b --- /dev/null +++ b/backend/address2UPRN/tests/test_data.csv @@ -0,0 +1,366 @@ +User Input,Postcode,Manual UPRN Code +47 The Fairway,OX16 0RR,100120771697 +11 REGENT COURT,SL1 3LG,100081041562 +3/137a Windmill Road,TW8 9NH,100021516998 +Flat 33,SW18 4BE,100023328943 +FLAT 1 Brendon Grove,N2 8JE,200013412 +Flat 15,KT8 2NE,100062123759 +FLAT 5 Stonehill Road,W4 3AH,100021589829 +10 Douglas Court,SL7 1UQ,100081278099 +1 Windmill Road,HP17 8JA,766034606 +31 Denewood,HP13 7LH,100081095964 +"10, Greenways Drive",TW4 5DD,10091597009 +Flat 10,W4 3AH,"100021589834" +Flat 11,TW4 5DD,10091597010 +Flat 11,W4 
3AH,100021589835 +"12, Greenways Drive",TW4 5DD,10091597011 +"Flat 12, Forbes House",W4 3AH,100021589836 +FLAT 1 Goodstone Court,HA1 4FL,10070269053 +Flat 13,TW4 5DD,10091597012 +Flat 13,W4 3AH,100021589837 +Flat 14,TW4 5DD,10091597013 +Flat 14,W4 3AH,100021589838 +Flat 15,TW4 5DD,10091597014 +Flat 15,W4 3AH,100021589839 +Flat 16,TW4 5DD,"10091597015" +Flat 16,W4 3AH,100021589840 +Flat 17,TW4 5DD,10091597016 +Flat 17,W4 3AH,100021589841 +Flat 18,TW4 5DD,10091597017 +Flat 19,W4 3AH,100021589843 +Flat 20,W4 3AH,100021589844 +Flat 21,W4 3AH,100021589845 +Flat 22,W4 3AH,100021589846 +FLAT 2 Goodstone Court,HA1 4FL,10070269054 +Flat 23,W4 3AH,100021589847 +Flat 24,W4 3AH,100021589848 +"30c, Bosanquet Close",UB8 3PE,100021475316 +"30e, Bosanquet Close",UB8 3PE,100021475318 +FLAT 3 Goodstone Court,HA1 4FL,10070269055 +FLAT 4 Goodstone Court,HA1 4FL,10070269056 +FLAT 5 Goodstone Court,HA1 4FL,10070269057 +FLAT 6 Goodstone Court,HA1 4FL,10070269058 +FLAT 7 Goodstone Court,HA1 4FL,10070269059 +FLAT 8 Goodstone Court,HA1 4FL,10070269060 +FLAT 9 Goodstone Court,HA1 4FL,10070269061 +FLAT 10 Goodstone Court,HA1 4FL,10070269062 +FLAT 11 Goodstone Court,HA1 4FL,10070269063 +FLAT 12 Goodstone Court,HA1 4FL,10070269064 +FLAT 13 Goodstone Court,HA1 4FL,10070269065 +FLAT 14 Goodstone Court,HA1 4FL,10070269066 +FLAT 15 Goodstone Court,HA1 4FL,10070269067 +FLAT 16 Goodstone Court,HA1 4FL,10070269068 +FLAT 17 Goodstone Court,HA1 4FL,10070269069 +FLAT 18 Goodstone Court,HA1 4FL,10070269070 +FLAT 19 Goodstone Court,HA1 4FL,10070269071 +FLAT 20 Goodstone Court,HA1 4FL,10070269072 +FLAT 21 Goodstone Court,HA1 4FL,10070269073 +FLAT 22 Goodstone Court,HA1 4FL,10070269074 +FLAT 23 Goodstone Court,HA1 4FL,10070269075 +FLAT 24 Goodstone Court,HA1 4FL,10070269076 +FLAT 25 Goodstone Court,HA1 4FL,10070269077 +FLAT 26 Goodstone Court,HA1 4FL,10070269078 +FLAT 27 Goodstone Court,HA1 4FL,10070269079 +FLAT 28 Goodstone Court,HA1 4FL,10070269080 +FLAT 29 Goodstone Court,HA1 4FL,10070269081 +FLAT 30 
Goodstone Court,HA1 4FL,10070269082 +FLAT 31 Goodstone Court,HA1 4FL,10070269083 +FLAT 32 Goodstone Court,HA1 4FL,10070269084 +FLAT 33 Goodstone Court,HA1 4FL,10070269085 +FLAT 34 Goodstone Court,HA1 4FL,10070269086 +FLAT 35 Goodstone Court,HA1 4FL,10070269087 +FLAT 36 Goodstone Court,HA1 4FL,10070269088 +FLAT 37 Goodstone Court,HA1 4FL,10070269089 +FLAT 38 Goodstone Court,HA1 4FL,10070269090 +FLAT 39 Goodstone Court,HA1 4FL,10070269091 +FLAT 40 Goodstone Court,HA1 4FL,10070269092 +FLAT 41 Goodstone Court,HA1 4FL,10070269093 +FLAT 42 Goodstone Court,HA1 4FL,10070269094 +FLAT 43 Goodstone Court,HA1 4FL,10070269095 +"13 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778260 +"14 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778259 +"15 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778258 +"16 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778263 +"17 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778262 +"18 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778261 +"19 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778266 +"20 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778265 +"21 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778264 +90a Murray Road,W5 4DA,12135293 +"Flat 1, 6 Wolverton Gardens",W5 3LJ,"12119972" +"1, Monsted House",UB1 1FG,12189944 +"10, Monsted House",UB1 1FG,12189953 +"20, Monsted House",UB1 1FG,12189963 +"2, Monsted House",UB1 1FG,12189945 +"3, Monsted House",UB1 1FG,12189946 +"4, Monsted House",UB1 1FG,12189947 +"5, Monsted House",UB1 1FG,12189948 +"6, Monsted House",UB1 1FG,12189949 +"7, Monsted House",UB1 1FG,12189950 +"8, Monsted House",UB1 1FG,12189951 +"9, Monsted House",UB1 1FG,12189952 +"1 Cullis House, 1, Accolade Avenue",UB1 1FH,12189904 +"2 Cullis House, 1, Accolade Avenue",UB1 1FH,12189905 +"3 Cullis House, 1, Accolade Avenue",UB1 1FH,12189906 +"4 Cullis House, 1, Accolade Avenue",UB1 1FH,12189907 +"5 Cullis House, 1, Accolade Avenue",UB1 1FH,12189908 +"6 Cullis House, 1, Accolade Avenue",UB1 1FH,12189909 +1 
Genteel House Samara Drive,UB1 1FJ,12189835 +2 Genteel House Samara Drive,UB1 1FJ,12189836 +3 Genteel House Samara Drive,UB1 1FJ,12189837 +4 Genteel House Samara Drive,UB1 1FJ,12189838 +5 Genteel House Samara Drive,UB1 1FJ,12189839 +6 Genteel House Samara Drive,UB1 1FJ,12189840 +7 Genteel House Samara Drive,UB1 1FJ,12189841 +8 Genteel House Samara Drive,UB1 1FJ,12189842 +9 Genteel House Samara Drive,UB1 1FJ,12189843 +10 Genteel House Samara Drive,UB1 1FJ,12189844 +1 ASH TREE HOUSE,SE5 0TE,None +"Flat 1 Ash Tree House, 2, Thompson Avenue",SE5 0TE,10009803979 +3 ASH TREE HOUSE,SE5 0TE,None +Flat 3 ASH TREE HOUSE,SE5 0TE,10009803981 +5 ASH TREE HOUSE,SE5 0TE,None +Flat 5 ASH TREE HOUSE,SE5 0TE,10009803983 +Flat 8 ASH TREE HOUSE,SE5 0TE,10009803986 +8 ASH TREE HOUSE,SE5 0TE,None +Flat 12 ASH TREE HOUSE,SE5 0TE,10009803990 +12 ASH TREE HOUSE,SE5 0TE,None +FLAT 1 599 HARROW ROAD,W10 4RA,217113930 +FLAT 2 599 HARROW ROAD,W10 4RA,217113931 +FLAT 3 599 HARROW ROAD,W10 4RA,None +FLAT 4 599 HARROW ROAD,W10 4RA,None +FLAT 5 599 HARROW ROAD,W10 4RA,217113934 +FLAT 6 599 HARROW ROAD,W10 4RA,None +FLAT 7 599 HARROW ROAD,W10 4RA,None +FLAT 8 599 HARROW ROAD,W10 4RA,None +"Flat 1, Ohio Building",SE13 7RX,10023226256 +"Flat 2, Ohio Building",SE13 7RX,10023226257 +"Apartment 1 Block B, 105, Benwell Road",N7 7BW,10012792307 +"Apartment 2 Block B, 105, Benwell Road",N7 7BW,10012792308 +"Apartment 3 Block B, 105, Benwell Road",N7 7BW,10012792309 +"Apartment 4 Block B, 105, Benwell Road",N7 7BW,10012792310 +"Apartment 5 Block B, 105, Benwell Road",N7 7BW,10012792311 +"Apartment 6 Block B, 105, Benwell Road",N7 7BW,10012792312 +"Apartment 7 Block B, 105, Benwell Road",N7 7BW,10012792313 +"Apartment 8 Block B, 105, Benwell Road",N7 7BW,10012792314 +"Apartment 9 Block B, 105, Benwell Road",N7 7BW,10012792315 +"Apartment 10 Block B, 105, Benwell Road",N7 7BW,10012792316 +"Apartment 11 Block B, 105, Benwell Road",N7 7BW,10012792317 +"Apartment 12 Block B, 105, Benwell Road",N7 7BW,10012792318 
+"Apartment 13 Block B, 105, Benwell Road",N7 7BW,10012792319 +"Apartment 1 Block D, 32, Hornsey Road",N7 7AT,10012792366 +"Apartment 2 Block D, 32, Hornsey Road",N7 7AT,10012792367 +"Apartment 3 Block D, 32, Hornsey Road",N7 7AT,10012792368 +"Apartment 4 Block D, 32, Hornsey Road",N7 7AT,10012792369 +"Apartment 5 Block D, 32, Hornsey Road",N7 7AT,10012792370 +"Apartment 6 Block D, 32, Hornsey Road",N7 7AT,"10012792371" +"Apartment 7 Block D, 32, Hornsey Road",N7 7AT,10012792372 +"Apartment 8 Block D, 32, Hornsey Road",N7 7AT,10012792373 +"Apartment 9 Block D, 32, Hornsey Road",N7 7AT,10012792374 +"Apartment 10 Block D, 32, Hornsey Road",N7 7AT,10012792375 +"Apartment 11 Block D, 32, Hornsey Road",N7 7AT,10012792376 +"Apartment 12 Block D, 32, Hornsey Road",N7 7AT,10012792377 +"Apartment 13 Block D, 32, Hornsey Road",N7 7AT,10012792378 +"Apartment 14 Block D, 32, Hornsey Road",N7 7AT,10012792379 +"Apartment 15 Block D, 32, Hornsey Road",N7 7AT,10012792380 +"Apartment 16 Block D, 32, Hornsey Road",N7 7AT,"10012792381" +"Apartment 17Block D, 32, Hornsey Road",N7 7AT,10012792382 +"Apartment 18 Block D, 32, Hornsey Road",N7 7AT,10012792383 +24b Honley Road,SE6 2HZ,None +FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974 +2 COLLEGE HOUSE,CM7 1JS,100091449870 +3 COLLEGE HOUSE,CM7 1JS,100091449871 +1 Anita Street,M4 5DU,None +2 Anita Street,M4 5DU,77123061 +5 Anita Street,M4 5DU,77123081 +6 Anita Street,M4 5DU,77123082 +8 Anita Street,M4 5DU,None +9 Anita Street,M4 5DU,None +10 Anita Street,M4 5DU,77123051 +12 Anita Street,M4 5DU,77123053 +19 Anita Street,M4 5DU,None +22 Anita Street,M4 5DU,None +26 Anita Street,M4 5DU,77123068 +28 Anita Street,M4 5DU,None +30 Anita Street,M4 5DU,None +32 Anita Street,M4 5DU,None +33 Anita Street,M4 5DU,77123076 +34 Anita Street,M4 5DU,None +35 Anita Street,M4 5DU,77123078 +36 Anita Street,M4 5DU,77123079 +23 George Leigh Street,M4 5DR,77123171 +25 George Leigh Street,M4 5DR,None +35 George Leigh Street,M4 5DR,77123177 +39 George Leigh 
Street,M4 5DR,77123179 +41 George Leigh Street,M4 5DR,None +43 George Leigh Street,M4 5DR,None +49 George Leigh Street,M4 5DR,None +51 George Leigh Street,M4 5DR,77123185 +55 George Leigh Street,M4 5DR,None +57 George Leigh Street,M4 5DR,None +"1a, Victoria Square",M4 5DX,77211153 +2a Victoria Square ,M4 5DX,None +"4a, Victoria Square",M4 5DX,77211155 +5a Victoria Square,M4 5DX,77211156 + 6a Victoria Square,M4 5DX,77211157 +7a Victoria Square,M4 5DX,77211158 +8a Victoria Square,M4 5DX,77211159 +9a Victoria Square,M4 5DX,77211160 +10a Victoria Square,M4 5DX,77211161 +11a Victoria Square,M4 5DX,77211162 +12a Victoria Square,M4 5DX,77211163 +13a Victoria Square,M4 5DX,77211164 +14a Victoria Square,M4 5DX,77211165 +15a Victoria Square,M4 5DX,77211166 +16a Victoria Square,M4 5DX,77211167 +17a Victoria Square,M4 5DX,77211168 +18a Victoria Square,M4 5DX,77211169 +19a Victoria Square,M4 5DX,77211170 +20a Victoria Square,M4 5DX,77211171 +21a Victoria Square,M4 5DY,77211172 +22a Victoria Square,M4 5DY,None +23a Victoria Square,M4 5DY,77211174 +24a Victoria Square,M4 5DY,77211175 +25a Victoria Square,M4 5DY,77211176 +26a Victoria Square,M4 5DY,77211177 +27a Victoria Square,M4 5DY,77211178 +28a Victoria Square,M4 5DY,None +29a Victoria Square,M4 5DY,77211180 +30a Victoria Square,M4 5DY,77211181 +31a Victoria Square,M4 5DY,77211182 +32a Victoria Square,M4 5DY,77211183 +33a Victoria Square,M4 5DY,77211184 +34a Victoria Square,M4 5DY,77211185 +35a Victoria Square,M4 5DY,None +36a Victoria Square,M4 5DY,77211187 +37a Victoria Square,M4 5DY,77211188 +38a Victoria Square,M4 5DY,77211189 +39a Victoria Square,M4 5DY,77211190 +40a Victoria Square,M4 5DY,None +41a Victoria Square,M4 5DY,77211192 +42a Victoria Square,M4 5DY,77211193 +43a Victoria Square,M4 5DY,77211194 +44a Victoria Square,M4 5DY,77211195 +45a Victoria Square,M4 5DY,77211196 +46a Victoria Square,M4 5DY,77211197 +47a Victoria Square,M4 5DY,77211198 +48a Victoria Square,M4 5DY,77211199 +49a Victoria Square,M4 5DY,77211200 
+50a Victoria Square,M4 5DY,77211201 +51a Victoria Square,M4 5DY,77211202 +52a Victoria Square,M4 5DY,77211203 +53a Victoria Square,M4 5DY,77211204 +54a Victoria Square,M4 5DY,77211205 +55a Victoria Square,M4 5DY,77211206 +56a Victoria Square,M4 5DZ,77211207 +57a Victoria Square,M4 5DZ,None +58a Victoria Square,M4 5DZ,77211209 +59a Victoria Square,M4 5DZ,77211210 +60a Victoria Square,M4 5DZ,77211211 +61a Victoria Square,M4 5DZ,77211212 +62a Victoria Square,M4 5DZ,77211213 +63a Victoria Square,M4 5DZ,None +64a Victoria Square,M4 5DZ,77211215 +65a Victoria Square,M4 5DZ,77211216 +66a Victoria Square,M4 5DZ,None +67a Victoria Square,M4 5DZ,None +68a Victoria Square,M4 5DZ,77211219 +69a Victoria Square,M4 5DZ,77211220 +70a Victoria Square,M4 5DZ,77211221 +71a Victoria Square,M4 5DZ,77211222 +72a Victoria Square,M4 5DZ,77211223 +73a Victoria Square,M4 5DZ,77211224 +74a Victoria Square,M4 5DZ,None +75a Victoria Square,M4 5DZ,77211226 +76a Victoria Square,M4 5DZ,77211227 +77a Victoria Square,M4 5DZ,None +78a Victoria Square,M4 5DZ,77211229 +79a Victoria Square,M4 5DZ,77211230 +80a Victoria Square,M4 5DZ,77211231 +81a Victoria Square,M4 5DZ,77211232 +82 Victoria Square,M4 5DZ,None +83a Victoria Square,M4 5DZ,77211234 +84a Victoria Square,M4 5DZ,None +85a Victoria Square,M4 5DZ,77211236 +86a Victoria Square,M4 5DZ,77211237 +87a Victoria Square,M4 5DZ,77211238 +88a Victoria Square,M4 5DZ,None +89a Victoria Square,M4 5DZ,77211240 +90a Victoria Square,M4 5DZ,77211241 +91a Victoria Square,M4 5DZ,77211242 +92a Victoria Square,M4 5DZ,77211243 +93a Victoria Square,M4 5EA,77211244 +94a Victoria Square,M4 5EA,None +95a Victoria Square,M4 5EA,77211246 +96a Victoria Square,M4 5EA,77211247 +97a Victoria Square,M4 5EA,77211248 +98a Victoria Square,M4 5EA,77211249 +99a Victoria Square,M4 5EA,77211250 +100a Victoria Square,M4 5EA,77211251 +101a Victoria Square,M4 5EA,None +102a Victoria Square,M4 5EA,None +103a Victoria Square,M4 5EA,77211254 +104a Victoria Square,M4 5EA,77211255 +105a 
Victoria Square,M4 5EA,None +106a Victoria Square,M4 5EA,77211257 +107a Victoria Square,M4 5EA,77211258 +108a Victoria Square,M4 5EA,77211259 +109a Victoria Square,M4 5EA,77211260 +110a Victoria Square,M4 5EA,77211261 +111a Victoria Square,M4 5EA,77211262 +112a Victoria Square,M4 5EA,None +113a Victoria Square,M4 5EA,77211264 +114a Victoria Square,M4 5EA,77211265 +115a Victoria Square,M4 5EA,77211266 +116a Victoria Square,M4 5EA,77211267 +117a Victoria Square,M4 5EA,None +118a Victoria Square,M4 5EA,None +119a Victoria Square,M4 5EA,77211270 +120a Victoria Square,M4 5EA,77211271 +121a Victoria Square,M4 5EA,77211272 +122a Victoria Square,M4 5EA,77211273 +123a Victoria Square,M4 5EA,77211274 +124a Victoria Square,M4 5EA,None +125a Victoria Square,M4 5EA,77211276 +126a Victoria Square,M4 5EA,77211277 +127a Victoria Square,M4 5EA,77211278 +128a Victoria Square,M4 5EA,77211279 +129a Victoria Square,M4 5EA,77211280 +130a Victoria Square,M4 5FA,77211281 +131a Victoria Square,M4 5FA,77211282 +132a Victoria Square,M4 5FA,77211283 +133a Victoria Square,M4 5FA,None +134a Victoria Square,M4 5FA,77211285 +135a Victoria Square,M4 5FA,77211286 +136a Victoria Square,M4 5FA,77211287 +137a Victoria Square,M4 5FA,77211288 +138a Victoria Square,M4 5FA,77211289 +139a Victoria Square,M4 5FA,77211290 +140a Victoria Square,M4 5FA,77211291 +141a Victoria Square,M4 5FA,77211292 +142a Victoria Square,M4 5FA,77211293 +143a Victoria Square,M4 5FA,77211294 +144a Victoria Square,M4 5FA,77211295 +145a Victoria Square,M4 5FA,None +146a Victoria Square,M4 5FA,77211297 +147a Victoria Square,M4 5FA,77211298 +148a Victoria Square,M4 5FA,77211299 +149a Victoria Square,M4 5FA,77211300 +150a Victoria Square,M4 5FA,77211301 +151a Victoria Square,M4 5FA,None +152a Victoria Square,M4 5FA,77211303 +153a Victoria Square,M4 5FA,None +154a Victoria Square,M4 5FA,77211305 +155a Victoria Square,M4 5FA,None +156a Victoria Square,M4 5FA,77211307 +157a Victoria Square,M4 5FA,77211308 +158a Victoria Square,M4 
5FA,77211309 +159a Victoria Square,M4 5FA,None +160a Victoria Square,M4 5FA,77211311 +161a Victoria Square,M4 5FA,None +162a Victoria Square,M4 5FA,None +163a Victoria Square,M4 5FA,77211314 +164a Victoria Square,M4 5FA,77211315 +165a Victoria Square,M4 5FA,77211316 +166a Victoria Square,M4 5FA,None +"FLAT 3; 42 MORETON ROAD, SOUTH CROYDON, SURREY",CR2 7DL,None \ No newline at end of file diff --git a/backend/app/config.py b/backend/app/config.py index dd3f5db1..41552ae5 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -1,8 +1,22 @@ +import os from functools import lru_cache -from pydantic_settings import BaseSettings +from pydantic_settings import BaseSettings, SettingsConfigDict from typing import Optional +def resolve_env_file() -> Optional[str]: + env = os.getenv("ENVIRONMENT", "local") + + if env == "local": + return "backend/.env" + + if env == "test": + return "backend/.env.test" + + # prod = no env file + return None + + class Settings(BaseSettings): API_KEY: str API_KEY_NAME: str = "X-API-KEY" @@ -41,8 +55,10 @@ class Settings(BaseSettings): AWS_SECRET_KEY_ID: Optional[str] = None AWS_DEFAULT_REGION: Optional[str] = None - class Config: - env_file = "backend/.env" + model_config = SettingsConfigDict( + env_file=resolve_env_file(), + env_file_encoding="utf-8", + ) @lru_cache() diff --git a/backend/app/db/connection.py b/backend/app/db/connection.py index 74f3bd2e..f0649c71 100644 --- a/backend/app/db/connection.py +++ b/backend/app/db/connection.py @@ -3,7 +3,9 @@ from contextlib import contextmanager from backend.app.config import get_settings from sqlmodel import Session -connection_string = "postgresql+{drivername}://{username}:{password}@{server}:{port}/{dbname}" +connection_string = ( + "postgresql+{drivername}://{username}:{password}@{server}:{port}/{dbname}" +) db_string = connection_string.format( drivername="psycopg2", # You'll need to use psycopg2 driver for PostgreSQL username=get_settings().DB_USERNAME, @@ -28,7 +30,9 @@ 
db_engine = create_engine( def get_db_session(): if db_engine is None: - raise RuntimeError("Database is not configured. Set DATABASE_URL in environment variables.") + raise RuntimeError( + "Database is not configured. Set DATABASE_URL in environment variables." + ) return Session(db_engine) diff --git a/backend/app/db/functions/condition_functions.py b/backend/app/db/functions/condition_functions.py new file mode 100644 index 00000000..d281b9a4 --- /dev/null +++ b/backend/app/db/functions/condition_functions.py @@ -0,0 +1,12 @@ +from typing import List +from sqlalchemy import insert, delete +from sqlalchemy.orm import Session + +from backend.app.db.connection import db_session, db_read_session +from backend.app.db.models.condition import PropertyConditionSurveyModel + + +def bulk_insert_property_surveys( + session: Session, surveys: List[PropertyConditionSurveyModel] +) -> None: + raise NotImplementedError diff --git a/backend/app/db/models/condition.py b/backend/app/db/models/condition.py new file mode 100644 index 00000000..77043366 --- /dev/null +++ b/backend/app/db/models/condition.py @@ -0,0 +1,97 @@ +from sqlalchemy import ( + BigInteger, + Column, + Date, + ForeignKey, + Integer, + String, + Enum as SqlEnum, +) +from sqlalchemy.orm import declarative_base, relationship + +from backend.condition.domain.aspect_type import AspectType +from backend.condition.domain.element_type import ElementType + +Base = declarative_base() + +ElementTypeDb = SqlEnum( + ElementType, + name="element_type", + native_enum=True, + values_callable=lambda enum: [e.value for e in enum], +) + +AspectTypeDb = SqlEnum( + AspectType, + name="aspect_type", + native_enum=True, + values_callable=lambda enum: [a.value for a in enum], +) + + +class PropertyConditionSurveyModel(Base): + __tablename__ = "property_condition_survey" + + id = Column(BigInteger, primary_key=True, autoincrement=True) + uprn = Column(BigInteger, nullable=False) + + date = Column(Date, nullable=False) + source = 
Column(String, nullable=False) + + elements = relationship( + "ElementModel", + back_populates="survey", + cascade="all, delete-orphan", + ) + + +class ElementModel(Base): + __tablename__ = "element" # TODO: rename to survey_element? + + id = Column(BigInteger, primary_key=True, autoincrement=True) + + survey_id = Column( + BigInteger, + ForeignKey("property_condition_survey.id"), + nullable=False, + ) + + element_type = Column(ElementTypeDb, nullable=False) + element_instance = Column(BigInteger, nullable=False) + + survey = relationship( + "PropertyConditionSurveyModel", + back_populates="elements", + ) + + aspect_conditions = relationship( + "AspectConditionModel", + back_populates="element", + cascade="all, delete-orphan", + ) + + +class AspectConditionModel(Base): + __tablename__ = "aspect_condition" # TODO: rename to survey_aspect? + + id = Column(BigInteger, primary_key=True, autoincrement=True) + + element_id = Column( + BigInteger, + ForeignKey("element.id"), + nullable=False, + ) + + aspect_type = Column(AspectTypeDb, nullable=False) + aspect_instance = Column(BigInteger, nullable=False) + + value = Column(String) + quantity = Column(Integer) + install_date = Column(Date) + renewal_year = Column(Integer) + comments = Column(String) + + element = relationship( + "ElementModel", + back_populates="aspect_conditions", + ) diff --git a/backend/app/plan/utils.py b/backend/app/plan/utils.py index 33f391d4..10d7fb06 100644 --- a/backend/app/plan/utils.py +++ b/backend/app/plan/utils.py @@ -24,7 +24,7 @@ def get_cleaned(): cleaned = read_from_s3( s3_file_name="cleaned_epc_data/cleaned.bson", - bucket_name="retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT) + bucket_name=get_settings().DATA_BUCKET ) cleaned = msgpack.unpackb(cleaned, raw=False) diff --git a/backend/app/requirements/requirements.txt b/backend/app/requirements/requirements.txt index dff7a546..3124034e 100644 --- a/backend/app/requirements/requirements.txt +++ 
b/backend/app/requirements/requirements.txt @@ -1,3 +1,4 @@ + # fastapi fastapi==0.115.2 sqlalchemy==2.0.36 @@ -12,5 +13,4 @@ boto3==1.35.44 openpyxl==3.1.2 # Basic pytz -sqlmodel - +sqlmodel \ No newline at end of file diff --git a/backend/condition/README.md b/backend/condition/README.md index 140d4585..46302cab 100644 --- a/backend/condition/README.md +++ b/backend/condition/README.md @@ -20,7 +20,7 @@ The processor currently supports file formats provided by **Peabody** and **LBWF The `local_runner` script allows the processor to be executed in a local environment. -1. Copy a sample input file into the `sample_data/` directory. +1. Copy sample input file(s) into the `sample_data/` directory. If working with Peabody data, you'll need the Landlord Reference / UPRN lookup file as well. 2. Update `local_runner.py` as required, specifically the definitions of: - `lbwf_path` - `peabody_path` diff --git a/backend/condition/condition_trigger_request.py b/backend/condition/condition_trigger_request.py new file mode 100644 index 00000000..03bd6ad1 --- /dev/null +++ b/backend/condition/condition_trigger_request.py @@ -0,0 +1,33 @@ +from enum import Enum +from typing import Optional +from pydantic import BaseModel + + +class ConditionFileType(Enum): + LBWF = "LBWF" + Peabody = "Peabody" + # TODO: make these asset management systems rather than client names + + +class ConditionTriggerRequest(BaseModel): + file_type: ConditionFileType + trigger_file_bucket: str # TODO: get this from settings + trigger_file_key: str + + uprn_lookup_file_bucket: Optional[str] = None # TODO: get this from settings + uprn_lookup_file_key: Optional[str] = None + + +# { +# "file_type": "Peabody", +# "trigger_file_bucket": "condition-data-dev", +# "trigger_file_key": "input/peabody/2026_01_06 - Peabody - Stock Condition Data - Survey Records - D Lower.xlsx", +# "uprn_lookup_file_bucket": "condition-data-dev", +# "uprn_lookup_file_key": 
"input/peabody/uprn-lookup/PeabodyPropertymatched_Dec25_propref_UPRN.csv" +# } + +# { +# "file_type": "LBWF", +# "trigger_file_bucket": "condition-data-dev", +# "trigger_file_key": "input/lbwf/LBWF - Example Asset Data September 2025.xlsx", +# } diff --git a/backend/condition/domain/mapping/lbwf/lbwf_mapper.py b/backend/condition/domain/mapping/lbwf/lbwf_mapper.py index 60c8b1ac..9dbfcb17 100644 --- a/backend/condition/domain/mapping/lbwf/lbwf_mapper.py +++ b/backend/condition/domain/mapping/lbwf/lbwf_mapper.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Optional, Tuple from datetime import date from backend.condition.domain.aspect_condition import AspectCondition diff --git a/backend/condition/file_type.py b/backend/condition/file_type.py deleted file mode 100644 index e0736814..00000000 --- a/backend/condition/file_type.py +++ /dev/null @@ -1,16 +0,0 @@ -from enum import Enum - -class FileType(Enum): - LBWF = "lbwf" - Peabody = "peabody" - -def detect_file_type(filepath: str) -> FileType: - path = filepath.lower() - - if "lbwf" in path: - return FileType.LBWF - - if "peabody" in path: - return FileType.Peabody - - raise ValueError("Unrecognised file path") \ No newline at end of file diff --git a/backend/condition/handler.py b/backend/condition/handler.py deleted file mode 100644 index 5279b029..00000000 --- a/backend/condition/handler.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Mapping, Any -from io import BytesIO - -from utils.logger import setup_logger -from backend.condition.processor import process_file - - -logger = setup_logger() - -def handler(event: Mapping[str, Any], context: Any) -> None: - # Temporary stub for PoC wiring - dummy_stream = BytesIO(b"") - - source_key = event.get("source_key", "unknown-source") - - process_file(dummy_stream, source_key) \ No newline at end of file diff --git a/backend/condition/handler/Dockerfile b/backend/condition/handler/Dockerfile new file mode 100644 
index 00000000..71556895 --- /dev/null +++ b/backend/condition/handler/Dockerfile @@ -0,0 +1,48 @@ +FROM public.ecr.aws/lambda/python:3.11 +# For local running: +# FROM python:3.11.10-bullseye + +ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + + +# Set working directory (Lambda task root) +WORKDIR /var/task + +# Environment +ENV DB_HOST=${DEV_DB_HOST} +ENV DB_PORT=${DEV_DB_PORT} +ENV DB_NAME=${DEV_DB_NAME} + +COPY backend/.env.test backend/.env + +# ----------------------------- +# Copy requirements FIRST (for Docker layer caching) +# ----------------------------- +COPY backend/condition/handler/requirements.txt . + +# Install dependencies into Lambda runtime +RUN pip install --no-cache-dir -r requirements.txt + +# ----------------------------- +# Copy application code +# ----------------------------- +COPY utils/ utils/ +COPY backend/condition/ backend/condition/ + +COPY backend/app/db/models/condition.py backend/app/db/models/condition.py +COPY backend/app/db/connection.py backend/app/db/connection.py +COPY backend/app/config.py backend/app/config.py + +COPY backend/__init__.py backend/__init__.py +COPY backend/app/__init__.py backend/app/__init__.py +COPY backend/app/db/__init__.py backend/app/db/__init__.py + + +# ----------------------------- +# Lambda handler +# ----------------------------- +CMD ["backend/condition/handler/handler.handler"] +# For local running +# CMD ["python", "-m", "backend.condition.handler.handler"] diff --git a/backend/condition/handler/handler.py b/backend/condition/handler/handler.py new file mode 100644 index 00000000..2f3616a4 --- /dev/null +++ b/backend/condition/handler/handler.py @@ -0,0 +1,51 @@ +import json +from typing import Mapping, Any +from io import BytesIO + +from backend.condition.condition_trigger_request import ConditionTriggerRequest +from backend.condition.lookups.uprn_lookup_s3 import UprnLookupS3 +from backend.condition.processor import process_file +from utils.logger import setup_logger +from utils.s3 import 
read_io_from_s3 + + +logger = setup_logger() + + +def handler(event: Mapping[str, Any], context: Any) -> None: + + for record in event.get("Records", []): + try: + body_dict = json.loads(record["body"]) + logger.debug("Validating request body") + payload = ConditionTriggerRequest.model_validate(body_dict) + + logger.debug("Successfully validated request body") + + if payload.uprn_lookup_file_bucket and payload.uprn_lookup_file_key: + logger.debug("Getting UPRN lookup file from s3") + uprn_lookup = UprnLookupS3( + bucket=payload.uprn_lookup_file_bucket, + key=payload.uprn_lookup_file_key, + ) # TODO: replace with postgres implementation + logger.debug("Successfully got UPRN lookup file from s3") + else: + uprn_lookup = None + + logger.debug("Getting conditions data from s3") + file_bytes: BytesIO = read_io_from_s3( + bucket_name=payload.trigger_file_bucket, + file_key=payload.trigger_file_key, + ) + logger.debug( + "Successfully got conditions data from s3. Moving on to process file..." + ) + + process_file( + file_stream=file_bytes, + file_type=payload.file_type, + uprn_lookup=uprn_lookup, + ) + + except Exception as e: + logger.error(f"Failed to process record: {e}") diff --git a/backend/condition/handler/requirements.txt b/backend/condition/handler/requirements.txt new file mode 100644 index 00000000..1e259a95 --- /dev/null +++ b/backend/condition/handler/requirements.txt @@ -0,0 +1,9 @@ +openpyxl +sqlmodel +pydantic-settings +psycopg2-binary==2.9.10 + +# pandas isn't used, but needed for importing from utils.s3 +pandas==2.2.2 +numpy==1.26.4 +openpyxl diff --git a/backend/condition/local_runner.py b/backend/condition/local_runner.py index 404f64d4..4595b93b 100644 --- a/backend/condition/local_runner.py +++ b/backend/condition/local_runner.py @@ -1,5 +1,7 @@ from pathlib import Path +from backend.condition.condition_trigger_request import ConditionFileType +from backend.condition.lookups.uprn_lookup_csv import UprnLookupLocal from backend.condition.processor 
import process_file @@ -20,13 +22,27 @@ def main() -> None: / "peabody" / "2026_01_06 - Peabody - Stock Condition Data - Survey Records - D Lower.xlsx" ) - filepaths = [lbwf_path, peabody_path] + peabody_uprn_lookup_path: Path = ( + path / "peabody" / "PeabodyPropertymatched_Dec25_propref_UPRN.csv" + ) + # filepaths = [lbwf_path, peabody_path] + filepaths = [lbwf_path] + # filepaths = [peabody_path] + + uprn_lookup = UprnLookupLocal(csv_path=peabody_uprn_lookup_path.as_posix()) + + def get_file_type(file_path: str) -> ConditionFileType: + if "peabody" in file_path: + return ConditionFileType.Peabody + if "lbwf" in file_path: + return ConditionFileType.LBWF for fp in filepaths: with fp.open("rb") as f: process_file( file_stream=f, - source_key=fp.as_posix(), + file_type=get_file_type(fp.as_posix()), + uprn_lookup=uprn_lookup, ) diff --git a/backend/condition/lookups/uprn_lookup.py b/backend/condition/lookups/uprn_lookup.py new file mode 100644 index 00000000..0f6e78fd --- /dev/null +++ b/backend/condition/lookups/uprn_lookup.py @@ -0,0 +1,8 @@ +from abc import ABC, abstractmethod +from typing import BinaryIO, Dict + + +class UprnLookup(ABC): + @abstractmethod + def get_property_ref_to_uprn_lookup(self) -> Dict[str, int]: + pass diff --git a/backend/condition/lookups/uprn_lookup_csv.py b/backend/condition/lookups/uprn_lookup_csv.py new file mode 100644 index 00000000..8b1c21a2 --- /dev/null +++ b/backend/condition/lookups/uprn_lookup_csv.py @@ -0,0 +1,23 @@ +import csv +from io import TextIOWrapper +from typing import BinaryIO, Dict, TextIO +from backend.condition.lookups.uprn_lookup import UprnLookup + + +class UprnLookupLocal(UprnLookup): + def __init__(self, csv_path: str): + self.csv_path = csv_path + + def get_property_ref_to_uprn_lookup(self) -> Dict[str, int]: + with open(self.csv_path, "rb") as f: + return self.parse_csv(f) + + def parse_csv(self, file_stream: BinaryIO) -> Dict[str, int]: + text_stream: TextIO = TextIOWrapper(file_stream, encoding="utf-8") + 
mapping: Dict[str, int] = {} + reader = csv.DictReader(text_stream) + for row in reader: + if not row["reference"] or not row["out_uprn"]: + continue + mapping[row["reference"].strip()] = int(row["out_uprn"].strip()) + return mapping diff --git a/backend/condition/lookups/uprn_lookup_s3.py b/backend/condition/lookups/uprn_lookup_s3.py new file mode 100644 index 00000000..da725a2f --- /dev/null +++ b/backend/condition/lookups/uprn_lookup_s3.py @@ -0,0 +1,29 @@ +import csv +from io import BytesIO, TextIOWrapper +from typing import BinaryIO, Dict, TextIO + +from backend.condition.lookups.uprn_lookup import UprnLookup +from utils.s3 import read_io_from_s3 + + +class UprnLookupS3(UprnLookup): + def __init__(self, bucket: str = "", key: str = ""): + self.bucket = bucket + self.key = key + + def get_property_ref_to_uprn_lookup(self) -> Dict[str, int]: + file_bytes: BytesIO = read_io_from_s3( + bucket_name=self.bucket, file_key=self.key + ) + + return self._parse_csv_bytes(file_bytes) + + def _parse_csv_bytes(self, file_stream: BinaryIO) -> Dict[str, int]: + text_stream: TextIO = TextIOWrapper(file_stream, encoding="utf-8") + mapping: Dict[str, int] = {} + reader = csv.DictReader(text_stream) + for row in reader: + if not row["reference"] or not row["out_uprn"]: + continue + mapping[row["reference"].strip()] = int(row["out_uprn"].strip()) + return mapping diff --git a/backend/condition/parsing/factory.py b/backend/condition/parsing/factory.py index 68ca0292..b5d28e18 100644 --- a/backend/condition/parsing/factory.py +++ b/backend/condition/parsing/factory.py @@ -1,27 +1,35 @@ +from typing import Optional +from backend.condition.condition_trigger_request import ConditionFileType from backend.condition.domain.mapping.lbwf.lbwf_mapper import LbwfMapper from backend.condition.domain.mapping.mapper import Mapper from backend.condition.domain.mapping.peabody.peabody_mapper import PeabodyMapper -from backend.condition.file_type import FileType +from 
backend.condition.lookups.uprn_lookup import UprnLookup from backend.condition.parsing.parser import Parser from backend.condition.parsing.lbwf_parser import LbwfParser from backend.condition.parsing.peabody_parser import PeabodyParser -def select_parser(file_type: FileType) -> Parser: - if file_type is FileType.LBWF: +def select_parser( + file_type: ConditionFileType, uprn_lookup: Optional[UprnLookup] = None +) -> Parser: + if file_type is ConditionFileType.LBWF: return LbwfParser() - if file_type is FileType.Peabody: - return PeabodyParser() + if file_type is ConditionFileType.Peabody: + if not uprn_lookup: + raise ValueError( + "Cannot instantiate Peabody Parser without UPRN lookup being provided" + ) + return PeabodyParser(uprn_lookup=uprn_lookup) raise ValueError("Unrecognised file type, unable to instantiate Parser") -def select_mapper(file_type: FileType) -> Mapper: - if file_type is FileType.LBWF: +def select_mapper(file_type: ConditionFileType) -> Mapper: + if file_type is ConditionFileType.LBWF: return LbwfMapper() - if file_type is FileType.Peabody: + if file_type is ConditionFileType.Peabody: return PeabodyMapper() raise ValueError("Unrecognised file type, unable to instantiate Mapper") diff --git a/backend/condition/parsing/lbwf_parser.py b/backend/condition/parsing/lbwf_parser.py index 14d2efe4..a713b1ef 100644 --- a/backend/condition/parsing/lbwf_parser.py +++ b/backend/condition/parsing/lbwf_parser.py @@ -1,4 +1,4 @@ -from typing import BinaryIO, Any, Dict, Iterator, List, Tuple +from typing import BinaryIO, Any, Dict, Iterator, List, Optional, Tuple from openpyxl import Workbook, load_workbook from collections import defaultdict @@ -15,7 +15,10 @@ logger = setup_logger() class LbwfParser(Parser): - def parse(self, file_stream: BinaryIO) -> Any: + def parse( + self, + file_stream: BinaryIO, + ) -> Any: wb: Workbook = load_workbook(file_stream) address_to_uprn_map: Dict[str, int] = LbwfParser._generate_address_to_uprn_dict( wb diff --git 
a/backend/condition/parsing/parser.py b/backend/condition/parsing/parser.py index 105fda36..b160b217 100644 --- a/backend/condition/parsing/parser.py +++ b/backend/condition/parsing/parser.py @@ -1,8 +1,12 @@ from abc import ABC, abstractmethod -from typing import BinaryIO, Any +from typing import BinaryIO, Any, Dict, Optional + class Parser(ABC): @abstractmethod - def parse(self, file_stream: BinaryIO) -> Any: - pass \ No newline at end of file + def parse( + self, + file_stream: BinaryIO, + ) -> Any: + pass diff --git a/backend/condition/parsing/peabody_parser.py b/backend/condition/parsing/peabody_parser.py index b8a548a7..4620ba82 100644 --- a/backend/condition/parsing/peabody_parser.py +++ b/backend/condition/parsing/peabody_parser.py @@ -1,26 +1,43 @@ -from typing import Any, BinaryIO, Dict, Iterator, List, Tuple, DefaultDict +import csv +from pathlib import Path +from typing import Any, BinaryIO, Dict, List, Optional, Tuple, DefaultDict from openpyxl import Workbook, load_workbook from collections import defaultdict +from backend.condition.lookups.uprn_lookup import UprnLookup from backend.condition.parsing.parser import Parser -from backend.condition.parsing.records.peabody.peabody_asset_condition import PeabodyAssetCondition +from backend.condition.parsing.records.peabody.peabody_asset_condition import ( + PeabodyAssetCondition, +) from backend.condition.parsing.records.peabody.peabody_property import PeabodyProperty from utils.logger import setup_logger logger = setup_logger() -class PeabodyParser(Parser): - def parse(self, file_stream: BinaryIO) -> Any: - wb: Workbook = load_workbook(file_stream) - address_to_uprn_map: Dict[str, int] = PeabodyParser._generate_address_to_uprn_dict(wb) - - assets = self._parse_assets(wb) - return self._group_assets_into_properties( - assets=assets, - address_to_uprn_map=address_to_uprn_map, +class PeabodyParser(Parser): + def __init__(self, uprn_lookup: UprnLookup): + self.uprn_lookup: UprnLookup = uprn_lookup # TODO: move 
def parse(
    self,
    file_stream: BinaryIO,
) -> Any:
    """Parse a Peabody condition workbook into properties keyed by UPRN.

    The stream is rewound, loaded as a read-only openpyxl workbook
    (``data_only=True``), converted into asset records, and those records
    are grouped into properties using the location-reference -> UPRN map
    supplied by ``self.uprn_lookup``.

    Args:
        file_stream: Binary stream containing the ``.xlsx`` file. It is
            rewound with ``seek(0)`` before loading.

    Returns:
        The result of ``_group_assets_into_properties``.
    """
    file_stream.seek(0)
    logger.debug("[PeabodyParser] Loading workbook...")
    wb: Workbook = load_workbook(file_stream, read_only=True, data_only=True)
    try:
        logger.debug("[PeabodyParser] Successfully loaded workbook. Parsing assets...")
        assets = PeabodyParser._parse_assets(wb)
    finally:
        # Read-only workbooks load lazily and must be closed explicitly.
        # The assets are already materialised into a list, so the workbook
        # is no longer needed past this point.
        wb.close()
    logger.debug(
        "[PeabodyParser] Successfully parsed assets. Parsing UPRN lookup..."
    )
    location_ref_to_uprn_map = self.uprn_lookup.get_property_ref_to_uprn_lookup()
    logger.debug("[PeabodyParser] Successfully parsed UPRN lookup")
    return PeabodyParser._group_assets_into_properties(
        assets=assets,
        location_ref_to_uprn_map=location_ref_to_uprn_map,
    )
asset.full_address.strip() - assets_by_address[address].append(asset) + assets_by_location_reference[asset.lo_reference].append(asset) properties: List[PeabodyProperty] = [] + failed_mappings_count = 0 - for address, grouped_assets in assets_by_address.items(): - uprn = address_to_uprn_map.get(address) + for location_ref, grouped_assets in assets_by_location_reference.items(): + + uprn = location_ref_to_uprn_map.get(location_ref) if uprn is None: - logger.warning(f"No UPRN found for address: {address}") + failed_mappings_count += 1 continue properties.append( @@ -75,9 +97,9 @@ class PeabodyParser(Parser): ) ) + logger.warning(f"No UPRN found for {failed_mappings_count} Location References") return properties - @staticmethod def _map_row_to_asset_record( row: Any | Tuple[object | None, ...], @@ -102,39 +124,9 @@ class PeabodyParser(Parser): condition_survey_date=row[header_indexes["condition_survey_date"]], ) - @staticmethod - def _generate_address_to_uprn_dict(wb: Workbook) -> Dict[str, int | None]: - sheet = wb["Survey Records - D & Lower"] - rows: Iterator[Tuple[object | None, ...]] = sheet.iter_rows(values_only=True) - - headers = next(rows) - header_indexes: Dict[str, int] = PeabodyParser._get_column_indexes_by_name(headers) - - address_idx = header_indexes["full_address"] - - - address_to_uprn: Dict[str, int] = {} - # Generate random UPRNs for now - next_uprn = 1 # TODO: get real UPRNs - - for row in rows: - address = row[address_idx] - - if address is None: - continue - - address = address.strip() - - if address not in address_to_uprn: - address_to_uprn[address] = next_uprn - next_uprn += 1 - - return address_to_uprn - - @staticmethod def _get_column_indexes_by_name( - headers: Tuple[object | None, ...] 
class ConditionPostgres:
    """Writes property condition surveys to Postgres through SQLModel."""

    def bulk_insert_surveys(
        self, surveys: List[PropertyConditionSurvey], batch_size: Optional[int] = 100
    ) -> None:
        """Map ``surveys`` to SQLModel objects and insert them in batches.

        Each batch is added to the session and committed on its own, so a
        failure part-way through leaves earlier batches persisted.

        Args:
            surveys: Domain surveys to persist.
            batch_size: Number of surveys per commit. ``None`` inserts
                everything in a single batch.

        Raises:
            ValueError: If ``batch_size`` is given and is smaller than 1.
        """
        # Validate before doing any work: a zero step makes range() raise
        # after the session is open, and a negative step yields an empty
        # range that would silently insert nothing.
        if batch_size is not None and batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer or None, got {batch_size}"
            )

        logger.debug(
            f"[ConditionPostgres] Preparing to load {len(surveys)} property surveys to Postgres. Mapping to SQLModel objects..."
        )
        survey_models: List[PropertyConditionSurveyModel] = [
            ConditionPostgres.map_survey_to_model(s) for s in surveys
        ]
        total: int = len(survey_models)
        # None means "one batch"; max(..., 1) keeps the range() step valid
        # when there is nothing to insert.
        step: int = batch_size if batch_size is not None else max(total, 1)
        logger.debug(
            f"[ConditionPostgres] Finished mapping {total} surveys. Writing to database in batches of {step}..."
        )

        with db_session() as session:
            logger.info("[ConditionPostgres] Successfully made connection to database")
            for start in range(0, total, step):
                end = min(start + step, total)
                batch = survey_models[start:end]

                t0: float = time.perf_counter()
                ConditionPostgres._insert_surveys_batch(batch, session)
                elapsed: float = time.perf_counter() - t0

                logger.info(
                    f"Inserted batch {start} - {end} ({len(batch)} surveys) in {elapsed} seconds",
                )

    @staticmethod
    def map_survey_to_model(
        survey: PropertyConditionSurvey,
    ) -> PropertyConditionSurveyModel:
        """Convert a domain survey into its SQLModel object graph.

        The survey's elements and each element's aspect conditions are
        copied field by field, preserving their order.
        """
        survey_model = PropertyConditionSurveyModel(
            uprn=survey.uprn,
            date=survey.date,
            source=survey.source,
            elements=[],
        )

        for element in survey.elements:
            element_model = ElementModel(
                element_type=element.element_type,
                element_instance=element.element_instance,
                aspect_conditions=[],
            )

            for aspect in element.aspect_conditions:
                aspect_model = AspectConditionModel(
                    aspect_type=aspect.aspect_type,
                    aspect_instance=aspect.aspect_instance,
                    value=aspect.value,
                    quantity=aspect.quantity,
                    install_date=aspect.install_date,
                    renewal_year=aspect.renewal_year,
                    comments=aspect.comments,
                )

                element_model.aspect_conditions.append(aspect_model)

            survey_model.elements.append(element_model)

        return survey_model

    @staticmethod
    def _insert_surveys_batch(
        surveys: List[PropertyConditionSurveyModel], session: Session
    ) -> None:
        """Add one batch of survey models to ``session`` and commit it."""
        session.add_all(surveys)
        session.commit()
setup_logger from backend.condition.domain.mapping.mapper import Mapper from backend.condition.domain.property_condition_survey import PropertyConditionSurvey from backend.condition.parsing.parser import Parser -from utils.logger import setup_logger -from backend.condition.file_type import FileType, detect_file_type +from backend.condition.persistence.condition_postgres import ConditionPostgres from backend.condition.parsing.factory import select_parser, select_mapper +logger = setup_logger() -def process_file(file_stream: BinaryIO, source_key: str) -> None: - print(f"[processor] Received file: {source_key}") +def process_file( + file_stream: BinaryIO, + file_type: ConditionFileType, + uprn_lookup: Optional[UprnLookup], +) -> None: # Instantiation - file_type: FileType = detect_file_type(source_key) - parser: Parser = select_parser(file_type) + logger.debug(f"[processor] Instantiating classes...") + parser: Parser = select_parser(file_type, uprn_lookup) mapper: Mapper = select_mapper(file_type) + persistence = ConditionPostgres() + + logger.debug(f"[processor] Finished instantiating classes. Calling Parser...") # Orchestration raw_properties: List[Any] = parser.parse(file_stream) + logger.info( + f"[processor] Finished loading customer survey data for {len(raw_properties)} properties. Mapping..." + ) + survey_year = datetime.now().year # TODO: get this from filepath or elsewhere property_condition_surveys: List[PropertyConditionSurvey] = [] @@ -29,4 +42,10 @@ def process_file(file_stream: BinaryIO, source_key: str) -> None: mapper.map_asset_conditions_for_property(p, survey_year) ) - print("done") # temp + logger.info( + f"[processor] Finished mapping {len(property_condition_surveys)} properties. Writing to database..." 
@staticmethod
def assert_property_condition_survey_model_matches_expected(
    actual_model: "PropertyConditionSurveyModel",
    expected: dict,
) -> None:
    """Assert that a mapped survey model matches a plain-dict expectation.

    Declared as a static method because it takes no ``self``; it can be
    called through the class or through an instance.

    Args:
        actual_model: Object exposing ``uprn``, ``date``, ``source`` and
            ``elements``; each element exposes ``element_type``,
            ``element_instance`` and ``aspect_conditions``.
        expected: Dict with keys ``uprn``, ``date``, ``source`` and
            ``elements``. Each element dict has ``element_type``,
            ``element_instance`` and ``aspects``. Only the keys present in
            an aspect dict are compared against the actual aspect.

    Raises:
        AssertionError: On the first mismatch, with a message naming the
            differing field.
    """
    assert actual_model.uprn == expected["uprn"], "UPRN differs"
    assert actual_model.date == expected["date"], "Date differs"
    assert actual_model.source == expected["source"], "Source differs"

    expected_elements = expected["elements"]
    assert len(actual_model.elements) == len(expected_elements), (
        f"Expected {len(expected_elements)} elements, "
        f"got {len(actual_model.elements)}"
    )

    for i, (actual_element, expected_element) in enumerate(
        zip(actual_model.elements, expected_elements)
    ):
        assert (
            actual_element.element_type == expected_element["element_type"]
        ), f"Element[{i}].element_type differs"
        assert (
            actual_element.element_instance == expected_element["element_instance"]
        ), f"Element[{i}].element_instance differs"

        expected_aspects = expected_element["aspects"]
        assert len(actual_element.aspect_conditions) == len(
            expected_aspects
        ), f"Element[{i}] aspect count differs"

        for j, (actual_aspect, expected_aspect) in enumerate(
            zip(actual_element.aspect_conditions, expected_aspects)
        ):
            prefix = f"Element[{i}].Aspect[{j}]"

            for key, value in expected_aspect.items():
                actual_value = getattr(actual_aspect, key)
                assert actual_value == value, (
                    f"{prefix}.{key} differs: "
                    f"{actual_value} != {value}"
                )
@pytest.fixture
def prop_ref_uprn_csv_file(tmp_path) -> str:
    """Write a small reference -> UPRN CSV and return its path as a string.

    The file lives under pytest's per-test ``tmp_path`` directory, so pytest
    cleans it up and no stray temp files are left behind.
    """
    csv_path = tmp_path / "prop_ref_uprn.csv"
    csv_path.write_text(
        "reference,out_uprn\n"
        "ABC123,10000000001\n"
        "DEF456,10000000002\n"
        "GHI789,10000000003\n",
        encoding="utf-8",
    )
    return str(csv_path)


def test_generate_prop_ref_uprn_from_csv_file(prop_ref_uprn_csv_file: str) -> None:
    """The CSV-backed lookup returns every reference mapped to its integer UPRN."""
    # arrange
    uprn_lookup = UprnLookupLocal(prop_ref_uprn_csv_file)
    expected_map: Dict[str, int] = {
        "ABC123": 10000000001,
        "DEF456": 10000000002,
        "GHI789": 10000000003,
    }

    # act
    actual_map: Dict[str, int] = uprn_lookup.get_property_ref_to_uprn_lookup()

    # assert
    assert actual_map == expected_map
arrange - file_type = FileType.Peabody + file_type = ConditionFileType.Peabody expected_class_name = "PeabodyParser" + uprn_lookup = UprnLookupLocal(csv_path="test") # act - actual_class_name = select_parser(file_type).__class__.__name__ + actual_class_name = select_parser(file_type, uprn_lookup).__class__.__name__ # assert - assert expected_class_name == actual_class_name \ No newline at end of file + assert expected_class_name == actual_class_name diff --git a/backend/condition/tests/parsing/test_peabody_parser.py b/backend/condition/tests/parsing/test_peabody_parser.py index 32ff79d8..5fb42204 100644 --- a/backend/condition/tests/parsing/test_peabody_parser.py +++ b/backend/condition/tests/parsing/test_peabody_parser.py @@ -1,127 +1,143 @@ +from tempfile import NamedTemporaryFile import pytest -from typing import Any +from typing import Any, Dict from io import BytesIO from openpyxl import Workbook from datetime import datetime +from backend.condition.lookups.uprn_lookup_csv import UprnLookupLocal from backend.condition.parsing.peabody_parser import PeabodyParser -from backend.condition.parsing.records.peabody.peabody_asset_condition import PeabodyAssetCondition +from backend.condition.parsing.records.peabody.peabody_asset_condition import ( + PeabodyAssetCondition, +) from backend.condition.parsing.records.peabody.peabody_property import PeabodyProperty + @pytest.fixture def peabody_assets_xlsx_bytes() -> BytesIO: wb = Workbook() survey_records_d_and_lower = wb.active survey_records_d_and_lower.title = "Survey Records - D & Lower" - survey_records_d_and_lower.append([ - "Lo_Reference", - "full_address", - "location_type_code", - "Parent_Lo_Reference", - "Element_Code", - "Element", - "Sub_Element_Code", - "Sub_Element", - "Material_Code", - "material_or_answer", - "Renewal_Quantity", - "Renewal_Year", - "Renewal_Cost", - "cloned", - "lo_type_code", - "condition_survey_date", - ]) - survey_records_d_and_lower.append([ - "B000RAND", - "1 RANDOM HOUSE LONDON", - 
3, - "RAND2EST", - 110, - "ROOFS", - 1, - "Primary Roof", - 9, - "Other", - 3, - 2054, - 330, - "N", - 3, - datetime(2025,12,4,9,17,0) - ]) - survey_records_d_and_lower.append([ - "B000BLOCK", - "1100 BLOCK", - 3, - "RAND2EST", - 110, - "ROOFS", - 1, - "Primary Roof", - 9, - "Other", - 3, - 2054, - 330, - "N", - 3, - datetime(2025,12,4,9,17,0) - ]) - survey_records_d_and_lower.append([ - "B000FAKE", - "3 FAKE CLOSE LONDON", - 3, - "FAKEEST", - 100, - "GENERAL", - 15, - "External Decoration", - 2, - "Normal", - 1, - 2035, - 1500.7, - "N", - 3, - datetime(2025,7,5,0,0,0) - ]) - survey_records_d_and_lower.append([ - "B000MIS", - "99 MISC ROAD LONDON", - 3, - "300828", - 54, - "HHSRS", - 29, - "HHSRS Structural Collapse & Falling Elements", - 4, - "HHSRS Moderate", - 2, - 2027, - None, - "N", - 3, - None - ]) - survey_records_d_and_lower.append([ - "B000MIS", - "99 MISC ROAD LONDON", - 3, - "300828", - 53, - "External", - 2, - "Chimney", - 2, - "Present", - 33, - 2053, - 3531, - "N", - 3, - None - ]) - + survey_records_d_and_lower.append( + [ + "Lo_Reference", + "full_address", + "location_type_code", + "Parent_Lo_Reference", + "Element_Code", + "Element", + "Sub_Element_Code", + "Sub_Element", + "Material_Code", + "material_or_answer", + "Renewal_Quantity", + "Renewal_Year", + "Renewal_Cost", + "cloned", + "lo_type_code", + "condition_survey_date", + ] + ) + survey_records_d_and_lower.append( + [ + "B000RAND", + "1 RANDOM HOUSE LONDON", + 3, + "RAND2EST", + 110, + "ROOFS", + 1, + "Primary Roof", + 9, + "Other", + 3, + 2054, + 330, + "N", + 3, + datetime(2025, 12, 4, 9, 17, 0), + ] + ) + survey_records_d_and_lower.append( + [ + "B000BLOCK", + "1100 BLOCK", + 3, + "RAND2EST", + 110, + "ROOFS", + 1, + "Primary Roof", + 9, + "Other", + 3, + 2054, + 330, + "N", + 3, + datetime(2025, 12, 4, 9, 17, 0), + ] + ) + survey_records_d_and_lower.append( + [ + "B000FAKE", + "3 FAKE CLOSE LONDON", + 3, + "FAKEEST", + 100, + "GENERAL", + 15, + "External Decoration", + 2, + "Normal", + 
1, + 2035, + 1500.7, + "N", + 3, + datetime(2025, 7, 5, 0, 0, 0), + ] + ) + survey_records_d_and_lower.append( + [ + "B000MIS", + "99 MISC ROAD LONDON", + 3, + "300828", + 54, + "HHSRS", + 29, + "HHSRS Structural Collapse & Falling Elements", + 4, + "HHSRS Moderate", + 2, + 2027, + None, + "N", + 3, + None, + ] + ) + survey_records_d_and_lower.append( + [ + "B000MIS", + "99 MISC ROAD LONDON", + 3, + "300828", + 53, + "External", + 2, + "Chimney", + 2, + "Present", + 33, + 2053, + 3531, + "N", + 3, + None, + ] + ) stream = BytesIO() wb.save(stream) @@ -129,9 +145,27 @@ def peabody_assets_xlsx_bytes() -> BytesIO: return stream -def test_peabody_parser_parses_conditions(peabody_assets_xlsx_bytes): + +@pytest.fixture +def prop_ref_uprn_csv_file() -> str: + csv_content = """reference,out_uprn + B000RAND,1 + B000BLOCK,2 + B000FAKE,3 + B000MIS,4 + """ + with NamedTemporaryFile(mode="w+", delete=False, suffix=".csv") as tmp: + tmp.write(csv_content) + tmp.flush() + return tmp.name + + +def test_peabody_parser_parses_conditions( + peabody_assets_xlsx_bytes, prop_ref_uprn_csv_file +): # arrange - parser = PeabodyParser() + uprn_lookup = UprnLookupLocal(csv_path=prop_ref_uprn_csv_file) + parser = PeabodyParser(uprn_lookup=uprn_lookup) # act result: Any = parser.parse(peabody_assets_xlsx_bytes) @@ -141,6 +175,7 @@ def test_peabody_parser_parses_conditions(peabody_assets_xlsx_bytes): assert all(isinstance(item, PeabodyProperty) for item in result) + @pytest.fixture def asset_condition_factory(): def _factory(full_address: str) -> PeabodyAssetCondition: @@ -165,6 +200,7 @@ def asset_condition_factory(): return _factory + @pytest.mark.parametrize( "full_address, expected_block_level", [ @@ -175,7 +211,7 @@ def asset_condition_factory(): ("81A-B GORE ROAD LONDON", True), ("73 & 74 HARVEST COURT ST. 
def _aspect_fields(aspect_type, aspect_instance, value, quantity, renewal_year) -> dict:
    """Return the complete set of aspect attributes for one aspect condition.

    The same dict feeds both the input ``AspectCondition`` and the expected
    values, so no field (``aspect_type`` included) can be left out of the
    comparison by accident.
    """
    return {
        "aspect_type": aspect_type,
        "aspect_instance": aspect_instance,
        "value": value,
        "quantity": quantity,
        "install_date": None,
        "renewal_year": renewal_year,
        "comments": None,
    }


def test_map_survey_to_model() -> None:
    """``map_survey_to_model`` copies every survey, element and aspect field."""
    # arrange
    element_specs = [
        (
            ElementType.EXTERNAL_WINDOWS,
            1,
            [_aspect_fields(AspectType.MATERIAL, 1, "UPVC Double Glazed", 8, 2036)],
        ),
        (
            ElementType.EXTERNAL_DECORATION,
            1,
            [_aspect_fields(AspectType.CONDITION, 1, "Normal", 1, 2029)],
        ),
        (
            ElementType.EXTERNAL_WALL,
            1,
            [
                _aspect_fields(AspectType.FINISH, 1, "Pointed", 65, 2045),
                _aspect_fields(AspectType.FINISH, 1, "Pointing", 1, 2069),
                _aspect_fields(AspectType.FINISH, 2, "Tile Hung", 8, 2049),
            ],
        ),
    ]

    survey = PropertyConditionSurvey(
        uprn=1,
        elements=[
            Element(
                element_type=element_type,
                element_instance=element_instance,
                aspect_conditions=[AspectCondition(**fields) for fields in aspects],
            )
            for element_type, element_instance, aspects in element_specs
        ],
        date=date(2000, 1, 1),
        source="Peabody",
    )

    expected = {
        "uprn": 1,
        "date": date(2000, 1, 1),
        "source": "Peabody",
        "elements": [
            {
                "element_type": element_type,
                "element_instance": element_instance,
                "aspects": aspects,
            }
            for element_type, element_instance, aspects in element_specs
        ],
    }

    # act
    model: PropertyConditionSurveyModel = ConditionPostgres.map_survey_to_model(survey)

    # assert (survey level)
    CustomAsserts.assert_property_condition_survey_model_matches_expected(
        model,
        expected,
    )
"heat_demand_ending", + "carbon_ending" + ] + ) + # TODO: Temp putting this here recommendations_scoring_data["is_post_sap10_ending"] = True all_predictions = await model_api.async_paginated_predictions( diff --git a/backend/engine/requirements.txt b/backend/engine/requirements.txt index b565e9d3..5cca1211 100644 --- a/backend/engine/requirements.txt +++ b/backend/engine/requirements.txt @@ -1,3 +1,4 @@ + # Pandas and numpy numpy==2.1.2 pandas==2.2.3 @@ -22,4 +23,4 @@ pyarrow==17.0.0 fastparquet==2024.5.0 aiohttp==3.10.10 # find my epc -beautifulsoup4 +beautifulsoup4 \ No newline at end of file diff --git a/backend/ml_models/api.py b/backend/ml_models/api.py index daf4b715..440367b2 100644 --- a/backend/ml_models/api.py +++ b/backend/ml_models/api.py @@ -313,4 +313,15 @@ class ModelApi: logger.error(f"Batch {chunk}-{chunk + batch_size} failed (Attempt {attempts}): {e}") await asyncio.sleep(2 ** attempts) # exponential backoff await self.close_aiohttp_session() + + # Ensure stable output structure for the datagrame to be utilised by other functions downstream + for k in all_predictions.keys(): + if all_predictions[k].empty: + col_template = ['id', 'predictions', 'property_id', 'recommendation_id', 'phase'] if ( + extract_ids) else ['id', 'predictions'] + + all_predictions[k] = pd.DataFrame( + columns=col_template + ) + return all_predictions diff --git a/backend/onboarders/README.md b/backend/onboarders/README.md new file mode 100644 index 00000000..063fee20 --- /dev/null +++ b/backend/onboarders/README.md @@ -0,0 +1,102 @@ +# Retrofit Property Data Onboarding + +This repository contains an ETL pipeline for transforming raw retrofit property data from external source systems ( +currently Parity) into a standardised internal format, compatible for both address2uprn and engine. 
+ +The pipeline is designed to: + +- Run as an AWS Lambda triggered by SQS +- Read raw CSV/XLSX files from S3 +- Perform rule-based mappings +- Infer as built property attributes, assumed based on age +- Output a processed csv, back to s3 to be consumed by address2uprn + +### Structure + +SQS → Lambda handler → OnboarderFactory → System-specific Onboarder → Mapping → CSV to S3 + +Each source system implements its own **Onboarder**, while sharing a common base and mapping process. + +--- + +### Repository Structure + +onboarders/ +├── `handler.py` # Lambda entrypoint \ +├── `factory.py` # Onboarder factory \ +├── `base.py` # Shared onboarding base class \ +├── `parity.py` # Parity-specific transformation logic \ +├── `mappings/` \ +│ └── `parity/` # Parity domain mappings & classifiers \ +│ ├── `age_band.py` \ +│ ├── `property_type.py` \ +│ ├── `built_form.py` \ +│ ├── `walls.py` \ +│ ├── `roof.py` \ +│ ├── `floor.py` \ +│ ├── `glazing.py` \ +│ ├── `heating.py` \ +│ ├── `as_built_wall_classifiers.py` \ +│ ├── `as_built_roof_classifiers.py` \ +│ └── `as_built_floor_classifiers.py` \ +├── `tests/` \ +├── `requirements.txt` \ +└── `README.md` + + +--- + +### Lambda Entry Point (`handler.py`) + +The Lambda handler: + +1. Consumes SQS queue +2. Validates the payload +3. Instantiates the correct onboarder via `OnboarderFactory` +4. Runs the transformation +5. Writes the transformed CSV back to S3 + +### Expected Event Payload + +```json +{ + "s3_uri": "s3://bucket/path/to/input.xlsx", + "system": "parity", + "format": "xlsx", + "sheet_name": "Sustainability" +} + +``` + +### Onboarder Base `(base.py)` + +OnboarderBase provides shared functionality across all systems. 
+ +*Responsibilities* + +- Reading CSV/XLSX files from S3 +- Writing transformed CSVs to S3 +- Defining canonical output column names +- Providing validation helpers +- Common output - for the moment, onboards will be expected to return a csv + +### Parity Onboarder `(parity.py)` + +`ParityOnboarder` contains all Parity-specific transformation logic. + +Responsibilities* + +- Map raw Parity fields to internal EPC-aligned enums +- Infer “as-built” constructions using age bands when insulation data is missing +- Resolve energy efficiency ratings deterministically +- Normalise output into a fixed schema + +The `transform()` method orchestrates the transformation process. + +### TODOs + +- In `backend/onboarders/mappings/parity/glazing.py` we currently map the partiy descriptions + to duples of descriptions and efficiency ratings. This is okay for the moment but we may consider + using a data class, just given how error-prone this is. +- This is also true for heating mappings in `backend/onboarders/mappings/parity/heating.py` +- Implement a AI-enabled version, to replace the standardised asset list \ No newline at end of file diff --git a/backend/onboarders/__init__.py b/backend/onboarders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/onboarders/base.py b/backend/onboarders/base.py new file mode 100644 index 00000000..03cb2370 --- /dev/null +++ b/backend/onboarders/base.py @@ -0,0 +1,84 @@ +import pandas as pd +from utils.s3 import read_from_s3, read_excel_from_s3, save_csv_to_s3 + + +class OnboarderBase: + # Input dataset to be transformed + data: pd.DataFrame | None = None + bucket_name = None + input_file_name = None + output_file_name = None + # Description columns + landlord_wall_construction: str = "landlord_wall_construction" + landlord_roof_construction: str = "landlord_roof_construction" + landlord_floor_construction: str = "landlord_floor_construction" + landlord_windows_type: str = "landlord_windows_type" + 
landlord_heating_construction: str = "landlord_heating_construction" + landlord_fuel_type: str = "landlord_fuel_type" + landlord_heating_controls: str = "landlord_heating_controls" + landlord_hot_water_system: str = "landlord_hot_water_system" + + # Efficiency columns + landlord_roof_efficiency: str = "landlord_roof_efficiency" + landlord_windows_efficiency: str = "landlord_windows_efficiency" + landlord_heating_controls_efficiency: str = "landlord_heating_controls_efficiency" + landlord_heating_efficiency: str = "landlord_heating_efficiency" + landlord_hot_water_efficiency: str = "landlord_hot_water_efficiency" + landlord_wall_efficiency: str = "landlord_wall_efficiency" + + # Additional windows features + landlord_multi_glaze_proportion: str = "landlord_multi_glaze_proportion" + landlord_glazed_type: str = "landlord_glazed_type" + landlord_glazed_area: str = "landlord_glazed_area" + + # Additional roof features + landlord_has_sloping_ceiling: str = "landlord_has_sloping_ceiling" + + # Shape, dimensions, age + landlord_total_floor_area_m2: str = "landlord_total_floor_area_m2" + landlord_construction_age_band: str = "landlord_construction_age_band" + landlord_property_type: str = "landlord_property_type" + landlord_built_form: str = "landlord_built_form" + + def read_s3(self, file_format, **kwargs): + + if self.input_file_name is None or self.bucket_name is None: + raise ValueError("Bucket name and input file name must be set before reading from S3.") + if file_format == "xlsx": + self.data = read_excel_from_s3( + bucket_name=self.bucket_name, + file_key=self.input_file_name, + sheet_name=kwargs.get("sheet_name"), + header_row=kwargs.get("header_row", 0) + ) + else: + self.data = read_from_s3(bucket_name=self.bucket_name, s3_file_name=self.input_file_name) + + def write(self): + if self.data is None: + raise ValueError("No data to write. 
Please run transform() before writing.") + + if self.bucket_name is None or self.output_file_name is None: + raise ValueError("Bucket name and output file name must be set before writing to S3.") + # Store file as csv - will store in the same route location as the input file + save_csv_to_s3(dataframe=self.data, bucket_name=self.bucket_name, file_name=self.output_file_name) + + @staticmethod + def assert_nulls_only_from_source_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool: + # We only allow nulls if the original value was null + null_vals = data[pd.isnull(data[mapped_column])] + if null_vals.empty: + return True + # We make sure all original values were null + assert pd.isnull(null_vals[original_column]).all(), ( + f"Some values in {mapped_column} were not mapped, but original values were not null" + ) + + @staticmethod + def assert_no_nulls(data: pd.DataFrame, column: str): + assert pd.isnull(data[column]).sum() == 0, f"column {column} contains null values, but should not" + + def map_construction_age_band(self): + raise NotImplementedError( + "This method should be implemented by subclasses to map construction age bands to descriptions" + ) diff --git a/backend/onboarders/factory.py b/backend/onboarders/factory.py new file mode 100644 index 00000000..2ff7dcbc --- /dev/null +++ b/backend/onboarders/factory.py @@ -0,0 +1,10 @@ +from onboarders.parity import ParityOnboarder + + +class OnboarderFactory: + @staticmethod + def create_onboarder(onboarder_type, **kwargs): + if onboarder_type == "parity": + return ParityOnboarder(**kwargs) + + raise ValueError(f"Unknown onboarder type: {onboarder_type}") diff --git a/backend/onboarders/handler.py b/backend/onboarders/handler.py new file mode 100644 index 00000000..d66b5796 --- /dev/null +++ b/backend/onboarders/handler.py @@ -0,0 +1,50 @@ +import json +from pydantic import BaseModel, Field +from typing import Optional, Literal +from onboarders.factory import OnboarderFactory +from 
utils.logger import setup_logger + +logger = setup_logger() + + +class OnboardingEvent(BaseModel): + s3_uri: str = Field(..., description="S3 URI of the raw ARA input file") + system: Literal["parity", "generic"] = Field(..., description="Onboarding system identifier") + format: Literal["csv", "xlsx"] + sheet_name: Optional[str] = None + + +def handler(event, context): + """ + Lambda handler that triggers the model engine for each SQS message. + """ + for record in event.get("Records", []): + try: + event_body = json.loads(record["body"]) + # Sample input data + # event_body = { + # "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for " + # "Domna.xlsx", + # "system": "parity", + # "format": "xlsx", + # "sheet_name": "Sustainability" + # } + + logger.info("Processing record with body: %s", event_body) + + validated_event = OnboardingEvent(**event_body) + onboarder = OnboarderFactory.create_onboarder( + validated_event.system, + fileuri=validated_event.s3_uri, + format=validated_event.format, + sheet_name=validated_event.sheet_name, + file_format=validated_event.format + ) + + logger.info("Transforming data") + onboarder.transform() + logger.info(f"Writing data to {onboarder.output_file_name}, bucket: {onboarder.bucket_name}") + onboarder.write() + + except Exception as e: + logger.error(f"Failed to process record: {e}") diff --git a/backend/onboarders/mappings/age_band.py b/backend/onboarders/mappings/age_band.py deleted file mode 100644 index 2487c921..00000000 --- a/backend/onboarders/mappings/age_band.py +++ /dev/null @@ -1,14 +0,0 @@ -party_map = { - "Before 1900": 'England and Wales: before 1900', - "1900-1929": 'England and Wales: 1900-1929', - "1930-1949": 'England and Wales: 1930-1949', - "1950-1966": 'England and Wales: 1950-1966', - "1967-1975": 'England and Wales: 1967-1975', - "1976-1982": 'England and Wales: 1976-1982', - "1983-1990": 'England and Wales: 1983-1990', - "1991-1995": 'England and Wales: 
1991-1995', - "1996-2002": 'England and Wales: 1996-2002', - "2003-2006": 'England and Wales: 2003-2006', - "2007-2011": 'England and Wales: 2007-2011', - "2012 onwards": 'England and Wales: 2012-2021', -} diff --git a/backend/onboarders/mappings/built_form.py b/backend/onboarders/mappings/built_form.py deleted file mode 100644 index 23901fc6..00000000 --- a/backend/onboarders/mappings/built_form.py +++ /dev/null @@ -1,15 +0,0 @@ -parity_map = { - "MidTerrace": "Mid-Terrace", - "EndTerrace": "End-Terrace", - "Detached": "Detached", - "SemiDetached": "Semi-Detached", - "EnclosedMidTerrace": "Enclosed Mid-Terrace", - "EnclosedEndTerrace": "Enclosed End-Terrace", -} - -# MidTerrace 41462 -# EndTerrace 20910 -# Detached 16875 -# SemiDetached 14725 -# EnclosedMidTerrace 3176 -# EnclosedEndTerrace 2393 diff --git a/backend/onboarders/mappings/parity/age_band.py b/backend/onboarders/mappings/parity/age_band.py new file mode 100644 index 00000000..406d39c1 --- /dev/null +++ b/backend/onboarders/mappings/parity/age_band.py @@ -0,0 +1,19 @@ +from datatypes.epc.construction_age_band import EpcConstructionAgeBand + +parity_map = { + "Before 1900": EpcConstructionAgeBand.before_1900, + "1900-1929": EpcConstructionAgeBand.from_1900_to_1929, + "1930-1949": EpcConstructionAgeBand.from_1930_to_1949, + "1950-1966": EpcConstructionAgeBand.from_1950_to_1966, + "1967-1975": EpcConstructionAgeBand.from_1967_to_1975, + "1976-1982": EpcConstructionAgeBand.from_1976_to_1982, + "1983-1990": EpcConstructionAgeBand.from_1983_to_1990, + "1991-1995": EpcConstructionAgeBand.from_1991_to_1995, + "1996-2002": EpcConstructionAgeBand.from_1996_to_2002, + "2003-2006": EpcConstructionAgeBand.from_2003_to_2006, + "2007-2011": EpcConstructionAgeBand.from_2007_to_2011, + "2012 onwards": EpcConstructionAgeBand.from_2012_onwards, + # Newer age bands, under SAP10 + "2012-2022": EpcConstructionAgeBand.from_2012_to_2022, + "2023 onwards": EpcConstructionAgeBand.from_2023_onwards, +} diff --git 
a/backend/onboarders/mappings/parity/as_built_floor_classifiers.py b/backend/onboarders/mappings/parity/as_built_floor_classifiers.py new file mode 100644 index 00000000..3af3c079 --- /dev/null +++ b/backend/onboarders/mappings/parity/as_built_floor_classifiers.py @@ -0,0 +1,60 @@ +from datatypes.epc.construction_age_band import EpcConstructionAgeBand +from datatypes.epc.floor import EpcFloorDescriptions + + +def unknown_floor_as_built(age_band: EpcConstructionAgeBand) -> EpcFloorDescriptions: + year = age_band.start_year() + + if year >= 2003: + return EpcFloorDescriptions.solid_insulated_assumed + + if year >= 1996: + return EpcFloorDescriptions.solid_limited_insulation_assumed + + if year >= 1930: + return EpcFloorDescriptions.solid_no_insulation_assumed + + return EpcFloorDescriptions.suspended_no_insulation_assumed + + +def unknown_floor_retrofitted(age_band: EpcConstructionAgeBand) -> EpcFloorDescriptions: + year = age_band.start_year() + + if year >= 1930: + return EpcFloorDescriptions.solid_insulated + + return EpcFloorDescriptions.suspended_insulated + + +def map_solid_floor_as_built(age_band: EpcConstructionAgeBand) -> EpcFloorDescriptions: + year = age_band.start_year() + + if year >= 2003: + return EpcFloorDescriptions.solid_insulated_assumed + if year >= 1996: + return EpcFloorDescriptions.solid_limited_insulation_assumed + return EpcFloorDescriptions.solid_no_insulation_assumed + + +def map_suspended_floor_as_built(age_band: EpcConstructionAgeBand) -> EpcFloorDescriptions: + year = age_band.start_year() + + if year >= 2003: + return EpcFloorDescriptions.suspended_insulated_assumed + if year >= 1996: + return EpcFloorDescriptions.suspended_limited_insulation_assumed + + return EpcFloorDescriptions.suspended_no_insulation_assumed + + +as_built_floor_classifiers = { + "Solid": map_solid_floor_as_built, + "SuspendedTimber": map_suspended_floor_as_built, + "SuspendedNotTimber": map_suspended_floor_as_built, +} + +unknown_as_built_floor_classifiers = { + 
"RetroFitted": unknown_floor_retrofitted, + "AsBuilt": unknown_floor_as_built, + "Unknown": unknown_floor_as_built, +} diff --git a/backend/onboarders/mappings/parity/as_built_roof_classifiers.py b/backend/onboarders/mappings/parity/as_built_roof_classifiers.py new file mode 100644 index 00000000..fcb554bd --- /dev/null +++ b/backend/onboarders/mappings/parity/as_built_roof_classifiers.py @@ -0,0 +1,56 @@ +from datatypes.epc.roof import EpcRoofDescriptions +from datatypes.epc.construction_age_band import EpcConstructionAgeBand + + +def map_flat_roof(age_band: EpcConstructionAgeBand) -> EpcRoofDescriptions: + """ + For a flat, as built roof, these are the breakdowns: + + 2023 onwards → Flat, insulated + 2003–2022 → Flat, insulated + 1983–2002 → Flat, insulated + 1976–1982 → Flat, limited insulation + 1967–1975 → Flat, limited insulation + 1950–1966 and earlier → Flat, no insulation + :param age_band: Input age band + :return: EpcRoofDescriptions + """ + + year = age_band.start_year() + + if year >= 1983: + return EpcRoofDescriptions.flat_insulated + + if year >= 1967: + return EpcRoofDescriptions.flat_limited_insulation + + return EpcRoofDescriptions.flat_no_insulation + + +def map_sloping_ceiling_roof(age_band: EpcConstructionAgeBand) -> EpcRoofDescriptions: + """ + For a sloping ceiling, as built roof, these are the breakdowns: + 2023 onwards → Sloping pitched, insulated + 2003–2022 → Sloping pitched, insulated + 1983–2002 → Sloping pitched, insulated + 1976–1982 → Sloping pitched, limited insulation + 1967–1975 and earlier → Sloping pitched, no insulation + :param age_band: Input age band + :return: EpcRoofDescriptions + """ + year = age_band.start_year() + + if year >= 1983: + return EpcRoofDescriptions.sloping_pitched_insulated + + if year >= 1976: + return EpcRoofDescriptions.sloping_pitched_limited_insulation + + return EpcRoofDescriptions.sloping_pitched_no_insulation + + +as_built_roof_classifiers = { + # Only need to apply this to flat and sloping ceiling 
roofs + "Flat": map_flat_roof, + "PitchedWithSlopingCeiling": map_sloping_ceiling_roof, +} diff --git a/backend/onboarders/mappings/parity/as_built_wall_classifiers.py b/backend/onboarders/mappings/parity/as_built_wall_classifiers.py new file mode 100644 index 00000000..480a7e24 --- /dev/null +++ b/backend/onboarders/mappings/parity/as_built_wall_classifiers.py @@ -0,0 +1,113 @@ +from datatypes.epc.construction_age_band import EpcConstructionAgeBand +from datatypes.epc.walls import EpcWallDescriptions + + +def map_cavity_wall_insulation(age_band: EpcConstructionAgeBand): + if age_band.start_year() < 1976: + return EpcWallDescriptions.cavity_no_insulation_assumed + + if age_band == EpcConstructionAgeBand.from_1976_to_1982: + return EpcWallDescriptions.cavity_partial_insulated_assumed + + if age_band in EpcConstructionAgeBand.from_year_onwards(1983): + return EpcWallDescriptions.cavity_insulated_assumed + + raise NotImplementedError(f"Age band {age_band} not handled for cavity wall as built insulation mapping") + + +def map_solid_wall_insulation(age_band: EpcConstructionAgeBand): + if age_band.start_year() < 1976: + return EpcWallDescriptions.solid_brick_no_insulation_assumed + + if age_band == EpcConstructionAgeBand.from_1976_to_1982: + return EpcWallDescriptions.solid_brick_partial_insulated_assumed + + if age_band in EpcConstructionAgeBand.from_year_onwards(1983): + return EpcWallDescriptions.solid_brick_insulated_assumed + + raise NotImplementedError( + f"Age band {age_band.value} not handled for solid wall insulation mapping" + ) + + +def map_timber_frame_wall_insulation(age_band: EpcConstructionAgeBand): + if age_band.start_year() < 1950: + return EpcWallDescriptions.timber_frame_no_insulation_assumed + + if age_band.start_year() < 1976: + return EpcWallDescriptions.timber_frame_partial_insulated_assumed + + if age_band in EpcConstructionAgeBand.from_year_onwards(1976): + return EpcWallDescriptions.timber_frame_insulated_assumed + + raise NotImplementedError( + 
f"Age band {age_band.value} not handled for timber frame wall insulation mapping" + ) + + +def map_system_build_wall_insulation(age_band: EpcConstructionAgeBand): + if age_band.start_year() < 1976: + return EpcWallDescriptions.system_no_insulation_assumed + + if age_band == EpcConstructionAgeBand.from_1976_to_1982: + return EpcWallDescriptions.system_partial_insulated_assumed + + if age_band in EpcConstructionAgeBand.from_year_onwards(1983): + return EpcWallDescriptions.system_insulated_assumed + + raise NotImplementedError( + f"Age band {age_band.value} not handled for system build wall insulation mapping" + ) + + +def map_granite_wall_insulation(age_band: EpcConstructionAgeBand): + if age_band.start_year() < 1976: + return EpcWallDescriptions.granite_whinstone_no_insulation_assumed + + if age_band == EpcConstructionAgeBand.from_1976_to_1982: + return EpcWallDescriptions.granite_whinstone_partial_insulated_assumed + + if age_band in EpcConstructionAgeBand.from_year_onwards(1983): + return EpcWallDescriptions.granite_whinestone_insulated_assumed + + raise NotImplementedError( + f"Age band {age_band.value} not handled for granite wall insulation mapping" + ) + + +def map_sandstone_wall_insulation(age_band: EpcConstructionAgeBand): + if age_band.start_year() < 1976: + return EpcWallDescriptions.sandstone_limestone_no_insulation_assumed + + if age_band == EpcConstructionAgeBand.from_1976_to_1982: + return EpcWallDescriptions.sandstone_limestone_partial_insulated_assumed + + if age_band in EpcConstructionAgeBand.from_year_onwards(1983): + return EpcWallDescriptions.sandstone_limestone_insulated_assumed + + raise NotImplementedError( + f"Age band {age_band.value} not handled for sandstone wall insulation mapping" + ) + + +def map_cob_wall_insulation(age_band: EpcConstructionAgeBand): + if age_band.start_year() < 1983: + return EpcWallDescriptions.cob_as_built_average + + if age_band in EpcConstructionAgeBand.from_year_onwards(1983): + return 
EpcWallDescriptions.cob_as_built_good + + raise NotImplementedError( + f"Age band {age_band.value} not handled for cob wall insulation mapping" + ) + + +as_built_wall_classifiers = { + "Cavity": map_cavity_wall_insulation, + "Solid Brick": map_solid_wall_insulation, + "Timber Frame": map_timber_frame_wall_insulation, + "System": map_system_build_wall_insulation, + "Granite": map_granite_wall_insulation, + "Sandstone": map_sandstone_wall_insulation, + "Cob": map_cob_wall_insulation, +} diff --git a/backend/onboarders/mappings/parity/built_form.py b/backend/onboarders/mappings/parity/built_form.py new file mode 100644 index 00000000..12ae6360 --- /dev/null +++ b/backend/onboarders/mappings/parity/built_form.py @@ -0,0 +1,10 @@ +from datatypes.epc.property_type_built_form import BuiltForm + +parity_map = { + "MidTerrace": BuiltForm.mid_terrace, + "EndTerrace": BuiltForm.end_terrace, + "Detached": BuiltForm.detached, + "SemiDetached": BuiltForm.semi_detached, + "EnclosedMidTerrace": BuiltForm.enclosed_mid_terrace, + "EnclosedEndTerrace": BuiltForm.enclosed_end_terrace, +} diff --git a/backend/onboarders/mappings/parity/floor.py b/backend/onboarders/mappings/parity/floor.py new file mode 100644 index 00000000..653d4c68 --- /dev/null +++ b/backend/onboarders/mappings/parity/floor.py @@ -0,0 +1,26 @@ +from numpy import nan +from datatypes.epc.floor import EpcFloorDescriptions + +floor_map = { + # Solid floor + ('Solid', 'AsBuilt'): None, # Mapped + ('Solid', 'Unknown'): None, # Mapped + ('Solid', nan): None, # Mapped + ('Solid', 'RetroFitted'): EpcFloorDescriptions.solid_insulated, + + # Suspended floor + ('SuspendedTimber', nan): None, # Mapped suspended_floor_as_built + ('SuspendedTimber', 'AsBuilt'): None, # Mapped suspended_floor_as_built + ('SuspendedTimber', 'RetroFitted'): EpcFloorDescriptions.suspended_insulated, + ('SuspendedTimber', 'Unknown'): None, # Mapped suspended_floor_as_built + ('SuspendedNotTimber', 'RetroFitted'): 
EpcFloorDescriptions.suspended_insulated, + ('SuspendedNotTimber', nan): None, # Mapped suspended_floor_as_built + ('SuspendedNotTimber', 'Unknown'): None, # Mapped suspended_floor_as_built + ('SuspendedNotTimber', 'AsBuilt'): None, # Mapped suspended_floor_as_built + + # Unknown type - mapped on age + ('Unknown', 'Unknown'): None, # Mapped unknown_floor_as_built + ('Unknown', 'RetroFitted'): None, # Mapped unknown_floor_retrofitted + (nan, nan): None, # No actual information! + ('Unknown', 'AsBuilt'): None, # Mapped unknown_floor_as_built +} diff --git a/backend/onboarders/mappings/parity/glazing.py b/backend/onboarders/mappings/parity/glazing.py new file mode 100644 index 00000000..46c006bd --- /dev/null +++ b/backend/onboarders/mappings/parity/glazing.py @@ -0,0 +1,20 @@ +from datatypes.epc.efficiency import EpcEfficiency + +glazing_map = { + # (description, energy efficiency, multi_glaze_proportion, glazed_type, glazed_area + # For SAP 10 assessments, The glazed type and glazed area are not populated in the EPC API data any more + "Double 2002 or later": ("Fully double glazed", EpcEfficiency.AVERAGE, 1, None, None), + "Double before 2002": ("Fully double glazed", EpcEfficiency.POOR, 1, None, None), + "Double but age unknown": ("Fully double glazed", EpcEfficiency.POOR, 1, None, None), + "Single": ("Single glazed", EpcEfficiency.VERY_POOR, 0, None, None), + # For triple glazing, with age unknown, the performance is only average, whereas if it's a post 2022 + # installation, it's classed as high performance glazing with good efficiency. We'll need to be considerate as to + # how we make updates to the windows data. 
+ # Triple known data is high performance glazing with Good efficiency (at least) + "Triple": ("Fully triple glazed", EpcEfficiency.AVERAGE, 1, None, None), + # This is also classed as high performance glazing + "DoubleKnownData": ("High performance glazing", EpcEfficiency.GOOD, 1, None, None), + # Under SAP 10, secondary glazing is classed as poor efficiency (whereas under SAP 2012 it was generally good) + "Secondary": ("Full secondary glazing", EpcEfficiency.POOR, 1, None, None), + "TripleKnownData": ("High performance glazing", EpcEfficiency.GOOD, 1, None, None), +} diff --git a/backend/onboarders/mappings/parity/heating.py b/backend/onboarders/mappings/parity/heating.py new file mode 100644 index 00000000..aa74834b --- /dev/null +++ b/backend/onboarders/mappings/parity/heating.py @@ -0,0 +1,330 @@ +from datatypes.epc.main_heating import EpcHeatingSystems +from datatypes.epc.efficiency import EpcEfficiency +from datatypes.epc.fuel import EpcFuel +from datatypes.epc.heating_controls import EpcHeatingControls +from datatypes.epc.hotwater import EpcHotWaterSystems + +heating_map = { + # 0 + ('Boilers', 'A', 'ElectricityNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_and_radiators_electric, EpcEfficiency.VERY_POOR, EpcFuel.electricity_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 1 + ('Boilers', 'A', 'ElectricityNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_and_radiators_electric, EpcEfficiency.VERY_POOR, EpcFuel.electricity_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 2 + ('Boilers', 'A', 'ElectricityNotCommunity', 'Top Spec'): ( + EpcHeatingSystems.boiler_and_radiators_electric, EpcEfficiency.VERY_POOR, EpcFuel.electricity_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + 
EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 3 + ('Boilers', 'A', 'LPGNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_and_radiators_lpg, EpcEfficiency.POOR, EpcFuel.lpg_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 4 + ('Boilers', 'A', 'MainsGasNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.VERY_GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 5 + ('Boilers', 'A', 'MainsGasNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.VERY_GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 6 + ('Boilers', 'A', 'MainsGasNotCommunity', 'Top Spec'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.VERY_GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 7 + ('Boilers', 'B', 'MainsGasNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 8 + ('Boilers', 'B', 'MainsGasNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 9 + ('Boilers', 'B', 'MainsGasNotCommunity', 'Top Spec'): ( + 
EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 10 + ('Boilers', 'C', 'ElectricityNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_and_radiators_electric, EpcEfficiency.VERY_POOR, EpcFuel.electricity_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 11 + ('Boilers', 'C', 'ElectricityNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_and_radiators_electric, EpcEfficiency.VERY_POOR, EpcFuel.electricity_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 12 + ('Boilers', 'C', 'ElectricityNotCommunity', 'Top Spec'): ( + EpcHeatingSystems.boiler_and_radiators_electric, EpcEfficiency.VERY_POOR, EpcFuel.electricity_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 13 + ('Boilers', 'C', 'LPGNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_and_radiators_lpg, EpcEfficiency.POOR, EpcFuel.lpg_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 14 + ('Boilers', 'C', 'LPGNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_and_radiators_lpg, EpcEfficiency.POOR, EpcFuel.lpg_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 15 + ('Boilers', 'C', 'MainsGasNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + 
EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 16 + ('Boilers', 'C', 'MainsGasNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 17 + ('Boilers', 'C', 'MainsGasNotCommunity', 'Top Spec'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + ('Boilers', 'C', 'OilNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_radiators_oil, EpcEfficiency.AVERAGE, EpcFuel.oil_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 19 + ('Boilers', 'C', 'OilNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_radiators_oil, EpcEfficiency.AVERAGE, EpcFuel.oil_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 20 + ('Boilers', 'C', 'OilNotCommunity', 'Top Spec'): ( + EpcHeatingSystems.boiler_radiators_oil, EpcEfficiency.AVERAGE, EpcFuel.oil_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 21 + ('Boilers', 'D', 'MainsGasNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 22 + ('Boilers', 'D', 'MainsGasNotCommunity', 'Sub Optimal'): 
( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 23 + ('Boilers', 'D', 'MainsGasNotCommunity', 'Top Spec'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 24 + ('Boilers', 'E', 'ElectricityNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_and_radiators_electric, EpcEfficiency.VERY_POOR, EpcFuel.electricity_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 25 + ('Boilers', 'E', 'MainsGasNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 26 + ('Boilers', 'E', 'MainsGasNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 27 + ('Boilers', 'E', 'MainsGasNotCommunity', 'Top Spec'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 28 + ('Boilers', 'E', 'OilNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_radiators_oil, EpcEfficiency.AVERAGE, EpcFuel.oil_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, 
EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 29 + ('Boilers', 'E', 'OilNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_radiators_oil, EpcEfficiency.AVERAGE, EpcFuel.oil_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 30 + ('Boilers', 'F', 'MainsGasNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 31 + ('Boilers', 'F', 'MainsGasNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 32 + ('Boilers', 'F', 'MainsGasNotCommunity', 'Top Spec'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 33 + ('Boilers', 'G', 'MainsGasNotCommunity', 'Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 34 + ('Boilers', 'G', 'MainsGasNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 35 + ('Boilers', 'G', 'MainsGasNotCommunity', 'Top Spec'): ( + 
EpcHeatingSystems.boiler_radiators_mains_gas, EpcEfficiency.AVERAGE, EpcFuel.mains_gas_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 36 + ('Electric underfloor', 'A', 'ElectricityNotCommunity', 'Optimal'): ( + EpcHeatingSystems.electric_underfloor_heating, EpcEfficiency.AVERAGE, EpcFuel.electricity_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + # 37 + ('Electric underfloor', 'A', 'ElectricityNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.electric_underfloor_heating, EpcEfficiency.AVERAGE, EpcFuel.electricity_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + # 38 + ('Electric underfloor', 'A', 'ElectricityNotCommunity', 'Top Spec'): ( + EpcHeatingSystems.electric_underfloor_heating, EpcEfficiency.AVERAGE, EpcFuel.electricity_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + # 39 + ('Heat pumps (warm air)', 'A', 'ElectricityNotCommunity', 'Optimal'): ( + EpcHeatingSystems.air_to_air_ashp, EpcEfficiency.AVERAGE, EpcFuel.electricity_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 40 + ('Heat pumps (warm air)', 'A', 'ElectricityNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.air_to_air_ashp, EpcEfficiency.AVERAGE, EpcFuel.electricity_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 41 + ('Heat pumps (wet)', 'A', 'ElectricityNotCommunity', 'Optimal'): ( + 
EpcHeatingSystems.ashp_radiators_electric, EpcEfficiency.GOOD, EpcFuel.electricity_not_community, + EpcHeatingControls.programmer_room_thermostat_trvs, EpcEfficiency.GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 42 + ('Heat pumps (wet)', 'A', 'ElectricityNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.ashp_radiators_electric, EpcEfficiency.GOOD, EpcFuel.electricity_not_community, + EpcHeatingControls.programmers_trvs_bypass, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 43 + ('Heat pumps (wet)', 'A', 'ElectricityNotCommunity', 'Top Spec'): ( + EpcHeatingSystems.ashp_radiators_electric, EpcEfficiency.GOOD, EpcFuel.electricity_not_community, + EpcHeatingControls.time_and_temperature_zone_control, EpcEfficiency.VERY_GOOD, + EpcHotWaterSystems.from_main_system, EpcEfficiency.AVERAGE + ), + # 44 + ('Room heaters', 'A', 'ElectricityNotCommunity', 'Optimal'): ( + EpcHeatingSystems.room_heaters_electric, EpcEfficiency.POOR, EpcFuel.electricity_not_community, + EpcHeatingControls.programmer_and_appliance_thermostats, EpcEfficiency.GOOD, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + # 45 + ('Room heaters', 'A', 'ElectricityNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.room_heaters_electric, EpcEfficiency.POOR, EpcFuel.electricity_not_community, + EpcHeatingControls.appliance_thermostats, EpcEfficiency.GOOD, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + # 46 + ('Room heaters', 'C', 'MainsGasNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.room_heaters_mains_gas, EpcEfficiency.AVERAGE, EpcFuel.mains_gas_not_community, + EpcHeatingControls.appliance_thermostats, EpcEfficiency.GOOD, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + # 47 - water done from here + ('Room heaters', 'F', 'MainsGasNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.room_heaters_mains_gas, EpcEfficiency.POOR, 
EpcFuel.mains_gas_not_community, + EpcHeatingControls.appliance_thermostats, EpcEfficiency.GOOD, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + ('Room heaters', 'G', 'MainsGasNotCommunity', 'Optimal'): ( + EpcHeatingSystems.room_heaters_mains_gas, EpcEfficiency.POOR, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmer_and_appliance_thermostats, EpcEfficiency.GOOD, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + ('Room heaters', 'G', 'MainsGasNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.room_heaters_mains_gas, EpcEfficiency.POOR, EpcFuel.mains_gas_not_community, + EpcHeatingControls.appliance_thermostats, EpcEfficiency.GOOD, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + ('Room heaters', 'G', 'SmokelessCoal', 'Sub Optimal'): ( + EpcHeatingSystems.room_heaters_smokeless_fuel, EpcEfficiency.VERY_POOR, EpcFuel.smokeless_coal, + EpcHeatingControls.appliance_thermostats, EpcEfficiency.GOOD, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + ('Storage heaters', 'A', 'ElectricityNotCommunity', 'Optimal'): ( + EpcHeatingSystems.electric_storage_heaters, EpcEfficiency.AVERAGE, EpcFuel.electricity_not_community, + EpcHeatingControls.automatic_charge_control, EpcEfficiency.AVERAGE, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + ('Storage heaters', 'A', 'ElectricityNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.electric_storage_heaters, EpcEfficiency.AVERAGE, EpcFuel.electricity_not_community, + EpcHeatingControls.manual_charge_control, EpcEfficiency.POOR, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + ('Warm Air (not heat pump)', 'G', 'ElectricityNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.warm_air_electricaire, EpcEfficiency.GOOD, EpcFuel.electricity_not_community, + EpcHeatingControls.programmer_and_atleast_two_room_thermostats, EpcEfficiency.GOOD, + 
EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ), + ('Warm Air (not heat pump)', 'G', 'MainsGasNotCommunity', 'Sub Optimal'): ( + EpcHeatingSystems.warm_air_mains_gas, EpcEfficiency.GOOD, EpcFuel.mains_gas_not_community, + EpcHeatingControls.programmer_and_atleast_two_room_thermostats, EpcEfficiency.GOOD, + EpcHotWaterSystems.electric_immersion_off_peak, EpcEfficiency.AVERAGE + ) +} diff --git a/backend/onboarders/mappings/parity/property_type.py b/backend/onboarders/mappings/parity/property_type.py new file mode 100644 index 00000000..f91c0c88 --- /dev/null +++ b/backend/onboarders/mappings/parity/property_type.py @@ -0,0 +1,8 @@ +from datatypes.epc.property_type_built_form import PropertyType + +parity_map = { + "Flat": PropertyType.flat, + "Maisonette": PropertyType.maisonette, + "Bungalow": PropertyType.bungalow, + "House": PropertyType.house, +} diff --git a/backend/onboarders/mappings/parity/roof.py b/backend/onboarders/mappings/parity/roof.py new file mode 100644 index 00000000..02518c3e --- /dev/null +++ b/backend/onboarders/mappings/parity/roof.py @@ -0,0 +1,461 @@ +import pandas as pd +from numpy import nan +from typing import Union, Callable +from collections.abc import Mapping +from datatypes.epc.roof import EpcRoofDescriptions +from datatypes.epc.efficiency import EpcEfficiency +from datatypes.epc.construction_age_band import EpcConstructionAgeBand + +roof_map = { + # Dwelling above + ('AnotherDwellingAbove', 'Another Dwelling Above'): EpcRoofDescriptions.another_dwelling_above, + ('SameDwellingAbove', 'Same Dwelling Above'): EpcRoofDescriptions.another_dwelling_above, + # Pitched, normal loft access, with a loft thickness + ('PitchedNormalLoftAccess', 'mm25'): EpcRoofDescriptions.loft_25mm_insulation, + ('PitchedNormalLoftAccess', 'mm50'): EpcRoofDescriptions.loft_50mm_insulation, + ('PitchedNormalLoftAccess', 'mm75'): EpcRoofDescriptions.loft_75mm_insulation, + ('PitchedNormalLoftAccess', 'mm100'): 
EpcRoofDescriptions.loft_100mm_insulation, + ('PitchedNormalLoftAccess', 'mm150'): EpcRoofDescriptions.loft_150mm_insulation, + ('PitchedNormalLoftAccess', 'mm200'): EpcRoofDescriptions.loft_200mm_insulation, + ('PitchedNormalLoftAccess', 'mm250'): EpcRoofDescriptions.loft_250mm_insulation, + ('PitchedNormalLoftAccess', 'mm270'): EpcRoofDescriptions.loft_270mm_insulation, + ('PitchedNormalLoftAccess', 'mm300'): EpcRoofDescriptions.loft_300mm_insulation, + ('PitchedNormalLoftAccess', 'mm350'): EpcRoofDescriptions.loft_350mm_insulation, + ('PitchedNormalLoftAccess', 'mm400'): EpcRoofDescriptions.loft_400mm_plus_insulation, + + # Pitched, no loft access, with a loft thickness + ('PitchedNormalNoLoftAccess', 'mm25'): EpcRoofDescriptions.loft_25mm_insulation, + ('PitchedNormalNoLoftAccess', 'mm50'): EpcRoofDescriptions.loft_50mm_insulation, + ('PitchedNormalNoLoftAccess', 'mm75'): EpcRoofDescriptions.loft_75mm_insulation, + ('PitchedNormalNoLoftAccess', 'mm100'): EpcRoofDescriptions.loft_100mm_insulation, + ('PitchedNormalNoLoftAccess', 'mm150'): EpcRoofDescriptions.loft_150mm_insulation, + ('PitchedNormalNoLoftAccess', 'mm200'): EpcRoofDescriptions.loft_200mm_insulation, + ('PitchedNormalNoLoftAccess', 'mm250'): EpcRoofDescriptions.loft_250mm_insulation, + ('PitchedNormalNoLoftAccess', 'mm270'): EpcRoofDescriptions.loft_270mm_insulation, + ('PitchedNormalNoLoftAccess', 'mm300'): EpcRoofDescriptions.loft_300mm_insulation, + ('PitchedNormalNoLoftAccess', 'mm350'): EpcRoofDescriptions.loft_350mm_insulation, + ('PitchedNormalNoLoftAccess', 'mm400'): EpcRoofDescriptions.loft_400mm_plus_insulation, + + # All pitched options with asbuilt or unknown got to EpcRoofDescriptions.pitched_insulated_assumed + # With access + ('PitchedNormalLoftAccess', nan): EpcRoofDescriptions.pitched_insulated_assumed, + ('PitchedNormalLoftAccess', 'AsBuilt'): EpcRoofDescriptions.pitched_insulated_assumed, + ('PitchedNormalLoftAccess', 'Unknown'): EpcRoofDescriptions.pitched_insulated_assumed, + # 
No access + ('PitchedNormalNoLoftAccess', nan): EpcRoofDescriptions.pitched_insulated_assumed, + ('PitchedNormalNoLoftAccess', 'AsBuilt'): EpcRoofDescriptions.pitched_insulated_assumed, + ('PitchedNormalNoLoftAccess', 'Unknown'): EpcRoofDescriptions.pitched_insulated_assumed, + + # Flat + ('Flat', 'NoInsulation'): EpcRoofDescriptions.flat_no_insulation, + # Flat - limited insulation + ('Flat', '12mm'): EpcRoofDescriptions.flat_limited_insulation, + ('Flat', 'mm25'): EpcRoofDescriptions.flat_limited_insulation, + ('Flat', 'mm50'): EpcRoofDescriptions.flat_limited_insulation, + # Flat insulated + ('Flat', 'mm75'): EpcRoofDescriptions.flat_insulated, + ('Flat', 'mm100'): EpcRoofDescriptions.flat_insulated, + ('Flat', 'mm150'): EpcRoofDescriptions.flat_insulated, + ('Flat', 'mm200'): EpcRoofDescriptions.flat_insulated, + ('Flat', 'mm250'): EpcRoofDescriptions.flat_insulated, + ('Flat', 'mm300'): EpcRoofDescriptions.flat_insulated, + ('Flat', 'mm350'): EpcRoofDescriptions.flat_insulated, + ('Flat', 'mm400'): EpcRoofDescriptions.flat_insulated, + # Flat - as built or unknown + ('Flat', 'AsBuilt'): None, # To be classified + ('Flat', nan): None, # To be classified + ('Flat', 'Unknown'): None, # To be classified + + # 12mm = very poor & has limited insulation description + # 25, 50 = poor & has limited insulation description + # 75, 100, 125mm = average (Flat, insulated) + # 150, 175, 200, 225, 250mm = good (Flat, insulated) + # 270mm+ = very good (Flat, insulated) + + # Thatched + ('PitchedThatched', 'mm50'): EpcRoofDescriptions.thatched_with_additional_insulation, + ('PitchedThatched', 'mm150'): EpcRoofDescriptions.thatched_with_additional_insulation, + ('PitchedThatched', 'mm300'): EpcRoofDescriptions.thatched_with_additional_insulation, + ('PitchedThatched', 'Unknown'): EpcRoofDescriptions.thatched, # efficiency classified based on age + + # Sloping: + # Limited (12 very poor, 25-50 poor) + ('PitchedWithSlopingCeiling', 'mm12'): 
EpcRoofDescriptions.sloping_pitched_limited_insulation, + ('PitchedWithSlopingCeiling', 'mm25'): EpcRoofDescriptions.sloping_pitched_limited_insulation, + ('PitchedWithSlopingCeiling', 'mm50'): EpcRoofDescriptions.sloping_pitched_limited_insulation, + # Insulated 75mm+ (75 - 125 average, 150 - 250 good, 270+ very good) + ('PitchedWithSlopingCeiling', 'mm75'): EpcRoofDescriptions.sloping_pitched_insulated, + ('PitchedWithSlopingCeiling', 'mm100'): EpcRoofDescriptions.sloping_pitched_insulated, + ('PitchedWithSlopingCeiling', 'mm150'): EpcRoofDescriptions.sloping_pitched_insulated, + ('PitchedWithSlopingCeiling', 'mm200'): EpcRoofDescriptions.sloping_pitched_insulated, + ('PitchedWithSlopingCeiling', 'mm250'): EpcRoofDescriptions.sloping_pitched_insulated, + ('PitchedWithSlopingCeiling', 'mm270'): EpcRoofDescriptions.sloping_pitched_insulated, + ('PitchedWithSlopingCeiling', 'mm300'): EpcRoofDescriptions.sloping_pitched_insulated, + ('PitchedWithSlopingCeiling', 'mm350'): EpcRoofDescriptions.sloping_pitched_insulated, + ('PitchedWithSlopingCeiling', 'mm400'): EpcRoofDescriptions.sloping_pitched_insulated, + # As built/unknown + ('PitchedWithSlopingCeiling', 'AsBuilt'): None, # To be classified + ('PitchedWithSlopingCeiling', nan): None, # To be classified + ('PitchedWithSlopingCeiling', 'Unknown'): None, # +} + +roof_unknown_age_fallback = { + "Flat": EpcRoofDescriptions.flat_as_built_unknown, + "PitchedWithSlopingCeiling": EpcRoofDescriptions.sloping_pitched_as_built_unknown, + "PitchedThatched": EpcRoofDescriptions.thatched_as_built_unknown, + "PitchedNormalLoftAccess": EpcRoofDescriptions.loft_as_built_unknown, + "PitchedNormalNoLoftAccess": EpcRoofDescriptions.loft_as_built_unknown, +} + +RoofEfficiencyRule = Union[ + EpcEfficiency, + Callable[[EpcConstructionAgeBand, int | None], EpcEfficiency], +] + + +def flat_insulated_efficiency_age_band(age_band: EpcConstructionAgeBand) -> EpcEfficiency: + """ + before 1900, 1900-1929, 1930-1949, 1950-1966, 1967-1975 -> 
Pitched, no insulation, Very Poor + 1976-1982 -> Pitched, limited insulation, Poor + 1983-1990, to 1996-2002 Pitched, insulated, Average + 2003 - 2006, 2012-2022 -> Pitched, insulated, Good + 2023 onwards -> Pitched, insulated, Very Good + :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + + start_year = age_band.start_year() + if start_year >= 2023: + return EpcEfficiency.VERY_GOOD + + if start_year >= 2003: + return EpcEfficiency.GOOD + + if start_year >= 1983: + return EpcEfficiency.AVERAGE + + if start_year >= 1976: + return EpcEfficiency.POOR + + return EpcEfficiency.VERY_POOR + + +def flat_insulated_efficiency_thickness(insulation_thickness: int | None) -> EpcEfficiency: + """ + 12mm -> Very Poor + 25mm - 50mm -> Poor + 75mm - 125mm -> Pitched, insulated, average + 150mm - 250mm -> good + 270mm+ -> very good + :param insulation_thickness: Insulation thickness in mm + :return: EpcEfficiency + """ + + if insulation_thickness is None: + raise ValueError("Insulation thickness is required for flat insulated efficiency calculation") + + if insulation_thickness >= 270: + return EpcEfficiency.VERY_GOOD + + if 150 <= insulation_thickness <= 250: + return EpcEfficiency.GOOD + + if 75 <= insulation_thickness <= 125: + return EpcEfficiency.AVERAGE + + if 25 <= insulation_thickness <= 50: + return EpcEfficiency.POOR + + return EpcEfficiency.VERY_POOR + + +def flat_efficiency(insulation_thickness: int | None, age_band: EpcConstructionAgeBand) -> EpcEfficiency: + """ + Combines both age band and insulation thickness to determine flat roof efficiency. 
+ :param insulation_thickness: Insulation thickness in mm + :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + if insulation_thickness is not None: + return flat_insulated_efficiency_thickness(insulation_thickness) + + return flat_insulated_efficiency_age_band(age_band) + + +def loft_insulated_efficiency(age_band: EpcConstructionAgeBand) -> EpcEfficiency: + """ + 2023 onwards -> Very Good + 2012-2022 -> Very Good + 2007-2011 -> Very Good + 2003-2006 -> Very Good + 1996-2002 -> Good + 1991-1995 -> Good + 1983-1990 -> Average + 1976-1982 -> Average + 1967-1975 -> Average + 1950-1966 -> Average + 1930-1949 -> Average + 1900-1929 -> Average + before 1900 -> Average + :param age_band: Input age band, EpcConstructionAgeBand + :return: EpcEfficiency + """ + year = age_band.start_year() + if year >= 2003: + return EpcEfficiency.VERY_GOOD + if year >= 1991: + return EpcEfficiency.GOOD + + return EpcEfficiency.AVERAGE + + +def thatched_efficiency_age_band(age_band: EpcConstructionAgeBand) -> EpcEfficiency: + """ + Maps thatched roof efficiency based on construction age band. + :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + year = age_band.start_year() + if year >= 2023: + return EpcEfficiency.VERY_GOOD + if year >= 2003: + return EpcEfficiency.GOOD + + return EpcEfficiency.AVERAGE + + +def thatched_efficiency_thickness(insulation_thickness: int | None) -> EpcEfficiency: + """ + Maps thatched roof efficiency based on insulation thickness. 
+ :param insulation_thickness: Insulation thickness in mm + :return: EpcEfficiency + """ + if insulation_thickness is None: + raise ValueError("Insulation thickness is required for thatched efficiency calculation") + + if insulation_thickness >= 175: + return EpcEfficiency.VERY_GOOD + + if insulation_thickness >= 25: + return EpcEfficiency.GOOD + + return EpcEfficiency.AVERAGE + + +def thatched_efficiency( + insulation_thickness: int | None, + age_band: EpcConstructionAgeBand, +) -> EpcEfficiency: + """ + Combines both age band and insulation thickness to determine thatched roof efficiency. + :param insulation_thickness: Insulation thickness in mm + :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + if insulation_thickness is not None: + return thatched_efficiency_thickness(insulation_thickness) + + return thatched_efficiency_age_band(age_band) + + +def sloping_ceiling_efficiency_age_band(age_band: EpcConstructionAgeBand) -> EpcEfficiency: + """ + Maps sloping ceiling roof efficiency based on construction age band. + :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + year = age_band.start_year() + if year >= 2023: + return EpcEfficiency.VERY_GOOD + if year >= 2003: + return EpcEfficiency.GOOD + if year >= 1983: + return EpcEfficiency.AVERAGE + if year >= 1976: + return EpcEfficiency.POOR + + return EpcEfficiency.VERY_POOR + + +def sloping_ceiling_efficiency_thickness(insulation_thickness: int | None) -> EpcEfficiency: + """ + Maps sloping ceiling roof efficiency based on insulation thickness. 
+ :param insulation_thickness: Insulation thickness in mm + :return: EpcEfficiency + """ + if insulation_thickness is None: + raise ValueError("Insulation thickness is required for sloping ceiling efficiency calculation") + + if insulation_thickness >= 270: + return EpcEfficiency.VERY_GOOD + + if insulation_thickness >= 150: + return EpcEfficiency.GOOD + + if insulation_thickness >= 75: + return EpcEfficiency.AVERAGE + + if insulation_thickness >= 25: + return EpcEfficiency.POOR + + return EpcEfficiency.VERY_POOR + + +def sloping_ceiling_efficiency( + insulation_thickness: int | None, + age_band: EpcConstructionAgeBand, +) -> EpcEfficiency: + """ + Combines both age band and insulation thickness to determine sloping ceiling roof efficiency. + :param insulation_thickness: Insulation thickness in mm + :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + if insulation_thickness is not None: + return sloping_ceiling_efficiency_thickness(insulation_thickness) + + return sloping_ceiling_efficiency_age_band(age_band) + + +def loft_insulated_at_rafters_efficiency_thickness(insulation_thickness: int | None) -> EpcEfficiency: + """ + 400mm, 350mm = very good + 200-300mm = good + 125-175 = average + 50-100 = poor + 25 and below= very poor + :return: + """ + if insulation_thickness is None: + raise ValueError("Insulation thickness is required for loft insulated at rafters efficiency calculation") + + if insulation_thickness >= 350: + return EpcEfficiency.VERY_GOOD + + if insulation_thickness >= 200: + return EpcEfficiency.GOOD + + if insulation_thickness >= 125: + return EpcEfficiency.AVERAGE + + if insulation_thickness >= 50: + return EpcEfficiency.POOR + + return EpcEfficiency.VERY_POOR + + +def loft_insulated_at_rafters_efficiency_age_band(age_band: EpcConstructionAgeBand) -> EpcEfficiency: + """ + # 2023 onwards -> Very Good + # 2003-2006, 2012-2022 -> Good + # 1983 - 1990, 1996-2002 -> Average + # 1976-1982 -> Poor + # 1967-1975 and earlier bands -> 
Very Poor + :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + year = age_band.start_year() + if year >= 2023: + return EpcEfficiency.VERY_GOOD + if year >= 2003: + return EpcEfficiency.GOOD + if year >= 1983: + return EpcEfficiency.AVERAGE + if year >= 1976: + return EpcEfficiency.POOR + + return EpcEfficiency.VERY_POOR + + +def loft_insulated_at_rafters_efficiency( + insulation_thickness: int | None, + age_band: EpcConstructionAgeBand, +) -> EpcEfficiency: + """ + Combines both age band and insulation thickness to determine loft insulated at rafters roof efficiency. + :param insulation_thickness: Insulation thickness in mm + :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + if insulation_thickness is not None: + return loft_insulated_at_rafters_efficiency_thickness(insulation_thickness) + + return loft_insulated_at_rafters_efficiency_age_band(age_band) + + +ROOF_DESCRIPTION_EFFICIENCIES: Mapping[EpcRoofDescriptions, RoofEfficiencyRule] = { + # Flat roof + EpcRoofDescriptions.flat_no_insulation: EpcEfficiency.VERY_POOR, + EpcRoofDescriptions.flat_limited_insulation: flat_efficiency, + EpcRoofDescriptions.flat_insulated: flat_efficiency, + + # Loft: + # value mappings + EpcRoofDescriptions.loft_12mm_insulation: EpcEfficiency.VERY_POOR, + EpcRoofDescriptions.loft_25mm_insulation: EpcEfficiency.POOR, + EpcRoofDescriptions.loft_50mm_insulation: EpcEfficiency.POOR, + EpcRoofDescriptions.loft_75mm_insulation: EpcEfficiency.AVERAGE, + EpcRoofDescriptions.loft_100mm_insulation: EpcEfficiency.AVERAGE, + EpcRoofDescriptions.loft_125mm_insulation: EpcEfficiency.AVERAGE, + EpcRoofDescriptions.loft_150mm_insulation: EpcEfficiency.GOOD, + EpcRoofDescriptions.loft_175mm_insulation: EpcEfficiency.GOOD, + EpcRoofDescriptions.loft_200mm_insulation: EpcEfficiency.GOOD, + EpcRoofDescriptions.loft_250mm_insulation: EpcEfficiency.GOOD, + EpcRoofDescriptions.loft_270mm_insulation: EpcEfficiency.VERY_GOOD, + 
EpcRoofDescriptions.loft_300mm_insulation: EpcEfficiency.VERY_GOOD, + EpcRoofDescriptions.loft_350mm_insulation: EpcEfficiency.VERY_GOOD, + EpcRoofDescriptions.loft_400mm_plus_insulation: EpcEfficiency.VERY_GOOD, + EpcRoofDescriptions.pitched_no_insulation: EpcEfficiency.VERY_POOR, + # function mappings + EpcRoofDescriptions.pitched_insulated_assumed: loft_insulated_efficiency, + + # Loft af rafters + EpcRoofDescriptions.loft_insulated_at_rafters: loft_insulated_at_rafters_efficiency, + + # Another dwelling above + EpcRoofDescriptions.another_dwelling_above: EpcEfficiency.NA, + + # Thatched + EpcRoofDescriptions.thatched: thatched_efficiency, + EpcRoofDescriptions.thatched_with_additional_insulation: thatched_efficiency, + + # Sloping ceiling + EpcRoofDescriptions.sloping_pitched_insulated: sloping_ceiling_efficiency, + EpcRoofDescriptions.sloping_pitched_limited_insulation: sloping_ceiling_efficiency, + EpcRoofDescriptions.sloping_pitched_no_insulation: EpcEfficiency.VERY_POOR, + +} + + +def resolve_roof_efficiency( + description: EpcRoofDescriptions, + age_band: EpcConstructionAgeBand | None, + insulation_thickness: int | None, +) -> EpcEfficiency: + """ + Resolve roof efficiency from description + age band + insulation thickness. 
+ """ + + # Unknown / holding descriptions → efficiency unknown + if description in description.unknown_descriptions: + return EpcEfficiency.NA + + rule = ROOF_DESCRIPTION_EFFICIENCIES.get(description) + + if rule is None: + return EpcEfficiency.NA + + # Fixed efficiency + if isinstance(rule, EpcEfficiency): + return rule + + # Callable rule + if age_band is None or pd.isnull(age_band): + return EpcEfficiency.NA + + try: + # Try (thickness, age_band) + return rule(insulation_thickness, age_band) + except TypeError: + # Fallback to (age_band) + return rule(age_band) diff --git a/backend/onboarders/mappings/parity/walls.py b/backend/onboarders/mappings/parity/walls.py new file mode 100644 index 00000000..0ad6d6e1 --- /dev/null +++ b/backend/onboarders/mappings/parity/walls.py @@ -0,0 +1,211 @@ +from typing import Callable, Union +from collections.abc import Mapping +from datatypes.epc.walls import EpcWallDescriptions +from datatypes.epc.construction_age_band import EpcConstructionAgeBand +from datatypes.epc.efficiency import EpcEfficiency + +# Unique combinations +wall_map = { + # Cavity walls + ('Cavity', 'FilledCavity'): EpcWallDescriptions.cavity_filled_cavity, + ('Cavity', 'Internal'): EpcWallDescriptions.cavity_internal_insulation, + ('Cavity', 'External'): EpcWallDescriptions.cavity_external_insulation, + ('Cavity', 'FilledCavityPlusInternal'): EpcWallDescriptions.cavity_filled_plus_internal, + ('Cavity', 'FilledCavityPlusExternal'): EpcWallDescriptions.cavity_filled_plus_external, + ('Cavity', 'AsBuilt'): None, # To be classified + ('Cavity', 'Unknown'): None, # To be classified + + # System built walls + ('System', 'External'): EpcWallDescriptions.system_external_insulation, + ('System', 'Internal'): EpcWallDescriptions.system_internal_insulation, + ('System', 'AsBuilt'): None, # To be classified + ('System', 'Unknown'): None, + + # Timber Frame walls + ('Timber Frame', 'Internal'): EpcWallDescriptions.timber_frame_internal_insulation, + ('Timber Frame', 
'External'): EpcWallDescriptions.timber_frame_external_insulation, + ('Timber Frame', 'AsBuilt'): None, # To be classified + ('Timber Frame', 'Unknown'): None, + + # Solid Brick walls + ('Solid Brick', 'External'): EpcWallDescriptions.solid_brick_external_insulation, + ('Solid Brick', 'Internal'): EpcWallDescriptions.solid_brick_internal_insulation, + ('Solid Brick', 'AsBuilt'): None, # To be classified + ('Solid Brick', 'Unknown'): None, + + # Granite walls + ('Granite', 'External'): EpcWallDescriptions.granite_whinstone_external_insulation, + ("Granite", 'Internal'): EpcWallDescriptions.granite_whinstone_internal_insulation, + ('Granite', 'AsBuilt'): None, + ('Granite', 'Unknown'): None, + + # Sandstone walls + ('Sandstone', 'Internal'): EpcWallDescriptions.sandstone_limestone_internal_insulation, + ('Sandstone', 'External'): EpcWallDescriptions.sandstone_limestone_external_insulation, + ('Sandstone', 'Unknown'): None, + ('Sandstone', 'AsBuilt'): None, + + # Cob walls + ('Cob', 'AsBuilt'): None, +} + +wall_unknown_age_fallback = { + "Cavity": EpcWallDescriptions.cavity_as_built_unknown, + "Solid Brick": EpcWallDescriptions.solid_brick_as_built_unknown, + "Timber Frame": EpcWallDescriptions.timber_frame_as_built_unknown, + "System": EpcWallDescriptions.system_as_built_unknown, + "Granite": EpcWallDescriptions.granite_as_built_unknown, + "Sandstone": EpcWallDescriptions.sandstone_as_built_unknown, + "Cob": EpcWallDescriptions.cob_as_built_unknown, +} + + +def cavity_filled_efficiency(age_band: EpcConstructionAgeBand) -> EpcEfficiency: + """" + Maps cavity filled to efficiency based on construction age band. 
+ :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + if age_band in { + EpcConstructionAgeBand.from_2023_onwards + }: + return EpcEfficiency.VERY_GOOD + + return EpcEfficiency.GOOD + + +def internal_external_insulation_efficiency( + age_band: EpcConstructionAgeBand, +) -> EpcEfficiency: + """ + Maps: + - cavity unfilled with internal/external insulation to efficiency based on construction age band. We assumed + based on 100mm insulation + - solid brick with internal/external insulation to efficiency based on construction age band. We assumed + based on 100mm insulation + - system built with internal/external insulation to efficiency based on construction age band. We assumed + based on 100mm insulation + + All of these wall types have the same behaviour in elmhurst + :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + if age_band in { + EpcConstructionAgeBand.from_1983_to_1990, + EpcConstructionAgeBand.from_1991_to_1995, + EpcConstructionAgeBand.from_1996_to_2002, + EpcConstructionAgeBand.from_2003_to_2006, + EpcConstructionAgeBand.from_2007_to_2011, + EpcConstructionAgeBand.from_2012_to_2022, + EpcConstructionAgeBand.from_2023_onwards, + }: + return EpcEfficiency.VERY_GOOD + + return EpcEfficiency.GOOD + + +def timber_granite_sandstone_internal_external_efficiency(age_band: EpcConstructionAgeBand) -> EpcEfficiency: + """" + Maps: + - timber frame with internal/external wall insulation to efficiency based on construction age band. + - sandstone/limestone with internal/external wall insulation to efficiency based on construction age band. + - granite/whinstone with internal/external wall insulation to efficiency based on construction age band. 
+ :param age_band: EpcConstructionAgeBand + :return: EpcEfficiency + """ + if age_band in { + EpcConstructionAgeBand.from_2023_onwards + }: + return EpcEfficiency.VERY_GOOD + + return EpcEfficiency.GOOD + + +WallEfficiencyRule = Union[ + EpcEfficiency, + Callable[[EpcConstructionAgeBand, int | None], EpcEfficiency], +] + +WALL_DESCRIPTION_EFFICIENCIES: Mapping[EpcWallDescriptions, WallEfficiencyRule] = { + # Note: all function mappings have been defined based on Elmhurst + # Cavity + # value mappings + EpcWallDescriptions.cavity_no_insulation_assumed: EpcEfficiency.POOR, + EpcWallDescriptions.cavity_partial_insulated_assumed: EpcEfficiency.AVERAGE, + EpcWallDescriptions.cavity_insulated_assumed: EpcEfficiency.GOOD, + EpcWallDescriptions.cavity_filled_plus_internal: EpcEfficiency.VERY_GOOD, + EpcWallDescriptions.cavity_filled_plus_external: EpcEfficiency.VERY_GOOD, + # function mappings + EpcWallDescriptions.cavity_filled_cavity: cavity_filled_efficiency, + EpcWallDescriptions.cavity_internal_insulation: internal_external_insulation_efficiency, + EpcWallDescriptions.cavity_external_insulation: internal_external_insulation_efficiency, + + # Solid brick + # value mappings + EpcWallDescriptions.solid_brick_no_insulation_assumed: EpcEfficiency.POOR, + EpcWallDescriptions.solid_brick_partial_insulated_assumed: EpcEfficiency.AVERAGE, + EpcWallDescriptions.solid_brick_insulated_assumed: EpcEfficiency.GOOD, + # function mappings + EpcWallDescriptions.solid_brick_internal_insulation: internal_external_insulation_efficiency, + EpcWallDescriptions.solid_brick_external_insulation: internal_external_insulation_efficiency, + + # System + # value mappings + EpcWallDescriptions.system_no_insulation_assumed: EpcEfficiency.POOR, + EpcWallDescriptions.system_partial_insulated_assumed: EpcEfficiency.AVERAGE, + EpcWallDescriptions.system_insulated_assumed: EpcEfficiency.GOOD, + # function mappings + EpcWallDescriptions.system_internal_insulation: internal_external_insulation_efficiency, 
+ EpcWallDescriptions.system_external_insulation: internal_external_insulation_efficiency, + + # Timber frame + # value mappings + EpcWallDescriptions.timber_frame_no_insulation_assumed: EpcEfficiency.POOR, + EpcWallDescriptions.timber_frame_partial_insulated_assumed: EpcEfficiency.AVERAGE, + EpcWallDescriptions.timber_frame_insulated_assumed: EpcEfficiency.GOOD, + # function mappings + EpcWallDescriptions.timber_frame_internal_insulation: timber_granite_sandstone_internal_external_efficiency, + EpcWallDescriptions.timber_frame_external_insulation: timber_granite_sandstone_internal_external_efficiency, + + # Granite / whinstone + EpcWallDescriptions.granite_whinstone_no_insulation_assumed: EpcEfficiency.VERY_POOR, + EpcWallDescriptions.granite_whinstone_partial_insulated_assumed: EpcEfficiency.AVERAGE, + EpcWallDescriptions.granite_whinestone_insulated_assumed: EpcEfficiency.GOOD, + # function mappings + EpcWallDescriptions.granite_whinstone_internal_insulation: timber_granite_sandstone_internal_external_efficiency, + EpcWallDescriptions.granite_whinstone_external_insulation: timber_granite_sandstone_internal_external_efficiency, + + # Sandstone / limestone + EpcWallDescriptions.sandstone_limestone_no_insulation_assumed: EpcEfficiency.VERY_POOR, + EpcWallDescriptions.sandstone_limestone_partial_insulated_assumed: EpcEfficiency.AVERAGE, + EpcWallDescriptions.sandstone_limestone_insulated_assumed: EpcEfficiency.GOOD, + # function mappings + EpcWallDescriptions.sandstone_limestone_internal_insulation: timber_granite_sandstone_internal_external_efficiency, + EpcWallDescriptions.sandstone_limestone_external_insulation: timber_granite_sandstone_internal_external_efficiency, + + # Cob (special case) + EpcWallDescriptions.cob_as_built_average: EpcEfficiency.AVERAGE, + EpcWallDescriptions.cob_as_built_good: EpcEfficiency.GOOD, + + # Unknown mappings which are unhandled + EpcWallDescriptions.cavity_as_built_unknown: EpcEfficiency.NA, + 
EpcWallDescriptions.solid_brick_as_built_unknown: EpcEfficiency.NA, + EpcWallDescriptions.system_as_built_unknown: EpcEfficiency.NA, + EpcWallDescriptions.timber_frame_as_built_unknown: EpcEfficiency.NA, + EpcWallDescriptions.granite_as_built_unknown: EpcEfficiency.NA, + EpcWallDescriptions.sandstone_as_built_unknown: EpcEfficiency.NA, + EpcWallDescriptions.cob_as_built_unknown: EpcEfficiency.NA, + +} + + +def resolve_wall_efficiency( + description: EpcWallDescriptions, + age_band: EpcConstructionAgeBand, +) -> EpcEfficiency: + rule = WALL_DESCRIPTION_EFFICIENCIES[description] + + if isinstance(rule, EpcEfficiency): + return rule + + return rule(age_band) diff --git a/backend/onboarders/mappings/property_type.py b/backend/onboarders/mappings/property_type.py deleted file mode 100644 index 75deef04..00000000 --- a/backend/onboarders/mappings/property_type.py +++ /dev/null @@ -1,6 +0,0 @@ -parity_map = { - "Flat": "Flat", - "Maisonette": "Maisonette", - "Bungalow": "Bungalow", - "House": "House", -} diff --git a/backend/onboarders/mappings/walls.py b/backend/onboarders/mappings/walls.py deleted file mode 100644 index 9b70b49c..00000000 --- a/backend/onboarders/mappings/walls.py +++ /dev/null @@ -1,3 +0,0 @@ -parity_map = { - -} diff --git a/backend/onboarders/parity.py b/backend/onboarders/parity.py index 27244777..6c79d027 100644 --- a/backend/onboarders/parity.py +++ b/backend/onboarders/parity.py @@ -1,93 +1,371 @@ +import re +from tqdm import tqdm import pandas as pd -from etl.epc.DataProcessor import construction_age_bounds_map -from backend.onboarders.mappings.property_type import parity_map as property_map -from backend.onboarders.mappings.age_band import party_map as age_band_map -from backend.onboarders.mappings.built_form import parity_map as built_form_map - - -def check_nulls(data, original_column, mapped_column): - # We only allow nulls if the oroginal value was null - null_vals = data[pd.isnull(data[mapped_column])] - if null_vals.empty: - return True - 
# We make sure all original values were null - assert pd.isnull(null_vals[original_column]).all(), ( - f"Some values in {mapped_column} were not mapped, but original values were not null" - ) - - -# Sample input data - -data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " - "- Data Extracts for Domna.xlsx", - sheet_name="Sustainability" +from backend.onboarders.base import OnboarderBase +# Parity mappings +from backend.onboarders.mappings.parity.property_type import parity_map as property_map +from backend.onboarders.mappings.parity.age_band import parity_map as age_band_map +from backend.onboarders.mappings.parity.built_form import parity_map as built_form_map +from backend.onboarders.mappings.parity.walls import wall_map, wall_unknown_age_fallback, WALL_DESCRIPTION_EFFICIENCIES +from onboarders.mappings.parity.roof import roof_map, roof_unknown_age_fallback, resolve_roof_efficiency +from onboarders.mappings.parity.floor import floor_map +from onboarders.mappings.parity.heating import heating_map +from onboarders.mappings.parity.glazing import glazing_map +from backend.onboarders.mappings.parity.as_built_wall_classifiers import as_built_wall_classifiers +from backend.onboarders.mappings.parity.as_built_roof_classifiers import as_built_roof_classifiers +from backend.onboarders.mappings.parity.as_built_floor_classifiers import ( + as_built_floor_classifiers, unknown_as_built_floor_classifiers ) +from datatypes.epc.roof import EpcRoofDescriptions +from datatypes.epc.floor import EpcFloorDescriptions +from datatypes.epc.construction_age_band import EpcConstructionAgeBand +from datatypes.epc.walls import EpcWallDescriptions +from datatypes.epc.efficiency import EpcEfficiency -# We want to map the parity fields to standard EPC references. 
This will allow us to -# 1) Estimate EPCs, more accurately -# 2) Patch incorrect EPCs with ease -# 3) Indicate already installed measures - -# ------------ construction_age_band ------------ -# Map to EPC age bands -# def construction_date_to_band(year): -# if pd.isnull(year): -# return None -# # Get the year from the date which is numpy datetime format -# for label, ranges in construction_age_bounds_map.items(): -# if ranges["l"] <= year <= ranges["u"]: -# return label -# raise NotImplementedError("year out of bounds") -# -# -# data["construction_age_band"] = pd.to_datetime(data["Construction Date"]).dt.year.apply(construction_date_to_band) - -data["construction_age_band"] = data["Construction Years"].map(age_band_map) - -check_nulls(data, "Construction Years", "construction_age_band") - -# ------------ property_type ------------ -data["property_type"] = data["Type"].map(property_map) - -assert pd.isnull(data["property_type"]).sum() == 0, "Some property types were not mapped" - -# ------------ built_form ------------ -data["built_form"] = data["Attachment"].map(built_form_map) - -assert pd.isnull(data["built_form"]).sum() == 0, "Some built forms were not mapped" - -# ------------ Wall Construction ------------ - -data["walls_combined"] = data["Wall Construction"] + "+" + data["Wall Insulation"].fillna("Unknown Insulation") - -data["Wall Insulation"].value_counts() -data["Wall Construction"].value_counts() - -as_built_map = { - "Cavity": {"insulated_age_bands": [], "partial_insulated_age_bands": []}, - "Solid Brick": {"insulated_age_bands": [], "partial_insulated_age_bands": []}, - "System": {"insulated_age_bands": [], "partial_insulated_age_bands": []}, - "Timber Frame": {"insulated_age_bands": [], "partial_insulated_age_bands": []}, - "Sandstone": {"insulated_age_bands": [], "partial_insulated_age_bands": []}, - "Granite": {"insulated_age_bands": [], "partial_insulated_age_bands": []}, - "Cob": {"insulated_age_bands": [], "partial_insulated_age_bands": []}, -} 
+tqdm.pandas() -def map_wall_construction(wall_constuction, wall_insulation, construction_age_band): - if wall_insulation == "AsBuilt": - # Deduce based on wall construction and age band - bands = as_built_map.get(wall_constuction, None) - if bands is None: - raise NotImplementedError(f"Wall construction {wall_constuction} not in as built map") +class ParityOnboarder(OnboarderBase): - # We check if the age band is in insulated or partial insulated, and if neither, we assume uninsulated + def __init__( + self, + fileuri: str, + file_format: str, + **kwargs + ): + # Extract bucket, and filekey; Will be in the format s3://bucket/key + self.bucket_name = fileuri.split("/")[2] + self.input_file_name = "/".join(fileuri.split("/")[3:]) + # Also prepare output file name + self.output_file_name = self.input_file_name.replace("." + file_format, "") + "_transformed.csv" -# Variables we want to map -# 'Org Ref', 'Address 1', 'Address 2', 'Address 3', 'Postcode', 'Type', -# 'Attachment', 'Construction Years', 'Wall Construction', -# 'Wall Insulation', 'Roof Construction', 'Roof Insulation', -# 'Floor Construction', 'Floor Insulation', 'Glazing', 'Heating', -# 'Boiler Efficiency', 'Main Fuel', 'Controls Adequacy', 'UPRN', -# 'Total Floor Area (m2)' + self.read_s3(file_format=file_format, **kwargs) + pass + + def map_construction_age_band(self): + self.data[self.landlord_construction_age_band] = self.data["Construction Years"].map(age_band_map) + self.assert_nulls_only_from_source_nulls( + self.data, "Construction Years", self.landlord_construction_age_band + ) + + def map_property_type(self): + self.data[self.landlord_property_type] = self.data["Type"].map(property_map) + self.assert_no_nulls(self.data, self.landlord_property_type) + + def map_built_form(self): + self.data[self.landlord_built_form] = self.data["Attachment"].map(built_form_map) + self.assert_no_nulls(self.data, self.landlord_built_form) + + @staticmethod + def _fill_wall_as_built(row: pd.Series) -> 
EpcWallDescriptions | None: + """ + Utility function, used by map_wall_construction in parity transformation module + :param row: row of input sustainability data, being transformed + :return: EpcWallDescriptions, the as built wall description for the input row, based on the wall construction + type and age band + """ + # Already resolved via direct mapping + if row.landlord_wall_construction is not None: + return row.landlord_wall_construction + + wall_type = row["Wall Construction"] + + # Missing construction age → conservative fallback + if pd.isnull(row.landlord_construction_age_band): + return wall_unknown_age_fallback.get(wall_type) + + classifier = as_built_wall_classifiers.get(wall_type) + if classifier is None: + return None + + return classifier(row.landlord_construction_age_band) + + @staticmethod + def _resolve_wall_efficiency( + description: EpcWallDescriptions, + age_band: EpcConstructionAgeBand | None, + ) -> EpcEfficiency: + # Unknown / holding descriptions → efficiency unknown + if "unknown insulation" in description.value.lower(): + return EpcEfficiency.NA + + rule = WALL_DESCRIPTION_EFFICIENCIES.get(description) + + if rule is None: + return EpcEfficiency.NA + + if isinstance(rule, EpcEfficiency): + return rule + + # Rule needs age band but we don't have one + if age_band is None or pd.isnull(age_band): + return EpcEfficiency.NA + + return rule(age_band) + + def map_wall_construction(self): + self.data[self.landlord_wall_construction] = ( + self.data[["Wall Construction", "Wall Insulation"]] + .apply(tuple, axis=1) + .map(wall_map) + ) + + self.data[self.landlord_wall_construction] = self.data.progress_apply(self._fill_wall_as_built, axis=1) + + # Sanity check + self.assert_no_nulls(self.data, self.landlord_wall_construction) + + self.data[self.landlord_wall_efficiency] = self.data.progress_apply( + lambda row: self._resolve_wall_efficiency( + row.landlord_wall_construction, + row.landlord_construction_age_band, + ), + axis=1, + ) + # Additional 
santify check + self.assert_no_nulls(self.data, self.landlord_wall_efficiency) + + @staticmethod + def _fill_roof_as_built(row: pd.Series) -> EpcRoofDescriptions | None: + # Already resolved + if not pd.isnull(row.landlord_roof_construction): + return row.landlord_roof_construction + + roof_type = row["Roof Construction"] + + classifier = as_built_roof_classifiers.get(roof_type) + if classifier is None: + raise NotImplementedError(f"No roof classifier for roof type '{roof_type}'") + + if pd.isnull(row.landlord_construction_age_band): + return roof_unknown_age_fallback.get(roof_type) + + output = classifier(row.landlord_construction_age_band) + if output is None: + raise NotImplementedError( + f"Roof classification returned None for roof type '{roof_type}'" + ) + + return output + + @staticmethod + def _extract_insulation_thickness(value: str | None) -> int | None: + """ + Extract insulation thickness in mm from a string like 'mm150'. + Returns None if not present or not parseable. + """ + if value is None or pd.isnull(value): + return None + + match = re.search(r"(\d+)", str(value)) + if not match: + return None + + return int(match.group(1)) + + def map_roof_construction(self): + self.data[self.landlord_roof_construction] = ( + self.data[["Roof Construction", "Roof Insulation"]] + .progress_apply(tuple, axis=1) + .map(roof_map) + ) + + self.data[self.landlord_roof_construction] = self.data.progress_apply( + self._fill_roof_as_built, + axis=1, + ) + + # sanity check + self.assert_no_nulls(self.data, self.landlord_roof_construction) + + self.data["roof_insulation_thickness_mm"] = self.data["Roof Insulation"].apply( + self._extract_insulation_thickness + ) + + self.data[self.landlord_roof_efficiency] = self.data.progress_apply( + lambda row: resolve_roof_efficiency( + description=row.landlord_roof_construction, + age_band=row.landlord_construction_age_band, + insulation_thickness=row.roof_insulation_thickness_mm, + ), + axis=1, + ) + # sanity check + 
self.assert_no_nulls(self.data, self.landlord_roof_efficiency) + + # Flag sloping ceiling + self.data[self.landlord_has_sloping_ceiling] = self.data["Roof Construction"].apply( + lambda x: x == "PitchedWithSlopingCeiling" + ) + + @staticmethod + def _fill_floor_as_built(row: pd.Series): + # 1. Already resolved + if row.landlord_floor_construction is not None: + return row.landlord_floor_construction + + age_band = row.landlord_construction_age_band + floor_type = row["Floor Construction"] + insulation = row["Floor Insulation"] + + # 2. Missing age band → conservative fallback + if pd.isnull(age_band): + return EpcFloorDescriptions.unknown + + # 3. Known floor types + if floor_type in ["Solid", "SuspendedTimber", "SuspendedNotTimber"]: + classifier = as_built_floor_classifiers[floor_type] + return classifier(age_band) + + # 4. Unknown floor type + if floor_type == "Unknown": + classifier = unknown_as_built_floor_classifiers[insulation] + return classifier(age_band) + + # 5. Truly missing / garbage input + return EpcFloorDescriptions.unknown + + def map_floor_construction(self): + self.data[self.landlord_floor_construction] = ( + self.data[["Floor Construction", "Floor Insulation"]] + .progress_apply(tuple, axis=1) + .map(floor_map) + ) + + self.data[self.landlord_floor_construction] = self.data.progress_apply( + self._fill_floor_as_built, + axis=1, + ) + + self.assert_no_nulls(self.data, self.landlord_floor_construction) + + def map_glazing(self): + # TODO: probably doesn't make sense to store multi glazed proportion, glazed type or glazed area. 
+ # There is maybe an argument for landlord_multi_glaze_proportion as this could be variable, + # however + self.data[ + [ + self.landlord_windows_type, + self.landlord_windows_efficiency, + self.landlord_multi_glaze_proportion, + self.landlord_glazed_type, + self.landlord_glazed_area + ] + ] = self.data["Glazing"].map(glazing_map).progress_apply(pd.Series) + + def map_heating(self): + # TODO - when mapping heating controls, we should check the existing heating controls and the efficiency rating + # For sub optimal heating controls, we're going to make an assumption as to what the heating controls are + # and the energy efficiency rating we prescribe here may not be accurate. We therefore use this as an + # upper limit + # as opposed to a guaranteed efficiency rating. To stress, this is only relevant for sub optimal heating + # controls. E.g. it may be programmer and room thermostat + self.data[ + [ + self.landlord_heating_construction, + self.landlord_heating_efficiency, + self.landlord_fuel_type, + self.landlord_heating_controls, + self.landlord_heating_controls_efficiency, + self.landlord_hot_water_system, + self.landlord_hot_water_efficiency + ] + ] = self.data[ + [ + "Heating", + "Boiler Efficiency", + "Main Fuel", + "Controls Adequacy" + ] + ].progress_apply(tuple, axis=1).map(heating_map).progress_apply(pd.Series) + + def map_floor_area(self): + # This is just a rename + self.data = self.data.rename( + columns={"Total Floor Area (m2)": self.landlord_total_floor_area_m2} + ) + + def select_columns(self): + self.data = self.data[ + [ + "Org Ref", + "UPRN", + "Address 1", + "Address 2", + "Address 3", + "Postcode", + self.landlord_total_floor_area_m2, + self.landlord_construction_age_band, + self.landlord_property_type, + self.landlord_built_form, + self.landlord_wall_construction, + self.landlord_wall_efficiency, + self.landlord_roof_construction, + self.landlord_roof_efficiency, + self.landlord_has_sloping_ceiling, + self.landlord_floor_construction, + 
self.landlord_windows_type, + self.landlord_windows_efficiency, + self.landlord_multi_glaze_proportion, + self.landlord_glazed_type, + self.landlord_glazed_area, + self.landlord_heating_construction, + self.landlord_heating_efficiency, + self.landlord_fuel_type, + self.landlord_heating_controls, + self.landlord_heating_controls_efficiency, + self.landlord_hot_water_system, + self.landlord_hot_water_efficiency + ] + ].rename( + columns={ + "Org Ref": "landlord_property_id", + "Address1": "address1", + "Address2": "address2", + "Address3": "address3", + "Postcode": "postcode", + } + ) + + def extract_values(self): + for columns in [ + self.landlord_construction_age_band, self.landlord_property_type, self.landlord_built_form, + self.landlord_wall_construction, self.landlord_wall_efficiency, self.landlord_roof_construction, + self.landlord_roof_efficiency, self.landlord_floor_construction, self.landlord_windows_type, + self.landlord_windows_efficiency, self.landlord_heating_construction, self.landlord_heating_efficiency, + self.landlord_fuel_type, self.landlord_heating_controls, self.landlord_heating_controls_efficiency, + self.landlord_hot_water_system, self.landlord_hot_water_efficiency + ]: + self.data[columns] = self.data[columns].progress_apply(lambda x: x.value if hasattr(x, "value") else x) + + def transform(self): + # ------------ construction_age_band ------------ + self.map_construction_age_band() + + # ------------ property_type ------------ + self.map_property_type() + + # ------------ built_form ------------ + self.map_built_form() + + # ------------ Wall Construction ------------ + self.map_wall_construction() + + # ------------ Roof Construction ------------ + self.map_roof_construction() + + # ------------ Floor Construction ------------ + self.map_floor_construction() + + # ------------ Glazing ------------ + self.map_glazing() + + # ------------ Heating, fuel, controls & hot water ------------ + self.map_heating() + + # ------------ Floor Area 
------------ + self.map_floor_area() + + # ------------ Formating ------------ + self.select_columns() + self.extract_values() diff --git a/backend/onboarders/requirements.txt b/backend/onboarders/requirements.txt new file mode 100644 index 00000000..907cb877 --- /dev/null +++ b/backend/onboarders/requirements.txt @@ -0,0 +1,6 @@ +boto3 +numpy==2.1.2 +pandas==2.2.3 +tqdm==4.66.5 +pydantic==2.9.2 +openpyxl==3.1.2 \ No newline at end of file diff --git a/backend/onboarders/tests/test_floor_remapping.py b/backend/onboarders/tests/test_floor_remapping.py new file mode 100644 index 00000000..c20372b7 --- /dev/null +++ b/backend/onboarders/tests/test_floor_remapping.py @@ -0,0 +1,97 @@ +import pytest + +from datatypes.epc.construction_age_band import EpcConstructionAgeBand +from datatypes.epc.floor import EpcFloorDescriptions + +from backend.onboarders.mappings.parity.as_built_floor_classifiers import ( + unknown_floor_as_built, + unknown_floor_retrofitted, + map_solid_floor_as_built, + map_suspended_floor_as_built, +) + + +@pytest.mark.parametrize( + "age_band,expected", + [ + # Before 1900 / 1900–1929 → suspended, no insulation + (EpcConstructionAgeBand.before_1900, EpcFloorDescriptions.suspended_no_insulation_assumed), + (EpcConstructionAgeBand.from_1900_to_1929, EpcFloorDescriptions.suspended_no_insulation_assumed), + + # 1930–1995 → solid, no insulation + (EpcConstructionAgeBand.from_1930_to_1949, EpcFloorDescriptions.solid_no_insulation_assumed), + (EpcConstructionAgeBand.from_1950_to_1966, EpcFloorDescriptions.solid_no_insulation_assumed), + (EpcConstructionAgeBand.from_1967_to_1975, EpcFloorDescriptions.solid_no_insulation_assumed), + (EpcConstructionAgeBand.from_1976_to_1982, EpcFloorDescriptions.solid_no_insulation_assumed), + (EpcConstructionAgeBand.from_1983_to_1990, EpcFloorDescriptions.solid_no_insulation_assumed), + (EpcConstructionAgeBand.from_1991_to_1995, EpcFloorDescriptions.solid_no_insulation_assumed), + + # 1996–2002 → solid, limited insulation + 
(EpcConstructionAgeBand.from_1996_to_2002, EpcFloorDescriptions.solid_limited_insulation_assumed), + + # 2003+ → solid, insulated + (EpcConstructionAgeBand.from_2003_to_2006, EpcFloorDescriptions.solid_insulated_assumed), + (EpcConstructionAgeBand.from_2012_to_2022, EpcFloorDescriptions.solid_insulated_assumed), + (EpcConstructionAgeBand.from_2023_onwards, EpcFloorDescriptions.solid_insulated_assumed), + ], +) +def test_unknown_floor_as_built(age_band, expected): + assert unknown_floor_as_built(age_band) == expected + + +@pytest.mark.parametrize( + "age_band,expected", + [ + # Pre-1930 → suspended, insulated + (EpcConstructionAgeBand.before_1900, EpcFloorDescriptions.suspended_insulated), + (EpcConstructionAgeBand.from_1900_to_1929, EpcFloorDescriptions.suspended_insulated), + + # 1930+ → solid, insulated + (EpcConstructionAgeBand.from_1930_to_1949, EpcFloorDescriptions.solid_insulated), + (EpcConstructionAgeBand.from_1950_to_1966, EpcFloorDescriptions.solid_insulated), + (EpcConstructionAgeBand.from_1976_to_1982, EpcFloorDescriptions.solid_insulated), + (EpcConstructionAgeBand.from_2023_onwards, EpcFloorDescriptions.solid_insulated), + ], +) +def test_unknown_floor_retrofitted(age_band, expected): + assert unknown_floor_retrofitted(age_band) == expected + + +@pytest.mark.parametrize( + "age_band,expected", + [ + # 1983–1995 → no insulation + (EpcConstructionAgeBand.from_1983_to_1990, EpcFloorDescriptions.solid_no_insulation_assumed), + (EpcConstructionAgeBand.from_1991_to_1995, EpcFloorDescriptions.solid_no_insulation_assumed), + + # 1996–2002 → limited insulation + (EpcConstructionAgeBand.from_1996_to_2002, EpcFloorDescriptions.solid_limited_insulation_assumed), + + # 2003+ → insulated + (EpcConstructionAgeBand.from_2003_to_2006, EpcFloorDescriptions.solid_insulated_assumed), + (EpcConstructionAgeBand.from_2012_to_2022, EpcFloorDescriptions.solid_insulated_assumed), + (EpcConstructionAgeBand.from_2023_onwards, EpcFloorDescriptions.solid_insulated_assumed), + ], 
+) +def test_solid_floor_as_built(age_band, expected): + assert map_solid_floor_as_built(age_band) == expected + + +@pytest.mark.parametrize( + "age_band,expected", + [ + # 1983–1995 → no insulation + (EpcConstructionAgeBand.from_1983_to_1990, EpcFloorDescriptions.suspended_no_insulation_assumed), + (EpcConstructionAgeBand.from_1991_to_1995, EpcFloorDescriptions.suspended_no_insulation_assumed), + + # 1996–2002 → limited insulation + (EpcConstructionAgeBand.from_1996_to_2002, EpcFloorDescriptions.suspended_limited_insulation_assumed), + + # 2003+ → insulated + (EpcConstructionAgeBand.from_2003_to_2006, EpcFloorDescriptions.suspended_insulated_assumed), + (EpcConstructionAgeBand.from_2012_to_2022, EpcFloorDescriptions.suspended_insulated_assumed), + (EpcConstructionAgeBand.from_2023_onwards, EpcFloorDescriptions.suspended_insulated_assumed), + ], +) +def test_suspended_floor_as_built(age_band, expected): + assert map_suspended_floor_as_built(age_band) == expected diff --git a/backend/onboarders/tests/test_roof_remapping.py b/backend/onboarders/tests/test_roof_remapping.py new file mode 100644 index 00000000..cc19e057 --- /dev/null +++ b/backend/onboarders/tests/test_roof_remapping.py @@ -0,0 +1,173 @@ +import pytest + +from datatypes.epc.construction_age_band import EpcConstructionAgeBand +from datatypes.epc.roof import EpcRoofDescriptions +from datatypes.epc.efficiency import EpcEfficiency + +from backend.onboarders.mappings.parity.as_built_roof_classifiers import ( + map_flat_roof, + map_sloping_ceiling_roof, +) +from backend.onboarders.mappings.parity.roof import resolve_roof_efficiency + + +# --------------------------------------------------------------------- +# As-built roof description classification +# --------------------------------------------------------------------- + +@pytest.mark.parametrize( + "age_band, expected", + [ + (EpcConstructionAgeBand.before_1900, EpcRoofDescriptions.flat_no_insulation), + (EpcConstructionAgeBand.from_1950_to_1966, 
EpcRoofDescriptions.flat_no_insulation), + (EpcConstructionAgeBand.from_1967_to_1975, EpcRoofDescriptions.flat_limited_insulation), + (EpcConstructionAgeBand.from_1976_to_1982, EpcRoofDescriptions.flat_limited_insulation), + (EpcConstructionAgeBand.from_1983_to_1990, EpcRoofDescriptions.flat_insulated), + (EpcConstructionAgeBand.from_2007_to_2011, EpcRoofDescriptions.flat_insulated), + (EpcConstructionAgeBand.from_2023_onwards, EpcRoofDescriptions.flat_insulated), + ], +) +def test_classify_flat_roof(age_band, expected): + assert map_flat_roof(age_band) == expected + + +@pytest.mark.parametrize( + "age_band, expected", + [ + (EpcConstructionAgeBand.before_1900, EpcRoofDescriptions.sloping_pitched_no_insulation), + (EpcConstructionAgeBand.from_1967_to_1975, EpcRoofDescriptions.sloping_pitched_no_insulation), + (EpcConstructionAgeBand.from_1976_to_1982, EpcRoofDescriptions.sloping_pitched_limited_insulation), + (EpcConstructionAgeBand.from_1983_to_1990, EpcRoofDescriptions.sloping_pitched_insulated), + (EpcConstructionAgeBand.from_2012_to_2022, EpcRoofDescriptions.sloping_pitched_insulated), + (EpcConstructionAgeBand.from_2023_onwards, EpcRoofDescriptions.sloping_pitched_insulated), + ], +) +def test_classify_sloping_ceiling_roof(age_band, expected): + assert map_sloping_ceiling_roof(age_band) == expected + + +# --------------------------------------------------------------------- +# Roof efficiency — fixed & age-band driven +# --------------------------------------------------------------------- + +@pytest.mark.parametrize( + "description, age_band, expected", + [ + # Flat roof, no insulation + (EpcRoofDescriptions.flat_no_insulation, EpcConstructionAgeBand.before_1900, EpcEfficiency.VERY_POOR), + + # Flat roof, limited insulation (age-band driven) + (EpcRoofDescriptions.flat_limited_insulation, EpcConstructionAgeBand.from_1976_to_1982, EpcEfficiency.POOR), + ( + EpcRoofDescriptions.flat_limited_insulation, EpcConstructionAgeBand.from_1967_to_1975, + 
EpcEfficiency.VERY_POOR), + + # Flat roof, insulated (age-band driven) + (EpcRoofDescriptions.flat_insulated, EpcConstructionAgeBand.from_1983_to_1990, EpcEfficiency.AVERAGE), + (EpcRoofDescriptions.flat_insulated, EpcConstructionAgeBand.from_2003_to_2006, EpcEfficiency.GOOD), + (EpcRoofDescriptions.flat_insulated, EpcConstructionAgeBand.from_2023_onwards, EpcEfficiency.VERY_GOOD), + + # Pitched, insulated assumed (loft) + (EpcRoofDescriptions.pitched_insulated_assumed, EpcConstructionAgeBand.from_1996_to_2002, EpcEfficiency.GOOD), + (EpcRoofDescriptions.pitched_insulated_assumed, EpcConstructionAgeBand.from_2007_to_2011, + EpcEfficiency.VERY_GOOD), + ], +) +def test_roof_efficiency_age_band_only(description, age_band, expected): + assert resolve_roof_efficiency( + description=description, + age_band=age_band, + insulation_thickness=None, + ) == expected + + +# --------------------------------------------------------------------- +# Roof efficiency — insulation thickness driven +# --------------------------------------------------------------------- + +@pytest.mark.parametrize( + "description, thickness, expected", + [ + # Loft insulation + (EpcRoofDescriptions.loft_12mm_insulation, 12, EpcEfficiency.VERY_POOR), + (EpcRoofDescriptions.loft_25mm_insulation, 25, EpcEfficiency.POOR), + (EpcRoofDescriptions.loft_75mm_insulation, 75, EpcEfficiency.AVERAGE), + (EpcRoofDescriptions.loft_150mm_insulation, 150, EpcEfficiency.GOOD), + (EpcRoofDescriptions.loft_300mm_insulation, 300, EpcEfficiency.VERY_GOOD), + + # Flat insulated — thickness overrides age band + (EpcRoofDescriptions.flat_insulated, 50, EpcEfficiency.POOR), + (EpcRoofDescriptions.flat_insulated, 100, EpcEfficiency.AVERAGE), + (EpcRoofDescriptions.flat_insulated, 200, EpcEfficiency.GOOD), + (EpcRoofDescriptions.flat_insulated, 300, EpcEfficiency.VERY_GOOD), + + # Sloping ceiling + (EpcRoofDescriptions.sloping_pitched_insulated, 75, EpcEfficiency.AVERAGE), + (EpcRoofDescriptions.sloping_pitched_insulated, 150, 
EpcEfficiency.GOOD), + (EpcRoofDescriptions.sloping_pitched_insulated, 350, EpcEfficiency.VERY_GOOD), + ], +) +def test_roof_efficiency_thickness_based(description, thickness, expected): + assert resolve_roof_efficiency( + description=description, + age_band=EpcConstructionAgeBand.before_1900, # should be ignored + insulation_thickness=thickness, + ) == expected + + +# --------------------------------------------------------------------- +# Thatched roofs +# --------------------------------------------------------------------- + +@pytest.mark.parametrize( + "description, age_band, expected", + [ + (EpcRoofDescriptions.thatched, EpcConstructionAgeBand.before_1900, EpcEfficiency.AVERAGE), + (EpcRoofDescriptions.thatched, EpcConstructionAgeBand.from_2003_to_2006, EpcEfficiency.GOOD), + (EpcRoofDescriptions.thatched, EpcConstructionAgeBand.from_2023_onwards, EpcEfficiency.VERY_GOOD), + ], +) +def test_thatched_efficiency_age_band(description, age_band, expected): + assert resolve_roof_efficiency( + description=description, + age_band=age_band, + insulation_thickness=None, + ) == expected + + +@pytest.mark.parametrize( + "thickness, expected", + [ + (12, EpcEfficiency.AVERAGE), + (50, EpcEfficiency.GOOD), + (150, EpcEfficiency.GOOD), + (200, EpcEfficiency.VERY_GOOD), + ], +) +def test_thatched_efficiency_thickness(thickness, expected): + assert resolve_roof_efficiency( + description=EpcRoofDescriptions.thatched_with_additional_insulation, + age_band=EpcConstructionAgeBand.before_1900, + insulation_thickness=thickness, + ) == expected + + +# --------------------------------------------------------------------- +# Unknown / holding descriptions +# --------------------------------------------------------------------- + +@pytest.mark.parametrize( + "description", + [ + EpcRoofDescriptions.flat_as_built_unknown, + EpcRoofDescriptions.loft_as_built_unknown, + EpcRoofDescriptions.thatched_as_built_unknown, + EpcRoofDescriptions.sloping_pitched_as_built_unknown, + ], +) +def 
test_unknown_roof_descriptions_return_na(description): + assert resolve_roof_efficiency( + description=description, + age_band=None, + insulation_thickness=None, + ) == EpcEfficiency.NA diff --git a/backend/onboarders/tests/test_wall_remapping.py b/backend/onboarders/tests/test_wall_remapping.py new file mode 100644 index 00000000..c9476211 --- /dev/null +++ b/backend/onboarders/tests/test_wall_remapping.py @@ -0,0 +1,161 @@ +import pytest + +from datatypes.epc.construction_age_band import EpcConstructionAgeBand +from datatypes.epc.walls import EpcWallDescriptions +from datatypes.epc.efficiency import EpcEfficiency + +from backend.onboarders.mappings.parity.walls import resolve_wall_efficiency +from backend.onboarders.mappings.parity.as_built_wall_classifiers import ( + map_cavity_wall_insulation, + map_solid_wall_insulation, + map_timber_frame_wall_insulation, + map_system_build_wall_insulation, + map_granite_wall_insulation, + map_sandstone_wall_insulation, + map_cob_wall_insulation, +) + + +# --------------------------------------------------------------------- +# As-built wall description classification +# --------------------------------------------------------------------- + +@pytest.mark.parametrize( + "age_band, expected", + [ + (EpcConstructionAgeBand.before_1900, EpcWallDescriptions.cavity_no_insulation_assumed), + (EpcConstructionAgeBand.from_1950_to_1966, EpcWallDescriptions.cavity_no_insulation_assumed), + (EpcConstructionAgeBand.from_1976_to_1982, EpcWallDescriptions.cavity_partial_insulated_assumed), + (EpcConstructionAgeBand.from_1983_to_1990, EpcWallDescriptions.cavity_insulated_assumed), + (EpcConstructionAgeBand.from_2023_onwards, EpcWallDescriptions.cavity_insulated_assumed), + ], +) +def test_map_cavity_wall_insulation(age_band, expected): + assert map_cavity_wall_insulation(age_band) == expected + + +@pytest.mark.parametrize( + "age_band, expected", + [ + (EpcConstructionAgeBand.before_1900, 
EpcWallDescriptions.solid_brick_no_insulation_assumed), + (EpcConstructionAgeBand.from_1976_to_1982, EpcWallDescriptions.solid_brick_partial_insulated_assumed), + (EpcConstructionAgeBand.from_1996_to_2002, EpcWallDescriptions.solid_brick_insulated_assumed), + ], +) +def test_map_solid_wall_insulation(age_band, expected): + assert map_solid_wall_insulation(age_band) == expected + + +@pytest.mark.parametrize( + "age_band, expected", + [ + (EpcConstructionAgeBand.before_1900, EpcWallDescriptions.timber_frame_no_insulation_assumed), + (EpcConstructionAgeBand.from_1950_to_1966, EpcWallDescriptions.timber_frame_partial_insulated_assumed), + (EpcConstructionAgeBand.from_1983_to_1990, EpcWallDescriptions.timber_frame_insulated_assumed), + ], +) +def test_map_timber_frame_wall_insulation(age_band, expected): + assert map_timber_frame_wall_insulation(age_band) == expected + + +@pytest.mark.parametrize( + "age_band, expected", + [ + (EpcConstructionAgeBand.before_1900, EpcWallDescriptions.system_no_insulation_assumed), + (EpcConstructionAgeBand.from_1976_to_1982, EpcWallDescriptions.system_partial_insulated_assumed), + (EpcConstructionAgeBand.from_2003_to_2006, EpcWallDescriptions.system_insulated_assumed), + ], +) +def test_map_system_wall_insulation(age_band, expected): + assert map_system_build_wall_insulation(age_band) == expected + + +@pytest.mark.parametrize( + "age_band, expected", + [ + (EpcConstructionAgeBand.before_1900, EpcWallDescriptions.granite_whinstone_no_insulation_assumed), + (EpcConstructionAgeBand.from_1976_to_1982, EpcWallDescriptions.granite_whinstone_partial_insulated_assumed), + (EpcConstructionAgeBand.from_2012_to_2022, EpcWallDescriptions.granite_whinestone_insulated_assumed), + ], +) +def test_map_granite_wall_insulation(age_band, expected): + assert map_granite_wall_insulation(age_band) == expected + + +@pytest.mark.parametrize( + "age_band, expected", + [ + (EpcConstructionAgeBand.before_1900, 
EpcWallDescriptions.sandstone_limestone_no_insulation_assumed), + (EpcConstructionAgeBand.from_1976_to_1982, EpcWallDescriptions.sandstone_limestone_partial_insulated_assumed), + (EpcConstructionAgeBand.from_2007_to_2011, EpcWallDescriptions.sandstone_limestone_insulated_assumed), + ], +) +def test_map_sandstone_wall_insulation(age_band, expected): + assert map_sandstone_wall_insulation(age_band) == expected + + +@pytest.mark.parametrize( + "age_band, expected", + [ + (EpcConstructionAgeBand.before_1900, EpcWallDescriptions.cob_as_built_average), + (EpcConstructionAgeBand.from_1976_to_1982, EpcWallDescriptions.cob_as_built_average), + (EpcConstructionAgeBand.from_1983_to_1990, EpcWallDescriptions.cob_as_built_good), + ], +) +def test_map_cob_wall_insulation(age_band, expected): + assert map_cob_wall_insulation(age_band) == expected + + +# --------------------------------------------------------------------- +# Wall efficiency resolution +# --------------------------------------------------------------------- + +@pytest.mark.parametrize( + "description, age_band, expected", + [ + # Fixed efficiencies + (EpcWallDescriptions.cavity_no_insulation_assumed, None, EpcEfficiency.POOR), + (EpcWallDescriptions.cavity_partial_insulated_assumed, None, EpcEfficiency.AVERAGE), + (EpcWallDescriptions.cavity_insulated_assumed, None, EpcEfficiency.GOOD), + + # Function-based efficiencies + ( + EpcWallDescriptions.cavity_filled_cavity, + EpcConstructionAgeBand.from_2023_onwards, + EpcEfficiency.VERY_GOOD, + ), + ( + EpcWallDescriptions.cavity_filled_cavity, + EpcConstructionAgeBand.from_1991_to_1995, + EpcEfficiency.GOOD, + ), + ( + EpcWallDescriptions.solid_brick_internal_insulation, + EpcConstructionAgeBand.from_2003_to_2006, + EpcEfficiency.VERY_GOOD, + ), + ( + EpcWallDescriptions.solid_brick_internal_insulation, + EpcConstructionAgeBand.from_1950_to_1966, + EpcEfficiency.GOOD, + ), + ], +) +def test_resolve_wall_efficiency(description, age_band, expected): + assert 
resolve_wall_efficiency(description, age_band) == expected + + +@pytest.mark.parametrize( + "description", + [ + EpcWallDescriptions.cavity_as_built_unknown, + EpcWallDescriptions.solid_brick_as_built_unknown, + EpcWallDescriptions.system_as_built_unknown, + EpcWallDescriptions.timber_frame_as_built_unknown, + EpcWallDescriptions.granite_as_built_unknown, + EpcWallDescriptions.sandstone_as_built_unknown, + EpcWallDescriptions.cob_as_built_unknown, + ], +) +def test_unknown_wall_descriptions_return_na(description): + assert resolve_wall_efficiency(description, None) == EpcEfficiency.NA diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile new file mode 100644 index 00000000..7c1a7989 --- /dev/null +++ b/backend/postcode_splitter/handler/Dockerfile @@ -0,0 +1,9 @@ +FROM public.ecr.aws/lambda/python:3.10 + +# Set working directory (Lambda task root) +WORKDIR /var/task + +# ----------------------------- +# Lambda handler +# ----------------------------- +CMD ["main.handler"] diff --git a/backend/postcode_splitter/handler/requirements.txt b/backend/postcode_splitter/handler/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py new file mode 100644 index 00000000..d55f618a --- /dev/null +++ b/backend/postcode_splitter/main.py @@ -0,0 +1,127 @@ +import pandas as pd +import requests +from backend.address2UPRN.main import ( + resolve_uprns_for_postcode_group, + get_epc_data_with_postcode, +) +from tqdm import tqdm + + +def sanitise_postcode(postcode: str) -> str | None: + """ + Normalise postcode for grouping. + + - Uppercase + - Remove all whitespace + """ + if pd.isna(postcode): + return None + + return postcode.upper().replace(" ", "") + + +def is_valid_postcode(postcode_clean: str) -> bool: + """ + Validate postcode using postcodes.io. + + Expects a sanitised postcode (e.g. E84SQ). + Returns True if valid, False otherwise. 
def main(
    path: str = "hackney.xlsx",
    sheet_name: str = "Sustainability",
    limit: int | None = 500,
) -> pd.DataFrame:
    """
    Resolve UPRNs for an asset list, one postcode group at a time.

    Steps:
      1. Read the sheet and optionally keep only the first ``limit`` rows.
      2. Sanitise the ``Postcode`` column into ``postcode_clean``.
      3. Validate each distinct postcode once (one API call per postcode,
         not one per row) and record the outcome in ``postcode_valid``.
      4. For every valid postcode, fetch EPC data and resolve the UPRNs of
         the rows in that group.

    Args:
        path: Excel workbook to read.
        sheet_name: Sheet inside the workbook to read.
        limit: Maximum number of rows to process; ``None`` processes all rows.

    Returns:
        All processed groups concatenated into one frame. Groups that could
        not be resolved carry ``found_uprn = None`` and a ``status`` of
        ``"no_epc_results"`` or ``"exception"`` (with the message in
        ``error``). An empty frame is returned when no row has a valid
        postcode.
    """
    df = pd.read_excel(path, sheet_name=sheet_name)
    if limit is not None:
        # Copy so the column assignments below write to an independent frame
        # and not to a slice of the full sheet.
        df = df.head(limit).copy()

    # Sanitise postcodes
    df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode)

    # Validate each distinct, non-null postcode once to save API calls.
    unique_postcodes = df["postcode_clean"].dropna().unique()
    postcode_validity = {
        pc: is_valid_postcode(pc)
        for pc in tqdm(unique_postcodes, total=len(unique_postcodes))
    }

    # Map validity back onto the dataframe. Rows without a postcode are not in
    # the lookup; they must become False (not NaN), because a mask containing
    # NaN cannot be used for boolean indexing.
    df["postcode_valid"] = (
        df["postcode_clean"]
        .map(lambda pc: bool(postcode_validity.get(pc, False)))
        .astype(bool)
    )

    results = []
    valid_rows = df[df["postcode_valid"]]

    for postcode, group_df in tqdm(
        valid_rows.groupby("postcode_clean"),
        desc="Resolving UPRNs by postcode",
    ):
        try:
            epc_df = get_epc_data_with_postcode(postcode)

            if epc_df.empty:
                tmp = group_df.copy()
                tmp["found_uprn"] = None
                tmp["status"] = "no_epc_results"
                results.append(tmp)
                continue

            resolved = resolve_uprns_for_postcode_group(
                group_df=group_df,
                epc_df=epc_df,
            )

            results.append(resolved)

        except Exception as e:
            # Deliberately broad: one failing postcode group is recorded and
            # skipped so it does not abort the rest of the batch.
            tmp = group_df.copy()
            tmp["found_uprn"] = None
            tmp["status"] = "exception"
            tmp["error"] = str(e)
            results.append(tmp)

    if not results:
        # pd.concat raises ValueError when given an empty list.
        return df.iloc[0:0].copy()

    # NOTE(review): the match columns used for manual review
    # ("best_match_lexiscore", "best_match_address", "best_match_uprn") are
    # presumably added by resolve_uprns_for_postcode_group - confirm before
    # selecting them, since the fallback frames above do not add them.
    return pd.concat(results, ignore_index=True)
class EpcEfficiency(Enum):
    """
    Efficiency rating labels, from "Very Poor" to "Very Good", plus "N/A".

    Members are intentionally left unannotated: each member is an
    ``EpcEfficiency`` instance whose ``.value`` is the label string, so a
    ``: str`` annotation on the member would be inaccurate.
    """

    VERY_POOR = "Very Poor"
    POOR = "Poor"
    AVERAGE = "Average"
    GOOD = "Good"
    VERY_GOOD = "Very Good"
    NA = "N/A"
class EpcHeatingControls(Enum):
    """Heating-control description strings, grouped by the heater type they are listed under."""

    # Listed without a heater-specific heading
    programmer_room_thermostat_trvs = "Programmer, room thermostat and TRVs"
    programmers_trvs_bypass = "Programmer, TRVs and bypass"
    time_and_temperature_zone_control = "Time and temperature zone control"

    # Controls listed for room heaters
    programmer_and_appliance_thermostats = "Programmer and appliance thermostats"
    appliance_thermostats = "Appliance thermostats"

    # Controls listed for storage heaters
    automatic_charge_control = "Automatic charge control"
    manual_charge_control = "Manual charge control"

    # Controls listed for warm-air systems
    programmer_and_atleast_two_room_thermostats = "Programmer and at least two room thermostats"
class EpcHeatingSystems(Enum):
    """Main-heating description strings, grouped by system family."""

    # Boiler with radiators, one member per fuel
    boiler_and_radiators_electric = "Boiler and radiators, electric"
    boiler_and_radiators_lpg = "Boiler and radiators, LPG"
    boiler_radiators_mains_gas = "Boiler and radiators, mains gas"
    boiler_radiators_oil = "Boiler and radiators, oil"

    # Underfloor heating
    electric_underfloor_heating = "Electric underfloor heating"

    # Air source heat pumps (ASHP)
    air_to_air_ashp = "Air source heat pump, warm air, electric"
    ashp_radiators_electric = "Air source heat pump, radiators, electric"

    # Room heaters, one member per fuel
    room_heaters_electric = "Room heaters, electric"
    room_heaters_mains_gas = "Room heaters, mains gas"
    room_heaters_smokeless_fuel = "Room heaters, smokeless fuel"
    room_heaters_coal = "Room heaters, coal"

    # Storage heaters
    electric_storage_heaters = "Electric storage heaters"

    # Warm-air systems
    warm_air_electricaire = "Warm air, Electricaire"
    warm_air_mains_gas = "Warm air, mains gas"
-0,0 +1,86 @@ +from enum import Enum +from typing import List + + +class EpcRoofDescriptions(Enum): + # Loft + # Assumed options + pitched_insulated_assumed: str = "Pitched, insulated (assumed)" + pitched_no_insulation: str = "Pitched, no insulation" + # Insulation thickness options + loft_12mm_insulation: str = "Pitched, 12 mm loft insulation" + loft_25mm_insulation: str = "Pitched, 25 mm loft insulation" + loft_50mm_insulation: str = "Pitched, 50 mm loft insulation" + loft_75mm_insulation: str = "Pitched, 75 mm loft insulation" + loft_100mm_insulation: str = "Pitched, 100 mm loft insulation" + loft_125mm_insulation: str = "Pitched, 125 mm loft insulation" + loft_150mm_insulation: str = "Pitched, 150 mm loft insulation" + loft_175mm_insulation: str = "Pitched, 175 mm loft insulation" + loft_200mm_insulation: str = "Pitched, 200 mm loft insulation" + loft_250mm_insulation: str = "Pitched, 250 mm loft insulation" + loft_270mm_insulation: str = "Pitched, 270 mm loft insulation" + loft_300mm_insulation: str = "Pitched, 300 mm loft insulation" + loft_350mm_insulation: str = "Pitched, 350 mm loft insulation" + loft_400mm_plus_insulation: str = "Pitched, 400+ mm loft insulation" + # Insulated at rafters "Pitched, insulated at rafters" + # Rafters + # 400mm, 350mm = very good + # 200-300mm = good + # 125-175 = average + # 50-100 = poor + # 25 and below= very poor + loft_insulated_at_rafters: str = "Pitched, insulated at rafters" + # another dwelling above + another_dwelling_above: str = "(another dwelling above)" + # flat roof, which if there is observed insulation is just "flat, insulated", however there is a + # different efficiency rating depending on insulation thickness + # categories: + # 12mm = very poor & has limited insulation description + # 25, 50 = poor & has limited insulation description + # 75, 100, 125mm = average (Flat, insulated) + # 150, 175, 200, 225, 250mm = good (Flat, insulated) + # 270mm+ = very good (Flat, insulated) + # As built 2023 = Flat, 
insulated, Very good + # 2003 - 2006, up to 2012-2022 = Flat insulated, Good + # 1983-1990, 1996-2002 = Flat, insulated, Average + # 1976-1982 = Flat, limited insulation, poor + # 1967 - 1975 = Flat, limited insulation, Very Poor + # 1950-1966 and earlier bands = flat, no insulation, very poor + + flat_insulated: str = "Flat, insulated" + flat_limited_insulation: str = "Flat, limited insulation" + flat_no_insulation: str = "Flat, no insulation" + + # Thatched roof descriptions + # With Loft insulation at joists + # Thatched + 12mm = thatched, with additional insulation, average + # Thatched + 25, 50, 100, 150mm = thatched, with additional insulation, good + # Thatched + 175mm+ = thatched, with additional insulation, very good + # With loft insulation at rafters [out of scope atm] + # Unknown insulation + # Pre 1900, 1930-1949, 1967-1975, 1983-1990, 1996-2002 = "Thatched", Average + # 2003-2006, 2012-2022 = "Thatched", Good + # 2023 onwards = "Thatched", Very Good + thatched: str = "Thatched" # We see this for no insulation, has average performance + thatched_with_additional_insulation: str = "Thatched, with additional insulation" + + # Sloping ceiling + # For sloping ceiling tags, we don't use any (assumed) tags so that it's unambiguous that the roof is sloped + sloping_pitched_no_insulation: str = "Pitched, no insulation" + sloping_pitched_limited_insulation: str = "Pitched, limited insulation" + sloping_pitched_insulated: str = "Pitched, insulated" + + # Unknown descriptions which may get mapped later or handled via fallback + flat_as_built_unknown: str = "Flat, as built, unknown insulation" + loft_as_built_unknown: str = "Loft, as built, unknown insulation" + thatched_as_built_unknown: str = "Thatched, as built, unknown insulation" + sloping_pitched_as_built_unknown: str = "Pitched, as built, unknown insulation" + + @property + def unknown_descriptions(self) -> List["EpcRoofDescriptions"]: + return [ + EpcRoofDescriptions.flat_as_built_unknown, + 
class EpcWallDescriptions(Enum):
    """
    Wall description strings, grouped by wall construction.

    Members are left unannotated: each member is an ``EpcWallDescriptions``
    instance whose ``.value`` is the description string.

    Two names that share a value are aliases of one member (standard ``Enum``
    behaviour): they are the same object, compare equal, and only the
    first-defined name appears when iterating. This applies to the two
    ``cob_as_built_*`` names and to the two spellings of the granite
    "insulated (assumed)" member below.
    """

    # Cavity wall descriptions
    cavity_insulated_assumed = "Cavity wall, as built, insulated (assumed)"
    cavity_partial_insulated_assumed = "Cavity wall, as built, partial insulation (assumed)"
    cavity_no_insulation_assumed = "Cavity wall, as built, no insulation (assumed)"
    cavity_filled_cavity = "Cavity wall, filled cavity"
    cavity_internal_insulation = "Cavity wall, with internal insulation"
    cavity_external_insulation = "Cavity wall, with external insulation"
    cavity_filled_plus_internal = "Cavity wall, filled cavity and internal insulation"
    cavity_filled_plus_external = "Cavity wall, filled cavity and external insulation"

    # Solid wall descriptions
    solid_brick_internal_insulation = "Solid brick, with internal insulation"
    solid_brick_external_insulation = "Solid brick, with external insulation"
    solid_brick_no_insulation_assumed = 'Solid brick, as built, no insulation (assumed)'
    solid_brick_partial_insulated_assumed = 'Solid brick, as built, partial insulation (assumed)'
    solid_brick_insulated_assumed = 'Solid brick, as built, insulated (assumed)'

    # System
    system_external_insulation = "System built, with external insulation"
    system_internal_insulation = "System built, with internal insulation"
    system_no_insulation_assumed = "System built, as built, no insulation (assumed)"
    system_partial_insulated_assumed = "System built, as built, partial insulation (assumed)"
    system_insulated_assumed = "System built, as built, insulated (assumed)"

    # Timber
    timber_frame_internal_insulation = "Timber frame, with internal insulation"
    timber_frame_external_insulation = "Timber frame, with external insulation"
    timber_frame_no_insulation_assumed = "Timber frame, as built, no insulation (assumed)"
    timber_frame_partial_insulated_assumed = "Timber frame, as built, partial insulation (assumed)"
    timber_frame_insulated_assumed = "Timber frame, as built, insulated (assumed)"

    # Granite/whinstone
    granite_whinstone_external_insulation = "Granite or whin, with external insulation"
    granite_whinstone_internal_insulation = "Granite or whin, with internal insulation"
    granite_whinstone_no_insulation_assumed = "Granite or whin, as built, no insulation (assumed)"
    granite_whinstone_partial_insulated_assumed = "Granite or whin, as built, partial insulation (assumed)"
    # Historical misspelling ("whinestone"). Kept first so it stays the
    # canonical member name and existing references keep working.
    granite_whinestone_insulated_assumed = "Granite or whin, as built, insulated (assumed)"
    # Correctly spelt alias, consistent with the other granite_whinstone_* names.
    granite_whinstone_insulated_assumed = "Granite or whin, as built, insulated (assumed)"

    # Sandstone/limestone
    sandstone_limestone_internal_insulation = "Sandstone, with internal insulation"
    sandstone_limestone_external_insulation = "Sandstone, with external insulation"
    sandstone_limestone_no_insulation_assumed = "Sandstone, as built, no insulation (assumed)"
    sandstone_limestone_partial_insulated_assumed = "Sandstone, as built, partial insulation (assumed)"
    sandstone_limestone_insulated_assumed = "Sandstone, as built, insulated (assumed)"

    # Cob
    # Both names carry the same description string, so cob_as_built_good is an
    # alias of cob_as_built_average: code (and tests) cannot tell them apart by
    # comparing members.
    cob_as_built_average = "Cob, as built"
    cob_as_built_good = "Cob, as built"

    # unknown descriptions which may get mapped later or handled via fallback
    cavity_as_built_unknown = "Cavity wall, as built, unknown insulation"
    solid_brick_as_built_unknown = "Solid brick, as built, unknown insulation"
    system_as_built_unknown = "System built, as built, unknown insulation"
    timber_frame_as_built_unknown = "Timber frame, as built, unknown insulation"
    granite_as_built_unknown = "Granite or whin, as built, unknown insulation"
    sandstone_as_built_unknown = "Sandstone, as built, unknown insulation"
    cob_as_built_unknown = "Cob, as built, unknown insulation"

    @property
    def unknown_descriptions(self) -> List["EpcWallDescriptions"]:
        """
        Return every "unknown insulation" member, one per wall construction.

        This is a property on members, so read it from any member, e.g.
        ``EpcWallDescriptions.cavity_as_built_unknown.unknown_descriptions``.
        The result does not depend on which member it is read from.
        """
        return [
            EpcWallDescriptions.cavity_as_built_unknown,
            EpcWallDescriptions.solid_brick_as_built_unknown,
            EpcWallDescriptions.system_as_built_unknown,
            EpcWallDescriptions.timber_frame_as_built_unknown,
            EpcWallDescriptions.granite_as_built_unknown,
            EpcWallDescriptions.sandstone_as_built_unknown,
            EpcWallDescriptions.cob_as_built_unknown,
        ]
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " +# "solid floor, ashp 3.0 - corrected.xlsx" +# ) +# epc_b_recommendations = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC B - no " +# "solid floor, ashp 3.0 - corrected.xlsx" +# ) -epc_c_movers = epc_b_recommendations[ - epc_b_recommendations["current_epc_rating"] == "Epc.C" - ] -epc_c_movers["property_type"].value_counts() +# epc_c_movers = epc_b_recommendations[ +# epc_b_recommendations["current_epc_rating"] == "Epc.C" +# ] +# epc_c_movers["property_type"].value_counts() -house_epc_c_movers = epc_c_movers[ - epc_c_movers["property_type"] == "House" - ] -house_epc_c_movers_with_solar = house_epc_c_movers[ - ~pd.isnull(house_epc_c_movers["solar_pv"]) | ~pd.isnull(house_epc_c_movers["solar_pv_with_battery"]) - ] +# house_epc_c_movers = epc_c_movers[ +# epc_c_movers["property_type"] == "House" +# ] +# house_epc_c_movers_with_solar = house_epc_c_movers[ +# ~pd.isnull(house_epc_c_movers["solar_pv"]) | ~pd.isnull(house_epc_c_movers["solar_pv_with_battery"]) +# ] -house_epc_c_movers_with_a_heatpump = house_epc_c_movers[ - ~pd.isnull(house_epc_c_movers["air_source_heat_pump"]) -] +# house_epc_c_movers_with_a_heatpump = house_epc_c_movers[ +# ~pd.isnull(house_epc_c_movers["air_source_heat_pump"]) +# ] -flat_epc_c_movers = epc_c_movers[ - epc_c_movers["property_type"] == "Flat" - ] +# flat_epc_c_movers = epc_c_movers[ +# epc_c_movers["property_type"] == "Flat" +# ] -epc_c_recommendations["sap_points"].mean() -epc_c_recommendations["sap_points"].mean() +# epc_c_recommendations["sap_points"].mean() +# epc_c_recommendations["sap_points"].mean() -measure_cols = [ - "air_source_heat_pump", - "boiler_upgrade", - "cavity_wall_insulation", - "double_glazing", - "external_wall_insulation", - "flat_roof_insulation", - "high_heat_retention_storage_heaters", - "internal_wall_insulation", - 
"loft_insulation", - "low_energy_lighting", - "mechanical_ventilation", - "room_roof_insulation", - "roomstat_programmer_trvs", - "sealing_open_fireplace", - "secondary_glazing", - "secondary_heating", - "solar_pv", - "solar_pv_with_battery", - "suspended_floor_insulation", - "time_temperature_zone_control", -] +# measure_cols = [ +# "air_source_heat_pump", +# "boiler_upgrade", +# "cavity_wall_insulation", +# "double_glazing", +# "external_wall_insulation", +# "flat_roof_insulation", +# "high_heat_retention_storage_heaters", +# "internal_wall_insulation", +# "loft_insulation", +# "low_energy_lighting", +# "mechanical_ventilation", +# "room_roof_insulation", +# "roomstat_programmer_trvs", +# "sealing_open_fireplace", +# "secondary_glazing", +# "secondary_heating", +# "solar_pv", +# "solar_pv_with_battery", +# "suspended_floor_insulation", +# "time_temperature_zone_control", +# ] -epc_c_melted = ( - epc_c_recommendations - .melt( - id_vars=[c for c in epc_c_recommendations.columns if c not in measure_cols], - value_vars=measure_cols, - var_name="measure_type", - value_name="value", - ) - .dropna(subset=["value"]) -) -epc_c_melted = epc_c_melted[epc_c_melted["value"] > 0] -epc_c_measures = epc_c_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() +# epc_c_melted = ( +# epc_c_recommendations +# .melt( +# id_vars=[c for c in epc_c_recommendations.columns if c not in measure_cols], +# value_vars=measure_cols, +# var_name="measure_type", +# value_name="value", +# ) +# .dropna(subset=["value"]) +# ) +# epc_c_melted = epc_c_melted[epc_c_melted["value"] > 0] +# epc_c_measures = epc_c_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() -epc_b_melted = ( - epc_b_recommendations - .melt( - id_vars=[c for c in epc_b_recommendations.columns if c not in measure_cols], - value_vars=measure_cols, - var_name="measure_type", - value_name="value", - ) - .dropna(subset=["value"]) -) +# epc_b_melted = ( +# epc_b_recommendations +# .melt( 
+# id_vars=[c for c in epc_b_recommendations.columns if c not in measure_cols], +# value_vars=measure_cols, +# var_name="measure_type", +# value_name="value", +# ) +# .dropna(subset=["value"]) +# ) -epc_b_melted = epc_b_melted[epc_b_melted["value"] > 0] -epc_b_measures = epc_b_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() +# epc_b_melted = epc_b_melted[epc_b_melted["value"] > 0] +# epc_b_measures = epc_b_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() -measures_compared = epc_c_measures.merge( - epc_b_measures, - left_on="measure_type", - right_on="measure_type", - suffixes=("_epc_c", "_epc_b"), -) +# measures_compared = epc_c_measures.merge( +# epc_b_measures, +# left_on="measure_type", +# right_on="measure_type", +# suffixes=("_epc_c", "_epc_b"), +# ) -epc_c_retrofits = epc_c_recommendations[ - epc_c_recommendations["total_retrofit_cost"] > 0 - ] +# epc_c_retrofits = epc_c_recommendations[ +# epc_c_recommendations["total_retrofit_cost"] > 0 +# ] -epc_b_retrofits = epc_b_recommendations[ - epc_b_recommendations["total_retrofit_cost"] > 0 - ] +# epc_b_retrofits = epc_b_recommendations[ +# epc_b_recommendations["total_retrofit_cost"] > 0 +# ] -epc_c_retrofits["sap_points"].mean() -epc_b_retrofits["sap_points"].mean() +# epc_c_retrofits["sap_points"].mean() +# epc_b_retrofits["sap_points"].mean() -properties_in_both = epc_c_retrofits.merge(epc_b_retrofits, on="uprn", suffixes=("_epc_c", "_epc_b")) +# properties_in_both = epc_c_retrofits.merge(epc_b_retrofits, on="uprn", suffixes=("_epc_c", "_epc_b")) -properties_in_both["total_retrofit_cost_epc_c"].mean() -properties_in_both["sap_points_epc_c"].mean() -properties_in_both["total_retrofit_cost_epc_b"].mean() -properties_in_both["sap_points_epc_b"].mean() +# properties_in_both["total_retrofit_cost_epc_c"].mean() +# properties_in_both["sap_points_epc_c"].mean() +# properties_in_both["total_retrofit_cost_epc_b"].mean() +# 
properties_in_both["sap_points_epc_b"].mean() # Solar PV savings - we need the amount of solar PV bill savings from sqlalchemy.orm import sessionmaker @@ -114,16 +114,12 @@ from backend.app.db.models.recommendations import Recommendation, Plan, PlanReco from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel from collections import defaultdict -PORTFOLIO_ID = 435 # Peabody +PORTFOLIO_ID = 485 # Peabody SCENARIOS = [ - 908, - 909, - 910, + 970 ] scenario_names = { - 908: "EPC C - no solid floor, ashp 3.0", - 909: "EPC C - no solid floor, no EWI or IWI, ashp 3.0", - 910: "EPC B - no solid floor, no EWI, ashp 3.0" + 970: "EPC C - no solid floor, ashp 3.0", } @@ -236,307 +232,266 @@ recommendations_df = pd.DataFrame(recommendations_data) properties_df = pd.DataFrame(properties_data) plans_df = pd.DataFrame(plans_data) -s_id = 910 -ps_w_a_plan = plans_df[plans_df["scenario_id"] == s_id].copy() -# Take the newest by scenario id -ps_w_a_plan = ps_w_a_plan.sort_values("created_at", ascending=False).drop_duplicates( - subset=["property_id"] -) -z = ps_w_a_plan[ - ps_w_a_plan["cost_of_works"] > 0 - ].copy() -z2 = properties_df[properties_df["property_id"].isin(z["property_id"].values)] -# '', 'hot_water_cost_current', -# 'lighting_cost_current', 'appliances_cost_current', -# 'gas_standing_charge', 'electricity_standing_charge' -z2["total_bills"] = z2["heating_cost_current"] + z2["hot_water_cost_current"] + z2["lighting_cost_current"] + z2[ - "appliances_cost_current" -] + z2["gas_standing_charge"] + z2["electricity_standing_charge"] +with pd.ExcelWriter("hackney.xlsx", engine="openpyxl") as writer: + recommendations_df.to_excel(writer, sheet_name="recommendations", index=False) + properties_df.to_excel(writer, sheet_name="properties", index=False) -from tqdm import tqdm + +# solar_pv_recommendations = recommendations_df[recommendations_df["measure_type"] == "solar_pv"] +# average_savings = 
solar_pv_recommendations.groupby("scenario_id")["energy_cost_savings"].mean().reset_index() -# For a property ID, find a property where the no EWI/IWI approach is more expensive than the EWI approach -pids = properties_df["property_id"].unique() -for pid in tqdm(pids): - if pid in [603272, 550550, 574493]: - continue - # get the plans - property_plan = plans_df[plans_df["property_id"] == int(pid)] - # Take the newest plan by scenario id - property_plan = property_plan.sort_values("created_at", ascending=False).drop_duplicates( - subset=["scenario_id"] - ) - a = property_plan[property_plan["scenario_id"] == 909].squeeze() # no EWI/IWI - b = property_plan[property_plan["scenario_id"] == 908].squeeze() # EWI - if (a["cost_of_works"] > b["cost_of_works"]) and ( - a["post_epc_rating"].value == "C") and (b["cost_of_works"] > 5000): - bah +# # Check tenures +# initial_asset_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " +# "- Data Extracts for Domna.xlsx", +# sheet_name="Properties" +# ) +# sustainability_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " +# "- Data Extracts for Domna.xlsx", +# sheet_name="Sustainability" +# ) -solar_pv_recommendations = recommendations_df[ - recommendations_df["measure_type"] == "solar_pv" - ] +# sustainability_sample = sustainability_data[ +# sustainability_data["UPRN"].isin(properties_df["uprn"].astype(int).astype(str).values) +# ] -solid_wall_recommendation = recommendations_df[ - recommendations_df["scenario_id"].isin([908]) & - recommendations_df["measure_type"].isin(["internal_wall_insulation"]) & - recommendations_df["default"] - ] -average_savings = solar_pv_recommendations.groupby("scenario_id")["energy_cost_savings"].mean().reset_index() -# Add on scenarion names -average_savings["scenario_name"] = average_savings["scenario_id"].map(scenario_names) 
+# sustainability_sample = sustainability_sample.merge( +# initial_asset_data, left_on="Org Ref", right_on="UPRN", suffixes=("_sustainability", "_initial_asset") +# ) -# Check tenures -initial_asset_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " - "- Data Extracts for Domna.xlsx", - sheet_name="Properties" -) -sustainability_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " - "- Data Extracts for Domna.xlsx", - sheet_name="Sustainability" -) +# block_sizes = initial_asset_data["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) +# block_sizes.to_excel("/Users/khalimconn-kowlessar/Downloads/peabody_block_sizes.xlsx", index=False) -sustainability_sample = sustainability_data[ - sustainability_data["UPRN"].isin(properties_df["uprn"].astype(int).astype(str).values) -] +# initial_asset_data.columns +# initial_asset_data["LeaseType"].value_counts() -sustainability_sample = sustainability_sample.merge( - initial_asset_data, left_on="Org Ref", right_on="UPRN", suffixes=("_sustainability", "_initial_asset") -) +# # sustainability_sample["Tenure Group"].value_counts() +# # Tenure Group +# # General Needs 57787 +# # Home Ownership 25471 +# # Care & Supported Housing 4239 +# # Rental 2677 +# # Other 188 -block_sizes = initial_asset_data["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) -block_sizes.to_excel("/Users/khalimconn-kowlessar/Downloads/peabody_block_sizes.xlsx", index=False) +# df = sustainability_sample["Ownership Type"].value_counts().to_frame().reset_index() +# df.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenures.xlsx", index=False) -initial_asset_data.columns -initial_asset_data["LeaseType"].value_counts() +# tenure_groups = sustainability_sample["Tenure Group"].value_counts().to_frame().reset_index() +# 
tenure_groups.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenure_groups.xlsx", index=False) -# sustainability_sample["Tenure Group"].value_counts() -# Tenure Group -# General Needs 57787 -# Home Ownership 25471 -# Care & Supported Housing 4239 -# Rental 2677 -# Other 188 +# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Tenure Group"].value_counts() -df = sustainability_sample["Ownership Type"].value_counts().to_frame().reset_index() -df.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenures.xlsx", index=False) +# sample_data = initial_asset_data[ +# ~initial_asset_data["Ownership Type"].isin( +# [ +# # Commercial # Everything is resi - based on the Residential Indicator variable - all are true +# # Freeholder +# "FREEHOLDER", # 19517 properties +# # HOMEBUY / EQUITY LOAN +# "Rent to Homebuy", # 1 property +# # Leaseholder +# "LEASEHOLD 100%", # 8455 properties +# "Owned and Managed - 999 year lease", # 2076 properties +# "Managed but not Owned-Private Lease", # 159 properties +# "Owned and managed LEASEHOLD", # 26 properties +# # Outright Sale - can't find anything matching +# # SHARED EQUITY +# "Shared Ownership", # 4065 properties +# "Shared Ownership Owned Not Managed", # 23 properties +# # Extra categories which seem sensible to exclude +# "NOT MANAGED AND NOT OWNED" +# ] +# ) +# ] -tenure_groups = sustainability_sample["Tenure Group"].value_counts().to_frame().reset_index() -tenure_groups.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenure_groups.xlsx", index=False) +# sample_data["Ownership Type"].value_counts() -initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Tenure Group"].value_counts() +# sample_data = initial_asset_data[ +# initial_asset_data["Ownership Type"].isin( +# [ +# "Owned and Managed", +# "Owned and Managed - 999 year lease", +# "Owned and managed LEASEHOLD", +# "LEASEHOLD 100%", +# "DATALOAD DEFAULT" +# ] +# ) +# ] +# dropped = 
initial_asset_data[~initial_asset_data["UPRN"].isin(sample_data["UPRN"].values)] +# dropped["Ownership Type"].value_counts() -sample_data = initial_asset_data[ - ~initial_asset_data["Ownership Type"].isin( - [ - # Commercial # Everything is resi - based on the Residential Indicator variable - all are true - # Freeholder - "FREEHOLDER", # 19517 properties - # HOMEBUY / EQUITY LOAN - "Rent to Homebuy", # 1 property - # Leaseholder - "LEASEHOLD 100%", # 8455 properties - "Owned and Managed - 999 year lease", # 2076 properties - "Managed but not Owned-Private Lease", # 159 properties - "Owned and managed LEASEHOLD", # 26 properties - # Outright Sale - can't find anything matching - # SHARED EQUITY - "Shared Ownership", # 4065 properties - "Shared Ownership Owned Not Managed", # 23 properties - # Extra categories which seem sensible to exclude - "NOT MANAGED AND NOT OWNED" - ] - ) -] +# for value in [ +# # Commercial # Everything is resi, so should be fine. No matches +# # Freeholder +# "FREEHOLDER", # 19517 properties +# # HOMEBUY / EQUITY LOAN +# "Rent to Homebuy", # 1 property +# # Leaseholder +# "LEASEHOLD 100%", # 8455 properties +# "Owned and Managed - 999 year lease", # 2076 properties +# "Managed but not Owned-Private Lease", # 159 properties +# "Owned and managed LEASEHOLD", # 26 properties +# # Outright Sale - can't find anything matching +# # SHARED EQUITY +# "Shared Ownership", # 4065 properties +# "Shared Ownership Owned Not Managed", # 23 properties +# ]: +# print(initial_asset_data[initial_asset_data["Ownership Type"] == value].shape[0]) -sample_data["Ownership Type"].value_counts() +# house_types = [ +# "HOUSE", +# "BUNGALOW", +# "MAISONETTE", +# "DUPLEX", +# ] -sample_data = initial_asset_data[ - initial_asset_data["Ownership Type"].isin( - [ - "Owned and Managed", - "Owned and Managed - 999 year lease", - "Owned and managed LEASEHOLD", - "LEASEHOLD 100%", - "DATALOAD DEFAULT" - ] - ) -] -dropped = 
initial_asset_data[~initial_asset_data["UPRN"].isin(sample_data["UPRN"].values)] -dropped["Ownership Type"].value_counts() +# guaranteed_control = [ +# "Owned and Managed", +# "Owned and Managed - 999 year lease", +# "Owned and managed LEASEHOLD", +# "LEASEHOLD 100%", +# "DATALOAD DEFAULT", +# ] -for value in [ - # Commercial # Everything is resi, so should be fine. No matches - # Freeholder - "FREEHOLDER", # 19517 properties - # HOMEBUY / EQUITY LOAN - "Rent to Homebuy", # 1 property - # Leaseholder - "LEASEHOLD 100%", # 8455 properties - "Owned and Managed - 999 year lease", # 2076 properties - "Managed but not Owned-Private Lease", # 159 properties - "Owned and managed LEASEHOLD", # 26 properties - # Outright Sale - can't find anything matching - # SHARED EQUITY - "Shared Ownership", # 4065 properties - "Shared Ownership Owned Not Managed", # 23 properties -]: - print(initial_asset_data[initial_asset_data["Ownership Type"] == value].shape[0]) +# sample_data = initial_asset_data[ +# ( +# initial_asset_data["Ownership Type"].isin(guaranteed_control) +# ) +# | +# ( +# (initial_asset_data["Ownership Type"] == "FREEHOLDER") +# & +# (initial_asset_data["Property Type"].isin(house_types)) +# ) +# ] -house_types = [ - "HOUSE", - "BUNGALOW", - "MAISONETTE", - "DUPLEX", -] +# fabric_retrofit_sample = initial_asset_data[ +# initial_asset_data["Ownership Type"].isin( +# [ +# "Owned and Managed", +# "FREEHOLDER", +# "DATALOAD DEFAULT", +# ] +# ) +# ] -guaranteed_control = [ - "Owned and Managed", - "Owned and Managed - 999 year lease", - "Owned and managed LEASEHOLD", - "LEASEHOLD 100%", - "DATALOAD DEFAULT", -] +# initial_asset_data[pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() +# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() -sample_data = initial_asset_data[ - ( - initial_asset_data["Ownership Type"].isin(guaranteed_control) - ) - | - ( - (initial_asset_data["Ownership Type"] == 
"FREEHOLDER") - & - (initial_asset_data["Property Type"].isin(house_types)) - ) - ] +# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Property Type"].value_counts() +# z = initial_asset_data[ +# ~pd.isnull(initial_asset_data["BlockCode"]) & initial_asset_data["Property Type"].isin(house_types) +# ] -fabric_retrofit_sample = initial_asset_data[ - initial_asset_data["Ownership Type"].isin( - [ - "Owned and Managed", - "FREEHOLDER", - "DATALOAD DEFAULT", - ] - ) -] +# block_code_agg = z["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) +# zz = initial_asset_data[initial_asset_data["BlockCode"] == "CHAT3343FM"] -initial_asset_data[pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() -initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() +# potential_sample = initial_asset_data[ +# ~pd.isnull(initial_asset_data["BlockCode"]) +# ] -initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Property Type"].value_counts() -z = initial_asset_data[ - ~pd.isnull(initial_asset_data["BlockCode"]) & initial_asset_data["Property Type"].isin(house_types) - ] +# compare = potential_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( +# initial_asset_data["Property Type"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="Property Type", +# right_on="Property Type", +# suffixes=("_on_block_codes", "_overall") +# ) -block_code_agg = z["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) -zz = initial_asset_data[initial_asset_data["BlockCode"] == "CHAT3343FM"] +# # Comparison of smaller sample vs overall +# new_asset_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " +# "- Peabody " +# "- Data Extracts for Domna v2.xlsx", +# sheet_name="Properties" +# ) -potential_sample = initial_asset_data[ - 
~pd.isnull(initial_asset_data["BlockCode"]) -] +# new_sustainability_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " +# "- Peabody " +# "- Data Extracts for Domna v2.xlsx", +# sheet_name="Sustainability" +# ) -compare = potential_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( - initial_asset_data["Property Type"].value_counts(normalize=True).to_frame().reset_index(), - left_on="Property Type", - right_on="Property Type", - suffixes=("_on_block_codes", "_overall") -) +# sap_bands = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/Parity Data " +# "08012026.xlsx", +# ) -# Comparison of smaller sample vs overall -new_asset_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " - "- Peabody " - "- Data Extracts for Domna v2.xlsx", - sheet_name="Properties" -) +# combined = new_asset_data.merge( +# new_sustainability_data, +# left_on="UPRN", +# right_on="Org Ref", +# suffixes=("_asset", "_sustainability") +# ).merge( +# sap_bands[["OrgRef", "SAP Band", "Lodged EPC Band"]], how="left", left_on="Org Ref", right_on="OrgRef" +# ) +# reduced_sample = combined[ +# ~combined["AH Tenure"].isin( +# ["Commercial", +# "Freeholder", +# "HOMEBUY / EQUITY LOAN", +# "Leaseholder", +# "Outright Sale", +# "SHARED EQUITY", +# "Shared Ownership"] +# ) +# ].copy() -new_sustainability_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " - "- Peabody " - "- Data Extracts for Domna v2.xlsx", - sheet_name="Sustainability" -) +# # property types +# property_type_comparison = reduced_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( +# combined["Property 
Type"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="Property Type", +# right_on="Property Type", +# suffixes=("_reduced_sample", "_overall") +# ) -sap_bands = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/Parity Data " - "08012026.xlsx", -) +# # lodged ratings +# lodged_epc_band_comparison = reduced_sample["Lodged EPC Band"].value_counts( +# normalize=True).to_frame().reset_index().merge( +# combined["Lodged EPC Band"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="Lodged EPC Band", +# right_on="Lodged EPC Band", +# suffixes=("_reduced_sample", "_overall") +# ) -combined = new_asset_data.merge( - new_sustainability_data, - left_on="UPRN", - right_on="Org Ref", - suffixes=("_asset", "_sustainability") -).merge( - sap_bands[["OrgRef", "SAP Band", "Lodged EPC Band"]], how="left", left_on="Org Ref", right_on="OrgRef" -) -reduced_sample = combined[ - ~combined["AH Tenure"].isin( - ["Commercial", - "Freeholder", - "HOMEBUY / EQUITY LOAN", - "Leaseholder", - "Outright Sale", - "SHARED EQUITY", - "Shared Ownership"] - ) -].copy() +# # modelled ratings +# modelled_epc_band_comparison = reduced_sample["SAP Band"].value_counts( +# normalize=True).to_frame().reset_index().merge( +# combined["SAP Band"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="SAP Band", +# right_on="SAP Band", +# suffixes=("_reduced_sample", "_overall") +# ) -# property types -property_type_comparison = reduced_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( - combined["Property Type"].value_counts(normalize=True).to_frame().reset_index(), - left_on="Property Type", - right_on="Property Type", - suffixes=("_reduced_sample", "_overall") -) +# # Testing measures +# m1 = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " +# "solid floor, ashp 3.0 - 20250113 
final.xlsx" +# ) +# m2 = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " +# "solid floor, no EWI or IWI, ashp 3.0 - 20250113 final.xlsx" +# ) -# lodged ratings -lodged_epc_band_comparison = reduced_sample["Lodged EPC Band"].value_counts( - normalize=True).to_frame().reset_index().merge( - combined["Lodged EPC Band"].value_counts(normalize=True).to_frame().reset_index(), - left_on="Lodged EPC Band", - right_on="Lodged EPC Band", - suffixes=("_reduced_sample", "_overall") -) +# compare = m1.merge( +# m2, +# left_on="uprn", +# right_on="uprn", +# suffixes=("_ewi_iwi", "_no_ewi_iwi") +# ) -# modelled ratings -modelled_epc_band_comparison = reduced_sample["SAP Band"].value_counts( - normalize=True).to_frame().reset_index().merge( - combined["SAP Band"].value_counts(normalize=True).to_frame().reset_index(), - left_on="SAP Band", - right_on="SAP Band", - suffixes=("_reduced_sample", "_overall") -) +# # Which properties get done under the no EWI/IWI scenario that do not under the EWI/IWI scenario +# only_no_ewi_iwi = compare[ +# (compare["total_retrofit_cost_ewi_iwi"] == 0) & +# (compare["total_retrofit_cost_no_ewi_iwi"] != 0) +# ] -# Testing measures -m1 = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " - "solid floor, ashp 3.0 - 20250113 final.xlsx" -) -m2 = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " - "solid floor, no EWI or IWI, ashp 3.0 - 20250113 final.xlsx" -) +# (m1["total_retrofit_cost"] > 0).sum() +# (m2["total_retrofit_cost"] > 0).sum() -compare = m1.merge( - m2, - left_on="uprn", - right_on="uprn", - suffixes=("_ewi_iwi", "_no_ewi_iwi") -) +# with_ewi_projects = compare[compare["total_retrofit_cost_no_ewi_iwi"] > 0] -# Which properties get done under the no EWI/IWI scenario that do not under the EWI/IWI 
scenario -only_no_ewi_iwi = compare[ - (compare["total_retrofit_cost_ewi_iwi"] == 0) & - (compare["total_retrofit_cost_no_ewi_iwi"] != 0) - ] - -(m1["total_retrofit_cost"] > 0).sum() -(m2["total_retrofit_cost"] > 0).sum() - -with_ewi_projects = compare[compare["total_retrofit_cost_no_ewi_iwi"] > 0] - -z = with_ewi_projects[pd.isnull(with_ewi_projects["total_retrofit_cost_ewi_iwi"])] +# z = with_ewi_projects[pd.isnull(with_ewi_projects["total_retrofit_cost_ewi_iwi"])] diff --git a/infrastructure/terraform/lambda/_template/README.md b/infrastructure/terraform/lambda/_template/README.md new file mode 100644 index 00000000..a7282fc9 --- /dev/null +++ b/infrastructure/terraform/lambda/_template/README.md @@ -0,0 +1,51 @@ +## Checklist for adding a new Lambda + +### 1. Create the Lambda scaffold +- Copy the template: + + cp -r lambda/_template lambda/<new-lambda-name> + +--- + +### 2. Add infrastructure prerequisites (shared stack) +- Add a new ECR repository in: + + infrastructure/terraform/shared/main.tf + +- Apply the shared stack + - This requires commenting out 'if env.stage == "prod"' in .github/workflows/deploy_terraform.yml + +- Verify the ECR repository exists in AWS + +--- + +### 3. Add Docker build configuration +- Create a `Dockerfile` for the Lambda +- Verify the Dockerfile path and build context +- Add a new image build job in `deploy_terraform.yml` using `_build_image.yml` + +--- + +### 4. Wire the Lambda deploy job (CI) +- Add a deploy job using `_deploy_lambda.yml` +- Ensure the deploy job depends on the image build job + +--- + +### 5. Deploy +- Push changes to GitHub +- CI will: + 1. Build and push the Docker image + 2. Deploy the Lambda + 3. Verify everything deployed. Good things to check: + - ECR with image + - SQS + - Trigger SQS + - CloudWatch logs +--- +### 6. Clean up + 1.
Delete README if you used cp -r + +--- + +## Please feel free to update this document to make it easier for the next person \ No newline at end of file diff --git a/infrastructure/terraform/lambda/_template/main.tf b/infrastructure/terraform/lambda/_template/main.tf new file mode 100644 index 00000000..3010aa8a --- /dev/null +++ b/infrastructure/terraform/lambda/_template/main.tf @@ -0,0 +1,14 @@ +module "lambda" { + source = "../modules/lambda_with_sqs" + + name = REPLACE ME #"address2uprn" for example + stage = var.stage + + image_uri = local.image_uri + + + environment = { + STAGE = var.stage + LOG_LEVEL = "info" + } +} diff --git a/infrastructure/terraform/lambda/_template/provider.tf b/infrastructure/terraform/lambda/_template/provider.tf new file mode 100644 index 00000000..37c412ce --- /dev/null +++ b/infrastructure/terraform/lambda/_template/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.16" + } + } + + backend "s3" { + bucket = REPLACE_ME + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/_template/variables.tf b/infrastructure/terraform/lambda/_template/variables.tf new file mode 100644 index 00000000..e4bab243 --- /dev/null +++ b/infrastructure/terraform/lambda/_template/variables.tf @@ -0,0 +1,27 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + description = "Deployment stage (e.g. 
dev, prod)" + type = string +} +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf new file mode 100644 index 00000000..46b193f2 --- /dev/null +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -0,0 +1,14 @@ +module "address2uprn" { + source = "../modules/lambda_with_sqs" + + name = "address2uprn" + stage = var.stage + + image_uri = local.image_uri + + + environment = { + STAGE = var.stage + LOG_LEVEL = "info" + } +} diff --git a/infrastructure/terraform/lambda/address2UPRN/provider.tf b/infrastructure/terraform/lambda/address2UPRN/provider.tf new file mode 100644 index 00000000..ad873717 --- /dev/null +++ b/infrastructure/terraform/lambda/address2UPRN/provider.tf @@ -0,0 +1,17 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.16" + } + } + + backend "s3" { + bucket = "address2uprn-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} + diff --git a/infrastructure/terraform/lambda/address2UPRN/variables.tf b/infrastructure/terraform/lambda/address2UPRN/variables.tf new file mode 100644 index 00000000..e4bab243 --- /dev/null +++ b/infrastructure/terraform/lambda/address2UPRN/variables.tf @@ -0,0 +1,27 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + description = "Deployment stage (e.g. 
dev, prod)" + type = string +} +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/lambda/condition-etl/main.tf b/infrastructure/terraform/lambda/condition-etl/main.tf new file mode 100644 index 00000000..4219f209 --- /dev/null +++ b/infrastructure/terraform/lambda/condition-etl/main.tf @@ -0,0 +1,43 @@ +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} + +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" # TODO: dont hardcode this + region = "eu-west-2" + } +} + +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + + +module "lambda" { + source = "../modules/lambda_with_sqs" + + name = "condition-etl" + stage = var.stage + + image_uri = local.image_uri + timeout = 180 + + + environment = merge( + { + STAGE = var.stage + LOG_LEVEL = "info" + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + }, + ) + +} + +resource "aws_iam_role_policy_attachment" "attach_condition_etl_s3_read" { + role = module.lambda.role_name + policy_arn = data.terraform_remote_state.shared.outputs.condition_etl_s3_read_arn +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/condition-etl/provider.tf b/infrastructure/terraform/lambda/condition-etl/provider.tf new file mode 100644 index 00000000..c633d238 --- /dev/null +++ b/infrastructure/terraform/lambda/condition-etl/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws 
= { + source = "hashicorp/aws" + version = "~> 4.16" + } + } + + backend "s3" { + bucket = "condition-etl-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/condition-etl/variables.tf b/infrastructure/terraform/lambda/condition-etl/variables.tf new file mode 100644 index 00000000..e4bab243 --- /dev/null +++ b/infrastructure/terraform/lambda/condition-etl/variables.tf @@ -0,0 +1,27 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/lambda/modules/lambda_with_sqs/main.tf b/infrastructure/terraform/lambda/modules/lambda_with_sqs/main.tf new file mode 100644 index 00000000..065fb790 --- /dev/null +++ b/infrastructure/terraform/lambda/modules/lambda_with_sqs/main.tf @@ -0,0 +1,48 @@ +############################################ +# IAM role +############################################ +module "role" { + source = "../../../modules/lambda_execution_role" + name = "${var.name}-lambda-${var.stage}" +} + +output "role_name" { + value = module.role.role_name +} + +############################################ +# SQS queue + DLQ +############################################ +module "queue" { + source = "../../../modules/sqs_queue" + name = "${var.name}-queue-${var.stage}" +} + +############################################ +# Lambda +############################################ +module "lambda" { + source = 
"../../../modules/lambda_service" + + name = "${var.name}-${var.stage}" + role_arn = module.role.role_arn + image_uri = var.image_uri + + timeout = var.timeout + memory_size = var.memory_size + + environment = var.environment +} + +############################################ +# SQS → Lambda trigger +############################################ +module "sqs_trigger" { + source = "../../../modules/lambda_sqs_trigger" + + lambda_arn = module.lambda.lambda_arn + lambda_role_name = module.role.role_name + queue_arn = module.queue.queue_arn + + batch_size = var.batch_size +} diff --git a/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf b/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf new file mode 100644 index 00000000..afc9246d --- /dev/null +++ b/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf @@ -0,0 +1,11 @@ +output "lambda_arn" { + value = module.lambda.lambda_arn +} + +output "queue_arn" { + value = module.queue.queue_arn +} + +output "queue_url" { + value = module.queue.queue_url +} diff --git a/infrastructure/terraform/lambda/modules/lambda_with_sqs/variables.tf b/infrastructure/terraform/lambda/modules/lambda_with_sqs/variables.tf new file mode 100644 index 00000000..b20ab2a8 --- /dev/null +++ b/infrastructure/terraform/lambda/modules/lambda_with_sqs/variables.tf @@ -0,0 +1,36 @@ +variable "name" { + type = string +} + +variable "stage" { + type = string +} + +variable "image_uri" { + type = string +} + +variable "region" { + type = string + default = "eu-west-2" +} + +variable "timeout" { + type = number + default = 60 +} + +variable "memory_size" { + type = number + default = 1024 +} + +variable "environment" { + type = map(string) + default = {} +} + +variable "batch_size" { + type = number + default = 10 +} diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf new file mode 100644 index 00000000..ebbdbfdc --- /dev/null +++ 
b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -0,0 +1,14 @@ +module "lambda" { + source = "../modules/lambda_with_sqs" + + name = "postcode-splitter" + stage = var.stage + + image_uri = local.image_uri + + + environment = { + STAGE = var.stage + LOG_LEVEL = "info" + } +} diff --git a/infrastructure/terraform/lambda/postcodeSplitter/provider.tf b/infrastructure/terraform/lambda/postcodeSplitter/provider.tf new file mode 100644 index 00000000..dbe323f2 --- /dev/null +++ b/infrastructure/terraform/lambda/postcodeSplitter/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.16" + } + } + + backend "s3" { + bucket = "postcode-splitter-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/postcodeSplitter/variables.tf b/infrastructure/terraform/lambda/postcodeSplitter/variables.tf new file mode 100644 index 00000000..9ce45fa5 --- /dev/null +++ b/infrastructure/terraform/lambda/postcodeSplitter/variables.tf @@ -0,0 +1,26 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + description = "Deployment stage (e.g. 
dev, prod)" + type = string +} +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/modules/container_registry/main.tf b/infrastructure/terraform/modules/container_registry/main.tf new file mode 100644 index 00000000..f5ba8d5e --- /dev/null +++ b/infrastructure/terraform/modules/container_registry/main.tf @@ -0,0 +1,30 @@ +resource "aws_ecr_repository" "this" { + name = "${var.name}-${var.stage}" + + image_tag_mutability = "MUTABLE" + + image_scanning_configuration { + scan_on_push = true + } +} + +resource "aws_ecr_lifecycle_policy" "this" { + repository = aws_ecr_repository.this.name + + policy = jsonencode({ + rules = [ + { + rulePriority = 1 + description = "Expire old images" + selection = { + tagStatus = "any" + countType = "imageCountMoreThan" + countNumber = var.retain_count + } + action = { + type = "expire" + } + } + ] + }) +} diff --git a/infrastructure/terraform/modules/container_registry/outputs.tf b/infrastructure/terraform/modules/container_registry/outputs.tf new file mode 100644 index 00000000..47a4bc64 --- /dev/null +++ b/infrastructure/terraform/modules/container_registry/outputs.tf @@ -0,0 +1,11 @@ +output "repository_name" { + value = aws_ecr_repository.this.name +} + +output "repository_url" { + value = aws_ecr_repository.this.repository_url +} + +output "repository_arn" { + value = aws_ecr_repository.this.arn +} diff --git a/infrastructure/terraform/modules/container_registry/variables.tf b/infrastructure/terraform/modules/container_registry/variables.tf new file mode 100644 index 00000000..11821b31 --- /dev/null +++ b/infrastructure/terraform/modules/container_registry/variables.tf @@ -0,0 +1,15 @@ +variable "name" { + description = 
"Base name of the repository (without stage)" + type = string +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} + +variable "retain_count" { + description = "Number of images to retain" + type = number + default = 10 +} diff --git a/infrastructure/terraform/modules/ecr/main.tf b/infrastructure/terraform/modules/ecr/main.tf index 468ef3d2..d93d1340 100644 --- a/infrastructure/terraform/modules/ecr/main.tf +++ b/infrastructure/terraform/modules/ecr/main.tf @@ -1,3 +1,6 @@ +# This ecr works for things deployed by serverless. +# TODO: unify ecr and container_registry to one + resource "aws_ecr_repository" "my_repository" { name = "${var.ecr_name}" image_tag_mutability = "MUTABLE" diff --git a/infrastructure/terraform/modules/ecr/outputs.tf b/infrastructure/terraform/modules/ecr/outputs.tf index 53839718..7f045412 100644 --- a/infrastructure/terraform/modules/ecr/outputs.tf +++ b/infrastructure/terraform/modules/ecr/outputs.tf @@ -1,4 +1,10 @@ output "ecr_repository_name" { description = "Name of the EPR repo in AWS" value = aws_ecr_repository.my_repository.name +} + + +output "ecr_repository_url" { + description = "Full ECR repository URL" + value = aws_ecr_repository.my_repository.repository_url } \ No newline at end of file diff --git a/infrastructure/terraform/modules/lambda_execution_role/main.tf b/infrastructure/terraform/modules/lambda_execution_role/main.tf new file mode 100644 index 00000000..fa657afd --- /dev/null +++ b/infrastructure/terraform/modules/lambda_execution_role/main.tf @@ -0,0 +1,37 @@ +data "aws_iam_policy_document" "assume" { + statement { + effect = "Allow" + principals { + type = "Service" + identifiers = ["lambda.amazonaws.com"] + } + actions = ["sts:AssumeRole"] + } +} + +resource "aws_iam_role" "this" { + name = var.name + assume_role_policy = data.aws_iam_policy_document.assume.json +} + +resource "aws_iam_role_policy_attachment" "basic_logs" { + role = aws_iam_role.this.name + policy_arn = 
"arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" +} + +resource "aws_iam_role_policy" "ecr_pull" { + role = aws_iam_role.this.name + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = [ + "ecr:GetAuthorizationToken", + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer" + ] + Resource = "*" + }] + }) +} diff --git a/infrastructure/terraform/modules/lambda_execution_role/outputs.tf b/infrastructure/terraform/modules/lambda_execution_role/outputs.tf new file mode 100644 index 00000000..1baca34d --- /dev/null +++ b/infrastructure/terraform/modules/lambda_execution_role/outputs.tf @@ -0,0 +1,7 @@ +output "role_arn" { + value = aws_iam_role.this.arn +} + +output "role_name" { + value = aws_iam_role.this.name +} diff --git a/infrastructure/terraform/modules/lambda_execution_role/variables.tf b/infrastructure/terraform/modules/lambda_execution_role/variables.tf new file mode 100644 index 00000000..f9f512ff --- /dev/null +++ b/infrastructure/terraform/modules/lambda_execution_role/variables.tf @@ -0,0 +1,4 @@ +variable "name" { + description = "IAM role name for the Lambda execution role" + type = string +} diff --git a/infrastructure/terraform/modules/lambda_service/main.tf b/infrastructure/terraform/modules/lambda_service/main.tf new file mode 100644 index 00000000..8a159db1 --- /dev/null +++ b/infrastructure/terraform/modules/lambda_service/main.tf @@ -0,0 +1,15 @@ +resource "aws_lambda_function" "this" { + function_name = var.name + role = var.role_arn + + package_type = "Image" + image_uri = var.image_uri + + timeout = var.timeout + memory_size = var.memory_size + publish = true + + environment { + variables = var.environment + } +} diff --git a/infrastructure/terraform/modules/lambda_service/outputs.tf b/infrastructure/terraform/modules/lambda_service/outputs.tf new file mode 100644 index 00000000..dd05cccf --- /dev/null +++ b/infrastructure/terraform/modules/lambda_service/outputs.tf @@ -0,0 +1,3 @@ 
+output "lambda_arn" { + value = aws_lambda_function.this.arn +} diff --git a/infrastructure/terraform/modules/lambda_service/variables.tf b/infrastructure/terraform/modules/lambda_service/variables.tf new file mode 100644 index 00000000..43def6ad --- /dev/null +++ b/infrastructure/terraform/modules/lambda_service/variables.tf @@ -0,0 +1,18 @@ +variable "name" { type = string } +variable "role_arn" { type = string } +variable "image_uri" { type = string } + +variable "timeout" { + type = number + default = 30 +} + +variable "memory_size" { + type = number + default = 512 +} + +variable "environment" { + type = map(string) + default = {} +} diff --git a/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf b/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf new file mode 100644 index 00000000..5919e10f --- /dev/null +++ b/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf @@ -0,0 +1,23 @@ +resource "aws_lambda_event_source_mapping" "this" { + event_source_arn = var.queue_arn + function_name = var.lambda_arn + batch_size = var.batch_size + enabled = true +} + +resource "aws_iam_role_policy" "allow_sqs" { + role = var.lambda_role_name + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = [ + "sqs:ReceiveMessage", + "sqs:DeleteMessage", + "sqs:GetQueueAttributes" + ] + Resource = var.queue_arn + }] + }) +} diff --git a/infrastructure/terraform/modules/lambda_sqs_trigger/variables.tf b/infrastructure/terraform/modules/lambda_sqs_trigger/variables.tf new file mode 100644 index 00000000..0e50cd54 --- /dev/null +++ b/infrastructure/terraform/modules/lambda_sqs_trigger/variables.tf @@ -0,0 +1,8 @@ +variable "lambda_arn" { type = string } +variable "lambda_role_name" { type = string } +variable "queue_arn" { type = string } + +variable "batch_size" { + type = number + default = 10 +} diff --git a/infrastructure/terraform/modules/sqs_queue/main.tf b/infrastructure/terraform/modules/sqs_queue/main.tf new file 
mode 100644 index 00000000..580e67bd --- /dev/null +++ b/infrastructure/terraform/modules/sqs_queue/main.tf @@ -0,0 +1,14 @@ +resource "aws_sqs_queue" "dlq" { + name = "${var.name}-dlq" +} + +resource "aws_sqs_queue" "this" { + name = var.name + + visibility_timeout_seconds = 120 + + redrive_policy = jsonencode({ + deadLetterTargetArn = aws_sqs_queue.dlq.arn + maxReceiveCount = var.max_receive_count + }) +} diff --git a/infrastructure/terraform/modules/sqs_queue/outputs.tf b/infrastructure/terraform/modules/sqs_queue/outputs.tf new file mode 100644 index 00000000..46fafe90 --- /dev/null +++ b/infrastructure/terraform/modules/sqs_queue/outputs.tf @@ -0,0 +1,7 @@ +output "queue_arn" { + value = aws_sqs_queue.this.arn +} + +output "queue_url" { + value = aws_sqs_queue.this.url +} diff --git a/infrastructure/terraform/modules/sqs_queue/variables.tf b/infrastructure/terraform/modules/sqs_queue/variables.tf new file mode 100644 index 00000000..943a7a16 --- /dev/null +++ b/infrastructure/terraform/modules/sqs_queue/variables.tf @@ -0,0 +1,6 @@ +variable "name" { type = string } + +variable "max_receive_count" { + type = number + default = 5 +} diff --git a/infrastructure/terraform/modules/tf_state_bucket/main.tf b/infrastructure/terraform/modules/tf_state_bucket/main.tf new file mode 100644 index 00000000..86c0cc21 --- /dev/null +++ b/infrastructure/terraform/modules/tf_state_bucket/main.tf @@ -0,0 +1,30 @@ +resource "aws_s3_bucket" "this" { + bucket = var.bucket_name +} + +resource "aws_s3_bucket_versioning" "this" { + bucket = aws_s3_bucket.this.id + + versioning_configuration { + status = "Enabled" + } +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "this" { + bucket = aws_s3_bucket.this.id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "AES256" + } + } +} + +resource "aws_s3_bucket_public_access_block" "this" { + bucket = aws_s3_bucket.this.id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = 
true + restrict_public_buckets = true +} diff --git a/infrastructure/terraform/modules/tf_state_bucket/outputs.tf b/infrastructure/terraform/modules/tf_state_bucket/outputs.tf new file mode 100644 index 00000000..e8ceffd1 --- /dev/null +++ b/infrastructure/terraform/modules/tf_state_bucket/outputs.tf @@ -0,0 +1,7 @@ +output "bucket_name" { + value = aws_s3_bucket.this.bucket +} + +output "bucket_arn" { + value = aws_s3_bucket.this.arn +} diff --git a/infrastructure/terraform/modules/tf_state_bucket/variables.tf b/infrastructure/terraform/modules/tf_state_bucket/variables.tf new file mode 100644 index 00000000..b3aae9bb --- /dev/null +++ b/infrastructure/terraform/modules/tf_state_bucket/variables.tf @@ -0,0 +1,3 @@ +variable "bucket_name" { + type = string +} diff --git a/infrastructure/terraform/dev.tfvars b/infrastructure/terraform/shared/dev.tfvars similarity index 95% rename from infrastructure/terraform/dev.tfvars rename to infrastructure/terraform/shared/dev.tfvars index 92b7e158..53ca6d9e 100644 --- a/infrastructure/terraform/dev.tfvars +++ b/infrastructure/terraform/shared/dev.tfvars @@ -1,5 +1,4 @@ stage = "dev" -profile = "DevAdmin" region = "eu-west-2" # Domain diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/shared/main.tf similarity index 70% rename from infrastructure/terraform/main.tf rename to infrastructure/terraform/shared/main.tf index 5a67b793..b1474055 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -8,7 +8,6 @@ terraform { backend "s3" { bucket = "assessment-model-terraform-state" region = "eu-west-2" - profile = "DevAdmin" key = "terraform.tfstate" } @@ -16,7 +15,6 @@ terraform { } provider "aws" { - profile = var.profile region = var.region } @@ -86,106 +84,106 @@ resource "aws_db_instance" "default" { # Temporary to enfore immediate change apply_immediately = true # Set up storage type to gp3 for better performance - storage_type = "gp3" + storage_type = "gp3" } # Set up 
the bucket that recieve the csv uploads of epc to be retrofit module "s3_presignable_bucket" { - source = "./modules/s3_presignable_bucket" + source = "../modules/s3_presignable_bucket" bucketname = "retrofit-plan-inputs-${var.stage}" environment = var.stage allowed_origins = var.allowed_origins } module "s3_due_considerations_bucket" { - source = "./modules/s3_presignable_bucket" + source = "../modules/s3_presignable_bucket" bucketname = "retrofit-due-considerations-${var.stage}" environment = var.stage allowed_origins = var.allowed_origins } module "s3_eco_spreadseet_bucket" { - source = "./modules/s3_presignable_bucket" + source = "../modules/s3_presignable_bucket" bucketname = "retrofit-eco-spreadsheet-${var.stage}" environment = var.stage allowed_origins = var.allowed_origins } module "s3" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-datalake-${var.stage}" allowed_origins = var.allowed_origins } module "model_directory" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-model-directory-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_sap_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-sap-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_sap_data" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-data-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_carbon_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-carbon-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_heat_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-heat-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_lighting_cost_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-lighting-cost-predictions-${var.stage}" 
allowed_origins = var.allowed_origins } module "retrofit_heating_cost_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-heating-cost-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_hot_water_cost_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-hot-water-cost-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_heating_kwh_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-heating-kwh-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_hotwater_kwh_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-hotwater-kwh-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_sap_baseline_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-sap-baseline-predictions-${var.stage}" allowed_origins = var.allowed_origins } // We make this bucket presignable, because we want to generate download links for the frontend module "retrofit_energy_assessments" { - source = "./modules/s3_presignable_bucket" + source = "../modules/s3_presignable_bucket" bucketname = "retrofit-energy-assessments-${var.stage}" allowed_origins = var.allowed_origins environment = var.stage @@ -193,7 +191,7 @@ module "retrofit_energy_assessments" { # Set up the route53 record for the API module "route53" { - source = "./modules/route53" + source = "../modules/route53" domain_name = var.domain_name api_url_prefix = var.api_url_prefix providers = { @@ -201,75 +199,76 @@ module "route53" { } } + # Create an ECR repository for storage of the lambda's docker images module "ecr" { ecr_name = "fastapi-repository-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_sap_prediction_ecr" { ecr_name = "lambda-sap-prediction-${var.stage}" - source = "./modules/ecr" + 
source = "../modules/ecr" } module "due_considerations_ecr" { ecr_name = "due-considerations-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "eco_spreadsheet_ecr" { ecr_name = "eco-spreadsheet-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_carbon_prediction_ecr" { ecr_name = "lambda-carbon-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_heat_prediction_ecr" { ecr_name = "lambda-heat-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } # ECR repos for lighting cost, heating cost and hot water cost models module "lambda_lighting_cost_prediction_ecr" { ecr_name = "lighting-cost-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_heating_cost_prediction_ecr" { ecr_name = "heating-cost-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_hot_water_cost_prediction_ecr" { ecr_name = "hot-water-cost-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } # For heating and hot water kwh models module "lambda_heating_kwh_prediction_ecr" { ecr_name = "heating-kwh-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_hotwater_kwh_prediction_ecr" { ecr_name = "hotwater-kwh-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } # Baselining models module "sap_baseline_ecr" { ecr_name = "sap-baseline-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } ############################################## # CDN - Cloudfront ############################################## module "cloudfront_distribution" { - source = "./modules/cloudfront" + source = "../modules/cloudfront" bucket_name = module.s3.bucket_name bucket_id = module.s3.bucket_id bucket_arn = module.s3.bucket_arn @@ -281,11 +280,87 @@ module "cloudfront_distribution" { # 
SES - Email sending ################################################ module "ses" { - source = "./modules/ses" + source = "../modules/ses" domain_name = "domna.homes" stage = var.stage } output "ses_dns_records" { value = module.ses.dns_records +} + +################################################ +# Address2UPRN – Lambda ECR +################################################ +module "address2uprn_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "address2uprn-terraform-state" + +} + +module "address2uprn_registry" { + source = "../modules/container_registry" + name = "address2uprn" + stage = var.stage + +} + +################################################ +# Condition ETL – Lambda ECR +################################################ +module "condition_etl_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "condition-etl-terraform-state" + +} + +module "condition_etl_registry" { + source = "../modules/container_registry" + name = "condition-etl" + stage = var.stage + +} + +################################################ +# Postcode Splitter – Lambda ECR +################################################ +module "postcode_splitter_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "postcode-splitter-terraform-state" + +} + +module "postcode_splitter_registry" { + source = "../modules/container_registry" + name = "postcode_splitter" + stage = var.stage + +} + +################################################ +# Condition data – S3 bucket +################################################ +module "condition_data_bucket" { + source = "../modules/s3" + bucketname = "condition-data-${var.stage}" + allowed_origins = var.allowed_origins +} + +resource "aws_iam_policy" "condition_etl_s3_read" { + name = "ConditionETLReadS3" + description = "Allow Lambda to read objects from condition-data-${var.stage}" + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action =
["s3:GetObject"] + Resource = "arn:aws:s3:::condition-data-${var.stage}/*" + } + ] + }) +} + +output "condition_etl_s3_read_arn" { + value = aws_iam_policy.condition_etl_s3_read.arn } \ No newline at end of file diff --git a/infrastructure/terraform/secrets.tf b/infrastructure/terraform/shared/secrets.tf similarity index 100% rename from infrastructure/terraform/secrets.tf rename to infrastructure/terraform/shared/secrets.tf diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/shared/variables.tf similarity index 90% rename from infrastructure/terraform/variables.tf rename to infrastructure/terraform/shared/variables.tf index 76734340..e922e465 100644 --- a/infrastructure/terraform/variables.tf +++ b/infrastructure/terraform/shared/variables.tf @@ -3,11 +3,6 @@ variable stage { type = string } -variable "profile" { - description = "AWS profile to use" - type = string -} - variable "region" { description = "AWS region" type = string diff --git a/model_data/requirements/requirements.txt b/model_data/requirements/requirements.txt index 845166d9..bbf75df5 100644 --- a/model_data/requirements/requirements.txt +++ b/model_data/requirements/requirements.txt @@ -1,4 +1,4 @@ -pydantic==2.9.2 +pydantic>=1.10.7 pydantic-settings==2.6.0 epc-api-python==1.0.2 numpy==2.1.2 diff --git a/pytest.ini b/pytest.ini index 1422657b..ee203d46 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] pythonpath = . 
addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index c6fea3b6..e470c1a3 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -1090,6 +1090,7 @@ class Recommendations: ashp_cop = ashp_cop if ashp_cop else assumptions.AVERAGE_ASHP_EFFICIENCY + # kwh_impact_table = kwh_simulation_predictions["heating_kwh_predictions"][ kwh_simulation_predictions["heating_kwh_predictions"]["property_id"] == str(property_instance.id) ].merge( diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py index 71e47ba6..f88a672b 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -331,18 +331,18 @@ class RoofRecommendations: """ # Can a non-primary part satisfy loft insulation? 
- primary_needs_loft = component_needs[1]["needs_loft_insulation"] + primary_needs_loft = component_needs[0]["needs_loft_insulation"] secondary_needs_loft = any( - p['needs_loft_insulation'] for idx, p in component_needs.items() if idx != 1 + p['needs_loft_insulation'] for idx, p in component_needs.items() if idx != 0 ) if primary_needs_loft and not secondary_needs_loft: # Only option is loft return "loft" - primary_needs_sloping = component_needs[1]["needs_sloping_ceiling"] + primary_needs_sloping = component_needs[0]["needs_sloping_ceiling"] secondary_needs_sloping = any( - p['needs_sloping_ceiling'] for idx, p in component_needs.items() if idx != 1 + p['needs_sloping_ceiling'] for idx, p in component_needs.items() if idx != 0 ) if primary_needs_sloping and not secondary_needs_sloping: @@ -418,11 +418,13 @@ class RoofRecommendations: return needs_sloping, not needs_loft # Indicates that the property needs sloping ceiling as we only run # this in that case + roof_components = [x for x in find_my_epc_components if x["component_name"] == "Roof"] + extracted_roof_descriptions = { idx: { "description": component["description"], **RoofAttributes(component["description"]).process() - } for idx, component in enumerate(find_my_epc_components) if component["component_name"] == "Roof" + } for idx, component in enumerate(roof_components) } component_needs = {} diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index 2184d074..a65509d5 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -2,29 +2,38 @@ This script prepares the data for the financial model """ +from dotenv import load_dotenv + +load_dotenv(".env.local") + import pandas as pd import numpy as np from backend.app.utils import sap_to_epc from sqlalchemy.orm import sessionmaker from backend.app.db.connection import db_engine, db_read_session -from backend.app.db.models.recommendations import Recommendation, Plan, PlanRecommendations, 
RecommendationMaterials -from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel, PropertyDetailsSpatial +from backend.app.db.models.recommendations import ( + Recommendation, + Plan, + PlanRecommendations, + RecommendationMaterials, +) +from backend.app.db.models.portfolio import ( + PropertyModel, + PropertyDetailsEpcModel, + PropertyDetailsSpatial, +) from backend.app.db.functions.materials_functions import get_materials from collections import defaultdict from sqlalchemy import func # PORTFOLIO_ID = 206 # SCENARIOS = [389] -PORTFOLIO_ID = 435 # Peabody +PORTFOLIO_ID = 524 SCENARIOS = [ - 908, - 909, - 910, + 1009, ] scenario_names = { - 908: "EPC C - no solid floor, ashp 3.0", - 909: "EPC C - no solid floor, no EWI or IWI, ashp 3.0", - 910: "EPC B - no solid floor, no EWI, ashp 3.0" + 1009: "EPC C; Most Economic", } @@ -35,22 +44,26 @@ def get_data(portfolio_id, scenario_ids): # -------------------- # Properties # -------------------- - properties_query = session.query( - PropertyModel, - PropertyDetailsEpcModel - ).join( - PropertyDetailsEpcModel, - PropertyModel.id == PropertyDetailsEpcModel.property_id - ).filter( - PropertyModel.portfolio_id == portfolio_id - ).all() + properties_query = ( + session.query(PropertyModel, PropertyDetailsEpcModel) + .join( + PropertyDetailsEpcModel, + PropertyModel.id == PropertyDetailsEpcModel.property_id, + ) + .filter(PropertyModel.portfolio_id == portfolio_id) + .all() + ) properties_data = [ { - **{col.name: getattr(p.PropertyModel, col.name) - for col in PropertyModel.__table__.columns}, - **{col.name: getattr(p.PropertyDetailsEpcModel, col.name) - for col in PropertyDetailsEpcModel.__table__.columns}, + **{ + col.name: getattr(p.PropertyModel, col.name) + for col in PropertyModel.__table__.columns + }, + **{ + col.name: getattr(p.PropertyDetailsEpcModel, col.name) + for col in PropertyDetailsEpcModel.__table__.columns + }, } for p in properties_query ] @@ -62,13 +75,10 @@ def 
get_data(portfolio_id, scenario_ids): session.query( Plan.scenario_id, Plan.property_id, - func.max(Plan.created_at).label("latest_created_at") + func.max(Plan.created_at).label("latest_created_at"), ) .filter(Plan.scenario_id.in_(scenario_ids)) - .group_by( - Plan.scenario_id, - Plan.property_id - ) + .group_by(Plan.scenario_id, Plan.property_id) .subquery() ) @@ -80,9 +90,9 @@ def get_data(portfolio_id, scenario_ids): session.query(Plan) .join( latest_plans_subq, - (Plan.scenario_id == latest_plans_subq.c.scenario_id) & - (Plan.property_id == latest_plans_subq.c.property_id) & - (Plan.created_at == latest_plans_subq.c.latest_created_at) + (Plan.scenario_id == latest_plans_subq.c.scenario_id) + & (Plan.property_id == latest_plans_subq.c.property_id) + & (Plan.created_at == latest_plans_subq.c.latest_created_at), ) .all() ) @@ -107,28 +117,29 @@ def get_data(portfolio_id, scenario_ids): # -------------------- # Recommendations (NO materials yet) # -------------------- - recommendations_query = session.query( - Recommendation, - Plan.scenario_id, - PlanRecommendations.plan_id - ).join( - PlanRecommendations, - Recommendation.id == PlanRecommendations.recommendation_id - ).join( - Plan, - Plan.id == PlanRecommendations.plan_id - ).filter( - PlanRecommendations.plan_id.in_(plan_ids), - Recommendation.default.is_(True), - Recommendation.already_installed.is_(False) - ).all() + recommendations_query = ( + session.query(Recommendation, Plan.scenario_id, PlanRecommendations.plan_id) + .join( + PlanRecommendations, + Recommendation.id == PlanRecommendations.recommendation_id, + ) + .join(Plan, Plan.id == PlanRecommendations.plan_id) + .filter( + PlanRecommendations.plan_id.in_(plan_ids), + Recommendation.default.is_(True), + Recommendation.already_installed.is_(False), + ) + .all() + ) recommendations_data = [ { - **{col.name: getattr(r.Recommendation, col.name) - for col in Recommendation.__table__.columns}, + **{ + col.name: getattr(r.Recommendation, col.name) + for col 
in Recommendation.__table__.columns + }, "scenario_id": r.scenario_id, - "materials": [] # placeholder + "materials": [], # placeholder } for r in recommendations_query ] @@ -138,23 +149,25 @@ def get_data(portfolio_id, scenario_ids): # -------------------- # Recommendation materials (SEPARATE QUERY) # -------------------- - materials_query = session.query( - RecommendationMaterials - ).filter( - RecommendationMaterials.recommendation_id.in_(recommendation_ids) - ).all() + materials_query = ( + session.query(RecommendationMaterials) + .filter(RecommendationMaterials.recommendation_id.in_(recommendation_ids)) + .all() + ) # Group materials by recommendation_id materials_by_recommendation = defaultdict(list) for m in materials_query: - materials_by_recommendation[m.recommendation_id].append({ - "material_id": m.material_id, - "depth": m.depth, - "quantity": m.quantity, - "quantity_unit": m.quantity_unit, - "estimated_cost": m.estimated_cost, - }) + materials_by_recommendation[m.recommendation_id].append( + { + "material_id": m.material_id, + "depth": m.depth, + "quantity": m.quantity, + "quantity_unit": m.quantity_unit, + "estimated_cost": m.estimated_cost, + } + ) # Attach materials safely (no filtering side effects) for r in recommendations_data: @@ -165,7 +178,9 @@ def get_data(portfolio_id, scenario_ids): return properties_data, plans_data, recommendations_data -properties_data, plans_data, recommendations_data = get_data(portfolio_id=PORTFOLIO_ID, scenario_ids=SCENARIOS) +properties_data, plans_data, recommendations_data = get_data( + portfolio_id=PORTFOLIO_ID, scenario_ids=SCENARIOS +) properties_df = pd.DataFrame(properties_data) plans_df = pd.DataFrame(plans_data) @@ -176,10 +191,8 @@ with db_read_session() as session: materials = pd.DataFrame(materials) -material_lookup = ( - materials - .set_index("id")[["type", "includes_battery"]] - .to_dict("index") +material_lookup = materials.set_index("id")[["type", "includes_battery"]].to_dict( + "index" ) @@ -193,14 
+206,14 @@ def has_solar_with_battery(materials_list): return False -recommendations_df["has_solar_with_battery"] = ( - recommendations_df["materials"].apply(has_solar_with_battery) +recommendations_df["has_solar_with_battery"] = recommendations_df["materials"].apply( + has_solar_with_battery ) recommendations_df["measure_type"] = np.where( recommendations_df["has_solar_with_battery"] == True, recommendations_df["measure_type"] + "_with_battery", - recommendations_df["measure_type"] + recommendations_df["measure_type"], ) # Adjust material type to indicate if there is a battery included @@ -215,50 +228,67 @@ from utils.s3 import read_csv_from_s3, read_excel_from_s3 for scenario_id in SCENARIOS: # Get recs for this scenario - recommended_measures_df = recommendations_df[recommendations_df["scenario_id"] == scenario_id][ - ["property_id", "measure_type", "estimated_cost", "default"] + recommended_measures_df = recommendations_df[ + recommendations_df["scenario_id"] == scenario_id + ][["property_id", "measure_type", "estimated_cost", "default"]] + recommended_measures_df = recommended_measures_df[ + recommended_measures_df["default"] ] - recommended_measures_df = recommended_measures_df[recommended_measures_df["default"]] recommended_measures_df = recommended_measures_df.drop(columns=["default"]) - post_install_sap = recommendations_df[recommendations_df["scenario_id"] == scenario_id][ - ["property_id", "default", "sap_points"]] + post_install_sap = recommendations_df[ + recommendations_df["scenario_id"] == scenario_id + ][["property_id", "default", "sap_points"]] post_install_sap = post_install_sap[post_install_sap["default"]] # Sum up the sap points by property id - post_install_sap = post_install_sap.groupby(["property_id"])[["sap_points"]].sum().reset_index() + post_install_sap = ( + post_install_sap.groupby(["property_id"])[["sap_points"]].sum().reset_index() + ) # Find dupes by property id and measure type - dupes = 
recommended_measures_df.duplicated(subset=["property_id", "measure_type"], keep=False) + dupes = recommended_measures_df.duplicated( + subset=["property_id", "measure_type"], keep=False + ) dupe_df = recommended_measures_df[dupes] if dupe_df.shape: # Drop dupes - happened due to a funny bug recommended_measures_df = recommended_measures_df.drop_duplicates( - subset=["property_id", "measure_type"], keep='first' + subset=["property_id", "measure_type"], keep="first" ) recommendations_measures_pivot = recommended_measures_df.pivot( - index='property_id', - columns='measure_type', - values='estimated_cost' + index="property_id", columns="measure_type", values="estimated_cost" ) recommendations_measures_pivot = recommendations_measures_pivot.reset_index() # Total cost is the row sum, excluding the property_id column - recommendations_measures_pivot["total_retrofit_cost"] = recommendations_measures_pivot.drop( - columns=["property_id"] - ).sum(axis=1) + recommendations_measures_pivot["total_retrofit_cost"] = ( + recommendations_measures_pivot.drop(columns=["property_id"]).sum(axis=1) + ) - df = properties_df[ - [ - "landlord_property_id", "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", - "heating", "windows", "current_epc_rating", "current_sap_points", "total_floor_area", "number_of_rooms", - "id" + df = ( + properties_df[ + [ + "landlord_property_id", + "property_id", + "uprn", + "address", + "postcode", + "property_type", + "walls", + "roof", + "heating", + "windows", + "current_epc_rating", + "current_sap_points", + "total_floor_area", + "number_of_rooms", + "id", + ] ] - ].merge( - recommendations_measures_pivot, how="left", on="property_id" - ).merge( - post_install_sap, how="left", on="property_id" + .merge(recommendations_measures_pivot, how="left", on="property_id") + .merge(post_install_sap, how="left", on="property_id") ) # df = df.drop(columns=["property_id"]) @@ -266,21 +296,25 @@ for scenario_id in SCENARIOS: 
df["predicted_post_works_sap"] = df["current_sap_points"] + df["sap_points"] df["predicted_post_works_sap"] = df["predicted_post_works_sap"] - df["predicted_post_works_epc"] = df["predicted_post_works_sap"].apply(lambda x: sap_to_epc(x)) + df["predicted_post_works_epc"] = df["predicted_post_works_sap"].apply( + lambda x: sap_to_epc(x) + ) df["uprn"] = df["uprn"].astype(str) relevant_plans = plans_df[plans_df["scenario_id"] == scenario_id] df2 = df.merge( - relevant_plans[["property_id", "post_sap_points", "post_epc_rating"]], how="left", on="property_id", - suffixes=("", "_plan") + relevant_plans[["property_id", "post_sap_points", "post_epc_rating"]], + how="left", + on="property_id", + suffixes=("", "_plan"), ) print(df2["predicted_post_works_epc"].value_counts()) print(df2["post_epc_rating"].value_counts()) z = df2[ - (df2["predicted_post_works_epc"] != "D") & - (df2["post_epc_rating"].astype(str) == "Epc.D") - ] + (df2["predicted_post_works_epc"] != "D") + & (df2["post_epc_rating"].astype(str) == "Epc.D") + ] df2["predicted_post_works_epc"].value_counts() df2["post_epc_rating"].astype(str).value_counts() @@ -295,183 +329,6 @@ for scenario_id in SCENARIOS: df[df["predicted_post_works_sap"] == ""] # Create excel to store to - filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " - f"Project/Final SAL/scenarios/{scenario_names[scenario_id]} - 20250114 final.xlsx") + filename = f"{scenario_names[scenario_id]} - 20250113 final.xlsx" with pd.ExcelWriter(filename) as writer: df.to_excel(writer, sheet_name="properties", index=False) - - -# asset_list = pd.DataFrame(asset_list) -# asset_list = asset_list.rename( -# columns={ -# "postcode": "domna_postcode" -# } -# ) -# if "domna_full_address": -# # For Peabody -# asset_list["domna_full_address"] = asset_list["domna_address_1"] -# -# asset_list = asset_list[["domna_full_address", "domna_postcode", "epc_os_uprn", ]].copy() -# asset_list = asset_list.rename(columns={"epc_os_uprn": 
"uprn"}) -# asset_list["uprn"] = asset_list["uprn"].astype("Int64").astype(str) -# asset_list = asset_list.merge( -# df.drop(columns=["address", "postcode", "property_type", "total_floor_area"]), -# how="left", -# on="uprn" -# ) - - -# Get conservation area data from property details spatial. based on the UPRNs -def get_conservation_area_data(uprns): - session = sessionmaker(bind=db_engine)() - session.begin() - - # Query to get conservation area data - spatial_query = session.query( - PropertyDetailsSpatial - ).filter( - PropertyDetailsSpatial.uprn.in_(uprns) # Filter by UPRNs - ).all() - - # Transform spatial data to include all fields dynamically - spatial_data = [ - {col.name: getattr(spatial, col.name) for col in PropertyDetailsSpatial.__table__.columns} - for spatial in spatial_query - ] - - session.close() - return pd.DataFrame(spatial_data) - - -uprns = asset_list[ - ~pd.isna(asset_list["uprn"]) & (asset_list["uprn"] != "") - ]["uprn"].astype(int).unique().tolist() -conservation_area_data = get_conservation_area_data(uprns) -conservation_area_data["uprn"] = conservation_area_data["uprn"].astype(str) -asset_list = asset_list.merge( - conservation_area_data[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]], - how="left", - on="uprn" -) - -# For exporting -df.to_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lincs Rural/EPC C -without floors proposed measures - " - "with ID.xlsx", - index=False -) -# asset_list.to_excel( -# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lincs Rural/epc_measures.xlsx", -# index=False -# ) - -condition_costs = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/sfr/Spring JV/Condition costs.xlsx", - sheet_name="Prices - Khalim", - header=35 -) -# Remove unnamed columns and reset index -condition_costs = condition_costs.loc[:, ~condition_costs.columns.str.contains('^Unnamed')] -condition_costs = condition_costs.reset_index(drop=True) - - -# We now estimate 
condition cost -def simulate_condition(asset_list, condition_costs): - """ - This function is for testing, and will simulate condition cost from 1-10 for each property to see what the - costing array looks like. - :param df: - :return: - """ - - condition_df = [] - for _, row in asset_list.iterrows(): - - n_bathrooms = row["bathrooms"] - - conditions = {} - for condition in reversed(range(1, 11)): - condition_cost = condition_costs[ - condition_costs["Condition"] == condition - ].drop(columns=["Condition"]).iloc[0] - - # Each cost is scaled by floor area - condition_cost = condition_cost * row["total_floor_area"] - condition_cost["Bathroom"] = condition_cost["Bathroom"] * n_bathrooms - - total_condition_cost = condition_cost.sum() - conditions["Condition " + str(condition)] = (total_condition_cost) - - condition_df.append( - { - "uprn": row["uprn"], - **conditions - } - ) - - condition_df = pd.DataFrame(condition_df) - - asset_list = asset_list.merge( - condition_df, - how="left", - on="uprn" - ) - - return asset_list - - -# asset_list = simulate_condition(asset_list, condition_costs) - -# We calculate the condition cost based on the condition -for _, row in asset_list.iterrows(): - - condition = row["condition_score"] - if condition in [None, ""]: - continue - condition = int(float(condition)) - - condition_cost = condition_costs[ - condition_costs["Condition"] == condition - ].drop(columns=["Condition"]).iloc[0] - - # Each cost is scaled by floor area - condition_cost = condition_cost * float(row["total_floor_area"]) - n_bathrooms = row["n_bathrooms"] - condition_cost["Bathroom"] = condition_cost["Bathroom"] * float(n_bathrooms) - - total_condition_cost = condition_cost.sum() - asset_list.loc[asset_list["uprn"] == row["uprn"], "domna_condition_cost"] = total_condition_cost - -# Store output -asset_list.to_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/sfr/Spring JV/20250624_portfolio_retrofit_packages.xlsx", - index=False -) - -condition_cost_comparison = 
asset_list[ - ["condition_score", "decoration_sum_min ", "decoration_sum_max", "domna_condition_cost"] -] - -# Testing -plans_df.head() - -example = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final " - "SAL/scenarios/EPC C - no solid floor, no EWI or IWI, ashp 3.0 - 20250114 final.xlsx" -) - -plans_df2 = plans_df.merge( - properties_df[["property_id", "landlord_property_id"]], - left_on="property_id", - right_on="property_id", - how="left" -) - -plans_df2 = plans_df2[plans_df2["scenario_id"] == 909] - -dupes = plans_df2[plans_df2["property_id"].duplicated()] - -# merge on plans -example = example.merge( - plans_df, how="left", -) diff --git a/utils/s3.py b/utils/s3.py index e70669d0..2e67d4f0 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -264,6 +264,7 @@ def save_excel_to_s3(df, bucket_name, file_key): def read_csv_from_s3(bucket_name, filepath): + logger.info(f"Reading CSV file from S3 bucket '{bucket_name}' with key '{filepath}'") s3 = boto3.client('s3') # Get the object from s3