diff --git a/.devcontainer/asset_list/Dockerfile b/.devcontainer/asset_list/Dockerfile new file mode 100644 index 00000000..512ab109 --- /dev/null +++ b/.devcontainer/asset_list/Dockerfile @@ -0,0 +1,39 @@ +FROM python:3.11.10-bullseye + + +ARG USER=vscode +ARG DEBIAN_FRONTEND=noninteractive + +# 1) Toolchain + utilities for building libpostal +RUN apt-get update && apt-get install -y --no-install-recommends \ + sudo jq vim curl git ca-certificates \ + build-essential pkg-config automake autoconf libtool \ + && rm -rf /var/lib/apt/lists/* + +# # 2) Build and install libpostal from source +RUN git clone --depth 1 https://github.com/openvenues/libpostal /tmp/libpostal \ + && cd /tmp/libpostal \ + && ./bootstrap.sh \ + && ./configure --datadir=/usr/local/share/libpostal \ + && make -j"$(nproc)" \ + && make install \ + && ldconfig \ + && rm -rf /tmp/libpostal + +# 3) Create the user and grant sudo privileges +RUN useradd -m -s /usr/bin/bash ${USER} \ + && echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \ + && chmod 0440 /etc/sudoers.d/${USER} + +# # 4) Python deps - if you want to run assest list +ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 +ADD asset_list/requirements.txt requirements.txt +RUN pip install -r requirements.txt + +RUN pip install -r requirements.txt +# 5) Workdir +WORKDIR /workspaces/model + +# 6) Make Python find your package +# Add project root to PYTHONPATH for all processes +ENV PYTHONPATH=/workspaces/model:${PYTHONPATH} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/asset_list/devcontainer.json similarity index 95% rename from .devcontainer/devcontainer.json rename to .devcontainer/asset_list/devcontainer.json index 5e23ae0d..4834d559 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/asset_list/devcontainer.json @@ -1,7 +1,7 @@ { - "name": "Basic Python", + "name": "SAL ENV", "dockerComposeFile": "docker-compose.yml", - "service": "model", + "service": "model-sal", "remoteUser": "vscode", 
"workspaceFolder": "/workspaces/model", "postStartCommand": "bash .devcontainer/post-install.sh", diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/asset_list/docker-compose.yml similarity index 59% rename from .devcontainer/docker-compose.yml rename to .devcontainer/asset_list/docker-compose.yml index 7f60d34d..06e4124d 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/asset_list/docker-compose.yml @@ -1,14 +1,14 @@ version: '3.8' services: - model: + model-sal: user: "${UID}:${GID}" build: - context: .. - dockerfile: .devcontainer/Dockerfile + context: ../.. + dockerfile: .devcontainer/asset_list/Dockerfile command: sleep infinity volumes: - - ..:/workspaces/model + - ../../:/workspaces/model networks: - model-net diff --git a/.devcontainer/post-install.sh b/.devcontainer/asset_list/post-install.sh similarity index 98% rename from .devcontainer/post-install.sh rename to .devcontainer/asset_list/post-install.sh index dc6da006..48fbfde1 100644 --- a/.devcontainer/post-install.sh +++ b/.devcontainer/asset_list/post-install.sh @@ -11,4 +11,4 @@ if os.path.exists(env_path): print("✔ Loaded .env into Jupyter kernel") else: print("⚠ No .env file found to load") -EOF \ No newline at end of file +EOF diff --git a/.devcontainer/asset_list/requirements.txt b/.devcontainer/asset_list/requirements.txt new file mode 100644 index 00000000..0640f2c9 --- /dev/null +++ b/.devcontainer/asset_list/requirements.txt @@ -0,0 +1,24 @@ +fastapi==0.115.2 +sqlalchemy==2.0.36 +psycopg2-binary==2.9.10 +python-jose==3.3.0 +cryptography==43.0.3 +mangum==0.19.0 +# AWS +boto3==1.35.44 +# Data +openpyxl==3.1.2 +# Basic +pytz +uvicorn[standard] +# Testing +pytest==9.0.2 +pytest-cov==7.0.0 +ipykernel>=6.25,<7 +pydantic-settings<2 +pyyaml>=6.0.1 +pydantic>=1.10.7,<2 +sqlmodel +# Formatting +black==26.1.0 +dotenv diff --git a/.devcontainer/Dockerfile b/.devcontainer/backend/Dockerfile similarity index 96% rename from .devcontainer/Dockerfile rename to 
.devcontainer/backend/Dockerfile index ccfb55b6..4c5d16f5 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -34,7 +34,7 @@ RUN useradd -m -s /usr/bin/bash ${USER} \ ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 ADD backend/engine/requirements.txt requirements1.txt ADD backend/app/requirements/requirements.txt requirements2.txt -ADD .devcontainer/requirements.txt requirements3.txt +ADD .devcontainer/backend/requirements.txt requirements3.txt RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt RUN pip install -r requirements.txt diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json new file mode 100644 index 00000000..c672b1bf --- /dev/null +++ b/.devcontainer/backend/devcontainer.json @@ -0,0 +1,40 @@ +{ + "name": "Backend Model Env", + "dockerComposeFile": "docker-compose.yml", + "service": "model-backend", + "remoteUser": "vscode", + "workspaceFolder": "/workspaces/model", + "postStartCommand": "bash .devcontainer/backend/post-install.sh", + "mounts": [ + "source=${localEnv:HOME},target=/workspaces/home,type=bind" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-toolsai.jupyter", + "mechatroner.rainbow-csv", + "ms-toolsai.datawrangler", + "lindacong.vscode-book-reader", + "4ops.terraform", + "fabiospampinato.vscode-todo-plus", + "jgclark.vscode-todo-highlight", + "corentinartaud.pdfpreview", + "ms-python.vscode-python-envs", + "ms-python.black-formatter", + "waderyan.gitblame" + ], + "settings": { + "files.defaultWorkspace": "/workspaces/model", + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true + }, + "python.formatting.provider": "none" + } + } + }, + "containerEnv": { + "PYTHONFLAGS": "-Xfrozen_modules=off" + } +} diff --git a/.devcontainer/backend/docker-compose.yml b/.devcontainer/backend/docker-compose.yml new file mode 100644 index 00000000..683b4489 --- 
/dev/null +++ b/.devcontainer/backend/docker-compose.yml @@ -0,0 +1,28 @@ +version: '3.8' + +services: + model-backend: + user: "${UID}:${GID}" + build: + context: ../.. + dockerfile: .devcontainer/backend/Dockerfile + command: sleep infinity + volumes: + - ../../:/workspaces/model + + + db: + image: postgres:17.4 + restart: unless-stopped + ports: + - 5432:5432 + environment: + - PGDATABASE=tech_team_local_db + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=makingwarmerhomes + volumes: + - postgres-data-two:/var/lib/postgresql/data + + +volumes: + postgres-data-two: \ No newline at end of file diff --git a/.devcontainer/backend/post-install.sh b/.devcontainer/backend/post-install.sh new file mode 100644 index 00000000..48fbfde1 --- /dev/null +++ b/.devcontainer/backend/post-install.sh @@ -0,0 +1,14 @@ +mkdir -p ~/.ipython/profile_default/startup + +cat << 'EOF' > ~/.ipython/profile_default/startup/00-load-env.py +from dotenv import load_dotenv +import os + +# Adjust path as needed +env_path = "/workspaces/model/backend/.env" +if os.path.exists(env_path): + load_dotenv(env_path) + print("✔ Loaded .env into Jupyter kernel") +else: + print("⚠ No .env file found to load") +EOF diff --git a/.devcontainer/requirements.txt b/.devcontainer/backend/requirements.txt similarity index 96% rename from .devcontainer/requirements.txt rename to .devcontainer/backend/requirements.txt index 5e7753a6..9562aa6a 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/backend/requirements.txt @@ -1,4 +1,4 @@ -# fastapi + fastapi==0.115.2 sqlalchemy==2.0.36 pydantic-settings==2.6.0 diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml new file mode 100644 index 00000000..6b6c4994 --- /dev/null +++ b/.github/workflows/_build_image.yml @@ -0,0 +1,78 @@ +name: Build Docker image + +on: + workflow_call: + inputs: + ecr_repo: + required: true + type: string + dockerfile_path: + required: true + type: string + build_context: + required: false + default: 
"." + type: string + + outputs: + image_digest: + description: "Pushed image digest" + value: ${{ jobs.build.outputs.image_digest }} + ecr_repo_url: + description: "ECR repository URL" + value: ${{ jobs.build.outputs.ecr_repo_url }} + + secrets: + AWS_ACCESS_KEY_ID: + required: true + AWS_SECRET_ACCESS_KEY: + required: true + AWS_REGION: + required: true + +jobs: + build: + runs-on: ubuntu-latest + + outputs: + image_digest: ${{ steps.digest.outputs.image_digest }} + ecr_repo_url: ${{ steps.repo.outputs.ecr_repo_url }} + + steps: + - uses: actions/checkout@v4 + + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + + - uses: aws-actions/amazon-ecr-login@v2 + + - name: Resolve ECR repo URL + id: repo + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) + + ECR_REPO_URL="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${{ inputs.ecr_repo }}" + + echo "Resolved ECR repo URL (local var):" + echo "$ECR_REPO_URL" + + echo "ecr_repo_url=$ECR_REPO_URL" >> "$GITHUB_OUTPUT" + + - name: Build & push image + run: | + IMAGE_URI="${{ steps.repo.outputs.ecr_repo_url }}:${GITHUB_SHA}" + docker build -f ${{ inputs.dockerfile_path }} -t $IMAGE_URI ${{ inputs.build_context }} + docker push $IMAGE_URI + + - name: Resolve image digest + id: digest + run: | + DIGEST=$(aws ecr describe-images \ + --repository-name ${{ inputs.ecr_repo }} \ + --image-ids imageTag=${GITHUB_SHA} \ + --query 'imageDetails[0].imageDigest' \ + --output text) + echo "image_digest=$DIGEST" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml new file mode 100644 index 00000000..bff106c5 --- /dev/null +++ b/.github/workflows/_deploy_lambda.yml @@ -0,0 +1,91 @@ +name: Deploy Lambda (Terraform) + +on: + workflow_call: + inputs: + lambda_name: + required: true + 
type: string + + lambda_path: + required: true + type: string + + stage: + required: true + type: string + + ecr_repo: + required: true + type: string + + image_digest: + required: true + type: string + + secrets: + AWS_ACCESS_KEY_ID: + required: true + AWS_SECRET_ACCESS_KEY: + required: true + AWS_REGION: + required: true + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Debug inputs + run: | + echo "lambda_name=${{ inputs.lambda_name }}" + echo "lambda_path=${{ inputs.lambda_path }}" + echo "stage=${{ inputs.stage }}" + echo "ecr_repo_url=${{ inputs.ecr_repo_url }}" + echo "image_digest=${{ inputs.image_digest }}" + + + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + + - uses: hashicorp/setup-terraform@v3 + + - uses: aws-actions/amazon-ecr-login@v2 + + - name: Resolve ECR repo URL + id: repo + env: + AWS_REGION: ${{ secrets.AWS_REGION }} + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) + ECR_REPO_URL="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${{ inputs.ecr_repo }}" + echo "ecr_repo_url=$ECR_REPO_URL" >> "$GITHUB_OUTPUT" + + - name: Terraform Init + working-directory: ${{ inputs.lambda_path }} + run: terraform init -reconfigure + + - name: Terraform Workspace + working-directory: ${{ inputs.lambda_path }} + run: | + terraform workspace select ${{ inputs.stage }} \ + || terraform workspace new ${{ inputs.stage }} + + - name: Terraform Plan + working-directory: ${{ inputs.lambda_path }} + run: | + terraform plan \ + -var="stage=${{ inputs.stage }}" \ + -var="lambda_name=${{ inputs.lambda_name }}" \ + -var="ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}" \ + -var="image_digest=${{ inputs.image_digest }}" \ + -out=lambdaplan + + - name: Terraform Apply + working-directory: ${{ inputs.lambda_path }} + 
run: terraform apply -auto-approve lambdaplan diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index a7aef225..41a551c4 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -1,80 +1,98 @@ -name: Deploy terraform stack +name: Deploy infrastructure on: push: branches: - - dev - - prod + - "**" jobs: - deploy: + determine_stage: runs-on: ubuntu-latest + outputs: + stage: ${{ steps.set-stage.outputs.stage }} + steps: - - name: Checkout - uses: actions/checkout@v2 - - - name: Setup AWS credentials file + - name: Determine stage from branch + id: set-stage + shell: bash run: | - mkdir -p ~/.aws - echo "[DevAdmin]" > ~/.aws/credentials - echo "aws_access_key_id = ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}" >> ~/.aws/credentials - echo "aws_secret_access_key = ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}" >> ~/.aws/credentials - echo "[ProdAdmin]" >> ~/.aws/credentials - echo "aws_access_key_id = ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}" >> ~/.aws/credentials - echo "aws_secret_access_key = ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}" >> ~/.aws/credentials + BRANCH="${GITHUB_REF_NAME}" - - name: Setup AWS config file - run: | - echo "[profile DevAdmin]" > ~/.aws/config - echo "region = eu-west-2" >> ~/.aws/config - echo "[profile ProdAdmin]" >> ~/.aws/config - echo "region = eu-west-2" >> ~/.aws/config + if [[ "$BRANCH" == "prod" ]]; then + echo "stage=prod" >> "$GITHUB_OUTPUT" - - name: Setup Terraform - uses: hashicorp/setup-terraform@v1 - with: - terraform_version: 1.5.2 + elif [[ "$BRANCH" == "dev" ]]; then + echo "stage=dev" >> "$GITHUB_OUTPUT" - - name: Configure AWS credentials (DevAdmin) - uses: aws-actions/configure-aws-credentials@v1 + else + echo "stage=dev" >> "$GITHUB_OUTPUT" + fi + + # ============================================================ + # 1️⃣ Shared Terraform (infra) + # ============================================================ + shared_terraform: + needs: 
determine_stage + runs-on: ubuntu-latest + env: + STAGE: ${{ needs.determine_stage.outputs.stage }} + + steps: + - uses: actions/checkout@v4 + + - uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} - aws-region: eu-west-2 - env: - AWS_PROFILE: "DevAdmin" + aws-region: ${{ secrets.DEV_AWS_REGION }} + + - uses: hashicorp/setup-terraform@v3 - name: Terraform Init - run: cd infrastructure/terraform && terraform init + working-directory: infrastructure/terraform/shared + run: terraform init -reconfigure - name: Terraform Workspace - run: | - BRANCH_NAME=$(echo "${{ github.ref }}" | sed -e "s/^refs\/heads\///") - cd infrastructure/terraform - terraform workspace select ${BRANCH_NAME} || terraform workspace new ${BRANCH_NAME} + working-directory: infrastructure/terraform/shared + run: terraform workspace select ${STAGE} || terraform workspace new ${STAGE} - name: Terraform Plan - run: | - BRANCH_NAME=$(echo "${{ github.ref }}" | sed -e "s/^refs\/heads\///") - cd infrastructure/terraform && terraform plan -var-file=${BRANCH_NAME}.tfvars + working-directory: infrastructure/terraform/shared + run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan - - name: Deploy to Dev - if: github.ref == 'refs/heads/dev' - run: cd infrastructure/terraform && terraform apply -var-file=dev.tfvars -auto-approve - env: - name: dev + - name: Terraform Apply + if: env.STAGE == 'prod' + working-directory: infrastructure/terraform/shared + run: terraform apply -auto-approve tfplan - - name: Configure AWS credentials (ProdAdmin) - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }} - aws-region: eu-west-2 - env: - AWS_PROFILE: "ProdAdmin" + # ============================================================ + # 2️⃣ Build Address 2 UPRN image and Push + # 
============================================================ + address2uprn_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: backend/address2UPRN/Dockerfile + build_context: backend/address2UPRN + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} - - name: Deploy to Prod - if: github.ref == 'refs/heads/prod' - run: cd infrastructure/terraform && terraform apply -var-file=prod.tfvars -auto-approve - env: - name: prod + # ============================================================ + # 3️⃣ Deploy Address 2 UPRN Lambda + # ============================================================ + address2uprn_lambda: + needs: [address2uprn_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: address2uprn + lambda_path: infrastructure/terraform/lambda/address2UPRN + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.address2uprn_image.outputs.image_digest }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 95155c86..14d5a06f 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -2,6 +2,12 @@ name: Run unit tests on: pull_request: + branches: + - "**" + push: + branches: + - "**" + jobs: test: diff --git a/.vscode/settings.json b/.vscode/settings.json index 88c2ae2d..3d4c6b42 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -9,9 +9,12 @@ "path": "/bin/bash" } }, +<<<<<<< HEAD +======= "python.testing.unittestEnabled": 
false, "python.testing.pytestEnabled": true, "python.testing.pytestArgs": ["-s", "-q", "--no-cov"] +>>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d // Hot reload setting that needs to be in user settings // "jupyter.runStartupCommands": [ diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 940c723a..ea4d8b34 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -34,7 +34,8 @@ from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes logger = setup_logger() # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "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") + class DataRemapper: @@ -1159,13 +1160,17 @@ class AssetList: ), axis=1 ) + + col = self.EPC_API_DATA_NAMES["roof-description"] self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply( - lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[ + lambda x: RoofAttributes(description=x[col]).process()[ "insulation_thickness"] if not pd.isnull( - x[self.EPC_API_DATA_NAMES["roof-description"]]) else None, + x[col]) else None, axis=1 ) + + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = ( self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].str.replace("+", "") ) diff --git a/asset_list/DataMapper.py b/asset_list/DataMapper.py index ac1b8db3..0751a7cf 100644 --- a/asset_list/DataMapper.py +++ b/asset_list/DataMapper.py @@ -1,5 +1,5 @@ # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", 
"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") class DataRemapper: diff --git a/asset_list/__init__.py b/asset_list/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/asset_list/app.py b/asset_list/app.py index 63dc0601..b46254f9 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -14,22 +14,32 @@ from dotenv import load_dotenv from backend.SearchEpc import SearchEpc load_dotenv(dotenv_path="backend/.env") -EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +EPC_AUTH_TOKEN = os.getenv( + "EPC_AUTH_TOKEN", +) -def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"): +def extract_address1( + asset_list, full_address_col, postcode_col, method="first_two_words" +): if method == "first_two_words": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + asset_list["address1_extracted"] = ( + asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + ) return asset_list if method == "first_word": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + asset_list["address1_extracted"] = ( + asset_list[full_address_col].str.split(" ").str[0] + ) return asset_list if method == "house_number_extraction": asset_list["address1_extracted"] = asset_list.apply( - lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), - axis=1 + lambda x: SearchEpc.get_house_number( + address=x[full_address_col], postcode=x[postcode_col] + ), + axis=1, ) return asset_list @@ -59,21 +69,20 @@ def app(): Property UPRN """ - # Fairhive - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Fairhive" - data_filename = "Fairhive Asset list.xlsx" - sheet_name = "Sheet1" - postcode_column = 'POSTCODE' - address1_column = "ADDRESS" + data_folder = 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hackney" + data_filename = "Domna SHF Wave 3 (3).xlsx" + sheet_name = "Domna Wave 3" + postcode_column = "Postcode" + address1_column = "Address 1" address1_method = None - fulladdress_column = 'ADDRESS' - address_cols_to_concat = [] + fulladdress_column = None + address_cols_to_concat = ["Address 1"] missing_postcodes_method = None - landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = "PROPERTY TYPE" - landlord_built_form = None - landlord_wall_construction = None + landlord_year_built = "Construction Years" + landlord_os_uprn = "UPRN" + landlord_property_type = "Type" + landlord_built_form = "Attachment" + landlord_wall_construction = "Wall type" landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None @@ -93,93 +102,28 @@ def app(): asset_list_header = 0 landlord_block_reference = None - # Hyde - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Minor Works" - data_filename = "Hyde Group - Domna Minor Works Programme List.xlsx" - sheet_name = "Sheet1" - postcode_column = 'Postcode' - address1_column = None - address1_method = "house_number_extraction" - fulladdress_column = 'Address' - address_cols_to_concat = [] - missing_postcodes_method = None - landlord_year_built = "Age" - landlord_os_uprn = None - landlord_property_type = "Property Type" - landlord_built_form = "Property Type" - landlord_wall_construction = "Walls" - landlord_roof_construction = "Roofs" - landlord_heating_system = "Heating" - landlord_existing_pv = "Renewables" - landlord_property_id = "Organisation Reference" - landlord_sap = "SAP (10)" - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - outcomes_address = None - master_filepaths = [] - master_id_colnames = [] - master_to_asset_list_filepath = None - phase = False - ecosurv_landlords = None - asset_list_header = 0 - 
landlord_block_reference = None - - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NCHA/20260129 SAL" - data_filename = "NCHA ASSET LIST 1.xlsx" - sheet_name = "NCHA ASSET LIST" - postcode_column = 'POSTCODE' - address1_column = None - address1_method = "house_number_extraction" - fulladdress_column = 'ADDRESS' - address_cols_to_concat = [] - missing_postcodes_method = None - landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = "PROPERTY TYPE" - landlord_built_form = "BUILD FORM" - landlord_wall_construction = "wall combined" - landlord_roof_construction = None - landlord_heating_system = None - landlord_existing_pv = None - landlord_property_id = "UPRN" - landlord_sap = None - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - outcomes_address = None - master_filepaths = [] - master_id_colnames = [] - master_to_asset_list_filepath = None - phase = False - ecosurv_landlords = None - asset_list_header = 0 - landlord_block_reference = None - # Peabody data for cleaning - data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " - "Project/data_validation") + data_folder = ( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " + "Project/data_validation" + ) data_filename = "to_standardise_uprns.xlsx" sheet_name = "Sheet1" - postcode_column = 'Postcode' - address1_column = "Address 1" - address1_method = None - fulladdress_column = None - address_cols_to_concat = ["Address 1", "Address 2", "Address 3"] + postcode_column = "Postcode" + address1_column = None + address1_method = "house_number_extraction" + fulladdress_column = "Address" + address_cols_to_concat = None missing_postcodes_method = None landlord_year_built = None landlord_os_uprn = None - landlord_property_type = "Type" - landlord_built_form = "Attachment" + landlord_property_type = None + 
landlord_built_form = None landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "Org Ref" + landlord_property_id = "LLUPRN" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -195,40 +139,6 @@ def app(): asset_list_header = 0 landlord_block_reference = None - # Lambeth: - # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lambeth/December 10th" - # data_filename = "lambeth_sw2_leigham court estate.xlsx" - # sheet_name = "Sheet1" - # postcode_column = 'Postcode' - # address1_column = "Address" - # address1_method = None - # fulladdress_column = None - # address_cols_to_concat = ["Address"] - # missing_postcodes_method = None - # landlord_year_built = None - # landlord_os_uprn = None - # landlord_property_type = None - # landlord_built_form = None - # landlord_wall_construction = None - # landlord_roof_construction = None - # landlord_heating_system = None - # landlord_existing_pv = None - # landlord_property_id = "row_id" - # landlord_sap = None - # outcomes_filename = None - # outcomes_sheetname = None - # outcomes_postcode = None - # outcomes_houseno = None - # outcomes_id = None - # outcomes_address = None - # master_filepaths = [] - # master_id_colnames = [] - # master_to_asset_list_filepath = None - # phase = False - # ecosurv_landlords = None - # asset_list_header = 0 - # landlord_block_reference = None - # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -253,49 +163,62 @@ def app(): landlord_existing_pv=landlord_existing_pv, landlord_sap=landlord_sap, landlord_block_reference=landlord_block_reference, - phase=phase + phase=phase, ) asset_list.init_standardise() # We produce the new maps, which can be saved for future useage new_property_type_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_property_type] if - asset_list.landlord_property_type else {} + k: v + for k, v 
in ( + asset_list.variable_mappings[asset_list.landlord_property_type] + if asset_list.landlord_property_type + else {} ).items() if k not in PROPERTY_MAPPING } new_built_form_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_built_form] if - asset_list.landlord_built_form else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_built_form] + if asset_list.landlord_built_form + else {} ).items() if k not in BUILT_FORM_MAPPINGS } new_wall_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_wall_construction] if - asset_list.landlord_wall_construction else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_wall_construction] + if asset_list.landlord_wall_construction + else {} ).items() if k not in WALL_CONSTRUCTION_MAPPINGS } new_heating_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_heating_system] if - asset_list.landlord_heating_system else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_heating_system] + if asset_list.landlord_heating_system + else {} ).items() if k not in HEATING_MAPPINGS } new_existing_pv_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_existing_pv] if asset_list.landlord_existing_pv else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_existing_pv] + if asset_list.landlord_existing_pv + else {} ).items() if k not in EXISTING_PV_MAPPINGS } new_roof_construction_map = { - k: v for k, v in ( - asset_list.variable_mappings[asset_list.landlord_roof_construction] if - asset_list.landlord_roof_construction else {} + k: v + for k, v in ( + asset_list.variable_mappings[asset_list.landlord_roof_construction] + if asset_list.landlord_roof_construction + else {} ).items() if k not in ROOF_CONSTRUCTION_MAPPINGS } @@ -309,7 +232,7 @@ def app(): outcomes_address=outcomes_address, outcomes_postcode=outcomes_postcode, 
outcomes_houseno=outcomes_houseno, - outcomes_id=outcomes_id + outcomes_id=outcomes_id, ) asset_list.flag_survey_master( @@ -343,14 +266,16 @@ def app(): skip = max(chunk_indexes) if any(x in folder_contents for x in downloaded_files): - skip = max([i for i in chunk_indexes if filename.format(i=i) in folder_contents]) + skip = max( + [i for i in chunk_indexes if filename.format(i=i) in folder_contents] + ) for i in range(0, len(asset_list.standardised_asset_list), chunk_size): print(f"Processing chunk {i} to {i + chunk_size}") if skip is not None and not force_retrieve_data: if i <= skip: continue - chunk = asset_list.standardised_asset_list[i:i + chunk_size] + chunk = asset_list.standardised_asset_list[i : i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, row_id_name=asset_list.DOMNA_PROPERTY_ID, @@ -362,7 +287,7 @@ def app(): built_form_column=AssetList.STANDARD_BUILT_FORM, manual_uprn_map=manual_uprn_map, epc_api_only=epc_api_only, - epc_auth_token=EPC_AUTH_TOKEN + epc_auth_token=EPC_AUTH_TOKEN, ) # We now retrieve any failed properties @@ -385,7 +310,9 @@ def app(): # Append the failed data to the main data # Store the chunk locally as a csv - pd.DataFrame(epc_data_chunk).to_csv(os.path.join(data_folder, f"Chunks/Chunk {i}.csv"), index=False) + pd.DataFrame(epc_data_chunk).to_csv( + os.path.join(data_folder, f"Chunks/Chunk {i}.csv"), index=False + ) # Store the errors and no-data locally with open(os.path.join(data_folder, f"Chunks/Chunk {i} errors.json"), "w") as f: json.dump(errors_chunk, f) @@ -416,7 +343,9 @@ def app(): unique_recommendations = set() for _, row in recommendations_df.iterrows(): - unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + unique_recommendations.update( + [rec["improvement-summary-text"] for rec in row["recommendations"]] + ) columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations) transformed_data = [] @@ -436,20 +365,24 @@ def app(): 
transformed_df = pd.DataFrame(transformed_data) for col in [ "Floor insulation (solid floor)", - "Floor insulation", "Floor insulation (suspended floor)" + "Floor insulation", + "Floor insulation (suspended floor)", ]: if col not in transformed_df.columns: transformed_df[col] = False transformed_df = transformed_df[ [ - asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)", - "Floor insulation", "Floor insulation (suspended floor)" + asset_list.DOMNA_PROPERTY_ID, + "Floor insulation (solid floor)", + "Floor insulation", + "Floor insulation (suspended floor)", ] ] transformed_df["epc_has_floor_recommendation"] = ( - transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] | - transformed_df["Floor insulation (suspended floor)"] + transformed_df["Floor insulation (solid floor)"] + | transformed_df["Floor insulation"] + | transformed_df["Floor insulation (suspended floor)"] ) # Get the find my epc data @@ -462,21 +395,20 @@ def app(): find_my_epc_data.append( { asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID], - **x["find_my_epc_data"] + **x["find_my_epc_data"], } ) else: find_my_epc_data.append( - { - asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID] - } + {asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID]} ) find_my_epc_data = pd.DataFrame(find_my_epc_data) find_my_epc_data = find_my_epc_data.merge( transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]], - how="left", on=asset_list.DOMNA_PROPERTY_ID + how="left", + on=asset_list.DOMNA_PROPERTY_ID, ) # We check if we get the solar pv column: @@ -486,27 +418,33 @@ def app(): # Retrieve just the data we need epc_df = epc_df[ [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) - ].rename( - columns=asset_list.EPC_API_DATA_NAMES - ) + ].rename(columns=asset_list.EPC_API_DATA_NAMES) # Look for columns not in the find my EPC data, which will have happened if we didn't # retrieve it in the first place - 
missed_find_epc_cols = [c for c in list(asset_list.FIND_EPC_DATA_NAMES.keys()) if c not in find_my_epc_data.columns] + missed_find_epc_cols = [ + c + for c in list(asset_list.FIND_EPC_DATA_NAMES.keys()) + if c not in find_my_epc_data.columns + ] if missed_find_epc_cols: for c in missed_find_epc_cols: find_my_epc_data[c] = None epc_df = epc_df.merge( find_my_epc_data[ - [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) - ] - .rename(columns=asset_list.FIND_EPC_DATA_NAMES), + [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + + list(asset_list.FIND_EPC_DATA_NAMES.keys()) + ].rename(columns=asset_list.FIND_EPC_DATA_NAMES), how="left", - on=asset_list.DOMNA_PROPERTY_ID + on=asset_list.DOMNA_PROPERTY_ID, ) asset_list.merge_data(epc_df) + # asset_list.standardised_asset_list = asset_list.standardised_asset_list[ + # asset_list.standardised_asset_list["domna_full_address"] + # != "120 Airdrie Crescent, Burnley, Lancashire" + # ] asset_list.extract_attributes() asset_list.identify_worktypes() @@ -516,7 +454,10 @@ def app(): asset_list.get_work_figures() # Store as an excel - filename = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx" + filename = ( + os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + + " - Standardised.xlsx" + ) # Store the data in two tabs. 
One for the asset list with the EPC data and the second with the flat data # Determine inspections priority @@ -540,26 +481,42 @@ def app(): # ) with pd.ExcelWriter(filename) as writer: - asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) + asset_list.standardised_asset_list.to_excel( + writer, sheet_name="Standardised Asset List", index=False + ) if asset_list.block_analysis_df is not None: - asset_list.block_analysis_df.to_excel(writer, sheet_name="Block Analysis", index=False) + asset_list.block_analysis_df.to_excel( + writer, sheet_name="Block Analysis", index=False + ) # If we have outcomes, we add a tab with the outcomes if not asset_list.outcomes_for_output.empty: - asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False) + asset_list.outcomes_for_output.to_excel( + writer, sheet_name="Outcomes", index=False + ) if not asset_list.unmatched_submissions.empty: - asset_list.unmatched_submissions.to_excel(writer, sheet_name="Unmatched Submissions", index=False) + asset_list.unmatched_submissions.to_excel( + writer, sheet_name="Unmatched Submissions", index=False + ) if not asset_list.outcomes_no_match.empty: - asset_list.outcomes_no_match.to_excel(writer, sheet_name="Unmatched Outcomes", index=False) + asset_list.outcomes_no_match.to_excel( + writer, sheet_name="Unmatched Outcomes", index=False + ) if not asset_list.ecosurv_no_match.empty: - asset_list.ecosurv_no_match.to_excel(writer, sheet_name="Unmatched Ecosurv", index=False) + asset_list.ecosurv_no_match.to_excel( + writer, sheet_name="Unmatched Ecosurv", index=False + ) if not asset_list.geographical_areas.empty: - asset_list.geographical_areas.to_excel(writer, sheet_name="Geographical Areas", index=False) + asset_list.geographical_areas.to_excel( + writer, sheet_name="Geographical Areas", index=False + ) # Store dupes if asset_list.duplicated_addresses is not None: if not asset_list.duplicated_addresses.empty: - 
asset_list.duplicated_addresses.to_excel(writer, sheet_name="Duplicate Properties", index=False) + asset_list.duplicated_addresses.to_excel( + writer, sheet_name="Duplicate Properties", index=False + ) diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index b68706be..dc7e572e 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -1,7 +1,6 @@ postal pandas usaddress -pydantic-settings==2.6.0 epc-api-python==1.0.2 thefuzz boto3 @@ -10,6 +9,5 @@ openai>=1.3.5 tiktoken msgpack beautifulsoup4 -pydantic>=1.10.7 typing-extensions>=4.5.0 -requests>=2.28.2 +requests>=2.28.2 \ No newline at end of file diff --git a/backend/.env.local b/backend/.env.local new file mode 100644 index 00000000..a05c93a3 --- /dev/null +++ b/backend/.env.local @@ -0,0 +1,22 @@ +DB_HOST=db +DB_PORT=5432 +DB_NAME=tech_team_local_db +DB_USERNAME=postgres +DB_PASSWORD=makingwarmerhomes + + +#not used +GOOGLE_SOLAR_API_KEY="test" +SAP_PREDICTIONS_BUCKET="test" +CARBON_PREDICTIONS_BUCKET="test" +HEAT_PREDICTIONS_BUCKET="test" +HEATING_KWH_PREDICTIONS_BUCKET="test" +HOTWATER_KWH_PREDICTIONS_BUCKET="test" +API_KEY="test" +ENVIRONMENT="test" +SECRET_KEY="test" +PLAN_TRIGGER_BUCKET="test" +DATA_BUCKET="test" +EPC_AUTH_TOKEN="test" +ENGINE_SQS_URL="test" +ENERGY_ASSESSMENTS_BUCKET="test" \ No newline at end of file diff --git a/backend/address2UPRN/Dockerfile b/backend/address2UPRN/Dockerfile new file mode 100644 index 00000000..ac6af2a5 --- /dev/null +++ b/backend/address2UPRN/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/python:3.10 + +# Copy function code +COPY main.py . + +# Set the handler +CMD ["main.handler"] diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md new file mode 100644 index 00000000..b4876340 --- /dev/null +++ b/backend/address2UPRN/README.md @@ -0,0 +1,20 @@ +We have list of address as input. 
+ +The addresses arrive in batches that share the same postcode, and from there we want to convert each address into its UPRN. + +If this Lambda function can do that, we'll be speeding ahead. + + +Energy Performance Information: https://epc.opendatacommunities.org/ + +Guidance page: https://epc.opendatacommunities.org/docs/guidance#field_domestic_LMK_KEY + +Example of Khalim's earlier code, for which he wrote some tests: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/tests/test_search_epc.py#L11 + + +Example of EPC search: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/SearchEpc.py#L118 + + + +Khalim has made a Python package to help scrape data: https://github.com/KhalimCK/epc-api-python + diff --git a/backend/address2UPRN/__init__.py b/backend/address2UPRN/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py new file mode 100644 index 00000000..9d27a5ce --- /dev/null +++ b/backend/address2UPRN/main.py @@ -0,0 +1,567 @@ +from epc_api.client import EpcClient +import os +from urllib.parse import urlencode +import pandas as pd +from difflib import SequenceMatcher +from tqdm import tqdm +from utils.logger import setup_logger + +logger = setup_logger() + +import re + +EPC_AUTH_TOKEN = os.getenv( + "EPC_AUTH_TOKEN", +) + +if EPC_AUTH_TOKEN is None: + raise RuntimeError("EPC_AUTH_TOKEN not defined in env") + +import re +from difflib import SequenceMatcher +from typing import Set + + +def levenshtein(a: str, b: str) -> float: + """ + Address similarity score in [0, 1]. 
+ + Strategy: + - Normalise + - Strongly penalise mismatched house/flat numbers + - Combine token overlap + character similarity + """ + + def extract_number_sequence(s: str) -> list[str]: + return re.findall(r"\d+[a-z]?", s) + + def extract_numbers(s: str) -> Set[str]: + return set(extract_number_sequence(s)) + + def tokenise(s: str) -> Set[str]: + return set(s.split()) + + def extract_building_number(s: str) -> str | None: + """ + Extract the main building number (NOT flat/unit). + Assumes formats like: + - '42 moreton road' + - 'flat 3 42 moreton road' + """ + tokens = s.split() + + # remove flat/unit context + cleaned = [] + skip_next = False + for t in tokens: + if t in ("flat", "apt", "apartment", "unit"): + skip_next = True + continue + if skip_next: + skip_next = False + continue + cleaned.append(t) + + # first remaining number is building number + for t in cleaned: + if re.fullmatch(r"\d+[a-z]?", t): + return t + + return None + + a_norm = normalise_address(a) + b_norm = normalise_address(b) + + # --- hard signal: numbers --- + nums_a = extract_numbers(a_norm) + nums_b = extract_numbers(b_norm) + + if nums_a and not nums_b: + return 0.0 + + # No shared numbers at all → impossible match + if nums_a and nums_b and nums_a.isdisjoint(nums_b): + return 0.0 + + # 🔒 HARD GUARD: building number must match + bld_a = extract_building_number(a_norm) + bld_b = extract_building_number(b_norm) + + if bld_a and bld_b and bld_a != bld_b: + return 0.0 + + # --- order-sensitive flat/building guard --- + seq_a = extract_number_sequence(a_norm) + seq_b = extract_number_sequence(b_norm) + + has_flat_token_user = any( + tok in a_norm for tok in ("flat", "apt", "apartment", "unit") + ) + has_flat_token_epc = "flat" in b_norm + + if ( + len(seq_a) == 2 + and len(seq_b) >= 2 + and has_flat_token_epc + and not has_flat_token_user + and seq_a != seq_b[:2] + ): + return 0.0 + + # --- token similarity (order-independent) --- + toks_a = tokenise(a_norm) + toks_b = tokenise(b_norm) + + 
# Token-level replacements applied by normalise_address. Keys are bare words:
# punctuation is stripped before the lookup, so dotted spellings such as
# "rd." or "no." reach this table as "rd" / "no".
_ADDRESS_SYNONYMS = {
    # street types
    "rd": "road",
    "st": "street",
    "ave": "avenue",
    "ln": "lane",
    "cres": "crescent",
    "ct": "court",
    "dr": "drive",
    # flats / units
    "apt": "flat",
    "apartment": "flat",
    "unit": "flat",
    "ste": "suite",
    # numbering noise: an empty replacement drops the token entirely
    "no": "",
}

# "28a" -> "28 a": detach a single trailing letter from a house number.
_DIGIT_LETTER_SUFFIX_RE = re.compile(r"(\d+)([a-z])\b")
# Anything that is not a word character, whitespace or "/" is punctuation.
_PUNCTUATION_RE = re.compile(r"[^\w\s/]")


def normalise_address(s: str) -> str:
    """
    Canonical UK-focused address normalisation.

    Steps, in order:
    - lowercase
    - split a single-letter suffix off a number ("28a" -> "28 a")
    - replace punctuation with spaces (``/`` is kept, e.g. "3/137")
    - collapse whitespace
    - apply the synonym table token by token ("rd" -> "road",
      "apartment" -> "flat", "no" -> dropped)

    Args:
        s: Raw address text. ``None``, an empty value or a float NaN (what
            pandas yields for a missing cell) is treated as "no address".
            Any other non-string value is converted with ``str()``.

    Returns:
        The normalised address, or ``""`` when there is nothing to normalise.
    """
    # Missing values must not crash the scoring pipeline: NaN is truthy, so
    # it has to be caught explicitly before the generic falsy check.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    if not s:
        return ""
    if not isinstance(s, str):
        s = str(s)

    text = s.lower()
    text = _DIGIT_LETTER_SUFFIX_RE.sub(r"\1 \2", text)
    text = _PUNCTUATION_RE.sub(" ", text)

    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace.
    tokens = []
    for tok in text.split():
        replacement = _ADDRESS_SYNONYMS.get(tok, tok)
        if replacement:
            tokens.append(replacement)

    return " ".join(tokens)
def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool:
    """
    Check that every non-null UPRN in ``df[column]`` is the given ``uprn``.

    Both sides are compared in a canonical text form: converted with
    ``str()``, stripped of surrounding whitespace, and with a trailing
    ``.0`` removed so that a float-typed column (``123.0``) still matches
    ``"123"``.

    Args:
        df: Frame holding the candidate rows.
        uprn: The UPRN every row is expected to carry.
        column: Name of the UPRN column.

    Returns:
        True when the column exists, has at least one non-null value, and
        all non-null values equal ``uprn``. False otherwise.
    """
    if column not in df.columns:
        return False

    def _canonical(value) -> str:
        return re.sub(r"\.0$", "", str(value).strip())

    distinct = {_canonical(value) for value in df[column].dropna()}

    # An empty set (no usable UPRNs) or more than one distinct value can
    # never equal the single-element target set.
    return distinct == {_canonical(uprn)}
def get_uprn(user_inputed_address: str, postcode: str) -> str | None:
    """
    Resolve a free-text address to a UPRN using the EPC records of its postcode.

    Args:
        user_inputed_address: Address text as supplied by the user.
        postcode: Postcode used to fetch the candidate EPC records.

    Returns:
        The UPRN as a string, or ``None`` when
        - the postcode search returns no rows,
        - the best candidate scores 0 or less,
        - the top-ranked candidates do not agree on one UPRN, or
        - the matched record carries no UPRN.
    """
    df = get_epc_data_with_postcode(postcode=postcode)

    if df.empty:
        return None

    scored_df = get_uprn_candidates(
        df,
        user_address=user_inputed_address,
    )

    # Candidates come back sorted best-first.
    best_score = scored_df.iloc[0]["lexiscore"]

    if best_score <= 0:
        return None

    # All rank-1 rows (possible draw)
    top_rank_df = scored_df[scored_df["lexirank"] == 1]
    best_row = top_rank_df.iloc[0]

    # If rank-1 rows do not agree on a single UPRN → ambiguous
    if not df_has_single_uprn(top_rank_df, uprn=best_row["uprn"]):
        return None

    address = top_rank_df["address"].values[0]
    lexiscore = float(top_rank_df["lexiscore"].values[0])

    logger.info(f"Address found to be: {address}, with lexiscore {lexiscore}")

    found_uprn = best_row["uprn"]

    # get_uprn_candidates converts the UPRN column with astype(str), so a
    # missing value can arrive here as "", "nan", "None" or "<NA>" rather
    # than as a real null. None of those is a usable UPRN.
    if str(found_uprn).strip().lower() in ("", "nan", "none", "<na>"):
        return None

    return found_uprn
def run_all_test():
    """
    Ad-hoc smoke checks against the live EPC API.

    Every call below performs a real postcode search, so the expected row
    counts and UPRNs reflect the EPC register at the time they were written
    and may drift. "No confident match" is expressed as ``None``, which is
    what ``get_uprn`` returns on failure.
    """
    # Basic usage with different postcode styles (case / spacing)
    test(get_epc_data_with_postcode("b93 8sy").shape[0], 63)
    test(get_epc_data_with_postcode("B938sy").shape[0], 63)
    test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63)

    test(get_uprn("68", "b93 8sy"), "100070989938")
    test(get_uprn("68 Glendon Way", "b93 8sy"), "100070989938")
    test(get_uprn("Flat A, 28, Nelgarde Road", "se6 4tf"), "100023278633")
    test(get_uprn("28 A", "se6 4tf"), "100023278633")
    test(get_uprn("28A", "se6 4tf"), "100023278633")
    test(get_uprn("6 Aitken Close", "E8 4SQ"), None)

    # Flat number vs building number ordering
    test(get_uprn("Flat 5, 1, Semley Gate", "e9 5nh"), "10008238198")
    test(get_uprn("5 , 1 Semley Gate", "e9 5nh"), "10008238198")
    test(get_uprn("5 Semley Gate", "e9 5nh"), "10008238198")
    test(get_uprn("1, 5 Semley Gate", "e9 5nh"), None)
    # Resolves to "flat 1" within 1 Semley Gate
    test(get_uprn("1 Semley Gate", "e9 5nh"), "10008238188")

    # Addresses expected to have no confident match
    test(get_uprn("48 Oswald Street", "E5 0BT"), None)
    test(get_uprn("42 Oswald Street", "E5 0BT"), None)
    test(get_uprn("46 Oswald Street", "E5 0BT"), None)

    # Smoke calls only: the scored candidates are not asserted on, the calls
    # just have to complete without raising.
    get_uprn_candidates(get_epc_data_with_postcode("e5 0bt"), "48 Oswald Street")
    get_uprn_candidates(
        get_epc_data_with_postcode("Cr2 7dl"),
        "FLAT 3; 42 MORETON ROAD, SOUTH CROYDON, SURREY",
    )
def handler(event, context):
    """
    Placeholder AWS Lambda entry point.

    Ignores both arguments, prints a greeting and returns it in a
    200 response.
    """
    greeting = "hello world"
    print(greeting)
    response = {"statusCode": 200}
    response["body"] = greeting
    return response
@pytest.mark.parametrize(
    "user_input,postcode,expected_uprn",
    load_test_cases(),
)
def test_uprn_resolution_matches_manual(
    user_input: str,
    postcode: str,
    expected_uprn: str,
):
    """
    Check that get_uprn agrees with the manually verified UPRN for each
    fixture row.

    The CSV spells "no UPRN expected" as the literal text ``None``.
    ``get_uprn`` returns either a UPRN string or ``None``, so comparing the
    stringified result covers both outcomes with a single assertion.
    """
    uprn = get_uprn(user_input, postcode)

    assert str(uprn) == expected_uprn, (
        f"{user_input!r} [{postcode}]: expected {expected_uprn!r}, got {uprn!r}"
    )
3AH,100021589837 +Flat 14,TW4 5DD,10091597013 +Flat 14,W4 3AH,100021589838 +Flat 15,TW4 5DD,10091597014 +Flat 15,W4 3AH,100021589839 +Flat 16,TW4 5DD,"10091597015" +Flat 16,W4 3AH,100021589840 +Flat 17,TW4 5DD,10091597016 +Flat 17,W4 3AH,100021589841 +Flat 18,TW4 5DD,10091597017 +Flat 19,W4 3AH,100021589843 +Flat 20,W4 3AH,100021589844 +Flat 21,W4 3AH,100021589845 +Flat 22,W4 3AH,100021589846 +FLAT 2 Goodstone Court,HA1 4FL,10070269054 +Flat 23,W4 3AH,100021589847 +Flat 24,W4 3AH,100021589848 +"30c, Bosanquet Close",UB8 3PE,100021475316 +"30e, Bosanquet Close",UB8 3PE,100021475318 +FLAT 3 Goodstone Court,HA1 4FL,10070269055 +FLAT 4 Goodstone Court,HA1 4FL,10070269056 +FLAT 5 Goodstone Court,HA1 4FL,10070269057 +FLAT 6 Goodstone Court,HA1 4FL,10070269058 +FLAT 7 Goodstone Court,HA1 4FL,10070269059 +FLAT 8 Goodstone Court,HA1 4FL,10070269060 +FLAT 9 Goodstone Court,HA1 4FL,10070269061 +FLAT 10 Goodstone Court,HA1 4FL,10070269062 +FLAT 11 Goodstone Court,HA1 4FL,10070269063 +FLAT 12 Goodstone Court,HA1 4FL,10070269064 +FLAT 13 Goodstone Court,HA1 4FL,10070269065 +FLAT 14 Goodstone Court,HA1 4FL,10070269066 +FLAT 15 Goodstone Court,HA1 4FL,10070269067 +FLAT 16 Goodstone Court,HA1 4FL,10070269068 +FLAT 17 Goodstone Court,HA1 4FL,10070269069 +FLAT 18 Goodstone Court,HA1 4FL,10070269070 +FLAT 19 Goodstone Court,HA1 4FL,10070269071 +FLAT 20 Goodstone Court,HA1 4FL,10070269072 +FLAT 21 Goodstone Court,HA1 4FL,10070269073 +FLAT 22 Goodstone Court,HA1 4FL,10070269074 +FLAT 23 Goodstone Court,HA1 4FL,10070269075 +FLAT 24 Goodstone Court,HA1 4FL,10070269076 +FLAT 25 Goodstone Court,HA1 4FL,10070269077 +FLAT 26 Goodstone Court,HA1 4FL,10070269078 +FLAT 27 Goodstone Court,HA1 4FL,10070269079 +FLAT 28 Goodstone Court,HA1 4FL,10070269080 +FLAT 29 Goodstone Court,HA1 4FL,10070269081 +FLAT 30 Goodstone Court,HA1 4FL,10070269082 +FLAT 31 Goodstone Court,HA1 4FL,10070269083 +FLAT 32 Goodstone Court,HA1 4FL,10070269084 +FLAT 33 Goodstone Court,HA1 4FL,10070269085 +FLAT 34 Goodstone 
Court,HA1 4FL,10070269086 +FLAT 35 Goodstone Court,HA1 4FL,10070269087 +FLAT 36 Goodstone Court,HA1 4FL,10070269088 +FLAT 37 Goodstone Court,HA1 4FL,10070269089 +FLAT 38 Goodstone Court,HA1 4FL,10070269090 +FLAT 39 Goodstone Court,HA1 4FL,10070269091 +FLAT 40 Goodstone Court,HA1 4FL,10070269092 +FLAT 41 Goodstone Court,HA1 4FL,10070269093 +FLAT 42 Goodstone Court,HA1 4FL,10070269094 +FLAT 43 Goodstone Court,HA1 4FL,10070269095 +"13 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778260 +"14 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778259 +"15 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778258 +"16 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778263 +"17 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778262 +"18 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778261 +"19 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778266 +"20 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778265 +"21 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778264 +90a Murray Road,W5 4DA,12135293 +"Flat 1, 6 Wolverton Gardens",W5 3LJ,"12119972" +"1, Monsted House",UB1 1FG,12189944 +"10, Monsted House",UB1 1FG,12189953 +"20, Monsted House",UB1 1FG,12189963 +"2, Monsted House",UB1 1FG,12189945 +"3, Monsted House",UB1 1FG,12189946 +"4, Monsted House",UB1 1FG,12189947 +"5, Monsted House",UB1 1FG,12189948 +"6, Monsted House",UB1 1FG,12189949 +"7, Monsted House",UB1 1FG,12189950 +"8, Monsted House",UB1 1FG,12189951 +"9, Monsted House",UB1 1FG,12189952 +"1 Cullis House, 1, Accolade Avenue",UB1 1FH,12189904 +"2 Cullis House, 1, Accolade Avenue",UB1 1FH,12189905 +"3 Cullis House, 1, Accolade Avenue",UB1 1FH,12189906 +"4 Cullis House, 1, Accolade Avenue",UB1 1FH,12189907 +"5 Cullis House, 1, Accolade Avenue",UB1 1FH,12189908 +"6 Cullis House, 1, Accolade Avenue",UB1 1FH,12189909 +1 Genteel House Samara Drive,UB1 1FJ,12189835 +2 Genteel House Samara Drive,UB1 1FJ,12189836 +3 Genteel House Samara Drive,UB1 1FJ,12189837 +4 Genteel House Samara Drive,UB1 1FJ,12189838 +5 Genteel 
House Samara Drive,UB1 1FJ,12189839 +6 Genteel House Samara Drive,UB1 1FJ,12189840 +7 Genteel House Samara Drive,UB1 1FJ,12189841 +8 Genteel House Samara Drive,UB1 1FJ,12189842 +9 Genteel House Samara Drive,UB1 1FJ,12189843 +10 Genteel House Samara Drive,UB1 1FJ,12189844 +1 ASH TREE HOUSE,SE5 0TE,None +"Flat 1 Ash Tree House, 2, Thompson Avenue",SE5 0TE,10009803979 +3 ASH TREE HOUSE,SE5 0TE,None +Flat 3 ASH TREE HOUSE,SE5 0TE,10009803981 +5 ASH TREE HOUSE,SE5 0TE,None +Flat 5 ASH TREE HOUSE,SE5 0TE,10009803983 +Flat 8 ASH TREE HOUSE,SE5 0TE,10009803986 +8 ASH TREE HOUSE,SE5 0TE,None +Flat 12 ASH TREE HOUSE,SE5 0TE,10009803990 +12 ASH TREE HOUSE,SE5 0TE,None +FLAT 1 599 HARROW ROAD,W10 4RA,217113930 +FLAT 2 599 HARROW ROAD,W10 4RA,217113931 +FLAT 3 599 HARROW ROAD,W10 4RA,None +FLAT 4 599 HARROW ROAD,W10 4RA,None +FLAT 5 599 HARROW ROAD,W10 4RA,217113934 +FLAT 6 599 HARROW ROAD,W10 4RA,None +FLAT 7 599 HARROW ROAD,W10 4RA,None +FLAT 8 599 HARROW ROAD,W10 4RA,None +"Flat 1, Ohio Building",SE13 7RX,10023226256 +"Flat 2, Ohio Building",SE13 7RX,10023226257 +"Apartment 1 Block B, 105, Benwell Road",N7 7BW,10012792307 +"Apartment 2 Block B, 105, Benwell Road",N7 7BW,10012792308 +"Apartment 3 Block B, 105, Benwell Road",N7 7BW,10012792309 +"Apartment 4 Block B, 105, Benwell Road",N7 7BW,10012792310 +"Apartment 5 Block B, 105, Benwell Road",N7 7BW,10012792311 +"Apartment 6 Block B, 105, Benwell Road",N7 7BW,10012792312 +"Apartment 7 Block B, 105, Benwell Road",N7 7BW,10012792313 +"Apartment 8 Block B, 105, Benwell Road",N7 7BW,10012792314 +"Apartment 9 Block B, 105, Benwell Road",N7 7BW,10012792315 +"Apartment 10 Block B, 105, Benwell Road",N7 7BW,10012792316 +"Apartment 11 Block B, 105, Benwell Road",N7 7BW,10012792317 +"Apartment 12 Block B, 105, Benwell Road",N7 7BW,10012792318 +"Apartment 13 Block B, 105, Benwell Road",N7 7BW,10012792319 +"Apartment 1 Block D, 32, Hornsey Road",N7 7AT,10012792366 +"Apartment 2 Block D, 32, Hornsey Road",N7 7AT,10012792367 +"Apartment 3 
Block D, 32, Hornsey Road",N7 7AT,10012792368 +"Apartment 4 Block D, 32, Hornsey Road",N7 7AT,10012792369 +"Apartment 5 Block D, 32, Hornsey Road",N7 7AT,10012792370 +"Apartment 6 Block D, 32, Hornsey Road",N7 7AT,"10012792371" +"Apartment 7 Block D, 32, Hornsey Road",N7 7AT,10012792372 +"Apartment 8 Block D, 32, Hornsey Road",N7 7AT,10012792373 +"Apartment 9 Block D, 32, Hornsey Road",N7 7AT,10012792374 +"Apartment 10 Block D, 32, Hornsey Road",N7 7AT,10012792375 +"Apartment 11 Block D, 32, Hornsey Road",N7 7AT,10012792376 +"Apartment 12 Block D, 32, Hornsey Road",N7 7AT,10012792377 +"Apartment 13 Block D, 32, Hornsey Road",N7 7AT,10012792378 +"Apartment 14 Block D, 32, Hornsey Road",N7 7AT,10012792379 +"Apartment 15 Block D, 32, Hornsey Road",N7 7AT,10012792380 +"Apartment 16 Block D, 32, Hornsey Road",N7 7AT,"10012792381" +"Apartment 17Block D, 32, Hornsey Road",N7 7AT,10012792382 +"Apartment 18 Block D, 32, Hornsey Road",N7 7AT,10012792383 +24b Honley Road,SE6 2HZ,None +FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974 +2 COLLEGE HOUSE,CM7 1JS,100091449870 +3 COLLEGE HOUSE,CM7 1JS,100091449871 +1 Anita Street,M4 5DU,None +2 Anita Street,M4 5DU,77123061 +5 Anita Street,M4 5DU,77123081 +6 Anita Street,M4 5DU,77123082 +8 Anita Street,M4 5DU,None +9 Anita Street,M4 5DU,None +10 Anita Street,M4 5DU,77123051 +12 Anita Street,M4 5DU,77123053 +19 Anita Street,M4 5DU,None +22 Anita Street,M4 5DU,None +26 Anita Street,M4 5DU,77123068 +28 Anita Street,M4 5DU,None +30 Anita Street,M4 5DU,None +32 Anita Street,M4 5DU,None +33 Anita Street,M4 5DU,77123076 +34 Anita Street,M4 5DU,None +35 Anita Street,M4 5DU,77123078 +36 Anita Street,M4 5DU,77123079 +23 George Leigh Street,M4 5DR,77123171 +25 George Leigh Street,M4 5DR,None +35 George Leigh Street,M4 5DR,77123177 +39 George Leigh Street,M4 5DR,77123179 +41 George Leigh Street,M4 5DR,None +43 George Leigh Street,M4 5DR,None +49 George Leigh Street,M4 5DR,None +51 George Leigh Street,M4 5DR,77123185 +55 George Leigh Street,M4 
5DR,None +57 George Leigh Street,M4 5DR,None +"1a, Victoria Square",M4 5DX,77211153 +2a Victoria Square ,M4 5DX,None +"4a, Victoria Square",M4 5DX,77211155 +5a Victoria Square,M4 5DX,77211156 + 6a Victoria Square,M4 5DX,77211157 +7a Victoria Square,M4 5DX,77211158 +8a Victoria Square,M4 5DX,77211159 +9a Victoria Square,M4 5DX,77211160 +10a Victoria Square,M4 5DX,77211161 +11a Victoria Square,M4 5DX,77211162 +12a Victoria Square,M4 5DX,77211163 +13a Victoria Square,M4 5DX,77211164 +14a Victoria Square,M4 5DX,77211165 +15a Victoria Square,M4 5DX,77211166 +16a Victoria Square,M4 5DX,77211167 +17a Victoria Square,M4 5DX,77211168 +18a Victoria Square,M4 5DX,77211169 +19a Victoria Square,M4 5DX,77211170 +20a Victoria Square,M4 5DX,77211171 +21a Victoria Square,M4 5DY,77211172 +22a Victoria Square,M4 5DY,None +23a Victoria Square,M4 5DY,77211174 +24a Victoria Square,M4 5DY,77211175 +25a Victoria Square,M4 5DY,77211176 +26a Victoria Square,M4 5DY,77211177 +27a Victoria Square,M4 5DY,77211178 +28a Victoria Square,M4 5DY,None +29a Victoria Square,M4 5DY,77211180 +30a Victoria Square,M4 5DY,77211181 +31a Victoria Square,M4 5DY,77211182 +32a Victoria Square,M4 5DY,77211183 +33a Victoria Square,M4 5DY,77211184 +34a Victoria Square,M4 5DY,77211185 +35a Victoria Square,M4 5DY,None +36a Victoria Square,M4 5DY,77211187 +37a Victoria Square,M4 5DY,77211188 +38a Victoria Square,M4 5DY,77211189 +39a Victoria Square,M4 5DY,77211190 +40a Victoria Square,M4 5DY,None +41a Victoria Square,M4 5DY,77211192 +42a Victoria Square,M4 5DY,77211193 +43a Victoria Square,M4 5DY,77211194 +44a Victoria Square,M4 5DY,77211195 +45a Victoria Square,M4 5DY,77211196 +46a Victoria Square,M4 5DY,77211197 +47a Victoria Square,M4 5DY,77211198 +48a Victoria Square,M4 5DY,77211199 +49a Victoria Square,M4 5DY,77211200 +50a Victoria Square,M4 5DY,77211201 +51a Victoria Square,M4 5DY,77211202 +52a Victoria Square,M4 5DY,77211203 +53a Victoria Square,M4 5DY,77211204 +54a Victoria Square,M4 5DY,77211205 +55a Victoria 
Square,M4 5DY,77211206 +56a Victoria Square,M4 5DZ,77211207 +57a Victoria Square,M4 5DZ,None +58a Victoria Square,M4 5DZ,77211209 +59a Victoria Square,M4 5DZ,77211210 +60a Victoria Square,M4 5DZ,77211211 +61a Victoria Square,M4 5DZ,77211212 +62a Victoria Square,M4 5DZ,77211213 +63a Victoria Square,M4 5DZ,None +64a Victoria Square,M4 5DZ,77211215 +65a Victoria Square,M4 5DZ,77211216 +66a Victoria Square,M4 5DZ,None +67a Victoria Square,M4 5DZ,None +68a Victoria Square,M4 5DZ,77211219 +69a Victoria Square,M4 5DZ,77211220 +70a Victoria Square,M4 5DZ,77211221 +71a Victoria Square,M4 5DZ,77211222 +72a Victoria Square,M4 5DZ,77211223 +73a Victoria Square,M4 5DZ,77211224 +74a Victoria Square,M4 5DZ,None +75a Victoria Square,M4 5DZ,77211226 +76a Victoria Square,M4 5DZ,77211227 +77a Victoria Square,M4 5DZ,None +78a Victoria Square,M4 5DZ,77211229 +79a Victoria Square,M4 5DZ,77211230 +80a Victoria Square,M4 5DZ,77211231 +81a Victoria Square,M4 5DZ,77211232 +82 Victoria Square,M4 5DZ,None +83a Victoria Square,M4 5DZ,77211234 +84a Victoria Square,M4 5DZ,None +85a Victoria Square,M4 5DZ,77211236 +86a Victoria Square,M4 5DZ,77211237 +87a Victoria Square,M4 5DZ,77211238 +88a Victoria Square,M4 5DZ,None +89a Victoria Square,M4 5DZ,77211240 +90a Victoria Square,M4 5DZ,77211241 +91a Victoria Square,M4 5DZ,77211242 +92a Victoria Square,M4 5DZ,77211243 +93a Victoria Square,M4 5EA,77211244 +94a Victoria Square,M4 5EA,None +95a Victoria Square,M4 5EA,77211246 +96a Victoria Square,M4 5EA,77211247 +97a Victoria Square,M4 5EA,77211248 +98a Victoria Square,M4 5EA,77211249 +99a Victoria Square,M4 5EA,77211250 +100a Victoria Square,M4 5EA,77211251 +101a Victoria Square,M4 5EA,None +102a Victoria Square,M4 5EA,None +103a Victoria Square,M4 5EA,77211254 +104a Victoria Square,M4 5EA,77211255 +105a Victoria Square,M4 5EA,None +106a Victoria Square,M4 5EA,77211257 +107a Victoria Square,M4 5EA,77211258 +108a Victoria Square,M4 5EA,77211259 +109a Victoria Square,M4 5EA,77211260 +110a Victoria 
Square,M4 5EA,77211261 +111a Victoria Square,M4 5EA,77211262 +112a Victoria Square,M4 5EA,None +113a Victoria Square,M4 5EA,77211264 +114a Victoria Square,M4 5EA,77211265 +115a Victoria Square,M4 5EA,77211266 +116a Victoria Square,M4 5EA,77211267 +117a Victoria Square,M4 5EA,None +118a Victoria Square,M4 5EA,None +119a Victoria Square,M4 5EA,77211270 +120a Victoria Square,M4 5EA,77211271 +121a Victoria Square,M4 5EA,77211272 +122a Victoria Square,M4 5EA,77211273 +123a Victoria Square,M4 5EA,77211274 +124a Victoria Square,M4 5EA,None +125a Victoria Square,M4 5EA,77211276 +126a Victoria Square,M4 5EA,77211277 +127a Victoria Square,M4 5EA,77211278 +128a Victoria Square,M4 5EA,77211279 +129a Victoria Square,M4 5EA,77211280 +130a Victoria Square,M4 5FA,77211281 +131a Victoria Square,M4 5FA,77211282 +132a Victoria Square,M4 5FA,77211283 +133a Victoria Square,M4 5FA,None +134a Victoria Square,M4 5FA,77211285 +135a Victoria Square,M4 5FA,77211286 +136a Victoria Square,M4 5FA,77211287 +137a Victoria Square,M4 5FA,77211288 +138a Victoria Square,M4 5FA,77211289 +139a Victoria Square,M4 5FA,77211290 +140a Victoria Square,M4 5FA,77211291 +141a Victoria Square,M4 5FA,77211292 +142a Victoria Square,M4 5FA,77211293 +143a Victoria Square,M4 5FA,77211294 +144a Victoria Square,M4 5FA,77211295 +145a Victoria Square,M4 5FA,None +146a Victoria Square,M4 5FA,77211297 +147a Victoria Square,M4 5FA,77211298 +148a Victoria Square,M4 5FA,77211299 +149a Victoria Square,M4 5FA,77211300 +150a Victoria Square,M4 5FA,77211301 +151a Victoria Square,M4 5FA,None +152a Victoria Square,M4 5FA,77211303 +153a Victoria Square,M4 5FA,None +154a Victoria Square,M4 5FA,77211305 +155a Victoria Square,M4 5FA,None +156a Victoria Square,M4 5FA,77211307 +157a Victoria Square,M4 5FA,77211308 +158a Victoria Square,M4 5FA,77211309 +159a Victoria Square,M4 5FA,None +160a Victoria Square,M4 5FA,77211311 +161a Victoria Square,M4 5FA,None +162a Victoria Square,M4 5FA,None +163a Victoria Square,M4 5FA,77211314 +164a 
Victoria Square,M4 5FA,77211315 +165a Victoria Square,M4 5FA,77211316 +166a Victoria Square,M4 5FA,None +"FLAT 3; 42 MORETON ROAD, SOUTH CROYDON, SURREY",CR2 7DL,None \ No newline at end of file diff --git a/backend/app/config.py b/backend/app/config.py index dd3f5db1..b335c215 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -42,7 +42,7 @@ class Settings(BaseSettings): AWS_DEFAULT_REGION: Optional[str] = None class Config: - env_file = "backend/.env" + env_file = "backend/.env.local" @lru_cache() diff --git a/backend/app/db/connection.py b/backend/app/db/connection.py index 74f3bd2e..f0649c71 100644 --- a/backend/app/db/connection.py +++ b/backend/app/db/connection.py @@ -3,7 +3,9 @@ from contextlib import contextmanager from backend.app.config import get_settings from sqlmodel import Session -connection_string = "postgresql+{drivername}://{username}:{password}@{server}:{port}/{dbname}" +connection_string = ( + "postgresql+{drivername}://{username}:{password}@{server}:{port}/{dbname}" +) db_string = connection_string.format( drivername="psycopg2", # You'll need to use psycopg2 driver for PostgreSQL username=get_settings().DB_USERNAME, @@ -28,7 +30,9 @@ db_engine = create_engine( def get_db_session(): if db_engine is None: - raise RuntimeError("Database is not configured. Set DATABASE_URL in environment variables.") + raise RuntimeError( + "Database is not configured. Set DATABASE_URL in environment variables." 
class PropertyConditionSurveyModel(Base):
    """ORM model for one condition survey of a single property.

    Maps to the ``property_condition_survey`` table. A survey owns a list of
    ``ElementModel`` rows through the ``elements`` relationship.
    """

    __tablename__ = "property_condition_survey"

    # Surrogate primary key, auto-incremented.
    id = Column(BigInteger, primary_key=True, autoincrement=True)
    # UPRN of the surveyed property; required.
    uprn = Column(BigInteger, nullable=False)

    # Survey date and source label; both required.
    date = Column(Date, nullable=False)
    source = Column(String, nullable=False)

    # Child elements of this survey. The "all, delete-orphan" cascade means
    # elements are saved and deleted together with the survey, and an element
    # removed from this list is deleted rather than left without a parent.
    elements = relationship(
        "ElementModel",
        back_populates="survey",
        cascade="all, delete-orphan",
    )
class AspectConditionModel(Base):
    """ORM model for one recorded aspect of a surveyed element.

    Maps to the ``aspect_condition`` table. Each row belongs to exactly one
    ``ElementModel`` via ``element_id``.
    """

    __tablename__ = "aspect_condition"  # TODO: rename to survey_aspect?

    # Surrogate primary key, auto-incremented.
    id = Column(BigInteger, primary_key=True, autoincrement=True)

    # Required foreign key to the owning row in the "element" table.
    element_id = Column(
        BigInteger,
        ForeignKey("element.id"),
        nullable=False,
    )

    # Which aspect is described (native database enum) and its instance
    # number; both required.
    aspect_type = Column(AspectTypeDb, nullable=False)
    aspect_instance = Column(BigInteger, nullable=False)

    # Optional details; every column below is nullable.
    value = Column(String)
    quantity = Column(Integer)
    install_date = Column(Date)
    renewal_year = Column(Integer)
    comments = Column(String)

    # Back-reference to the owning element; pairs with
    # ElementModel.aspect_conditions.
    element = relationship(
        "ElementModel",
        back_populates="aspect_conditions",
    )
Copy sample input file(s) into the `sample_data/` directory. If working with Peabody data, you'll need the Landlord Reference / UPRN lookup file as well. 2. Update `local_runner.py` as required, specifically the definitions of: - `lbwf_path` - `peabody_path` diff --git a/backend/condition/local_runner.py b/backend/condition/local_runner.py index 404f64d4..e39d38c7 100644 --- a/backend/condition/local_runner.py +++ b/backend/condition/local_runner.py @@ -21,6 +21,8 @@ def main() -> None: / "2026_01_06 - Peabody - Stock Condition Data - Survey Records - D Lower.xlsx" ) filepaths = [lbwf_path, peabody_path] + # filepaths = [lbwf_path] + # filepaths = [peabody_path] for fp in filepaths: with fp.open("rb") as f: diff --git a/backend/condition/parsing/lbwf_parser.py b/backend/condition/parsing/lbwf_parser.py index 14d2efe4..3a23d028 100644 --- a/backend/condition/parsing/lbwf_parser.py +++ b/backend/condition/parsing/lbwf_parser.py @@ -1,4 +1,4 @@ -from typing import BinaryIO, Any, Dict, Iterator, List, Tuple +from typing import BinaryIO, Any, Dict, Iterator, List, Optional, Tuple from openpyxl import Workbook, load_workbook from collections import defaultdict @@ -15,7 +15,11 @@ logger = setup_logger() class LbwfParser(Parser): - def parse(self, file_stream: BinaryIO) -> Any: + def parse( + self, + file_stream: BinaryIO, + location_ref_to_uprn_map: Optional[Dict[str, int]] = None, + ) -> Any: wb: Workbook = load_workbook(file_stream) address_to_uprn_map: Dict[str, int] = LbwfParser._generate_address_to_uprn_dict( wb diff --git a/backend/condition/parsing/parser.py b/backend/condition/parsing/parser.py index 105fda36..825abcd5 100644 --- a/backend/condition/parsing/parser.py +++ b/backend/condition/parsing/parser.py @@ -1,8 +1,13 @@ from abc import ABC, abstractmethod -from typing import BinaryIO, Any +from typing import BinaryIO, Any, Dict, Optional + class Parser(ABC): @abstractmethod - def parse(self, file_stream: BinaryIO) -> Any: - pass \ No newline at end of file + 
@staticmethod
def _build_location_ref_to_uprn_map(
    filepath: Optional[Path] = None,
) -> Dict[str, int]:
    """Load the Peabody location-reference -> UPRN lookup from a CSV file.

    The CSV must have a ``reference`` column (the location reference) and an
    ``out_uprn`` column (the matched UPRN).

    Args:
        filepath: CSV file to read. Defaults to the bundled
            ``sample_data/peabody/PeabodyPropertymatched_Dec25_propref_UPRN.csv``
            next to the ``condition`` package.

    Returns:
        Mapping of location reference to integer UPRN. Rows with an empty
        reference, or with a UPRN that is missing or not an integer (for
        example a blank cell or the literal text ``None``), are left out.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If a required column is absent from the file.
    """
    if filepath is None:
        filepath = (
            Path(__file__).resolve().parents[1]
            / "sample_data"
            / "peabody"
            / "PeabodyPropertymatched_Dec25_propref_UPRN.csv"
        )

    location_ref_to_uprn_map: Dict[str, int] = {}
    skipped = 0

    # utf-8-sig drops a leading byte-order mark, which would otherwise turn
    # the first header into "\ufeffreference" and break the column lookup.
    with filepath.open(newline="", encoding="utf-8-sig") as f:
        for row in csv.DictReader(f):
            reference = row["reference"]
            try:
                uprn = int(row["out_uprn"])
            except (TypeError, ValueError):
                # An unmatched property must not abort the whole load.
                skipped += 1
                continue

            if not reference:
                skipped += 1
                continue

            location_ref_to_uprn_map[reference] = uprn

    if skipped:
        logger.warning(
            f"Skipped {skipped} location reference row(s) without a usable UPRN"
        )

    return location_ref_to_uprn_map
location_ref, grouped_assets in assets_by_location_reference.items(): + + uprn = location_ref_to_uprn_map.get(location_ref) if uprn is None: - logger.warning(f"No UPRN found for address: {address}") + logger.warning(f"No UPRN found for Location Reference: {location_ref}") continue properties.append( @@ -77,7 +110,6 @@ class PeabodyParser(Parser): return properties - @staticmethod def _map_row_to_asset_record( row: Any | Tuple[object | None, ...], @@ -102,39 +134,9 @@ class PeabodyParser(Parser): condition_survey_date=row[header_indexes["condition_survey_date"]], ) - @staticmethod - def _generate_address_to_uprn_dict(wb: Workbook) -> Dict[str, int | None]: - sheet = wb["Survey Records - D & Lower"] - rows: Iterator[Tuple[object | None, ...]] = sheet.iter_rows(values_only=True) - - headers = next(rows) - header_indexes: Dict[str, int] = PeabodyParser._get_column_indexes_by_name(headers) - - address_idx = header_indexes["full_address"] - - - address_to_uprn: Dict[str, int] = {} - # Generate random UPRNs for now - next_uprn = 1 # TODO: get real UPRNs - - for row in rows: - address = row[address_idx] - - if address is None: - continue - - address = address.strip() - - if address not in address_to_uprn: - address_to_uprn[address] = next_uprn - next_uprn += 1 - - return address_to_uprn - - @staticmethod def _get_column_indexes_by_name( - headers: Tuple[object | None, ...] 
class ConditionPostgres:
    """Write mapped property condition surveys to Postgres."""

    def bulk_insert_surveys(
        self, surveys: List[PropertyConditionSurvey], batch_size: Optional[int] = 100
    ) -> None:
        """Map domain surveys to ORM models and insert them in batches.

        Args:
            surveys: Domain surveys to persist.
            batch_size: Number of surveys written per commit. ``None`` writes
                everything in a single batch.

        Raises:
            ValueError: If ``batch_size`` is zero or negative.
        """
        # Validate before doing any work: range() rejects a zero step and
        # silently yields nothing for a negative one.
        if batch_size is not None and batch_size <= 0:
            raise ValueError(
                f"batch_size must be a positive integer or None, got {batch_size}"
            )

        logger.info(
            f"Preparing to load {len(surveys)} property surveys to Postgres. Mapping to SQLModel objects..."
        )
        survey_models: List[PropertyConditionSurveyModel] = [
            ConditionPostgres.map_survey_to_model(s) for s in surveys
        ]
        total: int = len(survey_models)

        if total == 0:
            # Nothing to write, so do not open a database session at all.
            logger.info("No surveys to insert; nothing written to the database.")
            return

        effective_batch_size: int = total if batch_size is None else batch_size
        logger.info(
            f"Finished mapping {total} surveys. Writing to database in batches of {effective_batch_size}..."
        )

        with db_session() as session:
            for start in range(0, total, effective_batch_size):
                end = min(start + effective_batch_size, total)
                batch = survey_models[start:end]

                t0: float = time.perf_counter()
                ConditionPostgres._insert_surveys_batch(batch, session)
                elapsed: float = time.perf_counter() - t0

                logger.info(
                    f"Inserted batch {start} - {end} ({len(batch)} surveys) in {elapsed} seconds"
                )

    @staticmethod
    def map_survey_to_model(
        survey: PropertyConditionSurvey,
    ) -> PropertyConditionSurveyModel:
        """Convert a domain survey into an ORM model tree.

        Elements and their aspect conditions are copied field by field and
        attached through the relationship lists, in their original order.
        """
        survey_model = PropertyConditionSurveyModel(
            uprn=survey.uprn,
            date=survey.date,
            source=survey.source,
            elements=[],
        )

        for element in survey.elements:
            element_model = ElementModel(
                element_type=element.element_type,
                element_instance=element.element_instance,
                aspect_conditions=[],
            )

            for aspect in element.aspect_conditions:
                aspect_model = AspectConditionModel(
                    aspect_type=aspect.aspect_type,
                    aspect_instance=aspect.aspect_instance,
                    value=aspect.value,
                    quantity=aspect.quantity,
                    install_date=aspect.install_date,
                    renewal_year=aspect.renewal_year,
                    comments=aspect.comments,
                )

                element_model.aspect_conditions.append(aspect_model)

            survey_model.elements.append(element_model)

        return survey_model

    @staticmethod
    def _insert_surveys_batch(
        surveys: List[PropertyConditionSurveyModel], session: Session
    ) -> None:
        """Add one batch of survey models to the session and commit it."""
        session.add_all(surveys)
        session.commit()
backend.condition.persistence.condition_postgres import ConditionPostgres from backend.condition.file_type import FileType, detect_file_type from backend.condition.parsing.factory import select_parser, select_mapper +logger = setup_logger() + def process_file(file_stream: BinaryIO, source_key: str) -> None: - print(f"[processor] Received file: {source_key}") + logger.info(f"[processor] Received file: {source_key}") # Instantiation file_type: FileType = detect_file_type(source_key) parser: Parser = select_parser(file_type) mapper: Mapper = select_mapper(file_type) + persistence = ConditionPostgres() # Orchestration raw_properties: List[Any] = parser.parse(file_stream) + logger.info( + f"[processor] Finished loading customer survey data for {len(raw_properties)} properties. Mapping..." + ) + survey_year = datetime.now().year # TODO: get this from filepath or elsewhere property_condition_surveys: List[PropertyConditionSurvey] = [] @@ -29,4 +37,10 @@ def process_file(file_stream: BinaryIO, source_key: str) -> None: mapper.map_asset_conditions_for_property(p, survey_year) ) - print("done") # temp + logger.info( + f"[processor] Finished mapping {len(property_condition_surveys)} properties. Writing to database..." 
@staticmethod
def assert_property_condition_survey_model_matches_expected(
    actual_model: "PropertyConditionSurveyModel",
    expected: dict,
) -> None:
    """Assert that a survey ORM model matches a plain-dict expectation.

    Declared as a static method because it takes no ``self``; without the
    decorator, a call made on an instance would bind the instance as
    ``actual_model``.

    Args:
        actual_model: Model under test.
        expected: Dict with ``uprn``, ``date``, ``source`` and ``elements``.
            Each element dict has ``element_type``, ``element_instance`` and
            ``aspects``; each aspect dict maps attribute names to expected
            values. Only the keys present in an aspect dict are compared.

    Raises:
        AssertionError: On the first mismatch found.
    """
    assert actual_model.uprn == expected["uprn"], "UPRN differs"
    assert actual_model.date == expected["date"], "Date differs"
    assert actual_model.source == expected["source"], "Source differs"

    assert len(actual_model.elements) == len(expected["elements"]), (
        f"Expected {len(expected['elements'])} elements, "
        f"got {len(actual_model.elements)}"
    )

    # The length checks above and below make the zip() pairings exhaustive.
    for i, (actual_element, expected_element) in enumerate(
        zip(actual_model.elements, expected["elements"])
    ):
        assert (
            actual_element.element_type == expected_element["element_type"]
        ), f"Element[{i}].element_type differs"
        assert (
            actual_element.element_instance == expected_element["element_instance"]
        ), f"Element[{i}].element_instance differs"

        assert len(actual_element.aspect_conditions) == len(
            expected_element["aspects"]
        ), f"Element[{i}] aspect count differs"

        for j, (actual_aspect, expected_aspect) in enumerate(
            zip(actual_element.aspect_conditions, expected_element["aspects"])
        ):
            prefix = f"Element[{i}].Aspect[{j}]"

            for key, value in expected_aspect.items():
                assert getattr(actual_aspect, key) == value, (
                    f"{prefix}.{key} differs: "
                    f"{getattr(actual_aspect, key)} != {value}"
                )
diff --git a/backend/condition/tests/parsing/test_peabody_parser.py b/backend/condition/tests/parsing/test_peabody_parser.py index 32ff79d8..20f7a28e 100644 --- a/backend/condition/tests/parsing/test_peabody_parser.py +++ b/backend/condition/tests/parsing/test_peabody_parser.py @@ -1,127 +1,141 @@ import pytest -from typing import Any +from typing import Any, Dict from io import BytesIO from openpyxl import Workbook from datetime import datetime from backend.condition.parsing.peabody_parser import PeabodyParser -from backend.condition.parsing.records.peabody.peabody_asset_condition import PeabodyAssetCondition +from backend.condition.parsing.records.peabody.peabody_asset_condition import ( + PeabodyAssetCondition, +) from backend.condition.parsing.records.peabody.peabody_property import PeabodyProperty + @pytest.fixture def peabody_assets_xlsx_bytes() -> BytesIO: wb = Workbook() survey_records_d_and_lower = wb.active survey_records_d_and_lower.title = "Survey Records - D & Lower" - survey_records_d_and_lower.append([ - "Lo_Reference", - "full_address", - "location_type_code", - "Parent_Lo_Reference", - "Element_Code", - "Element", - "Sub_Element_Code", - "Sub_Element", - "Material_Code", - "material_or_answer", - "Renewal_Quantity", - "Renewal_Year", - "Renewal_Cost", - "cloned", - "lo_type_code", - "condition_survey_date", - ]) - survey_records_d_and_lower.append([ - "B000RAND", - "1 RANDOM HOUSE LONDON", - 3, - "RAND2EST", - 110, - "ROOFS", - 1, - "Primary Roof", - 9, - "Other", - 3, - 2054, - 330, - "N", - 3, - datetime(2025,12,4,9,17,0) - ]) - survey_records_d_and_lower.append([ - "B000BLOCK", - "1100 BLOCK", - 3, - "RAND2EST", - 110, - "ROOFS", - 1, - "Primary Roof", - 9, - "Other", - 3, - 2054, - 330, - "N", - 3, - datetime(2025,12,4,9,17,0) - ]) - survey_records_d_and_lower.append([ - "B000FAKE", - "3 FAKE CLOSE LONDON", - 3, - "FAKEEST", - 100, - "GENERAL", - 15, - "External Decoration", - 2, - "Normal", - 1, - 2035, - 1500.7, - "N", - 3, - 
datetime(2025,7,5,0,0,0) - ]) - survey_records_d_and_lower.append([ - "B000MIS", - "99 MISC ROAD LONDON", - 3, - "300828", - 54, - "HHSRS", - 29, - "HHSRS Structural Collapse & Falling Elements", - 4, - "HHSRS Moderate", - 2, - 2027, - None, - "N", - 3, - None - ]) - survey_records_d_and_lower.append([ - "B000MIS", - "99 MISC ROAD LONDON", - 3, - "300828", - 53, - "External", - 2, - "Chimney", - 2, - "Present", - 33, - 2053, - 3531, - "N", - 3, - None - ]) - + survey_records_d_and_lower.append( + [ + "Lo_Reference", + "full_address", + "location_type_code", + "Parent_Lo_Reference", + "Element_Code", + "Element", + "Sub_Element_Code", + "Sub_Element", + "Material_Code", + "material_or_answer", + "Renewal_Quantity", + "Renewal_Year", + "Renewal_Cost", + "cloned", + "lo_type_code", + "condition_survey_date", + ] + ) + survey_records_d_and_lower.append( + [ + "B000RAND", + "1 RANDOM HOUSE LONDON", + 3, + "RAND2EST", + 110, + "ROOFS", + 1, + "Primary Roof", + 9, + "Other", + 3, + 2054, + 330, + "N", + 3, + datetime(2025, 12, 4, 9, 17, 0), + ] + ) + survey_records_d_and_lower.append( + [ + "B000BLOCK", + "1100 BLOCK", + 3, + "RAND2EST", + 110, + "ROOFS", + 1, + "Primary Roof", + 9, + "Other", + 3, + 2054, + 330, + "N", + 3, + datetime(2025, 12, 4, 9, 17, 0), + ] + ) + survey_records_d_and_lower.append( + [ + "B000FAKE", + "3 FAKE CLOSE LONDON", + 3, + "FAKEEST", + 100, + "GENERAL", + 15, + "External Decoration", + 2, + "Normal", + 1, + 2035, + 1500.7, + "N", + 3, + datetime(2025, 7, 5, 0, 0, 0), + ] + ) + survey_records_d_and_lower.append( + [ + "B000MIS", + "99 MISC ROAD LONDON", + 3, + "300828", + 54, + "HHSRS", + 29, + "HHSRS Structural Collapse & Falling Elements", + 4, + "HHSRS Moderate", + 2, + 2027, + None, + "N", + 3, + None, + ] + ) + survey_records_d_and_lower.append( + [ + "B000MIS", + "99 MISC ROAD LONDON", + 3, + "300828", + 53, + "External", + 2, + "Chimney", + 2, + "Present", + 33, + 2053, + 3531, + "N", + 3, + None, + ] + ) stream = BytesIO() 
wb.save(stream) @@ -129,18 +143,32 @@ def peabody_assets_xlsx_bytes() -> BytesIO: return stream -def test_peabody_parser_parses_conditions(peabody_assets_xlsx_bytes): + +@pytest.fixture +def location_ref_to_uprn_map() -> Dict[str, int]: + return { + "B000RAND": 1, + "B000BLOCK": 2, + "B000FAKE": 3, + "B000MIS": 4, + } + + +def test_peabody_parser_parses_conditions( + peabody_assets_xlsx_bytes, location_ref_to_uprn_map +): # arrange parser = PeabodyParser() # act - result: Any = parser.parse(peabody_assets_xlsx_bytes) + result: Any = parser.parse(peabody_assets_xlsx_bytes, location_ref_to_uprn_map) # assert assert len(result) == 3 assert all(isinstance(item, PeabodyProperty) for item in result) + @pytest.fixture def asset_condition_factory(): def _factory(full_address: str) -> PeabodyAssetCondition: @@ -165,6 +193,7 @@ def asset_condition_factory(): return _factory + @pytest.mark.parametrize( "full_address, expected_block_level", [ @@ -175,7 +204,7 @@ def asset_condition_factory(): ("81A-B GORE ROAD LONDON", True), ("73 & 74 HARVEST COURT ST. 
ALBANS", True), ("25 HAVERSHAM COURT GREENFORD", False), - ("FLAT 10 SPARROW COURT SOUTHMERE DRIVE LONDON SE2 9ES", False) + ("FLAT 10 SPARROW COURT SOUTHMERE DRIVE LONDON SE2 9ES", False), ], ) def test_peabody_asset_is_block_level( @@ -187,4 +216,4 @@ def test_peabody_asset_is_block_level( asset_condition = asset_condition_factory(full_address) # act + assert - assert asset_condition.is_block_level == expected_block_level \ No newline at end of file + assert asset_condition.is_block_level == expected_block_level diff --git a/backend/condition/tests/persistence/test_condition_postgres.py b/backend/condition/tests/persistence/test_condition_postgres.py new file mode 100644 index 00000000..ca95eaaa --- /dev/null +++ b/backend/condition/tests/persistence/test_condition_postgres.py @@ -0,0 +1,164 @@ +import pytest +from datetime import date + +from backend.condition.persistence.condition_postgres import ConditionPostgres +from backend.condition.domain.property_condition_survey import PropertyConditionSurvey +from backend.condition.domain.element import Element +from backend.condition.domain.element_type import ElementType +from backend.condition.domain.aspect_condition import AspectCondition +from backend.condition.domain.aspect_type import AspectType +from backend.app.db.models.condition import PropertyConditionSurveyModel +from backend.condition.tests.custom_asserts import CustomAsserts + + +def test_map_survey_to_model() -> None: + # arrange + survey = PropertyConditionSurvey( + uprn=1, + elements=[ + Element( + element_type=ElementType.EXTERNAL_WINDOWS, + element_instance=1, + aspect_conditions=[ + AspectCondition( + aspect_type=AspectType.MATERIAL, + aspect_instance=1, + value="UPVC Double Glazed", + quantity=8, + install_date=None, + renewal_year=2036, + comments=None, + ), + ], + ), + Element( + element_type=ElementType.EXTERNAL_DECORATION, + element_instance=1, + aspect_conditions=[ + AspectCondition( + aspect_type=AspectType.CONDITION, + aspect_instance=1, + 
value="Normal", + quantity=1, + install_date=None, + renewal_year=2029, + comments=None, + ) + ], + ), + Element( + element_type=ElementType.EXTERNAL_WALL, + element_instance=1, + aspect_conditions=[ + AspectCondition( + aspect_type=AspectType.FINISH, + aspect_instance=1, + value="Pointed", + quantity=65, + install_date=None, + renewal_year=2045, + comments=None, + ), + AspectCondition( + aspect_type=AspectType.FINISH, + aspect_instance=1, + value="Pointing", + quantity=1, + install_date=None, + renewal_year=2069, + comments=None, + ), + AspectCondition( + aspect_type=AspectType.FINISH, + aspect_instance=2, + value="Tile Hung", + quantity=8, + install_date=None, + renewal_year=2049, + comments=None, + ), + ], + ), + ], + date=date(2000, 1, 1), + source="Peabody", + ) + + expected = { + "uprn": 1, + "date": date(2000, 1, 1), + "source": "Peabody", + "elements": [ + { + "element_type": ElementType.EXTERNAL_WINDOWS, + "element_instance": 1, + "aspects": [ + { + "aspect_type": AspectType.MATERIAL, + "aspect_instance": 1, + "value": "UPVC Double Glazed", + "quantity": 8, + "install_date": None, + "renewal_year": 2036, + "comments": None, + } + ], + }, + { + "element_type": ElementType.EXTERNAL_DECORATION, + "element_instance": 1, + "aspects": [ + { + "aspect_type": AspectType.CONDITION, + "aspect_instance": 1, + "value": "Normal", + "quantity": 1, + "install_date": None, + "renewal_year": 2029, + "comments": None, + } + ], + }, + { + "element_type": ElementType.EXTERNAL_WALL, + "element_instance": 1, + "aspects": [ + { + "aspect_instance": 1, + "value": "Pointed", + "quantity": 65, + "install_date": None, + "renewal_year": 2045, + "comments": None, + }, + { + "aspect_type": AspectType.FINISH, + "aspect_instance": 1, + "value": "Pointing", + "quantity": 1, + "install_date": None, + "renewal_year": 2069, + "comments": None, + }, + { + "aspect_type": AspectType.FINISH, + "aspect_instance": 2, + "value": "Tile Hung", + "quantity": 8, + "install_date": None, + 
"renewal_year": 2049, + "comments": None, + }, + ], + }, + ], + } + + # act + model: PropertyConditionSurveyModel = ConditionPostgres.map_survey_to_model(survey) + + # assert (survey level) + CustomAsserts.assert_property_condition_survey_model_matches_expected( + model, + expected, + ) diff --git a/backend/engine/requirements.txt b/backend/engine/requirements.txt index b565e9d3..5cca1211 100644 --- a/backend/engine/requirements.txt +++ b/backend/engine/requirements.txt @@ -1,3 +1,4 @@ + # Pandas and numpy numpy==2.1.2 pandas==2.2.3 @@ -22,4 +23,4 @@ pyarrow==17.0.0 fastparquet==2024.5.0 aiohttp==3.10.10 # find my epc -beautifulsoup4 +beautifulsoup4 \ No newline at end of file diff --git a/backend/postcode_splitter/hackney.xlsx b/backend/postcode_splitter/hackney.xlsx new file mode 100644 index 00000000..64892f3a Binary files /dev/null and b/backend/postcode_splitter/hackney.xlsx differ diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py new file mode 100644 index 00000000..d417c8f1 --- /dev/null +++ b/backend/postcode_splitter/main.py @@ -0,0 +1,114 @@ +import pandas as pd +import requests +from backend.address2UPRN.main import resolve_uprns_for_postcode_group, get_epc_data_with_postcode +from tqdm import tqdm + + + +def sanitise_postcode(postcode: str) -> str | None: + """ + Normalise postcode for grouping. + + - Uppercase + - Remove all whitespace + """ + if pd.isna(postcode): + return None + + return postcode.upper().replace(" ", "") + + +def is_valid_postcode(postcode_clean: str) -> bool: + """ + Validate postcode using postcodes.io. + + Expects a sanitised postcode (e.g. E84SQ). + Returns True if valid, False otherwise. 
+ """ + POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" + if not postcode_clean: + return False + + try: + resp = requests.get( + POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), + timeout=5, + ) + resp.raise_for_status() + return resp.json().get("result", False) + except requests.RequestException: + # Network issues, rate limits, etc. + return False + + +def main(): + df = pd.read_excel("hackney.xlsx", sheet_name="Sustainability") + df = df.head(500) + + # Sanitise postcodes + df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode) + + # --- validate AFTER grouping (save API calls) --- + + # Get unique, non-null postcodes + unique_postcodes = ( + df["postcode_clean"] + .dropna() + .unique() + ) + + # Validate each postcode once, TODOadd a progress bar + postcode_validity = { + pc: is_valid_postcode(pc) + for pc in tqdm(unique_postcodes, total=len(unique_postcodes)) + } + + # Map validity back onto dataframe + df["postcode_valid"] = df["postcode_clean"].map(postcode_validity) + + + results = [] + + for postcode, group_df in tqdm( + df[df["postcode_valid"]].groupby("postcode_clean"), + desc="Resolving UPRNs by postcode", + ): + try: + epc_df = get_epc_data_with_postcode(postcode) + + if epc_df.empty: + tmp = group_df.copy() + tmp["found_uprn"] = None + tmp["status"] = "no_epc_results" + results.append(tmp) + continue + + resolved = resolve_uprns_for_postcode_group( + group_df=group_df, + epc_df=epc_df, + ) + + results.append(resolved) + + except Exception as e: + tmp = group_df.copy() + tmp["found_uprn"] = None + tmp["status"] = "exception" + tmp["error"] = str(e) + results.append(tmp) + + final_df = pd.concat(results, ignore_index=True) + a = final_df[[ + "best_match_lexiscore","Address 1", + "best_match_address", "Postcode", + "UPRN", "best_match_uprn" + ]] # add levi score to viewing + b = final_df[final_df["best_match_lexiscore"]>0] # add levi score to viewing + b = b[[ + "best_match_lexiscore","Address 1", + 
"best_match_address", "Postcode", + "UPRN", "best_match_uprn" + ]] + +if __name__ == "__main__": + main() diff --git a/conftest.py b/conftest.py index e3add6e6..d93f0023 100644 --- a/conftest.py +++ b/conftest.py @@ -1,5 +1,11 @@ import os from backend.app.config import get_settings +import os +from dotenv import load_dotenv +import os + +# Load .env in conftest.py directory for local development +load_dotenv() DEFAULT_ENV = { "API_KEY": "test", @@ -8,7 +14,10 @@ DEFAULT_ENV = { "DATA_BUCKET": "test", "PLAN_TRIGGER_BUCKET": "test", "ENGINE_SQS_URL": "test", - "EPC_AUTH_TOKEN": "test", # overridden in GitHub Actions + "EPC_AUTH_TOKEN": os.getenv( + "EPC_AUTH_TOKEN", + "test", + ), # overridden in GitHub Actions "GOOGLE_SOLAR_API_KEY": "test", "DB_HOST": "localhost", "DB_USERNAME": "test", diff --git a/etl/customers/peabody/Nov 2025 Consulting Project/k_deck_stats.py b/etl/customers/peabody/Nov 2025 Consulting Project/k_deck_stats.py index b6fc0f8f..68655e80 100644 --- a/etl/customers/peabody/Nov 2025 Consulting Project/k_deck_stats.py +++ b/etl/customers/peabody/Nov 2025 Consulting Project/k_deck_stats.py @@ -1,111 +1,111 @@ import pandas as pd -epc_c_recommendations = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " - "solid floor, ashp 3.0 - corrected.xlsx" -) -epc_b_recommendations = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC B - no " - "solid floor, ashp 3.0 - corrected.xlsx" -) +# epc_c_recommendations = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " +# "solid floor, ashp 3.0 - corrected.xlsx" +# ) +# epc_b_recommendations = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC B - no " +# "solid floor, ashp 3.0 - corrected.xlsx" +# ) -epc_c_movers 
= epc_b_recommendations[ - epc_b_recommendations["current_epc_rating"] == "Epc.C" - ] -epc_c_movers["property_type"].value_counts() +# epc_c_movers = epc_b_recommendations[ +# epc_b_recommendations["current_epc_rating"] == "Epc.C" +# ] +# epc_c_movers["property_type"].value_counts() -house_epc_c_movers = epc_c_movers[ - epc_c_movers["property_type"] == "House" - ] -house_epc_c_movers_with_solar = house_epc_c_movers[ - ~pd.isnull(house_epc_c_movers["solar_pv"]) | ~pd.isnull(house_epc_c_movers["solar_pv_with_battery"]) - ] +# house_epc_c_movers = epc_c_movers[ +# epc_c_movers["property_type"] == "House" +# ] +# house_epc_c_movers_with_solar = house_epc_c_movers[ +# ~pd.isnull(house_epc_c_movers["solar_pv"]) | ~pd.isnull(house_epc_c_movers["solar_pv_with_battery"]) +# ] -house_epc_c_movers_with_a_heatpump = house_epc_c_movers[ - ~pd.isnull(house_epc_c_movers["air_source_heat_pump"]) -] +# house_epc_c_movers_with_a_heatpump = house_epc_c_movers[ +# ~pd.isnull(house_epc_c_movers["air_source_heat_pump"]) +# ] -flat_epc_c_movers = epc_c_movers[ - epc_c_movers["property_type"] == "Flat" - ] +# flat_epc_c_movers = epc_c_movers[ +# epc_c_movers["property_type"] == "Flat" +# ] -epc_c_recommendations["sap_points"].mean() -epc_c_recommendations["sap_points"].mean() +# epc_c_recommendations["sap_points"].mean() +# epc_c_recommendations["sap_points"].mean() -measure_cols = [ - "air_source_heat_pump", - "boiler_upgrade", - "cavity_wall_insulation", - "double_glazing", - "external_wall_insulation", - "flat_roof_insulation", - "high_heat_retention_storage_heaters", - "internal_wall_insulation", - "loft_insulation", - "low_energy_lighting", - "mechanical_ventilation", - "room_roof_insulation", - "roomstat_programmer_trvs", - "sealing_open_fireplace", - "secondary_glazing", - "secondary_heating", - "solar_pv", - "solar_pv_with_battery", - "suspended_floor_insulation", - "time_temperature_zone_control", -] +# measure_cols = [ +# "air_source_heat_pump", +# "boiler_upgrade", +# 
"cavity_wall_insulation", +# "double_glazing", +# "external_wall_insulation", +# "flat_roof_insulation", +# "high_heat_retention_storage_heaters", +# "internal_wall_insulation", +# "loft_insulation", +# "low_energy_lighting", +# "mechanical_ventilation", +# "room_roof_insulation", +# "roomstat_programmer_trvs", +# "sealing_open_fireplace", +# "secondary_glazing", +# "secondary_heating", +# "solar_pv", +# "solar_pv_with_battery", +# "suspended_floor_insulation", +# "time_temperature_zone_control", +# ] -epc_c_melted = ( - epc_c_recommendations - .melt( - id_vars=[c for c in epc_c_recommendations.columns if c not in measure_cols], - value_vars=measure_cols, - var_name="measure_type", - value_name="value", - ) - .dropna(subset=["value"]) -) -epc_c_melted = epc_c_melted[epc_c_melted["value"] > 0] -epc_c_measures = epc_c_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() +# epc_c_melted = ( +# epc_c_recommendations +# .melt( +# id_vars=[c for c in epc_c_recommendations.columns if c not in measure_cols], +# value_vars=measure_cols, +# var_name="measure_type", +# value_name="value", +# ) +# .dropna(subset=["value"]) +# ) +# epc_c_melted = epc_c_melted[epc_c_melted["value"] > 0] +# epc_c_measures = epc_c_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() -epc_b_melted = ( - epc_b_recommendations - .melt( - id_vars=[c for c in epc_b_recommendations.columns if c not in measure_cols], - value_vars=measure_cols, - var_name="measure_type", - value_name="value", - ) - .dropna(subset=["value"]) -) +# epc_b_melted = ( +# epc_b_recommendations +# .melt( +# id_vars=[c for c in epc_b_recommendations.columns if c not in measure_cols], +# value_vars=measure_cols, +# var_name="measure_type", +# value_name="value", +# ) +# .dropna(subset=["value"]) +# ) -epc_b_melted = epc_b_melted[epc_b_melted["value"] > 0] -epc_b_measures = epc_b_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() +# epc_b_melted = 
epc_b_melted[epc_b_melted["value"] > 0] +# epc_b_measures = epc_b_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() -measures_compared = epc_c_measures.merge( - epc_b_measures, - left_on="measure_type", - right_on="measure_type", - suffixes=("_epc_c", "_epc_b"), -) +# measures_compared = epc_c_measures.merge( +# epc_b_measures, +# left_on="measure_type", +# right_on="measure_type", +# suffixes=("_epc_c", "_epc_b"), +# ) -epc_c_retrofits = epc_c_recommendations[ - epc_c_recommendations["total_retrofit_cost"] > 0 - ] +# epc_c_retrofits = epc_c_recommendations[ +# epc_c_recommendations["total_retrofit_cost"] > 0 +# ] -epc_b_retrofits = epc_b_recommendations[ - epc_b_recommendations["total_retrofit_cost"] > 0 - ] +# epc_b_retrofits = epc_b_recommendations[ +# epc_b_recommendations["total_retrofit_cost"] > 0 +# ] -epc_c_retrofits["sap_points"].mean() -epc_b_retrofits["sap_points"].mean() +# epc_c_retrofits["sap_points"].mean() +# epc_b_retrofits["sap_points"].mean() -properties_in_both = epc_c_retrofits.merge(epc_b_retrofits, on="uprn", suffixes=("_epc_c", "_epc_b")) +# properties_in_both = epc_c_retrofits.merge(epc_b_retrofits, on="uprn", suffixes=("_epc_c", "_epc_b")) -properties_in_both["total_retrofit_cost_epc_c"].mean() -properties_in_both["sap_points_epc_c"].mean() -properties_in_both["total_retrofit_cost_epc_b"].mean() -properties_in_both["sap_points_epc_b"].mean() +# properties_in_both["total_retrofit_cost_epc_c"].mean() +# properties_in_both["sap_points_epc_c"].mean() +# properties_in_both["total_retrofit_cost_epc_b"].mean() +# properties_in_both["sap_points_epc_b"].mean() # Solar PV savings - we need the amount of solar PV bill savings from sqlalchemy.orm import sessionmaker @@ -114,16 +114,12 @@ from backend.app.db.models.recommendations import Recommendation, Plan, PlanReco from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel from collections import defaultdict -PORTFOLIO_ID = 435 # Peabody 
+PORTFOLIO_ID = 485 # Peabody SCENARIOS = [ - 908, - 909, - 910, + 970 ] scenario_names = { - 908: "EPC C - no solid floor, ashp 3.0", - 909: "EPC C - no solid floor, no EWI or IWI, ashp 3.0", - 910: "EPC B - no solid floor, no EWI, ashp 3.0" + 970: "EPC C - no solid floor, ashp 3.0", } @@ -236,307 +232,266 @@ recommendations_df = pd.DataFrame(recommendations_data) properties_df = pd.DataFrame(properties_data) plans_df = pd.DataFrame(plans_data) -s_id = 910 -ps_w_a_plan = plans_df[plans_df["scenario_id"] == s_id].copy() -# Take the newest by scenario id -ps_w_a_plan = ps_w_a_plan.sort_values("created_at", ascending=False).drop_duplicates( - subset=["property_id"] -) -z = ps_w_a_plan[ - ps_w_a_plan["cost_of_works"] > 0 - ].copy() -z2 = properties_df[properties_df["property_id"].isin(z["property_id"].values)] -# '', 'hot_water_cost_current', -# 'lighting_cost_current', 'appliances_cost_current', -# 'gas_standing_charge', 'electricity_standing_charge' -z2["total_bills"] = z2["heating_cost_current"] + z2["hot_water_cost_current"] + z2["lighting_cost_current"] + z2[ - "appliances_cost_current" -] + z2["gas_standing_charge"] + z2["electricity_standing_charge"] +with pd.ExcelWriter("hackney.xlsx", engine="openpyxl") as writer: + recommendations_df.to_excel(writer, sheet_name="recommendations", index=False) + properties_df.to_excel(writer, sheet_name="properties", index=False) -from tqdm import tqdm + +# solar_pv_recommendations = recommendations_df[recommendations_df["measure_type"] == "solar_pv"] +# average_savings = solar_pv_recommendations.groupby("scenario_id")["energy_cost_savings"].mean().reset_index() -# For a property ID, find a property where the no EWI/IWI approach is more expensive than the EWI approach -pids = properties_df["property_id"].unique() -for pid in tqdm(pids): - if pid in [603272, 550550, 574493]: - continue - # get the plans - property_plan = plans_df[plans_df["property_id"] == int(pid)] - # Take the newest plan by scenario id - property_plan = 
property_plan.sort_values("created_at", ascending=False).drop_duplicates( - subset=["scenario_id"] - ) - a = property_plan[property_plan["scenario_id"] == 909].squeeze() # no EWI/IWI - b = property_plan[property_plan["scenario_id"] == 908].squeeze() # EWI - if (a["cost_of_works"] > b["cost_of_works"]) and ( - a["post_epc_rating"].value == "C") and (b["cost_of_works"] > 5000): - bah +# # Check tenures +# initial_asset_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " +# "- Data Extracts for Domna.xlsx", +# sheet_name="Properties" +# ) +# sustainability_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " +# "- Data Extracts for Domna.xlsx", +# sheet_name="Sustainability" +# ) -solar_pv_recommendations = recommendations_df[ - recommendations_df["measure_type"] == "solar_pv" - ] +# sustainability_sample = sustainability_data[ +# sustainability_data["UPRN"].isin(properties_df["uprn"].astype(int).astype(str).values) +# ] -solid_wall_recommendation = recommendations_df[ - recommendations_df["scenario_id"].isin([908]) & - recommendations_df["measure_type"].isin(["internal_wall_insulation"]) & - recommendations_df["default"] - ] -average_savings = solar_pv_recommendations.groupby("scenario_id")["energy_cost_savings"].mean().reset_index() -# Add on scenarion names -average_savings["scenario_name"] = average_savings["scenario_id"].map(scenario_names) +# sustainability_sample = sustainability_sample.merge( +# initial_asset_data, left_on="Org Ref", right_on="UPRN", suffixes=("_sustainability", "_initial_asset") +# ) -# Check tenures -initial_asset_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " - "- Data Extracts for Domna.xlsx", - sheet_name="Properties" -) -sustainability_data = pd.read_excel( - 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " - "- Data Extracts for Domna.xlsx", - sheet_name="Sustainability" -) +# block_sizes = initial_asset_data["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) +# block_sizes.to_excel("/Users/khalimconn-kowlessar/Downloads/peabody_block_sizes.xlsx", index=False) -sustainability_sample = sustainability_data[ - sustainability_data["UPRN"].isin(properties_df["uprn"].astype(int).astype(str).values) -] +# initial_asset_data.columns +# initial_asset_data["LeaseType"].value_counts() -sustainability_sample = sustainability_sample.merge( - initial_asset_data, left_on="Org Ref", right_on="UPRN", suffixes=("_sustainability", "_initial_asset") -) +# # sustainability_sample["Tenure Group"].value_counts() +# # Tenure Group +# # General Needs 57787 +# # Home Ownership 25471 +# # Care & Supported Housing 4239 +# # Rental 2677 +# # Other 188 -block_sizes = initial_asset_data["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) -block_sizes.to_excel("/Users/khalimconn-kowlessar/Downloads/peabody_block_sizes.xlsx", index=False) +# df = sustainability_sample["Ownership Type"].value_counts().to_frame().reset_index() +# df.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenures.xlsx", index=False) -initial_asset_data.columns -initial_asset_data["LeaseType"].value_counts() +# tenure_groups = sustainability_sample["Tenure Group"].value_counts().to_frame().reset_index() +# tenure_groups.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenure_groups.xlsx", index=False) -# sustainability_sample["Tenure Group"].value_counts() -# Tenure Group -# General Needs 57787 -# Home Ownership 25471 -# Care & Supported Housing 4239 -# Rental 2677 -# Other 188 +# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Tenure Group"].value_counts() -df = sustainability_sample["Ownership 
Type"].value_counts().to_frame().reset_index() -df.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenures.xlsx", index=False) +# sample_data = initial_asset_data[ +# ~initial_asset_data["Ownership Type"].isin( +# [ +# # Commercial # Everything is resi - based on the Residential Indicator variable - all are true +# # Freeholder +# "FREEHOLDER", # 19517 properties +# # HOMEBUY / EQUITY LOAN +# "Rent to Homebuy", # 1 property +# # Leaseholder +# "LEASEHOLD 100%", # 8455 properties +# "Owned and Managed - 999 year lease", # 2076 properties +# "Managed but not Owned-Private Lease", # 159 properties +# "Owned and managed LEASEHOLD", # 26 properties +# # Outright Sale - can't find anything matching +# # SHARED EQUITY +# "Shared Ownership", # 4065 properties +# "Shared Ownership Owned Not Managed", # 23 properties +# # Extra categories which seem sensible to exclude +# "NOT MANAGED AND NOT OWNED" +# ] +# ) +# ] -tenure_groups = sustainability_sample["Tenure Group"].value_counts().to_frame().reset_index() -tenure_groups.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenure_groups.xlsx", index=False) +# sample_data["Ownership Type"].value_counts() -initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Tenure Group"].value_counts() +# sample_data = initial_asset_data[ +# initial_asset_data["Ownership Type"].isin( +# [ +# "Owned and Managed", +# "Owned and Managed - 999 year lease", +# "Owned and managed LEASEHOLD", +# "LEASEHOLD 100%", +# "DATALOAD DEFAULT" +# ] +# ) +# ] +# dropped = initial_asset_data[~initial_asset_data["UPRN"].isin(sample_data["UPRN"].values)] +# dropped["Ownership Type"].value_counts() -sample_data = initial_asset_data[ - ~initial_asset_data["Ownership Type"].isin( - [ - # Commercial # Everything is resi - based on the Residential Indicator variable - all are true - # Freeholder - "FREEHOLDER", # 19517 properties - # HOMEBUY / EQUITY LOAN - "Rent to Homebuy", # 1 property - # Leaseholder - "LEASEHOLD 100%", # 
8455 properties - "Owned and Managed - 999 year lease", # 2076 properties - "Managed but not Owned-Private Lease", # 159 properties - "Owned and managed LEASEHOLD", # 26 properties - # Outright Sale - can't find anything matching - # SHARED EQUITY - "Shared Ownership", # 4065 properties - "Shared Ownership Owned Not Managed", # 23 properties - # Extra categories which seem sensible to exclude - "NOT MANAGED AND NOT OWNED" - ] - ) -] +# for value in [ +# # Commercial # Everything is resi, so should be fine. No matches +# # Freeholder +# "FREEHOLDER", # 19517 properties +# # HOMEBUY / EQUITY LOAN +# "Rent to Homebuy", # 1 property +# # Leaseholder +# "LEASEHOLD 100%", # 8455 properties +# "Owned and Managed - 999 year lease", # 2076 properties +# "Managed but not Owned-Private Lease", # 159 properties +# "Owned and managed LEASEHOLD", # 26 properties +# # Outright Sale - can't find anything matching +# # SHARED EQUITY +# "Shared Ownership", # 4065 properties +# "Shared Ownership Owned Not Managed", # 23 properties +# ]: +# print(initial_asset_data[initial_asset_data["Ownership Type"] == value].shape[0]) -sample_data["Ownership Type"].value_counts() +# house_types = [ +# "HOUSE", +# "BUNGALOW", +# "MAISONETTE", +# "DUPLEX", +# ] -sample_data = initial_asset_data[ - initial_asset_data["Ownership Type"].isin( - [ - "Owned and Managed", - "Owned and Managed - 999 year lease", - "Owned and managed LEASEHOLD", - "LEASEHOLD 100%", - "DATALOAD DEFAULT" - ] - ) -] -dropped = initial_asset_data[~initial_asset_data["UPRN"].isin(sample_data["UPRN"].values)] -dropped["Ownership Type"].value_counts() +# guaranteed_control = [ +# "Owned and Managed", +# "Owned and Managed - 999 year lease", +# "Owned and managed LEASEHOLD", +# "LEASEHOLD 100%", +# "DATALOAD DEFAULT", +# ] -for value in [ - # Commercial # Everything is resi, so should be fine. 
No matches - # Freeholder - "FREEHOLDER", # 19517 properties - # HOMEBUY / EQUITY LOAN - "Rent to Homebuy", # 1 property - # Leaseholder - "LEASEHOLD 100%", # 8455 properties - "Owned and Managed - 999 year lease", # 2076 properties - "Managed but not Owned-Private Lease", # 159 properties - "Owned and managed LEASEHOLD", # 26 properties - # Outright Sale - can't find anything matching - # SHARED EQUITY - "Shared Ownership", # 4065 properties - "Shared Ownership Owned Not Managed", # 23 properties -]: - print(initial_asset_data[initial_asset_data["Ownership Type"] == value].shape[0]) +# sample_data = initial_asset_data[ +# ( +# initial_asset_data["Ownership Type"].isin(guaranteed_control) +# ) +# | +# ( +# (initial_asset_data["Ownership Type"] == "FREEHOLDER") +# & +# (initial_asset_data["Property Type"].isin(house_types)) +# ) +# ] -house_types = [ - "HOUSE", - "BUNGALOW", - "MAISONETTE", - "DUPLEX", -] +# fabric_retrofit_sample = initial_asset_data[ +# initial_asset_data["Ownership Type"].isin( +# [ +# "Owned and Managed", +# "FREEHOLDER", +# "DATALOAD DEFAULT", +# ] +# ) +# ] -guaranteed_control = [ - "Owned and Managed", - "Owned and Managed - 999 year lease", - "Owned and managed LEASEHOLD", - "LEASEHOLD 100%", - "DATALOAD DEFAULT", -] +# initial_asset_data[pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() +# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() -sample_data = initial_asset_data[ - ( - initial_asset_data["Ownership Type"].isin(guaranteed_control) - ) - | - ( - (initial_asset_data["Ownership Type"] == "FREEHOLDER") - & - (initial_asset_data["Property Type"].isin(house_types)) - ) - ] +# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Property Type"].value_counts() +# z = initial_asset_data[ +# ~pd.isnull(initial_asset_data["BlockCode"]) & initial_asset_data["Property Type"].isin(house_types) +# ] -fabric_retrofit_sample = initial_asset_data[ - 
initial_asset_data["Ownership Type"].isin( - [ - "Owned and Managed", - "FREEHOLDER", - "DATALOAD DEFAULT", - ] - ) -] +# block_code_agg = z["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) +# zz = initial_asset_data[initial_asset_data["BlockCode"] == "CHAT3343FM"] -initial_asset_data[pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() -initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() +# potential_sample = initial_asset_data[ +# ~pd.isnull(initial_asset_data["BlockCode"]) +# ] -initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Property Type"].value_counts() -z = initial_asset_data[ - ~pd.isnull(initial_asset_data["BlockCode"]) & initial_asset_data["Property Type"].isin(house_types) - ] +# compare = potential_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( +# initial_asset_data["Property Type"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="Property Type", +# right_on="Property Type", +# suffixes=("_on_block_codes", "_overall") +# ) -block_code_agg = z["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) -zz = initial_asset_data[initial_asset_data["BlockCode"] == "CHAT3343FM"] +# # Comparison of smaller sample vs overall +# new_asset_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " +# "- Peabody " +# "- Data Extracts for Domna v2.xlsx", +# sheet_name="Properties" +# ) -potential_sample = initial_asset_data[ - ~pd.isnull(initial_asset_data["BlockCode"]) -] +# new_sustainability_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " +# "- Peabody " +# "- Data Extracts for Domna v2.xlsx", +# sheet_name="Sustainability" +# ) -compare = potential_sample["Property 
Type"].value_counts(normalize=True).to_frame().reset_index().merge( - initial_asset_data["Property Type"].value_counts(normalize=True).to_frame().reset_index(), - left_on="Property Type", - right_on="Property Type", - suffixes=("_on_block_codes", "_overall") -) +# sap_bands = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/Parity Data " +# "08012026.xlsx", +# ) -# Comparison of smaller sample vs overall -new_asset_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " - "- Peabody " - "- Data Extracts for Domna v2.xlsx", - sheet_name="Properties" -) +# combined = new_asset_data.merge( +# new_sustainability_data, +# left_on="UPRN", +# right_on="Org Ref", +# suffixes=("_asset", "_sustainability") +# ).merge( +# sap_bands[["OrgRef", "SAP Band", "Lodged EPC Band"]], how="left", left_on="Org Ref", right_on="OrgRef" +# ) +# reduced_sample = combined[ +# ~combined["AH Tenure"].isin( +# ["Commercial", +# "Freeholder", +# "HOMEBUY / EQUITY LOAN", +# "Leaseholder", +# "Outright Sale", +# "SHARED EQUITY", +# "Shared Ownership"] +# ) +# ].copy() -new_sustainability_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " - "- Peabody " - "- Data Extracts for Domna v2.xlsx", - sheet_name="Sustainability" -) +# # property types +# property_type_comparison = reduced_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( +# combined["Property Type"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="Property Type", +# right_on="Property Type", +# suffixes=("_reduced_sample", "_overall") +# ) -sap_bands = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/Parity Data " - "08012026.xlsx", -) +# # lodged ratings +# 
lodged_epc_band_comparison = reduced_sample["Lodged EPC Band"].value_counts( +# normalize=True).to_frame().reset_index().merge( +# combined["Lodged EPC Band"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="Lodged EPC Band", +# right_on="Lodged EPC Band", +# suffixes=("_reduced_sample", "_overall") +# ) -combined = new_asset_data.merge( - new_sustainability_data, - left_on="UPRN", - right_on="Org Ref", - suffixes=("_asset", "_sustainability") -).merge( - sap_bands[["OrgRef", "SAP Band", "Lodged EPC Band"]], how="left", left_on="Org Ref", right_on="OrgRef" -) -reduced_sample = combined[ - ~combined["AH Tenure"].isin( - ["Commercial", - "Freeholder", - "HOMEBUY / EQUITY LOAN", - "Leaseholder", - "Outright Sale", - "SHARED EQUITY", - "Shared Ownership"] - ) -].copy() +# # modelled ratings +# modelled_epc_band_comparison = reduced_sample["SAP Band"].value_counts( +# normalize=True).to_frame().reset_index().merge( +# combined["SAP Band"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="SAP Band", +# right_on="SAP Band", +# suffixes=("_reduced_sample", "_overall") +# ) -# property types -property_type_comparison = reduced_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( - combined["Property Type"].value_counts(normalize=True).to_frame().reset_index(), - left_on="Property Type", - right_on="Property Type", - suffixes=("_reduced_sample", "_overall") -) +# # Testing measures +# m1 = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " +# "solid floor, ashp 3.0 - 20250113 final.xlsx" +# ) +# m2 = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " +# "solid floor, no EWI or IWI, ashp 3.0 - 20250113 final.xlsx" +# ) -# lodged ratings -lodged_epc_band_comparison = reduced_sample["Lodged EPC Band"].value_counts( - 
normalize=True).to_frame().reset_index().merge( - combined["Lodged EPC Band"].value_counts(normalize=True).to_frame().reset_index(), - left_on="Lodged EPC Band", - right_on="Lodged EPC Band", - suffixes=("_reduced_sample", "_overall") -) +# compare = m1.merge( +# m2, +# left_on="uprn", +# right_on="uprn", +# suffixes=("_ewi_iwi", "_no_ewi_iwi") +# ) -# modelled ratings -modelled_epc_band_comparison = reduced_sample["SAP Band"].value_counts( - normalize=True).to_frame().reset_index().merge( - combined["SAP Band"].value_counts(normalize=True).to_frame().reset_index(), - left_on="SAP Band", - right_on="SAP Band", - suffixes=("_reduced_sample", "_overall") -) +# # Which properties get done under the no EWI/IWI scenario that do not under the EWI/IWI scenario +# only_no_ewi_iwi = compare[ +# (compare["total_retrofit_cost_ewi_iwi"] == 0) & +# (compare["total_retrofit_cost_no_ewi_iwi"] != 0) +# ] -# Testing measures -m1 = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " - "solid floor, ashp 3.0 - 20250113 final.xlsx" -) -m2 = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " - "solid floor, no EWI or IWI, ashp 3.0 - 20250113 final.xlsx" -) +# (m1["total_retrofit_cost"] > 0).sum() +# (m2["total_retrofit_cost"] > 0).sum() -compare = m1.merge( - m2, - left_on="uprn", - right_on="uprn", - suffixes=("_ewi_iwi", "_no_ewi_iwi") -) +# with_ewi_projects = compare[compare["total_retrofit_cost_no_ewi_iwi"] > 0] -# Which properties get done under the no EWI/IWI scenario that do not under the EWI/IWI scenario -only_no_ewi_iwi = compare[ - (compare["total_retrofit_cost_ewi_iwi"] == 0) & - (compare["total_retrofit_cost_no_ewi_iwi"] != 0) - ] - -(m1["total_retrofit_cost"] > 0).sum() -(m2["total_retrofit_cost"] > 0).sum() - -with_ewi_projects = compare[compare["total_retrofit_cost_no_ewi_iwi"] > 0] - -z = 
with_ewi_projects[pd.isnull(with_ewi_projects["total_retrofit_cost_ewi_iwi"])] +# z = with_ewi_projects[pd.isnull(with_ewi_projects["total_retrofit_cost_ewi_iwi"])] diff --git a/infrastructure/terraform/lambda/_template/README.md b/infrastructure/terraform/lambda/_template/README.md new file mode 100644 index 00000000..a7282fc9 --- /dev/null +++ b/infrastructure/terraform/lambda/_template/README.md @@ -0,0 +1,51 @@ +## Checklist for adding a new Lambda + +### 1. Create the Lambda scaffold +- Copy the template: + + cp -r lambda/_template lambda/ + +--- + +### 2. Add infrastructure prerequisites (shared stack) +- Add a new ECR repository in: + + infrastructure/terraform/shared/main.tf + +- Apply the shared stack + - This requires commenting out 'if env.stage == "prod"' in .github/workflows/deploy_terraform.yml + +- Verify the ECR repository exists in AWS + +--- + +### 3. Add Docker build configuration +- Create a `Dockerfile` for the Lambda +- Verify the Dockerfile path and build context +- Add a new image build job in `deploy_terraform.yml` using `_build_image.yml` + +--- + +### 4. Wire the Lambda deploy job (CI) +- Add a deploy job using `_deploy_lambda.yml` +- Ensure the deploy job depends on the image build job + +--- + +### 5. Deploy +- Push changes to GitHub +- CI will: + 1. Build and push the Docker image + 2. Deploy the Lambda + 3. Verify everything deployed. Good things to check: + - ECR with image + - SQS + - Trigger SQS + - CloudWatch logs +--- +### 6. Delete + 1.
Delete README if you used cp -r + +--- + +## Please feel free to update this document to make it easier for the next person \ No newline at end of file diff --git a/infrastructure/terraform/lambda/_template/main.tf b/infrastructure/terraform/lambda/_template/main.tf new file mode 100644 index 00000000..3010aa8a --- /dev/null +++ b/infrastructure/terraform/lambda/_template/main.tf @@ -0,0 +1,14 @@ +module "lambda" { + source = "../modules/lambda_with_sqs" + + name = REPLACE ME #"address2uprn" for example + stage = var.stage + + image_uri = local.image_uri + + + environment = { + STAGE = var.stage + LOG_LEVEL = "info" + } +} diff --git a/infrastructure/terraform/lambda/_template/provider.tf b/infrastructure/terraform/lambda/_template/provider.tf new file mode 100644 index 00000000..37c412ce --- /dev/null +++ b/infrastructure/terraform/lambda/_template/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.16" + } + } + + backend "s3" { + bucket = REPLACE_ME + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/_template/variables.tf b/infrastructure/terraform/lambda/_template/variables.tf new file mode 100644 index 00000000..e4bab243 --- /dev/null +++ b/infrastructure/terraform/lambda/_template/variables.tf @@ -0,0 +1,27 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + description = "Deployment stage (e.g. 
dev, prod)" + type = string +} +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf new file mode 100644 index 00000000..46b193f2 --- /dev/null +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -0,0 +1,14 @@ +module "address2uprn" { + source = "../modules/lambda_with_sqs" + + name = "address2uprn" + stage = var.stage + + image_uri = local.image_uri + + + environment = { + STAGE = var.stage + LOG_LEVEL = "info" + } +} diff --git a/infrastructure/terraform/lambda/address2UPRN/provider.tf b/infrastructure/terraform/lambda/address2UPRN/provider.tf new file mode 100644 index 00000000..ad873717 --- /dev/null +++ b/infrastructure/terraform/lambda/address2UPRN/provider.tf @@ -0,0 +1,17 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.16" + } + } + + backend "s3" { + bucket = "address2uprn-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} + diff --git a/infrastructure/terraform/lambda/address2UPRN/variables.tf b/infrastructure/terraform/lambda/address2UPRN/variables.tf new file mode 100644 index 00000000..e4bab243 --- /dev/null +++ b/infrastructure/terraform/lambda/address2UPRN/variables.tf @@ -0,0 +1,27 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + description = "Deployment stage (e.g. 
dev, prod)" + type = string +} +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/lambda/modules/lambda_with_sqs/main.tf b/infrastructure/terraform/lambda/modules/lambda_with_sqs/main.tf new file mode 100644 index 00000000..3816c206 --- /dev/null +++ b/infrastructure/terraform/lambda/modules/lambda_with_sqs/main.tf @@ -0,0 +1,44 @@ +############################################ +# IAM role +############################################ +module "role" { + source = "../../../modules/lambda_execution_role" + name = "${var.name}-lambda-${var.stage}" +} + +############################################ +# SQS queue + DLQ +############################################ +module "queue" { + source = "../../../modules/sqs_queue" + name = "${var.name}-queue-${var.stage}" +} + +############################################ +# Lambda +############################################ +module "lambda" { + source = "../../../modules/lambda_service" + + name = "${var.name}-${var.stage}" + role_arn = module.role.role_arn + image_uri = var.image_uri + + timeout = var.timeout + memory_size = var.memory_size + + environment = var.environment +} + +############################################ +# SQS → Lambda trigger +############################################ +module "sqs_trigger" { + source = "../../../modules/lambda_sqs_trigger" + + lambda_arn = module.lambda.lambda_arn + lambda_role_name = module.role.role_name + queue_arn = module.queue.queue_arn + + batch_size = var.batch_size +} diff --git a/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf b/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf new file mode 100644 index 00000000..afc9246d --- 
/dev/null +++ b/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf @@ -0,0 +1,11 @@ +output "lambda_arn" { + value = module.lambda.lambda_arn +} + +output "queue_arn" { + value = module.queue.queue_arn +} + +output "queue_url" { + value = module.queue.queue_url +} diff --git a/infrastructure/terraform/lambda/modules/lambda_with_sqs/variables.tf b/infrastructure/terraform/lambda/modules/lambda_with_sqs/variables.tf new file mode 100644 index 00000000..b20ab2a8 --- /dev/null +++ b/infrastructure/terraform/lambda/modules/lambda_with_sqs/variables.tf @@ -0,0 +1,36 @@ +variable "name" { + type = string +} + +variable "stage" { + type = string +} + +variable "image_uri" { + type = string +} + +variable "region" { + type = string + default = "eu-west-2" +} + +variable "timeout" { + type = number + default = 60 +} + +variable "memory_size" { + type = number + default = 1024 +} + +variable "environment" { + type = map(string) + default = {} +} + +variable "batch_size" { + type = number + default = 10 +} diff --git a/infrastructure/terraform/modules/container_registry/main.tf b/infrastructure/terraform/modules/container_registry/main.tf new file mode 100644 index 00000000..f5ba8d5e --- /dev/null +++ b/infrastructure/terraform/modules/container_registry/main.tf @@ -0,0 +1,30 @@ +resource "aws_ecr_repository" "this" { + name = "${var.name}-${var.stage}" + + image_tag_mutability = "MUTABLE" + + image_scanning_configuration { + scan_on_push = true + } +} + +resource "aws_ecr_lifecycle_policy" "this" { + repository = aws_ecr_repository.this.name + + policy = jsonencode({ + rules = [ + { + rulePriority = 1 + description = "Expire old images" + selection = { + tagStatus = "any" + countType = "imageCountMoreThan" + countNumber = var.retain_count + } + action = { + type = "expire" + } + } + ] + }) +} diff --git a/infrastructure/terraform/modules/container_registry/outputs.tf b/infrastructure/terraform/modules/container_registry/outputs.tf new file mode 100644 index 
00000000..47a4bc64 --- /dev/null +++ b/infrastructure/terraform/modules/container_registry/outputs.tf @@ -0,0 +1,11 @@ +output "repository_name" { + value = aws_ecr_repository.this.name +} + +output "repository_url" { + value = aws_ecr_repository.this.repository_url +} + +output "repository_arn" { + value = aws_ecr_repository.this.arn +} diff --git a/infrastructure/terraform/modules/container_registry/variables.tf b/infrastructure/terraform/modules/container_registry/variables.tf new file mode 100644 index 00000000..11821b31 --- /dev/null +++ b/infrastructure/terraform/modules/container_registry/variables.tf @@ -0,0 +1,15 @@ +variable "name" { + description = "Base name of the repository (without stage)" + type = string +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} + +variable "retain_count" { + description = "Number of images to retain" + type = number + default = 10 +} diff --git a/infrastructure/terraform/modules/ecr/main.tf b/infrastructure/terraform/modules/ecr/main.tf index 468ef3d2..d93d1340 100644 --- a/infrastructure/terraform/modules/ecr/main.tf +++ b/infrastructure/terraform/modules/ecr/main.tf @@ -1,3 +1,6 @@ +# This ecr works for things deployed by serverless. 
+# TODO: unify ecr and container_registry to one + resource "aws_ecr_repository" "my_repository" { name = "${var.ecr_name}" image_tag_mutability = "MUTABLE" diff --git a/infrastructure/terraform/modules/ecr/outputs.tf b/infrastructure/terraform/modules/ecr/outputs.tf index 53839718..7f045412 100644 --- a/infrastructure/terraform/modules/ecr/outputs.tf +++ b/infrastructure/terraform/modules/ecr/outputs.tf @@ -1,4 +1,10 @@ output "ecr_repository_name" { description = "Name of the EPR repo in AWS" value = aws_ecr_repository.my_repository.name +} + + +output "ecr_repository_url" { + description = "Full ECR repository URL" + value = aws_ecr_repository.my_repository.repository_url } \ No newline at end of file diff --git a/infrastructure/terraform/modules/lambda_execution_role/main.tf b/infrastructure/terraform/modules/lambda_execution_role/main.tf new file mode 100644 index 00000000..fa657afd --- /dev/null +++ b/infrastructure/terraform/modules/lambda_execution_role/main.tf @@ -0,0 +1,37 @@ +data "aws_iam_policy_document" "assume" { + statement { + effect = "Allow" + principals { + type = "Service" + identifiers = ["lambda.amazonaws.com"] + } + actions = ["sts:AssumeRole"] + } +} + +resource "aws_iam_role" "this" { + name = var.name + assume_role_policy = data.aws_iam_policy_document.assume.json +} + +resource "aws_iam_role_policy_attachment" "basic_logs" { + role = aws_iam_role.this.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" +} + +resource "aws_iam_role_policy" "ecr_pull" { + role = aws_iam_role.this.name + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = [ + "ecr:GetAuthorizationToken", + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer" + ] + Resource = "*" + }] + }) +} diff --git a/infrastructure/terraform/modules/lambda_execution_role/outputs.tf b/infrastructure/terraform/modules/lambda_execution_role/outputs.tf new file mode 100644 index 00000000..1baca34d --- /dev/null 
+++ b/infrastructure/terraform/modules/lambda_execution_role/outputs.tf @@ -0,0 +1,7 @@ +output "role_arn" { + value = aws_iam_role.this.arn +} + +output "role_name" { + value = aws_iam_role.this.name +} diff --git a/infrastructure/terraform/modules/lambda_execution_role/variables.tf b/infrastructure/terraform/modules/lambda_execution_role/variables.tf new file mode 100644 index 00000000..f9f512ff --- /dev/null +++ b/infrastructure/terraform/modules/lambda_execution_role/variables.tf @@ -0,0 +1,4 @@ +variable "name" { + description = "IAM role name for the Lambda execution role" + type = string +} diff --git a/infrastructure/terraform/modules/lambda_service/main.tf b/infrastructure/terraform/modules/lambda_service/main.tf new file mode 100644 index 00000000..8a159db1 --- /dev/null +++ b/infrastructure/terraform/modules/lambda_service/main.tf @@ -0,0 +1,15 @@ +resource "aws_lambda_function" "this" { + function_name = var.name + role = var.role_arn + + package_type = "Image" + image_uri = var.image_uri + + timeout = var.timeout + memory_size = var.memory_size + publish = true + + environment { + variables = var.environment + } +} diff --git a/infrastructure/terraform/modules/lambda_service/outputs.tf b/infrastructure/terraform/modules/lambda_service/outputs.tf new file mode 100644 index 00000000..dd05cccf --- /dev/null +++ b/infrastructure/terraform/modules/lambda_service/outputs.tf @@ -0,0 +1,3 @@ +output "lambda_arn" { + value = aws_lambda_function.this.arn +} diff --git a/infrastructure/terraform/modules/lambda_service/variables.tf b/infrastructure/terraform/modules/lambda_service/variables.tf new file mode 100644 index 00000000..43def6ad --- /dev/null +++ b/infrastructure/terraform/modules/lambda_service/variables.tf @@ -0,0 +1,18 @@ +variable "name" { type = string } +variable "role_arn" { type = string } +variable "image_uri" { type = string } + +variable "timeout" { + type = number + default = 30 +} + +variable "memory_size" { + type = number + default = 512 
+} + +variable "environment" { + type = map(string) + default = {} +} diff --git a/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf b/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf new file mode 100644 index 00000000..5919e10f --- /dev/null +++ b/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf @@ -0,0 +1,23 @@ +resource "aws_lambda_event_source_mapping" "this" { + event_source_arn = var.queue_arn + function_name = var.lambda_arn + batch_size = var.batch_size + enabled = true +} + +resource "aws_iam_role_policy" "allow_sqs" { + role = var.lambda_role_name + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = [ + "sqs:ReceiveMessage", + "sqs:DeleteMessage", + "sqs:GetQueueAttributes" + ] + Resource = var.queue_arn + }] + }) +} diff --git a/infrastructure/terraform/modules/lambda_sqs_trigger/variables.tf b/infrastructure/terraform/modules/lambda_sqs_trigger/variables.tf new file mode 100644 index 00000000..0e50cd54 --- /dev/null +++ b/infrastructure/terraform/modules/lambda_sqs_trigger/variables.tf @@ -0,0 +1,8 @@ +variable "lambda_arn" { type = string } +variable "lambda_role_name" { type = string } +variable "queue_arn" { type = string } + +variable "batch_size" { + type = number + default = 10 +} diff --git a/infrastructure/terraform/modules/sqs_queue/main.tf b/infrastructure/terraform/modules/sqs_queue/main.tf new file mode 100644 index 00000000..580e67bd --- /dev/null +++ b/infrastructure/terraform/modules/sqs_queue/main.tf @@ -0,0 +1,14 @@ +resource "aws_sqs_queue" "dlq" { + name = "${var.name}-dlq" +} + +resource "aws_sqs_queue" "this" { + name = var.name + + visibility_timeout_seconds = 120 + + redrive_policy = jsonencode({ + deadLetterTargetArn = aws_sqs_queue.dlq.arn + maxReceiveCount = var.max_receive_count + }) +} diff --git a/infrastructure/terraform/modules/sqs_queue/outputs.tf b/infrastructure/terraform/modules/sqs_queue/outputs.tf new file mode 100644 index 
00000000..46fafe90 --- /dev/null +++ b/infrastructure/terraform/modules/sqs_queue/outputs.tf @@ -0,0 +1,7 @@ +output "queue_arn" { + value = aws_sqs_queue.this.arn +} + +output "queue_url" { + value = aws_sqs_queue.this.url +} diff --git a/infrastructure/terraform/modules/sqs_queue/variables.tf b/infrastructure/terraform/modules/sqs_queue/variables.tf new file mode 100644 index 00000000..943a7a16 --- /dev/null +++ b/infrastructure/terraform/modules/sqs_queue/variables.tf @@ -0,0 +1,6 @@ +variable "name" { type = string } + +variable "max_receive_count" { + type = number + default = 5 +} diff --git a/infrastructure/terraform/modules/tf_state_bucket/main.tf b/infrastructure/terraform/modules/tf_state_bucket/main.tf new file mode 100644 index 00000000..86c0cc21 --- /dev/null +++ b/infrastructure/terraform/modules/tf_state_bucket/main.tf @@ -0,0 +1,30 @@ +resource "aws_s3_bucket" "this" { + bucket = var.bucket_name +} + +resource "aws_s3_bucket_versioning" "this" { + bucket = aws_s3_bucket.this.id + + versioning_configuration { + status = "Enabled" + } +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "this" { + bucket = aws_s3_bucket.this.id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "AES256" + } + } +} + +resource "aws_s3_bucket_public_access_block" "this" { + bucket = aws_s3_bucket.this.id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} diff --git a/infrastructure/terraform/modules/tf_state_bucket/outputs.tf b/infrastructure/terraform/modules/tf_state_bucket/outputs.tf new file mode 100644 index 00000000..e8ceffd1 --- /dev/null +++ b/infrastructure/terraform/modules/tf_state_bucket/outputs.tf @@ -0,0 +1,7 @@ +output "bucket_name" { + value = aws_s3_bucket.this.bucket +} + +output "bucket_arn" { + value = aws_s3_bucket.this.arn +} diff --git a/infrastructure/terraform/modules/tf_state_bucket/variables.tf 
b/infrastructure/terraform/modules/tf_state_bucket/variables.tf new file mode 100644 index 00000000..b3aae9bb --- /dev/null +++ b/infrastructure/terraform/modules/tf_state_bucket/variables.tf @@ -0,0 +1,3 @@ +variable "bucket_name" { + type = string +} diff --git a/infrastructure/terraform/dev.tfvars b/infrastructure/terraform/shared/dev.tfvars similarity index 95% rename from infrastructure/terraform/dev.tfvars rename to infrastructure/terraform/shared/dev.tfvars index 92b7e158..53ca6d9e 100644 --- a/infrastructure/terraform/dev.tfvars +++ b/infrastructure/terraform/shared/dev.tfvars @@ -1,5 +1,4 @@ stage = "dev" -profile = "DevAdmin" region = "eu-west-2" # Domain diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/shared/main.tf similarity index 81% rename from infrastructure/terraform/main.tf rename to infrastructure/terraform/shared/main.tf index 5dfe765f..a19c4e21 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -8,7 +8,6 @@ terraform { backend "s3" { bucket = "assessment-model-terraform-state" region = "eu-west-2" - profile = "DevAdmin" key = "terraform.tfstate" } @@ -16,7 +15,6 @@ terraform { } provider "aws" { - profile = var.profile region = var.region } @@ -91,101 +89,101 @@ resource "aws_db_instance" "default" { # Set up the bucket that recieve the csv uploads of epc to be retrofit module "s3_presignable_bucket" { - source = "./modules/s3_presignable_bucket" + source = "../modules/s3_presignable_bucket" bucketname = "retrofit-plan-inputs-${var.stage}" environment = var.stage allowed_origins = var.allowed_origins } module "s3_due_considerations_bucket" { - source = "./modules/s3_presignable_bucket" + source = "../modules/s3_presignable_bucket" bucketname = "retrofit-due-considerations-${var.stage}" environment = var.stage allowed_origins = var.allowed_origins } module "s3_eco_spreadseet_bucket" { - source = "./modules/s3_presignable_bucket" + source = "../modules/s3_presignable_bucket" 
bucketname = "retrofit-eco-spreadsheet-${var.stage}" environment = var.stage allowed_origins = var.allowed_origins } module "s3" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-datalake-${var.stage}" allowed_origins = var.allowed_origins } module "model_directory" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-model-directory-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_sap_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-sap-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_sap_data" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-data-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_carbon_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-carbon-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_heat_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-heat-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_lighting_cost_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-lighting-cost-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_heating_cost_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-heating-cost-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_hot_water_cost_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-hot-water-cost-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_heating_kwh_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-heating-kwh-predictions-${var.stage}" allowed_origins = var.allowed_origins } module 
"retrofit_hotwater_kwh_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-hotwater-kwh-predictions-${var.stage}" allowed_origins = var.allowed_origins } module "retrofit_sap_baseline_predictions" { - source = "./modules/s3" + source = "../modules/s3" bucketname = "retrofit-sap-baseline-predictions-${var.stage}" allowed_origins = var.allowed_origins } // We make this bucket presignable, because we want to generate download links for the frontend module "retrofit_energy_assessments" { - source = "./modules/s3_presignable_bucket" + source = "../modules/s3_presignable_bucket" bucketname = "retrofit-energy-assessments-${var.stage}" allowed_origins = var.allowed_origins environment = var.stage @@ -193,7 +191,7 @@ module "retrofit_energy_assessments" { # Set up the route53 record for the API module "route53" { - source = "./modules/route53" + source = "../modules/route53" domain_name = var.domain_name api_url_prefix = var.api_url_prefix providers = { @@ -201,75 +199,76 @@ module "route53" { } } + # Create an ECR repository for storage of the lambda's docker images module "ecr" { ecr_name = "fastapi-repository-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_sap_prediction_ecr" { ecr_name = "lambda-sap-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "due_considerations_ecr" { ecr_name = "due-considerations-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "eco_spreadsheet_ecr" { ecr_name = "eco-spreadsheet-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_carbon_prediction_ecr" { ecr_name = "lambda-carbon-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_heat_prediction_ecr" { ecr_name = "lambda-heat-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } # ECR repos for lighting cost, heating cost and hot water cost 
models module "lambda_lighting_cost_prediction_ecr" { ecr_name = "lighting-cost-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_heating_cost_prediction_ecr" { ecr_name = "heating-cost-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_hot_water_cost_prediction_ecr" { ecr_name = "hot-water-cost-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } # For heating and hot water kwh models module "lambda_heating_kwh_prediction_ecr" { ecr_name = "heating-kwh-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } module "lambda_hotwater_kwh_prediction_ecr" { ecr_name = "hotwater-kwh-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } # Baselining models module "sap_baseline_ecr" { ecr_name = "sap-baseline-prediction-${var.stage}" - source = "./modules/ecr" + source = "../modules/ecr" } ############################################## # CDN - Cloudfront ############################################## module "cloudfront_distribution" { - source = "./modules/cloudfront" + source = "../modules/cloudfront" bucket_name = module.s3.bucket_name bucket_id = module.s3.bucket_id bucket_arn = module.s3.bucket_arn @@ -281,11 +280,35 @@ module "cloudfront_distribution" { # SES - Email sending ################################################ module "ses" { - source = "./modules/ses" + source = "../modules/ses" domain_name = "domna.homes" stage = var.stage } output "ses_dns_records" { value = module.ses.dns_records +} + +################################################ +# Address2UPRN – Lambda ECR +################################################ +module "address2uprn_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "address2uprn-terraform-state" + +} + +output "address2uprn_state_bucket_name" { + value = module.address2uprn_state_bucket.bucket_name +} + +module "address2uprn_registry" { + 
source = "../modules/container_registry" + name = "address2uprn" + stage = var.stage + +} + +output "address2uprn_repository_url" { + value = module.address2uprn_registry.repository_url } \ No newline at end of file diff --git a/infrastructure/terraform/secrets.tf b/infrastructure/terraform/shared/secrets.tf similarity index 100% rename from infrastructure/terraform/secrets.tf rename to infrastructure/terraform/shared/secrets.tf diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/shared/variables.tf similarity index 90% rename from infrastructure/terraform/variables.tf rename to infrastructure/terraform/shared/variables.tf index 76734340..e922e465 100644 --- a/infrastructure/terraform/variables.tf +++ b/infrastructure/terraform/shared/variables.tf @@ -3,11 +3,6 @@ variable stage { type = string } -variable "profile" { - description = "AWS profile to use" - type = string -} - variable "region" { description = "AWS region" type = string diff --git a/model_data/requirements/requirements.txt b/model_data/requirements/requirements.txt index 845166d9..bbf75df5 100644 --- a/model_data/requirements/requirements.txt +++ b/model_data/requirements/requirements.txt @@ -1,4 +1,4 @@ -pydantic==2.9.2 +pydantic>=1.10.7 pydantic-settings==2.6.0 epc-api-python==1.0.2 numpy==2.1.2 diff --git a/pytest.ini b/pytest.ini index fe2c7d67..ee203d46 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] pythonpath = . 
addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/onboarders/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index 2184d074..ae807654 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -7,24 +7,29 @@ import numpy as np from backend.app.utils import sap_to_epc from sqlalchemy.orm import sessionmaker from backend.app.db.connection import db_engine, db_read_session -from backend.app.db.models.recommendations import Recommendation, Plan, PlanRecommendations, RecommendationMaterials -from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel, PropertyDetailsSpatial +from backend.app.db.models.recommendations import ( + Recommendation, + Plan, + PlanRecommendations, + RecommendationMaterials, +) +from backend.app.db.models.portfolio import ( + PropertyModel, + PropertyDetailsEpcModel, + PropertyDetailsSpatial, +) from backend.app.db.functions.materials_functions import get_materials from collections import defaultdict from sqlalchemy import func # PORTFOLIO_ID = 206 # SCENARIOS = [389] -PORTFOLIO_ID = 435 # Peabody +PORTFOLIO_ID = 502 # Peabody SCENARIOS = [ - 908, - 909, - 910, + 986, ] scenario_names = { - 908: "EPC C - no solid floor, ashp 3.0", - 909: "EPC C - no solid floor, no EWI or IWI, ashp 3.0", - 910: "EPC B - no solid floor, no EWI, ashp 3.0" + 986: "EPC C", } @@ -35,22 +40,26 @@ def get_data(portfolio_id, scenario_ids): # -------------------- # Properties # -------------------- - properties_query = session.query( - PropertyModel, - PropertyDetailsEpcModel - ).join( - 
PropertyDetailsEpcModel, - PropertyModel.id == PropertyDetailsEpcModel.property_id - ).filter( - PropertyModel.portfolio_id == portfolio_id - ).all() + properties_query = ( + session.query(PropertyModel, PropertyDetailsEpcModel) + .join( + PropertyDetailsEpcModel, + PropertyModel.id == PropertyDetailsEpcModel.property_id, + ) + .filter(PropertyModel.portfolio_id == portfolio_id) + .all() + ) properties_data = [ { - **{col.name: getattr(p.PropertyModel, col.name) - for col in PropertyModel.__table__.columns}, - **{col.name: getattr(p.PropertyDetailsEpcModel, col.name) - for col in PropertyDetailsEpcModel.__table__.columns}, + **{ + col.name: getattr(p.PropertyModel, col.name) + for col in PropertyModel.__table__.columns + }, + **{ + col.name: getattr(p.PropertyDetailsEpcModel, col.name) + for col in PropertyDetailsEpcModel.__table__.columns + }, } for p in properties_query ] @@ -62,13 +71,10 @@ def get_data(portfolio_id, scenario_ids): session.query( Plan.scenario_id, Plan.property_id, - func.max(Plan.created_at).label("latest_created_at") + func.max(Plan.created_at).label("latest_created_at"), ) .filter(Plan.scenario_id.in_(scenario_ids)) - .group_by( - Plan.scenario_id, - Plan.property_id - ) + .group_by(Plan.scenario_id, Plan.property_id) .subquery() ) @@ -80,9 +86,9 @@ def get_data(portfolio_id, scenario_ids): session.query(Plan) .join( latest_plans_subq, - (Plan.scenario_id == latest_plans_subq.c.scenario_id) & - (Plan.property_id == latest_plans_subq.c.property_id) & - (Plan.created_at == latest_plans_subq.c.latest_created_at) + (Plan.scenario_id == latest_plans_subq.c.scenario_id) + & (Plan.property_id == latest_plans_subq.c.property_id) + & (Plan.created_at == latest_plans_subq.c.latest_created_at), ) .all() ) @@ -107,28 +113,29 @@ def get_data(portfolio_id, scenario_ids): # -------------------- # Recommendations (NO materials yet) # -------------------- - recommendations_query = session.query( - Recommendation, - Plan.scenario_id, - 
PlanRecommendations.plan_id - ).join( - PlanRecommendations, - Recommendation.id == PlanRecommendations.recommendation_id - ).join( - Plan, - Plan.id == PlanRecommendations.plan_id - ).filter( - PlanRecommendations.plan_id.in_(plan_ids), - Recommendation.default.is_(True), - Recommendation.already_installed.is_(False) - ).all() + recommendations_query = ( + session.query(Recommendation, Plan.scenario_id, PlanRecommendations.plan_id) + .join( + PlanRecommendations, + Recommendation.id == PlanRecommendations.recommendation_id, + ) + .join(Plan, Plan.id == PlanRecommendations.plan_id) + .filter( + PlanRecommendations.plan_id.in_(plan_ids), + Recommendation.default.is_(True), + Recommendation.already_installed.is_(False), + ) + .all() + ) recommendations_data = [ { - **{col.name: getattr(r.Recommendation, col.name) - for col in Recommendation.__table__.columns}, + **{ + col.name: getattr(r.Recommendation, col.name) + for col in Recommendation.__table__.columns + }, "scenario_id": r.scenario_id, - "materials": [] # placeholder + "materials": [], # placeholder } for r in recommendations_query ] @@ -138,23 +145,25 @@ def get_data(portfolio_id, scenario_ids): # -------------------- # Recommendation materials (SEPARATE QUERY) # -------------------- - materials_query = session.query( - RecommendationMaterials - ).filter( - RecommendationMaterials.recommendation_id.in_(recommendation_ids) - ).all() + materials_query = ( + session.query(RecommendationMaterials) + .filter(RecommendationMaterials.recommendation_id.in_(recommendation_ids)) + .all() + ) # Group materials by recommendation_id materials_by_recommendation = defaultdict(list) for m in materials_query: - materials_by_recommendation[m.recommendation_id].append({ - "material_id": m.material_id, - "depth": m.depth, - "quantity": m.quantity, - "quantity_unit": m.quantity_unit, - "estimated_cost": m.estimated_cost, - }) + materials_by_recommendation[m.recommendation_id].append( + { + "material_id": m.material_id, + "depth": 
m.depth, + "quantity": m.quantity, + "quantity_unit": m.quantity_unit, + "estimated_cost": m.estimated_cost, + } + ) # Attach materials safely (no filtering side effects) for r in recommendations_data: @@ -165,7 +174,9 @@ def get_data(portfolio_id, scenario_ids): return properties_data, plans_data, recommendations_data -properties_data, plans_data, recommendations_data = get_data(portfolio_id=PORTFOLIO_ID, scenario_ids=SCENARIOS) +properties_data, plans_data, recommendations_data = get_data( + portfolio_id=PORTFOLIO_ID, scenario_ids=SCENARIOS +) properties_df = pd.DataFrame(properties_data) plans_df = pd.DataFrame(plans_data) @@ -176,10 +187,8 @@ with db_read_session() as session: materials = pd.DataFrame(materials) -material_lookup = ( - materials - .set_index("id")[["type", "includes_battery"]] - .to_dict("index") +material_lookup = materials.set_index("id")[["type", "includes_battery"]].to_dict( + "index" ) @@ -193,14 +202,14 @@ def has_solar_with_battery(materials_list): return False -recommendations_df["has_solar_with_battery"] = ( - recommendations_df["materials"].apply(has_solar_with_battery) +recommendations_df["has_solar_with_battery"] = recommendations_df["materials"].apply( + has_solar_with_battery ) recommendations_df["measure_type"] = np.where( recommendations_df["has_solar_with_battery"] == True, recommendations_df["measure_type"] + "_with_battery", - recommendations_df["measure_type"] + recommendations_df["measure_type"], ) # Adjust material type to indicate if there is a battery included @@ -215,50 +224,67 @@ from utils.s3 import read_csv_from_s3, read_excel_from_s3 for scenario_id in SCENARIOS: # Get recs for this scenario - recommended_measures_df = recommendations_df[recommendations_df["scenario_id"] == scenario_id][ - ["property_id", "measure_type", "estimated_cost", "default"] + recommended_measures_df = recommendations_df[ + recommendations_df["scenario_id"] == scenario_id + ][["property_id", "measure_type", "estimated_cost", "default"]] + 
recommended_measures_df = recommended_measures_df[ + recommended_measures_df["default"] ] - recommended_measures_df = recommended_measures_df[recommended_measures_df["default"]] recommended_measures_df = recommended_measures_df.drop(columns=["default"]) - post_install_sap = recommendations_df[recommendations_df["scenario_id"] == scenario_id][ - ["property_id", "default", "sap_points"]] + post_install_sap = recommendations_df[ + recommendations_df["scenario_id"] == scenario_id + ][["property_id", "default", "sap_points"]] post_install_sap = post_install_sap[post_install_sap["default"]] # Sum up the sap points by property id - post_install_sap = post_install_sap.groupby(["property_id"])[["sap_points"]].sum().reset_index() + post_install_sap = ( + post_install_sap.groupby(["property_id"])[["sap_points"]].sum().reset_index() + ) # Find dupes by property id and measure type - dupes = recommended_measures_df.duplicated(subset=["property_id", "measure_type"], keep=False) + dupes = recommended_measures_df.duplicated( + subset=["property_id", "measure_type"], keep=False + ) dupe_df = recommended_measures_df[dupes] if dupe_df.shape: # Drop dupes - happened due to a funny bug recommended_measures_df = recommended_measures_df.drop_duplicates( - subset=["property_id", "measure_type"], keep='first' + subset=["property_id", "measure_type"], keep="first" ) recommendations_measures_pivot = recommended_measures_df.pivot( - index='property_id', - columns='measure_type', - values='estimated_cost' + index="property_id", columns="measure_type", values="estimated_cost" ) recommendations_measures_pivot = recommendations_measures_pivot.reset_index() # Total cost is the row sum, excluding the property_id column - recommendations_measures_pivot["total_retrofit_cost"] = recommendations_measures_pivot.drop( - columns=["property_id"] - ).sum(axis=1) + recommendations_measures_pivot["total_retrofit_cost"] = ( + recommendations_measures_pivot.drop(columns=["property_id"]).sum(axis=1) + ) - df = 
properties_df[ - [ - "landlord_property_id", "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", - "heating", "windows", "current_epc_rating", "current_sap_points", "total_floor_area", "number_of_rooms", - "id" + df = ( + properties_df[ + [ + "landlord_property_id", + "property_id", + "uprn", + "address", + "postcode", + "property_type", + "walls", + "roof", + "heating", + "windows", + "current_epc_rating", + "current_sap_points", + "total_floor_area", + "number_of_rooms", + "id", + ] ] - ].merge( - recommendations_measures_pivot, how="left", on="property_id" - ).merge( - post_install_sap, how="left", on="property_id" + .merge(recommendations_measures_pivot, how="left", on="property_id") + .merge(post_install_sap, how="left", on="property_id") ) # df = df.drop(columns=["property_id"]) @@ -266,21 +292,25 @@ for scenario_id in SCENARIOS: df["predicted_post_works_sap"] = df["current_sap_points"] + df["sap_points"] df["predicted_post_works_sap"] = df["predicted_post_works_sap"] - df["predicted_post_works_epc"] = df["predicted_post_works_sap"].apply(lambda x: sap_to_epc(x)) + df["predicted_post_works_epc"] = df["predicted_post_works_sap"].apply( + lambda x: sap_to_epc(x) + ) df["uprn"] = df["uprn"].astype(str) relevant_plans = plans_df[plans_df["scenario_id"] == scenario_id] df2 = df.merge( - relevant_plans[["property_id", "post_sap_points", "post_epc_rating"]], how="left", on="property_id", - suffixes=("", "_plan") + relevant_plans[["property_id", "post_sap_points", "post_epc_rating"]], + how="left", + on="property_id", + suffixes=("", "_plan"), ) print(df2["predicted_post_works_epc"].value_counts()) print(df2["post_epc_rating"].value_counts()) z = df2[ - (df2["predicted_post_works_epc"] != "D") & - (df2["post_epc_rating"].astype(str) == "Epc.D") - ] + (df2["predicted_post_works_epc"] != "D") + & (df2["post_epc_rating"].astype(str) == "Epc.D") + ] df2["predicted_post_works_epc"].value_counts() 
df2["post_epc_rating"].astype(str).value_counts() @@ -295,183 +325,6 @@ for scenario_id in SCENARIOS: df[df["predicted_post_works_sap"] == ""] # Create excel to store to - filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " - f"Project/Final SAL/scenarios/{scenario_names[scenario_id]} - 20250114 final.xlsx") + filename = f"{scenario_names[scenario_id]} - 20250113 final.xlsx" with pd.ExcelWriter(filename) as writer: df.to_excel(writer, sheet_name="properties", index=False) - - -# asset_list = pd.DataFrame(asset_list) -# asset_list = asset_list.rename( -# columns={ -# "postcode": "domna_postcode" -# } -# ) -# if "domna_full_address": -# # For Peabody -# asset_list["domna_full_address"] = asset_list["domna_address_1"] -# -# asset_list = asset_list[["domna_full_address", "domna_postcode", "epc_os_uprn", ]].copy() -# asset_list = asset_list.rename(columns={"epc_os_uprn": "uprn"}) -# asset_list["uprn"] = asset_list["uprn"].astype("Int64").astype(str) -# asset_list = asset_list.merge( -# df.drop(columns=["address", "postcode", "property_type", "total_floor_area"]), -# how="left", -# on="uprn" -# ) - - -# Get conservation area data from property details spatial. 
based on the UPRNs -def get_conservation_area_data(uprns): - session = sessionmaker(bind=db_engine)() - session.begin() - - # Query to get conservation area data - spatial_query = session.query( - PropertyDetailsSpatial - ).filter( - PropertyDetailsSpatial.uprn.in_(uprns) # Filter by UPRNs - ).all() - - # Transform spatial data to include all fields dynamically - spatial_data = [ - {col.name: getattr(spatial, col.name) for col in PropertyDetailsSpatial.__table__.columns} - for spatial in spatial_query - ] - - session.close() - return pd.DataFrame(spatial_data) - - -uprns = asset_list[ - ~pd.isna(asset_list["uprn"]) & (asset_list["uprn"] != "") - ]["uprn"].astype(int).unique().tolist() -conservation_area_data = get_conservation_area_data(uprns) -conservation_area_data["uprn"] = conservation_area_data["uprn"].astype(str) -asset_list = asset_list.merge( - conservation_area_data[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]], - how="left", - on="uprn" -) - -# For exporting -df.to_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lincs Rural/EPC C -without floors proposed measures - " - "with ID.xlsx", - index=False -) -# asset_list.to_excel( -# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lincs Rural/epc_measures.xlsx", -# index=False -# ) - -condition_costs = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/sfr/Spring JV/Condition costs.xlsx", - sheet_name="Prices - Khalim", - header=35 -) -# Remove unnamed columns and reset index -condition_costs = condition_costs.loc[:, ~condition_costs.columns.str.contains('^Unnamed')] -condition_costs = condition_costs.reset_index(drop=True) - - -# We now estimate condition cost -def simulate_condition(asset_list, condition_costs): - """ - This function is for testing, and will simulate condition cost from 1-10 for each property to see what the - costing array looks like. 
- :param df: - :return: - """ - - condition_df = [] - for _, row in asset_list.iterrows(): - - n_bathrooms = row["bathrooms"] - - conditions = {} - for condition in reversed(range(1, 11)): - condition_cost = condition_costs[ - condition_costs["Condition"] == condition - ].drop(columns=["Condition"]).iloc[0] - - # Each cost is scaled by floor area - condition_cost = condition_cost * row["total_floor_area"] - condition_cost["Bathroom"] = condition_cost["Bathroom"] * n_bathrooms - - total_condition_cost = condition_cost.sum() - conditions["Condition " + str(condition)] = (total_condition_cost) - - condition_df.append( - { - "uprn": row["uprn"], - **conditions - } - ) - - condition_df = pd.DataFrame(condition_df) - - asset_list = asset_list.merge( - condition_df, - how="left", - on="uprn" - ) - - return asset_list - - -# asset_list = simulate_condition(asset_list, condition_costs) - -# We calculate the condition cost based on the condition -for _, row in asset_list.iterrows(): - - condition = row["condition_score"] - if condition in [None, ""]: - continue - condition = int(float(condition)) - - condition_cost = condition_costs[ - condition_costs["Condition"] == condition - ].drop(columns=["Condition"]).iloc[0] - - # Each cost is scaled by floor area - condition_cost = condition_cost * float(row["total_floor_area"]) - n_bathrooms = row["n_bathrooms"] - condition_cost["Bathroom"] = condition_cost["Bathroom"] * float(n_bathrooms) - - total_condition_cost = condition_cost.sum() - asset_list.loc[asset_list["uprn"] == row["uprn"], "domna_condition_cost"] = total_condition_cost - -# Store output -asset_list.to_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/sfr/Spring JV/20250624_portfolio_retrofit_packages.xlsx", - index=False -) - -condition_cost_comparison = asset_list[ - ["condition_score", "decoration_sum_min ", "decoration_sum_max", "domna_condition_cost"] -] - -# Testing -plans_df.head() - -example = pd.read_excel( - 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final " - "SAL/scenarios/EPC C - no solid floor, no EWI or IWI, ashp 3.0 - 20250114 final.xlsx" -) - -plans_df2 = plans_df.merge( - properties_df[["property_id", "landlord_property_id"]], - left_on="property_id", - right_on="property_id", - how="left" -) - -plans_df2 = plans_df2[plans_df2["scenario_id"] == 909] - -dupes = plans_df2[plans_df2["property_id"].duplicated()] - -# merge on plans -example = example.merge( - plans_df, how="left", -)