diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index dac0087..7354027 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -5,10 +5,16 @@ "remoteUser": "vscode", "workspaceFolder": "/workspaces/survey-extractor", "postStartCommand": "bash .devcontainer/post-install.sh", + + "features": { + // "ghcr.io/devcontainers/features/ssh-agent:1": {} + }, + "mounts": [ - // Optional, just makes getting from Downloads (local env) easier + // Optional convenience mount "source=${localEnv:HOME},target=/workspaces/home,type=bind" ], + "customizations": { "vscode": { "settings": { @@ -28,3 +34,4 @@ } } } + diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 24893e8..a0d477b 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -47,3 +47,4 @@ networks: volumes: postgres-data: + diff --git a/.github/workflows/actions/lambda-deploy/action.yml b/.github/workflows/actions/lambda-deploy/action.yml index ba19c67..3ca0fc8 100644 --- a/.github/workflows/actions/lambda-deploy/action.yml +++ b/.github/workflows/actions/lambda-deploy/action.yml @@ -2,7 +2,7 @@ name: "Build and Push Lambda Image to ECR" description: "Reusable action for building and pushing lambda Docker image to ECR" inputs: - lambda_name: + ecr_name: description: "Lambda name / ECR repo name" required: true dockerfile_path: @@ -66,8 +66,8 @@ runs: - name: Build and push Docker image shell: bash run: | - IMAGE_URI=${{ steps.login-ecr.outputs.registry }}/${{ inputs.lambda_name }}:${{ steps.set_tag.outputs.tag }} - echo "Building Docker image for ${{ inputs.lambda_name }}..." + IMAGE_URI=${{ steps.login-ecr.outputs.registry }}/${{ inputs.ecr_name }}:${{ steps.set_tag.outputs.tag }} + echo "Building Docker image for ${{ inputs.ecr_name }}..." docker build -t $IMAGE_URI -f ${{ inputs.dockerfile_path }} . echo "Pushing to ECR..." 
diff --git a/.github/workflows/actions/terraform-deploy/action.yml b/.github/workflows/actions/terraform-deploy/action.yml index 685a0ac..5613329 100644 --- a/.github/workflows/actions/terraform-deploy/action.yml +++ b/.github/workflows/actions/terraform-deploy/action.yml @@ -52,3 +52,4 @@ runs: working-directory: ${{ inputs.working_directory }} shell: bash run: terraform apply -auto-approve tfplan + diff --git a/.github/workflows/hubspot_surveyed_needs_sign_off.yml b/.github/workflows/hubspot_surveyed_needs_sign_off.yml index cd4db7e..ac747d8 100644 --- a/.github/workflows/hubspot_surveyed_needs_sign_off.yml +++ b/.github/workflows/hubspot_surveyed_needs_sign_off.yml @@ -1,29 +1,29 @@ -name: Surveyed Needs Sign Off Script -on: - schedule: - # - cron: '0 17 * * 1-5' - workflow_dispatch: +# name: Surveyed Needs Sign Off Script +# on: +# schedule: +# # - cron: '0 17 * * 1-5' +# workflow_dispatch: -jobs: - surveyed-needs-sign-off: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 +# jobs: +# surveyed-needs-sign-off: +# runs-on: ubuntu-22.04 +# steps: +# - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.12' +# - name: Set up Python +# uses: actions/setup-python@v4 +# with: +# python-version: '3.12' - - name: Install dependencies - run: | - pip install poetry - poetry install --no-root +# - name: Install dependencies +# run: | +# pip install poetry +# poetry install --no-root - - name: run script - run: | - pwd - ls -la - poetry run python etl/hubspot_surveyed_needs_sign_off.py - env: - PYTHONPATH: ${{ github.workspace }} \ No newline at end of file +# - name: run script +# run: | +# pwd +# ls -la +# poetry run python etl/hubspot_surveyed_needs_sign_off.py +# env: +# PYTHONPATH: ${{ github.workspace }} \ No newline at end of file diff --git a/.github/workflows/lambda_main.yml b/.github/workflows/lambda_main.yml index 6330833..15f5991 100644 --- a/.github/workflows/lambda_main.yml +++ 
b/.github/workflows/lambda_main.yml @@ -2,7 +2,7 @@ name: Lambda Main Workflow on: push: - branches: [main, feautre/additional_features_in_condition_report_extraction] + branches: [main, feautre/walthamforest_etl] env: AWS_REGION: eu-west-2 @@ -34,7 +34,7 @@ jobs: - name: Build and deploy Lambda example uses: ./.github/workflows/actions/lambda-deploy with: - lambda_name: lambda_example + ecr_name: lambda_example dockerfile_path: ./deployment/lambda/lambda_example/docker/Dockerfile ecr_tf_dir: ./deployment/lambda/lambda_example/docker/ lambda_tf_dir: ./deployment/lambda/lambda_example/ @@ -57,7 +57,7 @@ jobs: - name: Build and deploy Extractor & Loader Lambda uses: ./.github/workflows/actions/lambda-deploy with: - lambda_name: extractor_and_loader + ecr_name: extractor_and_loader dockerfile_path: ./deployment/lambda/extractor_and_loader/docker/Dockerfile ecr_tf_dir: ./deployment/lambda/extractor_and_loader/docker/ lambda_tf_dir: ./deployment/lambda/extractor_and_loader/ @@ -67,3 +67,26 @@ jobs: git-sha: ${{ github.sha }} git-ref: ${{ github.ref_name }} + + walthamforest-etl: + runs-on: ubuntu-latest + needs: shared-lambda-terraform + permissions: + id-token: write + contents: read + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Build and deploy WalthamForest ETL + uses: ./.github/workflows/actions/lambda-deploy + with: + ecr_name: walthamforest_etl_adhoc_ecr + dockerfile_path: ./deployment/lambda/walthamforest_etl/docker/Dockerfile + ecr_tf_dir: ./deployment/lambda/walthamforest_etl/docker/ + lambda_tf_dir: ./deployment/lambda/walthamforest_etl/ + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + git-sha: ${{ github.sha }} + git-ref: ${{ github.ref_name }} \ No newline at end of file diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 7109d8f..5d93010 100644 --- a/.github/workflows/pytest.yml +++ 
b/.github/workflows/pytest.yml @@ -1,42 +1,42 @@ -name: Run Pytest +# name: Run Pytest -on: - push: - branches: - - '**' # Run on all branches - pull_request: - branches: - - main +# on: +# push: +# branches: +# - '**' # Run on all branches +# pull_request: +# branches: +# - main -jobs: - etl-unit-tests: - runs-on: ubuntu-22.04 +# jobs: +# etl-unit-tests: +# runs-on: ubuntu-22.04 - steps: - - name: Checkout Repository - uses: actions/checkout@v4 +# steps: +# - name: Checkout Repository +# uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' +# - name: Set up Python +# uses: actions/setup-python@v5 +# with: +# python-version: '3.12' - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.12' +# - name: Set up Python +# uses: actions/setup-python@v4 +# with: +# python-version: '3.12' - - name: Install dependencies - run: | - pip install poetry - poetry install --no-root +# - name: Install dependencies +# run: | +# pip install poetry +# poetry install --no-root - - name: Run Tests - run: | - poetry run pytest -W ignore::DeprecationWarning - env: - PYTHONPATH: ${{ github.workspace }} +# - name: Run Tests +# run: | +# poetry run pytest -W ignore::DeprecationWarning +# env: +# PYTHONPATH: ${{ github.workspace }} - continue-on-error: ${{ github.event_name == 'push' && github.ref != 'refs/heads/main' }} +# continue-on-error: ${{ github.event_name == 'push' && github.ref != 'refs/heads/main' }} diff --git a/.github/workflows/scis_invoice_calculator.yml b/.github/workflows/scis_invoice_calculator.yml index 66a5461..d0739f7 100644 --- a/.github/workflows/scis_invoice_calculator.yml +++ b/.github/workflows/scis_invoice_calculator.yml @@ -1,39 +1,39 @@ -name: SCIS Invoice Calculator -on: - schedule: - - cron: '0 6 * * *' - workflow_dispatch: +# name: SCIS Invoice Calculator +# on: +# schedule: +# - cron: '0 6 * * *' +# workflow_dispatch: -jobs: - scis_invoice_calculator: - runs-on: 
ubuntu-22.04 - steps: - - uses: actions/checkout@v3 +# jobs: +# scis_invoice_calculator: +# runs-on: ubuntu-22.04 +# steps: +# - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.12' +# - name: Set up Python +# uses: actions/setup-python@v4 +# with: +# python-version: '3.12' - - name: Install dependencies - run: | - pip install poetry - poetry install --no-root +# - name: Install dependencies +# run: | +# pip install poetry +# poetry install --no-root - - name: run script - run: | - bash scis_invoice.sh - env: - PYTHONPATH: ${{ github.workspace }} - SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID: ${{ secrets.SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID }} - JJC_SERVICE_SHAREPOINT_ID: ${{ secrets.JJC_SERVICE_SHAREPOINT_ID }} - BAXTER_KELLY_SERVICE_SHAREPOINT_ID: ${{ secrets.BAXTER_KELLY_SERVICE_SHAREPOINT_ID }} - SGEC_SERVICE_SHAREPOINT_ID: ${{ secrets.SGEC_SERVICE_SHAREPOINT_ID }} - SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }} - SHAREPOINT_CLIENT_SECRET: ${{ secrets.SHAREPOINT_CLIENT_SECRET }} - SHAREPOINT_TENANT_ID: ${{ secrets.SHAREPOINT_TENANT_ID }} - - name: Upload Excel file - uses: actions/upload-artifact@v4 - with: - name: my-excel-file - path: survey_data.xlsx \ No newline at end of file +# - name: run script +# run: | +# bash scis_invoice.sh +# env: +# PYTHONPATH: ${{ github.workspace }} +# SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID: ${{ secrets.SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID }} +# JJC_SERVICE_SHAREPOINT_ID: ${{ secrets.JJC_SERVICE_SHAREPOINT_ID }} +# BAXTER_KELLY_SERVICE_SHAREPOINT_ID: ${{ secrets.BAXTER_KELLY_SERVICE_SHAREPOINT_ID }} +# SGEC_SERVICE_SHAREPOINT_ID: ${{ secrets.SGEC_SERVICE_SHAREPOINT_ID }} +# SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }} +# SHAREPOINT_CLIENT_SECRET: ${{ secrets.SHAREPOINT_CLIENT_SECRET }} +# SHAREPOINT_TENANT_ID: ${{ secrets.SHAREPOINT_TENANT_ID }} +# - name: Upload Excel file +# uses: actions/upload-artifact@v4 +# with: +# 
name: my-excel-file +# path: survey_data.xlsx \ No newline at end of file diff --git a/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf b/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf index ef1c07c..6e3ecbf 100644 --- a/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf +++ b/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf @@ -6,17 +6,19 @@ data "aws_iam_role" "lambda_exec_role" { # Reference existing ECR repository data "aws_ecr_repository" "extractor_and_loader" { name = "extractor_and_loader" + } # SQS queue for extractor_and_loader resource "aws_sqs_queue" "extractor_and_loader_queue" { name = "extractor-loader-queue" + visibility_timeout_seconds = 1800 # 30 minutes (>= 300s and ~6x Lambda timeout) } -# IAM policy specific to this Lambda +# Custom IAM policy specific to lambda_example resource "aws_iam_policy" "extractor_loader_policy" { - name = "extractor-loader-policy" + name = "extractor_loader_policy" policy = jsonencode({ Version = "2012-10-17", @@ -26,7 +28,9 @@ resource "aws_iam_policy" "extractor_loader_policy" { Action = [ "sqs:ReceiveMessage", "sqs:DeleteMessage", - "sqs:GetQueueAttributes" + "sqs:GetQueueAttributes", + "sqs:GetQueueUrl", + "sqs:ChangeMessageVisibility" ], Resource = aws_sqs_queue.extractor_and_loader_queue.arn }, @@ -55,7 +59,7 @@ resource "aws_iam_role_policy_attachment" "extractor_loader_policy_attach" { # Lambda function resource "aws_lambda_function" "extractor_and_loader" { - function_name = "extractor-and-loader" + function_name = "extractor-and-loader-lambda" role = data.aws_iam_role.lambda_exec_role.arn package_type = "Image" image_uri = "${data.aws_ecr_repository.extractor_and_loader.repository_url}:${var.lambda_image_tag}" diff --git a/deployment/lambda/lambda_example/lambda_example_and_config.tf b/deployment/lambda/lambda_example/lambda_example_and_config.tf index 4f87771..5e52a05 100644 --- 
a/deployment/lambda/lambda_example/lambda_example_and_config.tf +++ b/deployment/lambda/lambda_example/lambda_example_and_config.tf @@ -25,7 +25,9 @@ resource "aws_iam_policy" "lambda_example_policy" { Action = [ "sqs:ReceiveMessage", "sqs:DeleteMessage", - "sqs:GetQueueAttributes" + "sqs:GetQueueAttributes", + "sqs:GetQueueUrl", + "sqs:ChangeMessageVisibility" ], Resource = aws_sqs_queue.lambda_example_queue.arn }, @@ -47,6 +49,8 @@ resource "aws_iam_policy" "lambda_example_policy" { }) } + + resource "aws_iam_role_policy_attachment" "lambda_example_policy_attach" { role = data.aws_iam_role.lambda_exec_role.name policy_arn = aws_iam_policy.lambda_example_policy.arn diff --git a/deployment/lambda/walthamforest_etl/docker/.dockerignore b/deployment/lambda/walthamforest_etl/docker/.dockerignore new file mode 100644 index 0000000..d587d34 --- /dev/null +++ b/deployment/lambda/walthamforest_etl/docker/.dockerignore @@ -0,0 +1,21 @@ +# Ignore junk and large files +*.pdf +*.csv +*.xml +*.parquet +*.ipynb +*.mp4 +*.mov +*.jpg +*.png +*.zip +*.tar.gz +__pycache__/ +*.pyc +*.pyo +*.pyd +build/ +dist/ +.etl_cache/ +tests/ +docs/ diff --git a/deployment/lambda/walthamforest_etl/docker/Dockerfile b/deployment/lambda/walthamforest_etl/docker/Dockerfile new file mode 100644 index 0000000..cdd1f8a --- /dev/null +++ b/deployment/lambda/walthamforest_etl/docker/Dockerfile @@ -0,0 +1,25 @@ +FROM public.ecr.aws/lambda/python:3.12 + +# Install Poetry (you could pin a version if you like) +RUN curl -sSL https://install.python-poetry.org | python3 - + +# Add Poetry to PATH +ENV PATH="/root/.local/bin:$PATH" + +# Set working directory +WORKDIR /var/task + +# Copy Poetry files first to leverage Docker layer caching +COPY pyproject.toml poetry.lock README.md ./ +COPY etl/ etl/ + + +# Install dependencies into /var/task +RUN poetry config virtualenvs.create false \ + && poetry install --only main --no-interaction --no-ansi + +# Copy app code +COPY 
deployment/lambda/extractor_and_loader/docker/app.py ./ + +# Set Lambda handler +CMD ["app.handler"] \ No newline at end of file diff --git a/deployment/lambda/walthamforest_etl/docker/app.py b/deployment/lambda/walthamforest_etl/docker/app.py new file mode 100644 index 0000000..535ddd0 --- /dev/null +++ b/deployment/lambda/walthamforest_etl/docker/app.py @@ -0,0 +1,124 @@ +import pandas as pd +import json +from pprint import pprint +import os +import copy +from collections import defaultdict +from typing import List, Dict, Any, Union, Optional + +def process_complex(sheet_name, group_key="ADDRESS"): + df = pd.read_excel("../../../../../home/Downloads/data.xlsx", sheet_name=sheet_name) + + element_cols = [ + "ELEMENT GROUP", "ELEMENT CODE", "ELEMENT CODE DESCRIPTION", + "ATTRIBUTE CODE", "ATTRIBUTE CODE DESCRIPTION", + "ELEMENT DATE VALUE", "ELEMENT NUMERIC VALUE", + "ELEMENT TEXT VALUE", "QUANTITY", + "INSTALL DATE", "REMAINING LIFE", "ELEMENT COMMENTS" + ] + + property_cols = [ + "PROP REF", "ADDRESS", "OWNERSHIP", + "PROP STATUS", "PROP TYPE", "PROP SUB TYPE" + ] + + # Prepare output + records = [] + + # Loop through unique values in group_key (ADDRESS or BLOCK_CODE) + for val in df[group_key].unique(): + g = df[df[group_key] == val] # subset + + property_info = g[property_cols].drop_duplicates().iloc[0].to_dict() + + # build elements dict keyed by ELEMENT CODE DESCRIPTION + elements_dict = {} + for _, row in g[element_cols].drop_duplicates().iterrows(): + key = row["ELEMENT CODE DESCRIPTION"] # could also use "ELEMENT CODE" + elements_dict[key] = row.to_dict() + + records.append({ + group_key: val, + "property_info": property_info, + "elements": elements_dict + }) + + return records + +def process_simple(sheet_name): + df = pd.read_excel("../../../../../home/Downloads/data.xlsx", sheet_name=sheet_name) + + records = [] + + for address in df["Address"].unique(): + g = df[df["Address"] == address].drop_duplicates() # subset for that address + row = g.iloc[0] # 
take first row if multiple + + # build dict of all columns except Address + elements_dict = row.drop(labels=["Address"]).to_dict() + + records.append({ + "ADDRESS": address, + "to_add": elements_dict + }) + + return records + + +def combine_records_by_address( + asset_records: List[Dict[str, Any]], + simple_records: List[Dict[str, Any]], + dest_key: str = "to_add", + unique_identifier="Address" +) -> List[Dict[str, Any]]: + """ + Merge process_house_asset_data() and process_simple() results by ADDRESS. + All columns from simple_records['to_add'] will be merged under dest_key. + """ + # Index inputs by ADDRESS + asset_by_addr = {r["ADDRESS"]: r for r in asset_records} + simple_by_addr = {r["ADDRESS"]: r for r in simple_records} + + merged: List[Dict[str, Any]] = [] + + # Use union of addresses from both sources + all_addresses = set(asset_by_addr) | set(simple_by_addr) + + for addr in sorted(all_addresses): + base = copy.deepcopy(asset_by_addr.get(addr, {"ADDRESS": addr})) + simple = simple_by_addr.get(addr) + + if simple: + base[dest_key] = simple.get("to_add", {}) + + merged.append(base) + + return merged + +def combine_records_for_flats(assets: dict, simple: list) -> dict: + """Attach BLOCK_INFO (from simple[0]) to each asset in assets.""" + if not simple or not isinstance(simple[0], dict): + return assets # nothing to add + + block_info = simple[0] + + for record in assets: + # Make sure record is a dict + record.update({"BLOCK_INFO": block_info}) + + return assets + +def handler(event, context): + # read data for houses only + assets = process_complex("Houses Asset Data") + simple = process_simple("Houses") + houses = combine_records_by_address(assets, simple, dest_key="EPC_DATA") + + # read data for flats + assets = process_complex("Chingford Rd 236-256 Properties") + simple = process_complex("CHINGFORD ROAD 236-254 Asset Bl", "BLOCK_CODE") + flats = combine_records_for_flats(assets, simple) + + + + diff --git 
a/deployment/lambda/walthamforest_etl/docker/ecr.tf b/deployment/lambda/walthamforest_etl/docker/ecr.tf new file mode 100644 index 0000000..503bb20 --- /dev/null +++ b/deployment/lambda/walthamforest_etl/docker/ecr.tf @@ -0,0 +1,63 @@ +# ECR repo +resource "aws_ecr_repository" "walthamforest_etl_adhoc_ecr" { + name = "walthamforest_etl_adhoc_ecr" +} + +# ECR policy to allow Lambda access +resource "aws_ecr_repository_policy" "walthamforest_etl_adhoc_ecr_access" { + repository = aws_ecr_repository.walthamforest_etl_adhoc_ecr.name + + policy = jsonencode({ + Version = "2008-10-17", + Statement = [{ + Sid = "AllowLambdaPull", + Effect = "Allow", + Principal = { + Service = "lambda.amazonaws.com" + }, + Action = [ + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage", + "ecr:BatchCheckLayerAvailability" + ] + }] + }) +} + + + +# ECR lifecycle policy to delete tagged images older than 14 days +resource "aws_ecr_lifecycle_policy" "walthamforest_etl_adhoc_loader_lifecycle" { + repository = aws_ecr_repository.walthamforest_etl_adhoc_ecr.name + + policy = jsonencode({ + "rules": [ + { + "rulePriority": 2, + "description": "Expire images older than 14 days", + "selection": { + "tagStatus": "untagged", + "countType": "sinceImagePushed", + "countUnit": "days", + "countNumber": 1 + }, + "action": { + "type": "expire" + } + }, + { + "rulePriority": 1, + "description": "Keep last 5 images", + "selection": { + "tagStatus": "tagged", + "tagPrefixList": ["feature"], + "countType": "imageCountMoreThan", + "countNumber": 5 + }, + "action": { + "type": "expire" + } + } + ] + }) +} \ No newline at end of file diff --git a/deployment/lambda/walthamforest_etl/docker/main.tf b/deployment/lambda/walthamforest_etl/docker/main.tf new file mode 100644 index 0000000..e69de29 diff --git a/deployment/lambda/walthamforest_etl/docker/provider.tf b/deployment/lambda/walthamforest_etl/docker/provider.tf new file mode 100644 index 0000000..e41dcbf --- /dev/null +++ 
b/deployment/lambda/walthamforest_etl/docker/provider.tf @@ -0,0 +1,15 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 6.3.0" + } + } + backend "s3" { + bucket = "survey-extractor-tf-state" + region = "eu-west-2" + key = "env:/dev/lambda/ecr/walthamforest_etl.tfstate" + } + + required_version = ">= 1.2.0" +} diff --git a/deployment/lambda/walthamforest_etl/main.tf b/deployment/lambda/walthamforest_etl/main.tf new file mode 100644 index 0000000..e69de29 diff --git a/deployment/lambda/walthamforest_etl/provider.tf b/deployment/lambda/walthamforest_etl/provider.tf new file mode 100644 index 0000000..7100c0d --- /dev/null +++ b/deployment/lambda/walthamforest_etl/provider.tf @@ -0,0 +1,15 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 6.3.0" + } + } + backend "s3" { + bucket = "survey-extractor-tf-state" + region = "eu-west-2" + key = "env:/dev/lambda/eachlambda/walthamforest_etl_lambda.tfstate" + } + + required_version = ">= 1.2.0" +} diff --git a/deployment/lambda/walthamforest_etl/vars.tf b/deployment/lambda/walthamforest_etl/vars.tf new file mode 100644 index 0000000..ecdf359 --- /dev/null +++ b/deployment/lambda/walthamforest_etl/vars.tf @@ -0,0 +1,5 @@ +variable "lambda_image_tag" { + description = "Docker image tag (e.g. 
GitHub SHA)" + type = string + default = "local-dev-latest" +} \ No newline at end of file diff --git a/deployment/lambda/walthamforest_etl/walthamforest_etl_lambda.tf b/deployment/lambda/walthamforest_etl/walthamforest_etl_lambda.tf new file mode 100644 index 0000000..a4eedc8 --- /dev/null +++ b/deployment/lambda/walthamforest_etl/walthamforest_etl_lambda.tf @@ -0,0 +1,83 @@ +# Reference existing IAM role +data "aws_iam_role" "lambda_exec_role" { + name = "lambda-exec-role" +} + +# Reference existing ECR repository +data "aws_ecr_repository" "walthamforest_etl_adhoc_ecr" { + name = "walthamforest_etl_adhoc_ecr" +} + +# SQS queue +resource "aws_sqs_queue" "walthamforest_etl_adhoc_queue" { + name = "walthamforest_etl_adhoc-queue" + visibility_timeout_seconds = 1800 # 30 minutes (>= 300s and ~6x Lambda timeout) +} + + +# Custom IAM policy specific to lambda_example +resource "aws_iam_policy" "walthamforest_etl_adhoc_policy" { + name = "walthamforest_adhoc_policy_lambda" + + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Action = [ + "sqs:ReceiveMessage", + "sqs:DeleteMessage", + "sqs:GetQueueAttributes", + "sqs:GetQueueUrl", + "sqs:ChangeMessageVisibility" + ], + Resource = aws_sqs_queue.walthamforest_etl_adhoc_queue.arn + }, + { + Effect = "Allow", + Action = [ + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage", + "ecr:BatchCheckLayerAvailability" + ], + Resource = data.aws_ecr_repository.walthamforest_etl_adhoc_ecr.arn + }, + { + Effect = "Allow", + Action = ["ecr:GetAuthorizationToken"], + Resource = "*" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "walthamforest_etl_adhoc_policy_attach" { + role = data.aws_iam_role.lambda_exec_role.name + policy_arn = aws_iam_policy.walthamforest_etl_adhoc_policy.arn +} + +# Lambda function +resource "aws_lambda_function" "walthamforest_etl_adhoc" { + function_name = "walthamforest_etl_adhoc" + role = data.aws_iam_role.lambda_exec_role.arn + package_type = "Image" + 
image_uri = "${data.aws_ecr_repository.walthamforest_etl_adhoc_ecr.repository_url}:${var.lambda_image_tag}" + # Increase timeout (max 900 sec / 15 min) + # timeout = 300 # e.g. 5 minutes + + # Increase memory (default 128 MB) + memory_size = 2048 # try 1024 or 2048 MB to start + + # environment { + # variables = { + # DATABASE_URL = "postgresql://postgres:makingwarmhomes@terraform-20250331175522503500000002.cdgzupxvdyp0.eu-west-2.rds.amazonaws.com:5432/surveyDB" + # } + # } +} + +# SQS trigger +resource "aws_lambda_event_source_mapping" "walthamforest_etl_adhoc_trigger" { + event_source_arn = aws_sqs_queue.walthamforest_etl_adhoc_queue.arn + function_name = aws_lambda_function.walthamforest_etl_adhoc.arn + batch_size = 1 +} diff --git a/etl/month_end_automation_wave_3_layout.py b/etl/month_end_automation_wave_3_layout.py index 8b18355..74038ad 100644 --- a/etl/month_end_automation_wave_3_layout.py +++ b/etl/month_end_automation_wave_3_layout.py @@ -256,17 +256,17 @@ for board, all_records in board_to_record.items(): filtered_dfs.append(design2) # Design repetitive simple - design3 = get_df(design, "design invoice type", ["archetype (simple)"], "Design Archetype repetitive") + design3 = get_df(design, "design invoice type", ["repetitive (simple)"], "Design repetitive simple") if not design1.empty: filtered_dfs.append(design3) # Design repetitive complex - design4 = get_df(design, "design invoice type", ["archetype (complex)"], "Design Archetype complex") + design4 = get_df(design, "design invoice type", ["repetitive (complex)"], "Design Repetitive complex") if not design1.empty: filtered_dfs.append(design4) # Design not specified - all_filtered = pd.concat([design1, design2, design3, design4], ignore_index=True) + all_filtered = pd.concat([df for df in (design1, design2, design3, design4) if not df.empty]) design_remaining = design.loc[~design.index.isin(all_filtered.index)] if not design_remaining.empty: design_remaining["job_type"] = "design type not specified" 
diff --git a/etl/month_end_automation_wave_accent_housing.py b/etl/month_end_automation_wave_accent_housing.py index b13d155..8e04c38 100644 --- a/etl/month_end_automation_wave_accent_housing.py +++ b/etl/month_end_automation_wave_accent_housing.py @@ -15,15 +15,21 @@ board_ids = [ ] empty = "Rate card info missing" - +junte = "ask junte to update" rate_card_data_2502_accent_housing = { "job_type": [ - "First half of MTP", "Second half of MTP", "Full MTP" + "First half of MTP", "Second half of MTP", "Full MTP", "Design Archetype Complex", + "Design Archetype Simple", "Design Repetitive Complex", "Design Repetitive Simple", + "Design Revision", "design type not specified", + ], "rate": [ - 150, 130, 280 + 150, 130, 280, junte, junte, junte, junte, junte, "please ask andreas" ] } +# TODO: +# Design Revision +# Design Check with Andreas rate_card_df = pd.DataFrame(rate_card_data_2502_accent_housing) @@ -91,6 +97,44 @@ full_cost = get_df(df, "mtp invoicing status", ["(v1) full cost mtp to invoice ( if not full_cost.empty: filtered_dfs.append(full_cost) +# Design archetype complex +design = get_df(df, "design invoicing status", ["to invoice"]) +design1 = get_df(design, "design invoice type", ["archetype (complex)"], "Design Archetype Complex") +if not design1.empty: + filtered_dfs.append(design1) + +# Design archetype simple +design2 = get_df(design, "design invoice type", ["archetype (simple)"], "Design Archetype Simple") +if not design2.empty: + filtered_dfs.append(design2) + +# Design repetitive simple +design3 = get_df(design, "design invoice type", ["repetitive (simple)"], "Design repetitive simple") +if not design3.empty: + filtered_dfs.append(design3) + +# Design repetitive complex +design4 = get_df(design, "design invoice type", ["repetitive (complex)"], "Design repetitive complex") +if not design4.empty: + filtered_dfs.append(design4) + +# Design not specified +# pd.concat raises ValueError on an empty list, so fall back to an empty slice of `design` when no typed design rows matched +all_filtered = pd.concat([d for d in (design1, design2, design3, design4) if not d.empty] or [design.iloc[0:0]]) 
+design_remaining = design.loc[~design.index.isin(all_filtered.index)].copy() + +if not design_remaining.empty: + design_remaining["job_type"] = "design type not specified" + filtered_dfs.append(design_remaining) + +# Design Revision +revision_letter = ['a', 'b', 'c', 'd'] +for letter in revision_letter: + design = get_df(df, "design revision invoice", [f"rev. {letter} to invoice"], "Design Revision") + if not design.empty: + filtered_dfs.append(design) + + final_df = pd.concat(filtered_dfs).reset_index(drop=True) final_df["job_type"] = final_df["job_type"].str.lower()