Merge pull request #85 from Hestia-Homes/feautre/walthamforest_etl

Feautre/walthamforest etl
This commit is contained in:
Jun-te Kim 2025-09-16 17:16:00 +01:00 committed by GitHub
commit a52f3dfb5f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 544 additions and 110 deletions

View file

@ -5,10 +5,16 @@
"remoteUser": "vscode",
"workspaceFolder": "/workspaces/survey-extractor",
"postStartCommand": "bash .devcontainer/post-install.sh",
"features": {
// "ghcr.io/devcontainers/features/ssh-agent:1": {}
},
"mounts": [
// Optional, just makes getting from Downloads (local env) easier
// Optional convenience mount
"source=${localEnv:HOME},target=/workspaces/home,type=bind"
],
"customizations": {
"vscode": {
"settings": {
@ -28,3 +34,4 @@
}
}
}

View file

@ -47,3 +47,4 @@ networks:
volumes:
postgres-data:

View file

@ -2,7 +2,7 @@ name: "Build and Push Lambda Image to ECR"
description: "Reusable action for building and pushing lambda Docker image to ECR"
inputs:
lambda_name:
ecr_name:
description: "Lambda name / ECR repo name"
required: true
dockerfile_path:
@ -66,8 +66,8 @@ runs:
- name: Build and push Docker image
shell: bash
run: |
IMAGE_URI=${{ steps.login-ecr.outputs.registry }}/${{ inputs.lambda_name }}:${{ steps.set_tag.outputs.tag }}
echo "Building Docker image for ${{ inputs.lambda_name }}..."
IMAGE_URI=${{ steps.login-ecr.outputs.registry }}/${{ inputs.ecr_name }}:${{ steps.set_tag.outputs.tag }}
echo "Building Docker image for ${{ inputs.ecr_name }}..."
docker build -t $IMAGE_URI -f ${{ inputs.dockerfile_path }} .
echo "Pushing to ECR..."

View file

@ -52,3 +52,4 @@ runs:
working-directory: ${{ inputs.working_directory }}
shell: bash
run: terraform apply -auto-approve tfplan

View file

@ -1,29 +1,29 @@
name: Surveyed Needs Sign Off Script
on:
schedule:
# - cron: '0 17 * * 1-5'
workflow_dispatch:
# name: Surveyed Needs Sign Off Script
# on:
# schedule:
# # - cron: '0 17 * * 1-5'
# workflow_dispatch:
jobs:
surveyed-needs-sign-off:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
# jobs:
# surveyed-needs-sign-off:
# runs-on: ubuntu-22.04
# steps:
# - uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
# - name: Set up Python
# uses: actions/setup-python@v4
# with:
# python-version: '3.12'
- name: Install dependencies
run: |
pip install poetry
poetry install --no-root
# - name: Install dependencies
# run: |
# pip install poetry
# poetry install --no-root
- name: run script
run: |
pwd
ls -la
poetry run python etl/hubspot_surveyed_needs_sign_off.py
env:
PYTHONPATH: ${{ github.workspace }}
# - name: run script
# run: |
# pwd
# ls -la
# poetry run python etl/hubspot_surveyed_needs_sign_off.py
# env:
# PYTHONPATH: ${{ github.workspace }}

View file

@ -2,7 +2,7 @@ name: Lambda Main Workflow
on:
push:
branches: [main, feautre/additional_features_in_condition_report_extraction]
branches: [main, feautre/walthamforest_etl]
env:
AWS_REGION: eu-west-2
@ -34,7 +34,7 @@ jobs:
- name: Build and deploy Lambda example
uses: ./.github/workflows/actions/lambda-deploy
with:
lambda_name: lambda_example
ecr_name: lambda_example
dockerfile_path: ./deployment/lambda/lambda_example/docker/Dockerfile
ecr_tf_dir: ./deployment/lambda/lambda_example/docker/
lambda_tf_dir: ./deployment/lambda/lambda_example/
@ -57,7 +57,7 @@ jobs:
- name: Build and deploy Extractor & Loader Lambda
uses: ./.github/workflows/actions/lambda-deploy
with:
lambda_name: extractor_and_loader
ecr_name: extractor_and_loader
dockerfile_path: ./deployment/lambda/extractor_and_loader/docker/Dockerfile
ecr_tf_dir: ./deployment/lambda/extractor_and_loader/docker/
lambda_tf_dir: ./deployment/lambda/extractor_and_loader/
@ -67,3 +67,26 @@ jobs:
git-sha: ${{ github.sha }}
git-ref: ${{ github.ref_name }}
# Job: build the WalthamForest ETL Lambda image and deploy it through the
# repository-local lambda-deploy composite action.
walthamforest-etl:
runs-on: ubuntu-latest
# Runs only after the shared-lambda-terraform job has completed.
needs: shared-lambda-terraform
# NOTE(review): id-token: write is requested, but static access keys are
# passed to the action below - confirm whether OIDC is actually used.
permissions:
id-token: write
contents: read
steps:
- name: Checkout repo
uses: actions/checkout@v4
- name: Build and deploy WalthamForest ETL
uses: ./.github/workflows/actions/lambda-deploy
with:
# ECR repository name; used by the action to build the image URI.
ecr_name: walthamforest_etl_adhoc_ecr
dockerfile_path: ./deployment/lambda/walthamforest_etl/docker/Dockerfile
ecr_tf_dir: ./deployment/lambda/walthamforest_etl/docker/
lambda_tf_dir: ./deployment/lambda/walthamforest_etl/
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
# NOTE(review): the region is read from a secret here, while the workflow
# also declares an AWS_REGION env value - confirm the two agree.
aws-region: ${{ secrets.AWS_REGION }}
git-sha: ${{ github.sha }}
git-ref: ${{ github.ref_name }}

View file

@ -1,42 +1,42 @@
name: Run Pytest
# name: Run Pytest
on:
push:
branches:
- '**' # Run on all branches
pull_request:
branches:
- main
# on:
# push:
# branches:
# - '**' # Run on all branches
# pull_request:
# branches:
# - main
jobs:
etl-unit-tests:
runs-on: ubuntu-22.04
# jobs:
# etl-unit-tests:
# runs-on: ubuntu-22.04
steps:
- name: Checkout Repository
uses: actions/checkout@v4
# steps:
# - name: Checkout Repository
# uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
# - name: Set up Python
# uses: actions/setup-python@v5
# with:
# python-version: '3.12'
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
# - name: Set up Python
# uses: actions/setup-python@v4
# with:
# python-version: '3.12'
- name: Install dependencies
run: |
pip install poetry
poetry install --no-root
# - name: Install dependencies
# run: |
# pip install poetry
# poetry install --no-root
- name: Run Tests
run: |
poetry run pytest -W ignore::DeprecationWarning
env:
PYTHONPATH: ${{ github.workspace }}
# - name: Run Tests
# run: |
# poetry run pytest -W ignore::DeprecationWarning
# env:
# PYTHONPATH: ${{ github.workspace }}
continue-on-error: ${{ github.event_name == 'push' && github.ref != 'refs/heads/main' }}
# continue-on-error: ${{ github.event_name == 'push' && github.ref != 'refs/heads/main' }}

View file

@ -1,39 +1,39 @@
name: SCIS Invoice Calculator
on:
schedule:
- cron: '0 6 * * *'
workflow_dispatch:
# name: SCIS Invoice Calculator
# on:
# schedule:
# - cron: '0 6 * * *'
# workflow_dispatch:
jobs:
scis_invoice_calculator:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
# jobs:
# scis_invoice_calculator:
# runs-on: ubuntu-22.04
# steps:
# - uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
# - name: Set up Python
# uses: actions/setup-python@v4
# with:
# python-version: '3.12'
- name: Install dependencies
run: |
pip install poetry
poetry install --no-root
# - name: Install dependencies
# run: |
# pip install poetry
# poetry install --no-root
- name: run script
run: |
bash scis_invoice.sh
env:
PYTHONPATH: ${{ github.workspace }}
SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID: ${{ secrets.SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID }}
JJC_SERVICE_SHAREPOINT_ID: ${{ secrets.JJC_SERVICE_SHAREPOINT_ID }}
BAXTER_KELLY_SERVICE_SHAREPOINT_ID: ${{ secrets.BAXTER_KELLY_SERVICE_SHAREPOINT_ID }}
SGEC_SERVICE_SHAREPOINT_ID: ${{ secrets.SGEC_SERVICE_SHAREPOINT_ID }}
SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
SHAREPOINT_CLIENT_SECRET: ${{ secrets.SHAREPOINT_CLIENT_SECRET }}
SHAREPOINT_TENANT_ID: ${{ secrets.SHAREPOINT_TENANT_ID }}
- name: Upload Excel file
uses: actions/upload-artifact@v4
with:
name: my-excel-file
path: survey_data.xlsx
# - name: run script
# run: |
# bash scis_invoice.sh
# env:
# PYTHONPATH: ${{ github.workspace }}
# SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID: ${{ secrets.SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID }}
# JJC_SERVICE_SHAREPOINT_ID: ${{ secrets.JJC_SERVICE_SHAREPOINT_ID }}
# BAXTER_KELLY_SERVICE_SHAREPOINT_ID: ${{ secrets.BAXTER_KELLY_SERVICE_SHAREPOINT_ID }}
# SGEC_SERVICE_SHAREPOINT_ID: ${{ secrets.SGEC_SERVICE_SHAREPOINT_ID }}
# SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
# SHAREPOINT_CLIENT_SECRET: ${{ secrets.SHAREPOINT_CLIENT_SECRET }}
# SHAREPOINT_TENANT_ID: ${{ secrets.SHAREPOINT_TENANT_ID }}
# - name: Upload Excel file
# uses: actions/upload-artifact@v4
# with:
# name: my-excel-file
# path: survey_data.xlsx

View file

@ -6,17 +6,19 @@ data "aws_iam_role" "lambda_exec_role" {
# Reference existing ECR repository
data "aws_ecr_repository" "extractor_and_loader" {
name = "extractor_and_loader"
}
# SQS queue for extractor_and_loader
resource "aws_sqs_queue" "extractor_and_loader_queue" {
name = "extractor-loader-queue"
visibility_timeout_seconds = 1800 # 30 minutes (>= 300s and ~6x Lambda timeout)
}
# IAM policy specific to this Lambda
# Custom IAM policy specific to the extractor_and_loader Lambda
resource "aws_iam_policy" "extractor_loader_policy" {
name = "extractor-loader-policy"
name = "extractor_loader_policy"
policy = jsonencode({
Version = "2012-10-17",
@ -26,7 +28,9 @@ resource "aws_iam_policy" "extractor_loader_policy" {
Action = [
"sqs:ReceiveMessage",
"sqs:DeleteMessage",
"sqs:GetQueueAttributes"
"sqs:GetQueueAttributes",
"sqs:GetQueueUrl",
"sqs:ChangeMessageVisibility"
],
Resource = aws_sqs_queue.extractor_and_loader_queue.arn
},
@ -55,7 +59,7 @@ resource "aws_iam_role_policy_attachment" "extractor_loader_policy_attach" {
# Lambda function
resource "aws_lambda_function" "extractor_and_loader" {
function_name = "extractor-and-loader"
function_name = "extractor-and-loader-lambda"
role = data.aws_iam_role.lambda_exec_role.arn
package_type = "Image"
image_uri = "${data.aws_ecr_repository.extractor_and_loader.repository_url}:${var.lambda_image_tag}"

View file

@ -25,7 +25,9 @@ resource "aws_iam_policy" "lambda_example_policy" {
Action = [
"sqs:ReceiveMessage",
"sqs:DeleteMessage",
"sqs:GetQueueAttributes"
"sqs:GetQueueAttributes",
"sqs:GetQueueUrl",
"sqs:ChangeMessageVisibility"
],
Resource = aws_sqs_queue.lambda_example_queue.arn
},
@ -47,6 +49,8 @@ resource "aws_iam_policy" "lambda_example_policy" {
})
}
resource "aws_iam_role_policy_attachment" "lambda_example_policy_attach" {
role = data.aws_iam_role.lambda_exec_role.name
policy_arn = aws_iam_policy.lambda_example_policy.arn

View file

@ -0,0 +1,21 @@
# Ignore junk and large files
*.pdf
*.csv
*.xml
*.parquet
*.ipynb
*.mp4
*.mov
*.jpg
*.png
*.zip
*.tar.gz
__pycache__/
*.pyc
*.pyo
*.pyd
build/
dist/
.etl_cache/
tests/
docs/

View file

@ -0,0 +1,25 @@
# Base image: AWS Lambda Python 3.12 runtime.
FROM public.ecr.aws/lambda/python:3.12
# Install Poetry with the official installer script.
# NOTE(review): no version is pinned, so each build installs whatever release
# is current at that time - consider pinning for reproducible images.
RUN curl -sSL https://install.python-poetry.org | python3 -
# Add Poetry to PATH
ENV PATH="/root/.local/bin:$PATH"
# Set working directory
WORKDIR /var/task
# Copy the Poetry project files and the etl/ package before installing.
# NOTE(review): because etl/ is copied ahead of `poetry install`, any change
# under etl/ presumably invalidates the cached dependency layer - confirm
# whether that is intended.
COPY pyproject.toml poetry.lock README.md ./
COPY etl/ etl/
# Disable Poetry's virtualenv so packages go into the image's own Python
# environment, then install only the main dependency group.
RUN poetry config virtualenvs.create false \
&& poetry install --only main --no-interaction --no-ansi
# Copy the Lambda entry-point module.
# NOTE(review): this path points at the extractor_and_loader Lambda's app.py;
# if this Dockerfile builds a different Lambda, confirm the source path.
COPY deployment/lambda/extractor_and_loader/docker/app.py ./
# Set Lambda handler (module app, function handler)
CMD ["app.handler"]

View file

@ -0,0 +1,124 @@
import pandas as pd
import json
from pprint import pprint
import os
import copy
from collections import defaultdict
from typing import List, Dict, Any, Union, Optional
def process_complex(sheet_name, group_key="ADDRESS",
                    source_path="../../../../../home/Downloads/data.xlsx"):
    """Group one worksheet's rows into a record per distinct ``group_key`` value.

    Args:
        sheet_name: Name of the worksheet to read from the workbook.
        group_key: Column whose distinct values define the groups
            (the handler uses "ADDRESS" and "BLOCK_CODE").
        source_path: Location of the workbook. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A list of dicts, one per distinct non-null ``group_key`` value in order
        of first appearance. Each dict holds:
          * ``group_key``: the group value,
          * ``"property_info"``: the property columns of the group's first
            distinct row,
          * ``"elements"``: the group's distinct element rows as dicts, keyed
            by "ELEMENT CODE DESCRIPTION".
    """
    df = pd.read_excel(source_path, sheet_name=sheet_name)

    element_cols = [
        "ELEMENT GROUP", "ELEMENT CODE", "ELEMENT CODE DESCRIPTION",
        "ATTRIBUTE CODE", "ATTRIBUTE CODE DESCRIPTION",
        "ELEMENT DATE VALUE", "ELEMENT NUMERIC VALUE",
        "ELEMENT TEXT VALUE", "QUANTITY",
        "INSTALL DATE", "REMAINING LIFE", "ELEMENT COMMENTS"
    ]
    property_cols = [
        "PROP REF", "ADDRESS", "OWNERSHIP",
        "PROP STATUS", "PROP TYPE", "PROP SUB TYPE"
    ]

    records = []

    # Null keys are dropped: a null never compares equal to itself, so the
    # subset for it would be empty and `.iloc[0]` below would raise IndexError.
    for val in df[group_key].dropna().unique():
        g = df[df[group_key] == val]
        property_info = g[property_cols].drop_duplicates().iloc[0].to_dict()

        # Keyed by description: a later row with the same description
        # replaces the earlier one.
        elements_dict = {}
        for _, row in g[element_cols].drop_duplicates().iterrows():
            key = row["ELEMENT CODE DESCRIPTION"]  # could also use "ELEMENT CODE"
            elements_dict[key] = row.to_dict()

        records.append({
            group_key: val,
            "property_info": property_info,
            "elements": elements_dict
        })

    return records
def process_simple(sheet_name,
                   source_path="../../../../../home/Downloads/data.xlsx"):
    """Build one record per distinct "Address" value in a worksheet.

    Args:
        sheet_name: Name of the worksheet to read from the workbook.
        source_path: Location of the workbook. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A list of ``{"ADDRESS": <address>, "to_add": {...}}`` dicts in order of
        first appearance. ``"to_add"`` holds every column except "Address",
        taken from the first row for that address.
    """
    df = pd.read_excel(source_path, sheet_name=sheet_name)
    records = []

    # Null addresses are dropped: a null never compares equal to itself, so the
    # subset for it would be empty and `.iloc[0]` below would raise IndexError.
    for address in df["Address"].dropna().unique():
        g = df[df["Address"] == address].drop_duplicates()
        row = g.iloc[0]  # first row wins when an address occurs several times
        elements_dict = row.drop(labels=["Address"]).to_dict()
        records.append({
            "ADDRESS": address,
            "to_add": elements_dict
        })

    return records
def combine_records_by_address(
    asset_records: List[Dict[str, Any]],
    simple_records: List[Dict[str, Any]],
    dest_key: str = "to_add",
    unique_identifier="Address"
) -> List[Dict[str, Any]]:
    """Join two record lists on their "ADDRESS" value.

    Every address found in either list produces one output record, in sorted
    address order. The output starts as a deep copy of the asset record (or a
    bare ``{"ADDRESS": addr}`` when no asset record exists); when a simple
    record exists for the address, its ``"to_add"`` mapping is stored under
    ``dest_key``.

    ``unique_identifier`` is accepted but not used by this function.
    """
    assets_index: Dict[Any, Dict[str, Any]] = {}
    for rec in asset_records:
        assets_index[rec["ADDRESS"]] = rec

    extras_index: Dict[Any, Dict[str, Any]] = {}
    for rec in simple_records:
        extras_index[rec["ADDRESS"]] = rec

    def _merge_one(address):
        # Copy so the caller's asset record is never modified.
        if address in assets_index:
            combined = copy.deepcopy(assets_index[address])
        else:
            combined = {"ADDRESS": address}
        extra = extras_index.get(address)
        if extra:
            combined[dest_key] = extra.get("to_add", {})
        return combined

    every_address = assets_index.keys() | extras_index.keys()
    return [_merge_one(address) for address in sorted(every_address)]
def combine_records_for_flats(
    assets: List[Dict[str, Any]],
    simple: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Attach block-level information to every flat record.

    Args:
        assets: Per-flat records; each is updated in place.
        simple: Block-level records. Only the first entry is used.

    Returns:
        The same ``assets`` list. When ``simple`` is empty or its first entry
        is not a dict, ``assets`` is returned untouched. Otherwise every record
        gains a ``"BLOCK_INFO"`` key holding its own copy of ``simple[0]``.
    """
    if not simple or not isinstance(simple[0], dict):
        return assets  # nothing to add

    block_info = simple[0]
    for record in assets:
        # Each record gets an independent copy; sharing one dict would let an
        # edit to one flat's BLOCK_INFO leak into every other flat.
        record["BLOCK_INFO"] = copy.deepcopy(block_info)
    return assets
def handler(event, context):
    """Lambda entry point: assemble the house and flat record sets.

    Reads four worksheets through process_complex / process_simple and merges
    them. ``event`` and ``context`` are not read. The merged results are bound
    to local names only; nothing is returned or written by this function.
    """
    # Houses: asset elements joined with the per-address sheet under "EPC_DATA".
    house_assets = process_complex("Houses Asset Data")
    house_extras = process_simple("Houses")
    houses = combine_records_by_address(
        house_assets, house_extras, dest_key="EPC_DATA"
    )

    # Flats: per-property assets, each tagged with the block-level record
    # (grouped by BLOCK_CODE rather than ADDRESS).
    flat_assets = process_complex("Chingford Rd 236-256 Properties")
    block_records = process_complex(
        "CHINGFORD ROAD 236-254 Asset Bl", "BLOCK_CODE"
    )
    flats = combine_records_for_flats(flat_assets, block_records)

View file

@ -0,0 +1,63 @@
# ECR repo
resource "aws_ecr_repository" "walthamforest_etl_adhoc_ecr" {
name = "walthamforest_etl_adhoc_ecr"
}
# ECR policy to allow Lambda access
resource "aws_ecr_repository_policy" "walthamforest_etl_adhoc_ecr_access" {
repository = aws_ecr_repository.walthamforest_etl_adhoc_ecr.name
policy = jsonencode({
Version = "2008-10-17",
Statement = [{
Sid = "AllowLambdaPull",
Effect = "Allow",
Principal = {
Service = "lambda.amazonaws.com"
},
Action = [
"ecr:GetDownloadUrlForLayer",
"ecr:BatchGetImage",
"ecr:BatchCheckLayerAvailability"
]
}]
})
}
# ECR lifecycle policy for the walthamforest_etl_adhoc_ecr repository.
# Two expire rules are declared:
#   - rulePriority 1: images tagged with the "feature" prefix, selected with
#     imageCountMoreThan 5.
#   - rulePriority 2: untagged images, selected with sinceImagePushed and a
#     count of 1 day.
# NOTE(review): the rulePriority 2 description string says "older than 14 days"
# but countNumber is 1 and tagStatus is "untagged" - confirm which is intended.
# NOTE(review): if image tags are derived from branch names, confirm they
# really start with "feature" (branches in this repo are spelled "feautre/...").
resource "aws_ecr_lifecycle_policy" "walthamforest_etl_adhoc_loader_lifecycle" {
repository = aws_ecr_repository.walthamforest_etl_adhoc_ecr.name
policy = jsonencode({
"rules": [
{
"rulePriority": 2,
"description": "Expire images older than 14 days",
"selection": {
"tagStatus": "untagged",
"countType": "sinceImagePushed",
"countUnit": "days",
"countNumber": 1
},
"action": {
"type": "expire"
}
},
{
"rulePriority": 1,
"description": "Keep last 5 images",
"selection": {
"tagStatus": "tagged",
"tagPrefixList": ["feature"],
"countType": "imageCountMoreThan",
"countNumber": 5
},
"action": {
"type": "expire"
}
}
]
})
}

View file

@ -0,0 +1,15 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 6.3.0"
}
}
backend "s3" {
bucket = "survey-extractor-tf-state"
region = "eu-west-2"
key = "env:/dev/lambda/ecr/walthamforest_etl.tfstate"
}
required_version = ">= 1.2.0"
}

View file

@ -0,0 +1,15 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 6.3.0"
}
}
backend "s3" {
bucket = "survey-extractor-tf-state"
region = "eu-west-2"
key = "env:/dev/lambda/eachlambda/walthamforest_etl_lambda.tfstate"
}
required_version = ">= 1.2.0"
}

View file

@ -0,0 +1,5 @@
variable "lambda_image_tag" {
description = "Docker image tag (e.g. GitHub SHA)"
type = string
default = "local-dev-latest"
}

View file

@ -0,0 +1,83 @@
# Reference existing IAM role
data "aws_iam_role" "lambda_exec_role" {
name = "lambda-exec-role"
}
# Reference existing ECR repository
data "aws_ecr_repository" "walthamforest_etl_adhoc_ecr" {
name = "walthamforest_etl_adhoc_ecr"
}
# SQS queue
resource "aws_sqs_queue" "walthamforest_etl_adhoc_queue" {
name = "walthamforest_etl_adhoc-queue"
visibility_timeout_seconds = 1800 # 30 minutes (>= 300s and ~6x Lambda timeout)
}
# Custom IAM policy specific to the walthamforest_etl_adhoc Lambda
resource "aws_iam_policy" "walthamforest_etl_adhoc_policy" {
name = "walthamforest_adhoc_policy_lambda"
policy = jsonencode({
Version = "2012-10-17",
Statement = [
{
Effect = "Allow",
Action = [
"sqs:ReceiveMessage",
"sqs:DeleteMessage",
"sqs:GetQueueAttributes",
"sqs:GetQueueUrl",
"sqs:ChangeMessageVisibility"
],
Resource = aws_sqs_queue.walthamforest_etl_adhoc_queue.arn
},
{
Effect = "Allow",
Action = [
"ecr:GetDownloadUrlForLayer",
"ecr:BatchGetImage",
"ecr:BatchCheckLayerAvailability"
],
Resource = data.aws_ecr_repository.walthamforest_etl_adhoc_ecr.arn
},
{
Effect = "Allow",
Action = ["ecr:GetAuthorizationToken"],
Resource = "*"
}
]
})
}
resource "aws_iam_role_policy_attachment" "walthamforest_etl_adhoc_policy_attach" {
role = data.aws_iam_role.lambda_exec_role.name
policy_arn = aws_iam_policy.walthamforest_etl_adhoc_policy.arn
}
# Lambda function: container-image Lambda built from the
# walthamforest_etl_adhoc_ecr repository at the tag given by var.lambda_image_tag.
resource "aws_lambda_function" "walthamforest_etl_adhoc" {
  function_name = "walthamforest_etl_adhoc"
  role          = data.aws_iam_role.lambda_exec_role.arn
  package_type  = "Image"
  image_uri     = "${data.aws_ecr_repository.walthamforest_etl_adhoc_ecr.repository_url}:${var.lambda_image_tag}"

  # Explicit timeout in seconds (max 900 / 15 min). Left unset, the function
  # gets the 3-second default. 300s matches the queue's 1800s visibility
  # timeout, which is sized at 6x the Lambda timeout.
  timeout = 300

  # Memory in MB (default 128).
  memory_size = 2048

  # Credentials must never be committed, even inside comments. Supply the
  # connection string from a secret store or a sensitive variable.
  # environment {
  #   variables = {
  #     DATABASE_URL = var.database_url
  #   }
  # }
}
# SQS trigger
resource "aws_lambda_event_source_mapping" "walthamforest_etl_adhoc_trigger" {
event_source_arn = aws_sqs_queue.walthamforest_etl_adhoc_queue.arn
function_name = aws_lambda_function.walthamforest_etl_adhoc.arn
batch_size = 1
}

View file

@ -256,17 +256,17 @@ for board, all_records in board_to_record.items():
filtered_dfs.append(design2)
# Design repetitive simple
design3 = get_df(design, "design invoice type", ["archetype (simple)"], "Design Archetype repetitive")
design3 = get_df(design, "design invoice type", ["repetitive (simple)"], "Design repetitive simple")
if not design1.empty:
filtered_dfs.append(design3)
# Design repetitive complex
design4 = get_df(design, "design invoice type", ["archetype (complex)"], "Design Archetype complex")
design4 = get_df(design, "design invoice type", ["repetitive (complex)"], "Design Repetitive complex")
if not design1.empty:
filtered_dfs.append(design4)
# Design not specified
all_filtered = pd.concat([design1, design2, design3, design4], ignore_index=True)
all_filtered = pd.concat([df for df in (design1, design2, design3, design4) if not df.empty])
design_remaining = design.loc[~design.index.isin(all_filtered.index)]
if not design_remaining.empty:
design_remaining["job_type"] = "design type not specified"

View file

@ -15,15 +15,21 @@ board_ids = [
]
empty = "Rate card info missing"
junte = "ask junte to update"
rate_card_data_2502_accent_housing = {
"job_type": [
"First half of MTP", "Second half of MTP", "Full MTP"
"First half of MTP", "Second half of MTP", "Full MTP", "Design Archetype Complex",
"Design Archetype Simple", "Design Repetitive Complex", "Design Repetitive Simple",
"Design Revision", "design type not specified",
],
"rate": [
150, 130, 280
150, 130, 280, junte, junte, junte, junte, junte, "please ask andreas"
]
}
# ToDO
# Design Revision
# Design Check with Andreas
rate_card_df = pd.DataFrame(rate_card_data_2502_accent_housing)
@ -91,6 +97,43 @@ full_cost = get_df(df, "mtp invoicing status", ["(v1) full cost mtp to invoice (
if not full_cost.empty:
filtered_dfs.append(full_cost)
# Design invoices: take the rows whose design invoicing status is
# "to invoice", then split them by design invoice type. Each typed frame is
# appended only when that frame itself has rows.

# Design archetype complex
design = get_df(df, "design invoicing status", ["to invoice"])
design1 = get_df(design, "design invoice type", ["archetype (complex)"], "Design Archetype Complex")
if not design1.empty:
    filtered_dfs.append(design1)

# Design archetype simple
design2 = get_df(design, "design invoice type", ["archetype (simple)"], "Design Archetype Simple")
if not design2.empty:
    filtered_dfs.append(design2)

# Design repetitive simple
design3 = get_df(design, "design invoice type", ["repetitive (simple)"], "Design repetitive simple")
if not design3.empty:
    filtered_dfs.append(design3)

# Design repetitive complex
design4 = get_df(design, "design invoice type", ["repetitive (complex)"], "Design repetitive complex")
if not design4.empty:
    filtered_dfs.append(design4)

# Design not specified: "to invoice" rows matched by none of the four types.
# NOTE(review): this relies on get_df returning rows that keep the index
# labels of `design` - confirm against get_df.
typed_designs = [frame for frame in (design1, design2, design3, design4) if not frame.empty]
if typed_designs:
    all_filtered = pd.concat(typed_designs)
    # .copy() so the column assignment below writes to an independent frame.
    design_remaining = design.loc[~design.index.isin(all_filtered.index)].copy()
else:
    # pd.concat raises ValueError on an empty list; with no typed rows at all,
    # every "to invoice" design row is unclassified.
    all_filtered = design.iloc[0:0]
    design_remaining = design.copy()
if not design_remaining.empty:
    design_remaining["job_type"] = "design type not specified"
    filtered_dfs.append(design_remaining)

# Design Revision: one lookup per revision letter, all labelled "Design Revision".
revision_letter = ['a', 'b', 'c', 'd']
for letter in revision_letter:
    design = get_df(df, "design revision invoice", [f"rev. {letter} to invoice"], "Design Revision")
    if not design.empty:
        filtered_dfs.append(design)
final_df = pd.concat(filtered_dfs).reset_index(drop=True)
final_df["job_type"] = final_df["job_type"].str.lower()