diff --git a/.devcontainer/asset_list/devcontainer.json b/.devcontainer/asset_list/devcontainer.json index 83e5a276..dfa9ba4d 100644 --- a/.devcontainer/asset_list/devcontainer.json +++ b/.devcontainer/asset_list/devcontainer.json @@ -25,7 +25,8 @@ "ms-python.vscode-python-envs", "ms-python.black-formatter", "GrapeCity.gc-excelviewer", - "jakobhoeg.vscode-pokemon" + "jakobhoeg.vscode-pokemon", + "eamodio.gitlens" ], "settings": { "files.defaultWorkspace": "/workspaces/model", diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index ac654ac1..48a58bd6 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -27,7 +27,9 @@ "GrapeCity.gc-excelviewer", "jakobhoeg.vscode-pokemon", "github.vscode-github-actions", - "me-dutour-mathieu.vscode-github-actions" + "me-dutour-mathieu.vscode-github-actions", + "anthropic.claude-code", + "eamodio.gitlens" ], "settings": { "files.defaultWorkspace": "/workspaces/model", diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md index 390aeb62..1a835b6e 100644 --- a/backend/address2UPRN/README.md +++ b/backend/address2UPRN/README.md @@ -19,8 +19,37 @@ Before we run this, we need to upload it into S3 as well as put initiate a subta * S3 upload I'll recommend somewhere in retrofit-data-dev and get the s3_uri -For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/formated(Sheet1).csv" +For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv" Go to Ara DB and make a new task_id with a randomly generated uuid as the primarily key +task_id = a7b70a02-4df4-45b5-a50b-196e095910bb +sub_task_id = 567cf73b-1210-4909-9ecc-36ae7e23420e +Step 3) Alright, now let's make the input for postcode-splitter sqs to get the ball rolling +postcode-splitter-sqs => 
https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev + +{ + "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb", + "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e", + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv" +} +Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches///.csv + +outputs of address2uprn (which is automatically triggered on postcodesplitter) will be saved on retrofit-data-dev/ara_raw_outputs///.csv + + +Run the script in backend/scripts/combine_address2uprn_outputs.py with the task_id as its argument. +This will combine all the outputs of the files for each address2uprn into one big file + +Find out which ones have missing uprn and save that as a separate sheet and save it somewhere in s3://retrofit-data-dev + +I uploaded the missing uprn here: s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv + +ordnance_survey sqs is => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2FordnanceSurvey-queue-dev + +{ + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv", + "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb", + "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e" +} diff --git a/backend/app/config.py b/backend/app/config.py index b5b29137..6604fec9 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -35,7 +35,7 @@ class Settings(BaseSettings): SECRET_KEY: str = "changeme" ENVIRONMENT: str = "changeme" DATA_BUCKET: str = "changeme" - PLAN_TRIGGER_BUCKET: str + PLAN_TRIGGER_BUCKET: str = "changeme" ENGINE_SQS_URL: str = "changeme" CATEGORISATION_SQS_URL: str = "changeme" diff --git a/backend/scripts/combine_address2uprn_outputs.py b/backend/scripts/combine_address2uprn_outputs.py new file mode 100644 index 00000000..be17f610 --- 
#!/usr/bin/env python3
"""Combine the CSV objects stored under one task's S3 prefix into one file.

Every object in ``BUCKET`` whose key starts with
``ara_raw_outputs/<task_id>/`` and ends with ``.csv`` is downloaded,
concatenated row-wise, and written to a local CSV file.

Usage:
    combine_address2uprn_outputs.py TASK_ID [--output combined.csv]
"""

import argparse
from io import BytesIO

import boto3
import pandas as pd

BUCKET = "retrofit-data-dev"


def list_csv_files(task_id, s3=None):
    """Return the keys of all ``.csv`` objects under the task's prefix.

    Args:
        task_id: Name of the task folder under ``ara_raw_outputs/``.
            Leading/trailing slashes are ignored.
        s3: Optional boto3 S3 client to reuse; a new one is created when
            omitted.

    Returns:
        A list of S3 object keys, in the order S3 lists them.
    """
    s3 = s3 if s3 is not None else boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    # The trailing slash matters: S3 prefixes are plain string prefixes, so
    # "ara_raw_outputs/abc" would also match "ara_raw_outputs/abc123/...".
    prefix = f"ara_raw_outputs/{task_id.strip('/')}/"

    return [
        obj["Key"]
        for page in paginator.paginate(Bucket=BUCKET, Prefix=prefix)
        # "Contents" is absent from a page when nothing matches the prefix.
        for obj in page.get("Contents", [])
        if obj["Key"].endswith(".csv")
    ]


def download_csv(key, s3=None):
    """Download one CSV object from ``BUCKET`` and parse it.

    Args:
        key: S3 object key to fetch.
        s3: Optional boto3 S3 client to reuse; a new one is created when
            omitted.

    Returns:
        A DataFrame in which every column is read as text.

    Raises:
        pandas.errors.EmptyDataError: If the object has no content to parse.
    """
    s3 = s3 if s3 is not None else boto3.client("s3")
    obj = s3.get_object(Bucket=BUCKET, Key=key)
    # Read everything as text so values are carried through unchanged.
    # With inferred dtypes an integer column that has a missing value is
    # upcast to float, and an identifier such as 123 is written as "123.0".
    return pd.read_csv(BytesIO(obj["Body"].read()), dtype=str)


def main(task_id, output):
    """Combine all CSVs found for ``task_id`` and write them to ``output``.

    Args:
        task_id: Name of the task folder under ``ara_raw_outputs/``.
        output: Local path of the combined CSV to write.

    Nothing is written when no CSV objects are found or when none of them
    contains any data.
    """
    print(f"Scanning task: {task_id}")

    # One client for the whole run instead of one per downloaded file.
    s3 = boto3.client("s3")
    csv_files = list_csv_files(task_id, s3)

    if not csv_files:
        print("No CSV files found")
        return

    print(f"Found {len(csv_files)} CSV files")

    dfs = []
    for key in csv_files:
        print(f"Downloading {key}")
        try:
            dfs.append(download_csv(key, s3))
        except pd.errors.EmptyDataError:
            # A single empty object should not abort the whole combine.
            print(f"Skipping empty file {key}")

    if not dfs:
        # pd.concat raises ValueError on an empty list.
        print("No CSV data found")
        return

    combined = pd.concat(dfs, ignore_index=True)

    combined.to_csv(output, index=False)

    print(f"Combined CSV saved to {output}")
    print(f"Total rows: {len(combined)}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("task_id", help="Task ID folder in S3")
    parser.add_argument("--output", default="combined.csv")

    args = parser.parse_args()

    main(args.task_id, args.output)