diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index ac654ac1..48a58bd6 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -27,7 +27,9 @@ "GrapeCity.gc-excelviewer", "jakobhoeg.vscode-pokemon", "github.vscode-github-actions", - "me-dutour-mathieu.vscode-github-actions" + "me-dutour-mathieu.vscode-github-actions", + "anthropic.claude-code", + "eamodio.gitlens" ], "settings": { "files.defaultWorkspace": "/workspaces/model", diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md index aaeba08d..1a835b6e 100644 --- a/backend/address2UPRN/README.md +++ b/backend/address2UPRN/README.md @@ -34,4 +34,22 @@ postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?re "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e", "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv" } -Each batch of csv should be saved in retrofit-data-dev///.csv \ No newline at end of file +Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches///.csv + +Outputs of address2uprn (which is automatically triggered by the postcode splitter) will be saved in retrofit-data-dev/ara_raw_outputs///.csv + + +Run the script in backend/scripts/combine_address2uprn_outputs.py with the task ID as its argument. 
+This will combine all the outputs of the files for each address2uprn into one big file + +Find out which ones have a missing UPRN, save those as a separate sheet, and upload it somewhere in s3://retrofit-data-dev + +I uploaded the missing uprn here: s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv + +ordnance_survey sqs is => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2FordnanceSurvey-queue-dev + +{ + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv", + "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb", + "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e" +} diff --git a/backend/scripts/combine_address2uprn_outputs.py b/backend/scripts/combine_address2uprn_outputs.py new file mode 100644 index 00000000..be17f610 --- /dev/null +++ b/backend/scripts/combine_address2uprn_outputs.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +import argparse +import boto3 +import pandas as pd +from io import BytesIO + +BUCKET = "retrofit-data-dev" + + +def list_csv_files(task_id): + s3 = boto3.client("s3") + paginator = s3.get_paginator("list_objects_v2") + + prefix = f"ara_raw_outputs/{task_id}" + csv_files = [] + + for page in paginator.paginate(Bucket=BUCKET, Prefix=prefix): + for obj in page.get("Contents", []): + key = obj["Key"] + if key.endswith(".csv"): + csv_files.append(key) + + return csv_files + + +def download_csv(key): + s3 = boto3.client("s3") + obj = s3.get_object(Bucket=BUCKET, Key=key) + return pd.read_csv(BytesIO(obj["Body"].read())) + + +def main(task_id, output): + print(f"Scanning task: {task_id}") + + csv_files = list_csv_files(task_id) + + if not csv_files: + print("No CSV files found") + return + + print(f"Found {len(csv_files)} CSV files") + + dfs = [] + for key in csv_files: + print(f"Downloading {key}") + df = download_csv(key) + dfs.append(df) + + combined = pd.concat(dfs, ignore_index=True) + + combined.to_csv(output, index=False) 
+ + print(f"Combined CSV saved to {output}") + print(f"Total rows: {len(combined)}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("task_id", help="Task ID folder in S3") + parser.add_argument("--output", default="combined.csv") + + args = parser.parse_args() + + main(args.task_id, args.output) diff --git a/backend/utils/__init__.py b/backend/utils/__init__.py new file mode 100644 index 00000000..e69de29b