Added missing __init__.py in the Ordnance Survey and helper extensions

2026-06-08 11:17:27 +00:00 · 2026-03-10 12:39:27 +00:00 · 2026-03-10 12:39:27 +00:00 · 5b15cb5518
commit 5b15cb5518
parent 41d817b218
5 changed files with 5379 additions and 2 deletions
--- a/.devcontainer/backend/devcontainer.json
+++ b/.devcontainer/backend/devcontainer.json
@ -27,7 +27,9 @@
        "GrapeCity.gc-excelviewer",
        "jakobhoeg.vscode-pokemon",
        "github.vscode-github-actions",
-        "me-dutour-mathieu.vscode-github-actions"
+        "me-dutour-mathieu.vscode-github-actions",
+        "anthropic.claude-code",
+        "eamodio.gitlens"
      ],
      "settings": {
        "files.defaultWorkspace": "/workspaces/model",
--- a/backend/address2UPRN/README.md
+++ b/backend/address2UPRN/README.md
@ -34,4 +34,22 @@ postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?re
    "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e",
    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv"
 }
-Each batch of csv should be saved in retrofit-data-dev/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
+Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
+
+Outputs of address2uprn (which is triggered automatically by the postcode splitter) will be saved in retrofit-data-dev/ara_raw_outputs/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
+
+
+Run the script backend/scripts/combine_address2uprn_outputs.py with <task-id> as its argument.
+This combines all the address2uprn output CSV files for that task into one big file (combined.csv by default; use --output to change the path).
+
+Find out which rows have a missing UPRN, save those rows as a separate sheet, and upload it somewhere in s3://retrofit-data-dev
+
+I uploaded the rows with missing UPRNs here: s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv
+
+ordnance_survey sqs is => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2FordnanceSurvey-queue-dev
+
+{
+    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv",
+    "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb",
+    "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e"
+}
--- a/backend/scripts/combine_address2uprn_outputs.py
+++ b/backend/scripts/combine_address2uprn_outputs.py
@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+import argparse
+import boto3
+import pandas as pd
+from io import BytesIO
+
+BUCKET = "retrofit-data-dev"
+
+
+def list_csv_files(task_id):
+    s3 = boto3.client("s3")
+    paginator = s3.get_paginator("list_objects_v2")
+
+    prefix = f"ara_raw_outputs/{task_id}"
+    csv_files = []
+
+    for page in paginator.paginate(Bucket=BUCKET, Prefix=prefix):
+        for obj in page.get("Contents", []):
+            key = obj["Key"]
+            if key.endswith(".csv"):
+                csv_files.append(key)
+
+    return csv_files
+
+
+def download_csv(key):
+    s3 = boto3.client("s3")
+    obj = s3.get_object(Bucket=BUCKET, Key=key)
+    return pd.read_csv(BytesIO(obj["Body"].read()))
+
+
+def main(task_id, output):
+    print(f"Scanning task: {task_id}")
+
+    csv_files = list_csv_files(task_id)
+
+    if not csv_files:
+        print("No CSV files found")
+        return
+
+    print(f"Found {len(csv_files)} CSV files")
+
+    dfs = []
+    for key in csv_files:
+        print(f"Downloading {key}")
+        df = download_csv(key)
+        dfs.append(df)
+
+    combined = pd.concat(dfs, ignore_index=True)
+
+    combined.to_csv(output, index=False)
+
+    print(f"Combined CSV saved to {output}")
+    print(f"Total rows: {len(combined)}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("task_id", help="Task ID folder in S3")
+    parser.add_argument("--output", default="combined.csv")
+
+    args = parser.parse_args()
+
+    main(args.task_id, args.output)
--- a/backend/scripts/combined.csv
+++ b/backend/scripts/combined.csv
--- a/backend/utils/__init__.py
+++ b/backend/utils/__init__.py