Model/backend/scripts/combine_address2uprn_outputs.py
2026-04-16 22:21:54 +00:00

57 lines
1.3 KiB
Python

#!/usr/bin/env python3
import argparse
import boto3
import pandas as pd
from io import BytesIO
# Name of the S3 bucket that this script's list and get requests target.
BUCKET = "retrofit-data-dev"
def list_csv_files(task_id):
    """Return the keys of every ``.csv`` object found for a task.

    Lists ``BUCKET`` with the key prefix ``ara_raw_outputs/<task_id>``,
    walking all result pages, and keeps only keys ending in ``.csv``.

    Args:
        task_id: Value interpolated into the key prefix.

    Returns:
        A list of matching object keys, in the order the listing yields
        them; empty when nothing matches.
    """
    client = boto3.client("s3")
    pages = client.get_paginator("list_objects_v2").paginate(
        Bucket=BUCKET,
        Prefix=f"ara_raw_outputs/{task_id}",
    )
    # A page without a "Contents" entry contributes no keys.
    return [
        entry["Key"]
        for page in pages
        for entry in page.get("Contents", [])
        if entry["Key"].endswith(".csv")
    ]
def download_csv(key):
    """Fetch one object from ``BUCKET`` and parse it as CSV.

    Args:
        key: Object key to download.

    Returns:
        A pandas DataFrame parsed from the object's full body.
    """
    response = boto3.client("s3").get_object(Bucket=BUCKET, Key=key)
    # The whole body is read into memory before pandas parses it.
    raw_bytes = response["Body"].read()
    return pd.read_csv(BytesIO(raw_bytes))
def main(task_id, output):
    """Combine every CSV found for one task into a single CSV.

    Looks up the task's CSV keys with ``list_csv_files``, downloads each
    one with ``download_csv``, concatenates the frames with a fresh index
    and writes the result without the index column.

    Args:
        task_id: Identifier of the task whose CSV outputs are combined.
        output: Destination passed to ``DataFrame.to_csv``.

    Returns:
        None. Progress is printed to stdout. When no CSV keys are found a
        message is printed and nothing is written.
    """
    # Use the caller's task_id as given; it must not be replaced by a
    # fixed value, otherwise the argument has no effect.
    print(f"Scanning task: {task_id}")
    csv_files = list_csv_files(task_id)
    if not csv_files:
        print("No CSV files found")
        return
    print(f"Found {len(csv_files)} CSV files")

    dfs = []
    for key in csv_files:
        print(f"Downloading {key}")
        dfs.append(download_csv(key))

    # ignore_index=True renumbers rows so per-file indexes do not repeat.
    combined = pd.concat(dfs, ignore_index=True)
    combined.to_csv(output, index=False)
    print(f"Combined CSV saved to {output}")
    print(f"Total rows: {len(combined)}")