#!/usr/bin/env python3 import argparse import boto3 import pandas as pd from io import BytesIO BUCKET = "retrofit-data-dev" def list_csv_files(task_id): s3 = boto3.client("s3") paginator = s3.get_paginator("list_objects_v2") prefix = f"ara_raw_outputs/{task_id}" csv_files = [] for page in paginator.paginate(Bucket=BUCKET, Prefix=prefix): for obj in page.get("Contents", []): key = obj["Key"] if key.endswith(".csv"): csv_files.append(key) return csv_files def download_csv(key): s3 = boto3.client("s3") obj = s3.get_object(Bucket=BUCKET, Key=key) return pd.read_csv(BytesIO(obj["Body"].read())) def main(task_id, output): task_id = "3fb9a9b7-ff49-4c11-b9e1-9d00da955a75" print(f"Scanning task: {task_id}") csv_files = list_csv_files(task_id) if not csv_files: print("No CSV files found") return print(f"Found {len(csv_files)} CSV files") dfs = [] for key in csv_files: print(f"Downloading {key}") df = download_csv(key) dfs.append(df) combined = pd.concat(dfs, ignore_index=True) combined.to_csv(output, index=False) print(f"Combined CSV saved to {output}") print(f"Total rows: {len(combined)}")