mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
65 lines
1.5 KiB
Python
65 lines
1.5 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import boto3
|
|
import pandas as pd
|
|
from io import BytesIO
|
|
|
|
# S3 bucket that list_csv_files() and download_csv() read from.
# NOTE(review): the "-dev" suffix suggests a development bucket — confirm
# before pointing this script at other environments.
BUCKET = "retrofit-data-dev"
|
|
|
|
|
|
def list_csv_files(task_id):
    """Return the keys of every ``.csv`` object stored under a task's folder.

    Walks ``BUCKET`` page by page with the ``list_objects_v2`` paginator and
    keeps the keys that end in ``.csv``.

    Args:
        task_id: Name of the task folder beneath ``ara_raw_outputs/``. A
            trailing ``/`` is optional.

    Returns:
        A list of matching S3 object keys in the order they were listed;
        empty when nothing matches.
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    # S3 prefixes are plain string matches, so without the trailing slash a
    # task "123" would also pick up objects that belong to task "1234".
    prefix = f"ara_raw_outputs/{task_id}"
    if not prefix.endswith("/"):
        prefix += "/"

    csv_files = []
    for page in paginator.paginate(Bucket=BUCKET, Prefix=prefix):
        # Pages without a "Contents" entry contribute nothing.
        csv_files.extend(
            obj["Key"]
            for obj in page.get("Contents", [])
            if obj["Key"].endswith(".csv")
        )

    return csv_files
|
|
|
|
|
|
def download_csv(key):
    """Fetch one object from ``BUCKET`` and parse it as CSV.

    Args:
        key: S3 object key to download.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` on the object's
        bytes.
    """
    client = boto3.client("s3")
    response = client.get_object(Bucket=BUCKET, Key=key)

    # The whole body is read into memory before parsing.
    raw_bytes = response["Body"].read()
    buffer = BytesIO(raw_bytes)
    return pd.read_csv(buffer)
|
|
|
|
|
|
def main(task_id, output):
    """Merge every CSV found for a task into a single CSV file.

    Args:
        task_id: Task identifier handed to ``list_csv_files``.
        output: Destination passed to ``DataFrame.to_csv``.

    Progress is reported with ``print``. Returns ``None`` in all cases,
    including the early exit taken when the task has no CSV files.
    """
    print(f"Scanning task: {task_id}")

    keys = list_csv_files(task_id)
    if not keys:
        print("No CSV files found")
        return
    print(f"Found {len(keys)} CSV files")

    def fetch_frames():
        # Announce each key right before fetching it so the log lines stay
        # interleaved with the downloads.
        for s3_key in keys:
            print(f"Downloading {s3_key}")
            yield download_csv(s3_key)

    # Row index is rebuilt from 0 across the concatenated frames.
    merged = pd.concat(list(fetch_frames()), ignore_index=True)
    merged.to_csv(output, index=False)

    print(f"Combined CSV saved to {output}")
    print(f"Total rows: {len(merged)}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: one positional task id plus an optional
    # output path that defaults to "combined.csv".
    cli = argparse.ArgumentParser()
    cli.add_argument("task_id", help="Task ID folder in S3")
    cli.add_argument("--output", default="combined.csv")

    options = cli.parse_args()
    main(options.task_id, options.output)
|