passing sheet count to lambda

This commit is contained in:
Khalim Conn-Kowlessar 2025-07-22 18:04:53 +01:00
parent 8c5bd19992
commit 38d0dcdb77

View file

@@ -2,8 +2,6 @@ import boto3
import json
import math
from datetime import datetime
from openpyxl import load_workbook
from io import BytesIO
from fastapi import APIRouter, Depends
from backend.app.dependencies import validate_token
@@ -15,53 +13,6 @@ from backend.app.db.connection import db_engine
from backend.app.db.functions.recommendations_functions import create_scenario
def read_excel_from_s3(bucket_name, file_key, header_row=0, drop_all_na=True, sheet_name=None):
    """
    Reads an Excel file from S3 and returns it as a list of dictionaries.

    The object is downloaded fully into memory and parsed with openpyxl
    (``data_only=True``). Every row after ``header_row`` becomes one dict
    keyed by the header cells.

    :param bucket_name: Name of the S3 bucket.
    :param file_key: S3 key/path to the file.
    :param header_row: Row number (0-indexed) to use as header. Header cells
        that are None get the placeholder name ``__col_<index>``; all other
        header values are converted with ``str()`` and stripped.
    :param drop_all_na: If True, drop columns where all data values are None.
    :param sheet_name: Name of the worksheet to read. Any falsy value
        (None, "") selects the workbook's active sheet.
    :return: List of dicts, one per row. Rows whose cells are all None are
        skipped. If two columns share a header, the later column's value
        overwrites the earlier one in each dict.
    :raises ValueError: If the sheet has no row at index ``header_row``.
    """
    s3 = boto3.client("s3")
    response = s3.get_object(Bucket=bucket_name, Key=file_key)
    excel_buffer = BytesIO(response["Body"].read())

    wb = load_workbook(filename=excel_buffer, data_only=True)
    ws = wb[sheet_name] if sheet_name else wb.active

    rows = list(ws.iter_rows(values_only=True))
    if len(rows) <= header_row:
        raise ValueError("Header row index is out of range.")

    headers = [
        str(h).strip() if h is not None else f"__col_{i}"
        for i, h in enumerate(rows[header_row])
    ]
    data_rows = rows[header_row + 1:]

    # Drop columns where all values are None if required
    if drop_all_na:
        # Transpose rows to inspect each column; keep_indices stays in
        # ascending column order.
        keep_indices = [
            i
            for i, col in enumerate(zip(*data_rows))
            if any(v is not None for v in col)
        ]
        # Index directly instead of testing `i in keep_indices` per header,
        # which was a linear list scan for every column.
        headers = [headers[i] for i in keep_indices]
        data_rows = [[row[i] for i in keep_indices] for row in data_rows]

    # Create list of dicts, skipping fully empty rows
    return [
        dict(zip(headers, row))
        for row in data_rows
        if any(cell is not None for cell in row)
    ]
logger = setup_logger()
router = APIRouter(
@@ -93,14 +44,7 @@ async def trigger_plan_entrypoint(body: PlanTriggerRequest):
if data.get("file_format") == "domna_asset_list" and data.get("file_type") == "xlsx":
try:
input_data = read_excel_from_s3(
bucket_name=settings.PLAN_TRIGGER_BUCKET,
file_key=data.get("trigger_file_path"),
sheet_name=data.get("sheet_name"),
header_row=0,
)
total_rows = len(input_data)
total_rows = body.get("sheet_count", 0)
chunk_size = 30
total_chunks = math.ceil(total_rows / chunk_size)