adding sheet count

This commit is contained in:
Khalim Conn-Kowlessar 2025-07-22 18:02:56 +01:00
parent 802da66ce9
commit 8c5bd19992
4 changed files with 56 additions and 5 deletions

View file

@ -2,19 +2,66 @@ import boto3
import json
import math
from datetime import datetime
from openpyxl import load_workbook
from io import BytesIO
import pandas as pd
from fastapi import APIRouter, Depends
from backend.app.dependencies import validate_token
from backend.app.plan.schemas import PlanTriggerRequest
from backend.app.config import get_settings
from sqlalchemy.orm import sessionmaker
from utils.logger import setup_logger
from utils.s3 import read_excel_from_s3
from backend.app.db.connection import db_engine
from backend.app.db.functions.recommendations_functions import create_scenario
def read_excel_from_s3(bucket_name, file_key, header_row=0, drop_all_na=True, sheet_name=None):
"""
Reads an Excel file from S3 and returns it as a list of dictionaries.
:param bucket_name: Name of the S3 bucket.
:param file_key: S3 key/path to the file.
:param header_row: Row number (0-indexed) to use as header.
:param drop_all_na: If True, drop columns where all values are None.
:param sheet_name: Name of the worksheet to read. Defaults to the first.
:return: List of dicts, one per row.
"""
s3 = boto3.client("s3")
response = s3.get_object(Bucket=bucket_name, Key=file_key)
excel_buffer = BytesIO(response["Body"].read())
wb = load_workbook(filename=excel_buffer, data_only=True)
ws = wb[sheet_name] if sheet_name else wb.active
rows = list(ws.iter_rows(values_only=True))
if len(rows) <= header_row:
raise ValueError("Header row index is out of range.")
headers = [str(h).strip() if h is not None else f"__col_{i}" for i, h in enumerate(rows[header_row])]
data_rows = rows[header_row + 1:]
# Drop columns where all values are None if required
if drop_all_na:
# Transpose rows to get columns
col_data = list(zip(*data_rows))
keep_indices = [i for i, col in enumerate(col_data) if not all(v is None for v in col)]
headers = [h for i, h in enumerate(headers) if i in keep_indices]
data_rows = [
[row[i] for i in keep_indices]
for row in data_rows
]
# Create list of dicts
result = [
{headers[i]: cell for i, cell in enumerate(row)}
for row in data_rows
if any(cell is not None for cell in row) # skip fully empty rows
]
return result
logger = setup_logger()
router = APIRouter(
@ -45,11 +92,12 @@ async def trigger_plan_entrypoint(body: PlanTriggerRequest):
# If file_format is domna_asset_list and type is xlsx, read and chunk it
if data.get("file_format") == "domna_asset_list" and data.get("file_type") == "xlsx":
try:
input_data: pd.DataFrame = read_excel_from_s3(
input_data = read_excel_from_s3(
bucket_name=settings.PLAN_TRIGGER_BUCKET,
file_key=data.get("trigger_file_path"),
sheet_name=data.get("sheet_name"),
header_row=0
header_row=0,
)
total_rows = len(input_data)

View file

@ -108,6 +108,7 @@ class PlanTriggerRequest(BaseModel):
file_type: Optional[Literal["csv", "xlsx"]] = None
file_format: Optional[Literal["domna_asset_list"]] = None
sheet_name: Optional[str] = None
sheet_count: Optional[int] = None
# If one of index_start or index_end is set, the other must be set too
index_start: Optional[int] = None
index_end: Optional[int] = None

View file

@ -8,4 +8,5 @@ cryptography==43.0.3
mangum==0.19.0
# AWS
boto3==1.35.44
# Data
openpyxl==3.1.2

View file

@ -11,6 +11,7 @@ def handler(event, context):
"""
Lambda handler that triggers the model engine for each SQS message.
"""
logger.info("Received event: %s", json.dumps(event, indent=2))
for record in event.get("Records", []):
try:
body_dict = json.loads(record["body"])