# survey-extraction/deployment/lambda/walthamforest_etl/docker/app.py
# Source listing metadata: 50 lines, 1.6 KiB, Python

import pandas as pd
import json
from pprint import pprint
# Columns that describe one element row within a property's data.
_ELEMENT_COLS = [
    "ELEMENT GROUP", "ELEMENT CODE", "ELEMENT CODE DESCRIPTION",
    "ATTRIBUTE CODE", "ATTRIBUTE CODE DESCRIPTION",
    "ELEMENT DATE VALUE", "ELEMENT NUMERIC VALUE",
    "ELEMENT TEXT VALUE", "QUANTITY",
    "INSTALL DATE", "REMAINING LIFE", "ELEMENT COMMENTS",
]

# Columns that are reported once per property under "property_info".
_PROPERTY_COLS = [
    "PROP REF", "Domna", "ADDRESS", "OWNERSHIP",
    "PROP STATUS", "PROP TYPE", "PROP SUB TYPE",
]


def _build_property_records(df):
    """Nest a flat asset table into one record per ADDRESS.

    Args:
        df: DataFrame containing every column named in ``_PROPERTY_COLS``
            and ``_ELEMENT_COLS``.

    Returns:
        A list of dicts, one per distinct ADDRESS, each with the keys
        ``"ADDRESS"``, ``"property_info"`` (property columns taken from the
        first row for that address) and ``"elements_info"`` (a list of
        ``{"ELEMENT GROUP": ..., "elements": [...]}`` dicts). An empty
        frame yields an empty list.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    records = []
    # Grouping by a plain string (not a one-item list) keeps the key a scalar.
    # NOTE: groupby drops rows whose key is NaN by default, so rows with a
    # missing ADDRESS or ELEMENT GROUP do not appear in the output.
    for address, property_rows in df.groupby("ADDRESS"):
        elements_info = [
            {
                "ELEMENT GROUP": group_name,
                "elements": group_rows.drop(columns=["ELEMENT GROUP"]).to_dict(
                    orient="records"
                ),
            }
            for group_name, group_rows in property_rows[_ELEMENT_COLS].groupby(
                "ELEMENT GROUP"
            )
        ]
        records.append({
            "ADDRESS": address,
            # First row wins, which is what drop_duplicates().iloc[0] selected.
            "property_info": property_rows[_PROPERTY_COLS].iloc[0].to_dict(),
            "elements_info": elements_info,
        })
    return records


def _extract_houses_json(path="../../home/Downloads/data.xlsx"):
    """Read the houses sheet from ``path`` and return the nested records as JSON.

    Args:
        path: Location of the Excel workbook. The default is the path that
            was hard-coded in the original code.
            NOTE(review): it looks like a developer-local path -- confirm
            the real source before calling this from the Lambda.

    Returns:
        A JSON string encoding the list built by ``_build_property_records``.
        Values that ``json`` cannot encode natively are converted with
        ``str`` (``default=str``).
    """
    # read data for houses only
    df = pd.read_excel(path, sheet_name="Houses Asset Data")
    return json.dumps(
        _build_property_records(df), ensure_ascii=False, default=str
    )


def handler(event, context):
    """Lambda entry point; currently only confirms the deployment is wired up.

    Args:
        event: Invocation payload (not read).
        context: Lambda runtime context (not read).

    Returns:
        None.
    """
    print("waltham forest set up correctly")
    # The extraction logic used to sit below this return, where it could
    # never run. It now lives in _extract_houses_json(); call it from here
    # once the workbook location for the deployed function is settled.
    return None