mount home directory to devcontainer home directory

This commit is contained in:
Jun-te Kim 2026-02-11 11:50:02 +00:00
parent b7e201f3d4
commit d4ac6aee71
5 changed files with 76 additions and 176 deletions

View file

@ -6,7 +6,7 @@
"workspaceFolder": "/workspaces/model",
"postStartCommand": "bash .devcontainer/backend/post-install.sh",
"mounts": [
"source=${localEnv:HOME},target=/workspaces/home,type=bind"
"source=${localEnv:HOME},target=/home/vscode,type=bind"
],
"customizations": {
"vscode": {

View file

@ -34,7 +34,7 @@ from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
logger = setup_logger()
# OpenAI API key: read from the environment only. A literal fallback key must
# never be embedded in source control; when the variable is unset this is None
# and callers are expected to handle the missing credential.
# NOTE(review): a key was previously committed on this line as a default value;
# treat it as compromised and revoke/rotate it.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

View file

@ -13,11 +13,15 @@ from asset_list.utils import get_data
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
# Load environment variables from the backend .env file. Two relative paths
# are attempted ("backend/.env" and "../backend/.env") -- presumably so the
# module works from more than one working directory; TODO confirm which one
# is actually required.
load_dotenv(dotenv_path="backend/.env")
load_dotenv(dotenv_path="../backend/.env")
# Credentials are taken from the environment only; os.getenv returns None
# when the variable is not set, so no secret is ever defaulted in code.
EPC_AUTH_TOKEN = os.getenv(
"EPC_AUTH_TOKEN",
)
OPENAI_API_KEY = os.getenv(
"OPENAI_API_KEY",
)
def extract_address1(
asset_list, full_address_col, postcode_col, method="first_two_words"
@ -69,72 +73,24 @@ def app():
Property UPRN
"""
<<<<<<< HEAD
data_folder = "/workspaces/model/asset_list/"
data_filename = "manchester.xlsx"
sheet_name = "PW0099 - Property List"
postcode_column = "post Code"
address1_column = "address"
address1_method = None
fulladdress_column = None
address_cols_to_concat = ["address"]
=======
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Aspire"
data_filename = "ASPIRE ASSET LIST.xlsx"
sheet_name = "Asset List"
postcode_column = "Postcode"
data_folder = "/workspaces/model/asset_list"
data_filename = "assets.xlsx"
sheet_name = "Sheet1"
postcode_column = "POSTCODE"
address1_column = None
address1_method = "house_number_extraction"
fulladdress_column = "Address"
fulladdress_column = "ADDRESS"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = None
landlord_property_type = "Property Type"
landlord_built_form = None
landlord_wall_construction = None
landlord_roof_construction = None
landlord_heating_system = None
landlord_property_type = "PROPERTY TYPE"
landlord_built_form = None # Skipped as empty
landlord_wall_construction = "wall combined" # combin F + G
landlord_roof_construction = "HEATING SYSTEM" # Combine I + J
landlord_heating_system = None # Check with Khalim
landlord_existing_pv = None
landlord_property_id = "LLUPRN"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
outcomes_address = None
master_filepaths = []
master_id_colnames = []
master_to_asset_list_filepath = None
phase = False
ecosurv_landlords = None
asset_list_header = 0
landlord_block_reference = None
# Peabody data for cleaning
data_folder = (
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
"Project/data_validation"
)
data_filename = "to_standardise_uprns.xlsx"
sheet_name = "Sheet1"
postcode_column = "Postcode"
address1_column = None
address1_method = "house_number_extraction"
fulladdress_column = "Address"
address_cols_to_concat = None
>>>>>>> d4064da36565f87c2b72d10e9f3604cc6c37bdb6
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = None
landlord_property_type = None
landlord_built_form = None
landlord_wall_construction = None
landlord_roof_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "UHTprop Ref"
landlord_property_id = "UPRN"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None
@ -286,7 +242,7 @@ def app():
if skip is not None and not force_retrieve_data:
if i <= skip:
continue
chunk = asset_list.standardised_asset_list[i: i + chunk_size]
chunk = asset_list.standardised_asset_list[i : i + chunk_size]
epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
df=chunk,
row_id_name=asset_list.DOMNA_PROPERTY_ID,
@ -429,7 +385,7 @@ def app():
# Retrieve just the data we need
epc_df = epc_df[
[asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys())
].rename(columns=asset_list.EPC_API_DATA_NAMES)
].rename(columns=asset_list.EPC_API_DATA_NAMES)
# Look for columns not in the find my EPC data, which will have happened if we didn't
# retrieve it in the first place
@ -446,7 +402,7 @@ def app():
find_my_epc_data[
[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]
+ list(asset_list.FIND_EPC_DATA_NAMES.keys())
].rename(columns=asset_list.FIND_EPC_DATA_NAMES),
].rename(columns=asset_list.FIND_EPC_DATA_NAMES),
how="left",
on=asset_list.DOMNA_PROPERTY_ID,
)

View file

@ -20,6 +20,29 @@ if EPC_AUTH_TOKEN is None:
raise RuntimeError("EPC_AUTH_TOKEN not defined in env")
def is_valid_postcode(postcode_clean: str) -> bool:
    """
    Validate a postcode using the postcodes.io validation endpoint.

    Expects a sanitised postcode (e.g. E84SQ).

    Args:
        postcode_clean: Sanitised postcode. Empty or None is treated as
            invalid without making a request.

    Returns:
        True only when the request succeeds and the response body reports
        ``"result": true``. False in every other case, including network
        errors, non-2xx responses and bodies that are not a JSON object.
    """
    from urllib.parse import quote

    POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate"

    if not postcode_clean:
        return False

    # Escape the value before placing it in the URL path so that stray
    # characters ("/", "?", "#", spaces) cannot change the requested resource.
    url = POSTCODES_IO_VALIDATE_URL.format(
        postcode=quote(str(postcode_clean), safe="")
    )

    try:
        resp = requests.get(url, timeout=5)
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError):
        # Network issues, rate limits, non-JSON bodies, etc.
        return False

    # Always hand back a real bool, never None or another JSON value.
    if not isinstance(payload, dict):
        return False
    return payload.get("result") is True
def levenshtein(a: str, b: str) -> float:
"""
Address similarity score in [0, 1].

View file

@ -78,112 +78,14 @@ def sanitise_postcode(postcode: str) -> str | None:
return postcode.upper().replace(" ", "")
def is_valid_postcode(postcode_clean: str) -> bool:
    """
    Validate a postcode using the postcodes.io validation endpoint.

    Expects a sanitised postcode (e.g. E84SQ).

    Args:
        postcode_clean: Sanitised postcode. Empty or None is treated as
            invalid without making a request.

    Returns:
        True only when the request succeeds and the response body reports
        ``"result": true``. False in every other case, including network
        errors, non-2xx responses and bodies that are not a JSON object.
    """
    from urllib.parse import quote

    POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate"

    if not postcode_clean:
        return False

    # Escape the value before placing it in the URL path so that stray
    # characters ("/", "?", "#", spaces) cannot change the requested resource.
    url = POSTCODES_IO_VALIDATE_URL.format(
        postcode=quote(str(postcode_clean), safe="")
    )

    try:
        resp = requests.get(url, timeout=5)
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError):
        # Network issues, rate limits, non-JSON bodies, etc.
        return False

    # Always hand back a real bool, never None or another JSON value.
    if not isinstance(payload, dict):
        return False
    return payload.get("result") is True
def main():
    """
    Ad-hoc local run: resolve UPRNs for the first 500 rows of ``hackney.xlsx``.

    Reads the "Sustainability" sheet, sanitises and validates the postcodes
    (one validation call per unique postcode), then resolves UPRNs one
    postcode group at a time. Groups that yield no EPC rows or that raise are
    kept in the output with a ``status`` column describing what happened.

    Returns:
        None. The combined frame and its inspection views are only logged.
    """
    df = pd.read_excel("hackney.xlsx", sheet_name="Sustainability")
    df = df.head(500)

    # Sanitise postcodes
    df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode)

    # Validate each unique, non-null postcode once to save API calls.
    unique_postcodes = df["postcode_clean"].dropna().unique()
    postcode_validity = {
        pc: is_valid_postcode(pc)
        for pc in tqdm(unique_postcodes, total=len(unique_postcodes))
    }

    # Map validity back onto the dataframe. Rows whose postcode could not be
    # sanitised have no entry in the lookup; they are marked invalid so the
    # column is strictly boolean (a mask containing NaN cannot be used for
    # boolean indexing).
    df["postcode_valid"] = (
        df["postcode_clean"]
        .map(lambda pc: bool(postcode_validity.get(pc, False)))
        .astype(bool)
    )

    results = []

    for postcode, group_df in tqdm(
        df[df["postcode_valid"]].groupby("postcode_clean"),
        desc="Resolving UPRNs by postcode",
    ):
        try:
            epc_df = get_epc_data_with_postcode(postcode)

            if epc_df.empty:
                tmp = group_df.copy()
                tmp["found_uprn"] = None
                tmp["status"] = "no_epc_results"
                results.append(tmp)
                continue

            resolved = resolve_uprns_for_postcode_group(
                group_df=group_df,
                epc_df=epc_df,
            )
            results.append(resolved)

        except Exception as e:
            # Best effort: record the failure against the group and carry on
            # with the remaining postcodes.
            tmp = group_df.copy()
            tmp["found_uprn"] = None
            tmp["status"] = "exception"
            tmp["error"] = str(e)
            results.append(tmp)

    # pd.concat raises on an empty list, which happens when no postcode
    # validated successfully.
    if not results:
        logger.warning("No valid postcodes to resolve; nothing to report")
        return

    final_df = pd.concat(results, ignore_index=True)

    # Columns of interest when eyeballing match quality. reindex() is used so
    # a run in which no group produced match columns still yields a frame
    # (filled with NaN) instead of a KeyError.
    view_columns = [
        "best_match_lexiscore",
        "Address 1",
        "best_match_address",
        "Postcode",
        "UPRN",
        "best_match_uprn",
    ]
    a = final_df.reindex(columns=view_columns)
    # Only the rows that received a positive similarity score.
    b = a[a["best_match_lexiscore"] > 0]

    logger.info(f"Rows with a positive match score: {len(b)} of {len(a)}")
def handler(event, context):
def handler(event, context, local=False):
print(f"Function: {context.function_name}")
print(f"Request ID: {context.aws_request_id}")
# Example SQS message for testing (copy and paste into SQS):
# {
# "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
# "s3_uri": "https://337213553626-7ovirzjr.eu-west-2.console.aws.amazon.com/s3/object/retrofit-data-dev?region=eu-west-2&prefix=ara_raw_inputs/peabody/2025_11_11+-+Peabody+-+Data+Extracts+for+Domna_transformed.csv",
# "task_id":"e31f2f21-175b-4a91-a3ec-a6baa325e917",
# "s3_uri":"s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv"
# }
# Handle both single event and batch events (SQS, etc.)
@ -196,7 +98,13 @@ def handler(event, context):
task_id = None
subtask_id = None
try:
# Parse body
# For local development
if local is True:
record = {}
record["body"] = (
'{"task_id":"e31f2f21-175b-4a91-a3ec-a6baa325e917","s3_uri":"s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv"}'
)
# Parse body (inputs)
if isinstance(record.get("body"), str):
body = json.loads(record["body"])
else:
@ -236,17 +144,33 @@ def handler(event, context):
df = pd.DataFrame(csv_data)
logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns")
# Get head for demo
df_head = df.head()
logger.info("DataFrame head:")
logger.info(f"\n{df_head}")
# Sanitise postcodes
df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode)
# Group by sanitised postcode (excluding null values)
grouped_data = []
for postcode, group_df in df.dropna(subset=["postcode_clean"]).groupby(
"postcode_clean"
):
group_info = {
"postcode": postcode,
"row_count": len(group_df),
"rows": group_df.to_dict(orient="records"),
}
grouped_data.append(group_info)
logger.info(f"Postcode: {postcode}, Rows: {len(group_df)}")
logger.info(f"Total postcodes: {len(grouped_data)}")
results.append(
{
"message": "Postcode splitter processing started",
"message": "Postcode splitter processing completed",
"task_id": str(task_id),
"s3_uri": s3_uri,
"subtask_id": str(subtask_id),
"total_rows": len(df),
"total_postcodes": len(grouped_data),
"grouped_data": grouped_data,
}
)
@ -258,6 +182,7 @@ def handler(event, context):
"status": "processing_complete",
"s3_uri": s3_uri,
"rows_processed": len(df),
"total_postcodes": len(grouped_data),
},
)
logger.info(f"Subtask {subtask_id} marked as complete")
@ -295,7 +220,3 @@ def handler(event, context):
{"processed": results, "errors": errors if errors else None}
),
}
if __name__ == "__main__":
main()