import re
from dataclasses import dataclass
from typing import Any, Dict, Optional

from openpyxl import Workbook, load_workbook
from openpyxl.worksheet.worksheet import Worksheet

# Sheet read when the caller does not name one.
DEFAULT_SHEET_NAME = "Southern RA-Lite Programme 3103"

# 1-based worksheet row that holds the column headers.
_HEADER_ROW = 1

# Header texts (compared after strip + lower-casing) that must all be present.
_ID_HEADER = "id"
_DEAL_NAME_HEADER = "deal name"
_LISTING_ID_HEADER = "associated listing ids"
_REQUIRED_HEADERS = (_ID_HEADER, _DEAL_NAME_HEADER, _LISTING_ID_HEADER)

# Postcode-shaped token: 1-2 letters, a digit, an optional letter/digit,
# optional whitespace, then a digit and two letters.
_POSTCODE_RE = re.compile(
    r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b",
    re.IGNORECASE,
)


@dataclass
class PropertyRow:
    """One usable data row taken from the property spreadsheet.

    Attributes:
        row_index: 1-based worksheet row the record was read from.
        address: Short address derived from the row's "deal name" cell.
        listing_id: Text of the row's "associated listing ids" cell.
    """

    row_index: int
    address: str
    listing_id: str


def _locate_required_columns(ws: Worksheet) -> Dict[str, int]:
    """Map each required header text to its 0-based position in the header row.

    Raises:
        ValueError: If any required header is absent; the message lists them.
    """
    header_values = next(
        ws.iter_rows(min_row=_HEADER_ROW, max_row=_HEADER_ROW, values_only=True),
        (),
    )

    columns: Dict[str, int] = {}
    for index, raw_value in enumerate(header_values):
        header = str(raw_value).strip().lower() if raw_value else ""
        if header in _REQUIRED_HEADERS:
            # If a header text is repeated, the right-most column wins.
            columns[header] = index

    missing = [name for name in _REQUIRED_HEADERS if name not in columns]
    if missing:
        raise ValueError("Missing required columns: " + ", ".join(missing))
    return columns


def extract_addresses_from_spreadsheet(
    filepath: str,
    sheet_name: str = DEFAULT_SHEET_NAME,
) -> Dict[str, PropertyRow]:
    """Read property rows from a workbook, keyed by the value of their "id" cell.

    The first row of the sheet is treated as the header. Rows in which the id,
    deal name or listing-id cell is empty (or otherwise falsy) are skipped.
    If the same id appears more than once, the last such row wins.

    Args:
        filepath: Path of the workbook to open.
        sheet_name: Name of the worksheet to read.

    Returns:
        A dict mapping the stripped string form of each row's id to a
        ``PropertyRow`` holding the row number, succinct address and listing id.

    Raises:
        KeyError: If the workbook has no worksheet called ``sheet_name``
            (raised by openpyxl's workbook lookup).
        ValueError: If a required header column is missing.
    """
    # data_only=True yields cached formula results rather than formula text.
    wb: Workbook = load_workbook(filepath, data_only=True)
    ws: Worksheet = wb[sheet_name]

    columns = _locate_required_columns(ws)
    id_idx = columns[_ID_HEADER]
    deal_name_idx = columns[_DEAL_NAME_HEADER]
    listing_id_idx = columns[_LISTING_ID_HEADER]

    first_data_row = _HEADER_ROW + 1
    properties: Dict[str, PropertyRow] = {}
    data_rows = ws.iter_rows(min_row=first_data_row, values_only=True)
    for row_index, values in enumerate(data_rows, start=first_data_row):
        id_val: Any = values[id_idx]
        deal_name: Any = values[deal_name_idx]
        listing_id: Any = values[listing_id_idx]

        if not id_val or not deal_name or not listing_id:
            continue

        property_id = str(id_val).strip()
        properties[property_id] = PropertyRow(
            row_index=row_index,
            address=extract_succinct_address(str(deal_name)),
            # Cells may hold numbers; the dataclass field is declared as str.
            listing_id=str(listing_id).strip(),
        )

    return properties


def extract_succinct_address(deal_name: str) -> str:
    """Shorten a deal name to its first address segment plus postcode.

    Only the text before the first ``|`` is considered. The result is the text
    before the first comma, followed by the first postcode-shaped token found
    (upper-cased) when that token is not already part of the first segment.

    Args:
        deal_name: Raw deal name text.

    Returns:
        The succinct address; just the first segment when no postcode is found.
    """
    left_part = deal_name.split("|")[0].strip()
    first_segment = left_part.split(",")[0]
    first_part = first_segment.strip()

    postcode_match: Optional[re.Match[str]] = _POSTCODE_RE.search(left_part)
    if postcode_match is None:
        return first_part

    # A postcode cannot contain a comma, so a match that starts inside the
    # first segment lies wholly within it; appending it would duplicate it.
    if postcode_match.start() < len(first_segment):
        return first_part

    postcode = postcode_match.group(1).upper()
    return f"{first_part} {postcode}"