From 978deb286bc411a563631e81685319a38ef9061e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 22:32:05 +0000 Subject: [PATCH] debugging remapper --- asset_list/AssetList.py | 19 ++++++++++---- asset_list/mappings/exising_pv.py | 4 +++ asset_list/mappings/heating_systems.py | 17 ++++++------- asset_list/mappings/property_type.py | 2 ++ asset_list/mappings/walls.py | 34 +++++++++++++++++++++++++- etl/route_march_data_pull/app.py | 5 ++-- 6 files changed, 63 insertions(+), 18 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index e61cc89b..8f905a33 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -27,8 +27,8 @@ class DataRemapper: :param standard_values: Set of allowed standardized values. :param standard_map: Dictionary of common remappings {raw_value: standard_value}. """ - self.standard_values = {v.lower() for v in standard_values} # Normalize to lowercase - self.standard_map = {k.lower(): v.lower() for k, v in (standard_map or {}).items()} # Predefined mappings + self.standard_values = standard_values + self.standard_map = standard_map or {} # Tolerate standard_map=None so later `in` checks do not raise self.fuzzy_threshold = 90 # Adjust fuzzy matching sensitivity self.ai_model = "gpt-4-turbo" # Use gpt-3.5-turbo for cheaper processing @@ -39,7 +39,7 @@ class DataRemapper: self.total_tokens_used = 0 self.total_cost = 0 self.remap_dict = {} # {original_value: standardized_value} - self.max_tokens = 1000 # Limit for OpenAI API + self.max_tokens = max_tokens # Limit for OpenAI API # Memoization for AI calls self.ai_cache = {} # {tuple(unmapped_values): {original_value: standardized_value}} @@ -61,6 +61,8 @@ class DataRemapper: return None text = text.strip().lower() text = re.sub(r'[^\w\s]', '', text) # Remove punctuation + # Collapse runs of whitespace into a single space + text = re.sub(r'\s+', ' ', text) return text def fuzzy_match(self, text): @@ -106,6 +108,7 @@ class DataRemapper: if input_tokens > self.max_tokens: raise ValueError("Input tokens exceed the maximum limit.") + 
logger.info("Calling OpenAI API for standardization...") response = self.openai_client.chat.completions.create( model=self.ai_model, messages=[{"role": "user", "content": prompt}], @@ -156,8 +159,14 @@ class DataRemapper: cleaned_value = self.clean_string(value) # Rule-Based Check (Predefined Mapping) - if cleaned_value in self.standard_map: - self.remap_dict[value] = self.standard_map[cleaned_value] + if cleaned_value in self.standard_map or value in self.standard_map: + self.remap_dict[value] = ( + self.standard_map[cleaned_value] if cleaned_value in self.standard_map else self.standard_map[value] + ) + continue + + if isinstance(value, str) and value.lower() in self.standard_map: + self.remap_dict[value] = self.standard_map[value.lower()] continue # Exact Match in Standard Values diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py index 1e45bd83..06e77bba 100644 --- a/asset_list/mappings/exising_pv.py +++ b/asset_list/mappings/exising_pv.py @@ -5,4 +5,8 @@ STANDARD_EXISTING_PV = { EXISTING_PV_MAPPINGS = { "NO": "no PV", "YES": "already has PV", + "no": "no PV", + "yes": "already has PV", + True: "already has PV", + False: "no PV", } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 4fce39ab..2fbdff70 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -34,13 +34,12 @@ HEATING_MAPPINGS = { "Eco Electric Radiators": "electric radiators", "Gas fire": "other", "Backboiler - Solid fuel": "other", + 'combi - gas': 'gas combi boiler', 'e7 storage heaters': 'electric storage heaters', + 'district heating system': 'district heating', 'condensing boiler - gas': 'gas condensing boiler', + 'boiler oil/other': 'oil boiler', 'condensing combi - gas': 'gas condensing combi', + 'air source source heat pump': 'air source heat pump', 'biomass boiler': 'boiler - other fuel', + 'ground source heat pump': 'ground source heat pump', 'electric oil filled radiators': 'electric 
radiators', + 'solid fuel': 'other', 'lpg boiler': 'boiler - other fuel', 'electric boiler': 'electric boiler', + 'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler', + 'eco electric radiators': 'electric radiators', 'gas fire': 'other', 'backboiler - solid fuel': 'other', } - -# array(['Combi - GAS', 'E7 Storage Heaters', 'District heating system', -# 'Condensing Boiler - GAS', 'Boiler Oil/other', -# 'Condensing Combi - Gas', 'Air Source Source Heat Pump', -# 'Biomass Boiler', 'Ground Source Heat Pump', -# 'Electric Oil filled radiators', 'Solid Fuel', 'LPG Boiler', -# 'Electric Boiler', 'No data', 'Boiler Communal/Commercial - GAS', -# 'Eco Electric Radiators', 'Gas fire', 'Backboiler - Solid fuel'], -# dtype=object) diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index bcad9ede..ec569123 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -11,6 +11,8 @@ PROPERTY_MAPPING = { "MAISONET": "maisonette", "BUNGALOW": "bungalow", "BLKHOUS": "block house", + "blkhous": "block house", "BEDSIT": "bedsit", "COACHSE": "coach house", + "coachse": "coach house", } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 7dec7d12..33db1fef 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -1,3 +1,5 @@ +from asset_list.AssetList import DataRemapper + STANDARD_WALL_CONSTRUCTIONS = { "uninsulated cavity", "filled cavity", "partial insulated cavity", "timber frame", "solid brick", "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob", @@ -18,6 +20,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal transmittance 0.23 W/m-¦K': 'unknown', 'Average thermal transmittance 0.18 W/m?K': 'unknown', 'Granite or whin, with internal insulation': 'granite or whinstone', + "Granite or whinstone, as built, insulated (assumed)": "granite or 
whinstone", 'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown', 'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown', 'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown', @@ -34,5 +37,34 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown', 'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown', 'Cavity wall, with internal insulation': 'filled cavity', - 'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown' + 'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown', + 'new build - average thermal transmittance': 'new build - average thermal transmittance', + 'average thermal transmittance 0.25 w/m?k': 'unknown', + 'cavity wall, as built, insulated (assumed)': 'filled cavity', + 'average thermal transmittance 0.31 w/m?k': 'unknown', + 'cavity wall, as built, no insulation (assumed)': 'uninsulated cavity', + 'average thermal transmittance 0.30 w/m?k': 'unknown', 'average thermal transmittance 0.28 w/m-¦k': 'unknown', + 'average thermal transmittance 0.25 w/m-¦k': 'unknown', 'average thermal transmittance 0.21 w/m-¦k': 'unknown', + 'average thermal transmittance 0.20 w/m-¦k': 'unknown', 'average thermal transmittance 0.29 w/m?k': 'unknown', + 'average thermal transmittance 0.16 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m²k': 'unknown', + 'average thermal transmittance 0.15 w/m-¦k': 'unknown', 'average thermal transmittance 0.23 w/m-¦k': 'unknown', + 'average thermal transmittance 0.18 w/m?k': 'unknown', + 'granite or whin, with internal insulation': 'granite or whinstone', + 'average thermal transmittance 0.22 w/m-¦k': 'unknown', 
'average thermal transmittance 0.24 w/m?k': 'unknown', + 'average thermal transmittance 0.16 w/m-¦k': 'unknown', 'average thermal transmittance 0.35 w/m?k': 'unknown', + 'average thermal transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown', + 'average thermal transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown', + 'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', + 'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': 'unknown', + 'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', + 'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown', + 'average thermal transmittance 0.19 w/m-¦k': 'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown', + 'average thermal transmittance 0.22 w/m?k': 'unknown', 'average thermal transmittance 0.38 w/m?k': 'unknown', + 'average thermal transmittance 0.26 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m-¦k': 'unknown', + 'average thermal transmittance 0.18 w/m-¦k': 'unknown', 'average thermal transmittance = 0.27 w/m?k': 'unknown', + 'cavity wall, with external insulation': 'filled cavity', 'average thermal transmittance 0.21 w/m?k': 'unknown', + 'average thermal transmittance 0.23 w/m?k': 'unknown', 'average thermal transmittance 0.20 w/m?k': 'unknown', + 'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown', + 'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown', + 'average thermal transmittance 0.28 w/m?k': 'unknown', } diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index ca5195d6..1289fb09 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -346,7 +346,7 @@ def app(): invalid_property_types_dictionary = 
["bedsit", "bed-sit", "bed sit"] - self = AssetList( + asset_list = AssetList( local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME, @@ -364,8 +364,7 @@ def app(): landlord_heating_system="Heat Source", landlord_existing_pv="PV (Y/N)" ) - self.init_standardise( - ) + asset_list.init_standardise() - self.apply_transformations() + asset_list.apply_transformations()