diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 86b1bf87..88425e6d 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -202,6 +202,33 @@ class AssetList: This class is used to standardise asset lists so that we can process the core information in a consistent manner. """ + EPC_API_DATA_NAMES = { + "uprn": "epc_os_uprn", + "address1": "epc_address1", + "address": "epc_address", + "postcode": "epc_postcode", + "inspection-date": "epc_inspection_date", + "current-energy-efficiency": "epc_sap_score_on_register", + "current-energy-rating": "epc_rating_on_register", + "property-type": "epc_property_type", + "built-form": "epc_archetype", + "total-floor-area": "epc_total_floor_area", + "construction-age-band": "epc_age_band", + "floor-height": "epc_floor_height", + "number-habitable-rooms": "epc_number_habitable_rooms", + "walls-description": "epc_wall_construction", + "roof-description": "epc_roof_construction", + "floor-description": "epc_floor_construction", + "mainheat-description": "epc_heating_type", + 'mainheatcont-description': "epc_heating_controls", + "secondheat-description": "epc_secondary_heating", + "transaction-type": "epc_reason", + "energy-consumption-current": "epc_heat_demand", + } + FIND_EPC_DATA_NAMES = { + + } + DATETIME_REMAP = { "Pre 1900": datetime(year=1899, month=12, day=31), } @@ -590,3 +617,18 @@ class AssetList: def create_lookup_mappings(self): pass + + def merge_data(self, df: pd.DataFrame): + """ + Used to insert data into the standardised asset list, based on the domna property id + :return: + """ + if self.DOMNA_PROPERTY_ID not in df.columns: + raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}") + + if df[self.DOMNA_PROPERTY_ID].duplicated().sum(): + raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs") + + self.standardised_asset_list = self.standardised_asset_list.merge( + df, how="left", on=self.DOMNA_PROPERTY_ID + ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 4bf9fe3a..2e66c4aa 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -474,20 +474,22 @@ def app(): epc_data.append(csv_data) epc_df = pd.concat(epc_data) + # TODO: TEMP!!! + epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID}) # We expand out the recommendations - recommendations_df = epc_df[["row_id", "recommendations"]] + recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] unique_recommendations = set() for _, row in recommendations_df.iterrows(): unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) - columns = ["row_id"] + list(unique_recommendations) + columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations) transformed_data = [] for _, row in recommendations_df.iterrows(): # Initialize a dictionary for this row with False for all recommendations row_data = {col: False for col in columns} - row_data["row_id"] = row["row_id"] + row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID] # Set True for each recommendation present in this row for rec in row["recommendations"]: @@ -500,10 +502,11 @@ def app(): transformed_df = pd.DataFrame(transformed_data) # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation # recommendations - transformed_df = transformed_df[["row_id", "Cavity wall insulation"]] + transformed_df = transformed_df[[asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation"]] # Get the find my epc data - find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( + find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( + columns=["find_my_epc_data"]).join( pd.json_normalize(epc_df["find_my_epc_data"]) ) # We check if we get the solar pv column: @@ -513,46 +516,15 @@ def app(): # Retrieve just the data we need epc_df = epc_df[ - [ - "row_id", - "uprn", - "address1", - "address", - "postcode", - "property-type", - "built-form", - "inspection-date", - "current-energy-rating", - "current-energy-efficiency", - "roof-description", - "walls-description", - "floor-description", - "transaction-type", - "secondheat-description", - "total-floor-area", - "construction-age-band", - "floor-height", - "number-habitable-rooms", - "mainheat-description", - 'mainheatcont-description', - "energy-consumption-current", - "photo-supply", - ] - ].rename( - columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"} + [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) + ].rename( + columns=asset_list.EPC_API_DATA_NAMES ) - asset_list.merge_data(epc_df) - asset_list.insert_ - - asset_list = asset_list.merge( - epc_df, - how="left", - on="row_id" - ).merge( + epc_df = epc_df.merge( find_my_epc_data[ [ - "row_id", "heating_text", "hot_water_text", 'Assessor’s name', + asset_list.DOMNA_PROPERTY_ID, "heating_text", "hot_water_text", 'Assessor’s name', "Assessor's Telephone", "Assessor's Email", "Accreditation scheme", "Assessor’s ID", "Solar photovoltaics" ] @@ -564,31 +536,16 @@ def app(): } ), how="left", - on="row_id" + on=asset_list.DOMNA_PROPERTY_ID ) + asset_list.merge_data(epc_df) + asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) asset_list = asset_list.drop(columns=["photo-supply"]) # Rename the columns - asset_list = asset_list.rename(columns={ - "inspection-date": "Date of last EPC", - "current-energy-efficiency": "SAP score on register", - "current-energy-rating": "EPC rating on register", - "property-type": "Property Type", - "built-form": "Archetype - EPC", - "total-floor-area": "Property Floor Area", - "construction-age-band": "Property Age Band", - "floor-height": "Property Floor Height", - "number-habitable-rooms": "Number of Habitable Rooms", - "walls-description": "Wall Construction", - "roof-description": "Roof Construction", - "floor-description": "Floor Construction", - "mainheat-description": "Heating Type", - "secondheat-description": "Secondary Heating", - "transaction-type": "Reason for last EPC", - "energy-consumption-current": "Heat Demand (kWh/m2)", - }) + asset_list = asset_list asset_list["Estimated Number of Floors"] = asset_list.apply( lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(