From e9d3577cf61d36170ee3f6452a586efce202a467 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 5 Jan 2024 16:07:23 +0000 Subject: [PATCH] set up solar etc process and deleted research script --- etl/solar/SolarPhotoSupply.py | 1 + etl/solar/app.py | 3 - etl/testing_data/solar_research.py | 112 ----------------------------- 3 files changed, 1 insertion(+), 115 deletions(-) delete mode 100644 etl/testing_data/solar_research.py diff --git a/etl/solar/SolarPhotoSupply.py b/etl/solar/SolarPhotoSupply.py index c6e2f9cb..dadb71f0 100644 --- a/etl/solar/SolarPhotoSupply.py +++ b/etl/solar/SolarPhotoSupply.py @@ -110,6 +110,7 @@ class SolarPhotoSupply: if self.photo_supply_lookup.empty: raise ValueError("No data to save") + logger.info("Storing outputs to S3") # Store this data in s3 as a parquet file save_dataframe_to_s3_parquet( diff --git a/etl/solar/app.py b/etl/solar/app.py index 29802e72..50a3d282 100644 --- a/etl/solar/app.py +++ b/etl/solar/app.py @@ -1,9 +1,6 @@ -import pandas as pd from pathlib import Path -from tqdm import tqdm from etl.epc.property_change_app import get_cleaned from etl.solar.SolarPhotoSupply import SolarPhotoSupply -from utils.s3 import save_dataframe_to_s3_parquet DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates" diff --git a/etl/testing_data/solar_research.py b/etl/testing_data/solar_research.py deleted file mode 100644 index e66e992c..00000000 --- a/etl/testing_data/solar_research.py +++ /dev/null @@ -1,112 +0,0 @@ -import pandas as pd -from pathlib import Path -from tqdm import tqdm -from etl.epc.property_change_app import get_cleaned -from utils.s3 import save_dataframe_to_s3_parquet - -DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates" - - -def app(): - """ - This code reads in the EPC data and attempt to produce a reasonable figure for the photo-supply variable, which - is the following: - "Percentage of photovoltaic area as a percentage of total roof area. 0% indicates that a Photovoltaic Supply - is not present in the property." - - When recommending solar, we want to simulate the retrofit by increasing this value from 0, so we need a sensible - figure to increase this to. This script will pull the data for that, to allow us to try and deduce what - a sensible figure would be - :return: - """ - - directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] - results = [] - for dir in tqdm(directories): - filepath = dir / "certificates.csv" - df = pd.read_csv(filepath, low_memory=False) - df = df[~pd.isnull(df["UPRN"])] - df["UPRN"] = df["UPRN"].astype(int).astype(str) - # Drop rows that have a missing PROPERTY_TYPE, BUILT_FORM, CONSTRUCTION_AGE_BAND, TOTAL_FLOOR_AREA - for col in ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "TOTAL_FLOOR_AREA"]: - df = df[~pd.isnull(df[col])] - # Take newest LODGEMENT_DATE per UPRN - df = df.sort_values(by="LODGEMENT_DATE", ascending=False).drop_duplicates(subset=["UPRN"]) - - data = df[ - ["UPRN", "PROPERTY_TYPE", "TENURE", "BUILT_FORM", "ROOF_DESCRIPTION", "PHOTO_SUPPLY", "TOTAL_FLOOR_AREA", - "CONSTRUCTION_AGE_BAND", "SOLAR_WATER_HEATING_FLAG"] - ].copy() - data["PHOTO_SUPPLY"] = data["PHOTO_SUPPLY"].fillna(0) - data = data[data["PHOTO_SUPPLY"] != 0] - results.append(data) - - results = pd.concat(results) - - # Convert total floor area to deciles - decile_thresholds = results["TOTAL_FLOOR_AREA"].quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]).values - - def classify_floor_area(new_area, thresholds): - for i, threshold in enumerate(thresholds): - if new_area <= threshold: - return i # Returns the decile index (0 to 9) - return len(thresholds) - - # Assuming 'new_data' is your new DataFrame with floor area data - results["floor_area_decile"] = pd.cut( - results["TOTAL_FLOOR_AREA"], - bins=[0] + list(decile_thresholds) + [float('inf')], - labels=False, - include_lowest=True - ) - - # Convert tenure to lower - results["TENURE"] = results["TENURE"].str.lower() - - # Append on the roof details - cleaned_lookup = get_cleaned() - lookup = pd.DataFrame(cleaned_lookup["roof-description"]) - - results = results.merge( - lookup.drop( - columns=[ - "clean_description", "thermal_transmittance", "thermal_transmittance_unit", "insulation_thickness", - "is_assumed" - ] - ), - left_on="ROOF_DESCRIPTION", - right_on="original_description", - how="left" - ) - - aggregated = results.groupby( - [ - "PROPERTY_TYPE", "BUILT_FORM", "TENURE", "is_pitched", "is_roof_room", "is_flat", - "CONSTRUCTION_AGE_BAND", "floor_area_decile" - ], - observed=True - ).agg( - { - "PHOTO_SUPPLY": ["median", "mean"], - } - ).reset_index() - - aggregated.columns = ['_'.join(col).strip() for col in aggregated.columns.values] - # Remove trailing underscore from columns - aggregated.columns = [col[:-1] if col.endswith("_") else col for col in aggregated.columns.values] - # Convert columns to lowercase - aggregated.columns = [col.lower() for col in aggregated.columns.values] - - # Store this data in s3 as a parquet file - save_dataframe_to_s3_parquet( - df=aggregated, - bucket_name="retrofit-data-dev", - file_key="solar_pv_supply/photo_supply_lookup.parquet", - ) - - floor_area_decile_thresholds = pd.DataFrame(decile_thresholds, columns=["floor_area_decile_thresholds"]) - save_dataframe_to_s3_parquet( - df=floor_area_decile_thresholds, - bucket_name="retrofit-data-dev", - file_key=f"solar_pv_supply/floor_area_decile_thresholds.parquet", - )