# Model/etl/customers/gla_croydon_demo/asset_list.py

import pandas as pd
from utils.s3 import save_csv_to_s3

# Identifiers of the demo user and portfolio. app() uses them to build the S3 key
# "<USER_ID>/<PORTFOLIO_ID>/inputs.csv" and passes PORTFOLIO_ID in the request body it prints.
USER_ID = 8
PORTFOLIO_ID = 67

# Fixed UPRN selections, one list per archetype (20, 46, 23 and 13 entries respectively).
# app() concatenates the four lists and keeps only the asset-list rows whose UPRN appears in them.
# NOTE(review): these look like the frozen result of a one-off random sample per archetype, hard-coded so the
# portfolio does not change between runs - TODO confirm.
archetype_1_uprns = [100020604138, 200001188299, 100020578756, 200001187196, 200001192253, 100020581792, 200001188304,
                     100020625813, 100020618060, 100020585305, 100020617489, 100020615039, 100020618076, 100020588913,
                     200001187197, 100020671205, 100020576940, 100020619814, 100020576472, 100020618083]
archetype_2_uprns = [100020698027, 10001007455, 100020653785, 10090383198, 100020665632, 100020620659, 100020615603,
                     100020609610, 100020625597, 100020665656, 100020665640, 100020587905, 100020665630, 100020624351,
                     100020625451, 100020624348, 100020666735, 100020653786, 100020576458, 100020657902, 100020624350,
                     100020637405, 100020666734, 100020616325, 100020666716, 100020653783, 100020665645, 100020642337,
                     100020665638, 100022904981, 100020688226, 100020630285, 100020626800, 100020665634, 100022907528,
                     100020665652, 100020624347, 100020666721, 100020585002, 10014055968, 10001008257, 100020621438,
                     100020576459, 100020665643, 100020665654, 100022917303]
archetype_3_uprns = [100020577523, 100020616446, 100020605342, 100020594652, 100020585394, 100020601138, 100020597485,
                     100020614883, 100020633162, 100020697787, 200001185785, 100020646842, 100020581449, 100020595611,
                     100020641814, 100020575611, 100020652986, 100020654671, 100020647336, 100020610518, 100020607980,
                     100020692380, 100020581690]
archetype_4_uprns = [100020650603, 100020582907, 100020605116, 100020650607, 100020589325, 100020655500, 100020642537,
                     200001187539, 100020631683, 100020610165, 100020596436, 100020598277, 100020660228]


def _archetype_asset_list(epc_data: pd.DataFrame, mask: pd.Series, label: str) -> pd.DataFrame:
    """
    Build the asset-list rows for a single archetype
    :param epc_data: EPC certificates, one row per property
    :param mask: boolean Series aligned with epc_data, True for the properties belonging to the archetype
    :param label: value written to the ARCHETYPE column, e.g. "Archetype 1"
    :return: a new dataframe with the columns UPRN, ADDRESS1, POSTCODE and ARCHETYPE
    """
    return epc_data.loc[mask, ["UPRN", "ADDRESS1", "POSTCODE"]].assign(ARCHETYPE=label)


def app() -> None:
    """
    We shall define a small portfolio of properties, based in Croydon

    The Croydon domestic EPC download is read from local_data, reduced to the newest certificate per UPRN, narrowed to
    social-rented properties below EPC C that were lodged in the last 3 years, and split into four archetypes. The
    result is restricted to the module-level archetype_*_uprns lists, saved to S3 as
    "<USER_ID>/<PORTFOLIO_ID>/inputs.csv", and the matching request body is printed.
    :return: None
    """

    # Firstly, read in the EPC data for Croydon
    epc_data = pd.read_csv(
        "local_data/all-domestic-certificates/domestic-E09000008-Croydon/certificates.csv",
        low_memory=False
    )

    # Filter on entries where we have a UPRN. We take an explicit copy so that the column assignment below writes to
    # an independent frame rather than to a slice of the raw download (this avoids pandas' SettingWithCopyWarning)
    epc_data = epc_data[epc_data["UPRN"].notna()].copy()

    # Get the newest EPC for each UPRN. We use LODGEMENT_DATE as a proxy for this: after sorting newest-first,
    # drop_duplicates keeps the first (i.e. most recent) row per UPRN
    epc_data["LODGEMENT_DATE"] = pd.to_datetime(epc_data["LODGEMENT_DATE"])
    epc_data = epc_data.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN")

    # Now filter on social properties
    epc_data = epc_data[epc_data["TENURE"].isin(["rental (social)", "Rented (social)"])]
    # There are 17337 properties with a registered EPC in Croydon
    # Take below EPC C properties
    epc_data = epc_data[epc_data["CURRENT_ENERGY_EFFICIENCY"].astype(int) < 69]
    # 7994 properties are below EPC C (46%)
    # 79% D, 19% E, 1% F, 0.2% G - it probably makes the most sense to focus on E and D properties
    # (figures from epc_data["CURRENT_ENERGY_RATING"].value_counts(normalize=True))

    # For the purpose of the sample, take the properties that have had surveys done in the last 3 years
    # This gives us 1351 remaining properties
    # NOTE(review): the cutoff is relative to the time the script is run, whereas the UPRN lists applied at the end
    # are fixed, so re-running later will gradually drop properties from the output
    three_years_ago = pd.Timestamp.now() - pd.DateOffset(days=3 * 365)
    epc_data = epc_data[epc_data["LODGEMENT_DATE"] >= three_years_ago]

    # Building blocks shared by several archetypes
    property_type = epc_data["PROPERTY_TYPE"]
    rating = epc_data["CURRENT_ENERGY_RATING"]
    walls = epc_data["WALLS_DESCRIPTION"]
    roof = epc_data["ROOF_DESCRIPTION"]
    unfilled_cavity = walls == "Cavity wall, as built, no insulation (assumed)"

    # Archetype 1: defined below:
    # 1) House
    # 2) Unfilled cavity
    # 3) A roof that could be insulated (flat or pitched with no more than 50mm insulation)
    # 4) EPC E or D
    # 24 properties
    archetype_1_mask = (
        (property_type == "House")
        & rating.isin(["D", "E"])
        & unfilled_cavity
        & roof.isin(
            [
                "Pitched, 12 mm loft insulation",
                "Pitched, 0 mm loft insulation",
                "Pitched, no insulation",
                "Pitched, 50 mm loft insulation",
                "Flat, no insulation (assumed)",
                "Pitched, no insulation (assumed)",
            ]
        )
    )

    # Archetype 2: defined below:
    # 1) Flat
    # 2) Unfilled cavity
    # 3) Another property above
    # 4) EPC E or D
    # 57 properties here
    archetype_2_mask = (
        (property_type == "Flat")
        & rating.isin(["E", "D"])
        & unfilled_cavity
        & (roof == "(another dwelling above)")
    )

    # Archetype 3: defined below:
    # 1) EPC E or below
    # 2) Solid brick wall
    # 3) House
    # 4) Pitched roof with no, limited or up to 100mm loft insulation
    # These are more expensive to retrofit
    archetype_3_mask = (
        (property_type == "House")
        & rating.isin(["E", "F", "G"])
        & (walls == "Solid brick, as built, no insulation (assumed)")
        & roof.isin(
            [
                "Pitched, no insulation",
                "Pitched, limited insulation (assumed)",
                "Pitched, 100 mm loft insulation",
                "Pitched, no insulation (assumed)",
            ]
        )
    )

    # Archetype 4: defined below:
    # 1) Maisonette
    # 2) Empty cavity
    # 3) Any rating below EPC C (no further rating filter is applied here)
    # 16 properties here
    archetype_4_mask = (property_type == "Maisonette") & unfilled_cavity

    asset_list = pd.concat(
        [
            _archetype_asset_list(epc_data, archetype_1_mask, "Archetype 1"),
            _archetype_asset_list(epc_data, archetype_2_mask, "Archetype 2"),
            _archetype_asset_list(epc_data, archetype_3_mask, "Archetype 3"),
            _archetype_asset_list(epc_data, archetype_4_mask, "Archetype 4"),
        ]
    )

    asset_list = asset_list.rename(
        columns={
            "UPRN": "uprn",
            "ADDRESS1": "address",
            "POSTCODE": "postcode",
            "ARCHETYPE": "archetype"
        }
    )

    asset_list["uprn"] = asset_list["uprn"].astype(int)

    # We end up with some properties that are currently an EPC C, but we do not have this data in the download, so we
    # manually remove
    # 1) 3 Reid Close, CR5 3BL
    # 2) Flat 6, Collier Court 2A, St. Peters Road CR0 1HD
    asset_list = asset_list[
        ~asset_list["uprn"].isin(
            [
                100020576460,
                100020624352,
            ]
        )
    ]

    # We have slightly too many properties, so we only keep a fixed selection per archetype (20, 46, 23 and 13
    # properties for archetypes 1 to 4). The selections are the module-level archetype_*_uprns lists; they were
    # originally drawn with a random .sample() of asset_list["uprn"] per archetype and then hard-coded
    uprns_to_keep = archetype_1_uprns + archetype_2_uprns + archetype_3_uprns + archetype_4_uprns
    asset_list = asset_list[asset_list["uprn"].isin(uprns_to_keep)]

    filename = f"{USER_ID}/{PORTFOLIO_ID}/inputs.csv"
    save_csv_to_s3(
        dataframe=asset_list,
        bucket_name="retrofit-plan-inputs-dev",
        file_name=filename
    )

    # Request body for this portfolio; it is only printed here, not sent anywhere
    body = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Social",
        "goal": "Increase EPC",
        "goal_value": "C",
        "trigger_file_path": filename,
        "budget": None,
        "exclusions": ["floor_insulation"]
    }
    print(body)