diff --git a/backend/etl/etl_opendatacommunities/main.py b/backend/etl/etl_opendatacommunities/main.py index 30b4045a..2bd41005 100644 --- a/backend/etl/etl_opendatacommunities/main.py +++ b/backend/etl/etl_opendatacommunities/main.py @@ -15,16 +15,7 @@ logger = setup_logger() SRC_ROOT = Path("/workspaces/home/epc_data") TMP_ROOT = Path("/tmp/epc_postcodes") S3_BUCKET = "retrofit-data-dev" -S3_PREFIX = "epc_opendatacommunities" - -REC_COLS = { - "IMPROVEMENT_ITEM", - "IMPROVEMENT_SUMMARY_TEXT", - "IMPROVEMENT_DESCR_TEXT", - "IMPROVEMENT_ID", - "IMPROVEMENT_ID_TEXT", - "INDICATIVE_COST", -} +S3_PREFIX = "historical_epc" # This scripts assume you downloading the zip, unzip it, and running it locally @@ -35,18 +26,16 @@ def sanitise(pc: pd.Series) -> pd.Series: def shard_la(la_dir: Path) -> None: certs = pd.read_csv(la_dir / "certificates.csv", low_memory=False) - recs = pd.read_csv(la_dir / "recommendations.csv", low_memory=False) - merged = certs.merge(recs, on="LMK_KEY", how="left") - merged["POSTCODE_CLEAN"] = sanitise(merged["POSTCODE"]) - before = len(merged) - merged = merged.dropna(subset=["POSTCODE_CLEAN"]) - merged = merged[merged["POSTCODE_CLEAN"] != ""] - dropped = before - len(merged) + certs["POSTCODE_CLEAN"] = sanitise(certs["POSTCODE"]) + before = len(certs) + certs = certs.dropna(subset=["POSTCODE_CLEAN"]) + certs = certs[certs["POSTCODE_CLEAN"] != ""] + dropped = before - len(certs) if dropped: logger.warning(f"{la_dir.name}: dropped {dropped} rows with empty postcode") - for pc, group in merged.groupby("POSTCODE_CLEAN", sort=False): + for pc, group in certs.groupby("POSTCODE_CLEAN", sort=False): out = TMP_ROOT / f"{pc}.csv" group.drop(columns=["POSTCODE_CLEAN"]).to_csv( out, mode="a", header=not out.exists(), index=False @@ -67,9 +56,7 @@ def list_existing_keys(s3: Any) -> set[str]: def upload_postcode(path: Path, s3: Any) -> None: df = pd.read_csv(path, low_memory=False).drop_duplicates() - cert_cols = [c for c in df.columns if c not in REC_COLS] - cert_only = df[cert_cols].drop_duplicates() - dupes = cert_only["LMK_KEY"].value_counts() + dupes = df["LMK_KEY"].value_counts() bad = dupes[dupes > 1] if not bad.empty: raise ValueError( @@ -95,12 +82,14 @@ def main(): ) logger.info(f"Sharding {len(la_dirs)} LA folders -> {TMP_ROOT}") - # for la in tqdm(la_dirs, desc="shard"): - # shard_la(la) + for la in tqdm(la_dirs, desc="shard"): + shard_la(la) s3 = boto3.client( "s3", - config=Config(max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"}), + config=Config( + max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"} + ), ) pc_files = sorted(TMP_ROOT.glob("*.csv")) logger.info(f"Found {len(pc_files)} local shards") diff --git a/datatypes/epc/domain/historic_epc.py b/datatypes/epc/domain/historic_epc.py new file mode 100644 index 00000000..230c6327 --- /dev/null +++ b/datatypes/epc/domain/historic_epc.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + + +@dataclass +class HistoricEpc: + address1: str + address2: str + address3: str + postcode: str + uprn: str diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py index 9ebe4b09..f64ab8c4 100644 --- a/datatypes/epc/schema/historic_epc.py +++ b/datatypes/epc/schema/historic_epc.py @@ -96,9 +96,3 @@ class HistoricEpc: uprn: str uprn_source: str report_type: str - improvement_item: str - improvement_summary_text: str - improvement_descr_text: str - improvement_id: str - improvement_id_text: str - indicative_cost: str diff --git a/datatypes/epc/schema/tests/test_historic_epc_loading.py b/datatypes/epc/schema/tests/test_historic_epc_loading.py index d5d5ea22..2170a8a6 100644 --- a/datatypes/epc/schema/tests/test_historic_epc_loading.py +++ b/datatypes/epc/schema/tests/test_historic_epc_loading.py @@ -47,9 +47,3 @@ class TestHistoricEpcLoading: def test_report_type(self, epc: HistoricEpc) -> None: assert epc.report_type == "100" - - def test_improvement_summary_text(self, epc: HistoricEpc) -> None: - assert epc.improvement_summary_text == "Increase loft insulation to 270 mm" - - def test_indicative_cost(self, epc: HistoricEpc) -> None: - assert epc.indicative_cost == "£100 - £350" diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index b275086d..5e3ce5d5 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -26,13 +26,13 @@ from backend.app.db.functions.materials_functions import get_materials from collections import defaultdict from sqlalchemy import func -PORTFOLIO_ID = 711 -SCENARIOS = [1233] +PORTFOLIO_ID = 632 +SCENARIOS = [1144] scenario_names = { - 1233: "Reach EPC C", + 1144: "EPC C", } -project_name = "Novus" +project_name = "Calico Refresh" def get_data(portfolio_id, scenario_ids):