feat(epc-prediction): anonymised Tier-1 fixture + builder (ADR-0030)

The committed gate needs frozen, reproducible data without dumping real UK
addresses into the repo. Add:
- harness anonymise_payload + stable_hash: hash street address + cert number
  into opaque, dedup-stable tokens; blank secondary address lines + post_town;
  keep postcode + all component/lodged fields (gov data is OGL). Unit-tested.
- scripts/build_epc_prediction_fixture.py: curate qualifying postcodes (>=1
  SAP 10.2 target + >=2 distinct addresses) from the local scratch corpus,
  anonymise, freeze under tests/fixtures/epc_prediction/.
- The frozen fixture: 15 postcodes / 280 certs / 36 SAP-10.2 targets.
  Verified no plaintext address_line_1 and post_town all blank.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-14 09:17:27 +00:00
parent 027ee1fba3
commit 008c1922c4
284 changed files with 789 additions and 0 deletions

View file

@ -13,6 +13,7 @@ not. Layout: `<dir>/<POSTCODE>/<cert>.json` + `<dir>/_index.json`.
from __future__ import annotations
import hashlib
import json
from datetime import date
from pathlib import Path
@ -21,6 +22,10 @@ from typing import Any, Optional
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.epc_prediction.comparable_properties import Comparable
# Identifying free-text fields blanked when freezing a payload into the committed
# fixture (postcode is kept — it is coarse open data and the cohort key).
# `address_line_1` is not listed here: `anonymise_payload` replaces it with a
# stable hash token rather than blanking it.
_PII_BLANK_FIELDS = ("address_line_2", "address_line_3", "post_town")
def load_corpus(corpus_dir: Path) -> list[list[Comparable]]:
"""Load every postcode cohort under `corpus_dir`. Returns one list of
@ -61,6 +66,33 @@ def _load_cohort(
return cohort
def stable_hash(prefix: str, value: str) -> str:
    """Return ``<prefix>-<12 hex chars>``, a deterministic token for `value`.

    `value` is stripped of surrounding whitespace and upper-cased before it is
    hashed, so spellings of an identifier that differ only in case or outer
    padding map to the same token. That keeps dedup working on re-lodgements
    of the same address while the plaintext stays out of the repo.
    """
    # NOTE(review): the digest is unsalted and truncated. With the postcode kept
    # next to the token, a short street address could be brute-forced — confirm
    # this is acceptable for the committed fixture.
    normalised = value.strip().upper()
    hex_digest = hashlib.sha1(normalised.encode()).hexdigest()
    return "{}-{}".format(prefix, hex_digest[:12])
def anonymise_payload(raw: dict[str, Any]) -> dict[str, Any]:
    """Return a de-identified shallow copy of a cert payload; `raw` is not mutated.

    `address_line_1` and `certificate_number` are swapped for `stable_hash`
    tokens (prefixes ``addr`` / ``cert``) when they hold a truthy value. Every
    field named in `_PII_BLANK_FIELDS` that is present is set to ``""``. All
    other keys — postcode, registration date, SAP version, lodged figures and
    component fields — are copied through untouched.
    """
    scrubbed = dict(raw)

    # Direct identifiers and the token prefix each one is hashed under.
    hashed_fields = (("address_line_1", "addr"), ("certificate_number", "cert"))
    for key, prefix in hashed_fields:
        original = raw.get(key)
        if original:
            scrubbed[key] = stable_hash(prefix, str(original))

    # Only blank fields that already exist, so no new keys are introduced.
    scrubbed.update({name: "" for name in _PII_BLANK_FIELDS if name in scrubbed})
    return scrubbed
def _address(raw: dict[str, Any]) -> Optional[str]:
    """Return the payload's `address_line_1`, stripped and upper-cased.

    Returns None when the field is missing or falsy (e.g. ``None`` or ``""``).
    """
    line_one = raw.get("address_line_1")
    if not line_one:
        return None
    return str(line_one).strip().upper()

View file

@ -0,0 +1,94 @@
"""Freeze a small, anonymised EPC Prediction fixture for the Tier-1 gate (ADR-0030).
Curates a deterministic subset of the local scratch corpus
(`/tmp/epc_prediction_corpus`, gitignored) into a committed fixture under
`tests/fixtures/epc_prediction/`. Selection keeps postcodes that can actually be
scored — at least one SAP 10.2 target plus a second distinct address to predict
it from. Every payload is run through `anonymise_payload` first, so the street
address + certificate number become opaque tokens and no plaintext address lands
in the repo (postcode + component data are open gov data and kept).
The committed fixture is the deterministic basis for the ratcheting gate; the
large scratch corpus stays local for iteration + the offline battle-test.
USAGE
-----
PYTHONPATH=. python scripts/build_epc_prediction_fixture.py
Source: $EPC_PREDICTION_CORPUS (default /tmp/epc_prediction_corpus).
"""
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import Any
from harness.epc_prediction_corpus import anonymise_payload, stable_hash
# Scratch corpus to curate from; overridable via $EPC_PREDICTION_CORPUS.
SOURCE = Path(os.environ.get("EPC_PREDICTION_CORPUS", "/tmp/epc_prediction_corpus"))
# Destination of the committed fixture; a relative path, so it resolves against
# the working directory the script is run from.
FIXTURE = Path("tests/fixtures/epc_prediction")
# `sap_version` value (compared as a string) that marks a cert as a valid target.
_SAP_10_2 = "10.2"
_MAX_POSTCODES = 15 # keep the committed fixture small
_MAX_COHORT = 25 # cap certs per postcode to bound repo size
def _load_payloads(
    postcode: str, certs: list[str]
) -> list[tuple[str, dict[str, Any]]]:
    """Load the cached payloads for one postcode, paired with their cert numbers.

    Args:
        postcode: Directory name under `SOURCE` holding the cohort's payloads.
        certs: Source certificate numbers listed for the postcode; each one is
            looked up as `<cert>.json` inside that directory.

    Returns:
        `(source cert number, payload)` pairs in `certs` order. The cert number
        is carried alongside because it lives in the index/filename, not the
        cached payload. Certs with no file on disk are skipped.
    """
    cohort_dir = SOURCE / postcode
    payloads: list[tuple[str, dict[str, Any]]] = []
    for cert in certs:
        path = cohort_dir / f"{cert}.json"
        if not path.exists():
            continue
        # Decode explicitly as UTF-8: the default for read_text() follows the
        # locale, which would make parsing of non-ASCII payloads depend on the
        # machine the builder runs on.
        payloads.append((cert, json.loads(path.read_text(encoding="utf-8"))))
    return payloads
def _qualifies(payloads: list[tuple[str, dict[str, Any]]]) -> bool:
    """Decide whether a postcode cohort is usable for the fixture.

    A cohort qualifies iff it has ≥1 SAP 10.2 cert (a valid target) and ≥2
    distinct addresses (so the target has at least one neighbour to predict
    it). Addresses are compared after stripping and upper-casing.

    Args:
        payloads: `(source cert number, payload)` pairs for one postcode.

    Returns:
        True when both conditions hold, otherwise False.
    """
    has_target = any(
        str(payload.get("sap_version")) == _SAP_10_2 for _, payload in payloads
    )
    if not has_target:
        return False

    addresses: set[str] = set()
    for _, payload in payloads:
        line_one = payload.get("address_line_1")
        # A missing, None or blank address is not an address: counting it
        # (as "", or as "NONE" via str(None)) would let a cohort with a single
        # real address pass the two-address check.
        if not line_one:
            continue
        normalised = str(line_one).strip().upper()
        if normalised:
            addresses.add(normalised)
    return len(addresses) >= 2
def main() -> None:
    """Build the anonymised fixture under `FIXTURE` from the corpus at `SOURCE`.

    Walks `SOURCE/_index.json` in file order and keeps the first
    `_MAX_POSTCODES` postcodes whose capped cohort qualifies. For each kept
    postcode, every payload is anonymised and written to
    `FIXTURE/<postcode>/<cert token>.json`; the token lists are then written to
    `FIXTURE/_index.json` and a one-line summary is printed.

    Existing files under `FIXTURE` are overwritten when names collide but are
    not otherwise removed.
    """
    index: dict[str, list[str]] = json.loads(
        (SOURCE / "_index.json").read_text(encoding="utf-8")
    )
    fixture_index: dict[str, list[str]] = {}
    total_certs = 0

    for postcode, certs in index.items():
        if len(fixture_index) >= _MAX_POSTCODES:
            break

        # Cap first, then qualify. Qualifying on the full cohort and truncating
        # afterwards could drop the only SAP 10.2 target (or the second
        # address), freezing a cohort that cannot be scored.
        payloads = _load_payloads(postcode, certs)[:_MAX_COHORT]
        if not _qualifies(payloads):
            continue

        cohort_dir = FIXTURE / postcode
        cohort_dir.mkdir(parents=True, exist_ok=True)

        kept: list[str] = []
        for cert, raw in payloads:
            # The file name is derived from the source cert number, which comes
            # from the index rather than from the payload itself.
            cert_token = stable_hash("cert", cert)
            anon = anonymise_payload(raw)
            (cohort_dir / f"{cert_token}.json").write_text(
                json.dumps(anon), encoding="utf-8"
            )
            kept.append(cert_token)

        fixture_index[postcode] = kept
        total_certs += len(kept)

    # Make sure the fixture root exists even if no postcode qualified.
    FIXTURE.mkdir(parents=True, exist_ok=True)
    (FIXTURE / "_index.json").write_text(
        json.dumps(fixture_index, indent=2), encoding="utf-8"
    )
    print(
        f"wrote {len(fixture_index)} postcodes / {total_certs} anonymised certs "
        f"to {FIXTURE}"
    )
# Build the fixture only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show more