diff --git a/.claude/settings.json b/.claude/settings.json index 7b99cc77..e89494b4 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -1,5 +1,28 @@ { "permissions": { + "allow": [ + "Read(//home/vscode/.claude/skills/**)", + "Bash(git fetch *)", + "Bash(git add *)", + "Bash(git commit *)", + "Bash(git merge *)", + "Bash(git pull *)", + "Bash(git push *)", + "Bash(git status *)", + "Bash(git checkout *)", + "Bash(git stash *)", + "Bash(git config *)", + "Bash(git branch *)", + "Bash(git worktree *)", + "Bash(git check-ignore *)", + "Bash(git ls-tree *)", + "Bash(npm install *)", + "Bash(npm run *)", + "Bash(npx drizzle-kit *)", + "Bash(pip install *)", + "Bash(terraform fmt *)", + "Bash(gh label *)" + ], "deny": [ "Bash(npx drizzle-kit generate)", "Bash(npx drizzle-kit push)" diff --git a/.gitignore b/.gitignore index df17244f..b26a69b4 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,6 @@ next-env.d.ts backlog/** docs/adr/** + +# Personal Claude Code settings (per-developer, not shared) +.claude/settings.local.json diff --git a/CONTEXT.md b/CONTEXT.md index 9f15c85a..c2073354 100644 --- a/CONTEXT.md +++ b/CONTEXT.md @@ -38,13 +38,42 @@ The housing association supplying a Portfolio's BulkUploads. A Landlord knows fa _Avoid_: customer, client, owner, organisation (Organisation is a separate, broader entity) **Landlord override**: -A landlord-supplied fact about a property that takes precedence over EPC-derived defaults when computing an assessment. The end-to-end Landlord override journey has two layers — a **VocabularyMapping** layer (this glossary entry below) and a per-Property fact layer (not yet modelled). +A landlord-supplied fact about a property that takes precedence over EPC-derived defaults when computing an assessment. The end-to-end Landlord override journey has two layers — a **VocabularyMapping** layer (this glossary entry below) and a per-Property fact layer (the **Property override**, below). _Avoid_: customer data, manual override, landlord data +**Property override**: +The per-Property fact layer — one resolved fact per `(Property, Building part, component)`, where component is one of `wall_type`/`roof_type`/`property_type`/`built_form_type`. Holds a **snapshot** of the resolved enum value (a denormalised copy of the VocabularyMapping outcome at finalise time, so two Properties sharing a description can later diverge), plus the original spreadsheet text it resolved from. Materialised by the finaliser **for UPRN-matched Properties only** (v2); the resolved value is never `UNKNOWN` — the Verify step forces every `UNKNOWN` to be mapped before Finalise, and an unresolved description fails the run. See [ADR-0005](./docs/adr/0005-async-bulk-upload-finaliser.md) (table) and [ADR-0006](./docs/adr/0006-property-overrides-join-and-no-uprn-defer.md) (population). +_Avoid_: per-property mapping, property fact, override row + +**Source row id**: +A synthetic UUID minted per source-file row at `start-address-matching` and written into **both** the address CSV and the classifier CSV. It is the stable join key that lets the finaliser tie a row's identity (combiner output → `property_id`) to that row's raw descriptions (classifier CSV), since neither file preserves row order and `Internal Reference` is absent from the classifier CSV. See [ADR-0006](./docs/adr/0006-property-overrides-join-and-no-uprn-defer.md). +_Avoid_: row index, internal reference (a separate, optional landlord field) + **VocabularyMapping**: The translation from a Landlord's free-text description in a BulkUpload column (e.g. `"cavity: filledcavity"`) to a canonical domain enum value (e.g. `WallType.CAVITY`). Produced by a `ColumnClassifier` (today an LLM, tomorrow possibly a lookup table or rules engine) in the Model service. Stored per-Portfolio, one row per `(category, description)`. A row carries provenance (`classifier` or `user`) so user overrides survive re-classification. _Avoid_: column mapping (that's a separate concept — see `ColumnMapping` above), classification, dictionary +### Building parts + +**Building part**: +One physically distinct part of a dwelling described by a single entry within a multi-valued cell. A dwelling is one **Main building** plus zero or more **Extensions**. Per-part descriptions appear as comma-separated entries in physical-element columns (e.g. `Walls`, `Roofs`); whole-dwelling columns (e.g. `Property Type`) carry a single entry and are **not** split per part. +_Avoid_: annexe, unit, section, dwelling part + +**Main building**: +The principal building part of a dwelling — exactly one per address. The others are **Extensions**. + +**Extension**: +A building part that is not the Main building, numbered **Extension 1 … Extension N-1** for an N-entry address. +_Avoid_: annexe, addition, outbuilding + +**Multi-entry**: +The property of a BulkUpload row whose physical-element cells hold **more than one comma-separated entry**, one per **Building part**. Always intra-cell in our data — never multiple rows sharing one address/UPRN. Within a row, the multi-valued columns agree on entry-count, so **position `i` is the same Building part across every multi-valued column**. +_Avoid_: multi-row, multi-record, duplicate address + +**Building-part ordering** (a.k.a. **ordering**): +The user's declaration, captured once per file, of which list-position maps to which Building part — because the entry order is a consistent per-file mistake (`"A, B"` could be `[Main, Extension 1]` or `[Extension 1, Main]`). Stored per entry-count as a permutation. See [ADR-0004](./docs/adr/0004-multi-entry-building-part-ordering.md). +_Avoid_: sort order, sequence, column mapping + ## Lifecycle A **BulkUpload** moves through these statuses: @@ -55,15 +84,20 @@ ready_for_processing → processing (Address matching triggered; Next.js writes) → combining (Combiner stage running; FastAPI writes directly) → awaiting_review (Combiner output in S3; FastAPI writes directly) - → complete (Finalise succeeded; Next.js writes) - → failed (FastAPI reports in-flight failure — schema only, not yet wired) + → finalising (Finalise dispatched; Next.js writes via compare-and-swap) + → complete (Finaliser succeeded; FastAPI/Lambda writes directly) + → failed (Finaliser failed; FastAPI/Lambda writes directly) ``` -`complete` and `failed` are terminal. +`complete` and `failed` are terminal. `finalising` is the in-flight state of the +async finaliser (mirrors `combining`); the UI renders it as "Uploading to ARA". See +[ADR-0005](./docs/adr/0005-async-bulk-upload-finaliser.md). Re-mapping (PATCHing `columnMapping`) is legal only in `ready_for_processing` and `mapping_complete`. Any later state rejects with 409. -**Two writers**: Next.js owns transitions out of `mapping_complete`, into `processing`, and the terminal Finalise outcomes. FastAPI owns `combining` and `awaiting_review` — writing them direct to the DB during the combiner run. The BulkUpload aggregate observes both. +**Two writers**: Next.js owns transitions out of `mapping_complete`, into `processing`, and the `awaiting_review → finalising` compare-and-swap at Finalise dispatch. FastAPI/Lambda owns `combining`, `awaiting_review`, and the terminal `finalising → complete`/`failed` — writing them direct to the DB during the combiner and finaliser runs. The BulkUpload aggregate observes both. See [ADR-0005](./docs/adr/0005-async-bulk-upload-finaliser.md). + +At `awaiting_review`, **Finalise is gated** (not a new status — a precondition on the action): when classifier columns were mapped the user must acknowledge the classification-verification step, and when the file is **Multi-entry** they must confirm the **Building-part ordering**. See [ADR-0004](./docs/adr/0004-multi-entry-building-part-ordering.md). See [ADR-0001](./docs/adr/0001-bulk-upload-state-machine.md) for the deliberate "not yet" decisions baked into this lifecycle. @@ -97,6 +131,8 @@ _Avoid_: override, adjustment, correction > > **Dev:** "And if **Finalise** runs and 30% of rows have no **UPRN**?" > **Domain expert:** "Those still get imported as **Properties** — just without a UPRN — and the BulkUpload moves to `complete`. Manual cleanup happens later in the property table." +> +> _(Planned change — v3 / [ADR-0006](./docs/adr/0006-property-overrides-join-and-no-uprn-defer.md): no-UPRN rows will move to a separate staging table to be re-matched, so `property` holds only matched rows. v2 does **not** change this yet — and v2 writes **Property overrides** only for the UPRN-matched rows.)_ ## Flagged ambiguities diff --git a/docs/design/bulk-upload-finaliser-v2-handover.md b/docs/design/bulk-upload-finaliser-v2-handover.md new file mode 100644 index 00000000..0a0ef5b5 --- /dev/null +++ b/docs/design/bulk-upload-finaliser-v2-handover.md @@ -0,0 +1,291 @@ +# Handover: `bulk_upload_finaliser` v2 — populate `property_overrides` + +> **Purpose.** Self-contained brief to start a fresh context implementing v2 (the +> per-Property fact layer) in the existing `bulk_upload_finaliser` Lambda. v1 +> (async finalise that writes `property`) is **shipped and working end-to-end**. +> This doc assumes no memory of the v1 session. + +## 0. Design resolved — grilling outcome (2026-06-05) + +> The open questions in §9 were resolved in a design session. **This section is now +> authoritative**; the later sections are kept for background but where they conflict +> with this one, this one wins. The new v2 ADR is +> [`docs/adr/0006-property-overrides-join-and-no-uprn-defer.md`](../adr/0006-property-overrides-join-and-no-uprn-defer.md); +> ADR-0004 was amended for per-count ordering capture. + +**Spine.** Populate `property_overrides` at finalise **for UPRN-matched rows only**. +Join the classifier descriptions to the combiner identity by a **synthetic UUID +`source_row_id`** — *not* `Internal Reference` (it is **absent from the classifier +CSV**, and optional anyway) and *not* by carrying description columns through +`address2uprn` (architecture B, rejected). This is architecture **(A)** with a +purpose-built key. + +**No-UPRN rows are deferred to v3.** v1 *currently* inserts them as `property` rows; +**v2 changes nothing in the property insert** and simply writes no overrides for +them. The eventual home for unmatched rows is a **separate staging table** (Model B): +`property` holds only matched rows; unmatched inputs (with their descriptions) live in +the staging table until a *different UPRN matcher* assigns a UPRN and promotes them. +"Found vs unfound" is a view across both tables, **not** a flag on `property`. v3 owns +the property-insert change + the staging table + the matcher-rerun UX together. + +**Frontend work** (`/workspaces/assessment-model`): +1. **Mint `source_row_id`** (UUID) in `start-address-matching` right after + `readRows()`, and **explicitly emit it as a column in both** `buildAddressCsv` and + `buildClassifierCsv` — both project a *fixed* column set, so attaching it to the + row object is not enough. It survives `address2uprn`→combiner like any input column + (carried as `additional_info`); **verify against a real combiner output**. +2. **Per-count ordering capture** (supersedes ADR-0004's largest-count-only): + `detectMultiEntry` keeps a sample **per distinct count**; `OnboardingProgress` + renders one ordering panel **per count ≥ 2**. The jsonb type and + `setMultiEntryOrdering` validation already accept all counts — **no migration, no + backend-validation change**. +3. **Verify gate hardened**: Finalise is blocked while **any** description is still + `UNKNOWN`. `UNKNOWN` is now a **transient "needs review" marker, never a final + value** (this retires the old "`UNKNOWN` is legitimate" line in §7). +4. **`dispatchFinaliser`** adds **two fields to the trigger body**: + `classifier_s3_uri` and `multi_entry_ordering` (it already reads the + `bulk_address_uploads` row, and dispatch happens *after* the user confirms + ordering, so the value is final). The classifier S3 key comes from a **shared + `classifierCsvKey(portfolioId, uploadId)` helper** used by both the writer and the + dispatcher (the key is not stored anywhere today — convention only). + +**Backend work** (`/workspaces/home/github/Model`): +5. Grow the trigger schema in two places — FastAPI `FinaliserTriggerRequest` and + Lambda `BulkUploadFinaliserTriggerBody` — with `classifier_s3_uri` + + `multi_entry_ordering`. Handler stays trigger-driven (no new `bulk_address_uploads` + coupling). +6. **`PropertyOverrideRow`** table mirror + a **sibling `PropertyOverrideRepository`** + (own aggregate; upsert on `(property_id, override_component, building_part)`), and a + **read-only `LandlordOverrideRepository`** that loads a portfolio's vocabulary + **per component into dicts once** (the vocabulary is deduplicated, not per-row). +7. **Orchestrator step**, in the same `commit_scope`: + - bulk `SELECT (portfolio_id, uprn) → id` for the run's UPRN rows → in-memory map; + - join classifier↔combiner rows by `source_row_id`; + - **uniform comma-split all four components** → `permutations[count]` → parts + (count-1 cell → `building_part = 0`); the finaliser needs **no fallback** because + every count ≥ 2 has a confirmed permutation; + - resolve each part's **normalized** description against the override dicts; + - `original_spreadsheet_description` = the **raw** entry text (un-normalized); + - **empty cell → write no row**; **non-empty but unresolved (or `UNKNOWN`) → raise** + → `commit_scope` rolls back → `_mark_failed` flips the upload to `failed` + (**fail loudly, no partial writes**); + - write only the classifier components actually **mapped** in `columnMapping`; + - **no `source` column in v2** — upsert is unconditional for now. + +**Locked assumptions (load-bearing — see ADR-0006).** +- **One real upload per user.** A re-upload only adds *new* properties (ones not + previously included), never re-describes existing ones → part-keys are append-only + across uploads → **upsert-only, no delete-orphans** is correct and complete. +- **Per-count consistency.** One ordering per count, confirmed from one sample, applies + to every cell of that count in the file (extends ADR-0004's bet to all counts). +- **Per-cell count.** `Walls` may split into 3 while `Roofs` splits into 2 in the same + row; each cell is ordered by *its own* entry count. +- **Classification completes before `awaiting_review`**, and the hardened verify gate + forces every `UNKNOWN` to be resolved — so an unresolved description at finalise is a + genuine defect, hence fail-loud. + +## 1. Where v1 left things (read first) + +v1 made **Finalise** an async dispatched Lambda that writes `property` rows. The +full flow works in dev: dispatch `202` → SQS → Lambda inserts properties + writes +terminal status → UI advances to "Processing complete". + +Authoritative background — **read these before coding**: +- `docs/design/bulk-upload-finaliser.md` — the full grilling/design doc (schema Q6–Q9, snapshot-not-FK, recalculate-on-rerun, the v2 input hazards). +- `docs/adr/0005-async-bulk-upload-finaliser.md` (frontend) — state machine + `property_overrides` shape. +- `/workspaces/home/github/Model/docs/adr/0013-bulk-upload-finaliser-writes-properties.md` (backend) — the Lambda write path + DDD layering. +- `docs/adr/0004-multi-entry-building-part-ordering.md` — **critical for v2**: how building parts and `multiEntryOrdering` work. +- `docs/adr/0002-landlord-override-vocabulary.md` — the vocabulary (`landlord_*_overrides`) layer v2 resolves against. +- `CONTEXT.md` — glossary: **Property override**, **Building part**, **Main building**, **Extension**, **Multi-entry**, **Building-part ordering**, **VocabularyMapping**. + +**Convention that must hold (it was corrected hard in v1):** in the Model repo, +business logic lives in `orchestration/*_orchestrator.py`; the Lambda +`applications/*/handler.py` stays thin (parse trigger, wire infra, delegate). One +repository per aggregate; orchestrators never commit (the handler owns the +transaction via `commit_scope`). See memory `model-ddd-layering`. + +## 2. v2 goal + +Populate `property_overrides` during finalise: for each property, write one row per +`(building_part, override_component)` carrying the **resolved enum snapshot** of the +landlord's description for that part. + +## 3. The target table (already shipped — migration 0221, do NOT re-migrate) + +Drizzle: `src/app/db/schema/property_overrides.ts`. + +``` +property_overrides + id uuid pk + property_id bigint NOT NULL FK → property.id ON DELETE CASCADE + portfolio_id bigint NOT NULL FK → portfolio.id ON DELETE CASCADE + building_part smallint NOT NULL -- 0 = main, 1 = ext 1, 2 = ext 2, … + override_component override_component NOT NULL -- pgEnum {wall_type, roof_type, property_type, built_form_type} + override_value text NOT NULL -- snapshot of the resolved enum value + original_spreadsheet_description text NOT NULL -- raw cell text it resolved from + created_at / updated_at timestamptz NOT NULL + UNIQUE (property_id, override_component, building_part) +``` + +## 4. Design decisions already locked (do not relitigate) + +- **Snapshot, not FK.** `override_value` is a denormalised text copy of the resolved + enum, taken at materialise time — *not* an FK to `landlord_*_overrides`. This is + what lets two properties sharing a description diverge later, and is required + because there are four polymorphic vocabulary tables. Lineage is the natural key + `(portfolio_id, override_component, original_spreadsheet_description)`. +- **Re-run = recalculate.** Write with `onConflictDoUpdate` on + `(property_id, override_component, building_part)`, refreshing `override_value` + + `original_spreadsheet_description` + `updated_at`. (Contrast `property`, which is + `onConflictDoNothing`.) When a per-property user-edit path eventually exists, this + upsert will need a `WHERE source='classifier'` guard — but there is **no `source` + column in v1**; add it as a nullable column only when that path is built. +- **`override_component` values** are exactly the classifier category keys + (`wall_type`, `roof_type`, `property_type`, `built_form_type`) used in both + `src/lib/bulkUpload/columnFields.ts` and the Model `ClassifiableColumn.name` — no + translation. +- **`building_part` indexing**: `0 = Main building`, `1 = Extension 1`, … per ADR-0004. +- **Whole-dwelling components** (`property_type`, `built_form_type`) are per-part- + capable but today's files supply them once → usually written at `building_part = 0`. + +## 5. The hard part: assembling the inputs (this is the real v2 work) + +The combiner output (what the v1 finaliser reads) carries **only** address/UPRN +columns — `Address 1/2/3`, `postcode`, `Internal Reference`, `address2uprn_uprn`, +`address2uprn_address`, `address2uprn_lexiscore`. The **raw `Walls`/`Roofs`/ +`Property Type`/`Built Form` cells are NOT in it.** They live only in: +- the **classifier CSV** on S3 — `bulk_onboarding_inputs/{portfolioId}/{uploadId}-classifier.csv` (original landlord headers), and +- `landlord_*_overrides` in Postgres — the *resolved* values keyed by `(portfolio_id, normalized description)`. + +To write one `property_overrides` row, v2 must assemble **four inputs**: + +| Need | Source | +|---|---| +| `property_id` (identity) | combiner output → `(portfolio_id, uprn)` … **but no-UPRN rows have no key** | +| raw cell text per row | the classifier CSV (not the combiner output) | +| split a multi-valued cell → building parts | `multiEntryOrdering` on `bulk_address_uploads` | +| description → `override_value` | `landlord_*_overrides` (resolve by normalized description) | + +### Two open hazards — both RESOLVED (see §0) + +1. **Join key (RESOLVED).** Investigation confirmed `Internal Reference` is in the + address CSV + combiner output but **NOT in the classifier CSV**, and is optional. + So architecture (A)-by-`Internal Reference` is dead. **Resolution: mint a synthetic + UUID `source_row_id`** in `start-address-matching` after `readRows()`, emitted as an + explicit column in *both* CSVs. It is the join key. (Architecture (A) with a + purpose-built key; (B) "carry descriptions through `address2uprn`" was rejected.) + +2. **`property_id` for no-UPRN rows (RESOLVED by descoping).** v2 writes overrides + **only for UPRN rows**, whose `property.id` is re-found by `(portfolio_id, uprn)` + — so **no `RETURNING` correlation is needed**. No-UPRN rows are deferred to v3 + (Model B staging table); v2 leaves the property insert untouched. + +## 6. `multiEntryOrdering` — how to split cells into parts + +Persisted on `bulk_address_uploads` (`src/app/db/schema/bulk_address_uploads.ts`): + +```ts +MultiEntryOrdering { permutations: Record; confirmed: boolean } +// permutations[count][k] = the 0-based FILE position holding building part k +// where 0 = Main building, 1..N-1 = Extension 1..N-1. +// e.g. { "2": [1, 0] } => for 2-entry rows, the main building is file position 1. +``` + +A multi-valued cell (e.g. `Walls = "Cavity: …, Solid brick: …"`) splits on commas +into entries by file position; `permutations[count]` maps file position → building +part. **Caveat (ADR-0004):** only the **largest count** permutation is captured this +iteration; other counts need a derivation rule — decide it in v2. +`multiEntrySummary` holds the detected multi-valued columns + **normalized** +description keys (the normalization that matches the classifier's stored keys: +`split → strip → lower`). + +## 7. Resolving description → value (`landlord_*_overrides`) + +Four per-component tables in `src/app/db/schema/landlord_overrides.ts` +(`landlord_wall_type_overrides`, `…_roof_type_…`, `…_property_type_…`, +`…_built_form_type_…`), each `UNIQUE (portfolio_id, description)`, value typed by the +component's pgEnum, plus a `source` (`classifier`|`user`). Resolve a normalized +description → `value`. The frontend already does this read in +`src/lib/bulkUpload/server.ts` (`lookupOverrides`) — mirror that mapping on the +backend. **`UNKNOWN` is now a transient "needs review" marker, never a final +value** (resolved in §0): the verify gate forces the user to map every `UNKNOWN` +before Finalise, so a `UNKNOWN` (or unresolvable description) reaching the finaliser +is a defect and **fails the run loudly**. + +## 8. Backend pieces to build (DDD, mirror v1) + +In `/workspaces/home/github/Model`: +- **`PropertyOverrideRow`** SQLModel mirror → `infrastructure/postgres/property_override_table.py` (mirror the pattern in `property_table.py` / `landlord_*_override_table.py`; reuse a shared `override_component` SAEnum like `landlord_override_enums.py`). +- **Repository** for the override write (one per aggregate): add to + `repositories/property/` (e.g. extend the property repo or a sibling + `property_override` repo), with an `upsert_all` using + `on_conflict_do_update(index_elements=[property_id, override_component, building_part], …)`. +- **Orchestrator logic** in `orchestration/bulk_upload_finaliser_orchestrator.py`: + extend `finalise(...)` (or add a step) to, after inserting properties and getting + ids, build the override rows (join → split by part → resolve) and persist them in + the **same** `commit_scope`. +- **Handler** stays thin — it already wires S3 + engine + repos. It will need the + extra input (classifier CSV and/or `multiEntryOrdering`); decide how those reach + the Lambda (extend `BulkUploadFinaliserTriggerBody`, or read `bulk_address_uploads` + for `multiEntryOrdering` + the classifier S3 URI). The trigger currently carries + `task_id, sub_task_id, s3_uri (combiner output), portfolio_id, bulk_upload_id`. + +Key v1 files to extend (all in the Model repo): +- `applications/bulk_upload_finaliser/handler.py` +- `orchestration/bulk_upload_finaliser_orchestrator.py` +- `repositories/property/property_repository.py` + `property_postgres_repository.py` +- `infrastructure/postgres/property_table.py` (reference for the new mirror) +- `infrastructure/s3/csv_s3_client.py` (`read_rows`) +- Packaging test: `tests/test_lambda_packaging.py` will flag any new top-level import + the Dockerfile doesn't `COPY` (v1 hit this with `datatypes/`). + +## 9. Open questions — all RESOLVED (see §0 + ADR-0006) + +- **Join key** → synthetic UUID `source_row_id` in both CSVs (not `Internal + Reference`, not architecture B). +- **`property_id` for no-UPRN rows** → out of scope; v2 is UPRN-only, no-UPRN deferred + to v3 (Model B). UPRN rows re-found by `(portfolio_id, uprn)`; no `RETURNING`. +- **Non-largest-count `multiEntryOrdering`** → capture a confirmed permutation for + **every** count ≥ 2 in the UI (supersedes ADR-0004); finaliser needs no fallback. +- **Trigger body vs handler-reads-DB** → **grow the trigger body** (`classifier_s3_uri` + + `multi_entry_ordering`), built in `dispatchFinaliser`. +- **Re-materialise** → recalculate every finalise via **upsert-only** on + `(property_id, override_component, building_part)`; **no delete-orphans** (justified + by the one-real-upload assumption); `property` rows untouched. + +## 10. Implementation order (design is settled — build it) + +Frontend first (the finaliser depends on `source_row_id` + per-count ordering): + +1. **`source_row_id`**: shared `classifierCsvKey` helper; mint the UUID in + `start-address-matching` after `readRows()`; emit it as an explicit column in both + `buildAddressCsv` and `buildClassifierCsv`. Verify it lands in a real combiner + output. +2. **Per-count ordering**: `detectMultiEntry` keeps a sample per count; + `OnboardingProgress` renders one ordering panel per count ≥ 2. Drop the + largest-count-only assumption in `setMultiEntryOrdering` if it requires the largest. +3. **Verify gate**: block Finalise while any classification is `UNKNOWN`. +4. **`dispatchFinaliser`**: add `classifier_s3_uri` + `multi_entry_ordering` to the + trigger payload. + +Backend: + +5. Grow `FinaliserTriggerRequest` (FastAPI) + `BulkUploadFinaliserTriggerBody` (Lambda). +6. `PropertyOverrideRow` mirror + sibling `PropertyOverrideRepository` (upsert) + + read-only `LandlordOverrideRepository`. +7. Orchestrator step (join → split → resolve → upsert; fail-loud on unresolved), + TDD against fakes (mirror + `tests/orchestration/test_bulk_upload_finaliser_orchestrator.py`). +8. Handler wiring; watch `tests/test_lambda_packaging.py` for Dockerfile COPY gaps. + +Docs (done in this session): ADR-0004 amended, ADR-0006 added, `CONTEXT.md` +"Property override" updated. + +## 11. Verification notes (environment) + +- Frontend: `npx tsc --noEmit` (was 0 errors at v1 close). +- Model repo: `mypy`/`pytest` need a deps-installed env (the v1 session couldn't run + them locally; `/app` Docker config runs the full suite). `terraform plan` needs the + CLI. Watch `tests/test_lambda_packaging.py` for Dockerfile COPY gaps. +- v1 is committed; dev Lambda + SQS queue are deployed and working + (`FINALISER_SQS_URL` wired in `backend/.env` for local, and in terraform/fast-api). diff --git a/docs/design/bulk-upload-finaliser.md b/docs/design/bulk-upload-finaliser.md new file mode 100644 index 00000000..3a41e28a --- /dev/null +++ b/docs/design/bulk-upload-finaliser.md @@ -0,0 +1,243 @@ +# Design WIP: `bulk_upload_finaliser` + `property_overrides` + +> **Status:** v1 fully resolved (grilling 2026-06-04). Ready to graduate to ADR(s). +> v2 (`property_overrides` population) deferred to its own session — see the +> "Input" application-flow item for its entry point. When decisions stabilise this should +> graduate into a new ADR in `docs/adr/` (frontend) and likely a companion ADR +> in the Model repo, plus a CONTEXT.md update (see "Docs to update"). + +## Goal + +Two linked pieces of work: + +1. **New backend application `bulk_upload_finaliser`** (lives in + `/workspaces/home/github/Model/applications/`, DDD-aligned — study + `/workspaces/home/github/Model/domain`). It reads the address-matching / + combiner output **and** the `landlord_*_overrides` vocabulary tables, then + writes Postgres correctly: the `property` rows (UPRN + address, as the + frontend does today) and — later — the new `property_overrides` rows. + Motivation: a property list can be ~40,000 rows, too big for a synchronous + Next.js HTTP handler. + +2. **New `property_overrides` table** — the per-Property fact layer that + **ADR-0004 explicitly deferred** ("the per-Property building-part fact layer + that consumes `multiEntryOrdering` and writes main/extension facts at + finalise"). One row per `(property, building_part, component)` carrying the + resolved enum value + provenance. + +**Split into two pieces (decided 2026-06-04):** + +- **v1 — async finaliser writes `property`.** Move today's synchronous Next.js + `/finalize` property-insert into a dispatched Lambda (`bulk_upload_finaliser`), + because a property list can be ~40,000 rows. Reproduces the exact 9-column insert + + `onConflictDoNothing`, adds the `finalising` status + async state machine, and + shifts terminal-status ownership to the backend. **Fully designed — ADR-ready.** +- **v2 — populate `property_overrides`.** The per-Property fact layer. The *table* + already shipped (migration 0221, PR #306), but population is a **separate + follow-up** with its own open input-plumbing questions (see the "Input" item + under application-flow questions). Not designed here. + +This doc resolves **v1 in full**; v2 gets its own grilling session against real +classifier-CSV / combiner-output samples. + +## Where this sits in the existing pipeline + +``` +BulkUpload → address matching → combiner → awaiting_review → [Finalise] + │ + (new) bulk_upload_finaliser ──────────┘ + reads: combiner output (S3) + landlord_*_overrides + writes: property (+ later property_overrides) + │ + downstream: Ingestion (EPC/solar fetch) + → PropertyBaseline (stage 2, + re-score-on-override seam, + Model ADR-0011/0012) +``` + +`Finalise` (the user action + state-machine gate) stays in Next.js; the new +application is the **worker it dispatches**. Downstream `PropertyBaseline` +already has an override-aware "re-score" seam — `property_overrides` will feed it. + +## Decisions locked + +| # | Decision | +|---|----------| +| Name | Application is **`bulk_upload_finaliser`**, in `Model/applications/`. `Finalise` stays the Next.js action that triggers it. | +| DDD | Follow the DDD structure under `Model/domain`. Domain terms discovered as needed. | +| Schema ownership | **Drizzle (frontend) owns migrations** for both `property` and the new `property_overrides`. | +| Backend access | Backend gets a **`PropertyOverrideRow` SQLModel** (mirror, like `landlord_wall_type_override_table.py`) + a **repository** (see `Model/infrastructure/postgres` + `Model/repositories` for examples). `PropertyRow` must drop its "backend never inserts" invariant and gain insertable columns. | +| Next.js `/finalize` | **Delete it** — fully replaced by the Lambda. | +| `property_overrides` shape | **Single polymorphic table**, not per-component tables. Accepts losing DB-level pgEnum typing on `value`. | +| `override_value` | `text` — a **denormalised snapshot copy** (own value per row) of the resolved enum from `landlord_*_overrides` at materialise time. Own-value (not an FK to the vocabulary) is what lets two properties sharing a description later **diverge**, and lets re-run recalculate one property's value without touching its siblings. | +| Snapshot, **not** FK to vocabulary | `property_overrides` does **not** foreign-key the originating `landlord_*_overrides` row. An FK forces every property sharing a description to share one value (forbids divergence); is structurally impossible as a real FK (4 polymorphic target tables, each with its own value enum); and would risk cascade-deleting per-property facts when re-classification prunes a vocabulary row. Lineage is preserved as a **natural key** — `(portfolio_id, override_component, original_spreadsheet_description)` re-finds the vocabulary row (its `UNIQUE` is `(portfolio_id, description)`) — so deliberate re-sync needs no surrogate FK. | +| Re-run = **recalculate** | The finaliser write to `property_overrides` is `onConflictDoUpdate` on `(property_id, override_component, building_part)`, refreshing `override_value` + `original_spreadsheet_description` + `updated_at` to the latest resolution. Contrast `property`, which stays `onConflictDoNothing` (identity rows, don't churn). When per-property `source='user'` edits exist, the update must guard `WHERE source='classifier'` to preserve hand-edits (mirrors the Model classifier upsert). | +| `building_part` | **`smallint NOT NULL`**, explicit index: `0 = main building, 1 = extension 1, 2 = extension 2, …` (matches ADR-0004 `multiEntryOrdering.permutations` indexing). | +| Whole-dwelling components | **No special case.** `property_type`/`built_form` are *per-part-capable* too (an extension — conservatory, summer house — can be a different built form / property type). Today's files only supply them once, so they'll usually be written at `building_part = 0` only, but the schema allows per-part with no future migration. | + +## `property_overrides` — columns so far + +Roughly (subject to remaining open questions): + +``` +property_overrides + id uuid pk (default random) -- match landlord_* tables + property_id bigint NOT NULL FK → property.id (FE-owned table) + portfolio_id bigint NOT NULL FK → portfolio.id + building_part smallint NOT NULL -- 0 = main, 1 = ext 1, 2 = ext 2, … + override_component override_component NOT NULL -- column name == enum type name; pgEnum {wall_type, roof_type, property_type, built_form_type} (Q6 ✓) + override_value text NOT NULL -- snapshot copy of landlord_* resolved enum (free text; `override_component` carries the typing) + -- (no `source`) — dropped Q9: pure value snapshot; add back as nullable column if/when a per-property edit path needs provenance + original_spreadsheet_description text NOT NULL -- raw spreadsheet cell text this snapshot resolved from (Q7 ✓) + created_at timestamptz NOT NULL default now() + updated_at timestamptz NOT NULL default now() + -- UNIQUE (property_id, override_component, building_part) -- Q8 ✓ (source NOT in key — mirrors ADR-0004 single-row flip; portfolio_id implied by property_id) + -- FK property_id → property.id ON DELETE CASCADE; portfolio_id → portfolio.id (Drizzle only; bare bigint in SQLModel mirror); portfolio_id kept (matches property_details_epc / property_targets) +``` + +## Open questions (resume here) + +- **Q6 — `component` discriminator. RESOLVED 2026-06-04.** pgEnum + **`override_component`** (column `component`) with values + **`wall_type`, `roof_type`, `property_type`, `built_form_type`**. Verified these + are the *exact* keys used both in the frontend + ([columnFields.ts:30-33](../../src/lib/bulkUpload/columnFields.ts#L30-L33)) and + the backend (`ClassifiableColumn.name` / handler `_build_columns()`), so the + finaliser maps category → component with **no translation**. pgEnum over text: + small closed set, typos caught at write time — and this is now the *only* + DB-level typing left on a row, since `override_value` is free text. New component + = one-line `ALTER TYPE … ADD VALUE` (Drizzle-owned). Enum named `override_*` + (not `property_*`) to sit with `override_source` and stay visually distinct from + the existing *value* enum `property_type`. + +- **Q7 — store raw description per override row? RESOLVED 2026-06-04: yes, as + `original_spreadsheet_description text NOT NULL`.** Names the *source artifact* + (the spreadsheet cell), not an actor — sidesteps the Landlord-vs-User conflation + the glossary warns against, and aligns with CONTEXT.md's "the source file". Stored + because `override_value` is a denormalised snapshot that deliberately won't + refresh on later vocabulary edits; pinning the original text makes each row + self-explaining and re-resolvable even after the source `landlord_*_overrides` row + changes. `NOT NULL` is safe **iff** every `property_overrides` row is materialised + from a `landlord_*_overrides` row (whose `description` is itself `NOT NULL`) — + confirm when settling Q9/source semantics. + +- **Q8 — uniqueness + FKs. RESOLVED 2026-06-04.** + `UNIQUE (property_id, override_component, building_part)`. `building_part` is in + the key (part 0 and part 1 both carry e.g. a `wall_type` row). `source` is + **deliberately not** in the key — mirrors ADR-0004's single-row-flip (one row, + flip `source` in place; the two-row model was rejected). `portfolio_id` is not in + the key (implied by `property_id`) but **is kept as a column** for query ergonomics + and consistency with `property_details_epc` / `property_targets`, which both + denormalise it. FKs: `property_id → property.id ON DELETE CASCADE`; + `portfolio_id → portfolio.id ON DELETE CASCADE` in the Drizzle migration, but a + bare `bigint` (no FK) in the backend `PropertyOverrideRow` SQLModel mirror — + matching `landlord_wall_type_override_table.py`. + +- **Q9 — `source` semantics. RESOLVED 2026-06-04: drop `source` entirely.** + `property_overrides` is a pure **snapshot of resolved values**. Rationale: there + is no per-property override concept today (per ADR-0004 edits happen at the + **vocabulary/portfolio** level, flipped in place), so a copied `source` would + describe the *vocabulary mapping's* provenance, not this property's — a footgun a + reader/re-score rule could misread, and no consumer needs it in v1. When a genuine + per-property edit path lands (the real use for per-property provenance), `source` + returns as an **additive nullable-column migration** — no need to carry it now. + This also confirms the Q7 `NOT NULL` contingency: every row is still materialised + from a `landlord_*_overrides` row (`description NOT NULL`). + +- **Q-scope — v1 scope. RESOLVED 2026-06-04.** v1 = the finaliser reproduces + today's **exact 9-column `property` insert** (`portfolio_id`, + `creation_status='READY'`, `uprn`, `landlord_property_id` ← `Internal Reference`, + `address` = matched ?? user-inputted, `postcode`, `user_inputted_address`, + `user_inputted_postcode`, `lexiscore`) **+** `onConflictDoNothing` on + `(portfolio_id, uprn) where uprn is not null` — not a reduced "UPRN + address". + This sizes the "PropertyRow gains insertable columns" decision to all nine + columns plus `creation_status`. The `property_overrides` *table* shipped ahead + (migration 0221, PR #306) but is **not populated** in v1 — population is + follow-up work (and needs a different input source; see combiner-output note + below). + +### Application-flow questions not yet reached + +- **Trigger + orchestration. RESOLVED 2026-06-04.** Mirror the + `start-address-matching` path. Next.js creates a `SubTask` (`service: + "finaliser"`) under the BulkUpload's existing Task, then POSTs a new FastAPI + endpoint `POST /v1/bulk-uploads/trigger-finaliser` (auth via `validate_token`), + which enqueues to a **new SQS queue**; a Lambda runs the finaliser wrapped in + `@subtask_handler` (auto-injected `TaskOrchestrator`; `run_subtask` owns the + subtask start/complete/fail + Task cascade). Trigger body + `FinaliserTriggerBody { task_id, sub_task_id, s3_uri (combined output), portfolio_id }` + (extends `SubtaskTriggerBody`). Slow work stays outside the txn; persistence in a + `commit_scope`. The synchronous Next.js `/finalize` route is deleted (locked). + +- **State machine / who writes `complete`. RESOLVED 2026-06-04.** New status + **`finalising`** between `awaiting_review` and `complete` (mirrors `combining` + before `awaiting_review`). Lifecycle: + `awaiting_review → finalising → complete` (↘ `failed`). + - **`finalising`** written by **Next.js at dispatch** via a **compare-and-swap**: + `UPDATE … SET status='finalising' WHERE id=? AND status='awaiting_review'` — + 0 rows ⇒ already dispatched ⇒ 409. This is the **double-dispatch guard** (closes + the simultaneous-click race under `loadForFinalize`'s existing precondition). + - **`complete` / `failed`** written by the **Lambda** directly to + `bulk_address_uploads` (new `set_finalized_status` / `set_failed_status`, + exactly like the combiner's `set_combining_status` / + `set_combined_output_s3_uri`). `markFinalized` + the Next.js `/finalize` route + are deleted. + - **CONTEXT.md "Two writers" change:** Next.js owns dispatch + the + `awaiting_review → finalising` CAS; the backend owns `finalising → complete` + and `→ failed` (in addition to `combining` / `awaiting_review`). + - **UI vs canonical:** persisted enum value is `finalising` (canonical; ties to + the **Finalise** action). The frontend renders it as **"Uploading to ARA"** — a + display-layer label only, **not** the enum name, so UX copy never needs a + migration. + +- **Input — does the combiner output carry the raw description cells? RESOLVED + 2026-06-04: NO. This is a v2 problem (deferred).** v1 needs only address/UPRN + columns, all **confirmed present** in the combiner output (`address2uprn_uprn`, + `address2uprn_address`, `address2uprn_lexiscore`, `Internal Reference`, + `Address 1/2/3`, `postcode`). The raw `Walls`/`Roofs`/`Property Type`/`Built Form` + cells are **not** in the combiner output — they survive only in (a) the + `{uploadId}-classifier.csv` on S3 (original headers) and (b) `landlord_*_overrides` + as *resolved* values keyed by description. So v2 population must assemble **four + inputs**, not one file: + - `property_id` (identity) ← combiner output `(portfolio_id, uprn)` — **but + no-UPRN rows have no such key**; + - raw cell text ← the classifier CSV (not the combiner output); + - cell → building-part split ← `multiEntryOrdering` on `bulk_address_uploads`; + - description → `override_value` ← `landlord_*_overrides` (normalized description). + - **Two open v2 hazards (entry point for the v2 session):** (1) the join key + between classifier CSV and combiner output — is there a stable per-row key + (`Internal Reference`?) and is row order preserved through postcode-split + + combine? (2) obtaining `property_id` for unmatched (no-UPRN) rows — v1's + `onConflictDoNothing` returns no ids, so v2 likely needs `RETURNING id` mapped + back to source rows. + +- **Idempotency / re-run. RESOLVED 2026-06-04 (per-table).** + - `property`: keep today's `onConflictDoNothing` on `(portfolio_id, uprn) where uprn is not null` — existing properties are not churned. + - `property_overrides`: `onConflictDoUpdate` on `(property_id, override_component, building_part)` — **recalculate** `override_value` + `original_spreadsheet_description` + `updated_at` to the latest resolution, so an existing property whose override changed is refreshed in place. Guard `WHERE source='classifier'` once a per-property user-edit path exists (until then every row is classifier-derived, so blind overwrite is correct). See the "Re-run = recalculate" and "Snapshot, not FK" locked decisions. + +## Key code references (from exploration) + +**Frontend (`assessment-model`):** +- [finalize/route.ts](../../src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/finalize/route.ts) — today's synchronous property insert (to be deleted). +- [property.ts](../../src/app/db/schema/property.ts) — `property` table (`property_type`, `built_form` columns exist; `uq_property_portfolio_uprn`). +- [landlord_overrides.ts](../../src/app/db/schema/landlord_overrides.ts) — the four per-component override tables + all pgEnums (`wallTypeEnum`, `roofTypeEnum`, `propertyTypeEnum`, `builtFormTypeEnum`, `overrideSourceEnum`). +- [bulk_address_uploads.ts](../../src/app/db/schema/bulk_address_uploads.ts) — `multiEntryOrdering` (permutations, `0=main`), `multiEntrySummary`, `verifyAck`, `combinedOutputS3Uri`. +- [ADR-0004](../adr/0004-multi-entry-building-part-ordering.md) — defers exactly this fact layer; the building-part ordering model. +- [ADR-0002](../adr/0002-landlord-override-vocabulary.md) — vocabulary layer. + +**Backend (`Model`):** +- `applications/landlord_description_overrides/handler.py` — the worker pattern to mirror (`subtask_handler`, `TaskOrchestrator`, trigger body, `commit_scope`). +- `infrastructure/postgres/landlord_wall_type_override_table.py` — SQLModel mirror pattern for the new `PropertyOverrideRow`. +- `infrastructure/postgres/landlord_override_enums.py` — shared `override_source` SAEnum pattern. +- `infrastructure/postgres/property_table.py` — `PropertyRow` defensive view ("backend never inserts" — to change). +- `repositories/landlord_overrides/landlord_override_repository.py` — repository pattern for the new override repo. +- `orchestration/landlord_description_overrides_orchestrator.py` — orchestrator pattern; note it splits cells into an orderless set (discards part order — recovered via `multiEntryOrdering`). +- Downstream: `orchestration/property_baseline_orchestrator.py` (re-score-on-override seam), `orchestration/ingestion_orchestrator.py`. + +## Docs to update (when this lands) + +- **CONTEXT.md**: `Property Type` / `built_form` are **per-part-capable**, not + whole-dwelling. Add the per-Property fact layer (`property_overrides`) to the + glossary + relationships. Possibly a `building_part` index definition. +- **New ADR** (frontend) for `property_overrides` + finaliser; companion Model ADR + for the cross-repo write, citing ADR-0003/0004. diff --git a/docs/wip/landlord-override-frontend-plan.md b/docs/wip/landlord-override-frontend-plan.md index a26deb4b..a67f666d 100644 --- a/docs/wip/landlord-override-frontend-plan.md +++ b/docs/wip/landlord-override-frontend-plan.md @@ -1,13 +1,14 @@ # Landlord override frontend — in-flight design notes -**Status:** Paused mid-grilling (2026-05-27) -**Branch:** `feautre/additional_walltypes` +**Status:** Grilling complete (2026-05-28) — Q1–Q7 resolved; ready to promote to ADR +**Branch:** `feature/frontend_landlord_overrides` **Author:** Jun-te (with Claude, via `/grill-me`) -This is a *design-in-progress* document, not an ADR. It captures decisions made -so far on the landlord-override frontend plan so the conversation can resume -without re-litigating settled questions. Promote to an ADR once the trigger -mechanism (Q4) is resolved — that's the decision worth permanent recording. +This is a _design-in-progress_ document, not an ADR. It captures the decisions +reached during grilling so the conversation can resume without re-litigating +settled questions. All seven questions are now resolved; the trigger + +state-machine integration (Q4) is ready to promote to a frontend ADR-0003, and +the work is ready to break into issues. ## Goal @@ -23,14 +24,22 @@ Build the front-end e2e for `landlord_description_override`, starting from the rationale in [ADR-0002](../adr/0002-landlord-override-vocabulary.md). - Nothing in Next.js reads or writes them yet. - The Python lambda at - `/workspaces/home/github/Model/applications/landlord_description_overrides/handler.py` - is **not deployed** and **not wired** into the BulkUpload pipeline. It - hardcodes its trigger params (`portfolio_id`, `s3_uri`) and its source column - names (`"Property Type"`, `"Walls"`, `"Roofs"`). -- Note: ADR-0002 says writes come from Next.js POST, but the current backend - writes direct to Postgres. This drift may need to be revisited under Q4. + `Model/applications/landlord_description_overrides/handler.py` (branch + `feature/landlord_data`) is **not deployed** and **not wired** into the + BulkUpload pipeline. It hardcodes the trigger fields (handler.py:55-60) and + builds a hardcoded `ClassifiableColumn` list with `source_column`s + `"Property Type"` (→ both `property_type` **and** `built_form_type`), + `"Walls"`, `"Roofs"`. It also caps the batch to 20 rows while under test. +- **Resolved drift (was a Q4 risk):** the backend's + [ADR-0003 (python-writes-landlord-overrides-directly)](https://github.com/Hestia-Homes/Model/blob/main/docs/adr/0003-python-writes-landlord-overrides-directly.md), + accepted 2026-05-26, **supersedes** ADR-0002's "writes happen from Next.js" + clause. The lambda computes **and** persists the classification directly to + Postgres via a SQLAlchemy `LandlordOverrideRepository[E]`, with an + `ON CONFLICT … WHERE source = 'classifier'` upsert. There is **no** Next.js + POST-back. Drizzle stays schema source-of-truth; the Python `SQLModel` + shadows it. -## Decided so far +## Decided ### Q1 — Scope @@ -41,8 +50,7 @@ to the lambda → lambda needs edits to work when deployed. ### Q2 — Categories -**All four classifier categories** get independent optional slots in -[`INTERNAL_FIELDS`](../../src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/map-columns/MapColumnsClient.tsx#L14-L21): +**All four classifier categories** are independent optional fields: `property_type`, `built_form_type`, `wall_type`, `roof_type`. Rejected alternatives: (a) start with only PT+BF — wasted plumbing churn for @@ -50,58 +58,121 @@ the same migration cost; (c) collapse PT+BF into one UI slot — bakes a backend coincidence (they read the same CSV column today) into the user-facing model. -### Q2.1 — No `autoDetect` for the new slots +### Q2.1 — No `autoDetect` for classifier fields -The four new slots default to `"skip"`. The user must explicitly map them. -`autoDetect()` regex patterns are for required address-ish fields only. +The four classifier fields default to unmapped ("Not provided"). The user must +explicitly map them. `autoDetect()` is for required address-ish fields only. **Why:** Address headers are unambiguous and required, so guessing is safe and -useful. Landlord-description columns are ambiguous (a "type" column could be -PropertyType or BuiltFormType or something else) and they are optional — +useful. Landlord-description columns are ambiguous and optional — auto-detecting them would silently opt the landlord into classifier runs they didn't intend. -## Open — resume here +### Q2.2 — Mapping shape: unified `field → header` (refines Q2's mechanism) -### Q3 (in flight) — Uniqueness validation on the mapping +The mapping is **one unified map keyed by internal field, valued by source CSV +header** (`{ address_1: "Addr 1", property_type: "Property Type", +built_form_type: "Property Type", wall_type: "Walls", … }`), replacing the +current `header → field` shape +([MapColumnsClient.tsx:62-64](<../../src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/map-columns/MapColumnsClient.tsx#L62-L64>)). -Today validation only checks required fields exist -([MapColumnsClient.tsx:67-68](../../src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/map-columns/MapColumnsClient.tsx#L67-L68)); -two CSV headers can both map to `address_1` silently. +**Why:** the backend's `ClassifiableColumn` is a `(name, source_column)` pair +where **one `source_column` feeds many `name`s** — `"Property Type"` feeds both +`property_type` and `built_form_type` +([classifiable_column.py:28-31](https://github.com/Hestia-Homes/Model/blob/feature/landlord_data/orchestration/classifiable_column.py)). +A `header → field` map (one value per header) **cannot express** that; the +chosen mechanism in Q2 (extending the single `INTERNAL_FIELDS` dropdown) would +leave `built_form_type` unmappable. `field → header` matches the backend model +1:1 and lets multiple categories share a header. -- (a) Leave it alone — backend last-wins. -- (b) Enforce uniqueness only on the four new slots. -- (c) Enforce uniqueness everywhere except `skip`. **Recommended.** +**Consequences:** UI inverts to one row per internal field with a header-picker; +`autoDetect` inverts to "best header per field"; "skip" becomes a per-field +"Not provided"; rewrite +[`transformFile`](<../../src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts#L17-L54>) +(currently iterates `header → field`), the `z.record` route schema, and +**migrate existing persisted `columnMapping` rows** (semantics flip) — see Q5. -### Q4 (queued — biggest) — Trigger mechanism +### Q3 — Sharing / uniqueness rule (reframed) -How does Next.js invoke the lambda once mapping is complete? SQS message? -Direct lambda invoke? HTTP endpoint? And what's the state-machine integration — -new `BulkUpload` status, or runs orthogonally to address matching? +A source header may feed **at most one address/reference field**, but **any +number of classifier fields** (PT + BF → `"Property Type"` is required, so +classifier sharing must be allowed). Required-field validation stays: +`address_1` and `postcode` must each be assigned a header. -This drives both the deployment work and the lambda edits. Likely worth its -own ADR once decided. +This **replaces** the original "enforce uniqueness everywhere except skip" +framing, which was wrong once classifier header-sharing became a requirement. -### Q5 (queued) — Persistence of the extended mapping +### Q4 — Trigger mechanism + state-machine integration (the big one) -Current `bulkAddressUploads.columnMapping` is a `Record` and -naturally accommodates the new slots. Confirm no separate table is needed. +- **Transport:** reuse + [`triggerFastApiPipeline`](../../src/lib/bulkUpload/server.ts) with a **new + FastAPI endpoint**; payload `{ task_id, sub_task_id, s3_uri, portfolio_id, + column_mapping }`. FastAPI turns the POST into the SQS subtask envelope the + handler TODO (handler.py:51-54) references. Not a direct lambda invoke, not + Next.js → SQS. +- **`s3_uri` is the ORIGINAL upload** (`upload.s3Bucket/s3Key`), **not** the + address-matching transformed CSV — the description columns and their original + header names only survive in the original (the address transform strips every + non-address column). +- **Writes:** lambda writes overrides directly to Postgres (ADR-0003); no + POST-back. +- **State machine:** the classifier runs as a **subtask under the same address + task** (not a separate task, not a new `BulkUpload` status). Both subtasks + fire together at the **"Start address matching"** action. The combined address + *output* is not affected (the classifier writes Postgres, not + `ara_raw_outputs/{task_id}/`), **but** the parent Task *status* IS recomputed + from all subtasks (`TaskOrchestrator._cascade`), so a classifier failure fails + the onboarding task and gates the combiner. This coupling was knowingly + accepted (2026-05-28), superseding Q1 non-blocking — see ADR-0003's Amended + note. (Corrects an earlier wrong claim that the envelope does no gating.) +- **Progress honesty:** add a nullable **`service`/`kind` discriminator to + `sub_task`** (existing rows = address/legacy) so the progress view shows + address batches vs classification separately and attributes failures + correctly. Update the 3 subtask-count sites + `OnboardingProgress`. -### Q6 (queued) — Lambda edits +### Q5 — Persistence + migration of the inverted mapping -Handler hardcodes `source_column="Property Type" / "Walls" / "Roofs"`; needs -to read the mapping from the trigger body. -`LandlordDescriptionOverridesTriggerBody` already exists — check what fields -it has vs needs. +Reuse `bulkAddressUploads.columnMapping` (jsonb `Record`); **no +separate table**. The Q2.2 inversion (`header → field` → `field → header`) is +handled by a **one-shot data migration**: invert each non-skip entry; on a +legacy duplicate (two headers → one field, which the new Q3 rule forbids +anyway) last-write-wins; `skip` entries drop. The migration is a no-op on empty +tables, so it's safe regardless of data volume. `validateMapping` +([server.ts:97-102](../../src/lib/bulkUpload/server.ts#L97-L102)) and the route +schema must be rewritten for the inverted shape (they currently check the +**values** for `address_1`/`postcode`). -### Q7 (queued) — Review/Edit UI for classified mappings +### Q6 — Lambda edits -Is the per-row review/edit surface in scope for this iteration, or deferred? -User has not addressed yet. ADR-0002 calls this "the future override -frontend" and treats it as deferred work — but "front end e2e from -bulk_upload" could reasonably include it. +`LandlordDescriptionOverridesTriggerBody` (task_id, sub_task_id, s3_uri, +portfolio_id; `extra="allow"`) gains a **`column_mapping: dict[str, str]`** +field carrying **only the classifier subset** (`category → source header`); the +frontend extracts the four classifier keys from its unified map before sending. +The handler keeps a fixed registry of the four category builders (each owns its +enum, repo, and any hint such as the wall construction-date one) and supplies +`source_column` from `column_mapping`, **skipping categories the user didn't +map**. Both `property_type` and `built_form_type` carry `"Property Type"`, so +PT+BF sharing falls out for free. Drop the hardcoded trigger and the 20-row cap. -## Resuming +### Q7 — Results UI: read-only this iteration -Re-read this file, then ask Q3. Don't re-litigate Q1/Q2 unless the user -reopens them. +This iteration ships a **read-only** results surface: it reads the four +`landlord_*_overrides` tables and shows each `description → value` (per +portfolio) so the pipeline is observably e2e. **No editing / write-back** — the +user-override write path (correcting a row to `source='user'`, which the +classifier upsert won't overwrite) is the deferred "future override frontend" +ADR-0002 anticipates. + +## Next steps + +Grilling is complete (Q1–Q7). Suggested follow-ups: + +1. **Done** — Q4 promoted to + [ADR-0003](../adr/0003-classifier-triggers-as-address-subtask.md) (trigger + + state-machine integration), cross-linking the backend's ADR-0003. +2. Break the work into issues (`/to-issues`): (a) invert the mapping shape + UI + + migration + validation; (b) `sub_task` discriminator + progress view; + (c) classifier trigger (new FastAPI endpoint + payload, fire at "Start"); + (d) lambda edits (trigger body + registry-from-mapping); (e) read-only + results view. +3. Confirm the working branch (see header) before implementation starts. diff --git a/docs/wip/landlord-override-verification.md b/docs/wip/landlord-override-verification.md new file mode 100644 index 00000000..82bcff08 --- /dev/null +++ b/docs/wip/landlord-override-verification.md @@ -0,0 +1,88 @@ +# Landlord override e2e — verification & deploy checklist + +**Created:** 2026-05-28 +**Branch:** `feature/frontend_landlord_overrides` (this repo) + `feature/landlord_data` (Model repo) +**Plan:** [landlord-override-frontend-plan.md](./landlord-override-frontend-plan.md) · **ADR:** [0003-classifier-triggers-as-address-subtask.md](../adr/0003-classifier-triggers-as-address-subtask.md) + +## Context for picking this up cold + +The landlord-classifier e2e is **implemented across both repos but uncommitted**, and +**statically verified only** (frontend `tsc` 0 errors, `next lint` clean, backend +`py_compile` clean). It has **not** been run live — that needs the steps below +(migrations applied, SQS queue + env, FastAPI endpoint + lambda deployed with +OpenAI/S3/Postgres access). Two migration files are **generated but not applied**: +`0215_invert_column_mapping.sql` (data) and `0216_add_subtask_service.sql` (schema). + +Work through the sections in order — each step's prerequisites come first. + +--- + +## A. Before you start +- [ ] Use dev/preview first, not prod. +- [ ] Confirm `.env.local` DB creds (`DB_HOST/PORT/USERNAME/PASSWORD/NAME`) point at the target DB. +- [ ] **Back up** `column_mapping` — the 0215 inversion is one-shot/irreversible: + ```sql + CREATE TABLE _bak_bulk_mapping AS + SELECT id, column_mapping FROM bulk_address_uploads WHERE column_mapping IS NOT NULL; + ``` + +## B. Database migrations ⚠️ read the gotcha +`0215` = data (inverts `header→field` → `field→header`); `0216` = schema (`ADD COLUMN sub_task.service`). + +⚠️ package.json only has `migration:push` (`drizzle-kit push`). **`push` diffs schema and will NOT run the 0215 data `UPDATE`** — it would add the column but silently skip the inversion. Use `migrate`: +- [ ] ```bash + npx drizzle-kit migrate # runs 0215 then 0216 in order + ``` + (If the team only uses `push`: run `push` for 0216, then execute `0215`'s SQL manually.) +- [ ] ⚠️ Run **0215 exactly once, on old-shape data**. Re-running re-inverts and corrupts. `migrate` guards via the journal; manual runs don't. +- [ ] Verify 0215 — values should now be headers: + ```sql + SELECT column_mapping FROM bulk_address_uploads WHERE column_mapping IS NOT NULL LIMIT 5; + -- expect {"address_1":"Addr 1","postcode":"PCode", ...} + ``` +- [ ] Verify 0216: + ```sql + SELECT 1 FROM information_schema.columns + WHERE table_name='sub_task' AND column_name='service'; + ``` + +## C. Backend deploy (Model service) +- [ ] Create an SQS queue for the classifier (e.g. `landlord-description-overrides`). +- [ ] Set **`LANDLORD_OVERRIDES_SQS_URL`** in the FastAPI env to that queue. +- [ ] Deploy FastAPI so `/v1/bulk-uploads/trigger-landlord-overrides` is live. +- [ ] Deploy the lambda (`applications/landlord_description_overrides`) + event-source mapping queue → lambda. +- [ ] Lambda env/IAM: `OPENAI_API_KEY`, Postgres creds, **S3 read on the original-upload bucket** (it reads `upload.s3Bucket/s3Key`, not `retrofit-data-dev`). + +## D. Frontend deploy +- [ ] Deploy `assessment-model` with the new code (`FASTAPI_API_URL` / `FASTAPI_API_KEY` already set). + +## E. Verify the UI (Column Remapper) +- [ ] Map-columns page shows **one row per field with a header dropdown**, split into **Address fields** + **Landlord description fields**. +- [ ] Leaving Address 1 / Postcode unset blocks submit. +- [ ] Two **address** fields → one column is blocked; the **same** column → Property Type + Built Form is allowed. +- [ ] Existing `mapping_complete` uploads open with their mapping intact (confirms 0215). + +## F. Verify trigger → classify → persist +- [ ] Map ≥1 landlord-description field, click **Start address matching**. +- [ ] `sub_task` has two rows under the task: `service='address2uprn'` and `service='landlord_description_overrides'`. +- [ ] SQS message enqueued + lambda ran (CloudWatch). +- [ ] Rows appear with `source='classifier'`: + ```sql + SELECT description, value FROM landlord_property_type_overrides WHERE portfolio_id = LIMIT 10; + ``` + +## G. Verify the results view +- [ ] `/portfolio//landlord-overrides` lists `description → value` per category with a "classifier" badge. (No nav link yet — reach by URL.) + +## H. Regression — address matching unaffected +- [ ] The same upload's address pipeline still emits the canonical CSV (`Address 1`/`postcode`) and combines normally. + +## I. Watch-out (by design — ADR-0003 "accepted coupling") +- [ ] If the classifier subtask **fails**, the shared onboarding task goes FAILED and **"Run Combiner" is blocked**; the task only COMPLETEs once classification finishes. If painful, switch to the separate-task design in the ADR. + +--- + +## Still open / not done +- [ ] Commit the work (this repo + Model repo, separately) — currently uncommitted. +- [ ] Nav link to `/portfolio//landlord-overrides` (reachable by URL only). +- [ ] User-edit write-back for overrides (deferred — Q7 "read-only this iteration"). diff --git a/docs/wip/multi-entry-ordering-plan.md b/docs/wip/multi-entry-ordering-plan.md new file mode 100644 index 00000000..045f9522 --- /dev/null +++ b/docs/wip/multi-entry-ordering-plan.md @@ -0,0 +1,178 @@ +# Multi-entry building-part ordering — in-flight design notes + +**Status:** Grilling complete (2026-06-02) — ready to break into issues +**Branch:** `feature/frontend_landlord_overrides` +**Author:** Jun-te (with Claude, via `/grill-me`) + +A _design-in-progress_ document, not the ADR. It records the decisions reached +during grilling so the conversation can resume without re-litigating settled +questions. The flow + schema decision is promoted to +[ADR-0004](../adr/0004-multi-entry-building-part-ordering.md); new domain terms +are promoted to [CONTEXT.md](../../CONTEXT.md#building-parts). + +## Goal + +After address matching and classification finish, a single address row can carry +**comma-separated entries** in physical-element columns — e.g. +`Walls = "Cavity: AsBuilt (1976-1982), Cavity: FilledCavity"`, +`Roofs = "Flat: As Built, PitchedNormalLoftAccess: 200mm"`. Each entry is a +**building part** (main building + extensions). The order is ambiguous and a +**consistent per-file mistake**, so we capture the correct ordering from the user +**once per file** and persist it on the BulkUpload for a later consumer. + +## Backstory / ground truth (verified against the example file + code) + +- In `ARA AddressProfiling_Download_28-04-2026_10501 (2).xlsx` (32,213 data + rows): **0 UPRNs appear in more than one row** — multi-entry is + comma-separated values **inside one cell**, never multiple rows per address. +- In a multi-entry row the multi-valued columns **agree on count** (Walls=2 ∧ + Roofs=2) while whole-dwelling columns stay at 1 (`Property Type` = `"House: + EndTerrace"`). So position *i* is the **same building part across every + multi-valued column**. +- The classifier today **discards** this: [`get_col_to_description_mappings`](/workspaces/home/github/Model/orchestration/landlord_description_overrides_orchestrator.py) + does `value.split(",")` into a **`set`** — orderless, deduped. Correct for the + vocabulary layer (description→enum), but it drops exactly the + position/building-part association this feature needs. +- This is the **per-Property building-part fact** territory ADR-0002 deferred + ("a per-Property fact layer (not yet modelled)"). We are **not** building that + layer here — only **capturing** the ordering it will need. + +## Decided + +### Q1 — Order semantics: full reorder, keyed by count + +Position *i* = a building part. The user supplies a **permutation per distinct +entry-count**; persisted as `{ count: permutation }`. This iteration captures +only the **largest-count** sample (see Q5). + +### Q1.1 — Order scope: one ordering across all columns + +A single per-count permutation realigns **every** multi-valued column at once +(index-aligned — Walls[i] and Roofs[i] are the same part). Not per-column. +Matches the data (counts agree across columns). + +### Q1.2 — Mixed counts: single-value columns are whole-property + +A 1-entry column (e.g. `Property Type`) is a **whole-dwelling** fact attached to +the property; only columns with N>1 are sliced into building parts. No padding. + +### Q2 — Scope: capture + persist ordering only + +Detect multi-entry, show one sample address + our classification, capture the +per-count ordering, persist on the BulkUpload. **Not** in scope: the +per-Property fact table or writing main/extension facts at finalise. The +ordering is stored for a later consumer. + +### Q2.1 — Editable verification IS in scope (expands Q2) + +The "verify classification" step lets the user **correct** a classification, +written back as `source='user'`. This deliberately picks up ADR-0002 Q7's +deferred **vocabulary** user-override write path — distinct from the per-Property +fact layer, which stays deferred. + +### Q3 — Placement: on the `awaiting_review` surface + +Render the flow on the existing +[OnboardingProgress](<../../src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx>) +page when `status === "awaiting_review"`. Classification finishes *before* the +combiner (both subtasks must complete → combiner → `awaiting_review`), so by the +time Finalise is offered the classification output exists. No new route. + +### Q3.1 — Flow: two-step stepper, steps appear independently + +- **Step 1 — Verify classification** — shows whenever **≥1 classifier column** + was mapped. +- **Step 2 — Confirm order** — shows only when **multi-entry was detected**. +- A file with classifier columns but no multi-entry shows only Step 1; a file + with neither goes straight to Finalise. + +### Q3.2 — Gate: both steps gate Finalise (where each applies) + +`canFinalize = status==="awaiting_review" && (noClassifierCols || verifyAck) && +(noMultiEntry || orderingConfirmed)`. Two flags persisted. Finalise is one +click but the button stays disabled until its applicable gates are satisfied. + +### Q4 — Verify step lists the sample address's entries only + +Step 1 lists just the descriptions in the **one sample address** (matches "one +address"). Because a correction is per-`(portfolio, description)`, editing one +changes the mapping **portfolio-wide** for that text — the UI must say so. A +spot-check, not full-vocabulary coverage. + +### Q4.1 — Write-back: Next.js upsert, `source='user'`, single row (as built) + +A Next.js route handler / server action upserts the `landlord_*_overrides` row +by `(portfolio_id, description)` setting `value` + `source='user'`, validating +against the pgEnum. **Schema unchanged** — we keep ADR-0002's `UNIQUE +(portfolio_id, description)` and flip the single row's source in place. The +Python classifier's existing `ON CONFLICT … WHERE source='classifier'` +([landlord_overrides_postgres_repository.py:84-91](/workspaces/home/github/Model/infrastructure/landlord_overrides/landlord_overrides_postgres_repository.py#L84)) +then never re-clobbers it. + +> Considered and **rejected**: two rows per description (classifier + user) with +> read-time `user > classifier` resolution. It buys "revert to our suggestion" + +> provenance, and is cheap now (no readers exist yet), but reopens ADR-0002's +> `UNIQUE` decision and migrates Drizzle + 4 Python tables + the conflict target. +> Not worth it for this iteration; the single-row flip already gives "user wins". +> This is the first Next.js writer of a `source='user'` row. + +### Q5 — Which sample: the largest-count row + +Show one sample address — the row with the **most** building parts — so ordering +it reveals the fullest convention. In the common case (only N=2) that is a +single 2-part address. + +### Q5.1 — Reorder UI: label each position + +Lay the file's entries out as rows (position 0, 1, …), each with a building-part +dropdown (**Main building** / **Extension 1** / …). Assigning labels yields the +permutation and validates (each part used once, exactly one Main building). All +multi-valued columns are shown together, each raw entry annotated with our +classified enum, so the user sanity-checks classification **and** alignment. + +### Q6 — Detection: at start, persist a summary + +Compute the multi-entry summary in the **start-address-matching POST** +([route.ts:106](<../../src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts#L106>)) +where the full `rows` are already parsed in memory — which columns are +multi-valued, the distinct counts (with row-counts so we can pick the largest), +and the largest-count sample (address + per-column raw entries). Avoids +re-reading a 32k-row file at render. Classification enums are joined at render +from the override tables. + +### Q7 — Persistence: two jsonb columns on `bulk_address_uploads` + +- `multiEntrySummary jsonb` — written at start (detection). +- `multiEntryOrdering jsonb` — written at confirm: `{ count: permutation }` plus + `verifyAck` / `orderingConfirmed` flags (final shape TBD; may split flags into + their own columns). + +No new table — mirrors how `columnMapping` lives on the upload row. + +## Risks / load-bearing assumptions + +1. **Consistent-mistake assumption.** All rows of a given count share one + ordering convention. The whole "ask once" design rests on this; if a file + mixes conventions within a count, a single per-count permutation is wrong. +2. **Largest-count-only capture.** Smaller counts stay unpopulated in the map. + A future consumer (or a later UI iteration) needs a derivation rule to apply + the convention to other counts. +3. **Normalization coupling — mitigated.** To join the sample's raw entries to + the override tables the frontend must match the backend's `split(",")` → + `strip` → `lower`. **Resolution:** store the *normalized* description keys in + `multiEntrySummary` at start (the route already holds the rows), so the + render-time join is exact-match — no cross-repo string-normalization drift. +4. **Portfolio-wide blast radius.** A verify-step edit changes the mapping for + every row with that description, not just the sample address. Must be + messaged in the UI. + +## Suggested issues (`/to-issues`) + +1. Schema: two jsonb columns on `bulk_address_uploads` + migration. +2. Detection at start: compute + persist `multiEntrySummary` (with normalized + description keys). +3. Verify step: list sample descriptions → enum (join override tables), + editable; Next.js upsert route writing `source='user'`; `verifyAck` flag. +4. Order step: largest-count sample, position→part dropdowns → permutation; + persist `multiEntryOrdering`; `orderingConfirmed` flag. +5. Gate: wire `canFinalize` to the two flags; conditional stepper rendering. diff --git a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/classifications/route.ts b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/classifications/route.ts new file mode 100644 index 00000000..5be47873 --- /dev/null +++ b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/classifications/route.ts @@ -0,0 +1,68 @@ +import { + getSampleClassifications, + getUnknownOverrides, + setClassificationOverride, +} from "@/lib/bulkUpload/server"; +import { NextRequest, NextResponse } from "next/server"; +import { getServerSession } from "next-auth"; +import { AuthOptions } from "@/app/api/auth/[...nextauth]/authOptions"; +import { z } from "zod"; + +// Read-only: the classifier's resolved enums for the review sample's entries +// (field -> description -> value), plus the descriptions still classified +// `Unknown` portfolio-wide — the Finalise gate blocks until that list is empty +// and the user can resolve each via PATCH below (ADR-0004 #298, ADR-0006). +export async function GET( + _request: NextRequest, + { params }: { params: Promise<{ portfolioId: string; uploadId: string }> } +) { + const session = await getServerSession(AuthOptions); + if (!session) return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); + + const { portfolioId, uploadId } = await params; + const [classifications, unknown] = await Promise.all([ + getSampleClassifications(uploadId), + getUnknownOverrides(portfolioId), + ]); + return NextResponse.json({ classifications, unknown }, { status: 200 }); +} + +const PatchSchema = z.object({ + field: z.string(), + description: z.string(), + value: z.string(), +}); + +// Correct one classification, written as a user override (source='user'). The +// edit is per-(portfolio, description), so it applies portfolio-wide (issue #299). +export async function PATCH( + request: NextRequest, + { params }: { params: Promise<{ portfolioId: string; uploadId: string }> } +) { + const session = await getServerSession(AuthOptions); + if (!session) return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); + + const { portfolioId } = await params; + + let body; + try { + body = PatchSchema.parse(await request.json()); + } catch { + return NextResponse.json({ error: "Invalid input" }, { status: 400 }); + } + + try { + const result = await setClassificationOverride( + portfolioId, + body.field, + body.description, + body.value, + ); + if (result.kind === "invalid") + return NextResponse.json({ error: result.reason }, { status: 422 }); + return NextResponse.json({ ok: true }, { status: 200 }); + } catch (error) { + console.error("Failed to save classification override:", error); + return NextResponse.json({ error: "Internal server error" }, { status: 500 }); + } +} diff --git a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/combine/route.ts b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/combine/route.ts index afee7f0e..bc2f8438 100644 --- a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/combine/route.ts +++ b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/combine/route.ts @@ -25,6 +25,10 @@ export async function POST( ); case "already_combined": return NextResponse.json({ alreadyCombined: true }, { status: 200 }); + case "already_dispatched": + // Lost the double-dispatch CAS (or the combiner is already running) — a + // benign no-op; the client just keeps polling and sees `combining`. + return NextResponse.json({ alreadyDispatched: true }, { status: 200 }); case "not_found": return NextResponse.json({ error: "Not found" }, { status: 404 }); case "missing_task": diff --git a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/finalize/route.ts b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/finalize/route.ts index b6fd7ec9..f326c65c 100644 --- a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/finalize/route.ts +++ b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/finalize/route.ts @@ -1,160 +1,46 @@ -import { db } from "@/app/db/db"; -import { property } from "@/app/db/schema/property"; -import { sql } from "drizzle-orm"; import { NextRequest, NextResponse } from "next/server"; -import { revalidatePath } from "next/cache"; import { getServerSession } from "next-auth"; import { AuthOptions } from "@/app/api/auth/[...nextauth]/authOptions"; -import { createRetrofitDataS3Client } from "@/app/utils/s3"; -import * as XLSX from "xlsx"; -import { loadForFinalize, markFinalized } from "@/lib/bulkUpload/server"; - -const ADDRESS_COLS = ["Address 1", "Address 2", "Address 3"] as const; -const POSTCODE_COL = "postcode"; -const INTERNAL_REF_COL = "Internal Reference"; -const UPRN_COL = "address2uprn_uprn"; -const MATCHED_ADDRESS_COL = "address2uprn_address"; -const LEXISCORE_COL = "address2uprn_lexiscore"; -const MISSING_SENTINEL = "invalid postcode"; -const UK_POSTCODE_RE = /[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i; - -function normalize(v: unknown): string { - if (v === null || v === undefined) return ""; - return String(v).trim(); -} - -function isMissing(v: string): boolean { - return v === "" || v.toLowerCase() === MISSING_SENTINEL; -} - -function parseUprn(raw: unknown): bigint | null { - const v = normalize(raw); - if (isMissing(v)) return null; - try { - return BigInt(v); - } catch { - return null; - } -} - -function parseLexiscore(raw: unknown): number | null { - const v = normalize(raw); - if (isMissing(v)) return null; - const n = Number(v); - return Number.isFinite(n) ? n : null; -} - -function extractPostcode(matched: string | null, fallback: string): string | null { - if (matched) { - const m = matched.match(UK_POSTCODE_RE); - if (m) return m[0].toUpperCase(); - } - return fallback || null; -} - -function parseS3Uri(uri: string): { bucket: string; key: string } | null { - if (!uri.startsWith("s3://")) return null; - const rest = uri.slice(5); - const slash = rest.indexOf("/"); - if (slash < 0) return null; - return { bucket: rest.slice(0, slash), key: rest.slice(slash + 1) }; -} +import { readSessionToken } from "@/lib/session"; +import { dispatchFinaliser } from "@/lib/bulkUpload/server"; +// Finalise is now asynchronous (ADR-0005). This route no longer inserts +// properties; it dispatches the bulk_upload_finaliser Lambda and flips the +// BulkUpload to `finalising` via a compare-and-swap (the double-dispatch guard). +// The Lambda reads the combiner output, inserts the property rows, and writes the +// terminal `complete`/`failed` status directly. The user sees "Uploading to ARA" +// while the row is `finalising`; the onboarding surface polls for the outcome. export async function POST( - _request: NextRequest, + request: NextRequest, { params }: { params: Promise<{ portfolioId: string; uploadId: string }> } ) { const session = await getServerSession(AuthOptions); if (!session) return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); const { uploadId } = await params; + const sessionToken = readSessionToken(request); - const guarded = await loadForFinalize(uploadId); - switch (guarded.kind) { + const result = await dispatchFinaliser({ uploadId, sessionToken }); + + switch (result.kind) { + case "ok": + // Accepted: the finaliser is running; the row is now `finalising`. + return NextResponse.json({ taskId: result.taskId }, { status: 202 }); case "not_found": return NextResponse.json({ error: "Not found" }, { status: 404 }); case "already_finalized": + // Idempotent: nothing to do. return new NextResponse(null, { status: 200 }); - case "wrong_state": - return NextResponse.json( - { error: `Upload not ready to finalize (state: ${guarded.current})` }, - { status: 409 } - ); case "not_yet_combined": return NextResponse.json({ error: "Combiner not finished" }, { status: 409 }); - } - const upload = guarded.upload; - - const parsed = parseS3Uri(upload.combinedOutputS3Uri!); - if (!parsed) { - return NextResponse.json({ error: "Invalid combined output S3 URI" }, { status: 500 }); - } - - const s3 = createRetrofitDataS3Client(); - - let rawRows: Record[]; - try { - const obj = await s3 - .getObject({ Bucket: parsed.bucket, Key: parsed.key }) - .promise(); - const buf = Buffer.from(obj.Body as Uint8Array); - const wb = XLSX.read(buf, { type: "buffer" }); - const sheet = wb.Sheets[wb.SheetNames[0]]; - rawRows = XLSX.utils.sheet_to_json>(sheet, { defval: "" }); - } catch (err) { - console.error("Failed to read combined CSV from S3:", err); - return NextResponse.json({ error: "Failed to read combined CSV" }, { status: 502 }); - } - - const portfolioIdBig = BigInt(upload.portfolioId); - - const values = rawRows.map((raw) => { - const userInputtedAddress = - ADDRESS_COLS.map((c) => normalize(raw[c])).filter(Boolean).join(", ") || null; - const userInputtedPostcode = normalize(raw[POSTCODE_COL]) || null; - - const uprn = parseUprn(raw[UPRN_COL]); - - const matchedAddressRaw = normalize(raw[MATCHED_ADDRESS_COL]); - const matchedAddress = isMissing(matchedAddressRaw) ? null : matchedAddressRaw; - - const address = matchedAddress ?? userInputtedAddress; - const postcode = extractPostcode(matchedAddress, userInputtedPostcode ?? ""); - - const internalRef = normalize(raw[INTERNAL_REF_COL]) || null; - const lexiscore = parseLexiscore(raw[LEXISCORE_COL]); - - return { - portfolioId: portfolioIdBig, - creationStatus: "READY" as const, - uprn, - landlordPropertyId: internalRef, - address, - postcode, - userInputtedAddress, - userInputtedPostcode, - lexiscore, - }; - }); - - try { - if (values.length > 0) { - await db - .insert(property) - .values(values) - .onConflictDoNothing({ - target: [property.portfolioId, property.uprn], - where: sql`${property.uprn} IS NOT NULL`, - }); - } - - await markFinalized(uploadId); - - revalidatePath("/portfolio/[slug]", "layout"); - - return new NextResponse(null, { status: 200 }); - } catch (err) { - console.error("Failed to finalize bulk upload:", err); - return NextResponse.json({ error: "Failed to import properties" }, { status: 500 }); + case "missing_task": + return NextResponse.json({ error: "Upload has no task to finalise" }, { status: 409 }); + case "wrong_state": + return NextResponse.json( + { error: `Upload not ready to finalize (state: ${result.current})` }, + { status: 409 } + ); + case "trigger_failed": + return NextResponse.json({ error: result.message }, { status: result.status }); } } diff --git a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/multi-entry-ordering/route.ts b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/multi-entry-ordering/route.ts new file mode 100644 index 00000000..676032fa --- /dev/null +++ b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/multi-entry-ordering/route.ts @@ -0,0 +1,49 @@ +import { setMultiEntryOrdering } from "@/lib/bulkUpload/server"; +import { NextRequest, NextResponse } from "next/server"; +import { getServerSession } from "next-auth"; +import { AuthOptions } from "@/app/api/auth/[...nextauth]/authOptions"; +import { z } from "zod"; + +const PatchSchema = z.object({ + // entry-count -> permutation (part slot -> file position). See ADR-0004. + permutations: z.record(z.string(), z.array(z.number().int().nonnegative())), +}); + +export async function PATCH( + request: NextRequest, + { params }: { params: Promise<{ portfolioId: string; uploadId: string }> } +) { + const session = await getServerSession(AuthOptions); + if (!session) return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); + + const { uploadId } = await params; + + let body; + try { + body = PatchSchema.parse(await request.json()); + } catch { + return NextResponse.json({ error: "Invalid input" }, { status: 400 }); + } + + try { + const result = await setMultiEntryOrdering(uploadId, body.permutations); + switch (result.kind) { + case "ok": + return NextResponse.json(result.upload, { status: 200 }); + case "not_found": + return NextResponse.json({ error: "Not found" }, { status: 404 }); + case "wrong_state": + return NextResponse.json( + { error: `Cannot set ordering in state '${result.current}'` }, + { status: 409 } + ); + case "not_multi_entry": + return NextResponse.json({ error: "Upload has no multi-entry rows" }, { status: 409 }); + case "invalid_ordering": + return NextResponse.json({ error: result.reason }, { status: 422 }); + } + } catch (error) { + console.error("Failed to save multi-entry ordering:", error); + return NextResponse.json({ error: "Internal server error" }, { status: 500 }); + } +} diff --git a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts index 5fd282fa..e7e6ae50 100644 --- a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts +++ b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts @@ -1,39 +1,35 @@ import { NextRequest, NextResponse } from "next/server"; import { getServerSession } from "next-auth"; +import { randomUUID } from "node:crypto"; import { AuthOptions } from "@/app/api/auth/[...nextauth]/authOptions"; import { createS3Client, createRetrofitDataS3Client, retrofitDataS3Bucket } from "@/app/utils/s3"; import * as XLSX from "xlsx"; -import { loadForAddressMatching, triggerAddressMatching } from "@/lib/bulkUpload/server"; +import { loadForAddressMatching, saveMultiEntrySummary, triggerAddressMatching, triggerClassifier } from "@/lib/bulkUpload/server"; import { readSessionToken } from "@/lib/session"; +import { ADDRESS_FIELDS, classifierMapping } from "@/lib/bulkUpload/columnFields"; +import { addressCsvKey, classifierCsvKey, SOURCE_ROW_ID_COLUMN } from "@/lib/bulkUpload/s3Keys"; +import { detectMultiEntry } from "@/lib/bulkUpload/multiEntry"; -const FIELD_RENAME: Record = { - address_1: "Address 1", - address_2: "Address 2", - address_3: "Address 3", - postcode: "postcode", - internal_reference: "Internal Reference", -}; +type SheetRow = Record; -function transformFile( - buffer: Buffer, - columnMapping: Record -): { csv: string; error?: never } | { csv?: never; error: string } { +function readRows(buffer: Buffer): SheetRow[] { const wb = XLSX.read(buffer, { type: "buffer" }); const sheet = wb.Sheets[wb.SheetNames[0]]; - const rows = XLSX.utils.sheet_to_json>(sheet, { defval: "" }); + return XLSX.utils.sheet_to_json(sheet, { defval: "" }); +} - if (rows.length === 0) return { error: "Empty file" }; - - const sourceHeaders = Object.keys(rows[0]); +// Address-matching CSV: address fields only, renamed to canonical headers. +function buildAddressCsv( + rows: SheetRow[], + columnMapping: Record // field → source header +): { csv: string; error?: never } | { csv?: never; error: string } { const outputHeaders: string[] = []; - const sourceToOutput: Record = {}; - - for (const src of sourceHeaders) { - const mapped = columnMapping[src]; - if (!mapped || mapped === "skip") continue; - const renamed = FIELD_RENAME[mapped] ?? mapped; - outputHeaders.push(renamed); - sourceToOutput[src] = renamed; + const outputToSource: Record = {}; + for (const field of ADDRESS_FIELDS) { + const src = columnMapping[field.value]; + if (!src || !field.outputHeader) continue; + outputHeaders.push(field.outputHeader); + outputToSource[field.outputHeader] = src; } if (!outputHeaders.includes("Address 1")) @@ -41,11 +37,17 @@ function transformFile( if (!outputHeaders.includes("postcode")) return { error: 'Mapping must include "postcode"' }; + // Carry the synthetic per-row join key through to the combiner output, so the + // finaliser can re-associate a UPRN-matched row with its classifier + // descriptions (ADR-0006). It rides `address2uprn` as a preserved input column. + outputHeaders.push(SOURCE_ROW_ID_COLUMN); + const outputRows = rows.map((row) => { - const out: Record = {}; - for (const [src, renamed] of Object.entries(sourceToOutput)) { - out[renamed] = row[src] ?? ""; + const out: SheetRow = {}; + for (const [outName, src] of Object.entries(outputToSource)) { + out[outName] = row[src] ?? ""; } + out[SOURCE_ROW_ID_COLUMN] = row[SOURCE_ROW_ID_COLUMN] ?? ""; return out; }); @@ -53,6 +55,32 @@ function transformFile( return { csv: XLSX.utils.sheet_to_csv(outSheet) }; } +// Classifier CSV: the mapped classifier source columns only, original headers +// preserved (the lambda resolves them via column_mapping). Converting here means +// the classifier always reads a real CSV even when the upload was .xlsx/.xls — +// see ADR-0003. One source header may feed several categories, so dedupe to +// distinct headers. +function buildClassifierCsv( + rows: SheetRow[], + classifierMap: Record // category → source header +): string { + const sourceHeaders = [...new Set(Object.values(classifierMap))]; + // Emit the synthetic join key alongside the classifier columns so the + // finaliser can join this row's descriptions to its combiner identity by + // `source_row_id` (ADR-0006). `buildClassifierCsv` projects a fixed column + // set, so the key must be added explicitly — attaching it to the row is not + // enough. + const headers = [...sourceHeaders, SOURCE_ROW_ID_COLUMN]; + const outputRows = rows.map((row) => { + const out: SheetRow = {}; + for (const h of sourceHeaders) out[h] = row[h] ?? ""; + out[SOURCE_ROW_ID_COLUMN] = row[SOURCE_ROW_ID_COLUMN] ?? ""; + return out; + }); + const outSheet = XLSX.utils.json_to_sheet(outputRows, { header: headers }); + return XLSX.utils.sheet_to_csv(outSheet); +} + export async function POST( request: NextRequest, { params }: { params: Promise<{ portfolioId: string; uploadId: string }> } @@ -91,11 +119,29 @@ export async function POST( return NextResponse.json({ error: "Failed to read source file" }, { status: 500 }); } - const transformed = transformFile(fileBuffer, upload.columnMapping!); + const parsedRows = readRows(fileBuffer); + if (parsedRows.length === 0) + return NextResponse.json({ error: "Empty file" }, { status: 422 }); + + // Mint a stable synthetic id per source row, here at the one point both CSVs + // are built from the same array, and write it into both. It is the finaliser's + // join key between the combiner output (identity) and the classifier CSV + // (descriptions) — see ADR-0006. Deterministic ordering is not required: both + // CSVs are produced together in this handler, so they always share values. + const rows = parsedRows.map((row) => ({ + ...row, + [SOURCE_ROW_ID_COLUMN]: randomUUID(), + })); + + // Detect multi-entry building parts now, while the whole file is parsed in + // memory, so the awaiting_review surface never re-reads it (ADR-0004). + await saveMultiEntrySummary(uploadId, detectMultiEntry(rows, upload.columnMapping!)); + + const transformed = buildAddressCsv(rows, upload.columnMapping!); if (transformed.error) return NextResponse.json({ error: transformed.error }, { status: 422 }); - const transformedKey = `bulk_onboarding_inputs/${portfolioId}/${uploadId}.csv`; + const transformedKey = addressCsvKey(portfolioId, uploadId); try { await outputS3 .putObject({ @@ -112,13 +158,37 @@ export async function POST( const s3Uri = `s3://${outputBucket}/${transformedKey}`; - const trigger = await triggerAddressMatching({ - uploadId, - s3Uri, - sessionToken: readSessionToken(request), - }); + // Convert the mapped classifier columns to their own CSV so the classifier + // lambda always parses a real CSV, never the raw upload (which may be + // .xlsx/.xls). Only when the user mapped ≥1 classifier column. See ADR-0003. + const classifierMap = classifierMapping(upload.columnMapping!); + let classifierS3Uri: string | undefined; + if (Object.keys(classifierMap).length > 0) { + const classifierKey = classifierCsvKey(portfolioId, uploadId); + try { + await outputS3 + .putObject({ + Bucket: outputBucket, + Key: classifierKey, + Body: buildClassifierCsv(rows, classifierMap), + ContentType: "text/csv", + }) + .promise(); + classifierS3Uri = `s3://${outputBucket}/${classifierKey}`; + } catch (err) { + // Non-blocking: classification is skipped, address matching proceeds. + console.error("Failed to upload classifier CSV:", err); + } + } + + const sessionToken = readSessionToken(request); + const trigger = await triggerAddressMatching({ uploadId, s3Uri, sessionToken }); if (trigger.kind === "trigger_failed") return NextResponse.json({ error: trigger.message }, { status: trigger.status }); + // Co-fire the landlord classifier (non-blocking) under the same task. + if (classifierS3Uri) + await triggerClassifier({ taskId: trigger.taskId, uploadId, s3Uri: classifierS3Uri, sessionToken }); + return NextResponse.json({ taskId: trigger.taskId }, { status: 200 }); } diff --git a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/verify-classification/route.ts b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/verify-classification/route.ts new file mode 100644 index 00000000..391b3403 --- /dev/null +++ b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/verify-classification/route.ts @@ -0,0 +1,35 @@ +import { setVerifyAck } from "@/lib/bulkUpload/server"; +import { NextRequest, NextResponse } from "next/server"; +import { getServerSession } from "next-auth"; +import { AuthOptions } from "@/app/api/auth/[...nextauth]/authOptions"; + +// Records the "Verify classification" acknowledgement (ADR-0004 Step 1). No +// body — the per-row corrections go through the classifications PATCH; this just +// marks that the user has checked the sample and unlocks Finalise. +export async function PATCH( + request: NextRequest, + { params }: { params: Promise<{ portfolioId: string; uploadId: string }> } +) { + const session = await getServerSession(AuthOptions); + if (!session) return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); + + const { uploadId } = await params; + + try { + const result = await setVerifyAck(uploadId); + switch (result.kind) { + case "ok": + return NextResponse.json(result.upload, { status: 200 }); + case "not_found": + return NextResponse.json({ error: "Not found" }, { status: 404 }); + case "wrong_state": + return NextResponse.json( + { error: `Cannot verify classification in state '${result.current}'` }, + { status: 409 } + ); + } + } catch (error) { + console.error("Failed to record classification verification:", error); + return NextResponse.json({ error: "Internal server error" }, { status: 500 }); + } +} diff --git a/src/app/db/schema/bulk_address_uploads.ts b/src/app/db/schema/bulk_address_uploads.ts index 3e3c1dc9..b695c140 100644 --- a/src/app/db/schema/bulk_address_uploads.ts +++ b/src/app/db/schema/bulk_address_uploads.ts @@ -22,17 +22,26 @@ export interface MultiEntrySummary { multiValuedFields: string[]; countDistribution: Record; largestCount: number; + // Step 1 (verify) sample: the largest-count row when multi-entry, else the + // first classified row. `null` ⇒ nothing to verify. sample: MultiEntrySample | null; + // Step 2 (order): one sample per distinct entry-count ≥ 2 present in the file, + // keyed by count. Each count needs its OWN confirmed permutation — a smaller + // count's ordering can't be derived from a larger one (ADR-0004, amended + // 2026-06-05). Absent on uploads detected before that amendment. + samplesByCount?: Record; } -// User-confirmed building-part ordering (ADR-0004). Keyed by entry-count so it -// can hold more than one count later; this iteration populates only the -// largest. permutations[count][k] = the 0-based file position holding building -// part k, where 0 = Main building, 1..N-1 = Extension 1..N-1. +// User-confirmed building-part ordering (ADR-0004, amended 2026-06-05). Keyed by +// entry-count: a permutation is captured for EVERY distinct count ≥ 2 in the +// file (the v2 fact layer can't derive one count's order from another). +// permutations[count][k] = the 0-based file position holding building part k, +// where 0 = Main building, 1..N-1 = Extension 1..N-1. // e.g. { "2": [1, 0] } => for 2-part rows the main building is file position 1. export interface MultiEntryOrdering { permutations: Record; - // Set once the user confirms; gates Finalise when the upload is multi-entry. + // True once EVERY detected count ≥ 2 has a permutation; gates Finalise when the + // upload is multi-entry. confirmed: boolean; } @@ -46,16 +55,8 @@ export const bulkAddressUploads = pgTable("bulk_address_uploads", { status: text("status").notNull().default("ready_for_processing"), sourceHeaders: text("source_headers").array().notNull().default(sql`'{}'`), columnMapping: jsonb("column_mapping").$type>(), - // Multi-entry building-part detection, computed at start-address-matching - // and read by the awaiting_review review surface (ADR-0004). multiEntrySummary: jsonb("multi_entry_summary").$type(), - // User-confirmed building-part ordering for the multi-entry sample (ADR-0004). multiEntryOrdering: jsonb("multi_entry_ordering").$type(), - // Step 1 "Verify classification" acknowledgement (ADR-0004). Gates Finalise - // whenever the upload has classifier columns — independent of multi-entry, so - // it lives in its own column rather than on multiEntryOrdering. Set true by - // the verify-classification route once the user confirms the sample's - // description→enum mapping. verifyAck: boolean("verify_ack").notNull().default(false), taskId: uuid("task_id"), combinedOutputS3Uri: text("combined_output_s3_uri"), diff --git a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx index 3824b1b4..f5dc7cd0 100644 --- a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx +++ b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx @@ -1,18 +1,59 @@ "use client"; +import { useState } from "react"; import { useRouter } from "next/navigation"; import Link from "next/link"; -import { ArrowRightIcon } from "@heroicons/react/24/outline"; +import { ArrowRightIcon, CheckCircleIcon } from "@heroicons/react/24/outline"; import { useBulkUploadProgress, + useConfirmMultiEntryOrdering, + useConfirmVerification, + useEditClassification, useFinalize, useRequestCombine, + useSampleClassifications, + type SampleClassifications, } from "@/lib/bulkUpload/client"; +import { + partLabel, + isPermutation, + assignmentToPermutation, + type MultiEntrySample, +} from "@/lib/bulkUpload/multiEntry"; +import type { MultiEntryOrdering } from "@/app/db/schema/bulk_address_uploads"; +import { + PropertyTypeValues, + BuiltFormTypeValues, + WallTypeValues, + RoofTypeValues, +} from "@/app/db/schema/landlord_overrides"; +import { CLASSIFIER_FIELDS } from "@/lib/bulkUpload/columnFields"; +import { statusLabel, isTerminalStatus } from "@/lib/bulkUpload/types"; + +// Valid enum options per classifier category, for the editable dropdowns (#299). +const CATEGORY_VALUES: Record = { + property_type: PropertyTypeValues, + built_form_type: BuiltFormTypeValues, + wall_type: WallTypeValues, + roof_type: RoofTypeValues, +}; + +// Our category label per classifier field (e.g. built_form_type → "Built Form"). +// Distinguishes the categories when several read from the same source column — +// Property Type and Built Form both come from one column, so labelling by the +// customer's header alone is ambiguous. +const FIELD_LABEL: Record = Object.fromEntries( + CLASSIFIER_FIELDS.map((f) => [f.value, f.label]), +); interface Props { portfolioSlug: string; portfolioId: string; uploadId: string; + // The status at the last server render. Used to refresh the server page exactly + // once when polling first observes a terminal status (async finalise, ADR-0005), + // so the page advances from "Uploading to ARA" to the "Processing complete" card. + serverStatus: string; isDomnaUser: boolean; } @@ -23,13 +64,33 @@ export default function OnboardingProgress({ portfolioSlug, portfolioId, uploadId, + serverStatus, isDomnaUser, }: Props) { const router = useRouter(); - const progress = useBulkUploadProgress(portfolioId, uploadId); + const progress = useBulkUploadProgress(portfolioId, uploadId, { + // When the async finaliser finishes, the poll flips the status to a terminal + // value while the server page is still on `finalising`. Refresh once so the + // server re-renders the "Processing complete" / "failed" card. Guarding on the + // non-terminal serverStatus prevents a refresh loop: after the refresh the + // prop is terminal, so this no-ops. + onSuccess: (data) => { + if (!isTerminalStatus(serverStatus) && isTerminalStatus(data.upload.status)) { + router.refresh(); + } + }, + }); const combine = useRequestCombine(portfolioId, uploadId); const finalize = useFinalize(portfolioId, uploadId); + // Read-only classifications for the multi-entry sample (issue #298). Fetched + // only once a sample exists at awaiting_review. Hook stays above the early + // returns so its order is stable. + const sampleReady = + progress.data?.upload.status === "awaiting_review" && + !!progress.data.upload.multiEntrySummary?.sample; + const classifications = useSampleClassifications(portfolioId, uploadId, sampleReady); + if (progress.isError) return null; if (!progress.data) { return ( @@ -41,19 +102,61 @@ export default function OnboardingProgress({ } const { task, upload } = progress.data; - const total = task?.totalSubtasks ?? 0; - const completedSubtasks = task?.completedSubtasks ?? 0; - const failedSubtasks = task?.failedSubtasks ?? 0; + // Address-matching batches drive the bar; classification is shown separately. + const total = task?.addressTotal ?? 0; + const completedSubtasks = task?.addressCompleted ?? 0; + const failedSubtasks = task?.addressFailed ?? 0; const percent = total > 0 ? Math.round((completedSubtasks / total) * 100) : 0; + const classifierTotal = task?.classifierTotal ?? 0; + const classifierCompleted = task?.classifierCompleted ?? 0; + const classifierFailed = task?.classifierFailed ?? 0; + const taskStatus = task?.status.toLowerCase() ?? ""; const taskDone = TASK_TERMINAL_STATUSES.has(taskStatus); const taskFailed = TASK_FAILED_STATUSES.has(taskStatus); const isCombining = upload.status === "combining"; const isImporting = upload.status === "awaiting_review"; + // Async finaliser in flight (ADR-0005). Polling continues (non-terminal) until + // the backend writes complete/failed. Surfaced to the user as "Uploading to ARA". + const isFinalising = upload.status === "finalising"; const canRunCombiner = taskDone && !taskFailed && upload.status === "processing"; - const canFinalize = upload.status === "awaiting_review"; + const isAwaitingReview = upload.status === "awaiting_review"; + + // Two-step review on the awaiting_review surface (ADR-0004). The sample exists + // whenever classifier columns were mapped; multi-entry is largestCount >= 2. + // Step 1 (verify) applies whenever there's a sample; Step 2 (order) only when + // multi-entry. Each gates Finalise where it applies. + const sample = isAwaitingReview ? (upload.multiEntrySummary?.sample ?? null) : null; + const isMultiEntry = (upload.multiEntrySummary?.largestCount ?? 0) >= 2; + const verifyAck = upload.verifyAck ?? false; + const orderingConfirmed = upload.multiEntryOrdering?.confirmed ?? false; + const needsVerify = !!sample; + const needsOrdering = !!sample && isMultiEntry; + // One ordering panel per distinct count ≥ 2, ascending (ADR-0004 amendment). + // Fall back to the single Step-1 sample for uploads detected before per-count + // capture existed (samplesByCount absent). + const samplesByCount = upload.multiEntrySummary?.samplesByCount; + const orderingSamples: Array<[string, MultiEntrySample]> = + samplesByCount && Object.keys(samplesByCount).length > 0 + ? Object.entries(samplesByCount).sort(([a], [b]) => Number(a) - Number(b)) + : sample && isMultiEntry + ? [[String(sample.count), sample]] + : []; + const showStepNumbers = needsVerify && needsOrdering; + // Descriptions still classified `Unknown` block Finalise — the user must map + // every one to a real value, else the finaliser fails loudly (ADR-0006). + const unknownByField = classifications.data?.unknown ?? {}; + const unknownTotal = Object.values(unknownByField).reduce( + (n, descriptions) => n + descriptions.length, + 0, + ); + const canFinalize = + isAwaitingReview && + (!needsVerify || verifyAck) && + (!needsOrdering || orderingConfirmed) && + unknownTotal === 0; return (
@@ -65,21 +168,40 @@ export default function OnboardingProgress({
- {total > 0 && ( - - {completedSubtasks} / {total} batches complete - - )} - {failedSubtasks > 0 && ( - - - {failedSubtasks} failed - - )} - {!taskDone && ( - - - Running + {/* Address matching: standardises addresses against the OS lookup, in batches. */} + + Address matching: + {failedSubtasks > 0 ? ( + + + {failedSubtasks} of {total} batches failed + + ) : total > 0 && completedSubtasks >= total ? ( + complete + ) : ( + + + running{total > 0 ? ` · ${completedSubtasks} / ${total} batches` : ""} + + )} + + {/* Classification: turns the landlord's free-text descriptions into EPC categories. */} + {classifierTotal > 0 && ( + + Classification: + {classifierFailed > 0 ? ( + + + failed + + ) : classifierCompleted >= classifierTotal ? ( + complete + ) : ( + + + running + + )} )} {isCombining && ( @@ -94,9 +216,58 @@ export default function OnboardingProgress({ Awaiting import )} + {isFinalising && ( + + + {statusLabel("finalising")}… + + )}
- {(canRunCombiner || canFinalize) && ( + {needsVerify && sample && ( + + )} + + {isAwaitingReview && unknownTotal > 0 && ( + + )} + + {needsOrdering && orderingSamples.length > 0 && ( +
+ {orderingSamples.map(([count, orderSample], i) => ( + 1 + ? `Part group ${i + 1}` + : undefined + } + portfolioId={portfolioId} + uploadId={uploadId} + /> + ))} +
+ )} + + {(canRunCombiner || isAwaitingReview) && (
{canRunCombiner && ( combine.mutate()} /> )} - {canFinalize && ( + {isAwaitingReview && ( 0 + ? `Resolve ${unknownTotal} unclassified description${unknownTotal === 1 ? "" : "s"} first` + : needsVerify && !verifyAck + ? "Verify the classification first" + : "Confirm the building-part order first" + } onClick={() => finalize.mutate(undefined, { onSuccess: () => router.refresh() }) } @@ -141,23 +320,395 @@ export default function OnboardingProgress({ ); } +// Step 1 — Verify classification (ADR-0004). Lists how we read the sample +// address's descriptions (per mapped classifier column) and lets the user +// correct any, written back as source='user'. Acknowledging unlocks Finalise. +// Shown whenever classifier columns were mapped, multi-entry or not. +function VerifyClassificationPanel({ + sample, + classifications, + verified, + stepLabel, + portfolioId, + uploadId, +}: { + sample: MultiEntrySample; + classifications: SampleClassifications; + verified: boolean; + stepLabel?: string; + portfolioId: string; + uploadId: string; +}) { + const editClassification = useEditClassification(portfolioId, uploadId); + const confirm = useConfirmVerification(portfolioId, uploadId); + + return ( +
+

+ {stepLabel ? `${stepLabel} — ` : ""}Verify classification +

+

+ Based on your column mapping, here's how we read{" "} + {sample.address ? {sample.address} : "one sample address"}. + Correct anything that's wrong, then confirm. +

+ +
+ {sample.columns.map((column) => { + // One editable row per distinct description, in file order. + const seen = new Set(); + const entries = column.entries.filter((entry) => { + if (seen.has(entry.description)) return false; + seen.add(entry.description); + return true; + }); + const options = CATEGORY_VALUES[column.field] ?? []; + return ( +
+

+ {FIELD_LABEL[column.field] ?? column.field} + + — from your “{column.header}” column + +

+
+ {entries.map((entry) => { + const classified = classifications[column.field]?.[entry.description] ?? ""; + return ( +
+ + {entry.raw} + + + +
+ ); + })} +
+
+ ); + })} +
+ +

+ Correcting a classification updates that description for{" "} + every row across the portfolio, not + just this address. +

+ {editClassification.error && ( +

{editClassification.error.message}

+ )} + +
+ + {verified && !confirm.isPending && ( + + + Classification verified + + )} +
+ {confirm.error && ( +

{confirm.error.message}

+ )} +
+ ); +} + +// Interactive building-part ordering for ONE entry-count's sample (ADR-0004, +// amended 2026-06-05 — one panel per distinct count). The user labels each file +// position with a building part (one Main building + Extensions); the labels +// must form a permutation. Confirming persists this count's ordering (merged +// server-side with the other counts'); Finalise unlocks once every count is +// confirmed. +function MultiEntryOrderingPanel({ + sample, + ordering, + classifications, + stepLabel, + portfolioId, + uploadId, +}: { + sample: MultiEntrySample; + ordering: MultiEntryOrdering | null; + classifications: SampleClassifications; + stepLabel?: string; + portfolioId: string; + uploadId: string; +}) { + const confirm = useConfirmMultiEntryOrdering(portfolioId, uploadId); + const count = sample.count; + // Only the multi-valued columns are sliced into building parts; single-value + // columns are whole-dwelling facts (verified in Step 1, not ordered here). + const orderColumns = sample.columns.filter((column) => column.entries.length > 1); + + // assignment[filePosition] = building-part slot. Seed from a stored ordering + // (slot -> position, so invert) or default to identity (main building first). + const [assignment, setAssignment] = useState(() => { + const stored = ordering?.permutations?.[String(count)]; + if (stored && stored.length === count) { + const seeded = new Array(count); + stored.forEach((position, slot) => { + seeded[position] = slot; + }); + return seeded; + } + return Array.from({ length: count }, (_, i) => i); + }); + + // Per-panel confirmation reflects whether THIS count's permutation is stored, + // not the global all-counts-confirmed flag — so each panel gives its own + // feedback as the user works through them. + const confirmed = Array.isArray(ordering?.permutations?.[String(count)]); + const valid = isPermutation(assignment); + + const setSlot = (position: number, slot: number) => + setAssignment((prev) => prev.map((s, i) => (i === position ? slot : s))); + + const onConfirm = () => { + if (!valid) return; + confirm.mutate({ + permutations: { [String(count)]: assignmentToPermutation(assignment) }, + }); + }; + + return ( +
+

+ {stepLabel ? `${stepLabel} — ` : ""}Confirm building-part order +

+

+ {sample.address ? {sample.address} : "An address"}{" "} + has {count} building parts. Tell us which entry is the main building and + which are extensions — we'll apply the same order to every{" "} + {count}-part row in this file. +

+ +
+ + + + + {orderColumns.map((column) => ( + + ))} + + + + + {Array.from({ length: count }).map((_, position) => ( + + + {orderColumns.map((column) => { + const entry = column.entries[position]; + const classified = entry + ? classifications[column.field]?.[entry.description] ?? "" + : ""; + return ( + + ); + })} + + + ))} + +
Entry + {FIELD_LABEL[column.field] ?? column.header} + + your “{column.header}” + + Building part
{position + 1} +
{entry?.raw ?? "—"}
+ {/* Read-only classification annotation; edit it in Step 1. */} + {entry && ( +
+ {classified || "not classified"} +
+ )} +
+ +
+
+ + {!valid && ( +

+ Each part (Main building, Extension 1, …) must be used exactly once. +

+ )} + +
+ + {confirmed && !confirm.isPending && ( + + + Order confirmed + + )} +
+ + {confirm.error && ( +

{confirm.error.message}

+ )} +
+ ); +} + +// Unresolved-classification gate (ADR-0006). Lists every description still +// classified `Unknown` portfolio-wide and lets the user map each to a real value +// via the same per-description override path as Step 1 (it applies portfolio- +// wide). Finalise stays blocked until this list is empty — `Unknown` is never a +// final value, and an unresolved one would fail the import loudly. +function UnresolvedClassificationsPanel({ + unknown, + portfolioId, + uploadId, +}: { + unknown: Record; + portfolioId: string; + uploadId: string; +}) { + const editClassification = useEditClassification(portfolioId, uploadId); + const total = Object.values(unknown).reduce((n, d) => n + d.length, 0); + + return ( +
+

+ Resolve unclassified descriptions ({total}) +

+

+ We couldn't classify these automatically. Map each to a category + before finalising — an unresolved value would fail the import. Edits apply + to every row across the portfolio. +

+ +
+ {Object.entries(unknown).map(([field, descriptions]) => { + const options = (CATEGORY_VALUES[field] ?? []).filter((o) => o !== "Unknown"); + return ( +
+

+ {FIELD_LABEL[field] ?? field} +

+
+ {descriptions.map((description) => ( +
+ + {description} + + + +
+ ))} +
+
+ ); + })} +
+ {editClassification.error && ( +

{editClassification.error.message}

+ )} +
+ ); +} + function StageButton({ label, activeLabel, isPending, + disabled = false, + disabledReason, onClick, }: { label: string; activeLabel: string; isPending: boolean; + disabled?: boolean; + disabledReason?: string; onClick: () => void; }) { + const blocked = isPending || disabled; return (
diff --git a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/page.tsx b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/page.tsx index f28787ff..522fc8ce 100644 --- a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/page.tsx +++ b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/page.tsx @@ -67,6 +67,14 @@ const STATUS_CONFIG = { body: "Matches ready, writing into your portfolio.", cta: false, }, + finalising: { + icon: ArrowPathIcon, + iconBg: "bg-blue-50", + iconColor: "text-blue-500", + title: "Uploading to ARA", + body: "Creating your properties from the matched addresses. This can take a little while for large files.", + cta: false, + }, complete: { icon: CheckCircleIcon, iconBg: "bg-green-50", @@ -167,6 +175,7 @@ export default async function BulkUploadDetailPage(props: { {(statusKey === "processing" || statusKey === "combining" || statusKey === "awaiting_review" || + statusKey === "finalising" || statusKey === "complete" || statusKey === "failed") && upload.taskId && ( @@ -174,6 +183,7 @@ export default async function BulkUploadDetailPage(props: { portfolioSlug={slug} portfolioId={upload.portfolioId} uploadId={uploadId} + serverStatus={upload.status} isDomnaUser={isDomnaUser} /> )} diff --git a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/page.tsx b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/page.tsx index 281f5744..6aaca9c2 100644 --- a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/page.tsx +++ b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/page.tsx @@ -14,6 +14,7 @@ import { const STATUS_LABELS: Record = { ready_for_processing: { label: "Ready", classes: "bg-amber-100 text-amber-700" }, processing: { label: "Processing", classes: "bg-blue-100 text-blue-700" }, + finalising: { label: "Uploading to ARA", classes: "bg-blue-100 text-blue-700" }, complete: { label: "Complete", classes: "bg-green-100 text-green-700" }, failed: { label: "Failed", classes: "bg-red-100 text-red-700" }, }; diff --git a/src/app/portfolio/[slug]/(portfolio)/landlord-overrides/page.tsx b/src/app/portfolio/[slug]/(portfolio)/landlord-overrides/page.tsx new file mode 100644 index 00000000..9e9ac249 --- /dev/null +++ b/src/app/portfolio/[slug]/(portfolio)/landlord-overrides/page.tsx @@ -0,0 +1,101 @@ +import { getServerSession } from "next-auth"; +import { redirect } from "next/navigation"; +import { AuthOptions } from "@/app/api/auth/[...nextauth]/authOptions"; +import { + getLandlordOverrides, + type OverrideRow, + type LandlordOverrideCategory, +} from "@/lib/landlordOverrides/server"; +import { CLASSIFIER_FIELDS } from "@/lib/bulkUpload/columnFields"; + +export default async function LandlordOverridesPage(props: { + params: Promise<{ slug: string }>; +}) { + const { slug } = await props.params; + const session = await getServerSession(AuthOptions); + if (!session) redirect("/login"); + + const results = await getLandlordOverrides(slug); + const total = Object.values(results).reduce((n, rows) => n + rows.length, 0); + + return ( +
+
+

+ Landlord overrides +

+

+ Property facts classified from your bulk-upload descriptions. Read-only + — editing comes later. +

+
+ + {total === 0 ? ( +
+ No classified values yet. They appear here once a bulk upload with + landlord-description columns has been processed. +
+ ) : ( + CLASSIFIER_FIELDS.map((field) => ( + + )) + )} +
+ ); +} + +function OverrideSection({ title, rows }: { title: string; rows: OverrideRow[] }) { + return ( +
+
+

+ {title} +

+ + {rows.length} {rows.length === 1 ? "value" : "values"} + +
+ {rows.length === 0 ? ( +
+ No values for this category. +
+ ) : ( +
+ {rows.map((row, i) => ( +
+

+ {row.description} +

+

+ {row.value} +

+
+ +
+
+ ))} +
+ )} +
+ ); +} + +function SourceBadge({ source }: { source: string }) { + const isUser = source === "user"; + return ( + + {isUser ? "user" : "classifier"} + + ); +} diff --git a/src/lib/bulkUpload/client.ts b/src/lib/bulkUpload/client.ts index 536bd07f..3f3a34e9 100644 --- a/src/lib/bulkUpload/client.ts +++ b/src/lib/bulkUpload/client.ts @@ -96,6 +96,100 @@ export function useSetColumnMapping(portfolioId: string, uploadId: string) { }); } +// field -> description -> resolved enum, for the multi-entry sample (issue #298). +export type SampleClassifications = Record>; + +export function useEditClassification(portfolioId: string, uploadId: string) { + const queryClient = useQueryClient(); + return useMutation({ + mutationFn: async (input) => { + const res = await fetch( + `/api/portfolio/${portfolioId}/bulk-uploads/${uploadId}/classifications`, + { + method: "PATCH", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(input), + }, + ); + if (!res.ok) throw await parseError(res, "Failed to save classification."); + }, + onSuccess: () => { + queryClient.invalidateQueries({ + queryKey: [...bulkUploadKeys.progress(uploadId), "classifications"], + }); + }, + }); +} + +// Sample classifications for the review panels PLUS the still-`Unknown` +// descriptions that gate Finalise (ADR-0006). +export interface ClassificationsView { + classifications: SampleClassifications; + unknown: Record; +} + +export function useSampleClassifications( + portfolioId: string, + uploadId: string, + enabled: boolean, +) { + return useQuery({ + queryKey: [...bulkUploadKeys.progress(uploadId), "classifications"], + enabled, + queryFn: async () => { + const res = await fetch( + `/api/portfolio/${portfolioId}/bulk-uploads/${uploadId}/classifications`, + ); + if (!res.ok) throw await parseError(res, "Failed to load classifications."); + const body = await res.json(); + return { + classifications: (body.classifications ?? {}) as SampleClassifications, + unknown: (body.unknown ?? {}) as Record, + }; + }, + }); +} + +export function useConfirmMultiEntryOrdering(portfolioId: string, uploadId: string) { + const queryClient = useQueryClient(); + return useMutation }>({ + mutationFn: async (input) => { + const res = await fetch( + `/api/portfolio/${portfolioId}/bulk-uploads/${uploadId}/multi-entry-ordering`, + { + method: "PATCH", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(input), + }, + ); + if (!res.ok) throw await parseError(res, "Failed to save ordering."); + return res.json(); + }, + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: bulkUploadKeys.progress(uploadId) }); + }, + }); +} + +// Records the "Verify classification" acknowledgement (ADR-0004 Step 1), +// unlocking Finalise. Per-row corrections go through useEditClassification. +export function useConfirmVerification(portfolioId: string, uploadId: string) { + const queryClient = useQueryClient(); + return useMutation({ + mutationFn: async () => { + const res = await fetch( + `/api/portfolio/${portfolioId}/bulk-uploads/${uploadId}/verify-classification`, + { method: "PATCH" }, + ); + if (!res.ok) throw await parseError(res, "Failed to confirm classification."); + return res.json(); + }, + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: bulkUploadKeys.progress(uploadId) }); + }, + }); +} + export function useStartAddressMatching(portfolioId: string, uploadId: string) { const queryClient = useQueryClient(); return useMutation<{ taskId: string }, Error, void>({ @@ -113,7 +207,11 @@ export function useStartAddressMatching(portfolioId: string, uploadId: string) { }); } -export function useBulkUploadProgress(portfolioId: string, uploadId: string) { +export function useBulkUploadProgress( + portfolioId: string, + uploadId: string, + options?: { onSuccess?: (data: ProgressView) => void }, +) { return useQuery({ queryKey: bulkUploadKeys.progress(uploadId), queryFn: async () => { @@ -127,6 +225,9 @@ export function useBulkUploadProgress(portfolioId: string, uploadId: string) { const status = data?.upload.status; return status && isTerminalStatus(status) ? false : 3000; }, + // v4 onSuccess fires after each successful poll; callers use it to react to a + // status transition (e.g. refresh the server page once it goes terminal). + onSuccess: options?.onSuccess, }); } diff --git a/src/lib/bulkUpload/columnFields.ts b/src/lib/bulkUpload/columnFields.ts new file mode 100644 index 00000000..f29bff27 --- /dev/null +++ b/src/lib/bulkUpload/columnFields.ts @@ -0,0 +1,100 @@ +// Single source of truth for BulkUpload column mapping. +// +// The mapping is stored as `field → source CSV header` (one entry per mapped +// internal field). One source header may feed several CLASSIFIER fields (e.g. +// "Property Type" → both property_type and built_form_type) but at most one +// ADDRESS field — see docs/adr/0003-classifier-triggers-as-address-subtask.md +// and docs/wip/landlord-override-frontend-plan.md (Q2.2, Q3). +// +// Classifier field `value`s mirror the Model service's ClassifiableColumn names +// (property_type / built_form_type / wall_type / roof_type) so the mapping can +// be forwarded to the lambda trigger verbatim. + +export type InternalFieldKind = "address" | "classifier"; + +export interface InternalField { + value: string; + label: string; + kind: InternalFieldKind; + required: boolean; + // Canonical header written into the address-matching CSV (address fields only). + outputHeader?: string; +} + +export const INTERNAL_FIELDS: InternalField[] = [ + { value: "address_1", label: "Address 1", kind: "address", required: true, outputHeader: "Address 1" }, + { value: "address_2", label: "Address 2", kind: "address", required: false, outputHeader: "Address 2" }, + { value: "address_3", label: "Address 3", kind: "address", required: false, outputHeader: "Address 3" }, + { value: "postcode", label: "Postcode", kind: "address", required: true, outputHeader: "postcode" }, + { value: "internal_reference", label: "Internal Reference", kind: "address", required: false, outputHeader: "Internal Reference" }, + { value: "property_type", label: "Property Type", kind: "classifier", required: false }, + { value: "built_form_type", label: "Built Form", kind: "classifier", required: false }, + { value: "wall_type", label: "Wall Type", kind: "classifier", required: false }, + { value: "roof_type", label: "Roof Type", kind: "classifier", required: false }, +]; + +export const ADDRESS_FIELDS = INTERNAL_FIELDS.filter((f) => f.kind === "address"); +export const CLASSIFIER_FIELDS = INTERNAL_FIELDS.filter((f) => f.kind === "classifier"); +export const CLASSIFIER_FIELD_VALUES = CLASSIFIER_FIELDS.map((f) => f.value); +export const REQUIRED_FIELD_VALUES = INTERNAL_FIELDS.filter((f) => f.required).map((f) => f.value); + +// Sentinel for an unmapped field in the UI dropdown ("Not provided"). +export const NOT_PROVIDED = ""; + +// header → address field detection. Classifier fields are never auto-detected +// (Q2.1): mapping them is always an explicit user choice. +export function autoDetectField(header: string): string | null { + const h = header.toLowerCase().replace(/[\s_\-]/g, ""); + if (/^(address|addr)(line)?(1|one)?$/.test(h)) return "address_1"; + if (/^(address|addr)(line)?(2|two)|^street$/.test(h)) return "address_2"; + if (/^(address|addr)(line)?(3|three)|^locality$|^town$|^city$/.test(h)) return "address_3"; + if (/^post(al)?code$|^postcode$|^pcode$/.test(h)) return "postcode"; + if (/^(internal)?ref(erence)?$|^id$/.test(h)) return "internal_reference"; + return null; +} + +// Build the initial field→header mapping: keep any existing choices, then +// auto-fill address fields from the headers (first matching header wins). +export function buildInitialMapping( + sourceHeaders: string[], + existing?: Record, +): Record { + const mapping: Record = { ...(existing ?? {}) }; + for (const header of sourceHeaders) { + const field = autoDetectField(header); + if (!field) continue; + if (mapping[field] === undefined) mapping[field] = header; + } + return mapping; +} + +// Validation shared by the client (live) and the server (authoritative). +// Returns the first problem as a message, or null when the mapping is valid. +export function validateColumnMapping(mapping: Record): string | null { + for (const field of REQUIRED_FIELD_VALUES) { + if (!mapping[field]) { + const label = INTERNAL_FIELDS.find((f) => f.value === field)?.label ?? field; + return `${label} must be mapped to a column.`; + } + } + const usedAddressHeaders = new Set(); + for (const field of ADDRESS_FIELDS) { + const header = mapping[field.value]; + if (!header) continue; + if (usedAddressHeaders.has(header)) { + return `Column "${header}" is mapped to more than one address field.`; + } + usedAddressHeaders.add(header); + } + return null; +} + +// The classifier subset of the mapping (category → source header) that gets +// forwarded to the lambda trigger. Address fields are intentionally excluded. +export function classifierMapping(mapping: Record): Record { + const out: Record = {}; + for (const field of CLASSIFIER_FIELD_VALUES) { + if (mapping[field]) out[field] = mapping[field]; + } + return out; +} diff --git a/src/lib/bulkUpload/multiEntry.test.ts b/src/lib/bulkUpload/multiEntry.test.ts new file mode 100644 index 00000000..da0eb253 --- /dev/null +++ b/src/lib/bulkUpload/multiEntry.test.ts @@ -0,0 +1,110 @@ +import { describe, expect, it } from "vitest"; +import { detectMultiEntry, assignmentToPermutation, isPermutation } from "./multiEntry"; + +// field -> source header, the shape stored on the upload. property_type and +// built_form_type intentionally share a header (the classifier allows it). +const MAPPING = { + address_1: "Addr", + postcode: "PC", + property_type: "Property Type", + wall_type: "Walls", + roof_type: "Roofs", +}; + +describe("detectMultiEntry", () => { + it("returns an empty summary when no classifier columns are mapped", () => { + const rows = [{ Addr: "1 High St", PC: "AB1 2CD" }]; + const summary = detectMultiEntry(rows, { address_1: "Addr", postcode: "PC" }); + expect(summary.sample).toBeNull(); + expect(summary.largestCount).toBe(0); + }); + + it("captures a single-part verify sample when classifier columns exist but no row is multi-entry", () => { + const rows = [ + { Addr: "1 High St", PC: "AB1 2CD", "Property Type": "House: EndTerrace", Walls: "Cavity: AsBuilt", Roofs: "Pitched: 200mm" }, + ]; + const summary = detectMultiEntry(rows, MAPPING); + + // Not multi-entry, but there IS a sample to verify (ADR-0004 Step 1). + expect(summary.largestCount).toBe(0); + expect(summary.sample).not.toBeNull(); + expect(summary.sample!.count).toBe(1); + expect(summary.sample!.address).toBe("1 High St, AB1 2CD"); + // All three mapped classifier columns are present, one entry each. + expect(summary.sample!.columns.map((c) => c.field).sort()).toEqual([ + "property_type", + "roof_type", + "wall_type", + ]); + expect(summary.sample!.columns.every((c) => c.entries.length === 1)).toBe(true); + }); + + it("picks the largest-count row as the sample and reports it as multi-entry", () => { + const rows = [ + { Addr: "1 High St", PC: "AB1 2CD", "Property Type": "House: EndTerrace", Walls: "Cavity: AsBuilt", Roofs: "Pitched: 200mm" }, + { Addr: "2 Low St", PC: "AB3 4EF", "Property Type": "House: Detached", Walls: "Cavity: AsBuilt, Cavity: Filled", Roofs: "Flat: AsBuilt, Pitched: 200mm" }, + ]; + const summary = detectMultiEntry(rows, MAPPING); + + expect(summary.largestCount).toBe(2); + expect(summary.countDistribution).toEqual({ "2": 1 }); + expect(summary.sample!.address).toBe("2 Low St, AB3 4EF"); + expect(summary.sample!.count).toBe(2); + // multiValuedFields are the ones that actually split. + expect([...summary.multiValuedFields].sort()).toEqual(["roof_type", "wall_type"]); + // The whole-dwelling Property Type column is still carried (for Step 1), + // with a single entry — Step 2 filters it out by entries.length. + const propertyCol = summary.sample!.columns.find((c) => c.field === "property_type"); + expect(propertyCol?.entries).toHaveLength(1); + const wallCol = summary.sample!.columns.find((c) => c.field === "wall_type"); + expect(wallCol?.entries.map((e) => e.raw)).toEqual(["Cavity: AsBuilt", "Cavity: Filled"]); + }); + + it("captures one ordering sample per distinct count (ADR-0004 amendment)", () => { + const rows = [ + { Addr: "1 High St", PC: "AB1 2CD", "Property Type": "House: Detached", Walls: "Cavity: AsBuilt", Roofs: "Pitched: 200mm" }, // count 1 + { Addr: "2 Low St", PC: "AB3 4EF", "Property Type": "House: Semi", Walls: "Cavity, Solid", Roofs: "Flat, Pitched" }, // count 2 + { Addr: "3 Mid Rd", PC: "AB5 6GH", "Property Type": "House: Mid", Walls: "Cavity, Solid, Render", Roofs: "Flat, Pitched, Slate" }, // count 3 + { Addr: "4 Side Ln", PC: "AB7 8IJ", "Property Type": "House: Other", Walls: "Brick, Stone", Roofs: "Tile, Slate" }, // count 2 again + ]; + const summary = detectMultiEntry(rows, MAPPING); + + expect(summary.largestCount).toBe(3); + expect(summary.countDistribution).toEqual({ "2": 2, "3": 1 }); + + // A sample for every count >= 2 — and only those. + expect(Object.keys(summary.samplesByCount ?? {}).sort()).toEqual(["2", "3"]); + expect(summary.samplesByCount!["2"].count).toBe(2); + expect(summary.samplesByCount!["3"].count).toBe(3); + // The count-2 sample is the FIRST count-2 row, not the count-3 one. + expect(summary.samplesByCount!["2"].address).toBe("2 Low St, AB3 4EF"); + const wall2 = summary.samplesByCount!["2"].columns.find((c) => c.field === "wall_type"); + expect(wall2?.entries.map((e) => e.raw)).toEqual(["Cavity", "Solid"]); + const wall3 = summary.samplesByCount!["3"].columns.find((c) => c.field === "wall_type"); + expect(wall3?.entries.map((e) => e.raw)).toEqual(["Cavity", "Solid", "Render"]); + }); + + it("normalizes descriptions to lower-case (matching the classifier's key)", () => { + const rows = [{ Addr: "1 High St", PC: "AB1 2CD", "Property Type": "House: EndTerrace", Walls: "", Roofs: "" }]; + const summary = detectMultiEntry(rows, MAPPING); + const entry = summary.sample!.columns.find((c) => c.field === "property_type")!.entries[0]; + expect(entry.raw).toBe("House: EndTerrace"); + expect(entry.description).toBe("house: endterrace"); + }); +}); + +describe("ordering helpers", () => { + it("isPermutation accepts a bijection and rejects duplicates/out-of-range", () => { + expect(isPermutation([0, 1])).toBe(true); + expect(isPermutation([1, 0, 2])).toBe(true); + expect(isPermutation([0, 0])).toBe(false); + expect(isPermutation([0, 2])).toBe(false); + }); + + it("assignmentToPermutation inverts assignment[pos]=slot to permutation[slot]=pos", () => { + // file position 1 holds the main building (slot 0), position 0 is extension 1. + expect(assignmentToPermutation([1, 0])).toEqual([1, 0]); + expect(assignmentToPermutation([0, 1, 2])).toEqual([0, 1, 2]); + expect(assignmentToPermutation([2, 0, 1])).toEqual([1, 2, 0]); + }); +}); diff --git a/src/lib/bulkUpload/multiEntry.ts b/src/lib/bulkUpload/multiEntry.ts new file mode 100644 index 00000000..d537360a --- /dev/null +++ b/src/lib/bulkUpload/multiEntry.ts @@ -0,0 +1,192 @@ +// Multi-entry building-part detection (ADR-0004). +// +// A BulkUpload row can carry several comma-separated entries in a physical- +// element column (e.g. Walls = "Cavity: AsBuilt (1976-1982), Cavity: +// FilledCavity"). Each entry is a Building part (Main building + Extensions). +// This module finds that pattern and captures one sample — the row with the +// MOST building parts — so the user can confirm the ordering downstream. +// +// Pure + I/O-free so it's unit-testable; the start-address-matching route runs +// it over the already-parsed upload rows and persists the result on the upload. + +import { ADDRESS_FIELDS, classifierMapping } from "./columnFields"; +import type { + MultiEntryEntry, + MultiEntryColumn, + MultiEntrySample, + MultiEntrySummary, +} from "@/app/db/schema/bulk_address_uploads"; + +// The jsonb shape lives with the column (schema/bulk_address_uploads.ts) so the +// migration is self-contained; re-export here for callers of this module. +export type { + MultiEntryEntry, + MultiEntryColumn, + MultiEntrySample, + MultiEntrySummary, + MultiEntryOrdering, +} from "@/app/db/schema/bulk_address_uploads"; + +// --- Building-part ordering (ADR-0004) --- + +// Label for building-part slot k: slot 0 is the Main building, the rest are +// numbered extensions. +export function partLabel(slot: number): string { + return slot === 0 ? "Main building" : `Extension ${slot}`; +} + +// True when `arr` is a permutation of [0, n-1] (each slot used exactly once). +export function isPermutation(arr: number[]): boolean { + const seen = new Set(arr); + return ( + seen.size === arr.length && + arr.every((n) => Number.isInteger(n) && n >= 0 && n < arr.length) + ); +} + +// The UI collects, per file position, which building-part slot it holds +// (`assignment[pos] = slot`). Storage is keyed the other way — +// `permutation[slot] = pos` — so a consumer can ask "which file position holds +// the main building?". Both are permutations of [0, n-1]; this inverts one to +// the other. +export function assignmentToPermutation(assignment: number[]): number[] { + const permutation = new Array(assignment.length); + assignment.forEach((slot, position) => { + permutation[slot] = position; + }); + return permutation; +} + +export const EMPTY_MULTI_ENTRY_SUMMARY: MultiEntrySummary = { + multiValuedFields: [], + countDistribution: {}, + largestCount: 0, + sample: null, + samplesByCount: {}, +}; + +// Split a cell into building-part entries. Mirrors the classifier's +// split(",") → trim → lower, dropping empty fragments so positions align +// across raw and normalized forms. +export function splitEntries(value: unknown): MultiEntryEntry[] { + return String(value ?? "") + .split(",") + .map((s) => s.trim()) + .filter((s) => s.length > 0) + .map((raw) => ({ raw, description: raw.toLowerCase() })); +} + +// Compose a display address from the mapped address fields (reference excluded). +function buildAddress( + row: Record, + columnMapping: Record, +): string { + const parts: string[] = []; + for (const field of ADDRESS_FIELDS) { + if (field.value === "internal_reference") continue; + const header = columnMapping[field.value]; + if (!header) continue; + const value = String(row[header] ?? "").trim(); + if (value) parts.push(value); + } + return parts.join(", "); +} + +// Scan the mapped classifier columns and capture one sample address. Only +// classifier columns are considered — they're the physical-element descriptions +// we slice into building parts; address columns are single-valued by nature. +// +// The sample serves both review steps (ADR-0004): the largest-count multi-entry +// row when one exists (Step 2 — Confirm order), otherwise the first row carrying +// any classifier value so Step 1 — Verify classification still has something to +// show. `largestCount >= 2` is the multi-entry signal; `sample != null` means +// "there is something to verify". The sample carries every mapped classifier +// column with a value — Step 1 lists them all; Step 2 renders only the +// multi-valued ones. +export function detectMultiEntry( + rows: Array>, + columnMapping: Record, +): MultiEntrySummary { + const classifierCols = Object.entries(classifierMapping(columnMapping)); + if (classifierCols.length === 0) return EMPTY_MULTI_ENTRY_SUMMARY; + + const multiValued = new Set(); + const countDistribution: Record = {}; + let largestCount = 0; + let multiEntryRowIndex = -1; + // Fallback sample for Step 1 when no row is multi-entry: the first row that + // carries any classifier value. + let firstClassifiedRowIndex = -1; + // First row index seen at each distinct count ≥ 2 — one ordering sample per + // count (ADR-0004 amendment): each count needs its own confirmed permutation. + const sampleRowIndexByCount: Record = {}; + + rows.forEach((row, index) => { + let rowMax = 0; + let hasValue = false; + for (const [field, header] of classifierCols) { + const n = splitEntries(row[header]).length; + if (n > 0) hasValue = true; + if (n > 1) multiValued.add(field); + if (n > rowMax) rowMax = n; + } + if (hasValue && firstClassifiedRowIndex === -1) firstClassifiedRowIndex = index; + if (rowMax >= 2) { + const key = String(rowMax); + countDistribution[key] = (countDistribution[key] ?? 0) + 1; + if (sampleRowIndexByCount[key] === undefined) sampleRowIndexByCount[key] = index; + // First row at a new maximum becomes the multi-entry (Step 1) sample. + if (rowMax > largestCount) { + largestCount = rowMax; + multiEntryRowIndex = index; + } + } + }); + + const sampleRowIndex = + multiEntryRowIndex !== -1 ? multiEntryRowIndex : firstClassifiedRowIndex; + if (sampleRowIndex === -1) { + return { + multiValuedFields: [...multiValued], + countDistribution, + largestCount, + sample: null, + samplesByCount: {}, + }; + } + + // One ordering sample per distinct count, so the UI can render a panel per + // count and the user confirms each independently. + const samplesByCount: Record = {}; + for (const [count, rowIndex] of Object.entries(sampleRowIndexByCount)) { + samplesByCount[count] = sampleFromRow(rows[rowIndex], columnMapping, classifierCols, Number(count)); + } + + return { + multiValuedFields: [...multiValued], + countDistribution, + largestCount, + sample: sampleFromRow( + rows[sampleRowIndex], + columnMapping, + classifierCols, + largestCount >= 2 ? largestCount : 1, + ), + samplesByCount, + }; +} + +// Build the sample for one row: its display address plus every mapped classifier +// column carrying a value. Step 1 lists all columns; Step 2's order table filters +// to the multi-valued ones (single-value columns are whole-dwelling facts). +function sampleFromRow( + row: Record, + columnMapping: Record, + classifierCols: Array<[string, string]>, + count: number, +): MultiEntrySample { + const columns: MultiEntryColumn[] = classifierCols + .map(([field, header]) => ({ field, header, entries: splitEntries(row[header]) })) + .filter((column) => column.entries.length > 0); + return { address: buildAddress(row, columnMapping), count, columns }; +} diff --git a/src/lib/bulkUpload/s3Keys.ts b/src/lib/bulkUpload/s3Keys.ts new file mode 100644 index 00000000..8188fcf4 --- /dev/null +++ b/src/lib/bulkUpload/s3Keys.ts @@ -0,0 +1,23 @@ +// Shared S3 key conventions + the synthetic join-column name for bulk-upload +// artifacts. The finaliser join (ADR-0006) depends on the classifier CSV key +// being built *identically* in two places — where the CSV is written +// (start-address-matching) and where the finaliser is dispatched +// (dispatchFinaliser) — and on the `source_row_id` column appearing in both the +// address CSV and the classifier CSV. Keeping the convention here is the single +// source of truth that stops those two callers drifting. + +export const BULK_UPLOAD_INPUT_PREFIX = "bulk_onboarding_inputs"; + +export function addressCsvKey(portfolioId: string, uploadId: string): string { + return `${BULK_UPLOAD_INPUT_PREFIX}/${portfolioId}/${uploadId}.csv`; +} + +export function classifierCsvKey(portfolioId: string, uploadId: string): string { + return `${BULK_UPLOAD_INPUT_PREFIX}/${portfolioId}/${uploadId}-classifier.csv`; +} + +// The synthetic per-row UUID column. Minted at start-address-matching and +// emitted into both CSVs so the finaliser can join a row's identity (combiner +// output) to its raw descriptions (classifier CSV). The Model finaliser reads +// this exact header — keep the two in sync. +export const SOURCE_ROW_ID_COLUMN = "source_row_id"; diff --git a/src/lib/bulkUpload/server.ts b/src/lib/bulkUpload/server.ts index a5785c39..ca303f9b 100644 --- a/src/lib/bulkUpload/server.ts +++ b/src/lib/bulkUpload/server.ts @@ -1,9 +1,25 @@ import { db } from "@/app/db/db"; import { bulkAddressUploads } from "@/app/db/schema/bulk_address_uploads"; +import { + landlordPropertyTypeOverrides, + landlordBuiltFormTypeOverrides, + landlordWallTypeOverrides, + landlordRoofTypeOverrides, + PropertyTypeValues, + BuiltFormTypeValues, + WallTypeValues, + RoofTypeValues, +} from "@/app/db/schema/landlord_overrides"; import { tasks } from "@/app/db/schema/tasks/tasks"; import { subTasks } from "@/app/db/schema/tasks/subtask"; -import { count, desc, eq, sql } from "drizzle-orm"; +import { and, count, desc, eq, inArray, sql } from "drizzle-orm"; import type { BulkUpload, BulkUploadStatus, ProgressView, TaskSummary } from "./types"; +import { validateColumnMapping, classifierMapping } from "./columnFields"; +import { classifierCsvKey } from "./s3Keys"; +import { retrofitDataS3Bucket } from "@/app/utils/s3"; +import { SUBTASK_SERVICE } from "./types"; +import type { MultiEntrySummary } from "./multiEntry"; +import { isPermutation } from "./multiEntry"; const REMAP_ALLOWED: ReadonlySet = new Set([ "ready_for_processing", @@ -78,6 +94,12 @@ async function loadTaskSummary(taskId: string): Promise { totalSubtasks: count(subTasks.id), completedSubtasks: sql`count(case when lower(${subTasks.status}) in ('completed', 'complete') then 1 end)::int`, failedSubtasks: sql`count(case when lower(${subTasks.status}) in ('failed', 'failure', 'error') then 1 end)::int`, + addressTotal: sql`count(case when (${subTasks.service} = 'address2uprn' or ${subTasks.service} is null) and ${subTasks.id} is not null then 1 end)::int`, + addressCompleted: sql`count(case when (${subTasks.service} = 'address2uprn' or ${subTasks.service} is null) and lower(${subTasks.status}) in ('completed', 'complete') then 1 end)::int`, + addressFailed: sql`count(case when (${subTasks.service} = 'address2uprn' or ${subTasks.service} is null) and lower(${subTasks.status}) in ('failed', 'failure', 'error') then 1 end)::int`, + classifierTotal: sql`count(case when ${subTasks.service} = 'landlord_description_overrides' then 1 end)::int`, + classifierCompleted: sql`count(case when ${subTasks.service} = 'landlord_description_overrides' and lower(${subTasks.status}) in ('completed', 'complete') then 1 end)::int`, + classifierFailed: sql`count(case when ${subTasks.service} = 'landlord_description_overrides' and lower(${subTasks.status}) in ('failed', 'failure', 'error') then 1 end)::int`, }) .from(tasks) .leftJoin(subTasks, eq(subTasks.taskId, tasks.id)) @@ -94,11 +116,300 @@ export async function getProgressView(uploadId: string): Promise): string | null { - const values = Object.values(mapping); - if (!values.includes("address_1")) return "Mapping must include address_1."; - if (!values.includes("postcode")) return "Mapping must include postcode."; - return null; +// Persist the multi-entry building-part detection (ADR-0004). Computed once at +// start-address-matching from the already-parsed rows; read back on the +// awaiting_review surface. Only this column is touched, so the later +// status/taskId update leaves it intact. +export async function saveMultiEntrySummary( + uploadId: string, + summary: MultiEntrySummary, +): Promise { + await db + .update(bulkAddressUploads) + .set({ multiEntrySummary: summary }) + .where(eq(bulkAddressUploads.id, uploadId)); +} + +// Classifier field -> resolved enum, keyed by the normalized description that +// the classifier persisted. Empty inner maps / absent fields mean "not +// classified (yet)". Read-only (ADR-0004, issue #298). +export type SampleClassifications = Record>; + +// Look up the classifier's resolved enums for one field's descriptions. One +// branch per category keeps drizzle's per-table enum typing intact. +async function lookupOverrides( + field: string, + portfolioId: bigint, + descriptions: string[], +): Promise | null> { + switch (field) { + case "property_type": + return db + .select({ description: landlordPropertyTypeOverrides.description, value: landlordPropertyTypeOverrides.value }) + .from(landlordPropertyTypeOverrides) + .where(and(eq(landlordPropertyTypeOverrides.portfolioId, portfolioId), inArray(landlordPropertyTypeOverrides.description, descriptions))); + case "built_form_type": + return db + .select({ description: landlordBuiltFormTypeOverrides.description, value: landlordBuiltFormTypeOverrides.value }) + .from(landlordBuiltFormTypeOverrides) + .where(and(eq(landlordBuiltFormTypeOverrides.portfolioId, portfolioId), inArray(landlordBuiltFormTypeOverrides.description, descriptions))); + case "wall_type": + return db + .select({ description: landlordWallTypeOverrides.description, value: landlordWallTypeOverrides.value }) + .from(landlordWallTypeOverrides) + .where(and(eq(landlordWallTypeOverrides.portfolioId, portfolioId), inArray(landlordWallTypeOverrides.description, descriptions))); + case "roof_type": + return db + .select({ description: landlordRoofTypeOverrides.description, value: landlordRoofTypeOverrides.value }) + .from(landlordRoofTypeOverrides) + .where(and(eq(landlordRoofTypeOverrides.portfolioId, portfolioId), inArray(landlordRoofTypeOverrides.description, descriptions))); + default: + return null; + } +} + +// The classifier's enums for the review samples' entries, joined by the +// normalized description (exact match — the summary stored it the way the +// classifier persists it, so no re-normalization here). Read-only. Covers the +// Step 1 verify sample AND every per-count ordering sample, since smaller-count +// panels may show descriptions the largest-count sample doesn't (ADR-0004 +// amendment). +export async function getSampleClassifications( + uploadId: string, +): Promise { + const upload = await loadById(uploadId); + const summary = upload?.multiEntrySummary; + if (!upload || !summary || !summary.sample) return {}; + + // Gather distinct descriptions per field across all samples. + const allSamples = [summary.sample, ...Object.values(summary.samplesByCount ?? {})]; + const descriptionsByField: Record> = {}; + for (const sample of allSamples) { + for (const column of sample.columns) { + const set = (descriptionsByField[column.field] ??= new Set()); + for (const e of column.entries) set.add(e.description); + } + } + + const portfolioId = BigInt(upload.portfolioId); + const result: SampleClassifications = {}; + for (const [field, descSet] of Object.entries(descriptionsByField)) { + const descriptions = [...descSet]; + if (descriptions.length === 0) continue; + const rows = await lookupOverrides(field, portfolioId, descriptions); + if (!rows) continue; + result[field] = Object.fromEntries(rows.map((r) => [r.description, r.value])); + } + return result; +} + +// Descriptions still classified `Unknown` per field, portfolio-wide (ADR-0006). +// `Unknown` is the classifier's "couldn't decide" marker; v2 treats it as +// never-final, so the Finalise gate blocks until the user maps every one to a +// real value (and the finaliser fails loudly if any slips through). Portfolio- +// wide is the right scope under the one-real-upload assumption (ADR-0006). +export type UnknownOverrides = Record; + +const UNKNOWN_VALUE = "Unknown"; + +async function unknownForField(field: string, portfolioId: bigint): Promise { + switch (field) { + case "property_type": + return ( + await db + .select({ description: landlordPropertyTypeOverrides.description }) + .from(landlordPropertyTypeOverrides) + .where(and(eq(landlordPropertyTypeOverrides.portfolioId, portfolioId), eq(landlordPropertyTypeOverrides.value, UNKNOWN_VALUE))) + ).map((r) => r.description); + case "built_form_type": + return ( + await db + .select({ description: landlordBuiltFormTypeOverrides.description }) + .from(landlordBuiltFormTypeOverrides) + .where(and(eq(landlordBuiltFormTypeOverrides.portfolioId, portfolioId), eq(landlordBuiltFormTypeOverrides.value, UNKNOWN_VALUE))) + ).map((r) => r.description); + case "wall_type": + return ( + await db + .select({ description: landlordWallTypeOverrides.description }) + .from(landlordWallTypeOverrides) + .where(and(eq(landlordWallTypeOverrides.portfolioId, portfolioId), eq(landlordWallTypeOverrides.value, UNKNOWN_VALUE))) + ).map((r) => r.description); + case "roof_type": + return ( + await db + .select({ description: landlordRoofTypeOverrides.description }) + .from(landlordRoofTypeOverrides) + .where(and(eq(landlordRoofTypeOverrides.portfolioId, portfolioId), eq(landlordRoofTypeOverrides.value, UNKNOWN_VALUE))) + ).map((r) => r.description); + default: + return []; + } +} + +export async function getUnknownOverrides(portfolioId: string): Promise { + const pid = BigInt(portfolioId); + const result: UnknownOverrides = {}; + for (const field of ["property_type", "built_form_type", "wall_type", "roof_type"]) { + const descriptions = await unknownForField(field, pid); + if (descriptions.length > 0) result[field] = descriptions; + } + return result; +} + +// Valid enum values per classifier category, for validating a user edit. +const CATEGORY_VALUES: Record = { + property_type: PropertyTypeValues, + built_form_type: BuiltFormTypeValues, + wall_type: WallTypeValues, + roof_type: RoofTypeValues, +}; + +// Upsert one user override (source='user') into the right category table. One +// branch per table keeps drizzle's per-table typing intact; the unique +// (portfolio_id, description) drives the conflict. Sets source='user' so the +// classifier's `ON CONFLICT … WHERE source='classifier'` never re-clobbers it. +async function upsertUserOverride( + field: string, + portfolioId: bigint, + description: string, + value: string, +): Promise { + const now = new Date(); + switch (field) { + case "property_type": + await db.insert(landlordPropertyTypeOverrides) + .values({ portfolioId, description, value, source: "user" }) + .onConflictDoUpdate({ + target: [landlordPropertyTypeOverrides.portfolioId, landlordPropertyTypeOverrides.description], + set: { value, source: "user", updatedAt: now }, + }); + return; + case "built_form_type": + await db.insert(landlordBuiltFormTypeOverrides) + .values({ portfolioId, description, value, source: "user" }) + .onConflictDoUpdate({ + target: [landlordBuiltFormTypeOverrides.portfolioId, landlordBuiltFormTypeOverrides.description], + set: { value, source: "user", updatedAt: now }, + }); + return; + case "wall_type": + await db.insert(landlordWallTypeOverrides) + .values({ portfolioId, description, value, source: "user" }) + .onConflictDoUpdate({ + target: [landlordWallTypeOverrides.portfolioId, landlordWallTypeOverrides.description], + set: { value, source: "user", updatedAt: now }, + }); + return; + case "roof_type": + await db.insert(landlordRoofTypeOverrides) + .values({ portfolioId, description, value, source: "user" }) + .onConflictDoUpdate({ + target: [landlordRoofTypeOverrides.portfolioId, landlordRoofTypeOverrides.description], + set: { value, source: "user", updatedAt: now }, + }); + return; + } +} + +export type SetClassificationOutcome = + | { kind: "ok" } + | { kind: "invalid"; reason: string }; + +// Correct one classification, persisted as a user override (ADR-0004, issue +// #299). Keyed by (portfolio, description), so it changes the mapping for every +// row with that description portfolio-wide. The description is normalized to +// match the classifier's stored key. +export async function setClassificationOverride( + portfolioId: string, + field: string, + description: string, + value: string, +): Promise { + const allowed = CATEGORY_VALUES[field]; + if (!allowed) return { kind: "invalid", reason: `Unknown category '${field}'` }; + if (!allowed.includes(value)) + return { kind: "invalid", reason: `'${value}' is not a valid ${field}` }; + + const normalized = description.trim().toLowerCase(); + if (!normalized) return { kind: "invalid", reason: "Empty description" }; + + await upsertUserOverride(field, BigInt(portfolioId), normalized, value); + return { kind: "ok" }; +} + +export type SetOrderingOutcome = + | { kind: "ok"; upload: BulkUpload } + | { kind: "not_found" } + | { kind: "wrong_state"; current: string } + | { kind: "not_multi_entry" } + | { kind: "invalid_ordering"; reason: string }; + +// Persist the user-confirmed building-part ordering (ADR-0004, amended +// 2026-06-05). Allowed only at awaiting_review and only when the upload is +// multi-entry. Each distinct count ≥ 2 needs its own permutation; the UI confirms +// one count at a time, so we MERGE the supplied permutations into any already +// stored, validate each is a bijection, and only mark `confirmed` once EVERY +// detected count has a permutation (which gates Finalise). +export async function setMultiEntryOrdering( + uploadId: string, + permutations: Record, +): Promise { + const upload = await loadById(uploadId); + if (!upload) return { kind: "not_found" }; + if (upload.status !== "awaiting_review") + return { kind: "wrong_state", current: upload.status }; + + const summary = upload.multiEntrySummary; + // A sample now exists for non-multi-entry uploads too (Step 1's verify + // sample), so "is multi-entry" is largestCount >= 2, not "has a sample". + if (!summary || summary.largestCount < 2) + return { kind: "not_multi_entry" }; + + for (const [count, permutation] of Object.entries(permutations)) { + if (permutation.length !== Number(count) || !isPermutation(permutation)) + return { kind: "invalid_ordering", reason: `Ordering for ${count} parts is not a valid arrangement.` }; + } + + // Merge with any counts confirmed earlier, then decide whether every detected + // count (the keys of countDistribution, all ≥ 2) now has a permutation. + const merged = { ...(upload.multiEntryOrdering?.permutations ?? {}), ...permutations }; + const requiredCounts = Object.keys(summary.countDistribution); + const confirmed = requiredCounts.every( + (c) => Array.isArray(merged[c]) && merged[c].length === Number(c), + ); + + const [updated] = await db + .update(bulkAddressUploads) + .set({ multiEntryOrdering: { permutations: merged, confirmed } }) + .where(eq(bulkAddressUploads.id, uploadId)) + .returning(); + if (!updated) return { kind: "not_found" }; + return { kind: "ok", upload: updated }; +} + +export type SetVerifyAckOutcome = + | { kind: "ok"; upload: BulkUpload } + | { kind: "not_found" } + | { kind: "wrong_state"; current: string }; + +// Record the user's "Verify classification" acknowledgement (ADR-0004 Step 1). +// Allowed only at awaiting_review. Gates Finalise whenever the upload has +// classifier columns — independent of multi-entry, hence its own column rather +// than a flag on multiEntryOrdering. +export async function setVerifyAck(uploadId: string): Promise { + const upload = await loadById(uploadId); + if (!upload) return { kind: "not_found" }; + if (upload.status !== "awaiting_review") + return { kind: "wrong_state", current: upload.status }; + + const [updated] = await db + .update(bulkAddressUploads) + .set({ verifyAck: true }) + .where(eq(bulkAddressUploads.id, uploadId)) + .returning(); + if (!updated) return { kind: "not_found" }; + return { kind: "ok", upload: updated }; } export type SetMappingOutcome = @@ -116,7 +427,7 @@ export async function setColumnMapping( if (!REMAP_ALLOWED.has(upload.status as BulkUploadStatus)) return { kind: "invalid_status", current: upload.status }; - const reason = validateMapping(mapping); + const reason = validateColumnMapping(mapping); if (reason) return { kind: "invalid_mapping", reason }; const [updated] = await db @@ -174,6 +485,7 @@ export async function triggerAddressMatching(args: { .values({ taskId: task.id, status: "waiting", + service: SUBTASK_SERVICE.address, inputs: JSON.stringify({ bulk_upload_id: args.uploadId }), }) .returning(); @@ -209,9 +521,69 @@ export async function triggerAddressMatching(args: { return { kind: "ok", taskId: task.id }; } +// Co-fires the landlord classifier as a subtask under the address task. Reads a +// dedicated classifier CSV (the classifier columns converted from the upload by +// the start-address-matching route — the address-matching CSV strips the +// description columns), so the lambda always parses a real CSV even for +// .xlsx/.xls uploads. Non-blocking: a trigger failure marks only the classifier +// subtask, so address matching is unaffected. See ADR-0003. +export async function triggerClassifier(args: { + taskId: string; + uploadId: string; + s3Uri: string; + sessionToken: string | undefined; +}): Promise { + const upload = await loadById(args.uploadId); + if (!upload || !upload.columnMapping) return; + + const columnMapping = classifierMapping(upload.columnMapping); + if (Object.keys(columnMapping).length === 0) return; + + const [subTask] = await db + .insert(subTasks) + .values({ + taskId: args.taskId, + status: "waiting", + service: SUBTASK_SERVICE.classifier, + inputs: JSON.stringify({ bulk_upload_id: args.uploadId }), + }) + .returning(); + + const payload = { + task_id: args.taskId, + sub_task_id: subTask.id, + s3_uri: args.s3Uri, + portfolio_id: Number(upload.portfolioId), + column_mapping: columnMapping, + }; + + const trigger = await triggerFastApiPipeline({ + endpoint: "/v1/bulk-uploads/trigger-landlord-overrides", + payload, + sessionToken: args.sessionToken, + }); + + if (!trigger.ok) { + await db + .update(subTasks) + .set({ + status: "failed", + outputs: JSON.stringify({ error: trigger.message }), + }) + .where(eq(subTasks.id, subTask.id)); + return; + } + + await db + .update(subTasks) + .set({ status: "in progress", inputs: JSON.stringify(payload) }) + .where(eq(subTasks.id, subTask.id)); +} + export type CombineRetriggerOutcome = | { kind: "triggered"; taskId: string; subTaskId: string } | { kind: "already_combined" } + | { kind: "already_dispatched" } | { kind: "not_found" } | { kind: "missing_task" } | { kind: "trigger_failed"; status: number; message: string }; @@ -225,6 +597,24 @@ export async function requestCombineRetrigger(args: { if (!upload.taskId) return { kind: "missing_task" }; if (upload.combinedOutputS3Uri) return { kind: "already_combined" }; + // CAS: atomically claim `processing → combining` — the double-dispatch guard + // (mirrors dispatchFinaliser's awaiting_review → finalising, ADR-0005). Of two + // rapid "Run Combiner" clicks exactly one flips the row; the loser updates 0 + // rows and bails, so only one combiner subtask is ever dispatched. It also + // closes the window where status is still `processing` because the backend + // hasn't written `combining` yet. + const claimed = await db + .update(bulkAddressUploads) + .set({ status: "combining" }) + .where( + and( + eq(bulkAddressUploads.id, args.uploadId), + eq(bulkAddressUploads.status, "processing"), + ), + ) + .returning(); + if (claimed.length === 0) return { kind: "already_dispatched" }; + const [subTask] = await db .insert(subTasks) .values({ taskId: upload.taskId, status: "waiting" }) @@ -237,8 +627,23 @@ export async function requestCombineRetrigger(args: { payload, sessionToken: args.sessionToken, }); - if (!trigger.ok) + if (!trigger.ok) { + // Roll the claim back so the user can retry, and fail the subtask. + await Promise.all([ + db + .update(bulkAddressUploads) + .set({ status: "processing" }) + .where(eq(bulkAddressUploads.id, args.uploadId)), + db + .update(subTasks) + .set({ + status: "failed", + outputs: JSON.stringify({ error: trigger.message }), + }) + .where(eq(subTasks.id, subTask.id)), + ]); return { kind: "trigger_failed", status: trigger.status, message: trigger.message }; + } await db .update(subTasks) @@ -265,9 +670,125 @@ export async function loadForFinalize(uploadId: string): Promise { - await db +export type DispatchFinaliserOutcome = + | { kind: "ok"; taskId: string; subTaskId: string } + | { kind: "not_found" } + | { kind: "already_finalized" } + | { kind: "not_yet_combined" } + | { kind: "wrong_state"; current: string } + | { kind: "missing_task" } + | { kind: "trigger_failed"; status: number; message: string }; + +// Dispatch the async bulk_upload_finaliser (ADR-0005). Replaces the old +// synchronous property insert + markFinalized. Order matters: +// 1. loadForFinalize — rich guards (combined output present, awaiting_review). +// 2. CAS claim `awaiting_review → finalising` — the double-dispatch guard: +// of two simultaneous clicks exactly one updates a row; the loser gets 409. +// 3. create the finaliser subtask under the upload's existing Task + POST the +// trigger. On trigger failure, revert the status so the user can retry and +// mark the subtask failed. The backend writes the terminal complete/failed. +export async function dispatchFinaliser(args: { + uploadId: string; + sessionToken: string | undefined; +}): Promise { + const guarded = await loadForFinalize(args.uploadId); + switch (guarded.kind) { + case "not_found": + return { kind: "not_found" }; + case "already_finalized": + return { kind: "already_finalized" }; + case "not_yet_combined": + return { kind: "not_yet_combined" }; + case "wrong_state": + return { kind: "wrong_state", current: guarded.current }; + } + const upload = guarded.upload; + if (!upload.taskId) return { kind: "missing_task" }; + + // CAS: atomically claim the dispatch. Only the request that flips + // awaiting_review → finalising proceeds; a concurrent one updates 0 rows. + const claimed = await db .update(bulkAddressUploads) - .set({ status: "complete" }) - .where(eq(bulkAddressUploads.id, uploadId)); + .set({ status: "finalising" }) + .where( + and( + eq(bulkAddressUploads.id, args.uploadId), + eq(bulkAddressUploads.status, "awaiting_review"), + ), + ) + .returning(); + if (claimed.length === 0) { + const current = await loadById(args.uploadId); + if (current?.status === "complete") return { kind: "already_finalized" }; + return { kind: "wrong_state", current: current?.status ?? "unknown" }; + } + + const [subTask] = await db + .insert(subTasks) + .values({ + taskId: upload.taskId, + status: "waiting", + service: SUBTASK_SERVICE.finaliser, + inputs: JSON.stringify({ bulk_upload_id: args.uploadId }), + }) + .returning(); + + // v2 (ADR-0006): the finaliser also writes property_overrides for UPRN-matched + // rows, which needs the classifier CSV (raw descriptions, joined to the + // combiner output by `source_row_id`) and the confirmed building-part ordering. + // Both are derivable here — we already hold the upload row, and dispatch runs + // after the user confirms ordering, so the value is final. + // - classifier_s3_uri: null when no classifier columns were mapped (no + // classifier CSV was written; the finaliser then writes no overrides). + // - multi_entry_ordering: permutations keyed by entry-count; {} when the + // upload is not multi-entry (every cell is a single building part → part 0). + const classifierMap = classifierMapping(upload.columnMapping ?? {}); + const classifierS3Uri = + Object.keys(classifierMap).length > 0 + ? `s3://${retrofitDataS3Bucket()}/${classifierCsvKey(upload.portfolioId, args.uploadId)}` + : null; + + const payload = { + task_id: upload.taskId, + sub_task_id: subTask.id, + s3_uri: upload.combinedOutputS3Uri, + portfolio_id: Number(upload.portfolioId), + bulk_upload_id: args.uploadId, + classifier_s3_uri: classifierS3Uri, + multi_entry_ordering: upload.multiEntryOrdering?.permutations ?? {}, + // classifier category → source CSV header, so the finaliser knows which + // classifier-CSV column feeds each override_component (ADR-0006). + column_mapping: classifierMap, + }; + + const trigger = await triggerFastApiPipeline({ + endpoint: "/v1/bulk-uploads/trigger-finaliser", + payload, + sessionToken: args.sessionToken, + }); + + if (!trigger.ok) { + // Roll the claim back so the user can retry, and fail the subtask. + await Promise.all([ + db + .update(bulkAddressUploads) + .set({ status: "awaiting_review" }) + .where(eq(bulkAddressUploads.id, args.uploadId)), + db + .update(subTasks) + .set({ + status: "failed", + outputs: JSON.stringify({ error: trigger.message }), + }) + .where(eq(subTasks.id, subTask.id)), + ]); + return { kind: "trigger_failed", status: trigger.status, message: trigger.message }; + } + + await db + .update(subTasks) + .set({ status: "in progress", inputs: JSON.stringify(payload) }) + .where(eq(subTasks.id, subTask.id)); + + return { kind: "ok", taskId: upload.taskId, subTaskId: subTask.id }; } diff --git a/src/lib/bulkUpload/types.ts b/src/lib/bulkUpload/types.ts index c2b1cb63..9d6a6049 100644 --- a/src/lib/bulkUpload/types.ts +++ b/src/lib/bulkUpload/types.ts @@ -6,6 +6,10 @@ export const BULK_UPLOAD_STATUSES = [ "processing", "combining", "awaiting_review", + // In-flight state of the async finaliser (ADR-0005); mirrors `combining`. The + // status column is free text, so no enum migration is needed. UI renders this + // as "Uploading to ARA" — see STATUS_LABELS. + "finalising", "complete", "failed", ] as const; @@ -14,6 +18,32 @@ export type BulkUploadStatus = (typeof BULK_UPLOAD_STATUSES)[number]; export type BulkUpload = typeof bulkAddressUploads.$inferSelect; +// sub_task.service values. NULL is treated as address (legacy rows + the +// backend-spawned postcode-split children, which don't set it). See ADR-0003. +export const SUBTASK_SERVICE = { + address: "address2uprn", + classifier: "landlord_description_overrides", + finaliser: "bulk_upload_finaliser", +} as const; + +// User-facing label for a BulkUpload status. The persisted enum value stays +// canonical (`finalising`); the product surface for that state is "Uploading to +// ARA" (ADR-0005 — a display-layer label, never the enum name). +export const STATUS_LABELS: Record = { + ready_for_processing: "Ready for processing", + mapping_complete: "Mapping complete", + processing: "Processing", + combining: "Combining", + awaiting_review: "Awaiting review", + finalising: "Uploading to ARA", + complete: "Complete", + failed: "Failed", +}; + +export function statusLabel(status: string): string { + return STATUS_LABELS[status] ?? status; +} + export type TaskSummary = { id: string; taskSource: string; @@ -25,6 +55,14 @@ export type TaskSummary = { totalSubtasks: number; completedSubtasks: number; failedSubtasks: number; + // Per-pipeline breakdown so onboarding progress can separate address + // matching from landlord classification (ADR-0003). + addressTotal: number; + addressCompleted: number; + addressFailed: number; + classifierTotal: number; + classifierCompleted: number; + classifierFailed: number; }; export type ProgressView = { diff --git a/src/lib/landlordOverrides/server.ts b/src/lib/landlordOverrides/server.ts new file mode 100644 index 00000000..c8f98eab --- /dev/null +++ b/src/lib/landlordOverrides/server.ts @@ -0,0 +1,79 @@ +import { db } from "@/app/db/db"; +import { + landlordPropertyTypeOverrides, + landlordBuiltFormTypeOverrides, + landlordWallTypeOverrides, + landlordRoofTypeOverrides, +} from "@/app/db/schema/landlord_overrides"; +import { asc, eq } from "drizzle-orm"; + +export interface OverrideRow { + description: string; + value: string; + source: string; +} + +export type LandlordOverrideCategory = + | "property_type" + | "built_form_type" + | "wall_type" + | "roof_type"; + +export type LandlordOverrideResults = Record; + +const EMPTY: LandlordOverrideResults = { + property_type: [], + built_form_type: [], + wall_type: [], + roof_type: [], +}; + +// Reads the four landlord_*_overrides tables for a portfolio. The portfolio id +// is a bigint FK; the bulk-upload flow carries it as a numeric string. +export async function getLandlordOverrides( + portfolioId: string +): Promise { + if (!/^\d+$/.test(portfolioId)) return EMPTY; + const pid = BigInt(portfolioId); + + const [property_type, built_form_type, wall_type, roof_type] = await Promise.all([ + db + .select({ + description: landlordPropertyTypeOverrides.description, + value: landlordPropertyTypeOverrides.value, + source: landlordPropertyTypeOverrides.source, + }) + .from(landlordPropertyTypeOverrides) + .where(eq(landlordPropertyTypeOverrides.portfolioId, pid)) + .orderBy(asc(landlordPropertyTypeOverrides.description)), + db + .select({ + description: landlordBuiltFormTypeOverrides.description, + value: landlordBuiltFormTypeOverrides.value, + source: landlordBuiltFormTypeOverrides.source, + }) + .from(landlordBuiltFormTypeOverrides) + .where(eq(landlordBuiltFormTypeOverrides.portfolioId, pid)) + .orderBy(asc(landlordBuiltFormTypeOverrides.description)), + db + .select({ + description: landlordWallTypeOverrides.description, + value: landlordWallTypeOverrides.value, + source: landlordWallTypeOverrides.source, + }) + .from(landlordWallTypeOverrides) + .where(eq(landlordWallTypeOverrides.portfolioId, pid)) + .orderBy(asc(landlordWallTypeOverrides.description)), + db + .select({ + description: landlordRoofTypeOverrides.description, + value: landlordRoofTypeOverrides.value, + source: landlordRoofTypeOverrides.source, + }) + .from(landlordRoofTypeOverrides) + .where(eq(landlordRoofTypeOverrides.portfolioId, pid)) + .orderBy(asc(landlordRoofTypeOverrides.description)), + ]); + + return { property_type, built_form_type, wall_type, roof_type }; +}