From f6545c2fa0fa0066a148bed8f47c7faf03eba01e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 5 Jun 2026 12:19:06 +0000 Subject: [PATCH] property override --- .claude/settings.json | 9 +- CONTEXT.md | 8 +- .../bulk-upload-finaliser-v2-handover.md | 181 +++++++++++++----- .../[uploadId]/classifications/route.ts | 21 +- .../start-address-matching/route.ts | 37 +++- src/app/db/schema/bulk_address_uploads.ts | 19 +- .../[uploadId]/OnboardingProgress.tsx | 160 ++++++++++++++-- src/lib/bulkUpload/client.ts | 14 +- src/lib/bulkUpload/multiEntry.test.ts | 24 +++ src/lib/bulkUpload/multiEntry.ts | 60 ++++-- src/lib/bulkUpload/s3Keys.ts | 23 +++ src/lib/bulkUpload/server.ts | 132 +++++++++++-- 12 files changed, 564 insertions(+), 124 deletions(-) create mode 100644 src/lib/bulkUpload/s3Keys.ts diff --git a/.claude/settings.json b/.claude/settings.json index 0343c161..c425aebf 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -53,7 +53,10 @@ "Bash(grep -E '\\\\.sql$')", "Bash(cd /home/vscode/po-migration *)", "Read(//home/vscode/po-migration/**)", - "Bash(python -m py_compile applications/bulk_upload_finaliser/handler.py orchestration/bulk_upload_finaliser_orchestrator.py)" + "Bash(python -m py_compile applications/bulk_upload_finaliser/handler.py orchestration/bulk_upload_finaliser_orchestrator.py)", + "Bash(python -m py_compile repositories/property/property_repository.py repositories/property/property_postgres_repository.py orchestration/bulk_upload_finaliser_orchestrator.py applications/bulk_upload_finaliser/handler.py tests/orchestration/test_bulk_upload_finaliser_orchestrator.py)", + "Bash(python -m py_compile tests/orchestration/fakes.py)", + "Bash(curl -s -o /dev/null -w \"%{http_code}\" --max-time 30 http://localhost:3000/home)" ], "deny": [ "Bash(npx drizzle-kit generate)", @@ -73,7 +76,9 @@ "/workspaces/home/github/Model/deployment/terraform/lambda/bulkUploadFinaliser", "/workspaces/home/github/Model/deployment/terraform/lambda/fast-api", "/workspaces/home/github/Model/backend/app/db/functions", - "/workspaces/home/github/Model/repositories/bulk_upload" + "/workspaces/home/github/Model/repositories/bulk_upload", + "/workspaces/home/github/Model/tests/orchestration", + "/workspaces/home/github/Model/.github/workflows" ] } } diff --git a/CONTEXT.md b/CONTEXT.md index 4e9234ef..bfd17635 100644 --- a/CONTEXT.md +++ b/CONTEXT.md @@ -42,9 +42,13 @@ A landlord-supplied fact about a property that takes precedence over EPC-derived _Avoid_: customer data, manual override, landlord data **Property override**: -The per-Property fact layer — one resolved fact per `(Property, Building part, component)`, where component is one of `wall_type`/`roof_type`/`property_type`/`built_form_type`. Holds a **snapshot** of the resolved enum value (a denormalised copy of the VocabularyMapping outcome at finalise time, so two Properties sharing a description can later diverge), plus the original spreadsheet text it resolved from. Materialised by the finaliser; see [ADR-0005](./docs/adr/0005-async-bulk-upload-finaliser.md). (Table created; population is follow-up work.) +The per-Property fact layer — one resolved fact per `(Property, Building part, component)`, where component is one of `wall_type`/`roof_type`/`property_type`/`built_form_type`. Holds a **snapshot** of the resolved enum value (a denormalised copy of the VocabularyMapping outcome at finalise time, so two Properties sharing a description can later diverge), plus the original spreadsheet text it resolved from. Materialised by the finaliser **for UPRN-matched Properties only** (v2); the resolved value is never `UNKNOWN` — the Verify step forces every `UNKNOWN` to be mapped before Finalise, and an unresolved description fails the run. See [ADR-0005](./docs/adr/0005-async-bulk-upload-finaliser.md) (table) and [ADR-0006](./docs/adr/0006-property-overrides-join-and-no-uprn-defer.md) (population). _Avoid_: per-property mapping, property fact, override row +**Source row id**: +A synthetic UUID minted per source-file row at `start-address-matching` and written into **both** the address CSV and the classifier CSV. It is the stable join key that lets the finaliser tie a row's identity (combiner output → `property_id`) to that row's raw descriptions (classifier CSV), since neither file preserves row order and `Internal Reference` is absent from the classifier CSV. See [ADR-0006](./docs/adr/0006-property-overrides-join-and-no-uprn-defer.md). +_Avoid_: row index, internal reference (a separate, optional landlord field) + **VocabularyMapping**: The translation from a Landlord's free-text description in a BulkUpload column (e.g. `"cavity: filledcavity"`) to a canonical domain enum value (e.g. `WallType.CAVITY`). Produced by a `ColumnClassifier` (today an LLM, tomorrow possibly a lookup table or rules engine) in the Model service. Stored per-Portfolio, one row per `(category, description)`. A row carries provenance (`classifier` or `user`) so user overrides survive re-classification. _Avoid_: column mapping (that's a separate concept — see `ColumnMapping` above), classification, dictionary @@ -125,6 +129,8 @@ _Avoid_: override, adjustment, correction > > **Dev:** "And if **Finalise** runs and 30% of rows have no **UPRN**?" > **Domain expert:** "Those still get imported as **Properties** — just without a UPRN — and the BulkUpload moves to `complete`. Manual cleanup happens later in the property table." +> +> _(Planned change — v3 / [ADR-0006](./docs/adr/0006-property-overrides-join-and-no-uprn-defer.md): no-UPRN rows will move to a separate staging table to be re-matched, so `property` holds only matched rows. v2 does **not** change this yet — and v2 writes **Property overrides** only for the UPRN-matched rows.)_ ## Flagged ambiguities diff --git a/docs/design/bulk-upload-finaliser-v2-handover.md b/docs/design/bulk-upload-finaliser-v2-handover.md index 281d4ec4..0a0ef5b5 100644 --- a/docs/design/bulk-upload-finaliser-v2-handover.md +++ b/docs/design/bulk-upload-finaliser-v2-handover.md @@ -5,6 +5,85 @@ > (async finalise that writes `property`) is **shipped and working end-to-end**. > This doc assumes no memory of the v1 session. +## 0. Design resolved — grilling outcome (2026-06-05) + +> The open questions in §9 were resolved in a design session. **This section is now +> authoritative**; the later sections are kept for background but where they conflict +> with this one, this one wins. The new v2 ADR is +> [`docs/adr/0006-property-overrides-join-and-no-uprn-defer.md`](../adr/0006-property-overrides-join-and-no-uprn-defer.md); +> ADR-0004 was amended for per-count ordering capture. + +**Spine.** Populate `property_overrides` at finalise **for UPRN-matched rows only**. +Join the classifier descriptions to the combiner identity by a **synthetic UUID +`source_row_id`** — *not* `Internal Reference` (it is **absent from the classifier +CSV**, and optional anyway) and *not* by carrying description columns through +`address2uprn` (architecture B, rejected). This is architecture **(A)** with a +purpose-built key. + +**No-UPRN rows are deferred to v3.** v1 *currently* inserts them as `property` rows; +**v2 changes nothing in the property insert** and simply writes no overrides for +them. The eventual home for unmatched rows is a **separate staging table** (Model B): +`property` holds only matched rows; unmatched inputs (with their descriptions) live in +the staging table until a *different UPRN matcher* assigns a UPRN and promotes them. +"Found vs unfound" is a view across both tables, **not** a flag on `property`. v3 owns +the property-insert change + the staging table + the matcher-rerun UX together. + +**Frontend work** (`/workspaces/assessment-model`): +1. **Mint `source_row_id`** (UUID) in `start-address-matching` right after + `readRows()`, and **explicitly emit it as a column in both** `buildAddressCsv` and + `buildClassifierCsv` — both project a *fixed* column set, so attaching it to the + row object is not enough. It survives `address2uprn`→combiner like any input column + (carried as `additional_info`); **verify against a real combiner output**. +2. **Per-count ordering capture** (supersedes ADR-0004's largest-count-only): + `detectMultiEntry` keeps a sample **per distinct count**; `OnboardingProgress` + renders one ordering panel **per count ≥ 2**. The jsonb type and + `setMultiEntryOrdering` validation already accept all counts — **no migration, no + backend-validation change**. +3. **Verify gate hardened**: Finalise is blocked while **any** description is still + `UNKNOWN`. `UNKNOWN` is now a **transient "needs review" marker, never a final + value** (this retires the old "`UNKNOWN` is legitimate" line in §7). +4. **`dispatchFinaliser`** adds **two fields to the trigger body**: + `classifier_s3_uri` and `multi_entry_ordering` (it already reads the + `bulk_address_uploads` row, and dispatch happens *after* the user confirms + ordering, so the value is final). The classifier S3 key comes from a **shared + `classifierCsvKey(portfolioId, uploadId)` helper** used by both the writer and the + dispatcher (the key is not stored anywhere today — convention only). + +**Backend work** (`/workspaces/home/github/Model`): +5. Grow the trigger schema in two places — FastAPI `FinaliserTriggerRequest` and + Lambda `BulkUploadFinaliserTriggerBody` — with `classifier_s3_uri` + + `multi_entry_ordering`. Handler stays trigger-driven (no new `bulk_address_uploads` + coupling). +6. **`PropertyOverrideRow`** table mirror + a **sibling `PropertyOverrideRepository`** + (own aggregate; upsert on `(property_id, override_component, building_part)`), and a + **read-only `LandlordOverrideRepository`** that loads a portfolio's vocabulary + **per component into dicts once** (the vocabulary is deduplicated, not per-row). +7. **Orchestrator step**, in the same `commit_scope`: + - bulk `SELECT (portfolio_id, uprn) → id` for the run's UPRN rows → in-memory map; + - join classifier↔combiner rows by `source_row_id`; + - **uniform comma-split all four components** → `permutations[count]` → parts + (count-1 cell → `building_part = 0`); the finaliser needs **no fallback** because + every count ≥ 2 has a confirmed permutation; + - resolve each part's **normalized** description against the override dicts; + - `original_spreadsheet_description` = the **raw** entry text (un-normalized); + - **empty cell → write no row**; **non-empty but unresolved (or `UNKNOWN`) → raise** + → `commit_scope` rolls back → `_mark_failed` flips the upload to `failed` + (**fail loudly, no partial writes**); + - write only the classifier components actually **mapped** in `columnMapping`; + - **no `source` column in v2** — upsert is unconditional for now. + +**Locked assumptions (load-bearing — see ADR-0006).** +- **One real upload per user.** A re-upload only adds *new* properties (ones not + previously included), never re-describes existing ones → part-keys are append-only + across uploads → **upsert-only, no delete-orphans** is correct and complete. +- **Per-count consistency.** One ordering per count, confirmed from one sample, applies + to every cell of that count in the file (extends ADR-0004's bet to all counts). +- **Per-cell count.** `Walls` may split into 3 while `Roofs` splits into 2 in the same + row; each cell is ordered by *its own* entry count. +- **Classification completes before `awaiting_review`**, and the hardened verify gate + forces every `UNKNOWN` to be resolved — so an unresolved description at finalise is a + genuine defect, hence fail-loud. + ## 1. Where v1 left things (read first) v1 made **Finalise** an async dispatched Lambda that writes `property` rows. The @@ -87,38 +166,19 @@ To write one `property_overrides` row, v2 must assemble **four inputs**: | split a multi-valued cell → building parts | `multiEntryOrdering` on `bulk_address_uploads` | | description → `override_value` | `landlord_*_overrides` (resolve by normalized description) | -### Two open hazards to resolve first (do these before writing code) +### Two open hazards — both RESOLVED (see §0) -1. **Join key between the classifier CSV and the combiner output.** Both derive from - the same upload rows, but **row order is NOT preserved** through postcode-split + - combine. So you need a stable per-row key present in *both* files. `Internal - Reference` is the candidate — **verify it survives into both** the address CSV - (→ combiner output) and the classifier CSV. If it doesn't, this is the first thing - to fix. +1. **Join key (RESOLVED).** Investigation confirmed `Internal Reference` is in the + address CSV + combiner output but **NOT in the classifier CSV**, and is optional. + So architecture (A)-by-`Internal Reference` is dead. **Resolution: mint a synthetic + UUID `source_row_id`** in `start-address-matching` after `readRows()`, emitted as an + explicit column in *both* CSVs. It is the join key. (Architecture (A) with a + purpose-built key; (B) "carry descriptions through `address2uprn`" was rejected.) -2. **`property_id` for unmatched (no-UPRN) rows.** v1's insert is `onConflictDoNothing` - and returns no ids. To attach overrides you need each row's `property.id`. For - UPRN rows you can re-select by `(portfolio_id, uprn)`; **no-UPRN rows can't be - re-found that way.** Likely fix: change the property insert to `RETURNING id` - mapped back to source rows (and decide the dedup/skip semantics for the RETURNING - path, since `onConflictDoNothing` returns nothing for conflicting rows). - -### Two candidate architectures (evaluate against real sample files) - -- **(A) Post-hoc join.** Keep the two files; the finaliser reads the combiner output - (UPRN/identity) and the classifier CSV (descriptions) and joins by `Internal - Reference`. Splits each multi-valued cell into parts via `multiEntryOrdering`, - resolves each part's description against `landlord_*_overrides`, and writes one row - per `(property, part, component)`. Lowest pipeline change; depends entirely on a - reliable join key. -- **(B) Carry descriptions through the pipeline.** Include the description columns in - the *address* CSV at `start-address-matching` so they flow through `address2uprn` - (which preserves input columns via `**row`) into the combiner output. Then the - finaliser reads **one** file with UPRN + descriptions in the same row — no join, no - key hazard. Costs a change to the address-CSV construction (frontend - `start-address-matching` route) and re-verifying `address2uprn`/combiner. Cleaner - long-term; bigger blast radius. **Recommended to seriously consider** — it deletes - hazard #1 entirely. +2. **`property_id` for no-UPRN rows (RESOLVED by descoping).** v2 writes overrides + **only for UPRN rows**, whose `property.id` is re-found by `(portfolio_id, uprn)` + — so **no `RETURNING` correlation is needed**. No-UPRN rows are deferred to v3 + (Model B staging table); v2 leaves the property insert untouched. ## 6. `multiEntryOrdering` — how to split cells into parts @@ -147,7 +207,10 @@ Four per-component tables in `src/app/db/schema/landlord_overrides.ts` component's pgEnum, plus a `source` (`classifier`|`user`). Resolve a normalized description → `value`. The frontend already does this read in `src/lib/bulkUpload/server.ts` (`lookupOverrides`) — mirror that mapping on the -backend. `UNKNOWN` is a legitimate stored value. +backend. **`UNKNOWN` is now a transient "needs review" marker, never a final +value** (resolved in §0): the verify gate forces the user to map every `UNKNOWN` +before Finalise, so a `UNKNOWN` (or unresolvable description) reaching the finaliser +is a defect and **fails the run loudly**. ## 8. Backend pieces to build (DDD, mirror v1) @@ -176,27 +239,47 @@ Key v1 files to extend (all in the Model repo): - Packaging test: `tests/test_lambda_packaging.py` will flag any new top-level import the Dockerfile doesn't `COPY` (v1 hit this with `datatypes/`). -## 9. Open questions for v2 to decide +## 9. Open questions — all RESOLVED (see §0 + ADR-0006) -- Join key confirmed (`Internal Reference` in both files) — or adopt architecture (B)? -- `property_id` for no-UPRN rows: `RETURNING id` strategy + dedup semantics. -- Non-largest-count `multiEntryOrdering` derivation rule (ADR-0004 deferred it). -- Does the trigger body grow, or does the handler read `bulk_address_uploads` - (`multiEntryOrdering`, classifier S3 URI) directly? -- Re-materialise semantics confirmed: recalculate overrides every finalise (snapshot - refreshes), `property` rows untouched. +- **Join key** → synthetic UUID `source_row_id` in both CSVs (not `Internal + Reference`, not architecture B). +- **`property_id` for no-UPRN rows** → out of scope; v2 is UPRN-only, no-UPRN deferred + to v3 (Model B). UPRN rows re-found by `(portfolio_id, uprn)`; no `RETURNING`. +- **Non-largest-count `multiEntryOrdering`** → capture a confirmed permutation for + **every** count ≥ 2 in the UI (supersedes ADR-0004); finaliser needs no fallback. +- **Trigger body vs handler-reads-DB** → **grow the trigger body** (`classifier_s3_uri` + + `multi_entry_ordering`), built in `dispatchFinaliser`. +- **Re-materialise** → recalculate every finalise via **upsert-only** on + `(property_id, override_component, building_part)`; **no delete-orphans** (justified + by the one-real-upload assumption); `property` rows untouched. -## 10. First steps in the new context +## 10. Implementation order (design is settled — build it) -1. Read §1 docs (esp. ADR-0004) + `CONTEXT.md`. -2. Get a **real sample**: the combiner output CSV and the `{uploadId}-classifier.csv` - for one dev upload, and inspect whether `Internal Reference` is in both → settle - hazard #1 / pick architecture (A) vs (B). -3. Decide the `property_id`-for-no-UPRN approach (hazard #2). -4. Build `PropertyOverrideRow` + repository + orchestrator step + handler wiring, - TDD against fakes (mirror `tests/orchestration/test_bulk_upload_finaliser_orchestrator.py`). -5. Update `CONTEXT.md` ("Property override" → populated) and add a v2 ADR if the - join/architecture choice is a real trade-off. +Frontend first (the finaliser depends on `source_row_id` + per-count ordering): + +1. **`source_row_id`**: shared `classifierCsvKey` helper; mint the UUID in + `start-address-matching` after `readRows()`; emit it as an explicit column in both + `buildAddressCsv` and `buildClassifierCsv`. Verify it lands in a real combiner + output. +2. **Per-count ordering**: `detectMultiEntry` keeps a sample per count; + `OnboardingProgress` renders one ordering panel per count ≥ 2. Drop the + largest-count-only assumption in `setMultiEntryOrdering` if it requires the largest. +3. **Verify gate**: block Finalise while any classification is `UNKNOWN`. +4. **`dispatchFinaliser`**: add `classifier_s3_uri` + `multi_entry_ordering` to the + trigger payload. + +Backend: + +5. Grow `FinaliserTriggerRequest` (FastAPI) + `BulkUploadFinaliserTriggerBody` (Lambda). +6. `PropertyOverrideRow` mirror + sibling `PropertyOverrideRepository` (upsert) + + read-only `LandlordOverrideRepository`. +7. Orchestrator step (join → split → resolve → upsert; fail-loud on unresolved), + TDD against fakes (mirror + `tests/orchestration/test_bulk_upload_finaliser_orchestrator.py`). +8. Handler wiring; watch `tests/test_lambda_packaging.py` for Dockerfile COPY gaps. + +Docs (done in this session): ADR-0004 amended, ADR-0006 added, `CONTEXT.md` +"Property override" updated. ## 11. Verification notes (environment) diff --git a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/classifications/route.ts b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/classifications/route.ts index e897319a..5be47873 100644 --- a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/classifications/route.ts +++ b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/classifications/route.ts @@ -1,11 +1,17 @@ -import { getSampleClassifications, setClassificationOverride } from "@/lib/bulkUpload/server"; +import { + getSampleClassifications, + getUnknownOverrides, + setClassificationOverride, +} from "@/lib/bulkUpload/server"; import { NextRequest, NextResponse } from "next/server"; import { getServerSession } from "next-auth"; import { AuthOptions } from "@/app/api/auth/[...nextauth]/authOptions"; import { z } from "zod"; -// Read-only: the classifier's resolved enums for the multi-entry sample's -// entries, keyed by field -> description -> value (ADR-0004, issue #298). +// Read-only: the classifier's resolved enums for the review sample's entries +// (field -> description -> value), plus the descriptions still classified +// `Unknown` portfolio-wide — the Finalise gate blocks until that list is empty +// and the user can resolve each via PATCH below (ADR-0004 #298, ADR-0006). export async function GET( _request: NextRequest, { params }: { params: Promise<{ portfolioId: string; uploadId: string }> } @@ -13,9 +19,12 @@ export async function GET( const session = await getServerSession(AuthOptions); if (!session) return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); - const { uploadId } = await params; - const classifications = await getSampleClassifications(uploadId); - return NextResponse.json({ classifications }, { status: 200 }); + const { portfolioId, uploadId } = await params; + const [classifications, unknown] = await Promise.all([ + getSampleClassifications(uploadId), + getUnknownOverrides(portfolioId), + ]); + return NextResponse.json({ classifications, unknown }, { status: 200 }); } const PatchSchema = z.object({ diff --git a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts index 22bfa8b9..e7e6ae50 100644 --- a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts +++ b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts @@ -1,11 +1,13 @@ import { NextRequest, NextResponse } from "next/server"; import { getServerSession } from "next-auth"; +import { randomUUID } from "node:crypto"; import { AuthOptions } from "@/app/api/auth/[...nextauth]/authOptions"; import { createS3Client, createRetrofitDataS3Client, retrofitDataS3Bucket } from "@/app/utils/s3"; import * as XLSX from "xlsx"; import { loadForAddressMatching, saveMultiEntrySummary, triggerAddressMatching, triggerClassifier } from "@/lib/bulkUpload/server"; import { readSessionToken } from "@/lib/session"; import { ADDRESS_FIELDS, classifierMapping } from "@/lib/bulkUpload/columnFields"; +import { addressCsvKey, classifierCsvKey, SOURCE_ROW_ID_COLUMN } from "@/lib/bulkUpload/s3Keys"; import { detectMultiEntry } from "@/lib/bulkUpload/multiEntry"; type SheetRow = Record; @@ -35,11 +37,17 @@ function buildAddressCsv( if (!outputHeaders.includes("postcode")) return { error: 'Mapping must include "postcode"' }; + // Carry the synthetic per-row join key through to the combiner output, so the + // finaliser can re-associate a UPRN-matched row with its classifier + // descriptions (ADR-0006). It rides `address2uprn` as a preserved input column. + outputHeaders.push(SOURCE_ROW_ID_COLUMN); + const outputRows = rows.map((row) => { const out: SheetRow = {}; for (const [outName, src] of Object.entries(outputToSource)) { out[outName] = row[src] ?? ""; } + out[SOURCE_ROW_ID_COLUMN] = row[SOURCE_ROW_ID_COLUMN] ?? ""; return out; }); @@ -56,10 +64,17 @@ function buildClassifierCsv( rows: SheetRow[], classifierMap: Record // category → source header ): string { - const headers = [...new Set(Object.values(classifierMap))]; + const sourceHeaders = [...new Set(Object.values(classifierMap))]; + // Emit the synthetic join key alongside the classifier columns so the + // finaliser can join this row's descriptions to its combiner identity by + // `source_row_id` (ADR-0006). `buildClassifierCsv` projects a fixed column + // set, so the key must be added explicitly — attaching it to the row is not + // enough. + const headers = [...sourceHeaders, SOURCE_ROW_ID_COLUMN]; const outputRows = rows.map((row) => { const out: SheetRow = {}; - for (const h of headers) out[h] = row[h] ?? ""; + for (const h of sourceHeaders) out[h] = row[h] ?? ""; + out[SOURCE_ROW_ID_COLUMN] = row[SOURCE_ROW_ID_COLUMN] ?? ""; return out; }); const outSheet = XLSX.utils.json_to_sheet(outputRows, { header: headers }); @@ -104,10 +119,20 @@ export async function POST( return NextResponse.json({ error: "Failed to read source file" }, { status: 500 }); } - const rows = readRows(fileBuffer); - if (rows.length === 0) + const parsedRows = readRows(fileBuffer); + if (parsedRows.length === 0) return NextResponse.json({ error: "Empty file" }, { status: 422 }); + // Mint a stable synthetic id per source row, here at the one point both CSVs + // are built from the same array, and write it into both. It is the finaliser's + // join key between the combiner output (identity) and the classifier CSV + // (descriptions) — see ADR-0006. Deterministic ordering is not required: both + // CSVs are produced together in this handler, so they always share values. + const rows = parsedRows.map((row) => ({ + ...row, + [SOURCE_ROW_ID_COLUMN]: randomUUID(), + })); + // Detect multi-entry building parts now, while the whole file is parsed in // memory, so the awaiting_review surface never re-reads it (ADR-0004). await saveMultiEntrySummary(uploadId, detectMultiEntry(rows, upload.columnMapping!)); @@ -116,7 +141,7 @@ export async function POST( if (transformed.error) return NextResponse.json({ error: transformed.error }, { status: 422 }); - const transformedKey = `bulk_onboarding_inputs/${portfolioId}/${uploadId}.csv`; + const transformedKey = addressCsvKey(portfolioId, uploadId); try { await outputS3 .putObject({ @@ -139,7 +164,7 @@ export async function POST( const classifierMap = classifierMapping(upload.columnMapping!); let classifierS3Uri: string | undefined; if (Object.keys(classifierMap).length > 0) { - const classifierKey = `bulk_onboarding_inputs/${portfolioId}/${uploadId}-classifier.csv`; + const classifierKey = classifierCsvKey(portfolioId, uploadId); try { await outputS3 .putObject({ diff --git a/src/app/db/schema/bulk_address_uploads.ts b/src/app/db/schema/bulk_address_uploads.ts index f3dcd82c..b695c140 100644 --- a/src/app/db/schema/bulk_address_uploads.ts +++ b/src/app/db/schema/bulk_address_uploads.ts @@ -22,17 +22,26 @@ export interface MultiEntrySummary { multiValuedFields: string[]; countDistribution: Record; largestCount: number; + // Step 1 (verify) sample: the largest-count row when multi-entry, else the + // first classified row. `null` ⇒ nothing to verify. sample: MultiEntrySample | null; + // Step 2 (order): one sample per distinct entry-count ≥ 2 present in the file, + // keyed by count. Each count needs its OWN confirmed permutation — a smaller + // count's ordering can't be derived from a larger one (ADR-0004, amended + // 2026-06-05). Absent on uploads detected before that amendment. + samplesByCount?: Record; } -// User-confirmed building-part ordering (ADR-0004). Keyed by entry-count so it -// can hold more than one count later; this iteration populates only the -// largest. permutations[count][k] = the 0-based file position holding building -// part k, where 0 = Main building, 1..N-1 = Extension 1..N-1. +// User-confirmed building-part ordering (ADR-0004, amended 2026-06-05). Keyed by +// entry-count: a permutation is captured for EVERY distinct count ≥ 2 in the +// file (the v2 fact layer can't derive one count's order from another). +// permutations[count][k] = the 0-based file position holding building part k, +// where 0 = Main building, 1..N-1 = Extension 1..N-1. // e.g. { "2": [1, 0] } => for 2-part rows the main building is file position 1. export interface MultiEntryOrdering { permutations: Record; - // Set once the user confirms; gates Finalise when the upload is multi-entry. + // True once EVERY detected count ≥ 2 has a permutation; gates Finalise when the + // upload is multi-entry. confirmed: boolean; } diff --git a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx index 041ca389..f5dc7cd0 100644 --- a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx +++ b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx @@ -134,11 +134,29 @@ export default function OnboardingProgress({ const orderingConfirmed = upload.multiEntryOrdering?.confirmed ?? false; const needsVerify = !!sample; const needsOrdering = !!sample && isMultiEntry; + // One ordering panel per distinct count ≥ 2, ascending (ADR-0004 amendment). + // Fall back to the single Step-1 sample for uploads detected before per-count + // capture existed (samplesByCount absent). + const samplesByCount = upload.multiEntrySummary?.samplesByCount; + const orderingSamples: Array<[string, MultiEntrySample]> = + samplesByCount && Object.keys(samplesByCount).length > 0 + ? Object.entries(samplesByCount).sort(([a], [b]) => Number(a) - Number(b)) + : sample && isMultiEntry + ? [[String(sample.count), sample]] + : []; const showStepNumbers = needsVerify && needsOrdering; + // Descriptions still classified `Unknown` block Finalise — the user must map + // every one to a real value, else the finaliser fails loudly (ADR-0006). + const unknownByField = classifications.data?.unknown ?? {}; + const unknownTotal = Object.values(unknownByField).reduce( + (n, descriptions) => n + descriptions.length, + 0, + ); const canFinalize = isAwaitingReview && (!needsVerify || verifyAck) && - (!needsOrdering || orderingConfirmed); + (!needsOrdering || orderingConfirmed) && + unknownTotal === 0; return (
@@ -209,7 +227,7 @@ export default function OnboardingProgress({ {needsVerify && sample && ( )} - {needsOrdering && sample && ( - 0 && ( + )} + {needsOrdering && orderingSamples.length > 0 && ( +
+ {orderingSamples.map(([count, orderSample], i) => ( + 1 + ? `Part group ${i + 1}` + : undefined + } + portfolioId={portfolioId} + uploadId={uploadId} + /> + ))} +
+ )} + {(canRunCombiner || isAwaitingReview) && (
{canRunCombiner && ( @@ -245,9 +284,11 @@ export default function OnboardingProgress({ isPending={finalize.isPending} disabled={!canFinalize} disabledReason={ - needsVerify && !verifyAck - ? "Verify the classification first" - : "Confirm the building-part order first" + unknownTotal > 0 + ? `Resolve ${unknownTotal} unclassified description${unknownTotal === 1 ? "" : "s"} first` + : needsVerify && !verifyAck + ? "Verify the classification first" + : "Confirm the building-part order first" } onClick={() => finalize.mutate(undefined, { onSuccess: () => router.refresh() }) @@ -405,10 +446,12 @@ function VerifyClassificationPanel({ ); } -// Interactive building-part ordering for the largest-count multi-entry sample -// (ADR-0004). The user labels each file position with a building part (one Main -// building + Extensions); the labels must form a permutation. Confirming -// persists the ordering and unlocks Finalise. +// Interactive building-part ordering for ONE entry-count's sample (ADR-0004, +// amended 2026-06-05 — one panel per distinct count). The user labels each file +// position with a building part (one Main building + Extensions); the labels +// must form a permutation. Confirming persists this count's ordering (merged +// server-side with the other counts'); Finalise unlocks once every count is +// confirmed. function MultiEntryOrderingPanel({ sample, ordering, @@ -444,7 +487,10 @@ function MultiEntryOrderingPanel({ return Array.from({ length: count }, (_, i) => i); }); - const confirmed = ordering?.confirmed ?? false; + // Per-panel confirmation reflects whether THIS count's permutation is stored, + // not the global all-counts-confirmed flag — so each panel gives its own + // feedback as the user works through them. + const confirmed = Array.isArray(ordering?.permutations?.[String(count)]); const valid = isPermutation(assignment); const setSlot = (position: number, slot: number) => @@ -558,6 +604,88 @@ function MultiEntryOrderingPanel({ ); } +// Unresolved-classification gate (ADR-0006). Lists every description still +// classified `Unknown` portfolio-wide and lets the user map each to a real value +// via the same per-description override path as Step 1 (it applies portfolio- +// wide). Finalise stays blocked until this list is empty — `Unknown` is never a +// final value, and an unresolved one would fail the import loudly. +function UnresolvedClassificationsPanel({ + unknown, + portfolioId, + uploadId, +}: { + unknown: Record; + portfolioId: string; + uploadId: string; +}) { + const editClassification = useEditClassification(portfolioId, uploadId); + const total = Object.values(unknown).reduce((n, d) => n + d.length, 0); + + return ( +
+

+ Resolve unclassified descriptions ({total}) +

+

+ We couldn't classify these automatically. Map each to a category + before finalising — an unresolved value would fail the import. Edits apply + to every row across the portfolio. +

+ +
+ {Object.entries(unknown).map(([field, descriptions]) => { + const options = (CATEGORY_VALUES[field] ?? []).filter((o) => o !== "Unknown"); + return ( +
+

+ {FIELD_LABEL[field] ?? field} +

+
+ {descriptions.map((description) => ( +
+ + {description} + + + +
+ ))} +
+
+ ); + })} +
+ {editClassification.error && ( +

{editClassification.error.message}

+ )} +
+ ); +} + function StageButton({ label, activeLabel, diff --git a/src/lib/bulkUpload/client.ts b/src/lib/bulkUpload/client.ts index 2a50023a..3f3a34e9 100644 --- a/src/lib/bulkUpload/client.ts +++ b/src/lib/bulkUpload/client.ts @@ -121,12 +121,19 @@ export function useEditClassification(portfolioId: string, uploadId: string) { }); } +// Sample classifications for the review panels PLUS the still-`Unknown` +// descriptions that gate Finalise (ADR-0006). +export interface ClassificationsView { + classifications: SampleClassifications; + unknown: Record; +} + export function useSampleClassifications( portfolioId: string, uploadId: string, enabled: boolean, ) { - return useQuery({ + return useQuery({ queryKey: [...bulkUploadKeys.progress(uploadId), "classifications"], enabled, queryFn: async () => { @@ -135,7 +142,10 @@ export function useSampleClassifications( ); if (!res.ok) throw await parseError(res, "Failed to load classifications."); const body = await res.json(); - return body.classifications as SampleClassifications; + return { + classifications: (body.classifications ?? {}) as SampleClassifications, + unknown: (body.unknown ?? {}) as Record, + }; }, }); } diff --git a/src/lib/bulkUpload/multiEntry.test.ts b/src/lib/bulkUpload/multiEntry.test.ts index e7ff6a9b..da0eb253 100644 --- a/src/lib/bulkUpload/multiEntry.test.ts +++ b/src/lib/bulkUpload/multiEntry.test.ts @@ -60,6 +60,30 @@ describe("detectMultiEntry", () => { expect(wallCol?.entries.map((e) => e.raw)).toEqual(["Cavity: AsBuilt", "Cavity: Filled"]); }); + it("captures one ordering sample per distinct count (ADR-0004 amendment)", () => { + const rows = [ + { Addr: "1 High St", PC: "AB1 2CD", "Property Type": "House: Detached", Walls: "Cavity: AsBuilt", Roofs: "Pitched: 200mm" }, // count 1 + { Addr: "2 Low St", PC: "AB3 4EF", "Property Type": "House: Semi", Walls: "Cavity, Solid", Roofs: "Flat, Pitched" }, // count 2 + { Addr: "3 Mid Rd", PC: "AB5 6GH", "Property Type": "House: Mid", Walls: "Cavity, Solid, Render", Roofs: "Flat, Pitched, Slate" }, // count 3 + { Addr: "4 Side Ln", PC: "AB7 8IJ", "Property Type": "House: Other", Walls: "Brick, Stone", Roofs: "Tile, Slate" }, // count 2 again + ]; + const summary = detectMultiEntry(rows, MAPPING); + + expect(summary.largestCount).toBe(3); + expect(summary.countDistribution).toEqual({ "2": 2, "3": 1 }); + + // A sample for every count >= 2 — and only those. + expect(Object.keys(summary.samplesByCount ?? {}).sort()).toEqual(["2", "3"]); + expect(summary.samplesByCount!["2"].count).toBe(2); + expect(summary.samplesByCount!["3"].count).toBe(3); + // The count-2 sample is the FIRST count-2 row, not the count-3 one. + expect(summary.samplesByCount!["2"].address).toBe("2 Low St, AB3 4EF"); + const wall2 = summary.samplesByCount!["2"].columns.find((c) => c.field === "wall_type"); + expect(wall2?.entries.map((e) => e.raw)).toEqual(["Cavity", "Solid"]); + const wall3 = summary.samplesByCount!["3"].columns.find((c) => c.field === "wall_type"); + expect(wall3?.entries.map((e) => e.raw)).toEqual(["Cavity", "Solid", "Render"]); + }); + it("normalizes descriptions to lower-case (matching the classifier's key)", () => { const rows = [{ Addr: "1 High St", PC: "AB1 2CD", "Property Type": "House: EndTerrace", Walls: "", Roofs: "" }]; const summary = detectMultiEntry(rows, MAPPING); diff --git a/src/lib/bulkUpload/multiEntry.ts b/src/lib/bulkUpload/multiEntry.ts index 3bbd053e..d537360a 100644 --- a/src/lib/bulkUpload/multiEntry.ts +++ b/src/lib/bulkUpload/multiEntry.ts @@ -13,6 +13,7 @@ import { ADDRESS_FIELDS, classifierMapping } from "./columnFields"; import type { MultiEntryEntry, MultiEntryColumn, + MultiEntrySample, MultiEntrySummary, } from "@/app/db/schema/bulk_address_uploads"; @@ -61,6 +62,7 @@ export const EMPTY_MULTI_ENTRY_SUMMARY: MultiEntrySummary = { countDistribution: {}, largestCount: 0, sample: null, + samplesByCount: {}, }; // Split a cell into building-part entries. Mirrors the classifier's @@ -115,6 +117,9 @@ export function detectMultiEntry( // Fallback sample for Step 1 when no row is multi-entry: the first row that // carries any classifier value. let firstClassifiedRowIndex = -1; + // First row index seen at each distinct count ≥ 2 — one ordering sample per + // count (ADR-0004 amendment): each count needs its own confirmed permutation. + const sampleRowIndexByCount: Record = {}; rows.forEach((row, index) => { let rowMax = 0; @@ -129,7 +134,8 @@ export function detectMultiEntry( if (rowMax >= 2) { const key = String(rowMax); countDistribution[key] = (countDistribution[key] ?? 0) + 1; - // First row at a new maximum becomes the multi-entry sample. + if (sampleRowIndexByCount[key] === undefined) sampleRowIndexByCount[key] = index; + // First row at a new maximum becomes the multi-entry (Step 1) sample. if (rowMax > largestCount) { largestCount = rowMax; multiEntryRowIndex = index; @@ -140,29 +146,47 @@ export function detectMultiEntry( const sampleRowIndex = multiEntryRowIndex !== -1 ? multiEntryRowIndex : firstClassifiedRowIndex; if (sampleRowIndex === -1) { - return { multiValuedFields: [...multiValued], countDistribution, largestCount, sample: null }; + return { + multiValuedFields: [...multiValued], + countDistribution, + largestCount, + sample: null, + samplesByCount: {}, + }; } - const sampleRow = rows[sampleRowIndex]; - // Every mapped classifier column with a value in the sample row. Step 1 lists - // them all; Step 2's ordering table filters to the multi-valued ones - // (single-value columns are whole-dwelling facts, not building parts). - const columns: MultiEntryColumn[] = classifierCols - .map(([field, header]) => ({ - field, - header, - entries: splitEntries(sampleRow[header]), - })) - .filter((column) => column.entries.length > 0); + // One ordering sample per distinct count, so the UI can render a panel per + // count and the user confirms each independently. + const samplesByCount: Record = {}; + for (const [count, rowIndex] of Object.entries(sampleRowIndexByCount)) { + samplesByCount[count] = sampleFromRow(rows[rowIndex], columnMapping, classifierCols, Number(count)); + } return { multiValuedFields: [...multiValued], countDistribution, largestCount, - sample: { - address: buildAddress(sampleRow, columnMapping), - count: largestCount >= 2 ? largestCount : 1, - columns, - }, + sample: sampleFromRow( + rows[sampleRowIndex], + columnMapping, + classifierCols, + largestCount >= 2 ? largestCount : 1, + ), + samplesByCount, }; } + +// Build the sample for one row: its display address plus every mapped classifier +// column carrying a value. Step 1 lists all columns; Step 2's order table filters +// to the multi-valued ones (single-value columns are whole-dwelling facts). +function sampleFromRow( + row: Record, + columnMapping: Record, + classifierCols: Array<[string, string]>, + count: number, +): MultiEntrySample { + const columns: MultiEntryColumn[] = classifierCols + .map(([field, header]) => ({ field, header, entries: splitEntries(row[header]) })) + .filter((column) => column.entries.length > 0); + return { address: buildAddress(row, columnMapping), count, columns }; +} diff --git a/src/lib/bulkUpload/s3Keys.ts b/src/lib/bulkUpload/s3Keys.ts new file mode 100644 index 00000000..8188fcf4 --- /dev/null +++ b/src/lib/bulkUpload/s3Keys.ts @@ -0,0 +1,23 @@ +// Shared S3 key conventions + the synthetic join-column name for bulk-upload +// artifacts. The finaliser join (ADR-0006) depends on the classifier CSV key +// being built *identically* in two places — where the CSV is written +// (start-address-matching) and where the finaliser is dispatched +// (dispatchFinaliser) — and on the `source_row_id` column appearing in both the +// address CSV and the classifier CSV. Keeping the convention here is the single +// source of truth that stops those two callers drifting. + +export const BULK_UPLOAD_INPUT_PREFIX = "bulk_onboarding_inputs"; + +export function addressCsvKey(portfolioId: string, uploadId: string): string { + return `${BULK_UPLOAD_INPUT_PREFIX}/${portfolioId}/${uploadId}.csv`; +} + +export function classifierCsvKey(portfolioId: string, uploadId: string): string { + return `${BULK_UPLOAD_INPUT_PREFIX}/${portfolioId}/${uploadId}-classifier.csv`; +} + +// The synthetic per-row UUID column. Minted at start-address-matching and +// emitted into both CSVs so the finaliser can join a row's identity (combiner +// output) to its raw descriptions (classifier CSV). The Model finaliser reads +// this exact header — keep the two in sync. +export const SOURCE_ROW_ID_COLUMN = "source_row_id"; diff --git a/src/lib/bulkUpload/server.ts b/src/lib/bulkUpload/server.ts index 2de69fff..95df4df5 100644 --- a/src/lib/bulkUpload/server.ts +++ b/src/lib/bulkUpload/server.ts @@ -15,6 +15,8 @@ import { subTasks } from "@/app/db/schema/tasks/subtask"; import { and, count, desc, eq, inArray, sql } from "drizzle-orm"; import type { BulkUpload, BulkUploadStatus, ProgressView, TaskSummary } from "./types"; import { validateColumnMapping, classifierMapping } from "./columnFields"; +import { classifierCsvKey } from "./s3Keys"; +import { retrofitDataS3Bucket } from "@/app/utils/s3"; import { SUBTASK_SERVICE } from "./types"; import type { MultiEntrySummary } from "./multiEntry"; import { isPermutation } from "./multiEntry"; @@ -166,24 +168,91 @@ async function lookupOverrides( } } -// The classifier's enums for the multi-entry sample's entries, joined by the +// The classifier's enums for the review samples' entries, joined by the // normalized description (exact match — the summary stored it the way the -// classifier persists it, so no re-normalization here). Read-only. +// classifier persists it, so no re-normalization here). Read-only. Covers the +// Step 1 verify sample AND every per-count ordering sample, since smaller-count +// panels may show descriptions the largest-count sample doesn't (ADR-0004 +// amendment). export async function getSampleClassifications( uploadId: string, ): Promise { const upload = await loadById(uploadId); - const sample = upload?.multiEntrySummary?.sample; - if (!upload || !sample) return {}; + const summary = upload?.multiEntrySummary; + if (!upload || !summary || !summary.sample) return {}; + + // Gather distinct descriptions per field across all samples. + const allSamples = [summary.sample, ...Object.values(summary.samplesByCount ?? {})]; + const descriptionsByField: Record> = {}; + for (const sample of allSamples) { + for (const column of sample.columns) { + const set = (descriptionsByField[column.field] ??= new Set()); + for (const e of column.entries) set.add(e.description); + } + } const portfolioId = BigInt(upload.portfolioId); const result: SampleClassifications = {}; - for (const column of sample.columns) { - const descriptions = [...new Set(column.entries.map((e) => e.description))]; + for (const [field, descSet] of Object.entries(descriptionsByField)) { + const descriptions = [...descSet]; if (descriptions.length === 0) continue; - const rows = await lookupOverrides(column.field, portfolioId, descriptions); + const rows = await lookupOverrides(field, portfolioId, descriptions); if (!rows) continue; - result[column.field] = Object.fromEntries(rows.map((r) => [r.description, r.value])); + result[field] = Object.fromEntries(rows.map((r) => [r.description, r.value])); + } + return result; +} + +// Descriptions still classified `Unknown` per field, portfolio-wide (ADR-0006). +// `Unknown` is the classifier's "couldn't decide" marker; v2 treats it as +// never-final, so the Finalise gate blocks until the user maps every one to a +// real value (and the finaliser fails loudly if any slips through). Portfolio- +// wide is the right scope under the one-real-upload assumption (ADR-0006). +export type UnknownOverrides = Record; + +const UNKNOWN_VALUE = "Unknown"; + +async function unknownForField(field: string, portfolioId: bigint): Promise { + switch (field) { + case "property_type": + return ( + await db + .select({ description: landlordPropertyTypeOverrides.description }) + .from(landlordPropertyTypeOverrides) + .where(and(eq(landlordPropertyTypeOverrides.portfolioId, portfolioId), eq(landlordPropertyTypeOverrides.value, UNKNOWN_VALUE))) + ).map((r) => r.description); + case "built_form_type": + return ( + await db + .select({ description: landlordBuiltFormTypeOverrides.description }) + .from(landlordBuiltFormTypeOverrides) + .where(and(eq(landlordBuiltFormTypeOverrides.portfolioId, portfolioId), eq(landlordBuiltFormTypeOverrides.value, UNKNOWN_VALUE))) + ).map((r) => r.description); + case "wall_type": + return ( + await db + .select({ description: landlordWallTypeOverrides.description }) + .from(landlordWallTypeOverrides) + .where(and(eq(landlordWallTypeOverrides.portfolioId, portfolioId), eq(landlordWallTypeOverrides.value, UNKNOWN_VALUE))) + ).map((r) => r.description); + case "roof_type": + return ( + await db + .select({ description: landlordRoofTypeOverrides.description }) + .from(landlordRoofTypeOverrides) + .where(and(eq(landlordRoofTypeOverrides.portfolioId, portfolioId), eq(landlordRoofTypeOverrides.value, UNKNOWN_VALUE))) + ).map((r) => r.description); + default: + return []; + } +} + +export async function getUnknownOverrides(portfolioId: string): Promise { + const pid = BigInt(portfolioId); + const result: UnknownOverrides = {}; + for (const field of ["property_type", "built_form_type", "wall_type", "roof_type"]) { + const descriptions = await unknownForField(field, pid); + if (descriptions.length > 0) result[field] = descriptions; } return result; } @@ -276,10 +345,12 @@ export type SetOrderingOutcome = | { kind: "not_multi_entry" } | { kind: "invalid_ordering"; reason: string }; -// Persist the user-confirmed building-part ordering (ADR-0004). Allowed only at -// awaiting_review and only when the upload is multi-entry. Validates that the -// largest count is provided and every supplied permutation is a bijection of -// its positions, then marks it confirmed (which gates Finalise). +// Persist the user-confirmed building-part ordering (ADR-0004, amended +// 2026-06-05). Allowed only at awaiting_review and only when the upload is +// multi-entry. Each distinct count ≥ 2 needs its own permutation; the UI confirms +// one count at a time, so we MERGE the supplied permutations into any already +// stored, validate each is a bijection, and only mark `confirmed` once EVERY +// detected count has a permutation (which gates Finalise). export async function setMultiEntryOrdering( uploadId: string, permutations: Record, @@ -292,22 +363,25 @@ export async function setMultiEntryOrdering( const summary = upload.multiEntrySummary; // A sample now exists for non-multi-entry uploads too (Step 1's verify // sample), so "is multi-entry" is largestCount >= 2, not "has a sample". - if (!summary || summary.largestCount < 2 || !summary.sample) + if (!summary || summary.largestCount < 2) return { kind: "not_multi_entry" }; - const sample = summary.sample; - - const largest = String(sample.count); - if (!permutations[largest]) - return { kind: "invalid_ordering", reason: `Missing ordering for ${sample.count} parts.` }; for (const [count, permutation] of Object.entries(permutations)) { if (permutation.length !== Number(count) || !isPermutation(permutation)) return { kind: "invalid_ordering", reason: `Ordering for ${count} parts is not a valid arrangement.` }; } + // Merge with any counts confirmed earlier, then decide whether every detected + // count (the keys of countDistribution, all ≥ 2) now has a permutation. + const merged = { ...(upload.multiEntryOrdering?.permutations ?? {}), ...permutations }; + const requiredCounts = Object.keys(summary.countDistribution); + const confirmed = requiredCounts.every( + (c) => Array.isArray(merged[c]) && merged[c].length === Number(c), + ); + const [updated] = await db .update(bulkAddressUploads) - .set({ multiEntryOrdering: { permutations, confirmed: true } }) + .set({ multiEntryOrdering: { permutations: merged, confirmed } }) .where(eq(bulkAddressUploads.id, uploadId)) .returning(); if (!updated) return { kind: "not_found" }; @@ -625,12 +699,32 @@ export async function dispatchFinaliser(args: { }) .returning(); + // v2 (ADR-0006): the finaliser also writes property_overrides for UPRN-matched + // rows, which needs the classifier CSV (raw descriptions, joined to the + // combiner output by `source_row_id`) and the confirmed building-part ordering. + // Both are derivable here — we already hold the upload row, and dispatch runs + // after the user confirms ordering, so the value is final. + // - classifier_s3_uri: null when no classifier columns were mapped (no + // classifier CSV was written; the finaliser then writes no overrides). + // - multi_entry_ordering: permutations keyed by entry-count; {} when the + // upload is not multi-entry (every cell is a single building part → part 0). + const classifierMap = classifierMapping(upload.columnMapping ?? {}); + const classifierS3Uri = + Object.keys(classifierMap).length > 0 + ? `s3://${retrofitDataS3Bucket()}/${classifierCsvKey(upload.portfolioId, args.uploadId)}` + : null; + const payload = { task_id: upload.taskId, sub_task_id: subTask.id, s3_uri: upload.combinedOutputS3Uri, portfolio_id: Number(upload.portfolioId), bulk_upload_id: args.uploadId, + classifier_s3_uri: classifierS3Uri, + multi_entry_ordering: upload.multiEntryOrdering?.permutations ?? {}, + // classifier category → source CSV header, so the finaliser knows which + // classifier-CSV column feeds each override_component (ADR-0006). + column_mapping: classifierMap, }; const trigger = await triggerFastApiPipeline({