diff --git a/.claude/settings.json b/.claude/settings.json index 9e885fd..e5143ec 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -15,7 +15,33 @@ "Bash(npx drizzle-kit *)", "Bash(echo \"frontend tsc exit: $?\")", "Bash(python3 -c ' *)", - "Bash(rm -f /workspaces/home/github/Model/backend/address2UPRN/local_handler/.env.local /workspaces/home/github/Model/backend/bulk_address2uprn_combiner/local_handler/.env.local && echo \"removed stub .env.local files\")" + "Bash(rm -f /workspaces/home/github/Model/backend/address2UPRN/local_handler/.env.local /workspaces/home/github/Model/backend/bulk_address2uprn_combiner/local_handler/.env.local && echo \"removed stub .env.local files\")", + "Bash(cat deployment/terraform/modules/s3_iam_policy/main.tf)", + "Bash(cat deployment/terraform/modules/s3_iam_policy/variables.tf)", + "Bash(terraform fmt *)", + "Bash(echo \"exit: $?\")", + "Bash(pip install *)", + "Bash(git -C /workspaces/assessment-model remote -v)", + "Bash(gh label *)", + "Bash(gh issue create --repo Hestia-Homes/assessment-model --label ready-for-agent --title 'Detect multi-entry rows and surface the largest-count sample on awaiting_review' --body ' *)", + "Bash(gh issue create --repo Hestia-Homes/assessment-model --label ready-for-agent --title 'Confirm building-part ordering and gate Finalise on it' --body ' *)", + "Bash(gh issue create --repo Hestia-Homes/assessment-model --label ready-for-agent --title 'Show our classification next to each multi-entry sample entry \\(read-only\\)' --body ' *)", + "Bash(gh issue create --repo Hestia-Homes/assessment-model --label ready-for-agent --title 'Editable classification verification writing source='\\\\''user'\\\\'', gating Finalise' --body ' *)", + "Bash(git check-ignore *)", + "Bash(git ls-tree *)", + "Bash(git worktree *)", + "Read(//workspaces/mig-wt/src/app/db/**)", + "Read(//workspaces/mig-wt/src/app/db/migrations/meta/**)", + "Bash(git branch *)", + "Bash(cp /workspaces/assessment-model/.env.local /tmp/mig-wt/.env.local; echo \"env copied\"; cat -n /tmp/mig-wt/src/app/db/schema/bulk_address_uploads.ts)", + "Bash(node /workspaces/assessment-model/node_modules/drizzle-kit/bin.cjs generate)", + "Bash(ln -s /workspaces/assessment-model/node_modules /tmp/mig-wt/node_modules)", + "Bash(node node_modules/drizzle-kit/bin.cjs generate)", + "Bash(git push *)", + "Bash(npm run *)", + "Bash(grep '\\\\.sql$')", + "Bash(git status *)", + "Bash(git checkout *)" ], "deny": [ "Bash(npx drizzle-kit generate)", @@ -25,7 +51,9 @@ "/workspaces/home/github/Model/backend/app/bulk_uploads", "/workspaces/home/github/Model/applications/landlord_description_overrides", "/workspaces/home/github/Model/orchestration", - "/workspaces/home/github/Model/backend/address2UPRN/local_handler" + "/workspaces/home/github/Model/backend/address2UPRN/local_handler", + "/workspaces/home/github/Model/deployment/terraform/shared", + "/tmp/mig-wt" ] } } diff --git a/CONTEXT.md b/CONTEXT.md index 7a79d1c..24963b5 100644 --- a/CONTEXT.md +++ b/CONTEXT.md @@ -45,6 +45,27 @@ _Avoid_: customer data, manual override, landlord data The translation from a Landlord's free-text description in a BulkUpload column (e.g. `"cavity: filledcavity"`) to a canonical domain enum value (e.g. `WallType.CAVITY`). Produced by a `ColumnClassifier` (today an LLM, tomorrow possibly a lookup table or rules engine) in the Model service. Stored per-Portfolio, one row per `(category, description)`. A row carries provenance (`classifier` or `user`) so user overrides survive re-classification. _Avoid_: column mapping (that's a separate concept — see `ColumnMapping` above), classification, dictionary +### Building parts + +**Building part**: +One physically distinct part of a dwelling described by a single entry within a multi-valued cell. A dwelling is one **Main building** plus zero or more **Extensions**. Per-part descriptions appear as comma-separated entries in physical-element columns (e.g. `Walls`, `Roofs`); whole-dwelling columns (e.g. `Property Type`) carry a single entry and are **not** split per part. +_Avoid_: annexe, unit, section, dwelling part + +**Main building**: +The principal building part of a dwelling — exactly one per address. The others are **Extensions**. + +**Extension**: +A building part that is not the Main building, numbered **Extension 1 … Extension N-1** for an N-entry address. +_Avoid_: annexe, addition, outbuilding + +**Multi-entry**: +The property of a BulkUpload row whose physical-element cells hold **more than one comma-separated entry**, one per **Building part**. Always intra-cell in our data — never multiple rows sharing one address/UPRN. Within a row, the multi-valued columns agree on entry-count, so **position `i` is the same Building part across every multi-valued column**. +_Avoid_: multi-row, multi-record, duplicate address + +**Building-part ordering** (a.k.a. **ordering**): +The user's declaration, captured once per file, of which list-position maps to which Building part — because the entry order is a consistent per-file mistake (`"A, B"` could be `[Main, Extension 1]` or `[Extension 1, Main]`). Stored per entry-count as a permutation. See [ADR-0004](./docs/adr/0004-multi-entry-building-part-ordering.md). +_Avoid_: sort order, sequence, column mapping + ## Lifecycle A **BulkUpload** moves through these statuses: @@ -65,6 +86,8 @@ Re-mapping (PATCHing `columnMapping`) is legal only in `ready_for_processing` an **Two writers**: Next.js owns transitions out of `mapping_complete`, into `processing`, and the terminal Finalise outcomes. FastAPI owns `combining` and `awaiting_review` — writing them direct to the DB during the combiner run. The BulkUpload aggregate observes both. +At `awaiting_review`, **Finalise is gated** (not a new status — a precondition on the action): when classifier columns were mapped the user must acknowledge the classification-verification step, and when the file is **Multi-entry** they must confirm the **Building-part ordering**. See [ADR-0004](./docs/adr/0004-multi-entry-building-part-ordering.md). + See [ADR-0001](./docs/adr/0001-bulk-upload-state-machine.md) for the deliberate "not yet" decisions baked into this lifecycle. ## Relationships diff --git a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts index 14441ce..22bfa8b 100644 --- a/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts +++ b/src/app/api/portfolio/[portfolioId]/bulk-uploads/[uploadId]/start-address-matching/route.ts @@ -3,20 +3,24 @@ import { getServerSession } from "next-auth"; import { AuthOptions } from "@/app/api/auth/[...nextauth]/authOptions"; import { createS3Client, createRetrofitDataS3Client, retrofitDataS3Bucket } from "@/app/utils/s3"; import * as XLSX from "xlsx"; -import { loadForAddressMatching, triggerAddressMatching, triggerClassifier } from "@/lib/bulkUpload/server"; +import { loadForAddressMatching, saveMultiEntrySummary, triggerAddressMatching, triggerClassifier } from "@/lib/bulkUpload/server"; import { readSessionToken } from "@/lib/session"; -import { ADDRESS_FIELDS } from "@/lib/bulkUpload/columnFields"; +import { ADDRESS_FIELDS, classifierMapping } from "@/lib/bulkUpload/columnFields"; +import { detectMultiEntry } from "@/lib/bulkUpload/multiEntry"; -function transformFile( - buffer: Buffer, - columnMapping: Record // field → source header -): { csv: string; error?: never } | { csv?: never; error: string } { +type SheetRow = Record; + +function readRows(buffer: Buffer): SheetRow[] { const wb = XLSX.read(buffer, { type: "buffer" }); const sheet = wb.Sheets[wb.SheetNames[0]]; - const rows = XLSX.utils.sheet_to_json>(sheet, { defval: "" }); - - if (rows.length === 0) return { error: "Empty file" }; + return XLSX.utils.sheet_to_json(sheet, { defval: "" }); +} +// Address-matching CSV: address fields only, renamed to canonical headers. +function buildAddressCsv( + rows: SheetRow[], + columnMapping: Record // field → source header +): { csv: string; error?: never } | { csv?: never; error: string } { const outputHeaders: string[] = []; const outputToSource: Record = {}; for (const field of ADDRESS_FIELDS) { @@ -32,7 +36,7 @@ function transformFile( return { error: 'Mapping must include "postcode"' }; const outputRows = rows.map((row) => { - const out: Record = {}; + const out: SheetRow = {}; for (const [outName, src] of Object.entries(outputToSource)) { out[outName] = row[src] ?? ""; } @@ -43,6 +47,25 @@ function transformFile( return { csv: XLSX.utils.sheet_to_csv(outSheet) }; } +// Classifier CSV: the mapped classifier source columns only, original headers +// preserved (the lambda resolves them via column_mapping). Converting here means +// the classifier always reads a real CSV even when the upload was .xlsx/.xls — +// see ADR-0003. One source header may feed several categories, so dedupe to +// distinct headers. +function buildClassifierCsv( + rows: SheetRow[], + classifierMap: Record // category → source header +): string { + const headers = [...new Set(Object.values(classifierMap))]; + const outputRows = rows.map((row) => { + const out: SheetRow = {}; + for (const h of headers) out[h] = row[h] ?? ""; + return out; + }); + const outSheet = XLSX.utils.json_to_sheet(outputRows, { header: headers }); + return XLSX.utils.sheet_to_csv(outSheet); +} + export async function POST( request: NextRequest, { params }: { params: Promise<{ portfolioId: string; uploadId: string }> } @@ -81,7 +104,15 @@ export async function POST( return NextResponse.json({ error: "Failed to read source file" }, { status: 500 }); } - const transformed = transformFile(fileBuffer, upload.columnMapping!); + const rows = readRows(fileBuffer); + if (rows.length === 0) + return NextResponse.json({ error: "Empty file" }, { status: 422 }); + + // Detect multi-entry building parts now, while the whole file is parsed in + // memory, so the awaiting_review surface never re-reads it (ADR-0004). + await saveMultiEntrySummary(uploadId, detectMultiEntry(rows, upload.columnMapping!)); + + const transformed = buildAddressCsv(rows, upload.columnMapping!); if (transformed.error) return NextResponse.json({ error: transformed.error }, { status: 422 }); @@ -102,13 +133,37 @@ export async function POST( const s3Uri = `s3://${outputBucket}/${transformedKey}`; + // Convert the mapped classifier columns to their own CSV so the classifier + // lambda always parses a real CSV, never the raw upload (which may be + // .xlsx/.xls). Only when the user mapped ≥1 classifier column. See ADR-0003. + const classifierMap = classifierMapping(upload.columnMapping!); + let classifierS3Uri: string | undefined; + if (Object.keys(classifierMap).length > 0) { + const classifierKey = `bulk_onboarding_inputs/${portfolioId}/${uploadId}-classifier.csv`; + try { + await outputS3 + .putObject({ + Bucket: outputBucket, + Key: classifierKey, + Body: buildClassifierCsv(rows, classifierMap), + ContentType: "text/csv", + }) + .promise(); + classifierS3Uri = `s3://${outputBucket}/${classifierKey}`; + } catch (err) { + // Non-blocking: classification is skipped, address matching proceeds. + console.error("Failed to upload classifier CSV:", err); + } + } + const sessionToken = readSessionToken(request); const trigger = await triggerAddressMatching({ uploadId, s3Uri, sessionToken }); if (trigger.kind === "trigger_failed") return NextResponse.json({ error: trigger.message }, { status: trigger.status }); // Co-fire the landlord classifier (non-blocking) under the same task. - await triggerClassifier({ taskId: trigger.taskId, uploadId, sessionToken }); + if (classifierS3Uri) + await triggerClassifier({ taskId: trigger.taskId, uploadId, s3Uri: classifierS3Uri, sessionToken }); return NextResponse.json({ taskId: trigger.taskId }, { status: 200 }); } diff --git a/src/app/db/schema/bulk_address_uploads.ts b/src/app/db/schema/bulk_address_uploads.ts index 52f3211..8166551 100644 --- a/src/app/db/schema/bulk_address_uploads.ts +++ b/src/app/db/schema/bulk_address_uploads.ts @@ -1,6 +1,30 @@ import { pgTable, uuid, text, timestamp, jsonb } from "drizzle-orm/pg-core"; import { sql } from "drizzle-orm"; +// Shape of the multi_entry_summary jsonb (ADR-0004). Co-located with the column +// so the schema is self-contained; the detection logic in +// src/lib/bulkUpload/multiEntry.ts imports these. +export interface MultiEntryEntry { + raw: string; + description: string; +} +export interface MultiEntryColumn { + field: string; + header: string; + entries: MultiEntryEntry[]; +} +export interface MultiEntrySample { + address: string; + count: number; + columns: MultiEntryColumn[]; +} +export interface MultiEntrySummary { + multiValuedFields: string[]; + countDistribution: Record; + largestCount: number; + sample: MultiEntrySample | null; +} + export const bulkAddressUploads = pgTable("bulk_address_uploads", { id: uuid("id").defaultRandom().primaryKey(), portfolioId: text("portfolio_id").notNull(), @@ -11,6 +35,9 @@ export const bulkAddressUploads = pgTable("bulk_address_uploads", { status: text("status").notNull().default("ready_for_processing"), sourceHeaders: text("source_headers").array().notNull().default(sql`'{}'`), columnMapping: jsonb("column_mapping").$type>(), + // Multi-entry building-part detection, computed at start-address-matching + // and read by the awaiting_review review surface (ADR-0004). + multiEntrySummary: jsonb("multi_entry_summary").$type(), taskId: uuid("task_id"), combinedOutputS3Uri: text("combined_output_s3_uri"), createdAt: timestamp("created_at", { withTimezone: true }).notNull().defaultNow(), diff --git a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx index b298d7f..b80ae1c 100644 --- a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx +++ b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/OnboardingProgress.tsx @@ -8,6 +8,7 @@ import { useFinalize, useRequestCombine, } from "@/lib/bulkUpload/client"; +import type { MultiEntrySample } from "@/lib/bulkUpload/multiEntry"; interface Props { portfolioSlug: string; @@ -60,6 +61,13 @@ export default function OnboardingProgress({ const canRunCombiner = taskDone && !taskFailed && upload.status === "processing"; const canFinalize = upload.status === "awaiting_review"; + // Multi-entry building-part sample, shown read-only on the review surface + // (ADR-0004). Ordering confirmation arrives in a later slice. + const multiEntrySample = + upload.status === "awaiting_review" + ? (upload.multiEntrySummary?.sample ?? null) + : null; + return (
@@ -70,17 +78,24 @@ export default function OnboardingProgress({
- {total > 0 && ( - - {completedSubtasks} / {total} batches complete - - )} - {failedSubtasks > 0 && ( - - - {failedSubtasks} failed - - )} + {/* Address matching: standardises addresses against the OS lookup, in batches. */} + + Address matching: + {failedSubtasks > 0 ? ( + + + {failedSubtasks} of {total} batches failed + + ) : total > 0 && completedSubtasks >= total ? ( + complete + ) : ( + + + running{total > 0 ? ` · ${completedSubtasks} / ${total} batches` : ""} + + )} + + {/* Classification: turns the landlord's free-text descriptions into EPC categories. */} {classifierTotal > 0 && ( Classification: @@ -99,12 +114,6 @@ export default function OnboardingProgress({ )} )} - {!taskDone && ( - - - Running - - )} {isCombining && ( @@ -119,6 +128,8 @@ export default function OnboardingProgress({ )}
+ {multiEntrySample && } + {(canRunCombiner || canFinalize) && (
{canRunCombiner && ( @@ -164,6 +175,51 @@ export default function OnboardingProgress({ ); } +// Read-only preview of the largest-count multi-entry row (ADR-0004). Each +// comma-separated entry is a building part; the user will confirm their order +// in a later slice. Positions are shown 1-based, unlabelled for now. +function MultiEntrySamplePanel({ sample }: { sample: MultiEntrySample }) { + return ( +
+

+ Multiple building parts detected +

+

+ {sample.address ? {sample.address} : "An address"}{" "} + has {sample.count} building parts (e.g. a main building and extensions). + You'll be asked to confirm their order before finalising. +

+ +
+ + + + + {sample.columns.map((column) => ( + + ))} + + + + {Array.from({ length: sample.count }).map((_, position) => ( + + + {sample.columns.map((column) => ( + + ))} + + ))} + +
Position + {column.header} +
{position + 1} + {column.entries[position]?.raw ?? "—"} +
+
+
+ ); +} + function StageButton({ label, activeLabel, diff --git a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/map-columns/MapColumnsClient.tsx b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/map-columns/MapColumnsClient.tsx index fe729d6..88ab9ce 100644 --- a/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/map-columns/MapColumnsClient.tsx +++ b/src/app/portfolio/[slug]/(portfolio)/bulk-upload/[uploadId]/map-columns/MapColumnsClient.tsx @@ -104,8 +104,8 @@ export default function MapColumnsClient({ className="w-full text-sm border border-gray-200 rounded-lg px-3 py-2 bg-white text-gray-800 focus:outline-none focus:ring-2 focus:ring-[#14163d]/20 focus:border-[#14163d]" > - {sourceHeaders.map((header) => ( - ))} diff --git a/src/lib/bulkUpload/multiEntry.ts b/src/lib/bulkUpload/multiEntry.ts new file mode 100644 index 0000000..b0ebe3f --- /dev/null +++ b/src/lib/bulkUpload/multiEntry.ts @@ -0,0 +1,119 @@ +// Multi-entry building-part detection (ADR-0004). +// +// A BulkUpload row can carry several comma-separated entries in a physical- +// element column (e.g. Walls = "Cavity: AsBuilt (1976-1982), Cavity: +// FilledCavity"). Each entry is a Building part (Main building + Extensions). +// This module finds that pattern and captures one sample — the row with the +// MOST building parts — so the user can confirm the ordering downstream. +// +// Pure + I/O-free so it's unit-testable; the start-address-matching route runs +// it over the already-parsed upload rows and persists the result on the upload. + +import { ADDRESS_FIELDS, classifierMapping } from "./columnFields"; +import type { + MultiEntryEntry, + MultiEntryColumn, + MultiEntrySummary, +} from "@/app/db/schema/bulk_address_uploads"; + +// The jsonb shape lives with the column (schema/bulk_address_uploads.ts) so the +// migration is self-contained; re-export here for callers of this module. +export type { + MultiEntryEntry, + MultiEntryColumn, + MultiEntrySample, + MultiEntrySummary, +} from "@/app/db/schema/bulk_address_uploads"; + +export const EMPTY_MULTI_ENTRY_SUMMARY: MultiEntrySummary = { + multiValuedFields: [], + countDistribution: {}, + largestCount: 0, + sample: null, +}; + +// Split a cell into building-part entries. Mirrors the classifier's +// split(",") → trim → lower, dropping empty fragments so positions align +// across raw and normalized forms. +export function splitEntries(value: unknown): MultiEntryEntry[] { + return String(value ?? "") + .split(",") + .map((s) => s.trim()) + .filter((s) => s.length > 0) + .map((raw) => ({ raw, description: raw.toLowerCase() })); +} + +// Compose a display address from the mapped address fields (reference excluded). +function buildAddress( + row: Record, + columnMapping: Record, +): string { + const parts: string[] = []; + for (const field of ADDRESS_FIELDS) { + if (field.value === "internal_reference") continue; + const header = columnMapping[field.value]; + if (!header) continue; + const value = String(row[header] ?? "").trim(); + if (value) parts.push(value); + } + return parts.join(", "); +} + +// Scan the mapped classifier columns for multi-entry rows and capture the +// largest-count sample. Only classifier columns are considered — they're the +// physical-element descriptions we slice into building parts; address columns +// are single-valued by nature. +export function detectMultiEntry( + rows: Array>, + columnMapping: Record, +): MultiEntrySummary { + const classifierCols = Object.entries(classifierMapping(columnMapping)); + if (classifierCols.length === 0) return EMPTY_MULTI_ENTRY_SUMMARY; + + const multiValued = new Set(); + const countDistribution: Record = {}; + let largestCount = 0; + let sampleRowIndex = -1; + + rows.forEach((row, index) => { + let rowMax = 0; + for (const [field, header] of classifierCols) { + const n = splitEntries(row[header]).length; + if (n > 1) multiValued.add(field); + if (n > rowMax) rowMax = n; + } + if (rowMax >= 2) { + const key = String(rowMax); + countDistribution[key] = (countDistribution[key] ?? 0) + 1; + // First row at a new maximum becomes the sample. + if (rowMax > largestCount) { + largestCount = rowMax; + sampleRowIndex = index; + } + } + }); + + if (sampleRowIndex === -1) return EMPTY_MULTI_ENTRY_SUMMARY; + + const sampleRow = rows[sampleRowIndex]; + // Show only the columns that are actually split in the sample row; + // single-value columns are whole-dwelling facts, not building parts. + const columns: MultiEntryColumn[] = classifierCols + .map(([field, header]) => ({ + field, + header, + entries: splitEntries(sampleRow[header]), + })) + .filter((column) => column.entries.length > 1); + + return { + multiValuedFields: [...multiValued], + countDistribution, + largestCount, + sample: { + address: buildAddress(sampleRow, columnMapping), + count: largestCount, + columns, + }, + }; +} diff --git a/src/lib/bulkUpload/server.ts b/src/lib/bulkUpload/server.ts index e526062..f020dbc 100644 --- a/src/lib/bulkUpload/server.ts +++ b/src/lib/bulkUpload/server.ts @@ -6,6 +6,7 @@ import { count, desc, eq, sql } from "drizzle-orm"; import type { BulkUpload, BulkUploadStatus, ProgressView, TaskSummary } from "./types"; import { validateColumnMapping, classifierMapping } from "./columnFields"; import { SUBTASK_SERVICE } from "./types"; +import type { MultiEntrySummary } from "./multiEntry"; const REMAP_ALLOWED: ReadonlySet = new Set([ "ready_for_processing", @@ -102,6 +103,20 @@ export async function getProgressView(uploadId: string): Promise { + await db + .update(bulkAddressUploads) + .set({ multiEntrySummary: summary }) + .where(eq(bulkAddressUploads.id, uploadId)); +} + export type SetMappingOutcome = | { kind: "ok"; upload: BulkUpload } | { kind: "not_found" } @@ -211,13 +226,16 @@ export async function triggerAddressMatching(args: { return { kind: "ok", taskId: task.id }; } -// Co-fires the landlord classifier as a subtask under the address task. Reads -// the ORIGINAL upload (the address-matching CSV strips the description columns) -// and is non-blocking: a trigger failure marks only the classifier subtask, so -// address matching is unaffected. See ADR-0003. +// Co-fires the landlord classifier as a subtask under the address task. Reads a +// dedicated classifier CSV (the classifier columns converted from the upload by +// the start-address-matching route — the address-matching CSV strips the +// description columns), so the lambda always parses a real CSV even for +// .xlsx/.xls uploads. Non-blocking: a trigger failure marks only the classifier +// subtask, so address matching is unaffected. See ADR-0003. export async function triggerClassifier(args: { taskId: string; uploadId: string; + s3Uri: string; sessionToken: string | undefined; }): Promise { const upload = await loadById(args.uploadId); @@ -239,7 +257,7 @@ export async function triggerClassifier(args: { const payload = { task_id: args.taskId, sub_task_id: subTask.id, - s3_uri: `s3://${upload.s3Bucket}/${upload.s3Key}`, + s3_uri: args.s3Uri, portfolio_id: Number(upload.portfolioId), column_mapping: columnMapping, };