mirror of
https://github.com/Hestia-Homes/assessment-model.git
synced 2026-06-08 11:37:25 +00:00
landlord extension application
This commit is contained in:
parent
90407799ac
commit
38c82ebca3
8 changed files with 364 additions and 38 deletions
|
|
@ -15,7 +15,33 @@
|
|||
"Bash(npx drizzle-kit *)",
|
||||
"Bash(echo \"frontend tsc exit: $?\")",
|
||||
"Bash(python3 -c ' *)",
|
||||
"Bash(rm -f /workspaces/home/github/Model/backend/address2UPRN/local_handler/.env.local /workspaces/home/github/Model/backend/bulk_address2uprn_combiner/local_handler/.env.local && echo \"removed stub .env.local files\")"
|
||||
"Bash(rm -f /workspaces/home/github/Model/backend/address2UPRN/local_handler/.env.local /workspaces/home/github/Model/backend/bulk_address2uprn_combiner/local_handler/.env.local && echo \"removed stub .env.local files\")",
|
||||
"Bash(cat deployment/terraform/modules/s3_iam_policy/main.tf)",
|
||||
"Bash(cat deployment/terraform/modules/s3_iam_policy/variables.tf)",
|
||||
"Bash(terraform fmt *)",
|
||||
"Bash(echo \"exit: $?\")",
|
||||
"Bash(pip install *)",
|
||||
"Bash(git -C /workspaces/assessment-model remote -v)",
|
||||
"Bash(gh label *)",
|
||||
"Bash(gh issue create --repo Hestia-Homes/assessment-model --label ready-for-agent --title 'Detect multi-entry rows and surface the largest-count sample on awaiting_review' --body ' *)",
|
||||
"Bash(gh issue create --repo Hestia-Homes/assessment-model --label ready-for-agent --title 'Confirm building-part ordering and gate Finalise on it' --body ' *)",
|
||||
"Bash(gh issue create --repo Hestia-Homes/assessment-model --label ready-for-agent --title 'Show our classification next to each multi-entry sample entry \\(read-only\\)' --body ' *)",
|
||||
"Bash(gh issue create --repo Hestia-Homes/assessment-model --label ready-for-agent --title 'Editable classification verification writing source='\\\\''user'\\\\'', gating Finalise' --body ' *)",
|
||||
"Bash(git check-ignore *)",
|
||||
"Bash(git ls-tree *)",
|
||||
"Bash(git worktree *)",
|
||||
"Read(//workspaces/mig-wt/src/app/db/**)",
|
||||
"Read(//workspaces/mig-wt/src/app/db/migrations/meta/**)",
|
||||
"Bash(git branch *)",
|
||||
"Bash(cp /workspaces/assessment-model/.env.local /tmp/mig-wt/.env.local; echo \"env copied\"; cat -n /tmp/mig-wt/src/app/db/schema/bulk_address_uploads.ts)",
|
||||
"Bash(node /workspaces/assessment-model/node_modules/drizzle-kit/bin.cjs generate)",
|
||||
"Bash(ln -s /workspaces/assessment-model/node_modules /tmp/mig-wt/node_modules)",
|
||||
"Bash(node node_modules/drizzle-kit/bin.cjs generate)",
|
||||
"Bash(git push *)",
|
||||
"Bash(npm run *)",
|
||||
"Bash(grep '\\\\.sql$')",
|
||||
"Bash(git status *)",
|
||||
"Bash(git checkout *)"
|
||||
],
|
||||
"deny": [
|
||||
"Bash(npx drizzle-kit generate)",
|
||||
|
|
@ -25,7 +51,9 @@
|
|||
"/workspaces/home/github/Model/backend/app/bulk_uploads",
|
||||
"/workspaces/home/github/Model/applications/landlord_description_overrides",
|
||||
"/workspaces/home/github/Model/orchestration",
|
||||
"/workspaces/home/github/Model/backend/address2UPRN/local_handler"
|
||||
"/workspaces/home/github/Model/backend/address2UPRN/local_handler",
|
||||
"/workspaces/home/github/Model/deployment/terraform/shared",
|
||||
"/tmp/mig-wt"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
|
|||
23
CONTEXT.md
23
CONTEXT.md
|
|
@ -45,6 +45,27 @@ _Avoid_: customer data, manual override, landlord data
|
|||
The translation from a Landlord's free-text description in a BulkUpload column (e.g. `"cavity: filledcavity"`) to a canonical domain enum value (e.g. `WallType.CAVITY`). Produced by a `ColumnClassifier` (today an LLM, tomorrow possibly a lookup table or rules engine) in the Model service. Stored per-Portfolio, one row per `(category, description)`. A row carries provenance (`classifier` or `user`) so user overrides survive re-classification.
|
||||
_Avoid_: column mapping (that's a separate concept — see `ColumnMapping` above), classification, dictionary
|
||||
|
||||
### Building parts
|
||||
|
||||
**Building part**:
|
||||
One physically distinct part of a dwelling described by a single entry within a multi-valued cell. A dwelling is one **Main building** plus zero or more **Extensions**. Per-part descriptions appear as comma-separated entries in physical-element columns (e.g. `Walls`, `Roofs`); whole-dwelling columns (e.g. `Property Type`) carry a single entry and are **not** split per part.
|
||||
_Avoid_: annexe, unit, section, dwelling part
|
||||
|
||||
**Main building**:
|
||||
The principal building part of a dwelling — exactly one per address. The others are **Extensions**.
|
||||
|
||||
**Extension**:
|
||||
A building part that is not the Main building, numbered **Extension 1 … Extension N-1** for an N-entry address.
|
||||
_Avoid_: annexe, addition, outbuilding
|
||||
|
||||
**Multi-entry**:
|
||||
The property of a BulkUpload row whose physical-element cells hold **more than one comma-separated entry**, one per **Building part**. Always intra-cell in our data — never multiple rows sharing one address/UPRN. Within a row, the multi-valued columns agree on entry-count, so **position `i` is the same Building part across every multi-valued column**.
|
||||
_Avoid_: multi-row, multi-record, duplicate address
|
||||
|
||||
**Building-part ordering** (a.k.a. **ordering**):
|
||||
The user's declaration, captured once per file, of which list-position maps to which Building part — because the entry order is a consistent per-file mistake (`"A, B"` could be `[Main, Extension 1]` or `[Extension 1, Main]`). Stored per entry-count as a permutation. See [ADR-0004](./docs/adr/0004-multi-entry-building-part-ordering.md).
|
||||
_Avoid_: sort order, sequence, column mapping
|
||||
|
||||
## Lifecycle
|
||||
|
||||
A **BulkUpload** moves through these statuses:
|
||||
|
|
@ -65,6 +86,8 @@ Re-mapping (PATCHing `columnMapping`) is legal only in `ready_for_processing` an
|
|||
|
||||
**Two writers**: Next.js owns transitions out of `mapping_complete`, into `processing`, and the terminal Finalise outcomes. FastAPI owns `combining` and `awaiting_review` — writing them direct to the DB during the combiner run. The BulkUpload aggregate observes both.
|
||||
|
||||
At `awaiting_review`, **Finalise is gated** (not a new status — a precondition on the action): when classifier columns were mapped the user must acknowledge the classification-verification step, and when the file is **Multi-entry** they must confirm the **Building-part ordering**. See [ADR-0004](./docs/adr/0004-multi-entry-building-part-ordering.md).
|
||||
|
||||
See [ADR-0001](./docs/adr/0001-bulk-upload-state-machine.md) for the deliberate "not yet" decisions baked into this lifecycle.
|
||||
|
||||
## Relationships
|
||||
|
|
|
|||
|
|
@ -3,20 +3,24 @@ import { getServerSession } from "next-auth";
|
|||
import { AuthOptions } from "@/app/api/auth/[...nextauth]/authOptions";
|
||||
import { createS3Client, createRetrofitDataS3Client, retrofitDataS3Bucket } from "@/app/utils/s3";
|
||||
import * as XLSX from "xlsx";
|
||||
import { loadForAddressMatching, triggerAddressMatching, triggerClassifier } from "@/lib/bulkUpload/server";
|
||||
import { loadForAddressMatching, saveMultiEntrySummary, triggerAddressMatching, triggerClassifier } from "@/lib/bulkUpload/server";
|
||||
import { readSessionToken } from "@/lib/session";
|
||||
import { ADDRESS_FIELDS } from "@/lib/bulkUpload/columnFields";
|
||||
import { ADDRESS_FIELDS, classifierMapping } from "@/lib/bulkUpload/columnFields";
|
||||
import { detectMultiEntry } from "@/lib/bulkUpload/multiEntry";
|
||||
|
||||
function transformFile(
|
||||
buffer: Buffer,
|
||||
columnMapping: Record<string, string> // field → source header
|
||||
): { csv: string; error?: never } | { csv?: never; error: string } {
|
||||
type SheetRow = Record<string, unknown>;
|
||||
|
||||
function readRows(buffer: Buffer): SheetRow[] {
|
||||
const wb = XLSX.read(buffer, { type: "buffer" });
|
||||
const sheet = wb.Sheets[wb.SheetNames[0]];
|
||||
const rows = XLSX.utils.sheet_to_json<Record<string, unknown>>(sheet, { defval: "" });
|
||||
|
||||
if (rows.length === 0) return { error: "Empty file" };
|
||||
return XLSX.utils.sheet_to_json<SheetRow>(sheet, { defval: "" });
|
||||
}
|
||||
|
||||
// Address-matching CSV: address fields only, renamed to canonical headers.
|
||||
function buildAddressCsv(
|
||||
rows: SheetRow[],
|
||||
columnMapping: Record<string, string> // field → source header
|
||||
): { csv: string; error?: never } | { csv?: never; error: string } {
|
||||
const outputHeaders: string[] = [];
|
||||
const outputToSource: Record<string, string> = {};
|
||||
for (const field of ADDRESS_FIELDS) {
|
||||
|
|
@ -32,7 +36,7 @@ function transformFile(
|
|||
return { error: 'Mapping must include "postcode"' };
|
||||
|
||||
const outputRows = rows.map((row) => {
|
||||
const out: Record<string, unknown> = {};
|
||||
const out: SheetRow = {};
|
||||
for (const [outName, src] of Object.entries(outputToSource)) {
|
||||
out[outName] = row[src] ?? "";
|
||||
}
|
||||
|
|
@ -43,6 +47,25 @@ function transformFile(
|
|||
return { csv: XLSX.utils.sheet_to_csv(outSheet) };
|
||||
}
|
||||
|
||||
/**
 * Build the classifier CSV: only the mapped classifier source columns, with
 * their original headers kept (the lambda resolves them via column_mapping).
 *
 * Converting here means the classifier always reads a real CSV even when the
 * upload was .xlsx/.xls — see ADR-0003. Several categories may point at the
 * same source header, so headers are de-duplicated before projecting.
 *
 * @param rows Parsed sheet rows keyed by source header.
 * @param classifierMap category → source header.
 * @returns CSV text with one column per distinct mapped header.
 */
function buildClassifierCsv(
  rows: SheetRow[],
  classifierMap: Record<string, string> // category → source header
): string {
  const distinctHeaders = Array.from(new Set(Object.values(classifierMap)));

  const projected: SheetRow[] = [];
  for (const sourceRow of rows) {
    const picked: SheetRow = {};
    distinctHeaders.forEach((header) => {
      // Missing cells become empty strings so every row has every column.
      picked[header] = sourceRow[header] ?? "";
    });
    projected.push(picked);
  }

  const sheet = XLSX.utils.json_to_sheet(projected, { header: distinctHeaders });
  return XLSX.utils.sheet_to_csv(sheet);
}
|
||||
|
||||
export async function POST(
|
||||
request: NextRequest,
|
||||
{ params }: { params: Promise<{ portfolioId: string; uploadId: string }> }
|
||||
|
|
@ -81,7 +104,15 @@ export async function POST(
|
|||
return NextResponse.json({ error: "Failed to read source file" }, { status: 500 });
|
||||
}
|
||||
|
||||
const transformed = transformFile(fileBuffer, upload.columnMapping!);
|
||||
const rows = readRows(fileBuffer);
|
||||
if (rows.length === 0)
|
||||
return NextResponse.json({ error: "Empty file" }, { status: 422 });
|
||||
|
||||
// Detect multi-entry building parts now, while the whole file is parsed in
|
||||
// memory, so the awaiting_review surface never re-reads it (ADR-0004).
|
||||
await saveMultiEntrySummary(uploadId, detectMultiEntry(rows, upload.columnMapping!));
|
||||
|
||||
const transformed = buildAddressCsv(rows, upload.columnMapping!);
|
||||
if (transformed.error)
|
||||
return NextResponse.json({ error: transformed.error }, { status: 422 });
|
||||
|
||||
|
|
@ -102,13 +133,37 @@ export async function POST(
|
|||
|
||||
const s3Uri = `s3://${outputBucket}/${transformedKey}`;
|
||||
|
||||
// Convert the mapped classifier columns to their own CSV so the classifier
|
||||
// lambda always parses a real CSV, never the raw upload (which may be
|
||||
// .xlsx/.xls). Only when the user mapped ≥1 classifier column. See ADR-0003.
|
||||
const classifierMap = classifierMapping(upload.columnMapping!);
|
||||
let classifierS3Uri: string | undefined;
|
||||
if (Object.keys(classifierMap).length > 0) {
|
||||
const classifierKey = `bulk_onboarding_inputs/${portfolioId}/${uploadId}-classifier.csv`;
|
||||
try {
|
||||
await outputS3
|
||||
.putObject({
|
||||
Bucket: outputBucket,
|
||||
Key: classifierKey,
|
||||
Body: buildClassifierCsv(rows, classifierMap),
|
||||
ContentType: "text/csv",
|
||||
})
|
||||
.promise();
|
||||
classifierS3Uri = `s3://${outputBucket}/${classifierKey}`;
|
||||
} catch (err) {
|
||||
// Non-blocking: classification is skipped, address matching proceeds.
|
||||
console.error("Failed to upload classifier CSV:", err);
|
||||
}
|
||||
}
|
||||
|
||||
const sessionToken = readSessionToken(request);
|
||||
const trigger = await triggerAddressMatching({ uploadId, s3Uri, sessionToken });
|
||||
if (trigger.kind === "trigger_failed")
|
||||
return NextResponse.json({ error: trigger.message }, { status: trigger.status });
|
||||
|
||||
// Co-fire the landlord classifier (non-blocking) under the same task.
|
||||
await triggerClassifier({ taskId: trigger.taskId, uploadId, sessionToken });
|
||||
if (classifierS3Uri)
|
||||
await triggerClassifier({ taskId: trigger.taskId, uploadId, s3Uri: classifierS3Uri, sessionToken });
|
||||
|
||||
return NextResponse.json({ taskId: trigger.taskId }, { status: 200 });
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,30 @@
|
|||
import { pgTable, uuid, text, timestamp, jsonb } from "drizzle-orm/pg-core";
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
// Shape of the multi_entry_summary jsonb (ADR-0004). Co-located with the column
// so the schema is self-contained; the detection logic in
// src/lib/bulkUpload/multiEntry.ts imports these.

/** One comma-separated entry taken from a multi-valued cell. */
export interface MultiEntryEntry {
  /** The entry text after trimming surrounding whitespace. */
  raw: string;
  /** `raw` lowercased. */
  description: string;
}

/** The entries of one classifier column within the sample row. */
export interface MultiEntryColumn {
  /** Classifier category key from the column mapping. */
  field: string;
  /** Source header the category is mapped to. */
  header: string;
  /** Entries of the cell, in cell order. */
  entries: MultiEntryEntry[];
}

/** The row chosen to illustrate the multi-entry pattern. */
export interface MultiEntrySample {
  /** Display address composed from the mapped address columns; may be empty. */
  address: string;
  /** Largest entry count found in any classifier column of the row. */
  count: number;
  /** Only the columns that hold more than one entry in this row. */
  columns: MultiEntryColumn[];
}

/** File-level result of multi-entry detection. */
export interface MultiEntrySummary {
  /** Classifier fields that hold more than one entry in at least one row. */
  multiValuedFields: string[];
  /** Entry count (as a string key) → number of rows whose largest count is that value. */
  countDistribution: Record<string, number>;
  /** Largest entry count seen in the file; 0 when nothing is multi-entry. */
  largestCount: number;
  /** The first row that reaches `largestCount`, or null when none exists. */
  sample: MultiEntrySample | null;
}
|
||||
|
||||
export const bulkAddressUploads = pgTable("bulk_address_uploads", {
|
||||
id: uuid("id").defaultRandom().primaryKey(),
|
||||
portfolioId: text("portfolio_id").notNull(),
|
||||
|
|
@ -11,6 +35,9 @@ export const bulkAddressUploads = pgTable("bulk_address_uploads", {
|
|||
status: text("status").notNull().default("ready_for_processing"),
|
||||
sourceHeaders: text("source_headers").array().notNull().default(sql`'{}'`),
|
||||
columnMapping: jsonb("column_mapping").$type<Record<string, string>>(),
|
||||
// Multi-entry building-part detection, computed at start-address-matching
|
||||
// and read by the awaiting_review review surface (ADR-0004).
|
||||
multiEntrySummary: jsonb("multi_entry_summary").$type<MultiEntrySummary>(),
|
||||
taskId: uuid("task_id"),
|
||||
combinedOutputS3Uri: text("combined_output_s3_uri"),
|
||||
createdAt: timestamp("created_at", { withTimezone: true }).notNull().defaultNow(),
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import {
|
|||
useFinalize,
|
||||
useRequestCombine,
|
||||
} from "@/lib/bulkUpload/client";
|
||||
import type { MultiEntrySample } from "@/lib/bulkUpload/multiEntry";
|
||||
|
||||
interface Props {
|
||||
portfolioSlug: string;
|
||||
|
|
@ -60,6 +61,13 @@ export default function OnboardingProgress({
|
|||
const canRunCombiner = taskDone && !taskFailed && upload.status === "processing";
|
||||
const canFinalize = upload.status === "awaiting_review";
|
||||
|
||||
// Multi-entry building-part sample, shown read-only on the review surface
|
||||
// (ADR-0004). Ordering confirmation arrives in a later slice.
|
||||
const multiEntrySample =
|
||||
upload.status === "awaiting_review"
|
||||
? (upload.multiEntrySummary?.sample ?? null)
|
||||
: null;
|
||||
|
||||
return (
|
||||
<div className="mt-6 space-y-3">
|
||||
<div className="w-full bg-gray-100 rounded-full h-2 overflow-hidden">
|
||||
|
|
@ -70,17 +78,24 @@ export default function OnboardingProgress({
|
|||
</div>
|
||||
|
||||
<div className="flex items-center gap-4 text-xs text-gray-500">
|
||||
{total > 0 && (
|
||||
<span>
|
||||
<span className="font-semibold text-gray-700">{completedSubtasks}</span> / {total} batches complete
|
||||
</span>
|
||||
)}
|
||||
{failedSubtasks > 0 && (
|
||||
<span className="flex items-center gap-1 text-red-500 font-semibold">
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-red-400" />
|
||||
{failedSubtasks} failed
|
||||
</span>
|
||||
)}
|
||||
{/* Address matching: standardises addresses against the OS lookup, in batches. */}
|
||||
<span className="flex items-center gap-1">
|
||||
<span className="text-gray-400">Address matching:</span>
|
||||
{failedSubtasks > 0 ? (
|
||||
<span className="flex items-center gap-1 text-red-500 font-semibold">
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-red-400" />
|
||||
{failedSubtasks} of {total} batches failed
|
||||
</span>
|
||||
) : total > 0 && completedSubtasks >= total ? (
|
||||
<span className="font-semibold text-green-600">complete</span>
|
||||
) : (
|
||||
<span className="flex items-center gap-1 text-blue-500">
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-blue-400 animate-pulse" />
|
||||
running{total > 0 ? ` · ${completedSubtasks} / ${total} batches` : ""}
|
||||
</span>
|
||||
)}
|
||||
</span>
|
||||
{/* Classification: turns the landlord's free-text descriptions into EPC categories. */}
|
||||
{classifierTotal > 0 && (
|
||||
<span className="flex items-center gap-1">
|
||||
<span className="text-gray-400">Classification:</span>
|
||||
|
|
@ -99,12 +114,6 @@ export default function OnboardingProgress({
|
|||
)}
|
||||
</span>
|
||||
)}
|
||||
{!taskDone && (
|
||||
<span className="flex items-center gap-1 text-blue-500">
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-blue-400 animate-pulse" />
|
||||
Running
|
||||
</span>
|
||||
)}
|
||||
{isCombining && (
|
||||
<span className="flex items-center gap-1 text-blue-500">
|
||||
<span className="w-1.5 h-1.5 rounded-full bg-blue-400 animate-pulse" />
|
||||
|
|
@ -119,6 +128,8 @@ export default function OnboardingProgress({
|
|||
)}
|
||||
</div>
|
||||
|
||||
{multiEntrySample && <MultiEntrySamplePanel sample={multiEntrySample} />}
|
||||
|
||||
{(canRunCombiner || canFinalize) && (
|
||||
<div className="flex flex-col gap-2 pt-2">
|
||||
{canRunCombiner && (
|
||||
|
|
@ -164,6 +175,51 @@ export default function OnboardingProgress({
|
|||
);
|
||||
}
|
||||
|
||||
/**
 * Read-only preview of the largest-count multi-entry row (ADR-0004).
 *
 * Every comma-separated entry is a building part, rendered as one table row
 * per position and one table column per multi-valued source column. Positions
 * are shown 1-based and unlabelled for now; confirming their order comes in a
 * later slice. A column with no entry at a position shows an em dash.
 */
function MultiEntrySamplePanel({ sample }: { sample: MultiEntrySample }) {
  const { address, count, columns } = sample;

  // Fall back to a generic label when no address could be composed.
  const addressLabel = address ? <span className="font-medium">{address}</span> : "An address";

  const headerCells = columns.map(({ field, header }) => (
    <th key={field} className="py-1 pr-3 font-medium">
      {header}
    </th>
  ));

  const bodyRows = Array.from({ length: count }, (_, position) => (
    <tr key={position} className="border-t border-amber-100 text-amber-900">
      <td className="py-1 pr-3 text-amber-600">{position + 1}</td>
      {columns.map(({ field, entries }) => (
        <td key={field} className="py-1 pr-3">
          {entries[position]?.raw ?? "—"}
        </td>
      ))}
    </tr>
  ));

  return (
    <div className="rounded-lg border border-amber-200 bg-amber-50 px-4 py-3">
      <p className="text-sm font-semibold text-amber-900">
        Multiple building parts detected
      </p>
      <p className="mt-0.5 text-xs text-amber-800">
        {addressLabel}{" "}
        has {count} building parts (e.g. a main building and extensions).
        You'll be asked to confirm their order before finalising.
      </p>

      <div className="mt-3 overflow-x-auto">
        <table className="w-full border-collapse text-xs">
          <thead>
            <tr className="text-left text-amber-700">
              <th className="py-1 pr-3 font-medium">Position</th>
              {headerCells}
            </tr>
          </thead>
          <tbody>{bodyRows}</tbody>
        </table>
      </div>
    </div>
  );
}
|
||||
|
||||
function StageButton({
|
||||
label,
|
||||
activeLabel,
|
||||
|
|
|
|||
|
|
@ -104,8 +104,8 @@ export default function MapColumnsClient({
|
|||
className="w-full text-sm border border-gray-200 rounded-lg px-3 py-2 bg-white text-gray-800 focus:outline-none focus:ring-2 focus:ring-[#14163d]/20 focus:border-[#14163d]"
|
||||
>
|
||||
<option value={NOT_PROVIDED}>Not provided</option>
|
||||
{sourceHeaders.map((header) => (
|
||||
<option key={header} value={header}>
|
||||
{sourceHeaders.map((header, index) => (
|
||||
<option key={`${header}-${index}`} value={header}>
|
||||
{header}
|
||||
</option>
|
||||
))}
|
||||
|
|
|
|||
119
src/lib/bulkUpload/multiEntry.ts
Normal file
119
src/lib/bulkUpload/multiEntry.ts
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
// Multi-entry building-part detection (ADR-0004).
|
||||
//
|
||||
// A BulkUpload row can carry several comma-separated entries in a physical-
|
||||
// element column (e.g. Walls = "Cavity: AsBuilt (1976-1982), Cavity:
|
||||
// FilledCavity"). Each entry is a Building part (Main building + Extensions).
|
||||
// This module finds that pattern and captures one sample — the row with the
|
||||
// MOST building parts — so the user can confirm the ordering downstream.
|
||||
//
|
||||
// Pure + I/O-free so it's unit-testable; the start-address-matching route runs
|
||||
// it over the already-parsed upload rows and persists the result on the upload.
|
||||
|
||||
import { ADDRESS_FIELDS, classifierMapping } from "./columnFields";
|
||||
import type {
|
||||
MultiEntryEntry,
|
||||
MultiEntryColumn,
|
||||
MultiEntrySummary,
|
||||
} from "@/app/db/schema/bulk_address_uploads";
|
||||
|
||||
// The jsonb shape lives with the column (schema/bulk_address_uploads.ts) so the
|
||||
// migration is self-contained; re-export here for callers of this module.
|
||||
export type {
|
||||
MultiEntryEntry,
|
||||
MultiEntryColumn,
|
||||
MultiEntrySample,
|
||||
MultiEntrySummary,
|
||||
} from "@/app/db/schema/bulk_address_uploads";
|
||||
|
||||
export const EMPTY_MULTI_ENTRY_SUMMARY: MultiEntrySummary = {
|
||||
multiValuedFields: [],
|
||||
countDistribution: {},
|
||||
largestCount: 0,
|
||||
sample: null,
|
||||
};
|
||||
|
||||
/**
 * Break a cell value into its building-part entries.
 *
 * Mirrors the classifier's split(",") → trim → lower. Fragments that are empty
 * after trimming are dropped, so position `i` refers to the same entry in both
 * the raw and the normalized (lowercased) form.
 *
 * @param value Raw cell value; null/undefined is treated as an empty string.
 * @returns One entry per non-empty comma-separated fragment, in cell order.
 */
export function splitEntries(value: unknown): MultiEntryEntry[] {
  const cellText = String(value ?? "");
  const entries: MultiEntryEntry[] = [];
  for (const fragment of cellText.split(",")) {
    const raw = fragment.trim();
    if (raw.length === 0) continue;
    entries.push({ raw, description: raw.toLowerCase() });
  }
  return entries;
}
|
||||
|
||||
/**
 * Compose a display address for a row from its mapped address fields.
 *
 * Fields are visited in ADDRESS_FIELDS order. The `internal_reference` field
 * is left out, as are fields with no mapped header and cells that are blank
 * after trimming.
 *
 * @param row Parsed sheet row keyed by source header.
 * @param columnMapping field → source header.
 * @returns The non-empty address parts joined with ", " (possibly "").
 */
function buildAddress(
  row: Record<string, unknown>,
  columnMapping: Record<string, string>,
): string {
  const segments: string[] = [];
  for (const { value: fieldKey } of ADDRESS_FIELDS) {
    const sourceHeader =
      fieldKey === "internal_reference" ? undefined : columnMapping[fieldKey];
    if (!sourceHeader) continue;

    const text = String(row[sourceHeader] ?? "").trim();
    if (text !== "") segments.push(text);
  }
  return segments.join(", ");
}
|
||||
|
||||
/**
 * Scan the mapped classifier columns for multi-entry rows and capture the
 * largest-count sample.
 *
 * Only classifier columns are considered — they are the physical-element
 * descriptions that get sliced into building parts; address columns are
 * single-valued by nature.
 *
 * @param rows Parsed upload rows keyed by source header.
 * @param columnMapping field → source header, as stored on the upload.
 * @returns The detection summary, or EMPTY_MULTI_ENTRY_SUMMARY when no
 *   classifier column is mapped or no row holds two or more entries.
 */
export function detectMultiEntry(
  rows: Array<Record<string, unknown>>,
  columnMapping: Record<string, string>,
): MultiEntrySummary {
  const classifierCols = Object.entries(classifierMapping(columnMapping));
  if (classifierCols.length === 0) return EMPTY_MULTI_ENTRY_SUMMARY;

  const splitFields = new Set<string>();
  const rowsPerCount: Record<string, number> = {};
  let widest = 0;
  let widestRow: Record<string, unknown> | undefined;

  for (const row of rows) {
    // Largest entry count across this row's classifier columns.
    let partsInRow = 0;
    for (const [field, header] of classifierCols) {
      const entryCount = splitEntries(row[header]).length;
      if (entryCount > 1) splitFields.add(field);
      partsInRow = Math.max(partsInRow, entryCount);
    }
    if (partsInRow < 2) continue;

    const bucket = String(partsInRow);
    rowsPerCount[bucket] = (rowsPerCount[bucket] ?? 0) + 1;

    // Strict comparison: the first row to reach a new maximum stays the sample.
    if (partsInRow > widest) {
      widest = partsInRow;
      widestRow = row;
    }
  }

  if (widestRow === undefined) return EMPTY_MULTI_ENTRY_SUMMARY;

  // Keep only the columns that are actually split in the sample row;
  // single-value columns are whole-dwelling facts, not building parts.
  const columns: MultiEntryColumn[] = [];
  for (const [field, header] of classifierCols) {
    const entries = splitEntries(widestRow[header]);
    if (entries.length > 1) columns.push({ field, header, entries });
  }

  return {
    multiValuedFields: [...splitFields],
    countDistribution: rowsPerCount,
    largestCount: widest,
    sample: {
      address: buildAddress(widestRow, columnMapping),
      count: widest,
      columns,
    },
  };
}
|
||||
|
|
@ -6,6 +6,7 @@ import { count, desc, eq, sql } from "drizzle-orm";
|
|||
import type { BulkUpload, BulkUploadStatus, ProgressView, TaskSummary } from "./types";
|
||||
import { validateColumnMapping, classifierMapping } from "./columnFields";
|
||||
import { SUBTASK_SERVICE } from "./types";
|
||||
import type { MultiEntrySummary } from "./multiEntry";
|
||||
|
||||
const REMAP_ALLOWED: ReadonlySet<BulkUploadStatus> = new Set([
|
||||
"ready_for_processing",
|
||||
|
|
@ -102,6 +103,20 @@ export async function getProgressView(uploadId: string): Promise<ProgressView |
|
|||
return { upload, task };
|
||||
}
|
||||
|
||||
/**
 * Persist the multi-entry building-part detection on an upload (ADR-0004).
 *
 * Computed once at start-address-matching from the already-parsed rows and
 * read back on the awaiting_review surface. Only the multiEntrySummary column
 * is written, so the later status/taskId update leaves it intact.
 *
 * @param uploadId Id of the bulk_address_uploads row to update.
 * @param summary Detection result to store.
 */
export async function saveMultiEntrySummary(
  uploadId: string,
  summary: MultiEntrySummary,
): Promise<void> {
  const matchesUpload = eq(bulkAddressUploads.id, uploadId);
  await db.update(bulkAddressUploads).set({ multiEntrySummary: summary }).where(matchesUpload);
}
|
||||
|
||||
export type SetMappingOutcome =
|
||||
| { kind: "ok"; upload: BulkUpload }
|
||||
| { kind: "not_found" }
|
||||
|
|
@ -211,13 +226,16 @@ export async function triggerAddressMatching(args: {
|
|||
return { kind: "ok", taskId: task.id };
|
||||
}
|
||||
|
||||
// Co-fires the landlord classifier as a subtask under the address task. Reads
|
||||
// the ORIGINAL upload (the address-matching CSV strips the description columns)
|
||||
// and is non-blocking: a trigger failure marks only the classifier subtask, so
|
||||
// address matching is unaffected. See ADR-0003.
|
||||
// Co-fires the landlord classifier as a subtask under the address task. Reads a
|
||||
// dedicated classifier CSV (the classifier columns converted from the upload by
|
||||
// the start-address-matching route — the address-matching CSV strips the
|
||||
// description columns), so the lambda always parses a real CSV even for
|
||||
// .xlsx/.xls uploads. Non-blocking: a trigger failure marks only the classifier
|
||||
// subtask, so address matching is unaffected. See ADR-0003.
|
||||
export async function triggerClassifier(args: {
|
||||
taskId: string;
|
||||
uploadId: string;
|
||||
s3Uri: string;
|
||||
sessionToken: string | undefined;
|
||||
}): Promise<void> {
|
||||
const upload = await loadById(args.uploadId);
|
||||
|
|
@ -239,7 +257,7 @@ export async function triggerClassifier(args: {
|
|||
const payload = {
|
||||
task_id: args.taskId,
|
||||
sub_task_id: subTask.id,
|
||||
s3_uri: `s3://${upload.s3Bucket}/${upload.s3Key}`,
|
||||
s3_uri: args.s3Uri,
|
||||
portfolio_id: Number(upload.portfolioId),
|
||||
column_mapping: columnMapping,
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue