diff --git a/CONTEXT.md b/CONTEXT.md index 4bdd7bab..9f15c85a 100644 --- a/CONTEXT.md +++ b/CONTEXT.md @@ -73,6 +73,8 @@ See [ADR-0001](./docs/adr/0001-bulk-upload-state-machine.md) for the deliberate - A **BulkUpload** produces zero or more **Properties** when finalised. - A **BulkUpload** has at most one **Task** (the orchestration handle for the FastAPI pipeline run); a Task has many **SubTasks** (one per pipeline stage: address matching, combiner). - A **Portfolio** has many **VocabularyMappings** — one row per `(category, description)` it has ever encountered across all its BulkUploads. See [ADR-0002](./docs/adr/0002-landlord-override-vocabulary.md). +- A **Recommendation** belongs to exactly one **Plan**. Denormalised onto `recommendation.plan_id`; the `plan_recommendations` join table is being retired. +- A **Recommendation** has at most one **Material**. Denormalised onto `recommendation.material_id` (+ `material_quantity`, `material_quantity_unit`, `material_depth`). Historically (pre-~2023) a recommendation could carry multiple materials; ~128 such legacy rows were reconciled to one each on 2026-06-07. The cardinality guard in the backfill enforces this going forward. ### Baseline performance diff --git a/package.json b/package.json index 82682d55..e205a7e5 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,8 @@ "test:e2e:run": "cypress run", "migration:generate": "drizzle-kit generate", "migration:migrate": "drizzle-kit migrate", - "create_user": "tsx src/app/db/create_user.ts" + "create_user": "tsx src/app/db/create_user.ts", + "backfill:recommendation-denormalization": "tsx src/app/db/backfill-recommendation-denormalization.ts" }, "dependencies": { "@aws-sdk/client-s3": "^3.971.0", diff --git a/src/app/db/backfill-recommendation-denormalization.ts b/src/app/db/backfill-recommendation-denormalization.ts new file mode 100644 index 00000000..ce8b74d2 --- /dev/null +++ b/src/app/db/backfill-recommendation-denormalization.ts @@ -0,0 +1,226 @@ +/** + * Backfills the denormalised plan_id / material_* columns on `recommendation` + * from the plan_recommendations and recommendation_materials join tables, then + * builds the supporting indexes and validates the foreign keys — all ONLINE. + * + * Why this lives outside drizzle: + * drizzle-kit migrate wraps every pending migration in ONE transaction, so it + * cannot COMMIT between batches, cannot run CREATE INDEX CONCURRENTLY, and + * holds the ADD COLUMN's AccessExclusiveLock for the whole run. This script + * instead commits each batch, keeping locks tiny, WAL/bloat bounded, EBS IO + * burst balance healthy, and progress visible + resumable. + * See docs/adr/0001-data-backfills-outside-drizzle.md + * + * Run AFTER `npm run migration:migrate` has applied 0222/0224 (the column adds): + * npm run backfill:recommendation-denormalization + * + * Safe to re-run: every step is idempotent (IS NULL guards, IF NOT EXISTS, + * VALIDATE on an already-valid constraint is a no-op). If interrupted, just run + * it again — it resumes from wherever it left off. + * + * Tunables via env: + * BACKFILL_BATCH_SIZE rows scanned per committed batch (default 25000) + * BACKFILL_SLEEP_MS pause between batches, eases IO pressure (default 50) + */ +import dotenv from "dotenv"; +import { Pool, type PoolClient } from "pg"; + +dotenv.config({ path: ".env.local" }); + +const BATCH_SIZE = Number(process.env.BACKFILL_BATCH_SIZE ?? 25_000); +const SLEEP_MS = Number(process.env.BACKFILL_SLEEP_MS ?? 50); + +const pool = new Pool({ + host: process.env.DB_HOST, + port: Number(process.env.DB_PORT), + user: process.env.DB_USERNAME, + password: process.env.DB_PASSWORD, + database: process.env.DB_NAME, +}); + +const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); + +/** Abort before touching data if the 1:1 assumption is violated. */ +async function assertSingleCardinality( + client: PoolClient, + joinTable: string, +): Promise { + const { rows } = await client.query( + `SELECT count(*)::int AS violations + FROM ( + SELECT recommendation_id + FROM ${joinTable} + GROUP BY recommendation_id + HAVING count(*) > 1 + ) dupes`, + ); + const violations = rows[0].violations as number; + if (violations > 0) { + throw new Error( + `Aborting: ${violations} recommendation(s) map to multiple ${joinTable} ` + + `rows. The backfill cannot pick deterministically. Resolve cardinality first.`, + ); + } +} + +/** + * Keyset-paginate the whole `recommendation` table by id and, for each batch, + * set the target columns from the join table where a match exists. Each batch + * is its own autocommitted statement (no surrounding BEGIN), so locks are held + * only for the rows in that batch. + * + * We scan by id (not `WHERE col IS NULL LIMIT n`) so rows with no match don't + * get re-selected forever — they're simply left NULL and we move past them. + */ +async function backfillColumns( + client: PoolClient, + label: string, + updateSql: (idFrom: string, limit: string) => string, +): Promise { + let lastId = "0"; + let scannedTotal = 0; + let updatedTotal = 0; + const startedAt = Date.now(); + + for (;;) { + const { rows } = await client.query(updateSql("$1", "$2"), [ + lastId, + BATCH_SIZE, + ]); + const { max_id, scanned, updated } = rows[0] as { + max_id: string | null; + scanned: number; + updated: number; + }; + + if (!scanned || max_id === null) break; + + scannedTotal += Number(scanned); + updatedTotal += Number(updated); + lastId = max_id; + + const rate = Math.round(scannedTotal / ((Date.now() - startedAt) / 1000)); + console.log( + `[${label}] up to id ${lastId} — scanned ${scannedTotal.toLocaleString()}, ` + + `updated ${updatedTotal.toLocaleString()} (${rate.toLocaleString()} rows/s scan)`, + ); + + if (SLEEP_MS > 0) await sleep(SLEEP_MS); + } + + console.log( + `[${label}] done: scanned ${scannedTotal.toLocaleString()}, ` + + `updated ${updatedTotal.toLocaleString()} in ` + + `${Math.round((Date.now() - startedAt) / 1000)}s`, + ); +} + +async function main(): Promise { + const client = await pool.connect(); + try { + console.log( + `Config: batch=${BATCH_SIZE.toLocaleString()} rows, sleep=${SLEEP_MS}ms\n`, + ); + + // 1. Guard the 1:1 assumption before mutating anything. + console.log("Checking cardinality..."); + await assertSingleCardinality(client, "plan_recommendations"); + await assertSingleCardinality(client, "recommendation_materials"); + + // 2. Backfill plan_id. + await backfillColumns( + client, + "plan_id", + (idFrom, limit) => ` + WITH batch AS ( + SELECT id FROM recommendation + WHERE id > ${idFrom} + ORDER BY id + LIMIT ${limit} + ), + upd AS ( + UPDATE recommendation r + SET plan_id = pr.plan_id + FROM batch b + JOIN plan_recommendations pr ON pr.recommendation_id = b.id + WHERE r.id = b.id + AND r.plan_id IS NULL + RETURNING r.id + ) + SELECT (SELECT max(id) FROM batch) AS max_id, + (SELECT count(*) FROM batch) AS scanned, + (SELECT count(*) FROM upd) AS updated`, + ); + + // 3. Backfill the four material_* columns in one pass. + await backfillColumns( + client, + "material", + (idFrom, limit) => ` + WITH batch AS ( + SELECT id FROM recommendation + WHERE id > ${idFrom} + ORDER BY id + LIMIT ${limit} + ), + upd AS ( + UPDATE recommendation r + SET material_id = rm.material_id, + material_quantity = rm.quantity, + material_quantity_unit = rm.quantity_unit, + material_depth = rm.depth + FROM batch b + JOIN recommendation_materials rm ON rm.recommendation_id = b.id + WHERE r.id = b.id + AND r.material_id IS NULL + RETURNING r.id + ) + SELECT (SELECT max(id) FROM batch) AS max_id, + (SELECT count(*) FROM batch) AS scanned, + (SELECT count(*) FROM upd) AS updated`, + ); + + // 4. Build indexes CONCURRENTLY (no write lock). Must NOT be in a txn — + // a pooled client with no BEGIN runs each statement autocommitted. + console.log("Creating indexes concurrently..."); + await client.query( + `CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_recommendation_plan_id + ON recommendation USING btree (plan_id)`, + ); + await client.query( + `CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_recommendation_material_id + ON recommendation USING btree (material_id)`, + ); + + // 5. Validate the FKs online (ShareUpdateExclusiveLock — allows reads/writes). + console.log("Validating foreign keys..."); + await client.query( + `ALTER TABLE recommendation + VALIDATE CONSTRAINT recommendation_plan_id_plan_id_fk`, + ); + await client.query( + `ALTER TABLE recommendation + VALIDATE CONSTRAINT recommendation_material_id_material_id_fk`, + ); + + // 6. Report any rows still unlinked (expected for material; investigate for plan). + const { rows } = await client.query( + `SELECT count(*) FILTER (WHERE plan_id IS NULL) AS plan_null, + count(*) FILTER (WHERE material_id IS NULL) AS material_null + FROM recommendation`, + ); + console.log( + `\nRemaining NULLs — plan_id: ${Number(rows[0].plan_null).toLocaleString()}, ` + + `material_id: ${Number(rows[0].material_null).toLocaleString()}`, + ); + console.log("Backfill complete."); + } finally { + client.release(); + await pool.end(); + } +} + +main().catch((err) => { + console.error("Backfill failed:", err); + process.exit(1); +}); diff --git a/src/app/db/migrations/0222_nifty_hellcat.sql b/src/app/db/migrations/0222_nifty_hellcat.sql index 2a70a411..53df1a60 100644 --- a/src/app/db/migrations/0222_nifty_hellcat.sql +++ b/src/app/db/migrations/0222_nifty_hellcat.sql @@ -1,3 +1,7 @@ +-- Metadata-only DDL: instant, holds AccessExclusiveLock on "recommendation" +-- only momentarily. The FK is added NOT VALID so no full-table validation scan +-- happens here; it is validated online by the standalone backfill script. The +-- index is created CONCURRENTLY by that script too, after the column is filled. +-- See docs/adr/0001-data-backfills-outside-drizzle.md ALTER TABLE "recommendation" ADD COLUMN "plan_id" bigint;--> statement-breakpoint -ALTER TABLE "recommendation" ADD CONSTRAINT "recommendation_plan_id_plan_id_fk" FOREIGN KEY ("plan_id") REFERENCES "public"."plan"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint -CREATE INDEX "idx_recommendation_plan_id" ON "recommendation" USING btree ("plan_id"); \ No newline at end of file +ALTER TABLE "recommendation" ADD CONSTRAINT "recommendation_plan_id_plan_id_fk" FOREIGN KEY ("plan_id") REFERENCES "public"."plan"("id") ON DELETE cascade ON UPDATE no action NOT VALID; diff --git a/src/app/db/migrations/0223_recommendation_plan_id_backfill.sql b/src/app/db/migrations/0223_recommendation_plan_id_backfill.sql index dc16877b..46ddbab9 100644 --- a/src/app/db/migrations/0223_recommendation_plan_id_backfill.sql +++ b/src/app/db/migrations/0223_recommendation_plan_id_backfill.sql @@ -1,23 +1,13 @@ --- Guard: fail if any recommendation is linked to more than one plan. --- The spec asserts each recommendation belongs to exactly one plan; if real --- data violates that the backfill cannot pick deterministically and must not --- proceed silently. -DO $$ -BEGIN - IF EXISTS ( - SELECT 1 - FROM plan_recommendations - GROUP BY recommendation_id - HAVING count(*) > 1 - ) THEN - RAISE EXCEPTION - 'plan_id backfill aborted: one or more recommendations appear in multiple ' - 'plan_recommendations rows. Resolve cardinality before re-running.'; - END IF; -END $$; - -UPDATE recommendation r -SET plan_id = pr.plan_id -FROM plan_recommendations pr -WHERE pr.recommendation_id = r.id - AND r.plan_id IS NULL; +-- Intentionally a no-op. +-- +-- The plan_id backfill used to live here as a single full-table UPDATE inside +-- drizzle's migration transaction. Because drizzle wraps ALL pending migrations +-- in one transaction, that UPDATE held an AccessExclusiveLock on "recommendation" +-- (from the ADD COLUMN in 0222) for the entire multi-hour run, blocked unrelated +-- migrations behind it, exhausted EBS IO burst balance, and could not report +-- progress or be resumed. +-- +-- The backfill now runs OUTSIDE drizzle, in committed batches, via: +-- npm run backfill:recommendation-denormalization +-- See docs/adr/0001-data-backfills-outside-drizzle.md +SELECT 1; diff --git a/src/app/db/migrations/0224_busy_nitro.sql b/src/app/db/migrations/0224_busy_nitro.sql index f410d98d..607a77ad 100644 --- a/src/app/db/migrations/0224_busy_nitro.sql +++ b/src/app/db/migrations/0224_busy_nitro.sql @@ -1,6 +1,10 @@ +-- Metadata-only DDL: adding nullable columns with no default is an instant +-- catalog change. The FK is added NOT VALID (no scan); it is validated online +-- and the index built CONCURRENTLY by the standalone backfill script, after the +-- columns are populated. +-- See docs/adr/0001-data-backfills-outside-drizzle.md ALTER TABLE "recommendation" ADD COLUMN "material_id" bigint;--> statement-breakpoint ALTER TABLE "recommendation" ADD COLUMN "material_quantity" real;--> statement-breakpoint ALTER TABLE "recommendation" ADD COLUMN "material_quantity_unit" "unit_quantity";--> statement-breakpoint ALTER TABLE "recommendation" ADD COLUMN "material_depth" real;--> statement-breakpoint -ALTER TABLE "recommendation" ADD CONSTRAINT "recommendation_material_id_material_id_fk" FOREIGN KEY ("material_id") REFERENCES "public"."material"("id") ON DELETE set null ON UPDATE no action;--> statement-breakpoint -CREATE INDEX "idx_recommendation_material_id" ON "recommendation" USING btree ("material_id"); \ No newline at end of file +ALTER TABLE "recommendation" ADD CONSTRAINT "recommendation_material_id_material_id_fk" FOREIGN KEY ("material_id") REFERENCES "public"."material"("id") ON DELETE set null ON UPDATE no action NOT VALID; diff --git a/src/app/db/migrations/0225_recommendation_material_id_backfill.sql b/src/app/db/migrations/0225_recommendation_material_id_backfill.sql index 8e569ade..d1f072cd 100644 --- a/src/app/db/migrations/0225_recommendation_material_id_backfill.sql +++ b/src/app/db/migrations/0225_recommendation_material_id_backfill.sql @@ -1,26 +1,12 @@ --- Guard: fail if any recommendation maps to more than one material row. --- The modelled fabric measures never produce this; if real data violates it --- the backfill cannot pick deterministically and must not proceed silently. -DO $$ -BEGIN - IF EXISTS ( - SELECT 1 - FROM recommendation_materials - GROUP BY recommendation_id - HAVING count(*) > 1 - ) THEN - RAISE EXCEPTION - 'recommendation_material_id backfill aborted: one or more recommendations ' - 'have multiple recommendation_materials rows. Resolve cardinality before re-running.'; - END IF; -END $$; - --- Backfill all four columns in a single pass. -UPDATE recommendation r -SET material_id = rm.material_id, - material_quantity = rm.quantity, - material_quantity_unit = rm.quantity_unit, - material_depth = rm.depth -FROM recommendation_materials rm -WHERE rm.recommendation_id = r.id - AND r.material_id IS NULL; +-- Intentionally a no-op. +-- +-- The materials backfill (material_id + quantity/unit/depth) used to live here as +-- a single full-table UPDATE inside drizzle's migration transaction, with the +-- material_id index created BEFORE the backfill in 0224 — so every updated row +-- also had to maintain that index. It moved out for the same reasons as the +-- plan_id backfill (see 0223). +-- +-- The backfill now runs OUTSIDE drizzle, in committed batches, via: +-- npm run backfill:recommendation-denormalization +-- See docs/adr/0001-data-backfills-outside-drizzle.md +SELECT 1;