mirror of
https://github.com/Hestia-Homes/assessment-model.git
synced 2026-06-30 12:55:02 +00:00
Merge branch 'main' into feature/magicplan-ventilation-tables
This commit is contained in:
commit
7e2be3efbd
7 changed files with 267 additions and 54 deletions
|
|
@ -73,6 +73,8 @@ See [ADR-0001](./docs/adr/0001-bulk-upload-state-machine.md) for the deliberate
|
|||
- A **BulkUpload** produces zero or more **Properties** when finalised.
|
||||
- A **BulkUpload** has at most one **Task** (the orchestration handle for the FastAPI pipeline run); a Task has many **SubTasks** (one per pipeline stage: address matching, combiner).
|
||||
- A **Portfolio** has many **VocabularyMappings** — one row per `(category, description)` it has ever encountered across all its BulkUploads. See [ADR-0002](./docs/adr/0002-landlord-override-vocabulary.md).
|
||||
- A **Recommendation** belongs to exactly one **Plan**. Denormalised onto `recommendation.plan_id`; the `plan_recommendations` join table is being retired.
|
||||
- A **Recommendation** has at most one **Material**. Denormalised onto `recommendation.material_id` (+ `material_quantity`, `material_quantity_unit`, `material_depth`). Historically (pre-~2023) a recommendation could carry multiple materials; ~128 such legacy rows were reconciled to one each on 2026-06-07. The cardinality guard in the backfill enforces this going forward.
|
||||
|
||||
### Baseline performance
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@
|
|||
"test:e2e:run": "cypress run",
|
||||
"migration:generate": "drizzle-kit generate",
|
||||
"migration:migrate": "drizzle-kit migrate",
|
||||
"create_user": "tsx src/app/db/create_user.ts"
|
||||
"create_user": "tsx src/app/db/create_user.ts",
|
||||
"backfill:recommendation-denormalization": "tsx src/app/db/backfill-recommendation-denormalization.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@aws-sdk/client-s3": "^3.971.0",
|
||||
|
|
|
|||
226
src/app/db/backfill-recommendation-denormalization.ts
Normal file
226
src/app/db/backfill-recommendation-denormalization.ts
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
/**
|
||||
* Backfills the denormalised plan_id / material_* columns on `recommendation`
|
||||
* from the plan_recommendations and recommendation_materials join tables, then
|
||||
* builds the supporting indexes and validates the foreign keys — all ONLINE.
|
||||
*
|
||||
* Why this lives outside drizzle:
|
||||
* drizzle-kit migrate wraps every pending migration in ONE transaction, so it
|
||||
* cannot COMMIT between batches, cannot run CREATE INDEX CONCURRENTLY, and
|
||||
* holds the ADD COLUMN's AccessExclusiveLock for the whole run. This script
|
||||
* instead commits each batch, keeping locks tiny, WAL/bloat bounded, EBS IO
|
||||
* burst balance healthy, and progress visible + resumable.
|
||||
* See docs/adr/0001-data-backfills-outside-drizzle.md
|
||||
*
|
||||
* Run AFTER `npm run migration:migrate` has applied 0222/0224 (the column adds):
|
||||
* npm run backfill:recommendation-denormalization
|
||||
*
|
||||
* Safe to re-run: every step is idempotent (IS NULL guards, IF NOT EXISTS,
|
||||
* VALIDATE on an already-valid constraint is a no-op). If interrupted, just run
|
||||
* it again — it resumes from wherever it left off.
|
||||
*
|
||||
* Tunables via env:
|
||||
* BACKFILL_BATCH_SIZE rows scanned per committed batch (default 25000)
|
||||
* BACKFILL_SLEEP_MS pause between batches, eases IO pressure (default 50)
|
||||
*/
|
||||
import dotenv from "dotenv";
|
||||
import { Pool, type PoolClient } from "pg";
|
||||
|
||||
dotenv.config({ path: ".env.local" });
|
||||
|
||||
const BATCH_SIZE = Number(process.env.BACKFILL_BATCH_SIZE ?? 25_000);
|
||||
const SLEEP_MS = Number(process.env.BACKFILL_SLEEP_MS ?? 50);
|
||||
|
||||
const pool = new Pool({
|
||||
host: process.env.DB_HOST,
|
||||
port: Number(process.env.DB_PORT),
|
||||
user: process.env.DB_USERNAME,
|
||||
password: process.env.DB_PASSWORD,
|
||||
database: process.env.DB_NAME,
|
||||
});
|
||||
|
||||
const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms));
|
||||
|
||||
/** Abort before touching data if the 1:1 assumption is violated. */
|
||||
async function assertSingleCardinality(
|
||||
client: PoolClient,
|
||||
joinTable: string,
|
||||
): Promise<void> {
|
||||
const { rows } = await client.query(
|
||||
`SELECT count(*)::int AS violations
|
||||
FROM (
|
||||
SELECT recommendation_id
|
||||
FROM ${joinTable}
|
||||
GROUP BY recommendation_id
|
||||
HAVING count(*) > 1
|
||||
) dupes`,
|
||||
);
|
||||
const violations = rows[0].violations as number;
|
||||
if (violations > 0) {
|
||||
throw new Error(
|
||||
`Aborting: ${violations} recommendation(s) map to multiple ${joinTable} ` +
|
||||
`rows. The backfill cannot pick deterministically. Resolve cardinality first.`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Keyset-paginate the whole `recommendation` table by id and, for each batch,
|
||||
* set the target columns from the join table where a match exists. Each batch
|
||||
* is its own autocommitted statement (no surrounding BEGIN), so locks are held
|
||||
* only for the rows in that batch.
|
||||
*
|
||||
* We scan by id (not `WHERE col IS NULL LIMIT n`) so rows with no match don't
|
||||
* get re-selected forever — they're simply left NULL and we move past them.
|
||||
*/
|
||||
async function backfillColumns(
|
||||
client: PoolClient,
|
||||
label: string,
|
||||
updateSql: (idFrom: string, limit: string) => string,
|
||||
): Promise<void> {
|
||||
let lastId = "0";
|
||||
let scannedTotal = 0;
|
||||
let updatedTotal = 0;
|
||||
const startedAt = Date.now();
|
||||
|
||||
for (;;) {
|
||||
const { rows } = await client.query(updateSql("$1", "$2"), [
|
||||
lastId,
|
||||
BATCH_SIZE,
|
||||
]);
|
||||
const { max_id, scanned, updated } = rows[0] as {
|
||||
max_id: string | null;
|
||||
scanned: number;
|
||||
updated: number;
|
||||
};
|
||||
|
||||
if (!scanned || max_id === null) break;
|
||||
|
||||
scannedTotal += Number(scanned);
|
||||
updatedTotal += Number(updated);
|
||||
lastId = max_id;
|
||||
|
||||
const rate = Math.round(scannedTotal / ((Date.now() - startedAt) / 1000));
|
||||
console.log(
|
||||
`[${label}] up to id ${lastId} — scanned ${scannedTotal.toLocaleString()}, ` +
|
||||
`updated ${updatedTotal.toLocaleString()} (${rate.toLocaleString()} rows/s scan)`,
|
||||
);
|
||||
|
||||
if (SLEEP_MS > 0) await sleep(SLEEP_MS);
|
||||
}
|
||||
|
||||
console.log(
|
||||
`[${label}] done: scanned ${scannedTotal.toLocaleString()}, ` +
|
||||
`updated ${updatedTotal.toLocaleString()} in ` +
|
||||
`${Math.round((Date.now() - startedAt) / 1000)}s`,
|
||||
);
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const client = await pool.connect();
|
||||
try {
|
||||
console.log(
|
||||
`Config: batch=${BATCH_SIZE.toLocaleString()} rows, sleep=${SLEEP_MS}ms\n`,
|
||||
);
|
||||
|
||||
// 1. Guard the 1:1 assumption before mutating anything.
|
||||
console.log("Checking cardinality...");
|
||||
await assertSingleCardinality(client, "plan_recommendations");
|
||||
await assertSingleCardinality(client, "recommendation_materials");
|
||||
|
||||
// 2. Backfill plan_id.
|
||||
await backfillColumns(
|
||||
client,
|
||||
"plan_id",
|
||||
(idFrom, limit) => `
|
||||
WITH batch AS (
|
||||
SELECT id FROM recommendation
|
||||
WHERE id > ${idFrom}
|
||||
ORDER BY id
|
||||
LIMIT ${limit}
|
||||
),
|
||||
upd AS (
|
||||
UPDATE recommendation r
|
||||
SET plan_id = pr.plan_id
|
||||
FROM batch b
|
||||
JOIN plan_recommendations pr ON pr.recommendation_id = b.id
|
||||
WHERE r.id = b.id
|
||||
AND r.plan_id IS NULL
|
||||
RETURNING r.id
|
||||
)
|
||||
SELECT (SELECT max(id) FROM batch) AS max_id,
|
||||
(SELECT count(*) FROM batch) AS scanned,
|
||||
(SELECT count(*) FROM upd) AS updated`,
|
||||
);
|
||||
|
||||
// 3. Backfill the four material_* columns in one pass.
|
||||
await backfillColumns(
|
||||
client,
|
||||
"material",
|
||||
(idFrom, limit) => `
|
||||
WITH batch AS (
|
||||
SELECT id FROM recommendation
|
||||
WHERE id > ${idFrom}
|
||||
ORDER BY id
|
||||
LIMIT ${limit}
|
||||
),
|
||||
upd AS (
|
||||
UPDATE recommendation r
|
||||
SET material_id = rm.material_id,
|
||||
material_quantity = rm.quantity,
|
||||
material_quantity_unit = rm.quantity_unit,
|
||||
material_depth = rm.depth
|
||||
FROM batch b
|
||||
JOIN recommendation_materials rm ON rm.recommendation_id = b.id
|
||||
WHERE r.id = b.id
|
||||
AND r.material_id IS NULL
|
||||
RETURNING r.id
|
||||
)
|
||||
SELECT (SELECT max(id) FROM batch) AS max_id,
|
||||
(SELECT count(*) FROM batch) AS scanned,
|
||||
(SELECT count(*) FROM upd) AS updated`,
|
||||
);
|
||||
|
||||
// 4. Build indexes CONCURRENTLY (no write lock). Must NOT be in a txn —
|
||||
// a pooled client with no BEGIN runs each statement autocommitted.
|
||||
console.log("Creating indexes concurrently...");
|
||||
await client.query(
|
||||
`CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_recommendation_plan_id
|
||||
ON recommendation USING btree (plan_id)`,
|
||||
);
|
||||
await client.query(
|
||||
`CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_recommendation_material_id
|
||||
ON recommendation USING btree (material_id)`,
|
||||
);
|
||||
|
||||
// 5. Validate the FKs online (ShareUpdateExclusiveLock — allows reads/writes).
|
||||
console.log("Validating foreign keys...");
|
||||
await client.query(
|
||||
`ALTER TABLE recommendation
|
||||
VALIDATE CONSTRAINT recommendation_plan_id_plan_id_fk`,
|
||||
);
|
||||
await client.query(
|
||||
`ALTER TABLE recommendation
|
||||
VALIDATE CONSTRAINT recommendation_material_id_material_id_fk`,
|
||||
);
|
||||
|
||||
// 6. Report any rows still unlinked (expected for material; investigate for plan).
|
||||
const { rows } = await client.query(
|
||||
`SELECT count(*) FILTER (WHERE plan_id IS NULL) AS plan_null,
|
||||
count(*) FILTER (WHERE material_id IS NULL) AS material_null
|
||||
FROM recommendation`,
|
||||
);
|
||||
console.log(
|
||||
`\nRemaining NULLs — plan_id: ${Number(rows[0].plan_null).toLocaleString()}, ` +
|
||||
`material_id: ${Number(rows[0].material_null).toLocaleString()}`,
|
||||
);
|
||||
console.log("Backfill complete.");
|
||||
} finally {
|
||||
client.release();
|
||||
await pool.end();
|
||||
}
|
||||
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error("Backfill failed:", err);
|
||||
process.exit(1);
|
||||
});
|
||||
|
|
@ -1,3 +1,7 @@
|
|||
-- Metadata-only DDL: instant, holds AccessExclusiveLock on "recommendation"
|
||||
-- only momentarily. The FK is added NOT VALID so no full-table validation scan
|
||||
-- happens here; it is validated online by the standalone backfill script. The
|
||||
-- index is created CONCURRENTLY by that script too, after the column is filled.
|
||||
-- See docs/adr/0001-data-backfills-outside-drizzle.md
|
||||
ALTER TABLE "recommendation" ADD COLUMN "plan_id" bigint;--> statement-breakpoint
|
||||
ALTER TABLE "recommendation" ADD CONSTRAINT "recommendation_plan_id_plan_id_fk" FOREIGN KEY ("plan_id") REFERENCES "public"."plan"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
CREATE INDEX "idx_recommendation_plan_id" ON "recommendation" USING btree ("plan_id");
|
||||
ALTER TABLE "recommendation" ADD CONSTRAINT "recommendation_plan_id_plan_id_fk" FOREIGN KEY ("plan_id") REFERENCES "public"."plan"("id") ON DELETE cascade ON UPDATE no action NOT VALID;
|
||||
|
|
|
|||
|
|
@ -1,23 +1,13 @@
|
|||
-- Guard: fail if any recommendation is linked to more than one plan.
|
||||
-- The spec asserts each recommendation belongs to exactly one plan; if real
|
||||
-- data violates that the backfill cannot pick deterministically and must not
|
||||
-- proceed silently.
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (
|
||||
SELECT 1
|
||||
FROM plan_recommendations
|
||||
GROUP BY recommendation_id
|
||||
HAVING count(*) > 1
|
||||
) THEN
|
||||
RAISE EXCEPTION
|
||||
'plan_id backfill aborted: one or more recommendations appear in multiple '
|
||||
'plan_recommendations rows. Resolve cardinality before re-running.';
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
UPDATE recommendation r
|
||||
SET plan_id = pr.plan_id
|
||||
FROM plan_recommendations pr
|
||||
WHERE pr.recommendation_id = r.id
|
||||
AND r.plan_id IS NULL;
|
||||
-- Intentionally a no-op.
|
||||
--
|
||||
-- The plan_id backfill used to live here as a single full-table UPDATE inside
|
||||
-- drizzle's migration transaction. Because drizzle wraps ALL pending migrations
|
||||
-- in one transaction, that UPDATE held an AccessExclusiveLock on "recommendation"
|
||||
-- (from the ADD COLUMN in 0222) for the entire multi-hour run, blocked unrelated
|
||||
-- migrations behind it, exhausted EBS IO burst balance, and could not report
|
||||
-- progress or be resumed.
|
||||
--
|
||||
-- The backfill now runs OUTSIDE drizzle, in committed batches, via:
|
||||
-- npm run backfill:recommendation-denormalization
|
||||
-- See docs/adr/0001-data-backfills-outside-drizzle.md
|
||||
SELECT 1;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
-- Metadata-only DDL: adding nullable columns with no default is an instant
|
||||
-- catalog change. The FK is added NOT VALID (no scan); it is validated online
|
||||
-- and the index built CONCURRENTLY by the standalone backfill script, after the
|
||||
-- columns are populated.
|
||||
-- See docs/adr/0001-data-backfills-outside-drizzle.md
|
||||
ALTER TABLE "recommendation" ADD COLUMN "material_id" bigint;--> statement-breakpoint
|
||||
ALTER TABLE "recommendation" ADD COLUMN "material_quantity" real;--> statement-breakpoint
|
||||
ALTER TABLE "recommendation" ADD COLUMN "material_quantity_unit" "unit_quantity";--> statement-breakpoint
|
||||
ALTER TABLE "recommendation" ADD COLUMN "material_depth" real;--> statement-breakpoint
|
||||
ALTER TABLE "recommendation" ADD CONSTRAINT "recommendation_material_id_material_id_fk" FOREIGN KEY ("material_id") REFERENCES "public"."material"("id") ON DELETE set null ON UPDATE no action;--> statement-breakpoint
|
||||
CREATE INDEX "idx_recommendation_material_id" ON "recommendation" USING btree ("material_id");
|
||||
ALTER TABLE "recommendation" ADD CONSTRAINT "recommendation_material_id_material_id_fk" FOREIGN KEY ("material_id") REFERENCES "public"."material"("id") ON DELETE set null ON UPDATE no action NOT VALID;
|
||||
|
|
|
|||
|
|
@ -1,26 +1,12 @@
|
|||
-- Guard: fail if any recommendation maps to more than one material row.
|
||||
-- The modelled fabric measures never produce this; if real data violates it
|
||||
-- the backfill cannot pick deterministically and must not proceed silently.
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (
|
||||
SELECT 1
|
||||
FROM recommendation_materials
|
||||
GROUP BY recommendation_id
|
||||
HAVING count(*) > 1
|
||||
) THEN
|
||||
RAISE EXCEPTION
|
||||
'recommendation_material_id backfill aborted: one or more recommendations '
|
||||
'have multiple recommendation_materials rows. Resolve cardinality before re-running.';
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
-- Backfill all four columns in a single pass.
|
||||
UPDATE recommendation r
|
||||
SET material_id = rm.material_id,
|
||||
material_quantity = rm.quantity,
|
||||
material_quantity_unit = rm.quantity_unit,
|
||||
material_depth = rm.depth
|
||||
FROM recommendation_materials rm
|
||||
WHERE rm.recommendation_id = r.id
|
||||
AND r.material_id IS NULL;
|
||||
-- Intentionally a no-op.
|
||||
--
|
||||
-- The materials backfill (material_id + quantity/unit/depth) used to live here as
|
||||
-- a single full-table UPDATE inside drizzle's migration transaction, with the
|
||||
-- material_id index created BEFORE the backfill in 0224 — so every updated row
|
||||
-- also had to maintain that index. It moved out for the same reasons as the
|
||||
-- plan_id backfill (see 0223).
|
||||
--
|
||||
-- The backfill now runs OUTSIDE drizzle, in committed batches, via:
|
||||
-- npm run backfill:recommendation-denormalization
|
||||
-- See docs/adr/0001-data-backfills-outside-drizzle.md
|
||||
SELECT 1;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue