only get most recent version of each uploaded file per s3 key

This commit is contained in:
Daniel Roth 2026-06-30 11:02:49 +00:00
parent 861bf144ae
commit ac6983ae68

View file

@ -40,7 +40,31 @@ export async function GET(req: Request) {
.from(uploadedFiles)
.where(condition);
const documents = rows.map((row) => ({
// Deduplicate rows that arise when the ingestion Lambda re-triggers for a deal
// that was already processed: the same S3 key gets a new object version, and a
// new DB row is written each time, so `rows` may contain several rows for what
// is logically one document.
//
// Step 1: same s3FileKey → keep the row with the highest id (taken to be the
// latest insert).
const latestByS3Key = new Map<string, (typeof rows)[number]>();
for (const row of rows) {
const existing = latestByS3Key.get(row.s3FileKey);
if (!existing || row.id > existing.id) latestByS3Key.set(row.s3FileKey, row);
}
// Step 2: among the rows that survived step 1, same (fileType, measureName)
// pair → keep only the row with the highest id.
// Rows with a falsy fileType (null, undefined, or empty string — i.e.
// unclassified) skip this step and are all kept as-is.
const latestByDocType = new Map<string, (typeof rows)[number]>();
const unclassified: (typeof rows)[number][] = [];
for (const row of latestByS3Key.values()) {
if (!row.fileType) { unclassified.push(row); continue; }
// A null/undefined measureName is keyed as the empty string, so all rows of a
// given fileType without a measure name collapse into one entry.
// NOTE(review): the ":" separator could make two different pairs collide if
// fileType can itself contain ":" — confirm it cannot.
const key = `${row.fileType}:${row.measureName ?? ""}`;
const existing = latestByDocType.get(key);
if (!existing || row.id > existing.id) latestByDocType.set(key, row);
}
// Classified rows come first (in the Map's insertion order of their keys),
// followed by all unclassified rows; the original ordering of `rows` is not
// preserved.
const dedupedRows = [...latestByDocType.values(), ...unclassified];
const documents = dedupedRows.map((row) => ({
id: String(row.id),
s3FileKey: row.s3FileKey,
s3FileBucket: row.s3FileBucket,