From e7c9387bccaa58ec0f8eb7058c79b6d53594679c Mon Sep 17 00:00:00 2001 From: Ben Vincent Date: Thu, 2 Jul 2026 22:43:18 +1000 Subject: [PATCH] fix: GC has no grace period (TOCTOU with dedup uploads) (#86) Fixes #71 ## Why `FindOrphanedBlobs` returned any blob not currently referenced. Because CAS dedups (the blob row can exist before its artifact/local_files row is written), a concurrent upload reusing an existing hash could have its S3 object deleted mid-flight by the GC. ## Changes - `FindOrphanedBlobs` now takes a `minAge` and only returns blobs with `created_at < now()-minAge`. - The collector passes a 1h `blobGracePeriod`. ## Validation - `go test ./internal/gc/...` and `make e2e` pass. --------- Co-authored-by: BenVincent Reviewed-on: https://git.unkin.net/unkin/artifactapi/pulls/86 Co-authored-by: Ben Vincent Co-committed-by: Ben Vincent --- internal/database/artifacts.go | 12 +++++++++--- internal/gc/gc.go | 7 ++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/internal/database/artifacts.go b/internal/database/artifacts.go index 5d73c15..ab1046f 100644 --- a/internal/database/artifacts.go +++ b/internal/database/artifacts.go @@ -138,16 +138,22 @@ func (db *DB) InsertAccessLogBatch(ctx context.Context, entries []AccessLogEntry return err } -func (db *DB) FindOrphanedBlobs(ctx context.Context) ([]models.Blob, error) { +// FindOrphanedBlobs returns blobs no longer referenced by any artifact or +// local file, restricted to those created before now()-minAge. The age cutoff +// is a grace period that avoids a TOCTOU race with in-flight dedup uploads, +// which insert the blob row before the referencing artifact/local_files row. +func (db *DB) FindOrphanedBlobs(ctx context.Context, minAge time.Duration) ([]models.Blob, error) { + cutoff := time.Now().Add(-minAge) rows, err := db.Pool.Query(ctx, ` SELECT b.content_hash, b.s3_key, b.size_bytes, b.content_type, b.created_at FROM blobs b - WHERE b.content_hash NOT IN ( + WHERE b.created_at < $1 + AND b.content_hash NOT IN ( SELECT content_hash FROM artifacts UNION SELECT content_hash FROM local_files ) - `) + `, cutoff) if err != nil { return nil, err } diff --git a/internal/gc/gc.go b/internal/gc/gc.go index d024334..fbf0e36 100644 --- a/internal/gc/gc.go +++ b/internal/gc/gc.go @@ -9,6 +9,11 @@ import ( "git.unkin.net/unkin/artifactapi/internal/storage" ) +// blobGracePeriod is how old an orphaned blob must be before GC will delete +// it. This avoids racing in-flight dedup uploads that insert the blob row +// before the referencing artifact/local_files row exists. +const blobGracePeriod = 1 * time.Hour + type Collector struct { db *database.DB store *storage.S3 @@ -38,7 +43,7 @@ func (c *Collector) Run(ctx context.Context) { func (c *Collector) sweep(ctx context.Context) { start := time.Now() - orphaned, err := c.db.FindOrphanedBlobs(ctx) + orphaned, err := c.db.FindOrphanedBlobs(ctx, blobGracePeriod) if err != nil { slog.Error("gc: find orphaned blobs", "error", err) return