From 5261af4c63d85015a429e587e3af597729d2dad5 Mon Sep 17 00:00:00 2001 From: Ben Vincent Date: Thu, 2 Jul 2026 22:08:29 +1000 Subject: [PATCH] fix: coalesce concurrent cache-miss fetches (thundering herd) (#93) Fixes #75 ## Why On a fetch-lock miss, `Engine.Fetch` slept a flat 500ms once, tried the store, and otherwise fell through to fetch upstream unlocked. A cold-cache stampede therefore still hit upstream once per waiter. ## Changes - Add `waitForStore`, which polls the store every 100ms for up to 5s (stopping on context cancellation) so waiters pick up the lock leaders populated result. - Only fall through to an upstream fetch if the leader has not populated the store within the wait budget. ## Validation - `make e2e` passes. Reviewed-on: https://git.unkin.net/unkin/artifactapi/pulls/93 Co-authored-by: Ben Vincent Co-committed-by: Ben Vincent --- internal/proxy/engine.go | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/internal/proxy/engine.go b/internal/proxy/engine.go index 4c17cea..47ecafc 100644 --- a/internal/proxy/engine.go +++ b/internal/proxy/engine.go @@ -118,9 +118,10 @@ func (e *Engine) Fetch(ctx context.Context, remote models.Remote, path string, p } if !locked { - time.Sleep(500 * time.Millisecond) - result, err := e.serveFromStore(ctx, remote, path) - if err == nil { + // Another request holds the fetch lock. Poll the store until the leader + // populates it rather than immediately racing to fetch upstream too; a + // cold-cache stampede otherwise hits upstream once per waiter. + if result := e.waitForStore(ctx, remote, path); result != nil { result.Source = "cache" e.logAccess(remote.Name, path, true, result.Size, 0) return result, nil @@ -380,6 +381,31 @@ func (e *Engine) fetchFromUpstream(ctx context.Context, remote models.Remote, pa }, nil } +// waitForStore polls the store for an artifact populated by the request that +// holds the fetch lock, returning it once available or nil if it does not +// appear within the wait budget (after which the caller fetches upstream +// itself). It stops early if the request context is cancelled. +func (e *Engine) waitForStore(ctx context.Context, remote models.Remote, path string) *FetchResult { + const ( + pollInterval = 100 * time.Millisecond + maxWait = 5 * time.Second + ) + deadline := time.Now().Add(maxWait) + for { + if result, err := e.serveFromStore(ctx, remote, path); err == nil { + return result + } + if time.Now().After(deadline) { + return nil + } + select { + case <-ctx.Done(): + return nil + case <-time.After(pollInterval): + } + } +} + func (e *Engine) serveFromStore(ctx context.Context, remote models.Remote, path string) (*FetchResult, error) { artifact, err := e.db.GetArtifact(ctx, remote.Name, path) if err == nil && artifact != nil {