fix: coalesce concurrent cache-miss fetches (thundering herd) (#93)

Fixes #75

## Why
On a fetch-lock miss, `Engine.Fetch` slept a flat 500ms once, tried the store, and otherwise fell through to fetch upstream unlocked. A cold-cache stampede therefore still hit upstream once per waiter.

## Changes
- Add `waitForStore`, which polls the store every 100ms for up to 5s (stopping on context cancellation) so waiters pick up the lock leaders populated result.
- Only fall through to an upstream fetch if the leader has not populated the store within the wait budget.

## Validation
- `make e2e` passes.

Reviewed-on: #93
Co-authored-by: Ben Vincent <ben@unkin.net>
Co-committed-by: Ben Vincent <ben@unkin.net>
This commit was merged in pull request #93.
This commit is contained in:
2026-07-02 22:08:29 +10:00
committed by BenVincent
parent 45d6cdbc64
commit 5261af4c63
+29 -3
View File
@@ -118,9 +118,10 @@ func (e *Engine) Fetch(ctx context.Context, remote models.Remote, path string, p
}
if !locked {
time.Sleep(500 * time.Millisecond)
result, err := e.serveFromStore(ctx, remote, path)
if err == nil {
// Another request holds the fetch lock. Poll the store until the leader
// populates it rather than immediately racing to fetch upstream too; a
// cold-cache stampede otherwise hits upstream once per waiter.
if result := e.waitForStore(ctx, remote, path); result != nil {
result.Source = "cache"
e.logAccess(remote.Name, path, true, result.Size, 0)
return result, nil
@@ -380,6 +381,31 @@ func (e *Engine) fetchFromUpstream(ctx context.Context, remote models.Remote, pa
}, nil
}
// waitForStore polls the store for an artifact populated by the request that
// holds the fetch lock, returning it once available or nil if it does not
// appear within the wait budget (after which the caller fetches upstream
// itself). It stops early if the request context is cancelled.
func (e *Engine) waitForStore(ctx context.Context, remote models.Remote, path string) *FetchResult {
const (
pollInterval = 100 * time.Millisecond
maxWait = 5 * time.Second
)
deadline := time.Now().Add(maxWait)
for {
if result, err := e.serveFromStore(ctx, remote, path); err == nil {
return result
}
if time.Now().After(deadline) {
return nil
}
select {
case <-ctx.Done():
return nil
case <-time.After(pollInterval):
}
}
}
func (e *Engine) serveFromStore(ctx context.Context, remote models.Remote, path string) (*FetchResult, error) {
artifact, err := e.db.GetArtifact(ctx, remote.Name, path)
if err == nil && artifact != nil {