diff --git a/README.md b/README.md index f9beae2..0e28684 100644 --- a/README.md +++ b/README.md @@ -352,3 +352,25 @@ Set `check_mutable_updates: true` to send `HEAD` with `If-None-Match` / `If-Modi ### Stale-on-upstream-error When a mutable file expires and the upstream is unreachable (connection refused, DNS failure, timeout), the cached copy is kept and its TTL refreshed. HTTP error responses (4xx, 5xx) are not treated as network failures and proceed with normal expiry. + +### Quarantine (supply-chain protection) + +Set `quarantine_new: true` and `quarantine_days: N` on a remote to block immutable artifacts published within the last N days. Requests return `404` until the quarantine period expires, giving time to detect malicious packages before they are consumed. + +```yaml +remotes: + pypi: + base_url: "https://files.pythonhosted.org" + type: "remote" + package: "pypi" + quarantine_new: true + quarantine_days: 3 # block packages published in the last 3 days + immutable_patterns: + - "packages/.*\\.whl$" + - "packages/.*\\.tar\\.gz$" + cache: + immutable_ttl: 0 + mutable_ttl: 600 +``` + +The upstream `Last-Modified` response header is used as the publish date proxy. Artifacts that have no `Last-Modified` header are allowed through (fail-open). Mutable files (index pages, tag manifests) are never quarantined. diff --git a/remotes.yaml b/remotes.yaml index f0a5836..a82b676 100644 --- a/remotes.yaml +++ b/remotes.yaml @@ -9,6 +9,13 @@ # immutable_ttl: TTL for immutable files (0 = forever, rarely needed to change). # mutable_ttl: TTL in seconds for mutable files. Omit to use the default (3600). # +# quarantine_new: Set to true to block immutable artifacts published within the last +# quarantine_days days. Requests return 404 until the quarantine period +# expires. Fails open when the publish date cannot be determined. +# quarantine_days: Number of days to quarantine newly published artifacts (requires +# quarantine_new: true). 
The upstream Last-Modified header is used as +# the publish date. +# # WARNING: this file may contain credentials — do not commit real values. # # Global configuration @@ -202,6 +209,11 @@ remotes: # simple/ requests are transparently fetched from pypi.org; package files come from # files.pythonhosted.org (base_url). URLs in the simple index are rewritten to this remote. check_mutable_updates: true + # Block packages published within the last 3 days (supply-chain attack mitigation). + # Immutable artifacts (wheel/sdist) newer than quarantine_days return 404 until + # the window passes. Disable by setting quarantine_new: false or removing both keys. + quarantine_new: true + quarantine_days: 3 immutable_patterns: - "packages/.*\\.whl$" - "packages/.*\\.whl\\.metadata$" diff --git a/src/artifactapi/artifact/docker.py b/src/artifactapi/artifact/docker.py index 5e3f4e5..ba40612 100644 --- a/src/artifactapi/artifact/docker.py +++ b/src/artifactapi/artifact/docker.py @@ -59,6 +59,18 @@ async def proxy(request: Request, remote_name: str, path: str, storage, cache, c logger.info(f"Mutable file cached with TTL: {remote_name}/{path} (ttl: {mutable_ttl}s)") if result.get("etag") or result.get("last_modified"): cache.store_mutable_meta(remote_name, path, result.get("etag"), result.get("last_modified")) + if not is_mutable: + published = result.get("last_modified") + if published: + cache.store_artifact_published(remote_name, path, published) + _proxy._check_quarantine(remote_name, published, config) + elif not is_mutable: + published = cache.get_artifact_published(remote_name, path) + if not published: + published = await _proxy._fetch_last_modified(remote_url, remote_config) + if published: + cache.store_artifact_published(remote_name, path, published) + _proxy._check_quarantine(remote_name, published, config) artifact_data = storage.download_object(storage.get_object_key(remote_name, path)) diff --git a/src/artifactapi/artifact/proxy.py b/src/artifactapi/artifact/proxy.py 
index 9741736..c3506a6 100644
--- a/src/artifactapi/artifact/proxy.py
+++ b/src/artifactapi/artifact/proxy.py
@@ -2,6 +2,8 @@ import base64
 import logging
 import os
 import re
+from datetime import UTC, datetime, timedelta
+from email.utils import parsedate_to_datetime
 
 import httpx
 from fastapi import HTTPException, Request, Response
@@ -19,6 +21,60 @@ class UpstreamUnreachable(Exception):
     """Raised when the upstream backend cannot be contacted (network or timeout error)."""
 
 
+def _check_quarantine(remote_name: str, last_modified_str: str | None, config) -> None:
+    """Raise HTTP 404 if the artifact is within the per-remote quarantine window.
+
+    Fails open (allows the request) when quarantine is disabled for the remote or
+    when the publish date is missing or cannot be parsed.
+
+    Args:
+        remote_name: Remote whose settings are read via config.get_quarantine_config.
+        last_modified_str: Upstream Last-Modified value used as the publish date.
+        config: Object exposing get_quarantine_config(remote_name) -> (enabled, days).
+
+    Raises:
+        HTTPException: 404 when the publish date is newer than the quarantine cutoff.
+    """
+    enabled, days = config.get_quarantine_config(remote_name)
+    if not enabled or not days:
+        return
+    if not last_modified_str:
+        return  # cannot determine age → allow
+    try:
+        publish_date = parsedate_to_datetime(last_modified_str)
+    except Exception:
+        return  # unparseable → allow
+    if publish_date.tzinfo is None:
+        # A "-0000" or missing zone parses to a naive datetime; treat it as UTC so the
+        # comparison with the aware cutoff below cannot raise TypeError.
+        publish_date = publish_date.replace(tzinfo=UTC)
+    cutoff = datetime.now(UTC) - timedelta(days=days)
+    if publish_date > cutoff:
+        available_on = (publish_date + timedelta(days=days)).date()
+        raise HTTPException(
+            status_code=404,
+            detail=(
+                f"Package quarantined: published {publish_date.date()}, available after {available_on} ({days}-day new-release quarantine)"
+            ),
+        )
+
+
+async def _fetch_last_modified(remote_url: str, remote_cfg: dict) -> str | None:
+    """HEAD the upstream URL and return the Last-Modified header, or None on any failure.
+
+    Redirects are followed, headers from _basic_auth_header(remote_cfg) are sent, and
+    the header is returned whatever the response status code is.
+    """
+    auth = _basic_auth_header(remote_cfg)
+    try:
+        async with httpx.AsyncClient(follow_redirects=True) as client:
+            response = await client.head(remote_url, headers=auth, timeout=10.0)
+            return response.headers.get("Last-Modified")
+    except Exception:
+        # Best-effort: None means "publish date unknown" to _check_quarantine.
+        return None
+
+
 def _basic_auth_header(remote_cfg: dict) -> dict[str, str]:
     username = remote_cfg.get("username")
     password = remote_cfg.get("password")
@@ -225,6 +281,14 @@ async def handle(request: Request, remote_name:
str, path: str, storage, cache, cached_key = None if cached_key: + if not is_mutable: + published = cache.get_artifact_published(remote_name, path) + if not published: + published = await _fetch_last_modified(remote_url, remote_config) + if published: + cache.store_artifact_published(remote_name, path, published) + _check_quarantine(remote_name, published, config) + try: artifact_data = storage.download_object(cached_key) artifact_data, content_type = _resolve_content(artifact_data, path, filename, remote_config, request, remote_name) @@ -240,6 +286,8 @@ async def handle(request: Request, remote_name: str, path: str, storage, cache, "X-Artifact-Size": str(len(artifact_data)), }, ) + except HTTPException: + raise except Exception as e: raise HTTPException(status_code=500, detail=f"Error retrieving cached artifact: {str(e)}") @@ -258,6 +306,12 @@ async def handle(request: Request, remote_name: str, path: str, storage, cache, if result.get("etag") or result.get("last_modified"): cache.store_mutable_meta(remote_name, path, result.get("etag"), result.get("last_modified")) + if not is_mutable: + published = result.get("last_modified") + if published: + cache.store_artifact_published(remote_name, path, published) + _check_quarantine(remote_name, published, config) + try: cache_key = storage.get_object_key(remote_name, path) artifact_data = storage.download_object(cache_key) diff --git a/src/artifactapi/cache/redis.py b/src/artifactapi/cache/redis.py index 9d22a3b..a3012ae 100644 --- a/src/artifactapi/cache/redis.py +++ b/src/artifactapi/cache/redis.py @@ -78,6 +78,27 @@ class RedisCache: except Exception: pass + def get_artifact_published_key(self, remote_name: str, path: str) -> str: + return f"pkg:published:{remote_name}:{hashlib.sha256(path.encode()).hexdigest()[:16]}" + + def store_artifact_published(self, remote_name: str, path: str, last_modified: str) -> None: + """Persist the upstream Last-Modified header for a (typically immutable) artifact.""" + if not 
self.available: + return + try: + self.client.set(self.get_artifact_published_key(remote_name, path), last_modified) + except Exception: + pass + + def get_artifact_published(self, remote_name: str, path: str) -> str | None: + """Return the stored Last-Modified string for an artifact, or None.""" + if not self.available: + return None + try: + return self.client.get(self.get_artifact_published_key(remote_name, path)) + except Exception: + return None + def cleanup_expired_index(self, storage, remote_name: str, path: str) -> None: if not self.available: return diff --git a/src/artifactapi/config.py b/src/artifactapi/config.py index ab1d902..3c83f27 100644 --- a/src/artifactapi/config.py +++ b/src/artifactapi/config.py @@ -159,3 +159,16 @@ class ConfigManager: return {} return remote_config.get("cache", {}) + + def get_quarantine_config(self, remote_name: str) -> tuple[bool, int]: + """Return (enabled, quarantine_days) for a remote. + + When enabled=True and quarantine_days>0, immutable artifacts published + within the last quarantine_days days are blocked with a 404. 
+ """ + remote_config = self.get_remote_config(remote_name) + if not remote_config: + return False, 0 + enabled = bool(remote_config.get("quarantine_new", False)) + days = int(remote_config.get("quarantine_days", 0)) + return enabled, days diff --git a/tests/conftest.py b/tests/conftest.py index 1fada7e..1178062 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -98,6 +98,24 @@ TEST_REMOTES = { "immutable_patterns": [r"\.tgz$"], "cache": {"immutable_ttl": 0, "mutable_ttl": 3600}, }, + "quarantine-test": { + "base_url": "https://releases.example.com", + "type": "remote", + "package": "generic", + "immutable_patterns": [r".*\.tar\.gz$"], + "quarantine_new": True, + "quarantine_days": 3, + "cache": {"immutable_ttl": 0, "mutable_ttl": 0}, + }, + "quarantine-disabled": { + "base_url": "https://releases.example.com", + "type": "remote", + "package": "generic", + "immutable_patterns": [r".*\.tar\.gz$"], + "quarantine_new": False, + "quarantine_days": 3, + "cache": {"immutable_ttl": 0, "mutable_ttl": 0}, + }, } } diff --git a/tests/test_cache.py b/tests/test_cache.py index 8308e0d..d30169c 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -283,3 +283,47 @@ class TestMutableMeta: def test_delete_no_op_when_unavailable(self, unavailable_cache): unavailable_cache.delete_mutable_meta("remote", "path") # must not raise + + +# --------------------------------------------------------------------------- +# artifact published date (quarantine support) +# --------------------------------------------------------------------------- + + +class TestArtifactPublished: + def test_key_format_is_deterministic(self, bare_cache): + path = "some/path/package-1.0.tar.gz" + expected_hash = hashlib.sha256(path.encode()).hexdigest()[:16] + assert bare_cache.get_artifact_published_key("myremote", path) == f"pkg:published:myremote:{expected_hash}" + + def test_key_hash_is_16_chars(self, bare_cache): + key = bare_cache.get_artifact_published_key("remote", "path/to/file.whl") + 
assert len(key.split(":")[-1]) == 16 + + def test_different_paths_produce_different_keys(self, bare_cache): + k1 = bare_cache.get_artifact_published_key("remote", "pkg-1.0.tar.gz") + k2 = bare_cache.get_artifact_published_key("remote", "pkg-2.0.tar.gz") + assert k1 != k2 + + def test_store_calls_set_with_correct_value(self, cache_with_redis, mock_redis_client): + lm = "Mon, 01 Jan 2024 00:00:00 GMT" + cache_with_redis.store_artifact_published("remote", "path/pkg.tar.gz", lm) + expected_key = cache_with_redis.get_artifact_published_key("remote", "path/pkg.tar.gz") + mock_redis_client.set.assert_called_once_with(expected_key, lm) + + def test_get_returns_stored_value(self, cache_with_redis, mock_redis_client): + lm = "Tue, 15 Mar 2022 12:00:00 GMT" + mock_redis_client.get.return_value = lm + result = cache_with_redis.get_artifact_published("remote", "path/pkg.tar.gz") + assert result == lm + + def test_get_returns_none_when_not_stored(self, cache_with_redis, mock_redis_client): + mock_redis_client.get.return_value = None + result = cache_with_redis.get_artifact_published("remote", "path/pkg.tar.gz") + assert result is None + + def test_store_no_op_when_unavailable(self, unavailable_cache): + unavailable_cache.store_artifact_published("remote", "path", "Mon, 01 Jan 2024 00:00:00 GMT") + + def test_get_returns_none_when_unavailable(self, unavailable_cache): + assert unavailable_cache.get_artifact_published("remote", "path") is None diff --git a/tests/test_config.py b/tests/test_config.py index 594c6cf..344627c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -351,3 +351,70 @@ class TestConfigReload: cfg._check_reload() assert "repo-a" in cfg.config["remotes"] + + +# --------------------------------------------------------------------------- +# get_quarantine_config +# --------------------------------------------------------------------------- + + +class TestGetQuarantineConfig: + def test_returns_false_zero_when_not_configured(self, make_config): + cfg = 
make_config({"r": {"type": "remote", "package": "generic", "base_url": "https://x.com"}}) + enabled, days = cfg.get_quarantine_config("r") + assert enabled is False + assert days == 0 + + def test_returns_false_zero_for_missing_remote(self, make_config): + cfg = make_config({}) + enabled, days = cfg.get_quarantine_config("nonexistent") + assert enabled is False + assert days == 0 + + def test_enabled_true_and_days_returned(self, make_config): + cfg = make_config( + { + "r": { + "type": "remote", + "package": "generic", + "base_url": "https://x.com", + "quarantine_new": True, + "quarantine_days": 7, + } + } + ) + enabled, days = cfg.get_quarantine_config("r") + assert enabled is True + assert days == 7 + + def test_quarantine_new_false_returns_disabled(self, make_config): + cfg = make_config( + { + "r": { + "type": "remote", + "package": "generic", + "base_url": "https://x.com", + "quarantine_new": False, + "quarantine_days": 7, + } + } + ) + enabled, days = cfg.get_quarantine_config("r") + assert enabled is False + assert days == 7 + + def test_enabled_with_zero_days_returns_zero(self, make_config): + cfg = make_config( + { + "r": { + "type": "remote", + "package": "generic", + "base_url": "https://x.com", + "quarantine_new": True, + "quarantine_days": 0, + } + } + ) + enabled, days = cfg.get_quarantine_config("r") + assert enabled is True + assert days == 0 diff --git a/tests/test_routes.py b/tests/test_routes.py index f755c77..bf9a1db 100644 --- a/tests/test_routes.py +++ b/tests/test_routes.py @@ -2,6 +2,7 @@ import hashlib import json +from datetime import UTC from unittest.mock import ANY, AsyncMock, MagicMock, patch import pytest @@ -924,3 +925,153 @@ class TestHelmRemote: response = client.get("/api/v1/remote/helm-test/vault.zip") assert response.status_code == 403 + + +# --------------------------------------------------------------------------- +# Quarantine (quarantine-test remote: quarantine_new=True, quarantine_days=3) +# 
--------------------------------------------------------------------------- + + +class TestQuarantine: + def _recent_date(self, days_ago=1): + """Return an HTTP-format date string N days in the past (within quarantine window).""" + from datetime import datetime, timedelta + from email.utils import format_datetime + + dt = datetime.now(UTC) - timedelta(days=days_ago) + return format_datetime(dt, usegmt=True) + + def _old_date(self, days_ago=10): + """Return an HTTP-format date string N days in the past (outside quarantine window).""" + from datetime import datetime, timedelta + from email.utils import format_datetime + + dt = datetime.now(UTC) - timedelta(days=days_ago) + return format_datetime(dt, usegmt=True) + + def test_cache_miss_recent_artifact_quarantined(self, client, patched_deps): + """Cache miss: artifact published within quarantine window → 404.""" + deps = patched_deps + deps["storage"].exists.return_value = False + deps["storage"].download_object.return_value = b"content" + deps["cache"].is_mutable_file.return_value = False + + with patch( + "artifactapi.artifact.proxy.cache_single_artifact", + new_callable=AsyncMock, + return_value={"status": "cached", "last_modified": self._recent_date()}, + ): + response = client.get("/api/v1/remote/quarantine-test/some/path/package-1.0.tar.gz") + + assert response.status_code == 404 + assert "quarantined" in response.json()["detail"].lower() + + def test_cache_miss_old_artifact_allowed(self, client, patched_deps): + """Cache miss: artifact published outside quarantine window → 200.""" + deps = patched_deps + deps["storage"].exists.return_value = False + deps["storage"].download_object.return_value = b"content" + deps["cache"].is_mutable_file.return_value = False + + with patch( + "artifactapi.artifact.proxy.cache_single_artifact", + new_callable=AsyncMock, + return_value={"status": "cached", "last_modified": self._old_date()}, + ): + response = 
client.get("/api/v1/remote/quarantine-test/some/path/package-1.0.tar.gz") + + assert response.status_code == 200 + + def test_cache_miss_no_last_modified_fails_open(self, client, patched_deps): + """Cache miss: no Last-Modified header → fail open (200, not quarantined).""" + deps = patched_deps + deps["storage"].exists.return_value = False + deps["storage"].download_object.return_value = b"content" + deps["cache"].is_mutable_file.return_value = False + + with patch( + "artifactapi.artifact.proxy.cache_single_artifact", + new_callable=AsyncMock, + return_value={"status": "cached", "last_modified": None}, + ): + response = client.get("/api/v1/remote/quarantine-test/some/path/package-1.0.tar.gz") + + assert response.status_code == 200 + + def test_cache_hit_recent_artifact_quarantined(self, client, patched_deps): + """Cache hit: stored publish date within quarantine window → 404.""" + deps = patched_deps + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = b"content" + deps["cache"].is_mutable_file.return_value = False + deps["cache"].get_artifact_published.return_value = self._recent_date() + + response = client.get("/api/v1/remote/quarantine-test/some/path/package-1.0.tar.gz") + + assert response.status_code == 404 + assert "quarantined" in response.json()["detail"].lower() + + def test_cache_hit_old_artifact_allowed(self, client, patched_deps): + """Cache hit: stored publish date outside quarantine window → 200.""" + deps = patched_deps + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = b"content" + deps["cache"].is_mutable_file.return_value = False + deps["cache"].get_artifact_published.return_value = self._old_date() + + response = client.get("/api/v1/remote/quarantine-test/some/path/package-1.0.tar.gz") + + assert response.status_code == 200 + + def test_cache_hit_no_stored_date_fetches_upstream(self, client, patched_deps): + """Cache hit: no stored date → HEAD upstream to get 
Last-Modified.""" + deps = patched_deps + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = b"content" + deps["cache"].is_mutable_file.return_value = False + deps["cache"].get_artifact_published.return_value = None + + with patch( + "artifactapi.artifact.proxy._fetch_last_modified", + new_callable=AsyncMock, + return_value=self._old_date(), + ) as mock_fetch: + response = client.get("/api/v1/remote/quarantine-test/some/path/package-1.0.tar.gz") + + mock_fetch.assert_called_once() + assert response.status_code == 200 + + def test_quarantine_disabled_allows_recent_artifact(self, client, patched_deps): + """quarantine_new=False: recent artifacts are not blocked.""" + deps = patched_deps + deps["storage"].exists.return_value = False + deps["storage"].download_object.return_value = b"content" + deps["cache"].is_mutable_file.return_value = False + + with patch( + "artifactapi.artifact.proxy.cache_single_artifact", + new_callable=AsyncMock, + return_value={"status": "cached", "last_modified": self._recent_date()}, + ): + response = client.get("/api/v1/remote/quarantine-disabled/some/path/package-1.0.tar.gz") + + assert response.status_code == 200 + + def test_quarantine_detail_includes_available_date(self, client, patched_deps): + """The 404 detail should include the date when the artifact becomes available.""" + deps = patched_deps + deps["storage"].exists.return_value = False + deps["storage"].download_object.return_value = b"content" + deps["cache"].is_mutable_file.return_value = False + + with patch( + "artifactapi.artifact.proxy.cache_single_artifact", + new_callable=AsyncMock, + return_value={"status": "cached", "last_modified": self._recent_date()}, + ): + response = client.get("/api/v1/remote/quarantine-test/some/path/package-1.0.tar.gz") + + assert response.status_code == 404 + detail = response.json()["detail"] + assert "available after" in detail + assert "3-day" in detail