diff --git a/remotes.yaml b/remotes.yaml index 959849f..d312461 100644 --- a/remotes.yaml +++ b/remotes.yaml @@ -70,6 +70,10 @@ remotes: # Branch archives can change on every push - ".*/archive/refs/heads/main\\.tar\\.gz$" - ".*/archive/refs/heads/master\\.tar\\.gz$" + # Before re-downloading an expired branch archive, check whether it has + # actually changed (304 Not Modified → just refresh the TTL, no transfer). + # Only applies to user-defined mutable_patterns, not package-type defaults. + check_mutable_updates: true cache: immutable_ttl: 0 # Tag archives cached indefinitely mutable_ttl: 86400 # Branch archives refreshed after 1 day @@ -113,6 +117,9 @@ remotes: description: "Alpine Linux APK package repository" immutable_patterns: - ".*/x86_64/.*\\.apk$" + # check_mutable_updates not set: APKINDEX.tar.gz is a package-type default + # and is always re-fetched on expiry — conditional checks are skipped for + # built-in mutable patterns regardless of this flag. cache: immutable_ttl: 0 # Files cached indefinitely mutable_ttl: 7200 # Index files (APKINDEX.tar.gz) cached for 2 hours @@ -127,9 +134,12 @@ remotes: - ".*/noarch/.*\\.rpm$" - ".*/repodata/.*$" - ".*\\.rpm$" # Allow all RPM files + # repomd.xml / repodata are package-type defaults — always re-fetched on + # expiry. check_mutable_updates would only apply to any custom + # mutable_patterns added here. cache: immutable_ttl: 0 # Files cached indefinitely - mutable_ttl: 7200 # Metadata files cached for 1 hour + mutable_ttl: 7200 # Metadata files cached for 2 hours epel: base_url: "http://mirror.aarnet.edu.au/pub/epel" @@ -168,6 +178,9 @@ remotes: description: "GitHub Container Registry" # username: "your-github-username" # password: "your-github-pat" # needs read:packages scope + # Docker manifest/tag-list patterns are package-type defaults — always + # re-fetched on expiry. check_mutable_updates only applies to any custom + # mutable_patterns you add (e.g. a metadata endpoint). cache: immutable_ttl: 0 mutable_ttl: 300 diff --git a/src/artifactapi/cache.py b/src/artifactapi/cache.py index e7e5712..9e3940b 100644 --- a/src/artifactapi/cache.py +++ b/src/artifactapi/cache.py @@ -26,11 +26,13 @@ class RedisCache: return any(re.search(p, file_path) for p in patterns) def get_index_cache_key(self, remote_name: str, path: str) -> str: - """Generate cache key for index files""" return f"index:{remote_name}:{hashlib.sha256(path.encode()).hexdigest()[:16]}" - def is_index_valid(self, remote_name: str, path: str, ttl_override: int = None) -> bool: - """Check if index file is still valid (not expired)""" + def get_mutable_meta_key(self, remote_name: str, path: str) -> str: + return f"mutable:meta:{remote_name}:{hashlib.sha256(path.encode()).hexdigest()[:16]}" + + def is_index_valid(self, remote_name: str, path: str) -> bool: + """Check if mutable file is still within its TTL window.""" if not self.available: return False @@ -41,7 +43,7 @@ class RedisCache: return False def mark_index_cached(self, remote_name: str, path: str, ttl: int = 300) -> None: - """Mark index file as cached with TTL""" + """Set or refresh the TTL key for a mutable file.""" if not self.available: return @@ -51,13 +53,45 @@ class RedisCache: except Exception: pass + def store_mutable_meta(self, remote_name: str, path: str, etag: str | None, last_modified: str | None) -> None: + """Persist ETag and Last-Modified for future conditional requests.""" + if not self.available: + return + data = {} + if etag: + data["etag"] = etag + if last_modified: + data["last_modified"] = last_modified + if not data: + return + try: + self.client.hset(self.get_mutable_meta_key(remote_name, path), mapping=data) + except Exception: + pass + + def get_mutable_meta(self, remote_name: str, path: str) -> dict: + """Return stored ETag/Last-Modified for a mutable file, or {}.""" + if not self.available: + return {} + try: + return self.client.hgetall(self.get_mutable_meta_key(remote_name, path)) or {} + except Exception: + return {} + + def delete_mutable_meta(self, remote_name: str, path: str) -> None: + if not self.available: + return + try: + self.client.delete(self.get_mutable_meta_key(remote_name, path)) + except Exception: + pass + def cleanup_expired_index(self, storage, remote_name: str, path: str) -> None: - """Remove expired index from S3 storage""" + """Remove an expired mutable file from S3 and clear its Redis meta.""" if not self.available: return try: - # Construct the URL the same way as in the main flow import os from .config import ConfigManager @@ -69,9 +103,10 @@ class RedisCache: if remote_config: base_url = remote_config.get("base_url") if base_url: - # Use hierarchical path-based key (same as cache_single_artifact) s3_key = storage.get_object_key(remote_name, path) if storage.exists(s3_key): storage.client.delete_object(Bucket=storage.bucket, Key=s3_key) except Exception: pass + + self.delete_mutable_meta(remote_name, path) diff --git a/src/artifactapi/config.py b/src/artifactapi/config.py index 0bb7bc3..ef138f3 100644 --- a/src/artifactapi/config.py +++ b/src/artifactapi/config.py @@ -124,6 +124,13 @@ class ConfigManager: db_url = f"postgresql://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}" return {"url": db_url} + def get_user_mutable_patterns(self, remote_name: str) -> list[str]: + """Return only user-configured mutable_patterns, excluding package-type defaults.""" + remote_config = self.get_remote_config(remote_name) + if not remote_config: + return [] + return remote_config.get("mutable_patterns", []) + def get_mutable_patterns(self, remote_name: str) -> list[str]: """Return mutable-file patterns for a remote (TTL is configured per-remote in cache.index_ttl). diff --git a/src/artifactapi/main.py b/src/artifactapi/main.py index b26f7b0..d26db7b 100644 --- a/src/artifactapi/main.py +++ b/src/artifactapi/main.py @@ -87,8 +87,10 @@ def flush_cache( if cache_type in ["all", "index"]: if remote: patterns.append(f"index:{remote}:*") + patterns.append(f"mutable:meta:{remote}:*") else: patterns.append("index:*") + patterns.append("mutable:meta:*") if cache_type in ["all", "metrics"]: if remote: @@ -240,12 +242,36 @@ async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict: "storage_path": storage_path, "size": len(response.content), "status": "cached", + "etag": response.headers.get("ETag"), + "last_modified": response.headers.get("Last-Modified"), } except Exception as e: return {"url": url, "status": "error", "error": str(e)} +async def check_upstream_changed(remote_url: str, remote_name: str, path: str) -> bool: + """Conditional HEAD against upstream. Returns False only on a definitive 304.""" + meta = cache.get_mutable_meta(remote_name, path) + if not meta: + return True + + headers = {} + if meta.get("etag"): + headers["If-None-Match"] = meta["etag"] + if meta.get("last_modified"): + headers["If-Modified-Since"] = meta["last_modified"] + if not headers: + return True + + try: + async with httpx.AsyncClient(follow_redirects=True) as client: + response = await client.head(remote_url, headers=headers) + return response.status_code != 304 + except Exception: + return True + + @app.get("/api/v1/remote/{remote_name}/{path:path}") async def get_artifact(remote_name: str, path: str): # Check if remote is configured @@ -302,9 +328,23 @@ async def get_artifact(remote_name: str, path: str): if cached_key and is_mutable: if not cache.is_index_valid(remote_name, path): - logger.info(f"Mutable file EXPIRED: {remote_name}/{path} - removing from cache") - cache.cleanup_expired_index(storage, remote_name, path) - cached_key = None # Force re-download + remote_cfg = config.get_remote_config(remote_name) or {} + check_updates = remote_cfg.get("check_mutable_updates", False) + user_mutable = check_updates and cache.is_mutable_file(path, config.get_user_mutable_patterns(remote_name)) + if user_mutable: + changed = await check_upstream_changed(remote_url, remote_name, path) + if not changed: + mutable_ttl = config.get_cache_config(remote_name).get("mutable_ttl", 3600) + cache.mark_index_cached(remote_name, path, mutable_ttl) + logger.info(f"Mutable file UNCHANGED: {remote_name}/{path} - TTL refreshed ({mutable_ttl}s)") + else: + logger.info(f"Mutable file CHANGED: {remote_name}/{path} - re-downloading") + cache.cleanup_expired_index(storage, remote_name, path) + cached_key = None + else: + logger.info(f"Mutable file EXPIRED: {remote_name}/{path} - removing from cache") + cache.cleanup_expired_index(storage, remote_name, path) + cached_key = None if cached_key: # Return cached artifact @@ -362,6 +402,8 @@ async def get_artifact(remote_name: str, path: str): mutable_ttl = cache_config.get("mutable_ttl", 3600) cache.mark_index_cached(remote_name, path, mutable_ttl) logger.info(f"Mutable file cached with TTL: {remote_name}/{path} (ttl: {mutable_ttl}s)") + if result.get("etag") or result.get("last_modified"): + cache.store_mutable_meta(remote_name, path, result.get("etag"), result.get("last_modified")) # Now return the cached artifact try: @@ -439,9 +481,23 @@ async def docker_v2_proxy(request: Request, remote_name: str, path: str): if cached_key and is_mutable: if not cache.is_index_valid(remote_name, path): - logger.info(f"Mutable file EXPIRED: {remote_name}/{path} - removing from cache") - cache.cleanup_expired_index(storage, remote_name, path) - cached_key = None + remote_cfg = config.get_remote_config(remote_name) or {} + check_updates = remote_cfg.get("check_mutable_updates", False) + user_mutable = check_updates and cache.is_mutable_file(path, config.get_user_mutable_patterns(remote_name)) + if user_mutable: + changed = await check_upstream_changed(remote_url, remote_name, path) + if not changed: + mutable_ttl = config.get_cache_config(remote_name).get("mutable_ttl", 3600) + cache.mark_index_cached(remote_name, path, mutable_ttl) + logger.info(f"Mutable file UNCHANGED: {remote_name}/{path} - TTL refreshed ({mutable_ttl}s)") + else: + logger.info(f"Mutable file CHANGED: {remote_name}/{path} - re-downloading") + cache.cleanup_expired_index(storage, remote_name, path) + cached_key = None + else: + logger.info(f"Mutable file EXPIRED: {remote_name}/{path} - removing from cache") + cache.cleanup_expired_index(storage, remote_name, path) + cached_key = None if not cached_key: logger.info(f"Cache MISS: {remote_name}/{path} - fetching from remote: {remote_url}") @@ -453,6 +509,8 @@ async def docker_v2_proxy(request: Request, remote_name: str, path: str): mutable_ttl = cache_config.get("mutable_ttl", 3600) cache.mark_index_cached(remote_name, path, mutable_ttl) logger.info(f"Mutable file cached with TTL: {remote_name}/{path} (ttl: {mutable_ttl}s)") + if result.get("etag") or result.get("last_modified"): + cache.store_mutable_meta(remote_name, path, result.get("etag"), result.get("last_modified")) artifact_data = storage.download_object(storage.get_object_key(remote_name, path)) diff --git a/tests/conftest.py b/tests/conftest.py index 772fdd0..9326244 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -59,6 +59,14 @@ TEST_REMOTES = { "mutable_patterns": ["metadata\\.json$"], "cache": {"immutable_ttl": 0, "mutable_ttl": 600}, }, + "check-mutable-test": { + "base_url": "https://example.com", + "type": "remote", + "package": "generic", + "mutable_patterns": ["metadata\\.json$"], + "check_mutable_updates": True, + "cache": {"immutable_ttl": 0, "mutable_ttl": 600}, + }, "local-test": { "type": "local", "package": "generic", diff --git a/tests/test_cache.py b/tests/test_cache.py index 7a1cc62..8308e0d 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -235,3 +235,51 @@ class TestIndexValidity: # client is None when Redis is unavailable — setex cannot be called assert unavailable_cache.client is None unavailable_cache.mark_index_cached("remote", "some/path", 300) # must not raise + + +# --------------------------------------------------------------------------- +# mutable meta (ETag / Last-Modified storage) +# --------------------------------------------------------------------------- + + +class TestMutableMeta: + def test_meta_key_format(self, bare_cache): + path = "repo/metadata.json" + expected_hash = hashlib.sha256(path.encode()).hexdigest()[:16] + assert bare_cache.get_mutable_meta_key("myremote", path) == f"mutable:meta:myremote:{expected_hash}" + + def test_meta_key_hash_is_16_chars(self, bare_cache): + key = bare_cache.get_mutable_meta_key("remote", "some/path/file.json") + assert len(key.split(":")[-1]) == 16 + + def test_store_and_retrieve_etag(self, cache_with_redis, mock_redis_client): + mock_redis_client.hgetall.return_value = {"etag": '"abc123"'} + cache_with_redis.store_mutable_meta("remote", "path/meta.json", '"abc123"', None) + mock_redis_client.hset.assert_called_once() + meta = cache_with_redis.get_mutable_meta("remote", "path/meta.json") + assert meta["etag"] == '"abc123"' + + def test_store_and_retrieve_last_modified(self, cache_with_redis, mock_redis_client): + lm = "Mon, 01 Jan 2024 00:00:00 GMT" + mock_redis_client.hgetall.return_value = {"last_modified": lm} + cache_with_redis.store_mutable_meta("remote", "path/meta.json", None, lm) + meta = cache_with_redis.get_mutable_meta("remote", "path/meta.json") + assert meta["last_modified"] == lm + + def test_store_no_op_when_both_none(self, cache_with_redis, mock_redis_client): + cache_with_redis.store_mutable_meta("remote", "path/meta.json", None, None) + mock_redis_client.hset.assert_not_called() + + def test_store_no_op_when_unavailable(self, unavailable_cache): + unavailable_cache.store_mutable_meta("remote", "path", "etag", None) # must not raise + + def test_get_returns_empty_when_unavailable(self, unavailable_cache): + assert unavailable_cache.get_mutable_meta("remote", "path") == {} + + def test_delete_removes_meta_key(self, cache_with_redis, mock_redis_client): + expected_key = cache_with_redis.get_mutable_meta_key("remote", "path/meta.json") + cache_with_redis.delete_mutable_meta("remote", "path/meta.json") + mock_redis_client.delete.assert_called_once_with(expected_key) + + def test_delete_no_op_when_unavailable(self, unavailable_cache): + unavailable_cache.delete_mutable_meta("remote", "path") # must not raise diff --git a/tests/test_config.py b/tests/test_config.py index 2fb1233..9137840 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -208,6 +208,39 @@ class TestGetImmutablePatterns: assert cfg.get_immutable_patterns("r", "/unknown/path") == [r".*\.tar\.gz$"] +# --------------------------------------------------------------------------- +# get_user_mutable_patterns +# --------------------------------------------------------------------------- + + +class TestGetUserMutablePatterns: + def test_returns_only_user_patterns(self, make_config): + cfg = make_config( + { + "r": { + "type": "remote", + "package": "alpine", + "base_url": "https://x.com", + "mutable_patterns": [r"custom\.json$"], + } + } + ) + assert cfg.get_user_mutable_patterns("r") == [r"custom\.json$"] + + def test_excludes_package_defaults(self, make_config): + # Package defaults (APKINDEX etc.) must NOT appear here + cfg = make_config({"r": {"type": "remote", "package": "alpine", "base_url": "https://x.com"}}) + assert cfg.get_user_mutable_patterns("r") == [] + + def test_returns_empty_for_missing_remote(self, make_config): + cfg = make_config({}) + assert cfg.get_user_mutable_patterns("nonexistent") == [] + + def test_returns_empty_when_key_absent(self, make_config): + cfg = make_config({"r": {"type": "remote", "package": "generic", "base_url": "https://x.com"}}) + assert cfg.get_user_mutable_patterns("r") == [] + + # --------------------------------------------------------------------------- # get_cache_config # --------------------------------------------------------------------------- diff --git a/tests/test_routes.py b/tests/test_routes.py index 8e9a226..92a3d0d 100644 --- a/tests/test_routes.py +++ b/tests/test_routes.py @@ -419,6 +419,53 @@ class TestGenericArtifactRoute: response = client.get("/api/v1/remote/alpine-test/alpine/v3.18/x86_64/APKINDEX.tar.gz") assert response.status_code == 200 + def test_mutable_unchanged_refreshes_ttl_without_redownload(self, client, patched_deps): + """When check_mutable_updates=True and upstream says 304, TTL is refreshed in place.""" + deps = patched_deps + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = b"metadata content" + # File is mutable and its TTL has expired + deps["cache"].is_mutable_file.return_value = True + deps["cache"].is_index_valid.return_value = False + deps["cache"].get_mutable_meta.return_value = {"etag": '"abc"'} + + with patch("artifactapi.main.check_upstream_changed", new_callable=AsyncMock, return_value=False): + response = client.get("/api/v1/remote/check-mutable-test/metadata.json") + + assert response.status_code == 200 + deps["cache"].mark_index_cached.assert_called() + # S3 object must NOT have been deleted (no re-download) + deps["storage"].client.delete_object.assert_not_called() + + def test_mutable_changed_triggers_redownload(self, client, patched_deps): + """When check_mutable_updates=True and upstream says 200, cache is invalidated.""" + deps = patched_deps + deps["storage"].exists.return_value = False + deps["cache"].is_mutable_file.return_value = True + deps["cache"].is_index_valid.return_value = False + deps["cache"].get_mutable_meta.return_value = {"etag": '"abc"'} + + with patch("artifactapi.main.check_upstream_changed", new_callable=AsyncMock, return_value=True): + with patch("artifactapi.main.cache_single_artifact", new_callable=AsyncMock) as mock_cache: + mock_cache.return_value = {"status": "error", "error": "upstream gone"} + response = client.get("/api/v1/remote/check-mutable-test/metadata.json") + + assert response.status_code == 502 + + def test_mutable_flag_off_skips_conditional_check(self, client, patched_deps): + """When check_mutable_updates is not set, expired mutable files are always re-fetched.""" + deps = patched_deps + deps["storage"].exists.return_value = False + deps["cache"].is_mutable_file.return_value = True + deps["cache"].is_index_valid.return_value = False + + with patch("artifactapi.main.check_upstream_changed", new_callable=AsyncMock) as mock_check: + with patch("artifactapi.main.cache_single_artifact", new_callable=AsyncMock) as mock_cache: + mock_cache.return_value = {"status": "error", "error": "upstream gone"} + client.get("/api/v1/remote/custom-index-test/metadata.json") + + mock_check.assert_not_called() + def test_local_repo_file_not_found_returns_404(self, client, patched_deps): deps = patched_deps deps["database"].get_local_file_metadata.return_value = None @@ -519,8 +566,8 @@ class TestCacheFlushEndpoint: deps["cache"].available = True redis_mock = MagicMock() deps["cache"].client = redis_mock - # First pattern (index:*) returns keys; subsequent pattern returns nothing - redis_mock.keys.side_effect = [["index:test:abc", "index:test:def"], []] + # index:* returns keys; mutable:meta:* and metrics:* return nothing + redis_mock.keys.side_effect = [["index:test:abc", "index:test:def"], [], []] deps["storage"].client.list_objects_v2.return_value = {} response = client.put("/cache/flush")