From 8e9d31389205c520ba9b2c237d4dcd672db737b8 Mon Sep 17 00:00:00 2001 From: Ben Vincent Date: Mon, 27 Apr 2026 14:31:33 +1000 Subject: [PATCH] feat: add pypi remote type with URL rewriting and basic auth - Add 'pypi' package type to config.py; simple/ paths are mutable by default - Refactor content-type detection into _get_content_type() helper; add .whl - Add _resolve_content() which rewrites files host URLs in simple index HTML to go through the proxy (pypi_files_url / pypi_files_remote config keys), and returns text/html content-type for simple index responses - Add basic auth support for non-Docker remotes (username + password/token in remote config); thread auth through _upstream_reachable and check_upstream_changed so mutable TTL checks also authenticate - Add 'pypi' remote (pypi.org simple index) and 'pypi-files' remote (files.pythonhosted.org) to remotes.yaml; add 'pypi-gitea' example for Gitea package registries where index and files share the same base URL - Add unit tests: simple index URL rewriting, HTML content-type, .whl/.tar.gz content-types, mutable index detection, and immutable pattern enforcement --- remotes.yaml | 49 +++++++++++++++++ src/artifactapi/config.py | 3 ++ src/artifactapi/main.py | 108 ++++++++++++++++++++++---------------- tests/conftest.py | 19 +++++++ tests/test_routes.py | 89 +++++++++++++++++++++++++++++++ 5 files changed, 223 insertions(+), 45 deletions(-) diff --git a/remotes.yaml b/remotes.yaml index d312461..127de7a 100644 --- a/remotes.yaml +++ b/remotes.yaml @@ -194,6 +194,55 @@ remotes: immutable_ttl: 0 mutable_ttl: 300 + pypi: + base_url: "https://pypi.org" + type: "remote" + package: "pypi" + description: "Python Package Index — simple repository API" + # pypi_files_url: the upstream host used in simple-index hrefs (default: files.pythonhosted.org) + # pypi_files_remote: our proxy remote that will serve those files (default: pypi-files) + pypi_files_url: "https://files.pythonhosted.org" + pypi_files_remote: "pypi-files" + check_mutable_updates: true + cache: + immutable_ttl: 0 + mutable_ttl: 600 # Simple index pages refreshed after 10 minutes + + pypi-gitea: + base_url: "https://gitea.example.com/api/packages/myorg/pypi" + type: "remote" + package: "pypi" + description: "Private Gitea PyPI registry" + # username: "your-gitea-username" + # password: "your-personal-access-token" # needs package:read scope + # Files are served from the same Gitea instance — rewrite back to this same remote + pypi_files_url: "https://gitea.example.com/api/packages/myorg/pypi" + pypi_files_remote: "pypi-gitea" + check_mutable_updates: true + immutable_patterns: + - "files/.*\\.whl$" + - "files/.*\\.whl\\.metadata$" + - "files/.*\\.tar\\.gz$" + - "files/.*\\.zip$" + - "files/.*\\.egg$" + cache: + immutable_ttl: 0 + mutable_ttl: 600 + + pypi-files: + base_url: "https://files.pythonhosted.org" + type: "remote" + package: "generic" + description: "Python Package Index — file storage (wheels, sdists)" + immutable_patterns: + - "packages/.*\\.whl$" + - "packages/.*\\.whl\\.metadata$" + - "packages/.*\\.tar\\.gz$" + - "packages/.*\\.zip$" + - "packages/.*\\.egg$" + cache: + immutable_ttl: 0 # Package files are content-addressed — cache forever + local-generic: type: "local" package: "generic" diff --git a/src/artifactapi/config.py b/src/artifactapi/config.py index ef138f3..c08860b 100644 --- a/src/artifactapi/config.py +++ b/src/artifactapi/config.py @@ -18,6 +18,9 @@ _PACKAGE_MUTABLE_PATTERNS: dict[str, list[str]] = { r"/manifests/(?!sha256:)[^/]+$", r"/tags/list$", ], + "pypi": [ + r"simple/", # Per-package and top-level simple index pages + ], "generic": [], } diff --git a/src/artifactapi/main.py b/src/artifactapi/main.py index 148fc77..2f8fd1e 100644 --- a/src/artifactapi/main.py +++ b/src/artifactapi/main.py @@ -1,3 +1,4 @@ +import base64 import hashlib import json import logging @@ -208,8 +209,11 @@ async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict: remote_config = config.get_remote_config(remote_name) or {} is_docker = remote_config.get("package") == "docker" or "/v2/" in url - # Prepare headers for Docker registry requests + # Prepare headers headers = {} + username = remote_config.get("username") + password = remote_config.get("password") + if is_docker: if "/manifests/" in url: headers["Accept"] = ( @@ -220,6 +224,8 @@ async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict: ) elif "/blobs/" in url: headers["Accept"] = "application/octet-stream" + elif username and password: + headers["Authorization"] = "Basic " + base64.b64encode(f"{username}:{password}".encode()).decode() async with httpx.AsyncClient(follow_redirects=True) as client: response = await client.get(url, headers=headers) @@ -254,11 +260,20 @@ async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict: return {"url": url, "status": "error", "error": str(e)} -async def _upstream_reachable(url: str) -> bool: +def _basic_auth_header(remote_cfg: dict) -> dict[str, str]: + username = remote_cfg.get("username") + password = remote_cfg.get("password") + if username and password: + token = base64.b64encode(f"{username}:{password}".encode()).decode() + return {"Authorization": f"Basic {token}"} + return {} + + +async def _upstream_reachable(url: str, auth_headers: dict | None = None) -> bool: """HEAD with a short timeout. Returns False only on network/timeout errors.""" try: async with httpx.AsyncClient(follow_redirects=True) as client: - await client.head(url, timeout=10.0) + await client.head(url, headers=auth_headers or {}, timeout=10.0) return True except (httpx.NetworkError, httpx.TimeoutException): return False @@ -266,19 +281,19 @@ async def _upstream_reachable(url: str) -> bool: return True # 4xx/5xx means backend is up -async def check_upstream_changed(remote_url: str, remote_name: str, path: str) -> bool: +async def check_upstream_changed(remote_url: str, remote_name: str, path: str, auth_headers: dict | None = None) -> bool: """Conditional HEAD against upstream. Returns False only on a definitive 304. Raises UpstreamUnreachable if the backend cannot be contacted.""" meta = cache.get_mutable_meta(remote_name, path) if not meta: return True - headers = {} + headers = dict(auth_headers or {}) if meta.get("etag"): headers["If-None-Match"] = meta["etag"] if meta.get("last_modified"): headers["If-Modified-Since"] = meta["last_modified"] - if not headers: + if not (meta.get("etag") or meta.get("last_modified")): return True try: @@ -294,12 +309,13 @@ async def handle_expired_mutable(remote_name: str, path: str, remote_url: str) - mutable_ttl = config.get_cache_config(remote_name).get("mutable_ttl", 3600) remote_cfg = config.get_remote_config(remote_name) or {} + auth = _basic_auth_header(remote_cfg) check_updates = remote_cfg.get("check_mutable_updates", False) user_mutable = check_updates and cache.is_mutable_file(path, config.get_user_mutable_patterns(remote_name)) if user_mutable: try: - changed = await check_upstream_changed(remote_url, remote_name, path) + changed = await check_upstream_changed(remote_url, remote_name, path, auth) except UpstreamUnreachable: cache.mark_index_cached(remote_name, path, mutable_ttl) logger.warning(f"Mutable STALE (backend unreachable): {remote_name}/{path} - TTL extended ({mutable_ttl}s)") @@ -310,7 +326,7 @@ async def handle_expired_mutable(remote_name: str, path: str, remote_url: str) - return True logger.info(f"Mutable file CHANGED: {remote_name}/{path} - re-downloading") else: - if not await _upstream_reachable(remote_url): + if not await _upstream_reachable(remote_url, auth): cache.mark_index_cached(remote_name, path, mutable_ttl) logger.warning(f"Mutable STALE (backend unreachable): {remote_name}/{path} - TTL extended ({mutable_ttl}s)") return True @@ -320,8 +336,44 @@ async def handle_expired_mutable(remote_name: str, path: str, remote_url: str) - return False +def _get_content_type(filename: str) -> str: + if filename.endswith(".tar.gz"): + return "application/gzip" + if filename.endswith(".zip") or filename.endswith(".whl"): + return "application/zip" + if filename.endswith(".exe"): + return "application/x-msdownload" + if filename.endswith(".rpm"): + return "application/x-rpm" + if filename.endswith(".xml"): + return "application/xml" + if filename.endswith((".xml.gz", ".xml.bz2", ".xml.xz")): + return "application/gzip" + return "application/octet-stream" + + +def _resolve_content( + data: bytes, + path: str, + filename: str, + remote_config: dict, + request: Request, +) -> tuple[bytes, str]: + """Return (possibly-rewritten data, content_type) for a cached artifact.""" + if remote_config.get("package") == "pypi" and "simple/" in path: + files_url = remote_config.get("pypi_files_url", "https://files.pythonhosted.org") + files_remote = remote_config.get("pypi_files_remote", "pypi-files") + proxy_base = str(request.base_url).rstrip("/") + data = data.replace( + files_url.rstrip("/").encode(), + f"{proxy_base}/api/v1/remote/{files_remote}".encode(), + ) + return data, "text/html; charset=utf-8" + return data, _get_content_type(filename) + + @app.get("/api/v1/remote/{remote_name}/{path:path}") -async def get_artifact(remote_name: str, path: str): +async def get_artifact(request: Request, remote_name: str, path: str): # Check if remote is configured remote_config = config.get_remote_config(remote_name) if not remote_config: @@ -384,29 +436,11 @@ async def get_artifact(remote_name: str, path: str): try: artifact_data = storage.download_object(cached_key) filename = os.path.basename(path) + artifact_data, content_type = _resolve_content(artifact_data, path, filename, remote_config, request) - # Log cache hit logger.info(f"Cache HIT: {remote_name}/{path} (size: {len(artifact_data)} bytes, key: {cached_key})") - # Determine content type based on file extension - content_type = "application/octet-stream" - if filename.endswith(".tar.gz"): - content_type = "application/gzip" - elif filename.endswith(".zip"): - content_type = "application/zip" - elif filename.endswith(".exe"): - content_type = "application/x-msdownload" - elif filename.endswith(".rpm"): - content_type = "application/x-rpm" - elif filename.endswith(".xml"): - content_type = "application/xml" - elif filename.endswith((".xml.gz", ".xml.bz2", ".xml.xz")): - content_type = "application/gzip" - - # Record cache hit metrics metrics.record_cache_hit(remote_name, len(artifact_data)) - - # Record artifact mapping in database if not already recorded database.record_artifact_mapping(cached_key, remote_name, path, len(artifact_data)) return Response( @@ -443,25 +477,9 @@ async def get_artifact(remote_name: str, path: str): cache_key = storage.get_object_key(remote_name, path) artifact_data = storage.download_object(cache_key) filename = os.path.basename(path) + artifact_data, content_type = _resolve_content(artifact_data, path, filename, remote_config, request) - content_type = "application/octet-stream" - if filename.endswith(".tar.gz"): - content_type = "application/gzip" - elif filename.endswith(".zip"): - content_type = "application/zip" - elif filename.endswith(".exe"): - content_type = "application/x-msdownload" - elif filename.endswith(".rpm"): - content_type = "application/x-rpm" - elif filename.endswith(".xml"): - content_type = "application/xml" - elif filename.endswith((".xml.gz", ".xml.bz2", ".xml.xz")): - content_type = "application/gzip" - - # Record cache miss metrics metrics.record_cache_miss(remote_name, len(artifact_data)) - - # Record artifact mapping in database cache_key = storage.get_object_key(remote_name, path) database.record_artifact_mapping(cache_key, remote_name, path, len(artifact_data)) diff --git a/tests/conftest.py b/tests/conftest.py index 9326244..f46e0d4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -72,6 +72,25 @@ TEST_REMOTES = { "package": "generic", "cache": {"immutable_ttl": 0, "mutable_ttl": 0}, }, + "pypi-test": { + "base_url": "https://pypi.org", + "type": "remote", + "package": "pypi", + "pypi_files_url": "https://files.pythonhosted.org", + "pypi_files_remote": "pypi-files-test", + "cache": {"immutable_ttl": 0, "mutable_ttl": 600}, + }, + "pypi-files-test": { + "base_url": "https://files.pythonhosted.org", + "type": "remote", + "package": "generic", + "immutable_patterns": [ + "packages/.*\\.whl$", + "packages/.*\\.whl\\.metadata$", + "packages/.*\\.tar\\.gz$", + ], + "cache": {"immutable_ttl": 0, "mutable_ttl": 0}, + }, } } diff --git a/tests/test_routes.py b/tests/test_routes.py index d2dc643..9d8bdaa 100644 --- a/tests/test_routes.py +++ b/tests/test_routes.py @@ -652,3 +652,92 @@ class TestConfigEndpoint: data = response.json() assert "remotes" in data assert "alpine-test" in data["remotes"] + + +# --------------------------------------------------------------------------- +# PyPI remote /api/v1/remote/pypi-test/... +# --------------------------------------------------------------------------- + + +class TestPyPIRemote: + def test_simple_index_is_mutable(self, client, patched_deps): + """simple/ paths are detected as mutable (package-type default).""" + deps = patched_deps + html = b"..." + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = html + deps["cache"].is_mutable_file.return_value = True + deps["cache"].is_index_valid.return_value = True + + response = client.get("/api/v1/remote/pypi-test/simple/requests/") + assert response.status_code == 200 + deps["cache"].mark_index_cached.assert_not_called() + + def test_simple_index_urls_rewritten_to_proxy(self, client, patched_deps): + """files.pythonhosted.org URLs in a cached simple index are rewritten to our proxy.""" + deps = patched_deps + html = b"..." + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = html + deps["cache"].is_mutable_file.return_value = True + deps["cache"].is_index_valid.return_value = True + + response = client.get("/api/v1/remote/pypi-test/simple/requests/") + assert response.status_code == 200 + assert b"files.pythonhosted.org" not in response.content + assert b"/api/v1/remote/pypi-files-test/packages/requests-2.31.0.tar.gz" in response.content + + def test_simple_index_content_type_is_html(self, client, patched_deps): + deps = patched_deps + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = b"" + deps["cache"].is_mutable_file.return_value = True + deps["cache"].is_index_valid.return_value = True + + response = client.get("/api/v1/remote/pypi-test/simple/requests/") + assert response.status_code == 200 + assert "text/html" in response.headers["content-type"] + + def test_simple_index_cache_miss_fetches_upstream(self, client, patched_deps): + deps = patched_deps + html = b"..." + deps["storage"].exists.return_value = False + deps["storage"].download_object.return_value = html + deps["cache"].is_mutable_file.return_value = True + + with patch( + "artifactapi.main.cache_single_artifact", + new_callable=AsyncMock, + return_value={"status": "cached"}, + ) as mock_fetch: + response = client.get("/api/v1/remote/pypi-test/simple/requests/") + + mock_fetch.assert_called_once() + assert response.status_code == 200 + assert b"files.pythonhosted.org" not in response.content + + def test_wheel_file_immutable_returns_correct_content_type(self, client, patched_deps): + deps = patched_deps + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = b"PK wheel bytes" + deps["cache"].is_mutable_file.return_value = False + + response = client.get("/api/v1/remote/pypi-files-test/packages/requests-2.31.0-py3-none-any.whl") + assert response.status_code == 200 + assert "application/zip" in response.headers["content-type"] + assert response.headers["X-Artifact-Source"] == "cache" + + def test_sdist_immutable_returns_correct_content_type(self, client, patched_deps): + deps = patched_deps + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = b"tar bytes" + deps["cache"].is_mutable_file.return_value = False + + response = client.get("/api/v1/remote/pypi-files-test/packages/requests-2.31.0.tar.gz") + assert response.status_code == 200 + assert "application/gzip" in response.headers["content-type"] + + def test_blocked_path_on_files_remote_returns_403(self, client, patched_deps): + """Paths that don't match immutable_patterns on pypi-files-test are blocked.""" + response = client.get("/api/v1/remote/pypi-files-test/packages/requests.unknown") + assert response.status_code == 403