feat: add pypi remote type with URL rewriting and basic auth
- Add 'pypi' package type to config.py; simple/ paths are mutable by default
- Refactor content-type detection into a _get_content_type() helper; add .whl
- Add _resolve_content(), which rewrites files-host URLs in simple index HTML to go through the proxy (pypi_files_url / pypi_files_remote config keys) and returns a text/html content-type for simple index responses
- Add basic auth support for non-Docker remotes (username + password/token in remote config); thread auth through _upstream_reachable and check_upstream_changed so mutable TTL checks also authenticate
- Add 'pypi' remote (pypi.org simple index) and 'pypi-files' remote (files.pythonhosted.org) to remotes.yaml; add a 'pypi-gitea' example for Gitea package registries where index and files share the same base URL
- Add unit tests: simple index URL rewriting, HTML content-type, .whl/.tar.gz content-types, mutable index detection, and immutable pattern enforcement
This commit is contained in:
@@ -194,6 +194,55 @@ remotes:
|
||||
immutable_ttl: 0
|
||||
mutable_ttl: 300
|
||||
|
||||
pypi:
|
||||
base_url: "https://pypi.org"
|
||||
type: "remote"
|
||||
package: "pypi"
|
||||
description: "Python Package Index — simple repository API"
|
||||
# pypi_files_url: the upstream host used in simple-index hrefs (default: files.pythonhosted.org)
|
||||
# pypi_files_remote: our proxy remote that will serve those files (default: pypi-files)
|
||||
pypi_files_url: "https://files.pythonhosted.org"
|
||||
pypi_files_remote: "pypi-files"
|
||||
check_mutable_updates: true
|
||||
cache:
|
||||
immutable_ttl: 0
|
||||
mutable_ttl: 600 # Simple index pages refreshed after 10 minutes
|
||||
|
||||
pypi-gitea:
|
||||
base_url: "https://gitea.example.com/api/packages/myorg/pypi"
|
||||
type: "remote"
|
||||
package: "pypi"
|
||||
description: "Private Gitea PyPI registry"
|
||||
# username: "your-gitea-username"
|
||||
# password: "your-personal-access-token" # needs package:read scope
|
||||
# Files are served from the same Gitea instance — rewrite back to this same remote
|
||||
pypi_files_url: "https://gitea.example.com/api/packages/myorg/pypi"
|
||||
pypi_files_remote: "pypi-gitea"
|
||||
check_mutable_updates: true
|
||||
immutable_patterns:
|
||||
- "files/.*\\.whl$"
|
||||
- "files/.*\\.whl\\.metadata$"
|
||||
- "files/.*\\.tar\\.gz$"
|
||||
- "files/.*\\.zip$"
|
||||
- "files/.*\\.egg$"
|
||||
cache:
|
||||
immutable_ttl: 0
|
||||
mutable_ttl: 600
|
||||
|
||||
pypi-files:
|
||||
base_url: "https://files.pythonhosted.org"
|
||||
type: "remote"
|
||||
package: "generic"
|
||||
description: "Python Package Index — file storage (wheels, sdists)"
|
||||
immutable_patterns:
|
||||
- "packages/.*\\.whl$"
|
||||
- "packages/.*\\.whl\\.metadata$"
|
||||
- "packages/.*\\.tar\\.gz$"
|
||||
- "packages/.*\\.zip$"
|
||||
- "packages/.*\\.egg$"
|
||||
cache:
|
||||
immutable_ttl: 0 # Package files are content-addressed — cache forever
|
||||
|
||||
local-generic:
|
||||
type: "local"
|
||||
package: "generic"
|
||||
|
||||
@@ -18,6 +18,9 @@ _PACKAGE_MUTABLE_PATTERNS: dict[str, list[str]] = {
|
||||
r"/manifests/(?!sha256:)[^/]+$",
|
||||
r"/tags/list$",
|
||||
],
|
||||
"pypi": [
|
||||
r"simple/", # Per-package and top-level simple index pages
|
||||
],
|
||||
"generic": [],
|
||||
}
|
||||
|
||||
|
||||
+63
-45
@@ -1,3 +1,4 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
@@ -208,8 +209,11 @@ async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict:
|
||||
remote_config = config.get_remote_config(remote_name) or {}
|
||||
is_docker = remote_config.get("package") == "docker" or "/v2/" in url
|
||||
|
||||
# Prepare headers for Docker registry requests
|
||||
# Prepare headers
|
||||
headers = {}
|
||||
username = remote_config.get("username")
|
||||
password = remote_config.get("password")
|
||||
|
||||
if is_docker:
|
||||
if "/manifests/" in url:
|
||||
headers["Accept"] = (
|
||||
@@ -220,6 +224,8 @@ async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict:
|
||||
)
|
||||
elif "/blobs/" in url:
|
||||
headers["Accept"] = "application/octet-stream"
|
||||
elif username and password:
|
||||
headers["Authorization"] = "Basic " + base64.b64encode(f"{username}:{password}".encode()).decode()
|
||||
|
||||
async with httpx.AsyncClient(follow_redirects=True) as client:
|
||||
response = await client.get(url, headers=headers)
|
||||
@@ -254,11 +260,20 @@ async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict:
|
||||
return {"url": url, "status": "error", "error": str(e)}
|
||||
|
||||
|
||||
async def _upstream_reachable(url: str) -> bool:
|
||||
def _basic_auth_header(remote_cfg: dict) -> dict[str, str]:
|
||||
username = remote_cfg.get("username")
|
||||
password = remote_cfg.get("password")
|
||||
if username and password:
|
||||
token = base64.b64encode(f"{username}:{password}".encode()).decode()
|
||||
return {"Authorization": f"Basic {token}"}
|
||||
return {}
|
||||
|
||||
|
||||
async def _upstream_reachable(url: str, auth_headers: dict | None = None) -> bool:
|
||||
"""HEAD with a short timeout. Returns False only on network/timeout errors."""
|
||||
try:
|
||||
async with httpx.AsyncClient(follow_redirects=True) as client:
|
||||
await client.head(url, timeout=10.0)
|
||||
await client.head(url, headers=auth_headers or {}, timeout=10.0)
|
||||
return True
|
||||
except (httpx.NetworkError, httpx.TimeoutException):
|
||||
return False
|
||||
@@ -266,19 +281,19 @@ async def _upstream_reachable(url: str) -> bool:
|
||||
return True # 4xx/5xx means backend is up
|
||||
|
||||
|
||||
async def check_upstream_changed(remote_url: str, remote_name: str, path: str) -> bool:
|
||||
async def check_upstream_changed(remote_url: str, remote_name: str, path: str, auth_headers: dict | None = None) -> bool:
|
||||
"""Conditional HEAD against upstream. Returns False only on a definitive 304.
|
||||
Raises UpstreamUnreachable if the backend cannot be contacted."""
|
||||
meta = cache.get_mutable_meta(remote_name, path)
|
||||
if not meta:
|
||||
return True
|
||||
|
||||
headers = {}
|
||||
headers = dict(auth_headers or {})
|
||||
if meta.get("etag"):
|
||||
headers["If-None-Match"] = meta["etag"]
|
||||
if meta.get("last_modified"):
|
||||
headers["If-Modified-Since"] = meta["last_modified"]
|
||||
if not headers:
|
||||
if not (meta.get("etag") or meta.get("last_modified")):
|
||||
return True
|
||||
|
||||
try:
|
||||
@@ -294,12 +309,13 @@ async def handle_expired_mutable(remote_name: str, path: str, remote_url: str) -
|
||||
mutable_ttl = config.get_cache_config(remote_name).get("mutable_ttl", 3600)
|
||||
|
||||
remote_cfg = config.get_remote_config(remote_name) or {}
|
||||
auth = _basic_auth_header(remote_cfg)
|
||||
check_updates = remote_cfg.get("check_mutable_updates", False)
|
||||
user_mutable = check_updates and cache.is_mutable_file(path, config.get_user_mutable_patterns(remote_name))
|
||||
|
||||
if user_mutable:
|
||||
try:
|
||||
changed = await check_upstream_changed(remote_url, remote_name, path)
|
||||
changed = await check_upstream_changed(remote_url, remote_name, path, auth)
|
||||
except UpstreamUnreachable:
|
||||
cache.mark_index_cached(remote_name, path, mutable_ttl)
|
||||
logger.warning(f"Mutable STALE (backend unreachable): {remote_name}/{path} - TTL extended ({mutable_ttl}s)")
|
||||
@@ -310,7 +326,7 @@ async def handle_expired_mutable(remote_name: str, path: str, remote_url: str) -
|
||||
return True
|
||||
logger.info(f"Mutable file CHANGED: {remote_name}/{path} - re-downloading")
|
||||
else:
|
||||
if not await _upstream_reachable(remote_url):
|
||||
if not await _upstream_reachable(remote_url, auth):
|
||||
cache.mark_index_cached(remote_name, path, mutable_ttl)
|
||||
logger.warning(f"Mutable STALE (backend unreachable): {remote_name}/{path} - TTL extended ({mutable_ttl}s)")
|
||||
return True
|
||||
@@ -320,8 +336,44 @@ async def handle_expired_mutable(remote_name: str, path: str, remote_url: str) -
|
||||
return False
|
||||
|
||||
|
||||
def _get_content_type(filename: str) -> str:
|
||||
if filename.endswith(".tar.gz"):
|
||||
return "application/gzip"
|
||||
if filename.endswith(".zip") or filename.endswith(".whl"):
|
||||
return "application/zip"
|
||||
if filename.endswith(".exe"):
|
||||
return "application/x-msdownload"
|
||||
if filename.endswith(".rpm"):
|
||||
return "application/x-rpm"
|
||||
if filename.endswith(".xml"):
|
||||
return "application/xml"
|
||||
if filename.endswith((".xml.gz", ".xml.bz2", ".xml.xz")):
|
||||
return "application/gzip"
|
||||
return "application/octet-stream"
|
||||
|
||||
|
||||
def _resolve_content(
|
||||
data: bytes,
|
||||
path: str,
|
||||
filename: str,
|
||||
remote_config: dict,
|
||||
request: Request,
|
||||
) -> tuple[bytes, str]:
|
||||
"""Return (possibly-rewritten data, content_type) for a cached artifact."""
|
||||
if remote_config.get("package") == "pypi" and "simple/" in path:
|
||||
files_url = remote_config.get("pypi_files_url", "https://files.pythonhosted.org")
|
||||
files_remote = remote_config.get("pypi_files_remote", "pypi-files")
|
||||
proxy_base = str(request.base_url).rstrip("/")
|
||||
data = data.replace(
|
||||
files_url.rstrip("/").encode(),
|
||||
f"{proxy_base}/api/v1/remote/{files_remote}".encode(),
|
||||
)
|
||||
return data, "text/html; charset=utf-8"
|
||||
return data, _get_content_type(filename)
|
||||
|
||||
|
||||
@app.get("/api/v1/remote/{remote_name}/{path:path}")
|
||||
async def get_artifact(remote_name: str, path: str):
|
||||
async def get_artifact(request: Request, remote_name: str, path: str):
|
||||
# Check if remote is configured
|
||||
remote_config = config.get_remote_config(remote_name)
|
||||
if not remote_config:
|
||||
@@ -384,29 +436,11 @@ async def get_artifact(remote_name: str, path: str):
|
||||
try:
|
||||
artifact_data = storage.download_object(cached_key)
|
||||
filename = os.path.basename(path)
|
||||
artifact_data, content_type = _resolve_content(artifact_data, path, filename, remote_config, request)
|
||||
|
||||
# Log cache hit
|
||||
logger.info(f"Cache HIT: {remote_name}/{path} (size: {len(artifact_data)} bytes, key: {cached_key})")
|
||||
|
||||
# Determine content type based on file extension
|
||||
content_type = "application/octet-stream"
|
||||
if filename.endswith(".tar.gz"):
|
||||
content_type = "application/gzip"
|
||||
elif filename.endswith(".zip"):
|
||||
content_type = "application/zip"
|
||||
elif filename.endswith(".exe"):
|
||||
content_type = "application/x-msdownload"
|
||||
elif filename.endswith(".rpm"):
|
||||
content_type = "application/x-rpm"
|
||||
elif filename.endswith(".xml"):
|
||||
content_type = "application/xml"
|
||||
elif filename.endswith((".xml.gz", ".xml.bz2", ".xml.xz")):
|
||||
content_type = "application/gzip"
|
||||
|
||||
# Record cache hit metrics
|
||||
metrics.record_cache_hit(remote_name, len(artifact_data))
|
||||
|
||||
# Record artifact mapping in database if not already recorded
|
||||
database.record_artifact_mapping(cached_key, remote_name, path, len(artifact_data))
|
||||
|
||||
return Response(
|
||||
@@ -443,25 +477,9 @@ async def get_artifact(remote_name: str, path: str):
|
||||
cache_key = storage.get_object_key(remote_name, path)
|
||||
artifact_data = storage.download_object(cache_key)
|
||||
filename = os.path.basename(path)
|
||||
artifact_data, content_type = _resolve_content(artifact_data, path, filename, remote_config, request)
|
||||
|
||||
content_type = "application/octet-stream"
|
||||
if filename.endswith(".tar.gz"):
|
||||
content_type = "application/gzip"
|
||||
elif filename.endswith(".zip"):
|
||||
content_type = "application/zip"
|
||||
elif filename.endswith(".exe"):
|
||||
content_type = "application/x-msdownload"
|
||||
elif filename.endswith(".rpm"):
|
||||
content_type = "application/x-rpm"
|
||||
elif filename.endswith(".xml"):
|
||||
content_type = "application/xml"
|
||||
elif filename.endswith((".xml.gz", ".xml.bz2", ".xml.xz")):
|
||||
content_type = "application/gzip"
|
||||
|
||||
# Record cache miss metrics
|
||||
metrics.record_cache_miss(remote_name, len(artifact_data))
|
||||
|
||||
# Record artifact mapping in database
|
||||
cache_key = storage.get_object_key(remote_name, path)
|
||||
database.record_artifact_mapping(cache_key, remote_name, path, len(artifact_data))
|
||||
|
||||
|
||||
@@ -72,6 +72,25 @@ TEST_REMOTES = {
|
||||
"package": "generic",
|
||||
"cache": {"immutable_ttl": 0, "mutable_ttl": 0},
|
||||
},
|
||||
"pypi-test": {
|
||||
"base_url": "https://pypi.org",
|
||||
"type": "remote",
|
||||
"package": "pypi",
|
||||
"pypi_files_url": "https://files.pythonhosted.org",
|
||||
"pypi_files_remote": "pypi-files-test",
|
||||
"cache": {"immutable_ttl": 0, "mutable_ttl": 600},
|
||||
},
|
||||
"pypi-files-test": {
|
||||
"base_url": "https://files.pythonhosted.org",
|
||||
"type": "remote",
|
||||
"package": "generic",
|
||||
"immutable_patterns": [
|
||||
"packages/.*\\.whl$",
|
||||
"packages/.*\\.whl\\.metadata$",
|
||||
"packages/.*\\.tar\\.gz$",
|
||||
],
|
||||
"cache": {"immutable_ttl": 0, "mutable_ttl": 0},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -652,3 +652,92 @@ class TestConfigEndpoint:
|
||||
data = response.json()
|
||||
assert "remotes" in data
|
||||
assert "alpine-test" in data["remotes"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# PyPI remote /api/v1/remote/pypi-test/...
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPyPIRemote:
    """Endpoint tests for the ``pypi-test`` index remote and ``pypi-files-test`` files remote."""

    # Request path of the per-package simple index used by the index tests.
    _INDEX_URL = "/api/v1/remote/pypi-test/simple/requests/"
    # Simple index page whose only link points at the upstream files host.
    _SDIST_INDEX_HTML = b"<html><body><a href='https://files.pythonhosted.org/packages/requests-2.31.0.tar.gz'>...</a></body></html>"

    @staticmethod
    def _stub_storage(deps, *, exists, payload, mutable):
        """Configure the storage/cache mocks shared by most tests in this class."""
        deps["storage"].exists.return_value = exists
        deps["storage"].download_object.return_value = payload
        deps["cache"].is_mutable_file.return_value = mutable

    def test_simple_index_is_mutable(self, client, patched_deps):
        """A cached, still-valid simple index is served without re-marking its TTL."""
        self._stub_storage(patched_deps, exists=True, payload=self._SDIST_INDEX_HTML, mutable=True)
        patched_deps["cache"].is_index_valid.return_value = True

        response = client.get(self._INDEX_URL)

        assert response.status_code == 200
        patched_deps["cache"].mark_index_cached.assert_not_called()

    def test_simple_index_urls_rewritten_to_proxy(self, client, patched_deps):
        """Upstream files-host links in a cached simple index point at the proxy instead."""
        self._stub_storage(patched_deps, exists=True, payload=self._SDIST_INDEX_HTML, mutable=True)
        patched_deps["cache"].is_index_valid.return_value = True

        response = client.get(self._INDEX_URL)

        assert response.status_code == 200
        body = response.content
        assert b"files.pythonhosted.org" not in body
        assert b"/api/v1/remote/pypi-files-test/packages/requests-2.31.0.tar.gz" in body

    def test_simple_index_content_type_is_html(self, client, patched_deps):
        """Simple index responses are served with a text/html content type."""
        self._stub_storage(patched_deps, exists=True, payload=b"<html></html>", mutable=True)
        patched_deps["cache"].is_index_valid.return_value = True

        response = client.get(self._INDEX_URL)

        assert response.status_code == 200
        assert "text/html" in response.headers["content-type"]

    def test_simple_index_cache_miss_fetches_upstream(self, client, patched_deps):
        """A cache miss triggers one upstream fetch and the served page is still rewritten."""
        page = b"<html><body><a href='https://files.pythonhosted.org/packages/p-1.0.whl'>...</a></body></html>"
        self._stub_storage(patched_deps, exists=False, payload=page, mutable=True)

        fetch_patch = patch(
            "artifactapi.main.cache_single_artifact",
            new_callable=AsyncMock,
            return_value={"status": "cached"},
        )
        with fetch_patch as mock_fetch:
            response = client.get(self._INDEX_URL)

        mock_fetch.assert_called_once()
        assert response.status_code == 200
        assert b"files.pythonhosted.org" not in response.content

    def test_wheel_file_immutable_returns_correct_content_type(self, client, patched_deps):
        """A cached wheel on the files remote is served from cache as application/zip."""
        self._stub_storage(patched_deps, exists=True, payload=b"PK wheel bytes", mutable=False)

        response = client.get("/api/v1/remote/pypi-files-test/packages/requests-2.31.0-py3-none-any.whl")

        assert response.status_code == 200
        assert "application/zip" in response.headers["content-type"]
        assert response.headers["X-Artifact-Source"] == "cache"

    def test_sdist_immutable_returns_correct_content_type(self, client, patched_deps):
        """A cached .tar.gz sdist on the files remote is served as application/gzip."""
        self._stub_storage(patched_deps, exists=True, payload=b"tar bytes", mutable=False)

        response = client.get("/api/v1/remote/pypi-files-test/packages/requests-2.31.0.tar.gz")

        assert response.status_code == 200
        assert "application/gzip" in response.headers["content-type"]

    def test_blocked_path_on_files_remote_returns_403(self, client, patched_deps):
        """Paths that match none of pypi-files-test's immutable_patterns are rejected."""
        response = client.get("/api/v1/remote/pypi-files-test/packages/requests.unknown")

        assert response.status_code == 403
|
||||
|
||||
Reference in New Issue
Block a user