diff --git a/README.md b/README.md index 66f650f..adea4e8 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ A generic FastAPI-based artifact caching system that downloads and stores files - **Stale-on-Upstream-Error**: Expired mutable files are kept and their TTL refreshed when the backend cannot be reached, so cached data remains available during upstream outages - **S3 Storage**: MinIO/S3 backend with predictable paths - **Docker Registry Proxy**: Full Docker Registry HTTP API v2 for transparent container image caching +- **npm Package Proxy**: Caching proxy for the npm registry with metadata URL rewriting so tarballs also pass through cache - **Content-Type Detection**: Automatic MIME type detection for downloads ## Architecture @@ -1031,4 +1032,68 @@ When uv requests the simple index for a package, the proxy: uv then downloads wheels and `.whl.metadata` files via the rewritten URLs, which also pass through the proxy and are cached as immutable artifacts. -For self-hosted registries like Gitea, both the index and file downloads share the same base URL. Setting `pypi_files_url` and `pypi_files_remote` to the same remote causes file links to be rewritten back through the same proxy entry. \ No newline at end of file +For self-hosted registries like Gitea, both the index and file downloads share the same base URL. Setting `pypi_files_url` and `pypi_files_remote` to the same remote causes file links to be rewritten back through the same proxy entry. + +## npm Package Proxy + +The `npm` package type turns the artifact API into a caching npm registry proxy. Since the npm registry serves both metadata and tarballs from the same host, a single remote handles everything. Package metadata (e.g. `GET /express`) is mutable and expires after `mutable_ttl`; tarballs (`.tgz`) are immutable and cached forever. 
`dist.tarball` URLs in metadata JSON are rewritten on the fly to point back through the same remote, so both the metadata lookup and the tarball download are served from cache. + +### remotes.yaml + +```yaml +remotes: + npm: + base_url: "https://registry.npmjs.org" + type: "remote" + package: "npm" + npm_files_url: "https://registry.npmjs.org" # URL prefix to rewrite in metadata JSON + npm_files_remote: "npm" # rewrite back to this same remote + check_mutable_updates: true + immutable_patterns: + - '\.tgz$' # versioned tarballs are immutable once published — cache forever + mutable_patterns: + - '^(?!.*\.tgz$).*' # everything else (package metadata) expires after mutable_ttl + cache: + immutable_ttl: 0 + mutable_ttl: 600 # re-check package metadata after 10 minutes +``` + +### Configuring npm / yarn / pnpm + +**npm** — per-project `.npmrc` or `~/.npmrc`: + +```ini +registry=https://artifacts.example.com/api/v1/remote/npm/ +``` + +**yarn** — `~/.yarnrc.yml`: + +```yaml +npmRegistryServer: "https://artifacts.example.com/api/v1/remote/npm/" +``` + +**pnpm** — `.npmrc`: + +```ini +registry=https://artifacts.example.com/api/v1/remote/npm/ +``` + +### How the rewriting works + +When a client requests package metadata, the proxy: + +1. Fetches `https://registry.npmjs.org/{package}` (or returns a cached copy within `mutable_ttl`) +2. Rewrites every `https://registry.npmjs.org/...` tarball URL to `https://artifacts.example.com/api/v1/remote/npm/...` +3. Returns the rewritten JSON to the client + +The client then downloads the tarball via the rewritten URL, which hits the same `npm` remote and is cached as an immutable artifact. Subsequent installs of the same package version are served entirely from S3. 
+ +### Mutable vs immutable paths + +| Path pattern | Type | Example | +|---|---|---| +| `/{package}` | Mutable (TTL) | `/express` | +| `/@{scope}/{package}` | Mutable (TTL) | `/@babel/core` | +| `/-/all` | Mutable (TTL) | `/-/all` | +| `/{package}/-/{package}-{version}.tgz` | Immutable (forever) | `/express/-/express-4.18.2.tgz` | +| `/@{scope}/{pkg}/-/{pkg}-{ver}.tgz` | Immutable (forever) | `/@babel/core/-/core-7.21.0.tgz` | \ No newline at end of file diff --git a/remotes.yaml b/remotes.yaml index 127de7a..a55359f 100644 --- a/remotes.yaml +++ b/remotes.yaml @@ -243,6 +243,24 @@ remotes: cache: immutable_ttl: 0 # Package files are content-addressed — cache forever + npm: + base_url: "https://registry.npmjs.org" + type: "remote" + package: "npm" + description: "npm registry — package metadata with tarball URL rewriting" + # npm_files_url: the upstream host used in metadata tarball hrefs (default: https://registry.npmjs.org) + # npm_files_remote: our proxy remote that will serve those tarballs (default: npm-files) + npm_files_url: "https://registry.npmjs.org" + npm_files_remote: "npm" + check_mutable_updates: true + immutable_patterns: + - \.tgz$ + mutable_patterns: + - ^(?!.*\.tgz$).* + cache: + immutable_ttl: 0 + mutable_ttl: 600 # Package metadata refreshed after 10 minutes + local-generic: type: "local" package: "generic" diff --git a/src/artifactapi/config.py b/src/artifactapi/config.py index c08860b..1d4b330 100644 --- a/src/artifactapi/config.py +++ b/src/artifactapi/config.py @@ -21,6 +21,7 @@ _PACKAGE_MUTABLE_PATTERNS: dict[str, list[str]] = { "pypi": [ r"simple/", # Per-package and top-level simple index pages ], + "npm": [], "generic": [], } diff --git a/src/artifactapi/main.py b/src/artifactapi/main.py index 2f8fd1e..cac6e61 100644 --- a/src/artifactapi/main.py +++ b/src/artifactapi/main.py @@ -337,7 +337,7 @@ async def handle_expired_mutable(remote_name: str, path: str, remote_url: str) - def _get_content_type(filename: str) -> str: - if 
filename.endswith(".tar.gz"): + if filename.endswith((".tar.gz", ".tgz")): return "application/gzip" if filename.endswith(".zip") or filename.endswith(".whl"): return "application/zip" @@ -369,6 +369,15 @@ def _resolve_content( f"{proxy_base}/api/v1/remote/{files_remote}".encode(), ) return data, "text/html; charset=utf-8" + if remote_config.get("package") == "npm" and not path.endswith(".tgz"): + files_url = remote_config.get("npm_files_url", "https://registry.npmjs.org") + files_remote = remote_config.get("npm_files_remote", "npm-files") + proxy_base = str(request.base_url).rstrip("/") + data = data.replace( + files_url.rstrip("/").encode(), + f"{proxy_base}/api/v1/remote/{files_remote}".encode(), + ) + return data, "application/json" return data, _get_content_type(filename) diff --git a/tests/conftest.py b/tests/conftest.py index f46e0d4..4500815 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -91,6 +91,16 @@ TEST_REMOTES = { ], "cache": {"immutable_ttl": 0, "mutable_ttl": 0}, }, + "npm-test": { + "base_url": "https://registry.npmjs.org", + "type": "remote", + "package": "npm", + "npm_files_url": "https://registry.npmjs.org", + "npm_files_remote": "npm-test", + "immutable_patterns": [r"\.tgz$"], + "mutable_patterns": [r"^(?!.*\.tgz$).*"], + "cache": {"immutable_ttl": 0, "mutable_ttl": 600}, + }, } } diff --git a/tests/test_config.py b/tests/test_config.py index 9137840..147b62c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -133,6 +133,44 @@ class TestGetMutablePatterns: assert r"repomd\.xml$" in patterns assert r"custom-meta\.xml$" in patterns + def test_npm_has_no_package_defaults(self, make_config): + cfg = make_config({"r": {"type": "remote", "package": "npm", "base_url": "https://x.com"}}) + assert cfg.get_mutable_patterns("r") == [] + + def test_npm_explicit_mutable_pattern_matches_metadata(self, make_config): + import re + + cfg = make_config( + { + "r": { + "type": "remote", + "package": "npm", + "base_url": "https://x.com", + 
"mutable_patterns": [r"^(?!.*\.tgz$).*"], + } + } + ) + patterns = cfg.get_mutable_patterns("r") + assert any(re.search(p, "express") for p in patterns) + assert any(re.search(p, "@babel/core") for p in patterns) + + def test_npm_explicit_mutable_pattern_excludes_tarballs(self, make_config): + import re + + cfg = make_config( + { + "r": { + "type": "remote", + "package": "npm", + "base_url": "https://x.com", + "mutable_patterns": [r"^(?!.*\.tgz$).*"], + } + } + ) + patterns = cfg.get_mutable_patterns("r") + assert not any(re.search(p, "express-4.18.2.tgz") for p in patterns) + assert not any(re.search(p, "express/-/express-4.18.2.tgz") for p in patterns) + # --------------------------------------------------------------------------- # get_immutable_patterns diff --git a/tests/test_routes.py b/tests/test_routes.py index 9d8bdaa..1773a7e 100644 --- a/tests/test_routes.py +++ b/tests/test_routes.py @@ -741,3 +741,103 @@ class TestPyPIRemote: """Paths that don't match immutable_patterns on pypi-files-test are blocked.""" response = client.get("/api/v1/remote/pypi-files-test/packages/requests.unknown") assert response.status_code == 403 + + +# --------------------------------------------------------------------------- +# npm remote /api/v1/remote/npm-test/... 
+# --------------------------------------------------------------------------- + + +class TestNpmRemote: + def test_package_metadata_is_mutable(self, client, patched_deps): + """Top-level package metadata paths are detected as mutable.""" + deps = patched_deps + meta = b'{"name":"express","versions":{}}' + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = meta + deps["cache"].is_mutable_file.return_value = True + deps["cache"].is_index_valid.return_value = True + + response = client.get("/api/v1/remote/npm-test/express") + assert response.status_code == 200 + deps["cache"].mark_index_cached.assert_not_called() + + def test_metadata_tarball_urls_rewritten_to_proxy(self, client, patched_deps): + """registry.npmjs.org tarball URLs in metadata JSON are rewritten to our proxy.""" + deps = patched_deps + meta = b'{"dist":{"tarball":"https://registry.npmjs.org/express/-/express-4.18.2.tgz"}}' + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = meta + deps["cache"].is_mutable_file.return_value = True + deps["cache"].is_index_valid.return_value = True + + response = client.get("/api/v1/remote/npm-test/express") + assert response.status_code == 200 + assert b"registry.npmjs.org" not in response.content + assert b"/api/v1/remote/npm-test/express/-/express-4.18.2.tgz" in response.content + + def test_metadata_content_type_is_json(self, client, patched_deps): + deps = patched_deps + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = b'{"name":"express"}' + deps["cache"].is_mutable_file.return_value = True + deps["cache"].is_index_valid.return_value = True + + response = client.get("/api/v1/remote/npm-test/express") + assert response.status_code == 200 + assert "application/json" in response.headers["content-type"] + + def test_scoped_package_metadata_rewritten(self, client, patched_deps): + """@scope/package metadata URLs are also rewritten back to the 
same npm-test remote.""" + deps = patched_deps + meta = b'{"dist":{"tarball":"https://registry.npmjs.org/@babel/core/-/core-7.21.0.tgz"}}' + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = meta + deps["cache"].is_mutable_file.return_value = True + deps["cache"].is_index_valid.return_value = True + + response = client.get("/api/v1/remote/npm-test/@babel/core") + assert response.status_code == 200 + assert b"registry.npmjs.org" not in response.content + assert b"/api/v1/remote/npm-test/@babel/core/-/core-7.21.0.tgz" in response.content + + def test_tarball_not_rewritten(self, client, patched_deps): + """Tarball requests (.tgz) bypass URL rewriting and return binary.""" + deps = patched_deps + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = b"\x1f\x8b tgz bytes" + deps["cache"].is_mutable_file.return_value = False + + response = client.get("/api/v1/remote/npm-test/express/-/express-4.18.2.tgz") + assert response.status_code == 200 + assert "application/gzip" in response.headers["content-type"] + assert response.headers["X-Artifact-Source"] == "cache" + + def test_metadata_cache_miss_fetches_upstream(self, client, patched_deps): + deps = patched_deps + meta = b'{"dist":{"tarball":"https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz"}}' + deps["storage"].exists.return_value = False + deps["storage"].download_object.return_value = meta + deps["cache"].is_mutable_file.return_value = True + + with patch( + "artifactapi.main.cache_single_artifact", + new_callable=AsyncMock, + return_value={"status": "cached"}, + ) as mock_fetch: + response = client.get("/api/v1/remote/npm-test/lodash") + + mock_fetch.assert_called_once() + assert response.status_code == 200 + assert b"registry.npmjs.org" not in response.content + + def test_tarball_immutable_allowed_on_npm_remote(self, client, patched_deps): + """Tarballs (.tgz) match immutable_patterns and are served without rewriting.""" + deps = 
patched_deps + deps["storage"].exists.return_value = True + deps["storage"].download_object.return_value = b"tgz bytes" + deps["cache"].is_mutable_file.return_value = False + + response = client.get("/api/v1/remote/npm-test/express/-/express-4.18.2.tgz") + assert response.status_code == 200 + assert "application/gzip" in response.headers["content-type"]