feat: cache parsed member indexes as msgpack to skip YAML re-parse on rebuild
ci/woodpecker/pr/test Pipeline was successful
ci/woodpecker/pr/pre-commit Pipeline was successful
ci/woodpecker/pr/build Pipeline was successful

Warm rebuilds of virtual repos (member caches valid, virtual TTL expired)
previously re-parsed all member index.yaml files on every rebuild. With 19
Helm members totalling 14 MB, YAML parsing was 60% of merge time (~6.3s of
~9.6s). With this change, whenever a member's YAML is parsed, its entries are
also serialized to msgpack and stored in S3 alongside the raw index. Subsequent
rebuilds load the compact msgpack and skip YAML parsing entirely.

Before: warm rebuild ~9.6s (CSafeLoader baseline)
After:  warm rebuild ~5.9s (38% faster, merge=4.7s down from ~9.6s)
This commit is contained in:
2026-05-02 17:04:19 +10:00
parent 15f934cd0b
commit 1e0f4dc840
4 changed files with 284 additions and 41 deletions
+86 -18
View File
@@ -6,6 +6,7 @@ from datetime import UTC, date, datetime
from typing import Protocol, runtime_checkable
import httpx
import msgpack as _msgpack
import yaml
from fastapi import HTTPException, Request, Response
@@ -42,21 +43,43 @@ _HelmDumper.add_representer(datetime, _repr_datetime)
_HelmDumper.add_representer(date, _repr_date)
def _msgpack_safe_value(value):
    """Return *value* with every date/datetime replaced by its ISO-8601 string.

    Dicts and lists are rebuilt recursively so that temporal values nested at
    any depth are converted; every other value is returned unchanged.
    """
    # datetime is a subclass of date, so a single check covers both types.
    if isinstance(value, date):
        return value.isoformat()
    if isinstance(value, dict):
        return {key: _msgpack_safe_value(item) for key, item in value.items()}
    if isinstance(value, list):
        return [_msgpack_safe_value(item) for item in value]
    return value


def _entries_to_msgpack_safe(entries: dict) -> dict:
    """Convert datetime/date values to ISO strings for msgpack serialization.

    Args:
        entries: Mapping of chart name to a list of version-entry dicts, as
            found under the ``entries`` key of a parsed index.

    Returns:
        A new mapping with the same shape in which every ``date`` or
        ``datetime`` value, including ones nested inside dicts or lists of a
        version entry, is replaced by its ``isoformat()`` string. The input is
        not mutated.
    """
    # Converting only the top-level fields is not enough: a single temporal
    # value nested deeper in an entry would still make packing fail and the
    # whole member would lose its msgpack cache.
    return {
        chart: [
            {field: _msgpack_safe_value(val) for field, val in version.items()}
            for version in versions
        ]
        for chart, versions in entries.items()
    }
async def _get_member_index(
member_name: str,
member_cfg: dict,
path: str,
storage,
cache,
) -> tuple[str, dict, int, bytes | None]:
) -> tuple[str, dict, int, bytes | None, dict | None]:
"""Fetch or retrieve cached index.yaml for one member remote.
Returns (member_name, member_cfg, ttl, raw_bytes).
Returns (member_name, member_cfg, ttl, raw_bytes, parsed_entries).
raw_bytes is None if the member is unreachable and not in S3.
parsed_entries is the pre-parsed entries dict (from msgpack cache), or None.
"""
member_ttl = member_cfg.get("cache", {}).get("mutable_ttl", 3600)
s3_key = storage.get_object_key(member_name, path)
msgpack_key = storage.get_object_key(member_name, "index.msgpack")
raw_data: bytes | None = None
parsed_entries: dict | None = None
if storage.exists(s3_key) and cache.is_index_valid(member_name, path):
try:
@@ -64,6 +87,13 @@ async def _get_member_index(
logger.info(f"Virtual: cache hit for member '{member_name}'")
except Exception:
raw_data = None
if raw_data is not None and storage.exists(msgpack_key):
try:
packed = storage.download_object(msgpack_key)
parsed_entries = _msgpack.unpackb(packed, raw=False)
logger.debug(f"Virtual: msgpack hit for member '{member_name}'")
except Exception:
parsed_entries = None
if raw_data is None:
base_url = member_cfg.get("base_url", "").rstrip("/")
@@ -81,14 +111,23 @@ async def _get_member_index(
raw_data = response.content
except Exception as e:
logger.warning(f"Virtual: failed to fetch index.yaml from member '{member_name}': {e}")
return member_name, member_cfg, member_ttl, None
return member_name, member_cfg, member_ttl, None, None
try:
storage.upload(s3_key, raw_data)
cache.mark_index_cached(member_name, path, member_ttl)
except Exception as e:
logger.warning(f"Virtual: failed to cache index.yaml for member '{member_name}': {e}")
return member_name, member_cfg, member_ttl, raw_data
if parsed_entries is None and raw_data is not None:
try:
index = yaml.load(raw_data, Loader=_YamlLoader)
safe_entries = _entries_to_msgpack_safe(index.get("entries") or {})
storage.upload(msgpack_key, _msgpack.packb(safe_entries, use_bin_type=True))
parsed_entries = safe_entries
except Exception as e:
logger.warning(f"Virtual: failed to build msgpack cache for '{member_name}': {e}")
return member_name, member_cfg, member_ttl, raw_data, parsed_entries
def _rewrite_urls(urls: list, base_url: str, proxy_base: str, member_name: str) -> list:
@@ -104,24 +143,35 @@ def _rewrite_urls(urls: list, base_url: str, proxy_base: str, member_name: str)
return rewritten
def _merge_helm_indexes(raw_indexes: list[bytes], member_names: list[str], member_configs: list[dict], proxy_base: str) -> bytes:
def _merge_helm_indexes(
raw_indexes: list[bytes],
parsed_entries_list: list[dict | None],
member_names: list[str],
member_configs: list[dict],
proxy_base: str,
) -> bytes:
"""Merge helm index.yaml files with per-member URL rewriting.
Priority is determined by position in member_names: earlier members win
when the same chart name + version appears in multiple remotes.
Uses pre-parsed msgpack entries when available to skip YAML parsing.
"""
merged_entries: dict[str, list] = {}
for raw_data, member_name, member_cfg in zip(raw_indexes, member_names, member_configs):
for raw_data, pre_parsed, member_name, member_cfg in zip(raw_indexes, parsed_entries_list, member_names, member_configs):
base_url = member_cfg.get("base_url", "").rstrip("/")
try:
index = yaml.load(raw_data, Loader=_YamlLoader)
except Exception as e:
logger.warning(f"Virtual: failed to parse index.yaml from member '{member_name}': {e}")
continue
if pre_parsed is not None:
entries = pre_parsed
else:
try:
index = yaml.load(raw_data, Loader=_YamlLoader)
except Exception as e:
logger.warning(f"Virtual: failed to parse index.yaml from member '{member_name}': {e}")
continue
entries = index.get("entries") or {}
for chart_name, versions in (index.get("entries") or {}).items():
for chart_name, versions in entries.items():
for version_entry in versions:
version_entry["urls"] = _rewrite_urls(
version_entry.get("urls") or [],
@@ -150,7 +200,14 @@ def _merge_helm_indexes(raw_indexes: list[bytes], member_names: list[str], membe
@runtime_checkable
class _VirtualHandler(Protocol):
def accepts_path(self, path: str) -> bool: ...
def merge(self, raw_indexes: list[bytes], member_names: list[str], member_configs: list[dict], proxy_base: str) -> bytes: ...
def merge(
self,
raw_indexes: list[bytes],
parsed_entries: list[dict | None],
member_names: list[str],
member_configs: list[dict],
proxy_base: str,
) -> bytes: ...
def path_error(self) -> str: ...
@@ -158,8 +215,15 @@ class _HelmHandler:
def accepts_path(self, path: str) -> bool:
    """Return True only when *path* is exactly ``index.yaml``."""
    return path == "index.yaml"
def merge(self, raw_indexes: list[bytes], member_names: list[str], member_configs: list[dict], proxy_base: str) -> bytes:
return _merge_helm_indexes(raw_indexes, member_names, member_configs, proxy_base)
def merge(
    self,
    raw_indexes: list[bytes],
    parsed_entries: list[dict | None],
    member_names: list[str],
    member_configs: list[dict],
    proxy_base: str,
) -> bytes:
    """Merge the member indexes by delegating to ``_merge_helm_indexes``.

    All five arguments are forwarded unchanged and in the same order; the
    bytes returned by ``_merge_helm_indexes`` are returned as-is.
    """
    return _merge_helm_indexes(raw_indexes, parsed_entries, member_names, member_configs, proxy_base)
def path_error(self) -> str:
    """Return the message stating that virtual helm repositories only serve index.yaml."""
    return "Virtual helm repositories only serve index.yaml; chart tarballs are served directly by member remotes"
@@ -210,17 +274,19 @@ async def handle(request: Request, virtual_name: str, path: str, storage, cache,
fetch_ms = int((time.perf_counter() - t_fetch) * 1000)
raw_indexes: list[bytes] = []
used_parsed: list[dict | None] = []
used_members: list[str] = []
used_configs: list[dict] = []
min_ttl: int | None = None
for member_name, member_cfg, member_ttl, raw_data in results:
for member_name, member_cfg, member_ttl, raw_data, parsed_entries in results:
if min_ttl is None or member_ttl < min_ttl:
min_ttl = member_ttl
if raw_data is None:
logger.warning(f"Virtual '{virtual_name}': skipping unreachable member '{member_name}'")
continue
raw_indexes.append(raw_data)
used_parsed.append(parsed_entries)
used_members.append(member_name)
used_configs.append(member_cfg)
@@ -231,7 +297,7 @@ async def handle(request: Request, virtual_name: str, path: str, storage, cache,
min_ttl = 3600
t_merge = time.perf_counter()
merged = await asyncio.to_thread(handler.merge, raw_indexes, used_members, used_configs, proxy_base)
merged = await asyncio.to_thread(handler.merge, raw_indexes, used_parsed, used_members, used_configs, proxy_base)
merge_ms = int((time.perf_counter() - t_merge) * 1000)
try:
@@ -239,9 +305,11 @@ async def handle(request: Request, virtual_name: str, path: str, storage, cache,
storage.upload(virtual_key, merged)
cache.mark_index_cached(virtual_name, path, min_ttl)
store_ms = int((time.perf_counter() - t_store) * 1000)
msgpack_hits = sum(1 for p in used_parsed if p is not None)
logger.info(
f"Virtual MISS: {virtual_name}/{path} rebuilt from {used_members} "
f"(fetch={fetch_ms}ms merge={merge_ms}ms store={store_ms}ms ttl={min_ttl}s)"
f"(fetch={fetch_ms}ms merge={merge_ms}ms store={store_ms}ms ttl={min_ttl}s "
f"msgpack={msgpack_hits}/{len(used_members)})"
)
except Exception as e:
logger.warning(f"Virtual: failed to store merged index for '{virtual_name}': {e}")