feat: implement hierarchical S3 keys and automated version management

This commit introduces two major improvements:

1. **Hierarchical S3 Key Structure**:
   - Replace URL-based hashing with keys of the form <remote_name>/<first 16 hex chars of sha256(directory_path)>/<filename>
   - Enables remote-specific cache operations and intuitive S3 organization
   - Cache keys are now independent of mirror URL changes
   - Example: fedora/886d215f6d1a0108/eccodes-2.44.0-1.fc42.x86_64.rpm

2. **Automated Version Management**:
   - Add bumpver for semantic version bumping
   - Single source of truth in pyproject.toml
   - The FastAPI app reads its version from the installed package metadata (importlib.metadata), falling back to "dev" when the package is not installed
   - Eliminates manual version synchronization between files

Changes:
- storage.py: get_object_key now takes (remote_name, path) and hashes the directory path; get_object_key_from_path is removed
- main.py: Dynamic version import and updated cache key generation calls
- cache.py: Updated to use new hierarchical key structure
- pyproject.toml: Added bumpver config and dev dependency

Breaking change: the S3 key format has changed, so objects cached under the old keys will no longer be found and the existing cache will need to be regenerated.
This commit is contained in:
Ben Vincent 2026-01-24 23:51:03 +11:00
parent 1a71a2c9fa
commit 5a0e8b4e0b
4 changed files with 49 additions and 33 deletions

View File

@ -40,4 +40,18 @@ dev = [
"isort>=5.12.0", "isort>=5.12.0",
"mypy>=1.6.0", "mypy>=1.6.0",
"ruff>=0.1.0", "ruff>=0.1.0",
"bumpver>=2025.1131",
] ]
[tool.bumpver]
current_version = "2.0.2"
version_pattern = "MAJOR.MINOR.PATCH"
commit_message = "Bump version {old_version} → {new_version}"
commit = true
tag = true
push = false
[[tool.bumpver.files]]
filename = "pyproject.toml"
search = 'version = "{current_version}"'
replace = 'version = "{new_version}"'

View File

@ -76,10 +76,8 @@ class RedisCache:
if remote_config: if remote_config:
base_url = remote_config.get("base_url") base_url = remote_config.get("base_url")
if base_url: if base_url:
# Construct URL the same way as construct_remote_url # Use hierarchical path-based key (same as cache_single_artifact)
remote_url = f"{base_url.rstrip('/')}/{path}" s3_key = storage.get_object_key(remote_name, path)
# Use URL-based key (same as cache_single_artifact)
s3_key = storage.get_object_key(remote_url)
if storage.exists(s3_key): if storage.exists(s3_key):
storage.client.delete_object(Bucket=storage.bucket, Key=s3_key) storage.client.delete_object(Bucket=storage.bucket, Key=s3_key)
except Exception: except Exception:

View File

@ -9,6 +9,13 @@ from fastapi.responses import PlainTextResponse, JSONResponse
from pydantic import BaseModel from pydantic import BaseModel
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
try:
from importlib.metadata import version
__version__ = version("artifactapi")
except ImportError:
# Fallback for development when package isn't installed
__version__ = "dev"
from .config import ConfigManager from .config import ConfigManager
from .database import DatabaseManager from .database import DatabaseManager
from .storage import S3Storage from .storage import S3Storage
@ -28,7 +35,7 @@ logging.basicConfig(
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
app = FastAPI(title="Artifact Storage API", version="2.0.2") app = FastAPI(title="Artifact Storage API", version=__version__)
# Initialize components using config # Initialize components using config
config_path = os.environ.get("CONFIG_PATH") config_path = os.environ.get("CONFIG_PATH")
@ -181,11 +188,8 @@ async def check_artifact_patterns(
async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict: async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict:
# Check if using URL-based key or path-based key # Use hierarchical path-based key
if url.startswith("http"): key = storage.get_object_key(remote_name, path)
key = storage.get_object_key(url)
else:
key = storage.get_object_key_from_path(remote_name, path)
if storage.exists(key): if storage.exists(key):
logger.info(f"Cache ALREADY EXISTS: {url} (key: {key})") logger.info(f"Cache ALREADY EXISTS: {url} (key: {key})")
@ -267,15 +271,10 @@ async def get_artifact(remote_name: str, path: str):
# Construct the remote URL # Construct the remote URL
remote_url = await construct_remote_url(remote_name, path) remote_url = await construct_remote_url(remote_name, path)
# Check if artifact is already cached (try both URL and path-based keys) # Check if artifact is already cached
url_key = storage.get_object_key(remote_url) cached_key = storage.get_object_key(remote_name, path)
path_key = storage.get_object_key_from_path(remote_name, path) if not storage.exists(cached_key):
cached_key = None
cached_key = None
if storage.exists(url_key):
cached_key = url_key
elif storage.exists(path_key):
cached_key = path_key
# For index files, check Redis TTL validity # For index files, check Redis TTL validity
filename = os.path.basename(path) filename = os.path.basename(path)
@ -355,7 +354,7 @@ async def get_artifact(remote_name: str, path: str):
# Now return the cached artifact # Now return the cached artifact
try: try:
cache_key = storage.get_object_key(remote_url) cache_key = storage.get_object_key(remote_name, path)
artifact_data = storage.download_object(cache_key) artifact_data = storage.download_object(cache_key)
filename = os.path.basename(path) filename = os.path.basename(path)
@ -377,7 +376,7 @@ async def get_artifact(remote_name: str, path: str):
metrics.record_cache_miss(remote_name, len(artifact_data)) metrics.record_cache_miss(remote_name, len(artifact_data))
# Record artifact mapping in database # Record artifact mapping in database
cache_key = storage.get_object_key(remote_url) cache_key = storage.get_object_key(remote_name, path)
database.record_artifact_mapping( database.record_artifact_mapping(
cache_key, remote_name, path, len(artifact_data) cache_key, remote_name, path, len(artifact_data)
) )
@ -625,7 +624,11 @@ async def list_cached_artifacts(
cached_artifacts = [] cached_artifacts = []
for url in matching_urls: for url in matching_urls:
key = storage.get_object_key(url) # Extract path from URL for hierarchical key generation
from urllib.parse import urlparse
parsed = urlparse(url)
path = parsed.path
key = storage.get_object_key(remote, path)
if storage.exists(key): if storage.exists(key):
cached_artifacts.append( cached_artifacts.append(
{"url": url, "cached_url": storage.get_url(key), "key": key} {"url": url, "cached_url": storage.get_url(key), "key": key}

View File

@ -1,6 +1,5 @@
import os import os
import hashlib import hashlib
from urllib.parse import urlparse
import boto3 import boto3
from botocore.config import Config from botocore.config import Config
from botocore.exceptions import ClientError from botocore.exceptions import ClientError
@ -55,17 +54,19 @@ class S3Storage:
except ClientError: except ClientError:
self.client.create_bucket(Bucket=self.bucket) self.client.create_bucket(Bucket=self.bucket)
def get_object_key(self, url: str) -> str: def get_object_key(self, remote_name: str, path: str) -> str:
url_hash = hashlib.sha256(url.encode()).hexdigest()[:16] # Extract directory path and filename
parsed = urlparse(url) clean_path = path.lstrip('/')
filename = os.path.basename(parsed.path) filename = os.path.basename(clean_path)
return f"{parsed.netloc}/{url_hash}/{filename}" directory_path = os.path.dirname(clean_path)
def get_object_key_from_path(self, remote_name: str, path: str) -> str: # Hash the directory path to keep keys manageable while preserving remote structure
# Create a key based on the API path for direct access if directory_path:
path_hash = hashlib.sha256(path.encode()).hexdigest()[:16] path_hash = hashlib.sha256(directory_path.encode()).hexdigest()[:16]
filename = os.path.basename(path) return f"{remote_name}/{path_hash}/{filename}"
return f"{remote_name}/{path_hash}/{filename}" else:
# If no directory, just use remote and filename
return f"{remote_name}/{filename}"
def exists(self, key: str) -> bool: def exists(self, key: str) -> bool:
try: try: