feat: implement hierarchical S3 keys and automated version management
This commit introduces two major improvements:

1. **Hierarchical S3 Key Structure**:
   - Replace URL-based hashing with remote-name/hash(directory_path)/filename format
   - Enables remote-specific cache operations and intuitive S3 organization
   - Cache keys are now independent of mirror URL changes
   - Example: fedora/886d215f6d1a0108/eccodes-2.44.0-1.fc42.x86_64.rpm

2. **Automated Version Management**:
   - Add bumpver for semantic version bumping
   - Single source of truth in pyproject.toml
   - FastAPI dynamically reads version from package metadata
   - Eliminates manual version synchronization between files

Changes:
- storage.py: New get_object_key(remote_name, path) method with directory hashing
- main.py: Dynamic version import and updated cache key generation calls
- cache.py: Updated to use new hierarchical key structure
- pyproject.toml: Added bumpver config and dev dependency

Breaking change: the S3 key format has changed, so the existing cache will need regeneration.
This commit is contained in:
parent
1a71a2c9fa
commit
5a0e8b4e0b
@ -40,4 +40,18 @@ dev = [
|
||||
"isort>=5.12.0",
|
||||
"mypy>=1.6.0",
|
||||
"ruff>=0.1.0",
|
||||
"bumpver>=2025.1131",
|
||||
]
|
||||
|
||||
[tool.bumpver]
|
||||
current_version = "2.0.2"
|
||||
version_pattern = "MAJOR.MINOR.PATCH"
|
||||
commit_message = "Bump version {old_version} → {new_version}"
|
||||
commit = true
|
||||
tag = true
|
||||
push = false
|
||||
|
||||
[[tool.bumpver.files]]
|
||||
filename = "pyproject.toml"
|
||||
search = 'version = "{current_version}"'
|
||||
replace = 'version = "{new_version}"'
|
||||
|
||||
@ -76,10 +76,8 @@ class RedisCache:
|
||||
if remote_config:
|
||||
base_url = remote_config.get("base_url")
|
||||
if base_url:
|
||||
# Construct URL the same way as construct_remote_url
|
||||
remote_url = f"{base_url.rstrip('/')}/{path}"
|
||||
# Use URL-based key (same as cache_single_artifact)
|
||||
s3_key = storage.get_object_key(remote_url)
|
||||
# Use hierarchical path-based key (same as cache_single_artifact)
|
||||
s3_key = storage.get_object_key(remote_name, path)
|
||||
if storage.exists(s3_key):
|
||||
storage.client.delete_object(Bucket=storage.bucket, Key=s3_key)
|
||||
except Exception:
|
||||
|
||||
@ -9,6 +9,13 @@ from fastapi.responses import PlainTextResponse, JSONResponse
|
||||
from pydantic import BaseModel
|
||||
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
|
||||
|
||||
try:
|
||||
from importlib.metadata import version
|
||||
__version__ = version("artifactapi")
|
||||
except ImportError:
|
||||
# Fallback for development when package isn't installed
|
||||
__version__ = "dev"
|
||||
|
||||
from .config import ConfigManager
|
||||
from .database import DatabaseManager
|
||||
from .storage import S3Storage
|
||||
@ -28,7 +35,7 @@ logging.basicConfig(
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
app = FastAPI(title="Artifact Storage API", version="2.0.2")
|
||||
app = FastAPI(title="Artifact Storage API", version=__version__)
|
||||
|
||||
# Initialize components using config
|
||||
config_path = os.environ.get("CONFIG_PATH")
|
||||
@ -181,11 +188,8 @@ async def check_artifact_patterns(
|
||||
|
||||
|
||||
async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict:
|
||||
# Check if using URL-based key or path-based key
|
||||
if url.startswith("http"):
|
||||
key = storage.get_object_key(url)
|
||||
else:
|
||||
key = storage.get_object_key_from_path(remote_name, path)
|
||||
# Use hierarchical path-based key
|
||||
key = storage.get_object_key(remote_name, path)
|
||||
|
||||
if storage.exists(key):
|
||||
logger.info(f"Cache ALREADY EXISTS: {url} (key: {key})")
|
||||
@ -267,15 +271,10 @@ async def get_artifact(remote_name: str, path: str):
|
||||
# Construct the remote URL
|
||||
remote_url = await construct_remote_url(remote_name, path)
|
||||
|
||||
# Check if artifact is already cached (try both URL and path-based keys)
|
||||
url_key = storage.get_object_key(remote_url)
|
||||
path_key = storage.get_object_key_from_path(remote_name, path)
|
||||
|
||||
cached_key = None
|
||||
if storage.exists(url_key):
|
||||
cached_key = url_key
|
||||
elif storage.exists(path_key):
|
||||
cached_key = path_key
|
||||
# Check if artifact is already cached
|
||||
cached_key = storage.get_object_key(remote_name, path)
|
||||
if not storage.exists(cached_key):
|
||||
cached_key = None
|
||||
|
||||
# For index files, check Redis TTL validity
|
||||
filename = os.path.basename(path)
|
||||
@ -355,7 +354,7 @@ async def get_artifact(remote_name: str, path: str):
|
||||
|
||||
# Now return the cached artifact
|
||||
try:
|
||||
cache_key = storage.get_object_key(remote_url)
|
||||
cache_key = storage.get_object_key(remote_name, path)
|
||||
artifact_data = storage.download_object(cache_key)
|
||||
filename = os.path.basename(path)
|
||||
|
||||
@ -377,7 +376,7 @@ async def get_artifact(remote_name: str, path: str):
|
||||
metrics.record_cache_miss(remote_name, len(artifact_data))
|
||||
|
||||
# Record artifact mapping in database
|
||||
cache_key = storage.get_object_key(remote_url)
|
||||
cache_key = storage.get_object_key(remote_name, path)
|
||||
database.record_artifact_mapping(
|
||||
cache_key, remote_name, path, len(artifact_data)
|
||||
)
|
||||
@ -625,7 +624,11 @@ async def list_cached_artifacts(
|
||||
|
||||
cached_artifacts = []
|
||||
for url in matching_urls:
|
||||
key = storage.get_object_key(url)
|
||||
# Extract path from URL for hierarchical key generation
|
||||
from urllib.parse import urlparse
|
||||
parsed = urlparse(url)
|
||||
path = parsed.path
|
||||
key = storage.get_object_key(remote, path)
|
||||
if storage.exists(key):
|
||||
cached_artifacts.append(
|
||||
{"url": url, "cached_url": storage.get_url(key), "key": key}
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
import os
|
||||
import hashlib
|
||||
from urllib.parse import urlparse
|
||||
import boto3
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError
|
||||
@ -55,17 +54,19 @@ class S3Storage:
|
||||
except ClientError:
|
||||
self.client.create_bucket(Bucket=self.bucket)
|
||||
|
||||
def get_object_key(self, url: str) -> str:
|
||||
url_hash = hashlib.sha256(url.encode()).hexdigest()[:16]
|
||||
parsed = urlparse(url)
|
||||
filename = os.path.basename(parsed.path)
|
||||
return f"{parsed.netloc}/{url_hash}/{filename}"
|
||||
def get_object_key(self, remote_name: str, path: str) -> str:
|
||||
# Extract directory path and filename
|
||||
clean_path = path.lstrip('/')
|
||||
filename = os.path.basename(clean_path)
|
||||
directory_path = os.path.dirname(clean_path)
|
||||
|
||||
def get_object_key_from_path(self, remote_name: str, path: str) -> str:
|
||||
# Create a key based on the API path for direct access
|
||||
path_hash = hashlib.sha256(path.encode()).hexdigest()[:16]
|
||||
filename = os.path.basename(path)
|
||||
return f"{remote_name}/{path_hash}/{filename}"
|
||||
# Hash the directory path to keep keys manageable while preserving remote structure
|
||||
if directory_path:
|
||||
path_hash = hashlib.sha256(directory_path.encode()).hexdigest()[:16]
|
||||
return f"{remote_name}/{path_hash}/{filename}"
|
||||
else:
|
||||
# If no directory, just use remote and filename
|
||||
return f"{remote_name}/{filename}"
|
||||
|
||||
def exists(self, key: str) -> bool:
|
||||
try:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user