commit 46711eec6acd49967b1dcf29ecd72484271c1f79 Author: Ben Vincent Date: Wed Dec 31 20:07:32 2025 +1100 Initial implementation of generic artifact storage system - FastAPI-based caching proxy for remote file servers - YAML configuration for multiple remotes (GitHub, Gitea, HashiCorp, etc.) - Direct URL API: /api/v1/remote/{remote}/{path} with auto-download and caching - Pattern-based access control with regex filtering - S3/MinIO backend storage with predictable paths - Docker Compose setup with MinIO for local development diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0579fc4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,47 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Virtual environment +.venv/ +venv/ +ENV/ +env/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Environment variables +.env +remotes.yaml + +# Logs +*.log + +# uv +uv.lock + +# Docker volumes +minio_data/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3279669 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,49 @@ +# Use Alpine Linux as base image +FROM python:3.11-alpine + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apk add --no-cache \ + gcc \ + musl-dev \ + libffi-dev \ + postgresql-dev \ + curl \ + wget \ + tar + +# Install uv +ARG PACKAGE_VERSION=0.9.21 +RUN wget -O /app/uv-x86_64-unknown-linux-musl.tar.gz https://github.com/astral-sh/uv/releases/download/${PACKAGE_VERSION}/uv-x86_64-unknown-linux-musl.tar.gz && \ + tar xf /app/uv-x86_64-unknown-linux-musl.tar.gz -C /app && \ + mv /app/uv-x86_64-unknown-linux-musl/uv /usr/local/bin/uv && \ + rm -rf /app/uv-x86_64-unknown-linux-musl* && \ + chmod +x /usr/local/bin/uv && \ + uv --version + +# Create non-root user first +RUN adduser -D -s /bin/sh appuser && \ + chown -R appuser:appuser /app + +# Copy dependency files and change ownership +COPY --chown=appuser:appuser pyproject.toml uv.lock README.md ./ + +# Switch to appuser and install Python dependencies +USER appuser +RUN uv sync --frozen + +# Copy application source +COPY --chown=appuser:appuser src/ ./src/ +COPY --chown=appuser:appuser remotes.yaml ./ + +# Expose port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +# Run the application +CMD ["uv", "run", "python", "-m", "src.artifactapi.main"] \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cb85240 --- /dev/null +++ b/Makefile @@ -0,0 +1,47 @@ +.PHONY: build install dev clean test lint format docker-build docker-up docker-down docker-logs docker-rebuild docker-clean docker-restart + +build: + docker build --no-cache -t artifactapi:latest . + +install: build + +docker-build: build + +dev: build + uv sync --dev + +clean: + rm -rf .venv + rm -rf build/ + rm -rf dist/ + rm -rf *.egg-info/ + +test: + uv run pytest + +lint: + uv run ruff check --fix . + +format: + uv run ruff format . 
+
+run:
+	uv run python -m src.artifactapi.main
+
+docker-up:
+	docker-compose up --build --force-recreate -d
+
+docker-down:
+	docker-compose down
+
+docker-logs:
+	docker-compose logs -f
+
+docker-rebuild:
+	docker-compose build --no-cache
+
+docker-clean:
+	docker-compose down -v --remove-orphans
+	docker system prune -f
+
+docker-restart: docker-down docker-up
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..508c9d9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,664 @@
+# Artifact Storage System
+
+A generic FastAPI-based artifact caching system that downloads and stores files from remote sources (GitHub, Gitea, HashiCorp, etc.) in S3-compatible storage, with configuration-based access control.
+
+## Features
+
+- **Generic Remote Support**: Works with any HTTP-based file server (GitHub, Gitea, HashiCorp, custom servers)
+- **Configuration-Based**: YAML configuration for remotes, patterns, and access control
+- **Direct URL API**: Access cached files via clean URLs like `/api/v1/remote/github/owner/repo/path/file.tar.gz`
+- **Pattern Filtering**: Regex-based include patterns for security and organization
+- **Smart Caching**: Automatic download and cache on first access, served from cache afterward
+- **S3 Storage**: MinIO/S3 backend with predictable paths
+- **Content-Type Detection**: Automatic MIME type detection for downloads
+
+## Architecture
+
+The system acts as a caching proxy that:
+1. Receives requests via the `/api/v1/remote/{remote}/{path}` endpoint
+2. Checks if the file is already cached
+3. If not cached, downloads it from the configured remote and caches it
+4. Serves the file with appropriate headers and content types
+5. Enforces access control via configurable regex patterns
+
+## Quick Start
+
+1. Start the backing services (MinIO, PostgreSQL, Redis):
+```bash
+docker-compose up -d minio postgres redis
+```
+
+2. Install dependencies (uv creates the virtual environment automatically):
+```bash
+uv sync
+```
+
+3. Set the required environment variables (see Configuration below), then start the API:
+```bash
+uv run python -m src.artifactapi.main
+```
+
+4. Access artifacts directly via URL:
+```bash
+# This will download and cache the file on first access
+xh GET localhost:8000/api/v1/remote/github/gruntwork-io/terragrunt/releases/download/v0.96.1/terragrunt_linux_amd64.tar.gz
+
+# Subsequent requests are served from cache (check the X-Artifact-Source: cache response header)
+curl -s -D - -o /dev/null localhost:8000/api/v1/remote/github/gruntwork-io/terragrunt/releases/download/v0.96.1/terragrunt_linux_amd64.tar.gz
+```
+
+## API Endpoints
+
+### Direct Access
+- `GET /api/v1/remote/{remote}/{path}` - Direct access to artifacts with auto-caching
+- `PUT` / `HEAD` / `DELETE` `/api/v1/remote/{remote}/{path}` - Upload, check, and delete files in local repositories
+
+### Management
+- `GET /` - API info and available remotes
+- `GET /health` - Health check
+- `GET /config` - View current configuration
+- `GET /metrics` - Prometheus metrics (append `?json=true` for JSON)
+- `POST /api/v1/artifacts/cache` - Batch cache artifacts matching a pattern
+- `GET /api/v1/artifacts/{remote}` - List cached artifacts
+
+## Configuration
+
+The system uses `remotes.yaml` to define remote repositories and access patterns. All other configuration is provided via environment variables.
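+
+For example, to run the API on the host against the services started by `docker-compose.yml`, one possible setup is the following sketch (values match the compose-file defaults and the default published ports; adjust for your environment):
+
+```bash
+# Backing services as published on localhost by docker-compose
+export DBHOST=localhost DBPORT=5432 DBUSER=artifacts DBPASS=artifacts123 DBNAME=artifacts
+export REDIS_URL=redis://localhost:6379
+export MINIO_ENDPOINT=localhost:9000 MINIO_ACCESS_KEY=minioadmin MINIO_SECRET_KEY=minioadmin
+export MINIO_BUCKET=artifacts MINIO_SECURE=false
+
+# Run from the repo root with a remotes.yaml present in the working directory
+uv run python -m src.artifactapi.main
+```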
+ +### remotes.yaml Structure + +```yaml +remotes: + remote-name: + base_url: "https://example.com" # Base URL for the remote + type: "remote" # Type: "remote" or "local" + package: "generic" # Package type: "generic", "alpine", "rpm" + description: "Human readable description" + include_patterns: # Regex patterns for allowed files + - "pattern1" + - "pattern2" + cache: # Cache configuration (optional) + file_ttl: 0 # File cache TTL (0 = indefinite) + index_ttl: 300 # Index file TTL in seconds +``` + +### Remote Types + +#### Generic Remotes +For general file hosting (GitHub releases, custom servers): + +```yaml +remotes: + github: + base_url: "https://github.com" + type: "remote" + package: "generic" + description: "GitHub releases and files" + include_patterns: + - "gruntwork-io/terragrunt/.*terragrunt_linux_amd64.*" + - "lxc/incus/.*\\.tar\\.gz$" + - "prometheus/node_exporter/.*/node_exporter-.*\\.linux-amd64\\.tar\\.gz$" + cache: + file_ttl: 0 # Cache files indefinitely + index_ttl: 0 # No index files for generic remotes + + hashicorp-releases: + base_url: "https://releases.hashicorp.com" + type: "remote" + package: "generic" + description: "HashiCorp product releases" + include_patterns: + - "terraform/.*terraform_.*_linux_amd64\\.zip$" + - "vault/.*vault_.*_linux_amd64\\.zip$" + - "consul/.*/consul_.*_linux_amd64\\.zip$" + cache: + file_ttl: 0 + index_ttl: 0 +``` + +#### Package Repository Remotes +For Linux package repositories with index files: + +```yaml +remotes: + alpine: + base_url: "https://dl-cdn.alpinelinux.org" + type: "remote" + package: "alpine" + description: "Alpine Linux APK package repository" + include_patterns: + - ".*/x86_64/.*\\.apk$" # Only x86_64 packages + cache: + file_ttl: 0 # Cache packages indefinitely + index_ttl: 7200 # Cache APKINDEX.tar.gz for 2 hours + + almalinux: + base_url: "http://mirror.aarnet.edu.au/pub/almalinux" + type: "remote" + package: "rpm" + description: "AlmaLinux RPM package repository" + include_patterns: + - ".*/x86_64/.*\\.rpm$" + - ".*/noarch/.*\\.rpm$" + cache: + file_ttl: 0 + index_ttl: 7200 # Cache metadata files for 2 hours +``` + +#### Local Repositories +For storing custom artifacts: + +```yaml +remotes: + local-generic: + type: "local" + package: "generic" + description: "Local generic file repository" + cache: + file_ttl: 0 + index_ttl: 0 +``` + +### Include Patterns + +Include patterns are regular expressions that control which files can be accessed: + +```yaml +include_patterns: + # Specific project patterns + - "gruntwork-io/terragrunt/.*terragrunt_linux_amd64.*" + + # File extension patterns + - ".*\\.tar\\.gz$" + - ".*\\.zip$" + - ".*\\.rpm$" + + # Architecture-specific patterns + - ".*/x86_64/.*" + - ".*/linux-amd64/.*" + + # Version-specific patterns + - "prometheus/node_exporter/.*/node_exporter-.*\\.linux-amd64\\.tar\\.gz$" +``` + +**Security Note**: Only files matching at least one include pattern are accessible. Files not matching any pattern return HTTP 403. + +### Cache Configuration + +Control how long different file types are cached: + +```yaml +cache: + file_ttl: 0 # Regular files (0 = cache indefinitely) + index_ttl: 300 # Index files like APKINDEX.tar.gz (seconds) +``` + +**Index Files**: Repository metadata files that change frequently: +- Alpine: `APKINDEX.tar.gz` +- RPM: `repomd.xml`, `*-primary.xml.gz`, etc. 
+- These are automatically detected and use `index_ttl` + +### Environment Variables + +All runtime configuration comes from environment variables: + +**Database Configuration:** +- `DBHOST` - PostgreSQL host +- `DBPORT` - PostgreSQL port +- `DBUSER` - PostgreSQL username +- `DBPASS` - PostgreSQL password +- `DBNAME` - PostgreSQL database name + +**Redis Configuration:** +- `REDIS_URL` - Redis connection URL (e.g., `redis://localhost:6379`) + +**S3/MinIO Configuration:** +- `MINIO_ENDPOINT` - MinIO/S3 endpoint +- `MINIO_ACCESS_KEY` - S3 access key +- `MINIO_SECRET_KEY` - S3 secret key +- `MINIO_BUCKET` - S3 bucket name +- `MINIO_SECURE` - Use HTTPS (`true`/`false`) + +## Usage Examples + +### Direct File Access +```bash +# Access GitHub releases +curl localhost:8000/api/github/gruntwork-io/terragrunt/releases/download/v0.96.1/terragrunt_linux_amd64.tar.gz + +# Access HashiCorp releases (when configured) +curl localhost:8000/api/hashicorp/terraform/1.6.0/terraform_1.6.0_linux_amd64.zip + +# Access custom remotes +curl localhost:8000/api/custom/path/to/file.tar.gz +``` + +### Response Headers +- `X-Artifact-Source: cache|remote` - Indicates if served from cache or freshly downloaded +- `Content-Type` - Automatically detected (application/gzip, application/zip, etc.) +- `Content-Disposition` - Download filename +- `Content-Length` - File size + +### Pattern Enforcement +Access is controlled by regex patterns in the configuration. Requests for files not matching any pattern return HTTP 403. + +## Storage Path Format + +Files are stored with keys like: +- `{remote_name}/{path_hash}/{filename}` for direct API access +- `{hostname}/{url_hash}/{filename}` for legacy batch operations + +Example: `github/a1b2c3d4e5f6g7h8/terragrunt_linux_amd64.tar.gz` + +## Kubernetes Deployment + +Deploy the artifact storage system to Kubernetes using the following manifests: + +### 1. Namespace +```yaml +apiVersion: v1 +kind: Namespace +metadata: + name: artifact-storage +``` + +### 2. ConfigMap for remotes.yaml +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: artifactapi-config + namespace: artifact-storage +data: + remotes.yaml: | + remotes: + github: + base_url: "https://github.com" + type: "remote" + package: "generic" + description: "GitHub releases and files" + include_patterns: + - "gruntwork-io/terragrunt/.*terragrunt_linux_amd64.*" + - "lxc/incus/.*\\.tar\\.gz$" + - "prometheus/node_exporter/.*/node_exporter-.*\\.linux-amd64\\.tar\\.gz$" + cache: + file_ttl: 0 + index_ttl: 0 + + hashicorp-releases: + base_url: "https://releases.hashicorp.com" + type: "remote" + package: "generic" + description: "HashiCorp product releases" + include_patterns: + - "terraform/.*terraform_.*_linux_amd64\\.zip$" + - "vault/.*vault_.*_linux_amd64\\.zip$" + - "consul/.*/consul_.*_linux_amd64\\.zip$" + cache: + file_ttl: 0 + index_ttl: 0 +``` + +### 3. Secret for Environment Variables +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: artifactapi-secret + namespace: artifact-storage +type: Opaque +stringData: + DBHOST: "postgres-service" + DBPORT: "5432" + DBUSER: "artifacts" + DBPASS: "artifacts123" + DBNAME: "artifacts" + REDIS_URL: "redis://redis-service:6379" + MINIO_ENDPOINT: "minio-service:9000" + MINIO_ACCESS_KEY: "minioadmin" + MINIO_SECRET_KEY: "minioadmin" + MINIO_BUCKET: "artifacts" + MINIO_SECURE: "false" +``` + +### 4. 
PostgreSQL Deployment +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres + namespace: artifact-storage +spec: + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + containers: + - name: postgres + image: postgres:15-alpine + env: + - name: POSTGRES_DB + value: artifacts + - name: POSTGRES_USER + value: artifacts + - name: POSTGRES_PASSWORD + value: artifacts123 + ports: + - containerPort: 5432 + volumeMounts: + - name: postgres-storage + mountPath: /var/lib/postgresql/data + livenessProbe: + exec: + command: ["pg_isready", "-U", "artifacts", "-d", "artifacts"] + initialDelaySeconds: 30 + periodSeconds: 30 + volumes: + - name: postgres-storage + persistentVolumeClaim: + claimName: postgres-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres-service + namespace: artifact-storage +spec: + selector: + app: postgres + ports: + - port: 5432 + targetPort: 5432 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: postgres-pvc + namespace: artifact-storage +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi +``` + +### 5. Redis Deployment +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis + namespace: artifact-storage +spec: + replicas: 1 + selector: + matchLabels: + app: redis + template: + metadata: + labels: + app: redis + spec: + containers: + - name: redis + image: redis:7-alpine + command: ["redis-server", "--save", "20", "1"] + ports: + - containerPort: 6379 + volumeMounts: + - name: redis-storage + mountPath: /data + livenessProbe: + exec: + command: ["redis-cli", "ping"] + initialDelaySeconds: 30 + periodSeconds: 30 + volumes: + - name: redis-storage + persistentVolumeClaim: + claimName: redis-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: redis-service + namespace: artifact-storage +spec: + selector: + app: redis + ports: + - port: 6379 + targetPort: 6379 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: redis-pvc + namespace: artifact-storage +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi +``` + +### 6. MinIO Deployment +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: minio + namespace: artifact-storage +spec: + replicas: 1 + selector: + matchLabels: + app: minio + template: + metadata: + labels: + app: minio + spec: + containers: + - name: minio + image: minio/minio:latest + command: ["minio", "server", "/data", "--console-address", ":9001"] + env: + - name: MINIO_ROOT_USER + value: minioadmin + - name: MINIO_ROOT_PASSWORD + value: minioadmin + ports: + - containerPort: 9000 + - containerPort: 9001 + volumeMounts: + - name: minio-storage + mountPath: /data + livenessProbe: + httpGet: + path: /minio/health/live + port: 9000 + initialDelaySeconds: 30 + periodSeconds: 30 + volumes: + - name: minio-storage + persistentVolumeClaim: + claimName: minio-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: minio-service + namespace: artifact-storage +spec: + selector: + app: minio + ports: + - name: api + port: 9000 + targetPort: 9000 + - name: console + port: 9001 + targetPort: 9001 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: minio-pvc + namespace: artifact-storage +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi +``` + +### 7. 
Artifact API Deployment +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: artifactapi + namespace: artifact-storage +spec: + replicas: 2 + selector: + matchLabels: + app: artifactapi + template: + metadata: + labels: + app: artifactapi + spec: + containers: + - name: artifactapi + image: artifactapi:latest + ports: + - containerPort: 8000 + envFrom: + - secretRef: + name: artifactapi-secret + volumeMounts: + - name: config-volume + mountPath: /app/remotes.yaml + subPath: remotes.yaml + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 5 + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + volumes: + - name: config-volume + configMap: + name: artifactapi-config +--- +apiVersion: v1 +kind: Service +metadata: + name: artifactapi-service + namespace: artifact-storage +spec: + selector: + app: artifactapi + ports: + - port: 8000 + targetPort: 8000 + type: ClusterIP +``` + +### 8. Ingress (Optional) +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: artifactapi-ingress + namespace: artifact-storage + annotations: + nginx.ingress.kubernetes.io/rewrite-target: / + nginx.ingress.kubernetes.io/proxy-body-size: "10g" + nginx.ingress.kubernetes.io/proxy-read-timeout: "600" +spec: + rules: + - host: artifacts.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: artifactapi-service + port: + number: 8000 +``` + +### Deployment Commands +```bash +# Create namespace +kubectl apply -f namespace.yaml + +# Deploy PostgreSQL, Redis, and MinIO +kubectl apply -f postgres.yaml +kubectl apply -f redis.yaml +kubectl apply -f minio.yaml + +# Wait for databases to be ready +kubectl wait --for=condition=ready pod -l app=postgres -n artifact-storage --timeout=300s +kubectl wait --for=condition=ready pod -l app=redis -n artifact-storage --timeout=300s +kubectl wait --for=condition=ready pod -l app=minio -n artifact-storage --timeout=300s + +# Deploy configuration and application +kubectl apply -f configmap.yaml +kubectl apply -f secret.yaml +kubectl apply -f artifactapi.yaml + +# Optional: Deploy ingress +kubectl apply -f ingress.yaml + +# Check deployment status +kubectl get pods -n artifact-storage +kubectl logs -f deployment/artifactapi -n artifact-storage +``` + +### Access the API +```bash +# Port-forward to access locally +kubectl port-forward service/artifactapi-service 8000:8000 -n artifact-storage + +# Test the API +curl http://localhost:8000/health +curl http://localhost:8000/ + +# Access artifacts +curl "http://localhost:8000/api/github/gruntwork-io/terragrunt/releases/download/v0.96.1/terragrunt_linux_amd64" +``` + +### Notes for Production +- Use proper secrets management (e.g., Vault, Sealed Secrets) +- Configure resource limits and requests appropriately +- Set up monitoring and alerting +- Use external managed databases for production workloads +- Configure backup strategies for persistent volumes +- Set up proper TLS certificates for ingress +- Consider using StatefulSets for databases with persistent storage \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..be2583a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,85 @@ +version: '3.8' + +services: + artifactapi: + build: + context: . 
+ dockerfile: Dockerfile + no_cache: true + ports: + - "8000:8000" + environment: + - DBHOST=postgres + - DBPORT=5432 + - DBUSER=artifacts + - DBPASS=artifacts123 + - DBNAME=artifacts + - REDIS_URL=redis://redis:6379 + - MINIO_ENDPOINT=minio:9000 + - MINIO_ACCESS_KEY=minioadmin + - MINIO_SECRET_KEY=minioadmin + - MINIO_BUCKET=artifacts + - MINIO_SECURE=false + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + minio: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + + minio: + image: minio/minio:latest + ports: + - "9000:9000" + - "9001:9001" + environment: + MINIO_ROOT_USER: minioadmin + MINIO_ROOT_PASSWORD: minioadmin + command: server /data --console-address ":9001" + volumes: + - minio_data:/data + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + redis: + image: redis:7-alpine + ports: + - "6379:6379" + volumes: + - redis_data:/data + command: redis-server --save 20 1 + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 30s + timeout: 10s + retries: 3 + + postgres: + image: postgres:15-alpine + ports: + - "5432:5432" + environment: + POSTGRES_DB: artifacts + POSTGRES_USER: artifacts + POSTGRES_PASSWORD: artifacts123 + volumes: + - postgres_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U artifacts -d artifacts"] + interval: 30s + timeout: 10s + retries: 3 + +volumes: + minio_data: + redis_data: + postgres_data: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0ce9b10 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,43 @@ +[project] +name = "artifactapi" +version = "2.0.0" +description = "Generic artifact caching system with support for various package managers" + +dependencies = [ + "fastapi>=0.104.0", + "uvicorn[standard]>=0.24.0", + "httpx>=0.25.0", + "redis>=5.0.0", + "boto3>=1.29.0", + "psycopg2-binary>=2.9.0", + "pyyaml>=6.0", + "lxml>=4.9.0", + "prometheus-client>=0.19.0", + "python-multipart>=0.0.6", +] +requires-python = ">=3.11" +readme = "README.md" +license = {text = "MIT"} + +[project.scripts] +artifactapi = "artifactapi.main:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["src/artifactapi"] + +[project.optional-dependencies] +dev = [ + "pytest>=7.4.0", + "pytest-asyncio>=0.21.0", + "black>=23.9.0", + "isort>=5.12.0", + "mypy>=1.6.0", + "ruff>=0.1.0", +] diff --git a/src/artifactapi/__init__.py b/src/artifactapi/__init__.py new file mode 100644 index 0000000..551d7c3 --- /dev/null +++ b/src/artifactapi/__init__.py @@ -0,0 +1 @@ +# Artifact API package diff --git a/src/artifactapi/cache.py b/src/artifactapi/cache.py new file mode 100644 index 0000000..6f99bfc --- /dev/null +++ b/src/artifactapi/cache.py @@ -0,0 +1,69 @@ +import time +import hashlib +import redis + + +class RedisCache: + def __init__(self, redis_url: str): + self.redis_url = redis_url + + try: + self.client = redis.from_url(self.redis_url, decode_responses=True) + # Test connection + self.client.ping() + self.available = True + except Exception as e: + print(f"Redis not available: {e}") + self.client = None + self.available = False + + def is_index_file(self, file_path: str) -> bool: + """Check if the file is an index file that should have TTL""" + return ( + 
file_path.endswith("APKINDEX.tar.gz") + or file_path.endswith("Packages.gz") + or file_path.endswith("repomd.xml") + or "repodata/" in file_path + and file_path.endswith((".xml", ".xml.gz", ".xml.bz2", ".xml.xz")) + ) + + def get_index_cache_key(self, remote_name: str, path: str) -> str: + """Generate cache key for index files""" + return f"index:{remote_name}:{hashlib.sha256(path.encode()).hexdigest()[:16]}" + + def is_index_valid( + self, remote_name: str, path: str, ttl_override: int = None + ) -> bool: + """Check if index file is still valid (not expired)""" + if not self.available: + return False + + try: + key = self.get_index_cache_key(remote_name, path) + return self.client.exists(key) > 0 + except Exception: + return False + + def mark_index_cached(self, remote_name: str, path: str, ttl: int = 300) -> None: + """Mark index file as cached with TTL""" + if not self.available: + return + + try: + key = self.get_index_cache_key(remote_name, path) + self.client.setex(key, ttl, str(int(time.time()))) + except Exception: + pass + + def cleanup_expired_index(self, storage, remote_name: str, path: str) -> None: + """Remove expired index from S3 storage""" + if not self.available: + return + + try: + # Get the S3 key and remove it + s3_key = storage.get_object_key_from_path(remote_name, path) + if storage.exists(s3_key): + storage.client.delete_object(Bucket=storage.bucket, Key=s3_key) + except Exception: + pass diff --git a/src/artifactapi/config.py b/src/artifactapi/config.py new file mode 100644 index 0000000..b316ab4 --- /dev/null +++ b/src/artifactapi/config.py @@ -0,0 +1,120 @@ +import os +import json +import yaml +from typing import Optional + + +class ConfigManager: + def __init__(self, config_file: str = "remotes.yaml"): + self.config_file = config_file + self._last_modified = 0 + self.config = self._load_config() + + def _load_config(self) -> dict: + try: + with open(self.config_file, "r") as f: + if self.config_file.endswith(".yaml") or self.config_file.endswith( + ".yml" + ): + return yaml.safe_load(f) + else: + return json.load(f) + except FileNotFoundError: + return {"remotes": {}} + + def _check_reload(self) -> None: + """Check if config file has been modified and reload if needed""" + try: + import os + + current_modified = os.path.getmtime(self.config_file) + if current_modified > self._last_modified: + self._last_modified = current_modified + self.config = self._load_config() + print(f"Config reloaded from {self.config_file}") + except OSError: + pass + + def get_remote_config(self, remote_name: str) -> Optional[dict]: + self._check_reload() + return self.config.get("remotes", {}).get(remote_name) + + def get_repository_patterns(self, remote_name: str, repo_path: str) -> list: + remote_config = self.get_remote_config(remote_name) + if not remote_config: + return [] + + repositories = remote_config.get("repositories", {}) + + # Handle both dict (GitHub style) and list (Alpine style) repositories + if isinstance(repositories, dict): + repo_config = repositories.get(repo_path) + if repo_config: + patterns = repo_config.get("include_patterns", []) + else: + patterns = remote_config.get("include_patterns", []) + elif isinstance(repositories, list): + # For Alpine, repositories is just a list of allowed repo names + # Pattern matching is handled by the main include_patterns + patterns = remote_config.get("include_patterns", []) + else: + patterns = remote_config.get("include_patterns", []) + + return patterns + + def get_s3_config(self) -> dict: + """Get S3 configuration from 
environment variables""" + endpoint = os.getenv("MINIO_ENDPOINT") + access_key = os.getenv("MINIO_ACCESS_KEY") + secret_key = os.getenv("MINIO_SECRET_KEY") + bucket = os.getenv("MINIO_BUCKET") + + if not endpoint: + raise ValueError("MINIO_ENDPOINT environment variable is required") + if not access_key: + raise ValueError("MINIO_ACCESS_KEY environment variable is required") + if not secret_key: + raise ValueError("MINIO_SECRET_KEY environment variable is required") + if not bucket: + raise ValueError("MINIO_BUCKET environment variable is required") + + return { + "endpoint": endpoint, + "access_key": access_key, + "secret_key": secret_key, + "bucket": bucket, + "secure": os.getenv("MINIO_SECURE", "false").lower() == "true", + } + + def get_redis_config(self) -> dict: + """Get Redis configuration from environment variables""" + redis_url = os.getenv("REDIS_URL") + if not redis_url: + raise ValueError("REDIS_URL environment variable is required") + + return { + "url": redis_url + } + + def get_database_config(self) -> dict: + """Get database configuration from environment variables""" + db_host = os.getenv("DBHOST") + db_port = os.getenv("DBPORT") + db_user = os.getenv("DBUSER") + db_pass = os.getenv("DBPASS") + db_name = os.getenv("DBNAME") + + if not all([db_host, db_port, db_user, db_pass, db_name]): + missing = [var for var, val in [("DBHOST", db_host), ("DBPORT", db_port), ("DBUSER", db_user), ("DBPASS", db_pass), ("DBNAME", db_name)] if not val] + raise ValueError(f"All database environment variables are required: {', '.join(missing)}") + + db_url = f"postgresql://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}" + return {"url": db_url} + + def get_cache_config(self, remote_name: str) -> dict: + """Get cache configuration for a specific remote""" + remote_config = self.get_remote_config(remote_name) + if not remote_config: + return {} + + return remote_config.get("cache", {}) diff --git a/src/artifactapi/database.py b/src/artifactapi/database.py new file mode 100644 index 0000000..499799b --- /dev/null +++ b/src/artifactapi/database.py @@ -0,0 +1,282 @@ +import os +from typing import Optional +import psycopg2 +from psycopg2.extras import RealDictCursor + + +class DatabaseManager: + def __init__(self, db_url: str): + self.db_url = db_url + self.available = False + self._init_database() + + def _init_database(self): + """Initialize database connection and create schema if needed""" + try: + self.connection = psycopg2.connect(self.db_url) + self.connection.autocommit = True + self._create_schema() + self.available = True + print("Database connection established") + except Exception as e: + print(f"Database not available: {e}") + self.available = False + + def _create_schema(self): + """Create tables if they don't exist""" + try: + with self.connection.cursor() as cursor: + # Create table to map S3 keys to remote names + cursor.execute(""" + CREATE TABLE IF NOT EXISTS artifact_mappings ( + id SERIAL PRIMARY KEY, + s3_key VARCHAR(255) UNIQUE NOT NULL, + remote_name VARCHAR(100) NOT NULL, + file_path TEXT NOT NULL, + size_bytes BIGINT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + cursor.execute(""" + CREATE TABLE IF NOT EXISTS local_files ( + id SERIAL PRIMARY KEY, + repository_name VARCHAR(100) NOT NULL, + file_path TEXT NOT NULL, + s3_key VARCHAR(255) UNIQUE NOT NULL, + size_bytes BIGINT NOT NULL, + sha256_sum VARCHAR(64) NOT NULL, + content_type VARCHAR(100), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + uploaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + 
UNIQUE(repository_name, file_path) + ) + """) + + # Create indexes separately + cursor.execute( + "CREATE INDEX IF NOT EXISTS idx_s3_key ON artifact_mappings (s3_key)" + ) + cursor.execute( + "CREATE INDEX IF NOT EXISTS idx_remote_name ON artifact_mappings (remote_name)" + ) + cursor.execute( + "CREATE INDEX IF NOT EXISTS idx_local_repo_path ON local_files (repository_name, file_path)" + ) + cursor.execute( + "CREATE INDEX IF NOT EXISTS idx_local_s3_key ON local_files (s3_key)" + ) + print("Database schema initialized") + except Exception as e: + print(f"Error creating schema: {e}") + + def record_artifact_mapping( + self, s3_key: str, remote_name: str, file_path: str, size_bytes: int + ): + """Record mapping between S3 key and remote""" + if not self.available: + return + + try: + with self.connection.cursor() as cursor: + cursor.execute( + """ + INSERT INTO artifact_mappings (s3_key, remote_name, file_path, size_bytes) + VALUES (%s, %s, %s, %s) + ON CONFLICT (s3_key) + DO UPDATE SET + remote_name = EXCLUDED.remote_name, + file_path = EXCLUDED.file_path, + size_bytes = EXCLUDED.size_bytes + """, + (s3_key, remote_name, file_path, size_bytes), + ) + except Exception as e: + print(f"Error recording artifact mapping: {e}") + + def get_storage_by_remote(self) -> dict[str, int]: + """Get storage size breakdown by remote from database""" + if not self.available: + return {} + + try: + with self.connection.cursor(cursor_factory=RealDictCursor) as cursor: + cursor.execute(""" + SELECT remote_name, SUM(size_bytes) as total_size + FROM artifact_mappings + GROUP BY remote_name + """) + results = cursor.fetchall() + return {row["remote_name"]: row["total_size"] or 0 for row in results} + except Exception as e: + print(f"Error getting storage by remote: {e}") + return {} + + def get_remote_for_s3_key(self, s3_key: str) -> Optional[str]: + """Get remote name for given S3 key""" + if not self.available: + return None + + try: + with self.connection.cursor() as cursor: + cursor.execute( + "SELECT remote_name FROM artifact_mappings WHERE s3_key = %s", + (s3_key,), + ) + result = cursor.fetchone() + return result[0] if result else None + except Exception as e: + print(f"Error getting remote for S3 key: {e}") + return None + + def add_local_file( + self, + repository_name: str, + file_path: str, + s3_key: str, + size_bytes: int, + sha256_sum: str, + content_type: str = None, + ): + """Add a file to local repository""" + if not self.available: + return False + + try: + with self.connection.cursor() as cursor: + cursor.execute( + """ + INSERT INTO local_files (repository_name, file_path, s3_key, size_bytes, sha256_sum, content_type) + VALUES (%s, %s, %s, %s, %s, %s) + """, + ( + repository_name, + file_path, + s3_key, + size_bytes, + sha256_sum, + content_type, + ), + ) + self.connection.commit() + return True + except Exception as e: + print(f"Error adding local file: {e}") + return False + + def get_local_file_metadata(self, repository_name: str, file_path: str): + """Get metadata for a local file""" + if not self.available: + return None + + try: + with self.connection.cursor() as cursor: + cursor.execute( + """ + SELECT repository_name, file_path, s3_key, size_bytes, sha256_sum, content_type, created_at, uploaded_at + FROM local_files + WHERE repository_name = %s AND file_path = %s + """, + (repository_name, file_path), + ) + result = cursor.fetchone() + if result: + return { + "repository_name": result[0], + "file_path": result[1], + "s3_key": result[2], + "size_bytes": result[3], + "sha256_sum": 
result[4], + "content_type": result[5], + "created_at": result[6], + "uploaded_at": result[7], + } + return None + except Exception as e: + print(f"Error getting local file metadata: {e}") + return None + + def list_local_files(self, repository_name: str, prefix: str = ""): + """List files in local repository with optional path prefix""" + if not self.available: + return [] + + try: + with self.connection.cursor() as cursor: + if prefix: + cursor.execute( + """ + SELECT file_path, size_bytes, sha256_sum, content_type, created_at, uploaded_at + FROM local_files + WHERE repository_name = %s AND file_path LIKE %s + ORDER BY file_path + """, + (repository_name, f"{prefix}%"), + ) + else: + cursor.execute( + """ + SELECT file_path, size_bytes, sha256_sum, content_type, created_at, uploaded_at + FROM local_files + WHERE repository_name = %s + ORDER BY file_path + """, + (repository_name,), + ) + + results = cursor.fetchall() + return [ + { + "file_path": result[0], + "size_bytes": result[1], + "sha256_sum": result[2], + "content_type": result[3], + "created_at": result[4], + "uploaded_at": result[5], + } + for result in results + ] + except Exception as e: + print(f"Error listing local files: {e}") + return [] + + def delete_local_file(self, repository_name: str, file_path: str): + """Delete a file from local repository""" + if not self.available: + return False + + try: + with self.connection.cursor() as cursor: + cursor.execute( + """ + DELETE FROM local_files + WHERE repository_name = %s AND file_path = %s + RETURNING s3_key + """, + (repository_name, file_path), + ) + result = cursor.fetchone() + self.connection.commit() + return result[0] if result else None + except Exception as e: + print(f"Error deleting local file: {e}") + return None + + def file_exists(self, repository_name: str, file_path: str): + """Check if file exists in local repository""" + if not self.available: + return False + + try: + with self.connection.cursor() as cursor: + cursor.execute( + """ + SELECT 1 FROM local_files + WHERE repository_name = %s AND file_path = %s + """, + (repository_name, file_path), + ) + return cursor.fetchone() is not None + except Exception as e: + print(f"Error checking file existence: {e}") + return False diff --git a/src/artifactapi/main.py b/src/artifactapi/main.py new file mode 100644 index 0000000..1e52aff --- /dev/null +++ b/src/artifactapi/main.py @@ -0,0 +1,580 @@ +import os +import re +import hashlib +from typing import Dict, Any, Optional +import httpx +from fastapi import FastAPI, HTTPException, Response, Query, File, UploadFile +from fastapi.responses import PlainTextResponse, JSONResponse +from pydantic import BaseModel +from prometheus_client import generate_latest, CONTENT_TYPE_LATEST + +from .config import ConfigManager +from .database import DatabaseManager +from .storage import S3Storage +from .cache import RedisCache +from .metrics import MetricsManager + + +class ArtifactRequest(BaseModel): + remote: str + include_pattern: str + + +app = FastAPI(title="Artifact Storage API", version="2.0.0") + +# Initialize components using config +config = ConfigManager("remotes.yaml") + +# Get configurations +s3_config = config.get_s3_config() +redis_config = config.get_redis_config() +db_config = config.get_database_config() + +# Initialize services +storage = S3Storage(**s3_config) +cache = RedisCache(redis_config["url"]) +database = DatabaseManager(db_config["url"]) +metrics = MetricsManager(cache, database) + + +@app.get("/") +def read_root(): + config._check_reload() + return { + 
"message": "Artifact Storage API", + "version": "2.0.0", + "remotes": list(config.config.get("remotes", {}).keys()), + } + + +@app.get("/health") +def health_check(): + return {"status": "healthy"} + + +async def construct_remote_url(remote_name: str, path: str) -> str: + remote_config = config.get_remote_config(remote_name) + if not remote_config: + raise HTTPException( + status_code=404, detail=f"Remote '{remote_name}' not configured" + ) + + base_url = remote_config.get("base_url") + if not base_url: + raise HTTPException( + status_code=500, detail=f"No base_url configured for remote '{remote_name}'" + ) + + return f"{base_url}/{path}" + + +async def check_artifact_patterns( + remote_name: str, repo_path: str, file_path: str, full_path: str +) -> bool: + # First check if this is an index file - always allow index files + if cache.is_index_file(file_path) or cache.is_index_file(full_path): + return True + + # Then check basic include patterns + patterns = config.get_repository_patterns(remote_name, repo_path) + if not patterns: + return True # Allow all if no patterns configured + + pattern_matched = False + for pattern in patterns: + # Check both file_path and full_path to handle different pattern types + if re.search(pattern, file_path) or re.search(pattern, full_path): + pattern_matched = True + break + + if not pattern_matched: + return False + + # All remotes now use pattern-based filtering only - no additional checks needed + return True + + +async def cache_single_artifact(url: str, remote_name: str, path: str) -> dict: + # Check if using URL-based key or path-based key + if url.startswith("http"): + key = storage.get_object_key(url) + else: + key = storage.get_object_key_from_path(remote_name, path) + + if storage.exists(key): + return { + "url": url, + "cached_url": storage.get_url(key), + "status": "already_cached", + } + + try: + async with httpx.AsyncClient(follow_redirects=True) as client: + response = await client.get(url) + response.raise_for_status() + + storage_path = storage.upload(key, response.content) + + return { + "url": url, + "cached_url": storage.get_url(key), + "storage_path": storage_path, + "size": len(response.content), + "status": "cached", + } + + except Exception as e: + return {"url": url, "status": "error", "error": str(e)} + + +@app.get("/api/v1/remote/{remote_name}/{path:path}") +async def get_artifact(remote_name: str, path: str): + # Check if remote is configured + remote_config = config.get_remote_config(remote_name) + if not remote_config: + raise HTTPException( + status_code=404, detail=f"Remote '{remote_name}' not configured" + ) + + # Check if this is a local repository + if remote_config.get("type") == "local": + # Handle local repository download + metadata = database.get_local_file_metadata(remote_name, path) + if not metadata: + raise HTTPException(status_code=404, detail="File not found") + + # Get file from S3 + content = storage.download_object(metadata["s3_key"]) + if content is None: + raise HTTPException(status_code=500, detail="File not accessible") + + # Determine content type + content_type = metadata.get("content_type", "application/octet-stream") + + return Response( + content=content, + media_type=content_type, + headers={ + "Content-Disposition": f"attachment; filename={os.path.basename(path)}" + }, + ) + + # Extract repository path for pattern checking + path_parts = path.split("/") + if len(path_parts) >= 2: + repo_path = f"{path_parts[0]}/{path_parts[1]}" + file_path = "/".join(path_parts[2:]) + else: + repo_path = path + 
file_path = path + + # Check if artifact matches configured patterns + if not await check_artifact_patterns(remote_name, repo_path, file_path, path): + raise HTTPException( + status_code=403, detail="Artifact not allowed by configuration patterns" + ) + + # Construct the remote URL + remote_url = await construct_remote_url(remote_name, path) + + # Check if artifact is already cached (try both URL and path-based keys) + url_key = storage.get_object_key(remote_url) + path_key = storage.get_object_key_from_path(remote_name, path) + + cached_key = None + if storage.exists(url_key): + cached_key = url_key + elif storage.exists(path_key): + cached_key = path_key + + # For index files, check Redis TTL validity + filename = os.path.basename(path) + is_index = cache.is_index_file(filename) + + if cached_key and is_index: + # Index file exists, but check if it's still valid + if not cache.is_index_valid(remote_name, path): + # Index has expired, remove it from S3 + cache.cleanup_expired_index(storage, remote_name, path) + cached_key = None # Force re-download + + if cached_key: + # Return cached artifact + try: + artifact_data = storage.download_object(cached_key) + filename = os.path.basename(path) + + # Determine content type based on file extension + content_type = "application/octet-stream" + if filename.endswith(".tar.gz"): + content_type = "application/gzip" + elif filename.endswith(".zip"): + content_type = "application/zip" + elif filename.endswith(".exe"): + content_type = "application/x-msdownload" + elif filename.endswith(".rpm"): + content_type = "application/x-rpm" + elif filename.endswith(".xml"): + content_type = "application/xml" + elif filename.endswith((".xml.gz", ".xml.bz2", ".xml.xz")): + content_type = "application/gzip" + + # Record cache hit metrics + metrics.record_cache_hit(remote_name, len(artifact_data)) + + # Record artifact mapping in database if not already recorded + database.record_artifact_mapping( + cached_key, remote_name, path, len(artifact_data) + ) + + return Response( + content=artifact_data, + media_type=content_type, + headers={ + "Content-Disposition": f"attachment; filename={filename}", + "X-Artifact-Source": "cache", + "X-Artifact-Size": str(len(artifact_data)), + }, + ) + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Error retrieving cached artifact: {str(e)}" + ) + + # Artifact not cached, cache it first + result = await cache_single_artifact(remote_url, remote_name, path) + + if result["status"] == "error": + raise HTTPException( + status_code=502, detail=f"Failed to fetch artifact: {result['error']}" + ) + + # Mark index files as cached in Redis if this was a new download + if result["status"] == "cached" and is_index: + # Get TTL from remote config + cache_config = config.get_cache_config(remote_name) + index_ttl = cache_config.get("index_ttl", 300) # Default 5 minutes + cache.mark_index_cached(remote_name, path, index_ttl) + + # Now return the cached artifact + try: + cache_key = storage.get_object_key(remote_url) + artifact_data = storage.download_object(cache_key) + filename = os.path.basename(path) + + content_type = "application/octet-stream" + if filename.endswith(".tar.gz"): + content_type = "application/gzip" + elif filename.endswith(".zip"): + content_type = "application/zip" + elif filename.endswith(".exe"): + content_type = "application/x-msdownload" + elif filename.endswith(".rpm"): + content_type = "application/x-rpm" + elif filename.endswith(".xml"): + content_type = "application/xml" + elif 
filename.endswith((".xml.gz", ".xml.bz2", ".xml.xz")): + content_type = "application/gzip" + + # Record cache miss metrics + metrics.record_cache_miss(remote_name, len(artifact_data)) + + # Record artifact mapping in database + cache_key = storage.get_object_key(remote_url) + database.record_artifact_mapping( + cache_key, remote_name, path, len(artifact_data) + ) + + return Response( + content=artifact_data, + media_type=content_type, + headers={ + "Content-Disposition": f"attachment; filename={filename}", + "X-Artifact-Source": "remote", + "X-Artifact-Size": str(len(artifact_data)), + }, + ) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error serving artifact: {str(e)}") + + +async def discover_artifacts(remote: str, include_pattern: str) -> list[str]: + if "github.com" in remote: + return await discover_github_releases(remote, include_pattern) + else: + raise HTTPException(status_code=400, detail=f"Unsupported remote: {remote}") + + +async def discover_github_releases(remote: str, include_pattern: str) -> list[str]: + match = re.match(r"github\.com/([^/]+)/([^/]+)", remote) + if not match: + raise HTTPException(status_code=400, detail="Invalid GitHub remote format") + + owner, repo = match.groups() + + async with httpx.AsyncClient(follow_redirects=True) as client: + response = await client.get( + f"https://api.github.com/repos/{owner}/{repo}/releases" + ) + + if response.status_code != 200: + raise HTTPException( + status_code=response.status_code, + detail=f"Failed to fetch releases: {response.text}", + ) + + releases = response.json() + + matching_urls = [] + pattern = include_pattern.replace("*", ".*") + regex = re.compile(pattern) + + for release in releases: + for asset in release.get("assets", []): + download_url = asset["browser_download_url"] + if regex.search(download_url): + matching_urls.append(download_url) + + return matching_urls + + +@app.put("/api/v1/remote/{remote_name}/{path:path}") +async def upload_file(remote_name: str, path: str, file: UploadFile = File(...)): + """Upload a file to local repository""" + # Check if remote is configured and is local + remote_config = config.get_remote_config(remote_name) + if not remote_config: + raise HTTPException( + status_code=404, detail=f"Remote '{remote_name}' not configured" + ) + + if remote_config.get("type") != "local": + raise HTTPException( + status_code=400, detail="Upload only supported for local repositories" + ) + + try: + # Read file content + content = await file.read() + + # Calculate SHA256 + sha256_sum = hashlib.sha256(content).hexdigest() + + # Check if file already exists (prevent overwrite) + if database.file_exists(remote_name, path): + raise HTTPException(status_code=409, detail="File already exists") + + # Generate S3 key + s3_key = f"local/{remote_name}/{path}" + + # Determine content type + content_type = file.content_type or "application/octet-stream" + + # Upload to S3 + try: + storage.upload(s3_key, content) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Upload failed: {e}") + + # Add to database + success = database.add_local_file( + repository_name=remote_name, + file_path=path, + s3_key=s3_key, + size_bytes=len(content), + sha256_sum=sha256_sum, + content_type=content_type, + ) + + if not success: + # Clean up S3 if database insert failed + storage.delete_object(s3_key) + raise HTTPException(status_code=500, detail="Failed to save file metadata") + + return JSONResponse( + { + "message": "File uploaded successfully", + "file_path": path, + 
"size_bytes": len(content), + "sha256_sum": sha256_sum, + "content_type": content_type, + } + ) + + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}") + + +@app.head("/api/v1/remote/{remote_name}/{path:path}") +def check_file_exists(remote_name: str, path: str): + """Check if file exists (for CI jobs) - supports local repositories only""" + # Check if remote is configured + remote_config = config.get_remote_config(remote_name) + if not remote_config: + raise HTTPException( + status_code=404, detail=f"Remote '{remote_name}' not configured" + ) + + # Handle local repository + if remote_config.get("type") == "local": + try: + metadata = database.get_local_file_metadata(remote_name, path) + if not metadata: + raise HTTPException(status_code=404, detail="File not found") + + return Response( + headers={ + "Content-Length": str(metadata["size_bytes"]), + "Content-Type": metadata.get( + "content_type", "application/octet-stream" + ), + "X-SHA256": metadata["sha256_sum"], + "X-Created-At": metadata["created_at"].isoformat() + if metadata["created_at"] + else "", + "X-Uploaded-At": metadata["uploaded_at"].isoformat() + if metadata["uploaded_at"] + else "", + } + ) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=f"Check failed: {str(e)}") + else: + # For remote repositories, just return 405 Method Not Allowed + raise HTTPException( + status_code=405, detail="HEAD method only supported for local repositories" + ) + + +@app.delete("/api/v1/remote/{remote_name}/{path:path}") +def delete_file(remote_name: str, path: str): + """Delete a file from local repository""" + # Check if remote is configured and is local + remote_config = config.get_remote_config(remote_name) + if not remote_config: + raise HTTPException( + status_code=404, detail=f"Remote '{remote_name}' not configured" + ) + + if remote_config.get("type") != "local": + raise HTTPException( + status_code=400, detail="Delete only supported for local repositories" + ) + + try: + # Get S3 key before deleting from database + s3_key = database.delete_local_file(remote_name, path) + if not s3_key: + raise HTTPException(status_code=404, detail="File not found") + + # Delete from S3 + if not storage.delete_object(s3_key): + # File was deleted from database but not from S3 - log warning but continue + print(f"Warning: Failed to delete S3 object {s3_key}") + + return JSONResponse({"message": "File deleted successfully"}) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=f"Delete failed: {str(e)}") + + +@app.post("/api/v1/artifacts/cache") +async def cache_artifact(request: ArtifactRequest) -> Dict[str, Any]: + try: + matching_urls = await discover_artifacts( + request.remote, request.include_pattern + ) + + if not matching_urls: + return { + "message": "No matching artifacts found", + "cached_count": 0, + "artifacts": [], + } + + cached_artifacts = [] + + for url in matching_urls: + result = await cache_single_artifact(url, "", "") + cached_artifacts.append(result) + + cached_count = sum( + 1 + for artifact in cached_artifacts + if artifact["status"] in ["cached", "already_cached"] + ) + + return { + "message": f"Processed {len(matching_urls)} artifacts, {cached_count} successfully cached", + "cached_count": cached_count, + "artifacts": cached_artifacts, + } + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + 
+@app.get("/api/v1/artifacts/{remote:path}") +async def list_cached_artifacts( + remote: str, include_pattern: str = ".*" +) -> Dict[str, Any]: + try: + matching_urls = await discover_artifacts(remote, include_pattern) + + cached_artifacts = [] + for url in matching_urls: + key = storage.get_object_key(url) + if storage.exists(key): + cached_artifacts.append( + {"url": url, "cached_url": storage.get_url(key), "key": key} + ) + + return { + "remote": remote, + "pattern": include_pattern, + "total_found": len(matching_urls), + "cached_count": len(cached_artifacts), + "artifacts": cached_artifacts, + } + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@app.get("/metrics") +def get_metrics( + json: Optional[bool] = Query( + False, description="Return JSON format instead of Prometheus" + ), +): + """Get comprehensive metrics about the artifact storage system""" + config._check_reload() + + if json: + # Return JSON format + return metrics.get_metrics(storage, config) + else: + # Return Prometheus format + metrics.get_metrics(storage, config) # Update gauges + prometheus_data = generate_latest().decode("utf-8") + return PlainTextResponse(prometheus_data, media_type=CONTENT_TYPE_LATEST) + + +@app.get("/config") +def get_config(): + return config.config + + +def main(): + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=8000) + + +if __name__ == "__main__": + main() diff --git a/src/artifactapi/metrics.py b/src/artifactapi/metrics.py new file mode 100644 index 0000000..769560d --- /dev/null +++ b/src/artifactapi/metrics.py @@ -0,0 +1,229 @@ +from datetime import datetime +from typing import Dict, Any +from prometheus_client import Counter, Gauge + + +# Prometheus metrics +request_counter = Counter( + "artifact_requests_total", "Total artifact requests", ["remote", "status"] +) +cache_hit_counter = Counter("artifact_cache_hits_total", "Total cache hits", ["remote"]) +cache_miss_counter = Counter( + "artifact_cache_misses_total", "Total cache misses", ["remote"] +) +bandwidth_saved_counter = Counter( + "artifact_bandwidth_saved_bytes_total", "Total bandwidth saved", ["remote"] +) +storage_size_gauge = Gauge( + "artifact_storage_size_bytes", "Storage size by remote", ["remote"] +) +redis_keys_gauge = Gauge("artifact_redis_keys_total", "Total Redis keys") + + +class MetricsManager: + def __init__(self, redis_client=None, database_manager=None): + self.redis_client = redis_client + self.database_manager = database_manager + self.start_time = datetime.now() + + def record_cache_hit(self, remote_name: str, size_bytes: int): + """Record a cache hit with size for bandwidth calculation""" + # Update Prometheus metrics + request_counter.labels(remote=remote_name, status="cache_hit").inc() + cache_hit_counter.labels(remote=remote_name).inc() + bandwidth_saved_counter.labels(remote=remote_name).inc(size_bytes) + + # Update Redis for persistence across instances + if self.redis_client and self.redis_client.available: + try: + # Increment global counters + self.redis_client.client.incr("metrics:cache_hits") + self.redis_client.client.incr("metrics:total_requests") + self.redis_client.client.incrby("metrics:bandwidth_saved", size_bytes) + + # Increment per-remote counters + self.redis_client.client.incr(f"metrics:cache_hits:{remote_name}") + self.redis_client.client.incr(f"metrics:total_requests:{remote_name}") + self.redis_client.client.incrby( + f"metrics:bandwidth_saved:{remote_name}", size_bytes + ) + except Exception: + pass + + def record_cache_miss(self, 
remote_name: str, size_bytes: int): + """Record a cache miss (new download)""" + # Update Prometheus metrics + request_counter.labels(remote=remote_name, status="cache_miss").inc() + cache_miss_counter.labels(remote=remote_name).inc() + + # Update Redis for persistence across instances + if self.redis_client and self.redis_client.available: + try: + # Increment global counters + self.redis_client.client.incr("metrics:cache_misses") + self.redis_client.client.incr("metrics:total_requests") + + # Increment per-remote counters + self.redis_client.client.incr(f"metrics:cache_misses:{remote_name}") + self.redis_client.client.incr(f"metrics:total_requests:{remote_name}") + except Exception: + pass + + def get_redis_key_count(self) -> int: + """Get total number of keys in Redis""" + if self.redis_client and self.redis_client.available: + try: + return self.redis_client.client.dbsize() + except Exception: + return 0 + return 0 + + def get_s3_total_size(self, storage) -> int: + """Get total size of all objects in S3 bucket""" + try: + total_size = 0 + paginator = storage.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=storage.bucket): + if "Contents" in page: + for obj in page["Contents"]: + total_size += obj["Size"] + return total_size + except Exception: + return 0 + + def get_s3_size_by_remote(self, storage, config_manager) -> Dict[str, int]: + """Get size of stored data per remote using database mappings""" + if self.database_manager and self.database_manager.available: + # Get from database if available + db_sizes = self.database_manager.get_storage_by_remote() + if db_sizes: + # Initialize all configured remotes to 0 + remote_sizes = {} + for remote in config_manager.config.get("remotes", {}).keys(): + remote_sizes[remote] = db_sizes.get(remote, 0) + + # Update Prometheus gauges + for remote, size in remote_sizes.items(): + storage_size_gauge.labels(remote=remote).set(size) + + return remote_sizes + + # Fallback to S3 scanning if database not available + try: + remote_sizes = {} + remotes = config_manager.config.get("remotes", {}).keys() + + # Initialize all remotes to 0 + for remote in remotes: + remote_sizes[remote] = 0 + + paginator = storage.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=storage.bucket): + if "Contents" in page: + for obj in page["Contents"]: + key = obj["Key"] + # Try to map from database first + remote = None + if self.database_manager: + remote = self.database_manager.get_remote_for_s3_key(key) + + # Fallback to key parsing + if not remote: + remote = key.split("/")[0] if "/" in key else "unknown" + + if remote in remote_sizes: + remote_sizes[remote] += obj["Size"] + else: + remote_sizes.setdefault("unknown", 0) + remote_sizes["unknown"] += obj["Size"] + + # Update Prometheus gauges + for remote, size in remote_sizes.items(): + if remote != "unknown": # Don't set gauge for unknown + storage_size_gauge.labels(remote=remote).set(size) + + return remote_sizes + except Exception: + return {} + + def get_metrics(self, storage, config_manager) -> Dict[str, Any]: + """Get comprehensive metrics""" + # Update Redis keys gauge + redis_key_count = self.get_redis_key_count() + redis_keys_gauge.set(redis_key_count) + + metrics = { + "timestamp": datetime.now().isoformat(), + "uptime_seconds": int((datetime.now() - self.start_time).total_seconds()), + "redis": {"total_keys": redis_key_count}, + "storage": { + "total_size_bytes": self.get_s3_total_size(storage), + "size_by_remote": self.get_s3_size_by_remote(storage, 
config_manager), + }, + "requests": { + "cache_hits": 0, + "cache_misses": 0, + "total_requests": 0, + "cache_hit_ratio": 0.0, + }, + "bandwidth": {"saved_bytes": 0}, + "per_remote": {}, + } + + if self.redis_client and self.redis_client.available: + try: + # Get global metrics + cache_hits = int( + self.redis_client.client.get("metrics:cache_hits") or 0 + ) + cache_misses = int( + self.redis_client.client.get("metrics:cache_misses") or 0 + ) + total_requests = cache_hits + cache_misses + bandwidth_saved = int( + self.redis_client.client.get("metrics:bandwidth_saved") or 0 + ) + + metrics["requests"]["cache_hits"] = cache_hits + metrics["requests"]["cache_misses"] = cache_misses + metrics["requests"]["total_requests"] = total_requests + metrics["requests"]["cache_hit_ratio"] = ( + cache_hits / total_requests if total_requests > 0 else 0.0 + ) + metrics["bandwidth"]["saved_bytes"] = bandwidth_saved + + # Get per-remote metrics + for remote in config_manager.config.get("remotes", {}).keys(): + remote_cache_hits = int( + self.redis_client.client.get(f"metrics:cache_hits:{remote}") + or 0 + ) + remote_cache_misses = int( + self.redis_client.client.get(f"metrics:cache_misses:{remote}") + or 0 + ) + remote_total = remote_cache_hits + remote_cache_misses + remote_bandwidth_saved = int( + self.redis_client.client.get( + f"metrics:bandwidth_saved:{remote}" + ) + or 0 + ) + + metrics["per_remote"][remote] = { + "cache_hits": remote_cache_hits, + "cache_misses": remote_cache_misses, + "total_requests": remote_total, + "cache_hit_ratio": remote_cache_hits / remote_total + if remote_total > 0 + else 0.0, + "bandwidth_saved_bytes": remote_bandwidth_saved, + "storage_size_bytes": metrics["storage"]["size_by_remote"].get( + remote, 0 + ), + } + + except Exception: + pass + + return metrics diff --git a/src/artifactapi/storage.py b/src/artifactapi/storage.py new file mode 100644 index 0000000..3c6da90 --- /dev/null +++ b/src/artifactapi/storage.py @@ -0,0 +1,96 @@ +import os +import hashlib +from urllib.parse import urlparse +import boto3 +from botocore.exceptions import ClientError +from fastapi import HTTPException + + +class S3Storage: + def __init__( + self, + endpoint: str, + access_key: str, + secret_key: str, + bucket: str, + secure: bool = False, + ): + self.endpoint = endpoint + self.access_key = access_key + self.secret_key = secret_key + self.bucket = bucket + self.secure = secure + + self.client = boto3.client( + "s3", + endpoint_url=f"http{'s' if self.secure else ''}://{self.endpoint}", + aws_access_key_id=self.access_key, + aws_secret_access_key=self.secret_key, + ) + + # Try to ensure bucket exists, but don't fail if MinIO isn't ready yet + try: + self._ensure_bucket_exists() + except Exception as e: + print(f"Warning: Could not ensure bucket exists during initialization: {e}") + print("Bucket creation will be attempted on first use") + + def _ensure_bucket_exists(self): + try: + self.client.head_bucket(Bucket=self.bucket) + except ClientError: + self.client.create_bucket(Bucket=self.bucket) + + def get_object_key(self, url: str) -> str: + url_hash = hashlib.sha256(url.encode()).hexdigest()[:16] + parsed = urlparse(url) + filename = os.path.basename(parsed.path) + return f"{parsed.netloc}/{url_hash}/{filename}" + + def get_object_key_from_path(self, remote_name: str, path: str) -> str: + # Create a key based on the API path for direct access + path_hash = hashlib.sha256(path.encode()).hexdigest()[:16] + filename = os.path.basename(path) + return f"{remote_name}/{path_hash}/{filename}" + + 
def exists(self, key: str) -> bool: + try: + self._ensure_bucket_exists() + self.client.head_object(Bucket=self.bucket, Key=key) + return True + except ClientError: + return False + + def upload(self, key: str, data: bytes) -> str: + self._ensure_bucket_exists() + self.client.put_object(Bucket=self.bucket, Key=key, Body=data) + return f"s3://{self.bucket}/{key}" + + def get_url(self, key: str) -> str: + return f"http://{self.endpoint}/{self.bucket}/{key}" + + def get_presigned_url(self, key: str, expiration: int = 3600) -> str: + try: + return self.client.generate_presigned_url( + "get_object", + Params={"Bucket": self.bucket, "Key": key}, + ExpiresIn=expiration, + ) + except Exception: + return self.get_url(key) + + def download_object(self, key: str) -> bytes: + try: + self._ensure_bucket_exists() + response = self.client.get_object(Bucket=self.bucket, Key=key) + return response["Body"].read() + except ClientError: + raise HTTPException(status_code=404, detail="Artifact not found") + + def delete_object(self, key: str) -> bool: + try: + self._ensure_bucket_exists() + self.client.delete_object(Bucket=self.bucket, Key=key) + return True + except ClientError: + return False
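+
+# Illustrative key layouts produced by get_object_key / get_object_key_from_path
+# above (assuming a GitHub release URL; the 16-character segment is a truncated
+# SHA-256 of the full URL or path):
+#   get_object_key("https://github.com/o/r/releases/download/v1/x.tar.gz")
+#       -> "github.com/<hash16>/x.tar.gz"
+#   get_object_key_from_path("github", "o/r/releases/download/v1/x.tar.gz")
+#       -> "github/<hash16>/x.tar.gz"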