observability: migrate VictoriaMetrics from helm charts to operator CRDs
ci/woodpecker/pr/pre-commit Pipeline was successful
ci/woodpecker/pr/kubeconform Pipeline was successful

The k8s au-syd1 VictoriaMetrics stack ran as two helm charts
(victoria-metrics-cluster + victoria-metrics-agent) and only scraped
in-cluster targets. The victoria-metrics-operator already runs in
vm-system, so move the stack onto operator-managed CRDs. This lets the
VMAgent consume VMServiceScrape/VMPodScrape (auto-converted from
Prometheus ServiceMonitors) and adds Consul service discovery so the
cluster scrapes the same puppet-prod targets as the puppet vmagent.

Changes:
- Add VMCluster `main`: vmstorage 2 replicas (down from 3, replicationFactor
  2, cephrbd-fast-delete 200Gi, 180d retention), vminsert/vmselect 2 replicas
  + HPA (2-10, 60% cpu).
- Add VMAgent `main`: keeps the kubernetes SD jobs (apiservers/nodes/cadvisor),
  selectAllByDefault for VMServiceScrape/VMPodScrape, and a Consul SD job
  against consul.service.consul (puppet Consul) replicating the puppet vmagent
  relabels (keep tag `metrics`, scheme from `metrics_scheme`, job from
  `metrics_job`). TLS verified against the reflected vault-ca-cert (no
  insecure skip-verify).
- Expose vmselect/vminsert/vmagent via Gateway API (traefik-internal Gateway +
  HTTPRoute, http->https redirect), same hostnames as before.
- Remove the two helm charts, their values files, and vendored charts.
This commit is contained in:
2026-07-05 22:09:59 +10:00
parent 53b55419a7
commit 41ab3ff614
8 changed files with 523 additions and 301 deletions
+117
View File
@@ -0,0 +1,117 @@
---
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
name: vmselect
namespace: observability
labels:
app.kubernetes.io/name: vmselect
app.kubernetes.io/instance: victoria-metrics
traefik.io/instance: internal
annotations:
cert-manager.io/cluster-issuer: vault-issuer
cert-manager.io/common-name: vmselect.k8s.syd1.au.unkin.net
cert-manager.io/private-key-size: "4096"
external-dns.alpha.kubernetes.io/hostname: vmselect.k8s.syd1.au.unkin.net
external-dns.alpha.kubernetes.io/target: 198.18.200.4
spec:
gatewayClassName: traefik-internal
listeners:
- name: http
port: 80
protocol: HTTP
hostname: vmselect.k8s.syd1.au.unkin.net
allowedRoutes:
namespaces:
from: Same
- name: https
port: 443
protocol: HTTPS
hostname: vmselect.k8s.syd1.au.unkin.net
allowedRoutes:
namespaces:
from: Same
tls:
mode: Terminate
certificateRefs:
- group: ""
kind: Secret
name: vmselect-tls
---
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
name: vminsert
namespace: observability
labels:
app.kubernetes.io/name: vminsert
app.kubernetes.io/instance: victoria-metrics
traefik.io/instance: internal
annotations:
cert-manager.io/cluster-issuer: vault-issuer
cert-manager.io/common-name: vminsert.k8s.syd1.au.unkin.net
cert-manager.io/private-key-size: "4096"
external-dns.alpha.kubernetes.io/hostname: vminsert.k8s.syd1.au.unkin.net
external-dns.alpha.kubernetes.io/target: 198.18.200.4
spec:
gatewayClassName: traefik-internal
listeners:
- name: http
port: 80
protocol: HTTP
hostname: vminsert.k8s.syd1.au.unkin.net
allowedRoutes:
namespaces:
from: Same
- name: https
port: 443
protocol: HTTPS
hostname: vminsert.k8s.syd1.au.unkin.net
allowedRoutes:
namespaces:
from: Same
tls:
mode: Terminate
certificateRefs:
- group: ""
kind: Secret
name: vminsert-tls
---
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
name: vmagent
namespace: observability
labels:
app.kubernetes.io/name: vmagent
app.kubernetes.io/instance: victoria-metrics
traefik.io/instance: internal
annotations:
cert-manager.io/cluster-issuer: vault-issuer
cert-manager.io/common-name: vmagent.k8s.syd1.au.unkin.net
cert-manager.io/private-key-size: "4096"
external-dns.alpha.kubernetes.io/hostname: vmagent.k8s.syd1.au.unkin.net
external-dns.alpha.kubernetes.io/target: 198.18.200.4
spec:
gatewayClassName: traefik-internal
listeners:
- name: http
port: 80
protocol: HTTP
hostname: vmagent.k8s.syd1.au.unkin.net
allowedRoutes:
namespaces:
from: Same
- name: https
port: 443
protocol: HTTPS
hostname: vmagent.k8s.syd1.au.unkin.net
allowedRoutes:
namespaces:
from: Same
tls:
mode: Terminate
certificateRefs:
- group: ""
kind: Secret
name: vmagent-tls
+165
View File
@@ -0,0 +1,165 @@
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: vmselect-http-redirect
namespace: observability
labels:
app.kubernetes.io/name: vmselect
app.kubernetes.io/instance: victoria-metrics
spec:
hostnames:
- vmselect.k8s.syd1.au.unkin.net
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: vmselect
sectionName: http
rules:
- filters:
- type: RequestRedirect
requestRedirect:
scheme: https
statusCode: 301
matches:
- path:
type: PathPrefix
value: /
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: vmselect
namespace: observability
labels:
app.kubernetes.io/name: vmselect
app.kubernetes.io/instance: victoria-metrics
spec:
hostnames:
- vmselect.k8s.syd1.au.unkin.net
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: vmselect
sectionName: https
rules:
- backendRefs:
- group: ""
kind: Service
name: vmselect-main
port: 8481
weight: 1
matches:
- path:
type: PathPrefix
value: /
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: vminsert-http-redirect
namespace: observability
labels:
app.kubernetes.io/name: vminsert
app.kubernetes.io/instance: victoria-metrics
spec:
hostnames:
- vminsert.k8s.syd1.au.unkin.net
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: vminsert
sectionName: http
rules:
- filters:
- type: RequestRedirect
requestRedirect:
scheme: https
statusCode: 301
matches:
- path:
type: PathPrefix
value: /
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: vminsert
namespace: observability
labels:
app.kubernetes.io/name: vminsert
app.kubernetes.io/instance: victoria-metrics
spec:
hostnames:
- vminsert.k8s.syd1.au.unkin.net
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: vminsert
sectionName: https
rules:
- backendRefs:
- group: ""
kind: Service
name: vminsert-main
port: 8480
weight: 1
matches:
- path:
type: PathPrefix
value: /
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: vmagent-http-redirect
namespace: observability
labels:
app.kubernetes.io/name: vmagent
app.kubernetes.io/instance: victoria-metrics
spec:
hostnames:
- vmagent.k8s.syd1.au.unkin.net
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: vmagent
sectionName: http
rules:
- filters:
- type: RequestRedirect
requestRedirect:
scheme: https
statusCode: 301
matches:
- path:
type: PathPrefix
value: /
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: vmagent
namespace: observability
labels:
app.kubernetes.io/name: vmagent
app.kubernetes.io/instance: victoria-metrics
spec:
hostnames:
- vmagent.k8s.syd1.au.unkin.net
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: vmagent
sectionName: https
rules:
- backendRefs:
- group: ""
kind: Service
name: vmagent-main
port: 8429
weight: 1
matches:
- path:
type: PathPrefix
value: /
@@ -4,3 +4,7 @@ kind: Kustomization
resources: resources:
- namespace.yaml - namespace.yaml
- vmcluster.yaml
- vmagent.yaml
- gateway.yaml
- httproute.yaml
+122
View File
@@ -0,0 +1,122 @@
---
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMAgent
metadata:
name: main
namespace: observability
spec:
replicaCount: 2
scrapeInterval: 15s
# Also consume VMServiceScrape / VMPodScrape / VMNodeScrape from every namespace
# (the operator auto-converts Prometheus ServiceMonitors -> VMServiceScrape).
selectAllByDefault: true
extraArgs:
loggerFormat: json
remoteWrite:
- url: http://vminsert-main.observability.svc.cluster.local:8480/insert/0/prometheus/
resources:
requests:
cpu: 500m
memory: 512Mi
limits:
cpu: "1"
memory: 2Gi
# Reflected Vault intermediate CA (unkin.net) for verifying puppet Consul + metrics targets.
volumes:
- name: vault-ca
secret:
secretName: vault-ca-cert
volumeMounts:
- name: vault-ca
mountPath: /etc/vmagent-tls
readOnly: true
inlineScrapeConfig: |
- job_name: vmagent
static_configs:
- targets: ["localhost:8429"]
- job_name: "kubernetes-apiservers"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
action: keep
regex: default;kubernetes;https
- job_name: "kubernetes-nodes"
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- job_name: "kubernetes-nodes-cadvisor"
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
metrics_path: /metrics/cadvisor
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- source_labels: [__metrics_path__]
target_label: metrics_path
metric_relabel_configs:
- action: replace
source_labels: [pod]
regex: '(.+)'
target_label: pod_name
replacement: '${1}'
- action: replace
source_labels: [container]
regex: '(.+)'
target_label: container_name
replacement: '${1}'
- action: replace
target_label: name
replacement: k8s_stub
- action: replace
source_labels: [id]
regex: '^/system\.slice/(.+)\.service$'
target_label: systemd_service_name
replacement: '${1}'
# puppet-prod Consul service discovery (same targets as the puppet vmagent).
# consul.service.consul resolves to the puppet Consul from in-cluster pods.
- job_name: consul
consul_sd_configs:
- server: consul.service.consul:443
scheme: https
tls_config:
ca_file: /etc/vmagent-tls/ca.crt
relabel_configs:
- source_labels: [__meta_consul_tagpresent_metrics]
regex: "true"
action: keep
- source_labels: [__meta_consul_node, __meta_consul_service_port]
separator: ":"
target_label: __address__
replacement: "${1}:${2}"
action: replace
- source_labels: [__meta_consul_tag_metrics_scheme]
target_label: __scheme__
action: replace
- target_label: __metrics_path__
replacement: /metrics
- source_labels: [__meta_consul_tag_metrics_job]
target_label: job
action: replace
tls_config:
ca_file: /etc/vmagent-tls/ca.crt
+115
View File
@@ -0,0 +1,115 @@
---
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMCluster
metadata:
name: main
namespace: observability
spec:
retentionPeriod: "180d"
replicationFactor: 2
vmstorage:
replicaCount: 2
extraArgs:
dedup.minScrapeInterval: 15s
loggerFormat: json
storage:
volumeClaimTemplate:
spec:
storageClassName: cephrbd-fast-delete
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 200Gi
resources:
requests:
cpu: "1"
memory: 2Gi
limits:
cpu: "2"
memory: 8Gi
vmselect:
replicaCount: 2
extraArgs:
dedup.minScrapeInterval: 15s
loggerFormat: json
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 1024Mi
hpa:
minReplicas: 2
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 60
behavior:
scaleUp:
stabilizationWindowSeconds: 0
selectPolicy: Max
policies:
- type: Percent
value: 100
periodSeconds: 30
- type: Pods
value: 4
periodSeconds: 30
scaleDown:
stabilizationWindowSeconds: 300
selectPolicy: Min
policies:
- type: Percent
value: 10
periodSeconds: 60
- type: Pods
value: 2
periodSeconds: 60
vminsert:
replicaCount: 2
extraArgs:
loggerFormat: json
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 1024Mi
hpa:
minReplicas: 2
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 60
behavior:
scaleUp:
stabilizationWindowSeconds: 0
selectPolicy: Max
policies:
- type: Percent
value: 100
periodSeconds: 30
- type: Pods
value: 4
periodSeconds: 30
scaleDown:
stabilizationWindowSeconds: 300
selectPolicy: Min
policies:
- type: Percent
value: 10
periodSeconds: 60
- type: Pods
value: 2
periodSeconds: 60
@@ -6,17 +6,3 @@ namespace: observability
resources: resources:
- ../../../base/observability - ../../../base/observability
helmCharts:
- name: victoria-metrics-cluster
repo: https://artifactapi.k8s.syd1.au.unkin.net/api/v1/virtual/helm
version: "0.33.0"
releaseName: victoria-metrics-cluster
namespace: observability
valuesFile: values-vmcluster.yaml
- name: victoria-metrics-agent
repo: https://artifactapi.k8s.syd1.au.unkin.net/api/v1/virtual/helm
version: "0.30.0"
releaseName: victoria-metrics-agent
namespace: observability
valuesFile: values-vmagent.yaml
@@ -1,102 +0,0 @@
image:
repository: victoriametrics/vmagent
pullPolicy: IfNotPresent
global:
scrape_interval: 15s
podDisruptionBudget:
enabled: true
maxUnavailable: 1
podAnnotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8481"
replicaCount: 3
extraArgs:
envflag.enable: true
envflag.prefix: VM_
loggerFormat: json
httpListenAddr: :8429
service:
enabled: true
ingress:
enabled: true
annotations:
cert-manager.io/cluster-issuer: vault-issuer
cert-manager.io/common-name: vmagent.k8s.syd1.au.unkin.net
cert-manager.io/private-key-size: "4096"
external-dns.alpha.kubernetes.io/hostname: vmagent.k8s.syd1.au.unkin.net
external-dns.alpha.kubernetes.io/target: 198.18.200.0
hosts:
- name: vmagent.k8s.syd1.au.unkin.net
path:
- /
port: http
tls:
- hosts:
- vmagent.k8s.syd1.au.unkin.net
secretName: vmagent-tls
ingressClassName: nginx
remoteWrite:
- url: http://victoria-metrics-cluster-vminsert.observability.svc.cluster.local:8480/insert/0/prometheus/
scrape_configs:
- job_name: vmagent
static_configs:
- targets: ["localhost:8429"]
- job_name: "kubernetes-apiservers"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
action: keep
regex: default;kubernetes;https
- job_name: "kubernetes-nodes"
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- job_name: "kubernetes-nodes-cadvisor"
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
metrics_path: /metrics/cadvisor
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- source_labels: [__metrics_path__]
target_label: metrics_path
metric_relabel_configs:
- action: replace
source_labels: [pod]
regex: '(.+)'
target_label: pod_name
replacement: '${1}'
- action: replace
source_labels: [container]
regex: '(.+)'
target_label: container_name
replacement: '${1}'
- action: replace
target_label: name
replacement: k8s_stub
- action: replace
source_labels: [id]
regex: '^/system\.slice/(.+)\.service$'
target_label: systemd_service_name
replacement: '${1}'
@@ -1,185 +0,0 @@
vmselect:
enabled: true
image:
repository: victoriametrics/vmselect
pullPolicy: IfNotPresent
variant: cluster
extraArgs:
envflag.enable: true
envflag.prefix: VM_
loggerFormat: json
httpListenAddr: :8481
dedup.minScrapeInterval: 15s
replicationFactor: 2
resources:
limits:
cpu: 500m
memory: 1024Mi
requests:
cpu: 50m
memory: 128Mi
horizontalPodAutoscaler:
enabled: true
maxReplicas: 10
minReplicas: 2
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 60
behavior:
scaleUp:
stabilizationWindowSeconds: 0
selectPolicy: Max
policies:
- type: Percent
value: 100
periodSeconds: 30
- type: Pods
value: 4
periodSeconds: 30
scaleDown:
stabilizationWindowSeconds: 300
selectPolicy: Min
policies:
- type: Percent
value: 10
periodSeconds: 60
- type: Pods
value: 2
periodSeconds: 60
podAnnotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8481"
podDisruptionBudget:
enabled: true
maxUnavailable: 1
replicaCount: 2
ingress:
enabled: true
annotations:
cert-manager.io/cluster-issuer: vault-issuer
cert-manager.io/common-name: vmselect.k8s.syd1.au.unkin.net
cert-manager.io/private-key-size: "4096"
external-dns.alpha.kubernetes.io/hostname: vmselect.k8s.syd1.au.unkin.net
external-dns.alpha.kubernetes.io/target: 198.18.200.0
hosts:
- name: vmselect.k8s.syd1.au.unkin.net
path:
- /
port: http
tls:
- hosts:
- vmselect.k8s.syd1.au.unkin.net
secretName: vmselect-tls
ingressClassName: nginx
vminsert:
enabled: true
image:
repository: victoriametrics/vminsert
variant: cluster
pullPolicy: IfNotPresent
extraArgs:
envflag.enable: true
envflag.prefix: VM_
loggerFormat: json
httpListenAddr: :8480
replicationFactor: 2
resources:
limits:
cpu: 500m
memory: 1024Mi
requests:
cpu: 50m
memory: 128Mi
horizontalPodAutoscaler:
enabled: true
maxReplicas: 10
minReplicas: 2
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 60
behavior:
scaleUp:
stabilizationWindowSeconds: 0
selectPolicy: Max
policies:
- type: Percent
value: 100
periodSeconds: 30
- type: Pods
value: 4
periodSeconds: 30
scaleDown:
stabilizationWindowSeconds: 300
selectPolicy: Min
policies:
- type: Percent
value: 10
periodSeconds: 60
- type: Pods
value: 2
periodSeconds: 60
podAnnotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8480"
podDisruptionBudget:
enabled: true
maxUnavailable: 1
replicaCount: 2
ingress:
enabled: true
annotations:
cert-manager.io/cluster-issuer: vault-issuer
cert-manager.io/common-name: vminsert.k8s.syd1.au.unkin.net
cert-manager.io/private-key-size: "4096"
external-dns.alpha.kubernetes.io/hostname: vminsert.k8s.syd1.au.unkin.net
external-dns.alpha.kubernetes.io/target: 198.18.200.0
nginx.ingress.kubernetes.io/proxy-body-size: "100m"
hosts:
- name: vminsert.k8s.syd1.au.unkin.net
path:
- /
port: http
tls:
- hosts:
- vminsert.k8s.syd1.au.unkin.net
secretName: vminsert-tls
ingressClassName: nginx
vmstorage:
enabled: true
image:
repository: victoriametrics/vmstorage
variant: cluster
pullPolicy: IfNotPresent
retentionPeriod: 180d
extraArgs:
envflag.enable: true
envflag.prefix: VM_
loggerFormat: json
httpListenAddr: :8482
dedup.minScrapeInterval: 15s
podAnnotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8482"
podDisruptionBudget:
enabled: true
maxUnavailable: 1
persistentVolume:
enabled: true
name: vmstorage-volume
accessModes:
- ReadWriteOnce
storageClassName: cephrbd-fast-delete
mountPath: /storage
size: 200Gi
replicaCount: 3
podManagementPolicy: OrderedReady