diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-16 16:06:14 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-16 16:06:14 +0300 |
| commit | 4b4cde4fe3848c30e9f1cf1efc8cbc46fd50da83 (patch) | |
| tree | d93511d43226331605c095fa0993cc15b7169e8a | |
| parent | 107ccb68af18cf3f4bd04bc93bdde1d7c1169f93 (diff) | |
Grafana's SQLite-on-NFS persistence is unreliable across restarts (the
new pod can't reacquire a clean exclusive lock after any NFS bounce),
and with Loki + Tempo also gone there's nothing left for it to
visualize. Keeping Prometheus alone for metrics + alerting.
Changes:
- prometheus.yaml: add grafana.enabled=false in the kube-prometheus-stack
values so the subchart no longer renders the grafana deployment/pvc.
- loki.yaml, tempo.yaml, grafana-ingress.yaml: renamed to .disabled
(same pattern as commit 03a18c6) so 'kubectl apply -f argocd-apps/'
stops re-creating them; the cluster Applications were also deleted,
which cascade-removes the helm resources via the resources-finalizer.
- alloy.yaml: drop the discovery.*, loki.source/loki.write, and otelcol.*
  blocks (no destinations left to ship to). The DaemonSet stays deployed
  with a minimal 'logging' block so the chart can be re-enabled by
  restoring those blocks here.
Prometheus TSDB was also wiped (corrupted zero-byte WAL segments from
the same NFS blip that took Grafana down) — done separately, not part
of this commit.
| -rw-r--r-- | f3s/argocd-apps/monitoring/alloy.yaml | 99 | ||||
| -rw-r--r-- | f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled (renamed from f3s/argocd-apps/monitoring/grafana-ingress.yaml) | 0 | ||||
| -rw-r--r-- | f3s/argocd-apps/monitoring/loki.yaml.disabled (renamed from f3s/argocd-apps/monitoring/loki.yaml) | 0 | ||||
| -rw-r--r-- | f3s/argocd-apps/monitoring/prometheus.yaml | 6 | ||||
| -rw-r--r-- | f3s/argocd-apps/monitoring/tempo.yaml.disabled (renamed from f3s/argocd-apps/monitoring/tempo.yaml) | 0 |
5 files changed, 12 insertions, 93 deletions
diff --git a/f3s/argocd-apps/monitoring/alloy.yaml b/f3s/argocd-apps/monitoring/alloy.yaml index c5574b1..e2105e3 100644 --- a/f3s/argocd-apps/monitoring/alloy.yaml +++ b/f3s/argocd-apps/monitoring/alloy.yaml @@ -15,101 +15,14 @@ spec: releaseName: alloy valuesObject: alloy: - service: - ports: - otlp-grpc: - enabled: true - port: 4317 - targetPort: 4317 - protocol: TCP - otlp-http: - enabled: true - port: 4318 - targetPort: 4318 - protocol: TCP - + # Log shipping (to Loki) and trace forwarding (to Tempo) are + # disabled — Loki and Tempo apps are off. The DaemonSet stays + # deployed with a no-op config so the chart can be re-enabled + # by restoring the discovery/loki/otelcol blocks here. configMap: content: | - discovery.kubernetes "pods" { - role = "pod" - } - - discovery.relabel "pods" { - targets = discovery.kubernetes.pods.targets - - rule { - source_labels = ["__meta_kubernetes_namespace"] - target_label = "namespace" - } - - rule { - source_labels = ["__meta_kubernetes_pod_name"] - target_label = "pod" - } - - rule { - source_labels = ["__meta_kubernetes_pod_container_name"] - target_label = "container" - } - - rule { - source_labels = ["__meta_kubernetes_pod_label_app"] - target_label = "app" - } - } - - loki.source.kubernetes "pods" { - targets = discovery.relabel.pods.output - forward_to = [loki.write.default.receiver] - } - - loki.write "default" { - endpoint { - url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push" - } - } - - // ======================================== - // TRACES COLLECTION - // ======================================== - - // OTLP receiver for traces via gRPC and HTTP - otelcol.receiver.otlp "default" { - grpc { - endpoint = "0.0.0.0:4317" - } - - http { - endpoint = "0.0.0.0:4318" - } - - output { - traces = [otelcol.processor.batch.default.input] - } - } - - // Batch processor for efficient trace forwarding - otelcol.processor.batch "default" { - timeout = "5s" - send_batch_size = 100 - send_batch_max_size = 200 - 
- output { - traces = [otelcol.exporter.otlp.tempo.input] - } - } - - // OTLP exporter to send traces to Tempo - otelcol.exporter.otlp "tempo" { - client { - endpoint = "tempo.monitoring.svc.cluster.local:4317" - - tls { - insecure = true - } - - compression = "gzip" - } + logging { + level = "info" } destination: diff --git a/f3s/argocd-apps/monitoring/grafana-ingress.yaml b/f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled index 49b52de..49b52de 100644 --- a/f3s/argocd-apps/monitoring/grafana-ingress.yaml +++ b/f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled diff --git a/f3s/argocd-apps/monitoring/loki.yaml b/f3s/argocd-apps/monitoring/loki.yaml.disabled index c7985c2..c7985c2 100644 --- a/f3s/argocd-apps/monitoring/loki.yaml +++ b/f3s/argocd-apps/monitoring/loki.yaml.disabled diff --git a/f3s/argocd-apps/monitoring/prometheus.yaml b/f3s/argocd-apps/monitoring/prometheus.yaml index ecd3f5c..a63aab7 100644 --- a/f3s/argocd-apps/monitoring/prometheus.yaml +++ b/f3s/argocd-apps/monitoring/prometheus.yaml @@ -169,6 +169,12 @@ spec: equal: ['namespace', 'alertname'] grafana: + # Disabled: SQLite-on-NFS is unreliable across restarts (lock + # state cannot be reacquired cleanly), and Loki + Tempo are + # also disabled, so there's nothing to visualize. Prometheus + # alone is kept for metrics + alerting. + enabled: false + persistence: enabled: true type: pvc diff --git a/f3s/argocd-apps/monitoring/tempo.yaml b/f3s/argocd-apps/monitoring/tempo.yaml.disabled index 0fd6bc1..0fd6bc1 100644 --- a/f3s/argocd-apps/monitoring/tempo.yaml +++ b/f3s/argocd-apps/monitoring/tempo.yaml.disabled |
