diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-16 16:06:14 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-16 16:06:14 +0300 |
| commit | 4b4cde4fe3848c30e9f1cf1efc8cbc46fd50da83 (patch) | |
| tree | d93511d43226331605c095fa0993cc15b7169e8a | |
| parent | 107ccb68af18cf3f4bd04bc93bdde1d7c1169f93 (diff) | |
Grafana's SQLite-on-NFS persistence is unreliable across restarts (the
new pod can't reacquire a clean exclusive lock after any NFS bounce),
and with Loki + Tempo also gone there's nothing left for it to
visualize. Keeping Prometheus alone for metrics + alerting.
Changes:
- prometheus.yaml: add grafana.enabled=false in the kube-prometheus-stack
values so the subchart no longer renders the grafana deployment/pvc.
- loki.yaml, tempo.yaml, grafana-ingress.yaml: renamed to .disabled
(same pattern as commit 03a18c6) so 'kubectl apply -f argocd-apps/'
stops re-creating them; the cluster Applications were also deleted,
which cascade-removes the helm resources via the resources-finalizer.
- alloy.yaml: drop the discovery.*, loki.source/loki.write, and otelcol.*
  blocks (no destinations left to ship to). The DaemonSet stays deployed
  with a minimal 'logging' block so the chart can be re-enabled by
  restoring those blocks here.
Prometheus TSDB was also wiped (corrupted zero-byte WAL segments from
the same NFS blip that took Grafana down) — done separately, not part
of this commit.
| -rw-r--r-- | f3s/argocd-apps/monitoring/alloy.yaml | 99 | ||||
| -rw-r--r-- | f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled (renamed from f3s/argocd-apps/monitoring/grafana-ingress.yaml) | 0 | ||||
| -rw-r--r-- | f3s/argocd-apps/monitoring/loki.yaml.disabled (renamed from f3s/argocd-apps/monitoring/loki.yaml) | 0 | ||||
| -rw-r--r-- | f3s/argocd-apps/monitoring/prometheus.yaml | 6 | ||||
| -rw-r--r-- | f3s/argocd-apps/monitoring/tempo.yaml.disabled (renamed from f3s/argocd-apps/monitoring/tempo.yaml) | 0 |
5 files changed, 12 insertions, 93 deletions
diff --git a/f3s/argocd-apps/monitoring/alloy.yaml b/f3s/argocd-apps/monitoring/alloy.yaml index c5574b1..e2105e3 100644 --- a/f3s/argocd-apps/monitoring/alloy.yaml +++ b/f3s/argocd-apps/monitoring/alloy.yaml @@ -15,101 +15,14 @@ spec: releaseName: alloy valuesObject: alloy: - service: - ports: - otlp-grpc: - enabled: true - port: 4317 - targetPort: 4317 - protocol: TCP - otlp-http: - enabled: true - port: 4318 - targetPort: 4318 - protocol: TCP - + # Log shipping (to Loki) and trace forwarding (to Tempo) are + # disabled — Loki and Tempo apps are off. The DaemonSet stays + # deployed with a no-op config so the chart can be re-enabled + # by restoring the discovery/loki/otelcol blocks here. configMap: content: | - discovery.kubernetes "pods" { - role = "pod" - } - - discovery.relabel "pods" { - targets = discovery.kubernetes.pods.targets - - rule { - source_labels = ["__meta_kubernetes_namespace"] - target_label = "namespace" - } - - rule { - source_labels = ["__meta_kubernetes_pod_name"] - target_label = "pod" - } - - rule { - source_labels = ["__meta_kubernetes_pod_container_name"] - target_label = "container" - } - - rule { - source_labels = ["__meta_kubernetes_pod_label_app"] - target_label = "app" - } - } - - loki.source.kubernetes "pods" { - targets = discovery.relabel.pods.output - forward_to = [loki.write.default.receiver] - } - - loki.write "default" { - endpoint { - url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push" - } - } - - // ======================================== - // TRACES COLLECTION - // ======================================== - - // OTLP receiver for traces via gRPC and HTTP - otelcol.receiver.otlp "default" { - grpc { - endpoint = "0.0.0.0:4317" - } - - http { - endpoint = "0.0.0.0:4318" - } - - output { - traces = [otelcol.processor.batch.default.input] - } - } - - // Batch processor for efficient trace forwarding - otelcol.processor.batch "default" { - timeout = "5s" - send_batch_size = 100 - send_batch_max_size = 200 - 
- output { - traces = [otelcol.exporter.otlp.tempo.input] - } - } - - // OTLP exporter to send traces to Tempo - otelcol.exporter.otlp "tempo" { - client { - endpoint = "tempo.monitoring.svc.cluster.local:4317" - - tls { - insecure = true - } - - compression = "gzip" - } + logging { + level = "info" } destination: diff --git a/f3s/argocd-apps/monitoring/grafana-ingress.yaml b/f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled index 49b52de..49b52de 100644 --- a/f3s/argocd-apps/monitoring/grafana-ingress.yaml +++ b/f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled diff --git a/f3s/argocd-apps/monitoring/loki.yaml b/f3s/argocd-apps/monitoring/loki.yaml.disabled index c7985c2..c7985c2 100644 --- a/f3s/argocd-apps/monitoring/loki.yaml +++ b/f3s/argocd-apps/monitoring/loki.yaml.disabled diff --git a/f3s/argocd-apps/monitoring/prometheus.yaml b/f3s/argocd-apps/monitoring/prometheus.yaml index ecd3f5c..a63aab7 100644 --- a/f3s/argocd-apps/monitoring/prometheus.yaml +++ b/f3s/argocd-apps/monitoring/prometheus.yaml @@ -169,6 +169,12 @@ spec: equal: ['namespace', 'alertname'] grafana: + # Disabled: SQLite-on-NFS is unreliable across restarts (lock + # state cannot be reacquired cleanly), and Loki + Tempo are + # also disabled, so there's nothing to visualize. Prometheus + # alone is kept for metrics + alerting. + enabled: false + persistence: enabled: true type: pvc diff --git a/f3s/argocd-apps/monitoring/tempo.yaml b/f3s/argocd-apps/monitoring/tempo.yaml.disabled index 0fd6bc1..0fd6bc1 100644 --- a/f3s/argocd-apps/monitoring/tempo.yaml +++ b/f3s/argocd-apps/monitoring/tempo.yaml.disabled |
