From 4b4cde4fe3848c30e9f1cf1efc8cbc46fd50da83 Mon Sep 17 00:00:00 2001
From: Paul Buetow
Date: Sat, 16 May 2026 16:06:14 +0300
Subject: f3s/monitoring: disable grafana, loki, tempo; reduce alloy to no-op
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Grafana's SQLite-on-NFS persistence is unreliable across restarts (the
new pod can't reacquire a clean exclusive lock after any NFS bounce),
and with Loki + Tempo also gone there's nothing left for it to
visualize. Keeping Prometheus alone for metrics + alerting.

Changes:

- prometheus.yaml: add grafana.enabled=false in the
  kube-prometheus-stack values so the subchart no longer renders the
  grafana deployment/pvc.
- loki.yaml, tempo.yaml, grafana-ingress.yaml: renamed to .disabled
  (same pattern as commit 03a18c6) so 'kubectl apply -f argocd-apps/'
  stops re-creating them; the cluster Applications were also deleted,
  which cascade-removes the helm resources via the
  resources-finalizer.
- alloy.yaml: drop the discovery.*, loki.source.kubernetes, loki.write
  and otelcol.* blocks, plus the otlp-grpc/otlp-http service ports (no
  destinations to ship to). DaemonSet stays deployed with a minimal
  'logging' block so the chart can be re-enabled by restoring the
  blocks here.

Prometheus TSDB was also wiped (corrupted zero-byte WAL segments from
the same NFS blip that took grafana down) — done separately, not part
of this commit.
--- f3s/argocd-apps/monitoring/alloy.yaml | 99 ++-------------------- f3s/argocd-apps/monitoring/grafana-ingress.yaml | 28 ------ .../monitoring/grafana-ingress.yaml.disabled | 28 ++++++ f3s/argocd-apps/monitoring/loki.yaml | 86 ------------------- f3s/argocd-apps/monitoring/loki.yaml.disabled | 86 +++++++++++++++++++ f3s/argocd-apps/monitoring/prometheus.yaml | 6 ++ f3s/argocd-apps/monitoring/tempo.yaml | 97 --------------------- f3s/argocd-apps/monitoring/tempo.yaml.disabled | 97 +++++++++++++++++++++ 8 files changed, 223 insertions(+), 304 deletions(-) delete mode 100644 f3s/argocd-apps/monitoring/grafana-ingress.yaml create mode 100644 f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled delete mode 100644 f3s/argocd-apps/monitoring/loki.yaml create mode 100644 f3s/argocd-apps/monitoring/loki.yaml.disabled delete mode 100644 f3s/argocd-apps/monitoring/tempo.yaml create mode 100644 f3s/argocd-apps/monitoring/tempo.yaml.disabled diff --git a/f3s/argocd-apps/monitoring/alloy.yaml b/f3s/argocd-apps/monitoring/alloy.yaml index c5574b1..e2105e3 100644 --- a/f3s/argocd-apps/monitoring/alloy.yaml +++ b/f3s/argocd-apps/monitoring/alloy.yaml @@ -15,101 +15,14 @@ spec: releaseName: alloy valuesObject: alloy: - service: - ports: - otlp-grpc: - enabled: true - port: 4317 - targetPort: 4317 - protocol: TCP - otlp-http: - enabled: true - port: 4318 - targetPort: 4318 - protocol: TCP - + # Log shipping (to Loki) and trace forwarding (to Tempo) are + # disabled — Loki and Tempo apps are off. The DaemonSet stays + # deployed with a no-op config so the chart can be re-enabled + # by restoring the discovery/loki/otelcol blocks here. 
configMap: content: | - discovery.kubernetes "pods" { - role = "pod" - } - - discovery.relabel "pods" { - targets = discovery.kubernetes.pods.targets - - rule { - source_labels = ["__meta_kubernetes_namespace"] - target_label = "namespace" - } - - rule { - source_labels = ["__meta_kubernetes_pod_name"] - target_label = "pod" - } - - rule { - source_labels = ["__meta_kubernetes_pod_container_name"] - target_label = "container" - } - - rule { - source_labels = ["__meta_kubernetes_pod_label_app"] - target_label = "app" - } - } - - loki.source.kubernetes "pods" { - targets = discovery.relabel.pods.output - forward_to = [loki.write.default.receiver] - } - - loki.write "default" { - endpoint { - url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push" - } - } - - // ======================================== - // TRACES COLLECTION - // ======================================== - - // OTLP receiver for traces via gRPC and HTTP - otelcol.receiver.otlp "default" { - grpc { - endpoint = "0.0.0.0:4317" - } - - http { - endpoint = "0.0.0.0:4318" - } - - output { - traces = [otelcol.processor.batch.default.input] - } - } - - // Batch processor for efficient trace forwarding - otelcol.processor.batch "default" { - timeout = "5s" - send_batch_size = 100 - send_batch_max_size = 200 - - output { - traces = [otelcol.exporter.otlp.tempo.input] - } - } - - // OTLP exporter to send traces to Tempo - otelcol.exporter.otlp "tempo" { - client { - endpoint = "tempo.monitoring.svc.cluster.local:4317" - - tls { - insecure = true - } - - compression = "gzip" - } + logging { + level = "info" } destination: diff --git a/f3s/argocd-apps/monitoring/grafana-ingress.yaml b/f3s/argocd-apps/monitoring/grafana-ingress.yaml deleted file mode 100644 index 49b52de..0000000 --- a/f3s/argocd-apps/monitoring/grafana-ingress.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: grafana-ingress - namespace: cicd - finalizers: - - 
resources-finalizer.argocd.argoproj.io -spec: - project: default - source: - repoURL: http://git-server.cicd.svc.cluster.local/conf.git - targetRevision: master - path: f3s/prometheus/grafana-ingress - destination: - server: https://kubernetes.default.svc - namespace: monitoring - syncPolicy: - automated: - prune: true - selfHeal: true - syncOptions: - - CreateNamespace=false - retry: - limit: 3 - backoff: - duration: 5s - factor: 2 - maxDuration: 1m diff --git a/f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled b/f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled new file mode 100644 index 0000000..49b52de --- /dev/null +++ b/f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled @@ -0,0 +1,28 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: grafana-ingress + namespace: cicd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: http://git-server.cicd.svc.cluster.local/conf.git + targetRevision: master + path: f3s/prometheus/grafana-ingress + destination: + server: https://kubernetes.default.svc + namespace: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=false + retry: + limit: 3 + backoff: + duration: 5s + factor: 2 + maxDuration: 1m diff --git a/f3s/argocd-apps/monitoring/loki.yaml b/f3s/argocd-apps/monitoring/loki.yaml deleted file mode 100644 index c7985c2..0000000 --- a/f3s/argocd-apps/monitoring/loki.yaml +++ /dev/null @@ -1,86 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: loki - namespace: cicd - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - project: default - source: - repoURL: https://grafana.github.io/helm-charts - chart: loki - targetRevision: 6.6.3 - helm: - releaseName: loki - valuesObject: - deploymentMode: SingleBinary - - loki: - auth_enabled: false - commonConfig: - replication_factor: 1 - storage: - type: filesystem - schemaConfig: - configs: - - 
from: "2024-01-01" - store: tsdb - object_store: filesystem - schema: v13 - index: - prefix: index_ - period: 24h - - singleBinary: - replicas: 1 - extraVolumes: - - name: loki-data - persistentVolumeClaim: - claimName: loki-data-pvc - extraVolumeMounts: - - name: loki-data - mountPath: /var/loki - persistence: - enabled: false - - read: - replicas: 0 - - write: - replicas: 0 - - backend: - replicas: 0 - - gateway: - enabled: false - - chunksCache: - enabled: false - - resultsCache: - enabled: false - - lokiCanary: - enabled: false - - test: - enabled: false - - destination: - server: https://kubernetes.default.svc - namespace: monitoring - - syncPolicy: - automated: - prune: true - selfHeal: true - syncOptions: - - CreateNamespace=false - retry: - limit: 3 - backoff: - duration: 5s - factor: 2 - maxDuration: 1m diff --git a/f3s/argocd-apps/monitoring/loki.yaml.disabled b/f3s/argocd-apps/monitoring/loki.yaml.disabled new file mode 100644 index 0000000..c7985c2 --- /dev/null +++ b/f3s/argocd-apps/monitoring/loki.yaml.disabled @@ -0,0 +1,86 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: loki + namespace: cicd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://grafana.github.io/helm-charts + chart: loki + targetRevision: 6.6.3 + helm: + releaseName: loki + valuesObject: + deploymentMode: SingleBinary + + loki: + auth_enabled: false + commonConfig: + replication_factor: 1 + storage: + type: filesystem + schemaConfig: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + + singleBinary: + replicas: 1 + extraVolumes: + - name: loki-data + persistentVolumeClaim: + claimName: loki-data-pvc + extraVolumeMounts: + - name: loki-data + mountPath: /var/loki + persistence: + enabled: false + + read: + replicas: 0 + + write: + replicas: 0 + + backend: + replicas: 0 + + gateway: + enabled: false + + chunksCache: + 
enabled: false + + resultsCache: + enabled: false + + lokiCanary: + enabled: false + + test: + enabled: false + + destination: + server: https://kubernetes.default.svc + namespace: monitoring + + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=false + retry: + limit: 3 + backoff: + duration: 5s + factor: 2 + maxDuration: 1m diff --git a/f3s/argocd-apps/monitoring/prometheus.yaml b/f3s/argocd-apps/monitoring/prometheus.yaml index ecd3f5c..a63aab7 100644 --- a/f3s/argocd-apps/monitoring/prometheus.yaml +++ b/f3s/argocd-apps/monitoring/prometheus.yaml @@ -169,6 +169,12 @@ spec: equal: ['namespace', 'alertname'] grafana: + # Disabled: SQLite-on-NFS is unreliable across restarts (lock + # state cannot be reacquired cleanly), and Loki + Tempo are + # also disabled, so there's nothing to visualize. Prometheus + # alone is kept for metrics + alerting. + enabled: false + persistence: enabled: true type: pvc diff --git a/f3s/argocd-apps/monitoring/tempo.yaml b/f3s/argocd-apps/monitoring/tempo.yaml deleted file mode 100644 index 0fd6bc1..0000000 --- a/f3s/argocd-apps/monitoring/tempo.yaml +++ /dev/null @@ -1,97 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: tempo - namespace: cicd - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - project: default - source: - repoURL: https://grafana.github.io/helm-charts - chart: tempo - targetRevision: 1.24.1 - helm: - releaseName: tempo - valuesObject: - # Grafana Tempo - Monolithic Mode Configuration - tempo: - # Retention policy for traces (7 days) - retention: 168h - - # Storage configuration - Local filesystem backend - storage: - trace: - backend: local - local: - path: /var/tempo/traces - wal: - path: /var/tempo/wal - - # Distributor configuration with OTLP receivers - receivers: - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 - http: - endpoint: 0.0.0.0:4318 - - # Persistence configuration using hostPath PV - persistence: - enabled: true 
- size: 10Gi - storageClassName: "" # Empty string for manual PV binding - accessModes: - - ReadWriteOnce - - # Service configuration - service: - type: ClusterIP - - # Resource limits - resources: - limits: - cpu: 1000m - memory: 2Gi - requests: - cpu: 500m - memory: 1Gi - - # Security context - securityContext: - fsGroup: 10001 - runAsUser: 10001 - runAsGroup: 10001 - runAsNonRoot: true - - # Disable components not needed in monolithic mode - gateway: - enabled: false - - # Monitoring integration with Prometheus - serviceMonitor: - enabled: true - labels: - release: prometheus - - # Test pod disabled - test: - enabled: false - - destination: - server: https://kubernetes.default.svc - namespace: monitoring - - syncPolicy: - automated: - prune: true - selfHeal: true - syncOptions: - - CreateNamespace=false - retry: - limit: 3 - backoff: - duration: 5s - factor: 2 - maxDuration: 1m diff --git a/f3s/argocd-apps/monitoring/tempo.yaml.disabled b/f3s/argocd-apps/monitoring/tempo.yaml.disabled new file mode 100644 index 0000000..0fd6bc1 --- /dev/null +++ b/f3s/argocd-apps/monitoring/tempo.yaml.disabled @@ -0,0 +1,97 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: tempo + namespace: cicd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://grafana.github.io/helm-charts + chart: tempo + targetRevision: 1.24.1 + helm: + releaseName: tempo + valuesObject: + # Grafana Tempo - Monolithic Mode Configuration + tempo: + # Retention policy for traces (7 days) + retention: 168h + + # Storage configuration - Local filesystem backend + storage: + trace: + backend: local + local: + path: /var/tempo/traces + wal: + path: /var/tempo/wal + + # Distributor configuration with OTLP receivers + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + # Persistence configuration using hostPath PV + persistence: + enabled: true + size: 10Gi + storageClassName: "" # 
Empty string for manual PV binding + accessModes: + - ReadWriteOnce + + # Service configuration + service: + type: ClusterIP + + # Resource limits + resources: + limits: + cpu: 1000m + memory: 2Gi + requests: + cpu: 500m + memory: 1Gi + + # Security context + securityContext: + fsGroup: 10001 + runAsUser: 10001 + runAsGroup: 10001 + runAsNonRoot: true + + # Disable components not needed in monolithic mode + gateway: + enabled: false + + # Monitoring integration with Prometheus + serviceMonitor: + enabled: true + labels: + release: prometheus + + # Test pod disabled + test: + enabled: false + + destination: + server: https://kubernetes.default.svc + namespace: monitoring + + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=false + retry: + limit: 3 + backoff: + duration: 5s + factor: 2 + maxDuration: 1m -- cgit v1.2.3