summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Paul Buetow <paul@buetow.org> 2026-05-16 16:06:14 +0300
committer Paul Buetow <paul@buetow.org> 2026-05-16 16:06:14 +0300
commit 4b4cde4fe3848c30e9f1cf1efc8cbc46fd50da83 (patch)
tree d93511d43226331605c095fa0993cc15b7169e8a
parent 107ccb68af18cf3f4bd04bc93bdde1d7c1169f93 (diff)
f3s/monitoring: disable grafana, loki, tempo; reduce alloy to no-op (HEAD, master)
Grafana's SQLite-on-NFS persistence is unreliable across restarts (the new pod can't reacquire a clean exclusive lock after any NFS bounce), and with Loki + Tempo also gone there's nothing left for it to visualize. Keeping Prometheus alone for metrics + alerting.

Changes:
- prometheus.yaml: add grafana.enabled=false in the kube-prometheus-stack values so the subchart no longer renders the grafana deployment/pvc.
- loki.yaml, tempo.yaml, grafana-ingress.yaml: renamed to .disabled (same pattern as commit 03a18c6) so 'kubectl apply -f argocd-apps/' stops re-creating them; the cluster Applications were also deleted, which cascade-removes the helm resources via the resources-finalizer.
- alloy.yaml: drop the loki.write and otelcol.* blocks (no destinations to ship to). DaemonSet stays deployed with a minimal 'logging' block so the chart can be re-enabled by restoring the blocks here.

Prometheus TSDB was also wiped (corrupted zero-byte WAL segments from the same NFS blip that took grafana down) — done separately, not part of this commit.
-rw-r--r-- f3s/argocd-apps/monitoring/alloy.yaml | 99
-rw-r--r-- f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled (renamed from f3s/argocd-apps/monitoring/grafana-ingress.yaml) | 0
-rw-r--r-- f3s/argocd-apps/monitoring/loki.yaml.disabled (renamed from f3s/argocd-apps/monitoring/loki.yaml) | 0
-rw-r--r-- f3s/argocd-apps/monitoring/prometheus.yaml | 6
-rw-r--r-- f3s/argocd-apps/monitoring/tempo.yaml.disabled (renamed from f3s/argocd-apps/monitoring/tempo.yaml) | 0
5 files changed, 12 insertions, 93 deletions
diff --git a/f3s/argocd-apps/monitoring/alloy.yaml b/f3s/argocd-apps/monitoring/alloy.yaml
index c5574b1..e2105e3 100644
--- a/f3s/argocd-apps/monitoring/alloy.yaml
+++ b/f3s/argocd-apps/monitoring/alloy.yaml
@@ -15,101 +15,14 @@ spec:
releaseName: alloy
valuesObject:
alloy:
- service:
- ports:
- otlp-grpc:
- enabled: true
- port: 4317
- targetPort: 4317
- protocol: TCP
- otlp-http:
- enabled: true
- port: 4318
- targetPort: 4318
- protocol: TCP
-
+ # Log shipping (to Loki) and trace forwarding (to Tempo) are
+ # disabled — Loki and Tempo apps are off. The DaemonSet stays
+ # deployed with a no-op config so the chart can be re-enabled
+ # by restoring the discovery/loki/otelcol blocks here.
configMap:
content: |
- discovery.kubernetes "pods" {
- role = "pod"
- }
-
- discovery.relabel "pods" {
- targets = discovery.kubernetes.pods.targets
-
- rule {
- source_labels = ["__meta_kubernetes_namespace"]
- target_label = "namespace"
- }
-
- rule {
- source_labels = ["__meta_kubernetes_pod_name"]
- target_label = "pod"
- }
-
- rule {
- source_labels = ["__meta_kubernetes_pod_container_name"]
- target_label = "container"
- }
-
- rule {
- source_labels = ["__meta_kubernetes_pod_label_app"]
- target_label = "app"
- }
- }
-
- loki.source.kubernetes "pods" {
- targets = discovery.relabel.pods.output
- forward_to = [loki.write.default.receiver]
- }
-
- loki.write "default" {
- endpoint {
- url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
- }
- }
-
- // ========================================
- // TRACES COLLECTION
- // ========================================
-
- // OTLP receiver for traces via gRPC and HTTP
- otelcol.receiver.otlp "default" {
- grpc {
- endpoint = "0.0.0.0:4317"
- }
-
- http {
- endpoint = "0.0.0.0:4318"
- }
-
- output {
- traces = [otelcol.processor.batch.default.input]
- }
- }
-
- // Batch processor for efficient trace forwarding
- otelcol.processor.batch "default" {
- timeout = "5s"
- send_batch_size = 100
- send_batch_max_size = 200
-
- output {
- traces = [otelcol.exporter.otlp.tempo.input]
- }
- }
-
- // OTLP exporter to send traces to Tempo
- otelcol.exporter.otlp "tempo" {
- client {
- endpoint = "tempo.monitoring.svc.cluster.local:4317"
-
- tls {
- insecure = true
- }
-
- compression = "gzip"
- }
+ logging {
+ level = "info"
}
destination:
diff --git a/f3s/argocd-apps/monitoring/grafana-ingress.yaml b/f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled
index 49b52de..49b52de 100644
--- a/f3s/argocd-apps/monitoring/grafana-ingress.yaml
+++ b/f3s/argocd-apps/monitoring/grafana-ingress.yaml.disabled
diff --git a/f3s/argocd-apps/monitoring/loki.yaml b/f3s/argocd-apps/monitoring/loki.yaml.disabled
index c7985c2..c7985c2 100644
--- a/f3s/argocd-apps/monitoring/loki.yaml
+++ b/f3s/argocd-apps/monitoring/loki.yaml.disabled
diff --git a/f3s/argocd-apps/monitoring/prometheus.yaml b/f3s/argocd-apps/monitoring/prometheus.yaml
index ecd3f5c..a63aab7 100644
--- a/f3s/argocd-apps/monitoring/prometheus.yaml
+++ b/f3s/argocd-apps/monitoring/prometheus.yaml
@@ -169,6 +169,12 @@ spec:
equal: ['namespace', 'alertname']
grafana:
+ # Disabled: SQLite-on-NFS is unreliable across restarts (lock
+ # state cannot be reacquired cleanly), and Loki + Tempo are
+ # also disabled, so there's nothing to visualize. Prometheus
+ # alone is kept for metrics + alerting.
+ enabled: false
+
persistence:
enabled: true
type: pvc
diff --git a/f3s/argocd-apps/monitoring/tempo.yaml b/f3s/argocd-apps/monitoring/tempo.yaml.disabled
index 0fd6bc1..0fd6bc1 100644
--- a/f3s/argocd-apps/monitoring/tempo.yaml
+++ b/f3s/argocd-apps/monitoring/tempo.yaml.disabled