diff options
| author | Paul Buetow <paul@buetow.org> | 2026-02-14 13:54:54 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-02-14 13:54:54 +0200 |
| commit | 3a6e01c1abd4a68810f1d85c9aa75293af47f579 (patch) | |
| tree | 2e3c066392cf2a292e89c90f259d039ce0afcb9b | |
| parent | f3ea9a7a1f466b6109271c76eb58189d2a799998 (diff) | |
docs: restructure documentation and move scripts to scripts/
- Add docs/ hierarchy: guides, backends, operations, reference, design
- Slim root README; add documentation index and links to docs/
- Add missing docs: csv-format-flexibility, dns-resolution, dtail-metrics-example, magefile
- Document Prometheus/VictoriaMetrics and ClickHouse backends
- Move all helper shell scripts to scripts/; update Magefile and doc references
- Add ASCII diagrams for watch mode (CSV watcher), auto mode, and ingestion paths
- Add .gitignore
Co-authored-by: Cursor <cursoragent@cursor.com>
33 files changed, 1715 insertions, 1049 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c0f69ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,31 @@ +# Binaries +./epimetheus +prometheus-pusher + +# Test coverage +coverage.out +coverage.html +*.prof + +# Logs +*.log + +# OS files +.DS_Store +Thumbs.db + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Test data +test-*.csv +test-*.json + +# Temporary files +tmp/ +benchmark-data-100mb.csv +benchmark-results/ +OLD/ diff --git a/Magefile.go b/Magefile.go index 3cce6e0..6a0b124 100644 --- a/Magefile.go +++ b/Magefile.go @@ -179,7 +179,7 @@ func Dev() error { // GenerateTestData creates test data files func GenerateTestData() error { fmt.Println("Generating test data...") - return sh.RunV("./generate-test-data.sh") + return sh.RunV("./scripts/generate-test-data.sh") } // Backfill runs backfill for the last 48 hours @@ -192,31 +192,31 @@ func Backfill() error { // Benchmark100MB runs the 100MB benchmark func Benchmark100MB() error { fmt.Println("Running 100MB benchmark...") - return sh.RunV("./benchmark-100mb.sh") + return sh.RunV("./scripts/benchmark-100mb.sh") } // Benchmark1GB runs the 1GB benchmark func Benchmark1GB() error { fmt.Println("Running 1GB benchmark...") - return sh.RunV("./benchmark-1gb.sh") + return sh.RunV("./scripts/benchmark-1gb.sh") } // CleanupBenchmarkData removes benchmark data from Prometheus func CleanupBenchmarkData() error { fmt.Println("Cleaning up benchmark data...") - return sh.RunV("./cleanup-benchmark-data.sh") + return sh.RunV("./scripts/cleanup-benchmark-data.sh") } // CleanupBenchmarkMetrics removes benchmark metric files func CleanupBenchmarkMetrics() error { fmt.Println("Cleaning up benchmark metric files...") - return sh.RunV("./cleanup-benchmark-metrics.sh") + return sh.RunV("./scripts/cleanup-benchmark-metrics.sh") } // DeployDashboard deploys the Grafana dashboard func DeployDashboard() error { fmt.Println("Deploying Grafana dashboard...") - return sh.RunV("./deploy-dashboard.sh") + return 
sh.RunV("./scripts/deploy-dashboard.sh") } // Help prints available targets @@ -4,993 +4,82 @@ # Epimetheus -A versatile Go tool for pushing metrics to Prometheus with support for both realtime and historic data ingestion. +A versatile Go tool for pushing metrics to Prometheus (and Prometheus-compatible backends like VictoriaMetrics) and ClickHouse, with support for realtime and historic data ingestion. ## Why "Epimetheus"? -In Greek mythology, [Epimetheus](https://en.wikipedia.org/wiki/Epimetheus_(mythology)) is Prometheus's brother, whose name means "afterthought" or "hindsight" (while Prometheus means "forethought"). This name cleverly captures the tool's purpose: bringing data to Prometheus **after** collection, whether it's historic data from hours, days, or weeks ago, or realtime data pushed on-demand. - -While Epimetheus is sometimes depicted as foolish in myths (he accepted Pandora's box despite warnings), this tool embraces the "afterthought" aspect productively - it's never too late to bring your metrics home to Prometheus! 
- -## Architecture - -``` -┌─────────────────────────────────────────────────────────────────────────┐ -│ Epimetheus │ -│ (Metrics Ingestion Tool) │ -│ │ -│ Modes: │ -│ • Realtime - Current metrics (< 5 min old) │ -│ • Historic - Historic metrics (≥ 5 min old) │ -│ • Backfill - Range of historic data │ -│ • Auto - Automatic routing based on timestamp age │ -└─────────────────────────────────────────────────────────────────────────┘ - │ │ - │ Realtime Data │ Historic Data - │ (via HTTP POST) │ (via Remote Write API) - │ Uses "now" timestamp │ Preserves timestamps - ▼ ▼ -┌─────────────────────┐ ┌─────────────────────┐ -│ Pushgateway │ │ Prometheus │ -│ (Port 9091) │ │ (Port 9090) │ -│ │ │ │ -│ • Buffers metrics │ │ Remote Write API: │ -│ • Scraped by │──── Scraped ─────▶ │ /api/v1/write │ -│ Prometheus │ every 15-30s │ │ -│ • No timestamp │ │ Feature Required: │ -│ preservation │ │ --enable-feature= │ -│ │ │ remote-write- │ -│ │ │ receiver │ -└─────────────────────┘ └─────────────────────┘ - │ - │ Prometheus Query API - │ /api/v1/query - ▼ - ┌─────────────────────┐ - │ Grafana │ - │ (Port 3000) │ - │ │ - │ • Prometheus as │ - │ datasource │ - │ • Dashboards: │ - │ - Epimetheus │ - │ Test Metrics │ - │ • Auto-refresh │ - └─────────────────────┘ -``` - -### Data Flow - -1. **Realtime Path** (for current data): - - Epimetheus → Pushgateway (HTTP POST) - - Prometheus scrapes Pushgateway periodically - - Timestamp = "now" when Prometheus scrapes - -2. **Historic Path** (for old data): - - Epimetheus → Prometheus Remote Write API (HTTP POST) - - Direct write to Prometheus TSDB - - Timestamp preserved from original data - -3. **Visualization**: - - Grafana queries Prometheus - - Displays metrics in dashboards - - Auto-refresh every 10 seconds +In Greek mythology, [Epimetheus](https://en.wikipedia.org/wiki/Epimetheus_(mythology)) is Prometheus's brother—"afterthought" or "hindsight" (while Prometheus means "forethought"). 
This tool brings data to Prometheus **after** collection: historic data from hours or days ago, or realtime data pushed on-demand. It's never too late to bring your metrics home. ## Overview -**epimetheus** is a standalone binary that: -- **Generates** realistic example metrics simulating production applications -- **Pushes** metrics via Pushgateway (realtime) or Remote Write API (historic) -- **Automatically detects** timestamp age and chooses the optimal ingestion method -- **Supports** multiple data formats (CSV, JSON) and all Prometheus metric types -- **Provides** Grafana dashboard for visualizing test metrics - -## Quick Start - -### 1. Deploy Pushgateway (one-time setup) - -The Pushgateway Helm chart is available in the [conf repository](https://codeberg.org/snonux/conf) at `f3s/pushgateway/helm-chart`. - -```bash -# Clone the conf repository if you haven't already -git clone https://codeberg.org/snonux/conf.git -cd conf/f3s/pushgateway/helm-chart - -# Deploy Pushgateway -helm upgrade --install pushgateway . -n monitoring --create-namespace -``` - -Alternatively, deploy Pushgateway using the official chart: - -```bash -helm repo add prometheus-community https://prometheus-community.github.io/helm-charts -helm install pushgateway prometheus-community/prometheus-pushgateway -n monitoring --create-namespace -``` - -### 2. Run in Realtime Mode - -```bash -# Port-forward Pushgateway -kubectl port-forward -n monitoring svc/pushgateway 9091:9091 & - -# Push test metrics continuously -cd /home/paul/git/conf/f3s/epimetheus -./epimetheus -mode=realtime -continuous -``` - -The binary pushes metrics every 15 seconds. Press Ctrl+C to stop. - -### 3. 
View Metrics - -```bash -# Pushgateway UI -open http://localhost:9091 - -# Prometheus UI -kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 & -open http://localhost:9090 -``` - -## Operating Modes - -### 👁️ Watch Mode -Monitor CSV files for changes and push metrics to Prometheus with file modification timestamps. - -**Works with ANY CSV format** - automatically detects numeric vs string columns and sanitizes names. - -**NEW: Automatic DNS Resolution** - IP addresses are automatically resolved to hostnames for better observability in Grafana. - -```bash -./epimetheus -mode=watch \ - -file=mydata.csv \ - -metric-name=myapp \ - -prometheus=http://localhost:9090/api/v1/write -``` - -**Features:** -- 🔍 **Format-agnostic**: Works with any tabular CSV structure -- 📊 **Automatic detection**: Numeric columns → metrics, String columns → labels -- 🏷️ **Name sanitization**: `min(potatoes)`, `avg(time)`, `p99(latency)` → valid metric names -- 🌐 **DNS Resolution**: IP addresses → hostnames (e.g., `10.50.52.61` → `foo.example.lan`) -- 💾 **Smart Caching**: In-memory cache prevents redundant DNS lookups -- ⏱️ **Timestamp preservation**: Uses file modification time -- 🔄 **Continuous monitoring**: Polls file every 1 second -- 💪 **Error resilient**: Continues watching despite failures -- 🎯 **Remote Write**: Pushes to Prometheus (preserves timestamps) - -**CSV Format:** -Works with any tabular CSV: -- First row: column headers (automatically sanitized) -- Subsequent rows: data values -- Column names can be anything: `min(x)`, `avg(y)`, `p99(latency)`, etc. 
- -**Example 1** - Web metrics: -```csv -avg(response_time),p99(latency),endpoint,method -45.2,120.5,/api/users,GET -52.1,135.8,/api/orders,POST -``` - -Generates: -```promql -web_avg_response_time{endpoint="/api/users",method="GET"} 45.2 -web_p99_latency{endpoint="/api/users",method="GET"} 120.5 -web_avg_response_time{endpoint="/api/orders",method="POST"} 52.1 -web_p99_latency{endpoint="/api/orders",method="POST"} 135.8 -``` - -**Example 2** - Food metrics: -```csv -min(potatoes),last(coke),avg(price),country,store_type -5.2,10.5,12.99,USA,grocery -3.8,8.2,9.99,Canada,convenience -``` - -Generates: -```promql -food_min_potatoes{country="USA",store_type="grocery"} 5.2 -food_last_coke{country="USA",store_type="grocery"} 10.5 -food_avg_price{country="USA",store_type="grocery"} 12.99 -# ... etc -``` - -Each row generates N samples (N = number of numeric columns). - -See [CSV-FORMAT-FLEXIBILITY.md](CSV-FORMAT-FLEXIBILITY.md) for more examples. - -**Options:** -- `-file` - CSV file to watch (required) -- `-metric-name` - Base metric name (required, e.g., `food`, `network`, `database`) -- `-prometheus` - Prometheus Remote Write URL (default: http://localhost:9090/api/v1/write) -- `-clickhouse` - ClickHouse HTTP URL (e.g. 
http://localhost:8123) to also ingest metrics -- `-clickhouse-table` - ClickHouse table name (default: epimetheus_metrics) -- `-job` - Job name for metrics (default: example_metrics_pusher) -- `-resolve-ip-labels` - Additional IP labels to resolve via DNS (default: ip is always resolved) - -**ClickHouse Support:** -Watch mode can ingest to ClickHouse in addition to (or instead of) Prometheus: - -```bash -# Ingest to both Prometheus and ClickHouse -./epimetheus -mode=watch -file=data.csv -metric-name=myapp \ - -prometheus=http://localhost:9090/api/v1/write \ - -clickhouse=http://localhost:8123 - -# ClickHouse only (use -prometheus= to disable Prometheus) -./epimetheus -mode=watch -file=test-data/watch-clickhouse-test.csv \ - -metric-name=watch_test -clickhouse=http://localhost:8123 -prometheus= - -# Verify data in ClickHouse -./verify-clickhouse.sh -``` - -**DNS Resolution:** -By default, the `ip` label is automatically resolved to a hostname. To resolve additional IP labels: - -```bash -./epimetheus -mode=watch \ - -file=network.csv \ - -metric-name=network \ - -resolve-ip-labels=source_ip,dest_ip -``` - -This will resolve: `ip` (default) + `source_ip` + `dest_ip` - -**Example:** -- Input: `ip="10.50.52.61"` -- Output: `ip="foo.example.lan"` -- Failed lookups: IP remains unchanged - -**Documentation:** -- [DNS-RESOLUTION-FEATURE.md](DNS-RESOLUTION-FEATURE.md) - Complete DNS resolution guide -- [CSV-FORMAT-FLEXIBILITY.md](CSV-FORMAT-FLEXIBILITY.md) - Works with ANY CSV format -- [DTAIL-METRICS-EXAMPLE.md](DTAIL-METRICS-EXAMPLE.md) - Detailed dtail.csv example - -### 🔄 Realtime Mode (Default) -Push current metrics to Pushgateway with "now" timestamp. 
- -```bash -./epimetheus -mode=realtime -continuous -``` - -**Options:** -- `-pushgateway` - Pushgateway URL (default: http://localhost:9091) -- `-job` - Job name (default: example_metrics_pusher) -- `-continuous` - Keep pushing every 15 seconds - -### ⏰ Historic Mode -Push a single datapoint from the past using Remote Write API. - -```bash -# Port-forward Prometheus -kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 & - -# Push data from 24 hours ago -./epimetheus -mode=historic -hours-ago=24 -``` - -**Options:** -- `-prometheus` - Prometheus URL (default: http://localhost:9090/api/v1/write) -- `-hours-ago` - Hours in the past (default: 24) - -### 📦 Backfill Mode -Import a range of historic data points. - -```bash -# Backfill last 48 hours with 1-hour intervals -./epimetheus -mode=backfill -start-hours=48 -end-hours=0 -interval=1 - -# Backfill last week with 6-hour intervals -./epimetheus -mode=backfill -start-hours=168 -end-hours=0 -interval=6 -``` - -**Options:** -- `-start-hours` - Start time in hours ago -- `-end-hours` - End time in hours ago (0 = now) -- `-interval` - Interval between points in hours - -### 🤖 Auto Mode (Recommended!) -Automatically detect timestamp age and route to the correct ingestion method. 
- -```bash -# Generate test data -./generate-test-data.sh - -# Import mixed current and historic data -./epimetheus -mode=auto -file=test-all-ages.csv -``` - -**Detection Logic:** -- Data < 5 minutes old → Pushgateway (realtime) -- Data ≥ 5 minutes old → Remote Write (historic) - -**Options:** -- `-file` - Input file path -- `-format` - Data format: csv or json (default: csv) -- `-pushgateway` - Pushgateway URL -- `-prometheus` - Prometheus Remote Write URL - -## Data Formats - -### CSV Format - -```csv -# Format: metric_name,labels,value,timestamp_ms -# Labels: key1=value1;key2=value2 -epimetheus_test_requests_total,instance=web1;env=prod,100,1767125148000 -epimetheus_test_temperature_celsius,instance=web2,22.5,1767038748000 - -# Timestamp is optional (uses "now" if omitted) -epimetheus_test_active_connections,instance=web3,42, -``` - -### JSON Format - -```json -[ - { - "metric": "epimetheus_test_requests_total", - "labels": {"instance": "web1", "env": "prod"}, - "value": 100, - "timestamp_ms": 1767125148000 - }, - { - "metric": "epimetheus_test_temperature_celsius", - "labels": {"instance": "web2"}, - "value": 22.5, - "timestamp_ms": 1767038748000 - } -] -``` - -## Test Metrics - -All generated metrics use the `epimetheus_test_` prefix to clearly identify them as test data. +Epimetheus is a standalone binary that: -### Counter: `epimetheus_test_requests_total` -- **Type:** Counter (monotonically increasing) -- **Description:** Total number of requests processed -- **Use case:** Counting total events, requests, errors +- Pushes metrics via **Pushgateway** (realtime) or **Remote Write API** (historic, watch) +- Optionally ingests to **ClickHouse** in watch mode +- Supports **Prometheus-compatible backends** (e.g. 
VictoriaMetrics) by using their Remote Write URL +- Offers modes: realtime, historic, backfill, auto, and watch (CSV file monitoring) +- Accepts CSV and JSON input and provides a Grafana dashboard for test metrics -### Gauge: `epimetheus_test_active_connections` -- **Type:** Gauge (can increase or decrease) -- **Description:** Current number of active connections (0-100) -- **Use case:** Current state measurements, capacity - -### Gauge: `epimetheus_test_temperature_celsius` -- **Type:** Gauge -- **Description:** Current temperature in Celsius (0-50°C) -- **Use case:** Environmental monitoring - -### Histogram: `epimetheus_test_request_duration_seconds` -- **Type:** Histogram (distribution) -- **Description:** Request duration distribution -- **Buckets:** 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10 seconds -- **Use case:** Latency measurements, SLO tracking - -### Labeled Counter: `epimetheus_test_jobs_processed_total` -- **Type:** Counter with labels -- **Description:** Jobs processed by type and status -- **Labels:** - - `job_type`: email, report, backup - - `status`: success, failed -- **Use case:** Categorized counting, multi-dimensional metrics - -## Grafana Dashboard - -A comprehensive dashboard is available showcasing all test metrics. - -### Dashboard Features - -- **8 Panels:** - 1. Request Rate (line graph) - 2. Total Requests (stat panel) - 3. Active Connections (gauge with thresholds) - 4. Temperature (gauge with thresholds) - 5. Request Duration Histogram (p50, p90, p99) - 6. Average Request Duration (stat) - 7. Jobs Processed by Type (bar gauge) - 8. 
Jobs Status Breakdown (table) - -- **Auto-refresh:** Every 10 seconds -- **Time range:** Last 15 minutes (customizable) -- **Dark theme optimized** - -### Deploy Dashboard - -#### Option 1: Helm/Kubernetes ConfigMap (Recommended) - -```bash -# Deploy via Kubernetes ConfigMap -kubectl apply -f ../prometheus/epimetheus-dashboard.yaml -``` - -The dashboard will be automatically discovered by Grafana. - -#### Option 2: Manual Import - -```bash -# Port-forward Grafana -kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80 - -# Open Grafana -open http://localhost:3000 - -# Go to Dashboards → Import → Upload grafana-dashboard.json -``` - -#### Option 3: Automated Script - -```bash -# Deploy via API -./deploy-dashboard.sh - -# Or with custom credentials -GRAFANA_URL="http://localhost:3000" \ -GRAFANA_USER="admin" \ -GRAFANA_PASSWORD="yourpassword" \ -./deploy-dashboard.sh -``` - -## Example Queries - -### Basic Queries - -```promql -# View total requests -epimetheus_test_requests_total - -# View request rate over last 5 minutes -rate(epimetheus_test_requests_total[5m]) - -# View current active connections -epimetheus_test_active_connections - -# View current temperature -epimetheus_test_temperature_celsius -``` - -### Histogram Queries - -```promql -# 95th percentile request duration -histogram_quantile(0.95, rate(epimetheus_test_request_duration_seconds_bucket[5m])) - -# 50th percentile (median) -histogram_quantile(0.50, rate(epimetheus_test_request_duration_seconds_bucket[5m])) - -# Average request duration -rate(epimetheus_test_request_duration_seconds_sum[5m]) / -rate(epimetheus_test_request_duration_seconds_count[5m]) -``` - -### Labeled Counter Queries - -```promql -# Failed jobs by type -epimetheus_test_jobs_processed_total{status="failed"} - -# Job success rate -rate(epimetheus_test_jobs_processed_total{status="success"}[5m]) / -rate(epimetheus_test_jobs_processed_total[5m]) - -# Total jobs by type -sum by (job_type) 
(epimetheus_test_jobs_processed_total) -``` - -### Curl Examples - -```bash -# Port-forward Prometheus -kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 & - -# Query total requests -curl -s "http://localhost:9090/api/v1/query?query=epimetheus_test_requests_total" | jq . - -# Query temperature -curl -s "http://localhost:9090/api/v1/query?query=epimetheus_test_temperature_celsius" | jq . - -# Query request rate -curl -s "http://localhost:9090/api/v1/query?query=rate(epimetheus_test_requests_total[5m])" | jq . - -# Query histogram p95 -curl -s "http://localhost:9090/api/v1/query?query=histogram_quantile(0.95,rate(epimetheus_test_request_duration_seconds_bucket[5m]))" | jq . -``` - -## Time Range Limitations - -### ✅ Supported Time Ranges - -| Time Range | Status | Method | -|------------|--------|--------| -| Current (< 5 min) | ✅ Works | Pushgateway | -| 1 hour old | ✅ Works | Remote Write | -| 1 day old | ✅ Works | Remote Write | -| 1 week old | ✅ Works | Remote Write | -| 1 month old | ✅ Works | Remote Write | - -### ⚠️ Potential Issues - -- **Future timestamps:** Rejected (> 5 minutes in future) -- **Very old data (6+ months):** May be rejected depending on Prometheus retention -- **Years old:** Likely rejected - use `promtool tsdb create-blocks-from` instead -- **Out-of-order samples:** Can't insert older data into existing time series (use different labels) - -### Prometheus Configuration - -Check your retention settings: - -```bash -# View retention -kubectl get prometheus -n monitoring prometheus-kube-prometheus-prometheus \ - -o jsonpath='{.spec.retention}' - -# Default is typically 15 days -``` - -For very old data: -- Increase retention in Prometheus config -- Enable out-of-order ingestion (experimental) -- Use `promtool` for direct TSDB block creation - -## Project Structure - -``` -epimetheus/ -├── cmd/ -│ └── epimetheus/ -│ └── main.go # Main entry point -├── internal/ -│ ├── config/ # Configuration -│ ├── metrics/ # 
Metric generators -│ ├── parser/ # CSV/JSON parsers (includes tabular CSV) -│ ├── ingester/ # Pushgateway & Remote Write ingesters -│ └── watcher/ # File watcher for watch mode -├── epimetheus # Compiled binary -├── grafana-dashboard.json # Grafana dashboard definition -├── deploy-dashboard.sh # Dashboard deployment script -├── generate-test-data.sh # Test data generator -├── run.sh # Helper script -└── README.md # This file -``` - -## Setup Requirements - -### 1. Enable Prometheus Remote Write Receiver ⚠️ **REQUIRED for Historic Data** - -**IMPORTANT**: To use historic mode, backfill mode, or auto mode with old data, you **must** enable the Prometheus Remote Write receiver. Without this feature, Epimetheus can only push realtime data via Pushgateway. - -The Remote Write receiver is configured in the [conf repository](https://codeberg.org/snonux/conf) at `f3s/prometheus/persistence-values.yaml`: - -```yaml -# In prometheus/persistence-values.yaml (from conf repository) -prometheus: - prometheusSpec: - # Enable Remote Write receiver endpoint and Admin API (Prometheus 3.x syntax) - additionalArgs: - - name: web.enable-remote-write-receiver - value: "" - - name: web.enable-admin-api - value: "" - - # Enable out-of-order ingestion for backfilling - # Allows writing data points older than existing data for the same time series - enableFeatures: - - exemplar-storage - - otlp-write-receiver - - # Allow backfilling up to 31 days in the past (provides 1-day buffer for 30-day datasets) - tsdb: - outOfOrderTimeWindow: 744h # 31 days -``` - -**What This Enables:** -- **Remote Write API**: HTTP endpoint at `/api/v1/write` for ingesting metrics with custom timestamps -- **Admin API**: HTTP endpoints at `/api/v1/admin/tsdb/*` for data deletion and management -- **Out-of-Order Ingestion**: Allows writing data points older than existing data for the same time series -- **31-Day Window**: Can backfill data up to 31 days in the past (provides 1-day buffer for 30-day datasets) - 
-After updating the configuration, upgrade your Prometheus installation: - -```bash -cd conf/f3s/prometheus -just upgrade # Or manually: -# helm upgrade prometheus prometheus-community/kube-prometheus-stack \ -# -n monitoring -f persistence-values.yaml -``` - -Verify the features are enabled: - -```bash -# Check Remote Write receiver flag -kubectl get pod -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 \ - -o jsonpath='{.spec.containers[0].args}' | grep -o "web.enable-remote-write-receiver" - -# Check out-of-order time window -kubectl get prometheus -n monitoring prometheus-kube-prometheus-prometheus \ - -o jsonpath='{.spec.tsdb.outOfOrderTimeWindow}' -# Should output: 744h - -# Check admin API flag -kubectl get pod -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 \ - -o jsonpath='{.spec.containers[0].args}' | grep -o "web.enable-admin-api" -``` - -**Performance Considerations:** - -This configuration is designed for ad-hoc troubleshooting and development, **NOT production use**. Enabling these features has trade-offs: - -- **Increased Memory Usage**: Out-of-order ingestion requires additional memory for buffering and sorting time series -- **Higher TSDB Overhead**: Prometheus TSDB needs to handle non-sequential writes, increasing disk I/O -- **Query Performance**: Queries may be slower due to fragmented data blocks -- **Storage Amplification**: Out-of-order samples can trigger additional compactions, increasing storage usage - -**Recommendation for Production:** -- Keep `outOfOrderTimeWindow` as small as possible (or disabled) -- Monitor Prometheus memory and disk usage closely -- Use Remote Write only when necessary -- Consider using dedicated testing/development Prometheus instances - -**Note**: The syntax changed in Prometheus 3.x - use `additionalArgs` with `web.enable-remote-write-receiver` instead of the deprecated `enableFeatures: [remote-write-receiver]`. - -### 2. 
Update Prometheus Scrape Config +## Quick Start -Ensure Pushgateway is in scrape targets: +1. **Build:** `mage build` or `go build -o epimetheus cmd/epimetheus/main.go` +2. **Realtime (Pushgateway):** Deploy Pushgateway and Prometheus, then run: + ```bash + ./epimetheus -mode=realtime -continuous + ``` +3. **Watch (Remote Write):** Enable [Remote Write receiver](docs/operations/setup-prometheus.md), then: + ```bash + ./epimetheus -mode=watch -file=mydata.csv -metric-name=myapp -prometheus=http://localhost:9090/api/v1/write + ``` +4. **View:** Prometheus at http://localhost:9090 (after port-forward if needed). For full steps see [Quick Start](docs/guides/quickstart.md). -```yaml -# additional-scrape-configs.yaml -- job_name: 'pushgateway' - honor_labels: true - static_configs: - - targets: - - 'pushgateway.monitoring.svc.cluster.local:9091' -``` +## Documentation -Apply the configuration: +Full documentation is in the [docs](docs/README.md) directory: -```bash -kubectl create secret generic additional-scrape-configs \ - --from-file=/home/paul/git/conf/f3s/prometheus/additional-scrape-configs.yaml \ - --dry-run=client -o yaml -n monitoring | kubectl apply -f - -``` +| Section | Description | +|---------|-------------| +| [Guides](docs/guides/quickstart.md) | [Quick Start](docs/guides/quickstart.md), [Modes](docs/guides/modes.md), [Data Formats](docs/guides/data-formats.md), [CSV flexibility](docs/guides/csv-format-flexibility.md), [DNS resolution](docs/guides/dns-resolution.md), [Dtail example](docs/guides/dtail-metrics-example.md) | +| [Backends](docs/backends/prometheus.md) | [Prometheus / VictoriaMetrics](docs/backends/prometheus.md), [ClickHouse](docs/backends/clickhouse.md) | +| [Operations](docs/operations/setup-prometheus.md) | [Setup Prometheus](docs/operations/setup-prometheus.md), [Setup ClickHouse](docs/operations/setup-clickhouse.md), [Troubleshooting](docs/operations/troubleshooting.md), [Cleanup](docs/operations/cleanup.md), 
[macOS](docs/operations/macos-setup.md), [Kubernetes](docs/operations/kubernetes.md) | +| [Reference](docs/reference/cli.md) | [CLI](docs/reference/cli.md), [Test metrics](docs/reference/test-metrics.md), [Grafana dashboard](docs/reference/grafana-dashboard.md), [Example queries](docs/reference/example-queries.md), [Magefile](docs/reference/magefile.md) | +| [Design](docs/design/architecture.md) | [Architecture](docs/design/architecture.md) | -## Building from Source +[Documentation index](docs/README.md) — complete list with one-line descriptions. -### Using Mage (Recommended) +## Building -This project includes a [Magefile](./MAGEFILE.md) for easy building, testing, and running: +**Using Mage (recommended):** ```bash -# Install Mage (one-time setup) go install github.com/magefile/mage@latest - -# Build binary mage build - -# Run tests mage test - -# Run with coverage report -mage testCoverage - -# Run in realtime mode -mage run - -# See all available targets -mage -l +mage run # realtime mode ``` -See [MAGEFILE.md](./MAGEFILE.md) for complete documentation. +See [Magefile reference](docs/reference/magefile.md) for all targets. -### Using Go directly +**Using Go:** ```bash -# Build binary go build -o epimetheus cmd/epimetheus/main.go - -# Run tests -go test ./... -v - -# Check test coverage -go test ./... -cover -``` - -## Troubleshooting - -### Binary can't connect to Pushgateway - -```bash -# Check port-forward is running -ps aux | grep "port-forward.*9091" - -# Restart port-forward -kubectl port-forward -n monitoring svc/pushgateway 9091:9091 -``` - -### Metrics not appearing in Prometheus - -```bash -# Check Pushgateway has metrics -curl http://localhost:9091/metrics | grep "prometheus_pusher_test" - -# Check Prometheus scrape targets -# Open http://localhost:9090/targets - look for "pushgateway" job - -# Check Prometheus logs -kubectl logs -n monitoring -l app.kubernetes.io/name=prometheus +go test ./... 
``` -### "Remote write receiver not enabled" error - -```bash -# Verify feature is enabled -kubectl logs -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 | grep "remote-write-receiver" - -# Should see: msg="Experimental features enabled" features=[remote-write-receiver] -``` - -### "Out of order sample" error - -This occurs when trying to insert data older than existing data for the same time series. - -**Solutions:** -- Use different job labels for historic data (e.g., `job="historic_data"`) -- Enable out-of-order ingestion in Prometheus (experimental) -- Ensure backfill goes from oldest to newest - -### Dashboard not appearing in Grafana - -```bash -# Check ConfigMap exists -kubectl get configmap -n monitoring | grep epimetheus - -# Check labels -kubectl get configmap epimetheus-dashboard -n monitoring -o yaml | grep "grafana_dashboard" - -# Restart Grafana to force reload -kubectl rollout restart deployment/prometheus-grafana -n monitoring -``` - -## Architecture - -``` -┌─────────────────┐ -│ Go Binary │ -│ (prometheus- │──Push realtime──┐ -│ pusher) │ │ -└─────────────────┘ ▼ - │ ┌──────────────────┐ - │ │ Pushgateway │◄──Scrape──┐ - │ │ (Port 9091) │ │ - │ └──────────────────┘ │ - │ │ - └──Push historic──────────────────┐ │ - ▼ │ - ┌─────────────────┐ │ - │ Prometheus │◄────┘ - │ (Port 9090) │ - │ Remote Write API│ - └─────────────────┘ - │ - │ Datasource - ▼ - ┌─────────────────┐ - │ Grafana │ - │ (Port 3000) │ - │ Dashboards │ - └─────────────────┘ -``` - -## Best Practices - -### When to Use Pushgateway vs. 
Remote Write - -**Use Pushgateway (realtime mode):** -- Short-lived batch jobs -- Service-level metrics -- Jobs behind firewalls -- Current/recent data (< 5 minutes old) - -**Use Remote Write (historic mode):** -- Historic data import -- Backfilling gaps -- Data migration -- Data older than 5 minutes - -**Use Auto Mode:** -- Mixed current and historic data -- Importing from files -- Unknown timestamp ages -- General-purpose ingestion - -### Metric Design - -- **Use appropriate metric types:** - - Counter for cumulative values (requests, errors) - - Gauge for point-in-time values (temperature, connections) - - Histogram for distributions (latency, sizes) - -- **Label cardinality:** - - Include meaningful labels - - Avoid high-cardinality labels (user IDs, timestamps) - - Keep label combinations reasonable (< 1000 per metric) - -- **Naming conventions:** - - Use descriptive names - - Include units in gauge names (\_celsius, \_bytes) - - Use \_total suffix for counters - -## Cleanup - -### Cleaning Up Benchmark Data from Prometheus - -For cleaning up benchmark metrics from Prometheus, use the provided cleanup script: - -```bash -# Port-forward to Prometheus -kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 & - -# Run the cleanup script -./cleanup-benchmark-data.sh -``` - -The script will: -1. Delete all `epimetheus_benchmark_*` metrics using the Prometheus Admin API -2. Clean up tombstones to free disk space -3. 
Provide clear success/error feedback - -**Manual cleanup** (if you prefer): - -```bash -# Delete specific metric -curl -X POST 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]=epimetheus_benchmark_cpu_usage' - -# Clean up tombstones -curl -X POST 'http://localhost:9090/api/v1/admin/tsdb/clean_tombstones' -``` - -### Other Cleanup Tasks - -```bash -# Stop port-forwards -pkill -f "port-forward.*9091" -pkill -f "port-forward.*9090" -pkill -f "port-forward.*3000" - -# Delete test metrics from Pushgateway -curl -X DELETE http://localhost:9091/metrics/job/example_metrics_pusher - -# Uninstall Pushgateway (if needed) -helm uninstall pushgateway -n monitoring -``` - -## MacOS Setup - -### Basic Installation - -```bash -brew install prometheus -brew install grafana -go install github.com/prometheus/pushgateway@latest -brew services start grafana -brew services start prometheus -~/go/bin/pushgateway & -``` - -Once done, login to http://localhost:3000 as admin:admin, you will be prompted to change the password. Afterwards, add http://localhost:9090 as a Prometheus datasource. - -### Enable Remote Write Receiver (Required for Watch Mode) - -⚠️ **Important**: Watch mode, historic mode, backfill mode, and auto mode require the Prometheus Remote Write receiver to be enabled. - -#### Option 1: Permanent Configuration (Recommended) - -Edit the Prometheus arguments file: - -```bash -# Edit the arguments file -nano /opt/homebrew/etc/prometheus.args -``` - -Add this line at the end: -``` ---web.enable-remote-write-receiver -``` - -The complete file should look like: -``` ---config.file /opt/homebrew/etc/prometheus.yml ---web.listen-address=127.0.0.1:9090 ---storage.tsdb.path /opt/homebrew/var/prometheus ---web.enable-remote-write-receiver ---web.enable-admin-api -``` - -**Note:** `--web.enable-admin-api` is optional but recommended for easier data management (allows deleting old metrics). 
- -Restart Prometheus: -```bash -brew services restart prometheus -``` - -Verify it's working: -```bash -# Check Prometheus is healthy -curl http://localhost:9090/-/healthy - -# Test Remote Write endpoint (should return 400, not 404) -curl -X POST http://localhost:9090/api/v1/write -``` - -#### Option 2: Temporary (For Testing) - -Stop the service and start manually: - -```bash -# Stop brew service -brew services stop prometheus - -# Start with Remote Write enabled -prometheus --web.enable-remote-write-receiver -``` - -Keep this terminal open. In another terminal, run your epimetheus commands. - -**Note**: This only lasts until you stop the terminal. Use Option 1 for permanent setup. - -### Clearing Old Metrics (Optional) - -If you need to delete old metrics and start fresh: - -```bash -# Delete specific metrics (e.g., blockstore) -curl -X POST -g 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]={__name__=~"blockstore_.*"}' - -# Clean up deleted data -curl -X POST http://localhost:9090/api/v1/admin/tsdb/clean_tombstones - -# Wait a moment for cleanup -sleep 2 -``` - -**Note:** Admin API must be enabled (add `--web.enable-admin-api` to prometheus.args). - -### Verify Setup - -Once Remote Write is enabled, test watch mode: - -```bash -# Create a test CSV -cat > /tmp/test.csv << EOF -status,count,method -200,100,GET -404,50,POST -EOF - -# Watch the file -./epimetheus -mode=watch \ - -file=/tmp/test.csv \ - -metric-name=test \ - -prometheus=http://localhost:9090/api/v1/write -``` +## Project Structure -You should see: -``` -✅ Successfully pushed X samples to Prometheus ``` - -Query in Prometheus (http://localhost:9090): -```promql -{__name__=~"test_.*"} +epimetheus/ +├── cmd/epimetheus/ # Main entry point +├── internal/ # config, ingester, metrics, parser, resolver, watcher +├── docs/ # Documentation +├── scripts/ # Helper shell scripts (verify-clickhouse, generate-test-data, etc.) 
+├── test-data/ # Test CSVs +├── Magefile.go # Build and run targets +└── README.md ``` -## Additional Resources - -- [Prometheus Documentation](https://prometheus.io/docs/) -- [Pushgateway Documentation](https://github.com/prometheus/pushgateway) -- [Prometheus Remote Write Spec](https://prometheus.io/docs/concepts/remote_write_spec/) -- [Grafana Documentation](https://grafana.com/docs/) - ## Version Current version: 0.0.0 diff --git a/docs/DOCS-RESTRUCTURE-PLAN.md b/docs/DOCS-RESTRUCTURE-PLAN.md new file mode 100644 index 0000000..c688993 --- /dev/null +++ b/docs/DOCS-RESTRUCTURE-PLAN.md @@ -0,0 +1,235 @@ +# Documentation Restructure Plan + +This plan addresses the current documentation sprawl and clarifies the **multiple ingestion backends** (Prometheus, ClickHouse, and future backends such as VictoriaMetrics) and **modes** (realtime, historic, backfill, auto, watch). + +--- + +## 1. Current State Summary + +### 1.1 Existing Markdown Files + +| File | Purpose | Issues | +|------------|-----------------------------------|--------| +| `README.md` | Single ~995-line doc: intro, modes, backends, setup, troubleshooting, macOS, cleanup | Too long; mixes audiences and backends; hard to maintain | +| `AGENT.md` | Agent rules (Grafana dashboard guidelines + ref to `~/git/conf/snippets/go/go-projects.md`) | Fine as-is; not user docs | +| `CLAUDE.md` | One-line pointer to AGENT.md | Fine as-is | + +### 1.2 Broken or Missing References in README + +- `CSV-FORMAT-FLEXIBILITY.md` – linked, **does not exist** +- `DNS-RESOLUTION-FEATURE.md` – linked, **does not exist** +- `DTAIL-METRICS-EXAMPLE.md` – linked, **does not exist** +- `MAGEFILE.md` – linked, **does not exist** (build logic lives in `Magefile.go`) + +### 1.3 Ingestion Backends (from codebase) + +| Backend | Modes | Notes | +|-----------|---------------------------|--------| +| **Prometheus** | realtime (Pushgateway), historic/backfill/auto (Remote Write), watch (Remote Write) | Primary; Remote Write requires feature 
flag | +| **ClickHouse** | watch only | Optional; can run with Prometheus or alone | + +*VictoriaDB / VictoriaMetrics:* Not present in code today. Plan leaves room for a dedicated backend doc when added. + +--- + +## 2. Goals + +1. **Separate by ingestion backend** so Prometheus vs ClickHouse (and future backends) have clear, non-redundant docs. +2. **Split by audience and topic**: quick start vs reference vs operations (setup, troubleshooting, cleanup). +3. **Fix broken links**: either add the missing docs or replace links with in-README sections / new doc paths. +4. **Single source of truth** for each concept (e.g. “how watch mode works” and “how to configure Prometheus” in one place each). +5. **Easier maintenance**: smaller, focused files; clear naming; one `docs/` tree. + +--- + +## 3. Proposed Directory Layout + +``` +epimetheus/ +├── README.md # Short overview + quick start + doc index (slimmed) +├── AGENT.md # Unchanged +├── CLAUDE.md # Unchanged +├── docs/ +│ ├── README.md # Documentation index (nav + short descriptions) +│ │ +│ ├── guides/ # How-to and concepts +│ │ ├── quickstart.md # Minimal path to first push (Prometheus or ClickHouse) +│ │ ├── modes.md # All modes: realtime, historic, backfill, auto, watch +│ │ ├── data-formats.md # CSV (epimetheus + tabular) and JSON +│ │ ├── csv-format-flexibility.md # “Any CSV” + examples (replaces missing file) +│ │ ├── dns-resolution.md # IP → hostname resolution (replaces missing file) +│ │ └── dtail-metrics-example.md # Optional: dtail.csv walkthrough (replaces missing file) +│ │ +│ ├── backends/ # One doc per ingestion backend +│ │ ├── prometheus.md # Pushgateway + Remote Write, config, limits +│ │ ├── clickhouse.md # Watch-only; schema; verify script +│ │ └── (future) victoriametrics.md # When/if added +│ │ +│ ├── operations/ # Setup, runbooks, platform-specific +│ │ ├── setup-prometheus.md # Remote Write receiver, scrape config, retention +│ │ ├── setup-clickhouse.md # Table creation, verify-clickhouse.sh +│ 
│ ├── troubleshooting.md # Connection issues, “no metrics”, out-of-order, etc. +│ │ ├── cleanup.md # Benchmark cleanup, Pushgateway delete, port-forwards +│ │ ├── macos-setup.md # Brew, Prometheus args, Remote Write on macOS +│ │ └── kubernetes.md # Port-forwards, Helm, ConfigMaps (from current README) +│ │ +│ ├── reference/ # Reference material +│ │ ├── cli.md # All flags by mode +│ │ ├── test-metrics.md # epimetheus_test_* metrics and types +│ │ ├── grafana-dashboard.md # Panels, deploy options, datasource +│ │ ├── example-queries.md # PromQL and curl examples +│ │ └── magefile.md # Mage targets (replaces missing MAGEFILE.md) +│ │ +│ └── design/ # Optional, for contributors +│ └── architecture.md # High-level data flow (current ASCII diagrams) +``` + +--- + +## 4. File-by-File Plan + +### 4.1 Root `README.md` (slimmed) + +- **Keep:** Project name, tagline, “Why Epimetheus”, **one** high-level architecture diagram (simplified). +- **Keep:** Very short “Overview” (1 paragraph) and **Quick Start** (3–5 steps pointing at `docs/guides/quickstart.md` for details). 
+- **Add:** **Documentation index** – bullet list with links to: + - `docs/README.md` + - `docs/guides/quickstart.md`, `docs/guides/modes.md` + - `docs/backends/prometheus.md`, `docs/backends/clickhouse.md` + - `docs/operations/setup-prometheus.md`, `docs/operations/troubleshooting.md` + - `docs/reference/cli.md`, `docs/reference/magefile.md` +- **Move out of README into `docs/`:** + - All mode details → `docs/guides/modes.md` + - Backend-specific behaviour → `docs/backends/*.md` + - Setup (Prometheus, ClickHouse, k8s, macOS) → `docs/operations/*.md` + - Data formats → `docs/guides/data-formats.md` (+ csv-format-flexibility, dns-resolution, dtail example) + - Test metrics, Grafana, example queries → `docs/reference/*.md` + - Troubleshooting, cleanup → `docs/operations/*.md` + - Time range / retention → `docs/backends/prometheus.md` and `docs/operations/setup-prometheus.md` +- **Fix links:** Remove links to `CSV-FORMAT-FLEXIBILITY.md`, `DNS-RESOLUTION-FEATURE.md`, `DTAIL-METRICS-EXAMPLE.md`, `MAGEFILE.md` from README; point to `docs/guides/...` and `docs/reference/magefile.md` instead. + +**Target:** README under ~150–200 lines. + +--- + +### 4.2 `docs/README.md` (new) + +- Title: “Epimetheus Documentation”. +- Short intro (2–3 sentences). +- **Structured index** with sections: + - **Guides:** quickstart, modes, data formats, CSV flexibility, DNS resolution, dtail example. + - **Ingestion backends:** Prometheus, ClickHouse (and placeholder for Victoria* if desired). + - **Operations:** setup (Prometheus, ClickHouse), troubleshooting, cleanup, macOS, Kubernetes. + - **Reference:** CLI, test metrics, Grafana, example queries, Mage. +- Each entry: link + one-line description. + +--- + +### 4.3 Guides + +| Doc | Content | Source | +|-----|--------|--------| +| `guides/quickstart.md` | Minimal steps: build/run, push to Prometheus or ClickHouse, view (Prometheus UI or verify-clickhouse.sh). | Current README “Quick Start” + “Run in Realtime Mode” + one watch example. 
| +| `guides/modes.md` | Table: mode name, purpose, which backends, main flags. Then one subsection per mode (realtime, historic, backfill, auto, watch) with short description and example command. | Current README “Operating Modes”. | +| `guides/data-formats.md` | Epimetheus CSV (metric_name, labels, value, timestamp_ms), JSON format, optional timestamp. Link to csv-format-flexibility for tabular CSV. | Current README “Data Formats”. | +| `guides/csv-format-flexibility.md` | “Works with any CSV”: headers → metric names/labels, numeric vs string columns, sanitization, examples (web, food). | New content; replaces missing `CSV-FORMAT-FLEXIBILITY.md`. | +| `guides/dns-resolution.md` | Default `ip` resolution; `-resolve-ip-labels`; behaviour on failure. | New content; replaces missing `DNS-RESOLUTION-FEATURE.md`. | +| `guides/dtail-metrics-example.md` | Optional: step-by-step dtail.csv example. | New content; replaces missing `DTAIL-METRICS-EXAMPLE.md`; can be short. | + +--- + +### 4.4 Backends + +| Doc | Content | Source | +|-----|--------|--------| +| `backends/prometheus.md` | Pushgateway (realtime) vs Remote Write (historic/watch); URLs; time range and retention limits; out-of-order; link to setup-prometheus. | README Prometheus bits + “Time Range Limitations” + “Setup Requirements” (Remote Write). | +| `backends/clickhouse.md` | Watch-only; `-clickhouse`, `-clickhouse-table`; table schema (from code/comments); `verify-clickhouse.sh`; Prometheus + ClickHouse together. | README “ClickHouse Support” + verify-clickhouse.sh + internal/ingester/clickhouse.go. | + +--- + +### 4.5 Operations + +| Doc | Content | Source | +|-----|--------|--------| +| `operations/setup-prometheus.md` | Enable Remote Write receiver (and Admin API); scrape config for Pushgateway; retention; Prometheus 3.x syntax; verify commands. | Current README “Setup Requirements” (Prometheus). | +| `operations/setup-clickhouse.md` | Ensure table exists (e.g. 
from ingester); run verify script; optional Docker/systemd. | From README + scripts + code. | +| `operations/troubleshooting.md` | Pushgateway connection; metrics not in Prometheus; “Remote write receiver not enabled”; out-of-order errors; dashboard not in Grafana; ClickHouse connection. | Current README “Troubleshooting”. | +| `operations/cleanup.md` | Cleanup benchmark data script; manual Prometheus delete/tombstones; Pushgateway delete; stop port-forwards; uninstall Pushgateway. | Current README “Cleanup”. | +| `operations/macos-setup.md` | Brew install; prometheus.args (Remote Write, Admin API); verify; optional “temporary” run. | Current README “MacOS Setup”. | +| `operations/kubernetes.md` | Port-forwards (Pushgateway, Prometheus, Grafana); Helm/ConfigMap for dashboard; namespace. | Extracted from README examples. | + +--- + +### 4.6 Reference + +| Doc | Content | Source | +|-----|--------|--------| +| `reference/cli.md` | Table or list of all flags by mode (realtime, historic, backfill, auto, watch); default values. | From README + `cmd/epimetheus/main.go`. | +| `reference/test-metrics.md` | Each `epimetheus_test_*` metric: type, description, labels, use case. | Current README “Test Metrics”. | +| `reference/grafana-dashboard.md` | Panels list; deploy (ConfigMap, manual import, script); datasource; link to AGENT.md for panel guidelines. | Current README “Grafana Dashboard”. | +| `reference/example-queries.md` | PromQL and curl examples (basic, histogram, labeled counter). | Current README “Example Queries”. | +| `reference/magefile.md` | List of Mage targets (build, test, run, RunWatchClickHouse, cleanup, etc.) with one-line description and example. | From `Magefile.go`; replaces missing `MAGEFILE.md`. | + +--- + +### 4.7 Design (optional) + +| Doc | Content | Source | +|-----|--------|--------| +| `design/architecture.md` | High-level data flow; ASCII diagrams (current README); “when to use Pushgateway vs Remote Write” and “when to use which backend”. 
| Current README “Architecture” and “Best Practices”. | + +--- + +## 5. Implementation Order + +1. **Create `docs/` and index** + - Create `docs/README.md` with the full index (links can target paths that don’t exist yet). +2. **Fix broken links and add missing content** + - Add `docs/guides/csv-format-flexibility.md`, `docs/guides/dns-resolution.md`, `docs/guides/dtail-metrics-example.md`, `docs/reference/magefile.md` so all current README links resolve. +3. **Backend-centric docs** + - Add `docs/backends/prometheus.md` and `docs/backends/clickhouse.md`; move/duplicate content from README. +4. **Operations** + - Add `docs/operations/setup-prometheus.md`, `setup-clickhouse.md`, `troubleshooting.md`, `cleanup.md`, `macos-setup.md`, `kubernetes.md`; move content from README. +5. **Guides** + - Add `docs/guides/quickstart.md`, `modes.md`, `data-formats.md`; move content from README. +6. **Reference** + - Add `docs/reference/cli.md`, `test-metrics.md`, `grafana-dashboard.md`, `example-queries.md`; move content from README. +7. **Slim README** + - Cut README down to overview, quick start, and doc index; replace old links with `docs/...` links. +8. **Optional** + - Add `docs/design/architecture.md` and link from `docs/README.md`. + +--- + +## 6. Cross-Cutting Conventions + +- **Links:** Prefer relative links from repo root (e.g. `[Modes](docs/guides/modes.md)`) or from `docs/` (e.g. `[Prometheus](backends/prometheus.md)` inside docs). +- **Backend mentions:** In mode/CLI docs, use a short table or sentence: “Supported backends: Prometheus (all modes), ClickHouse (watch only).” +- **One diagram:** Keep one high-level diagram in README or `design/architecture.md`; avoid duplicating large ASCII art in multiple files. +- **CLI and defaults:** Single source of truth in `reference/cli.md`; guides and backend docs can quote the relevant subset. +- **Version/legal:** Keep “Version” and “License” in root README (or CONTRIBUTING.md if you add one). + +--- + +## 7. 
Future: VictoriaMetrics / VictoriaDB + +When adding a new backend (e.g. VictoriaMetrics, which speaks Prometheus Remote Write): + +- Add `docs/backends/victoriametrics.md` (or `victoriadb.md`) with URL format, any extra flags, and differences from Prometheus. +- In `docs/README.md` and root README, add one line to the “Ingestion backends” section. +- In `docs/guides/modes.md` and `reference/cli.md`, extend the “which backends support which mode” table and flags. +- No need to duplicate full setup/troubleshooting if it matches Prometheus; link to `backends/prometheus.md` and note compatibility where relevant. + +--- + +## 8. Checklist Before Calling Done + +- [ ] All current README links resolve (no 404s). +- [ ] README is under ~200 lines and ends with doc index. +- [ ] `docs/README.md` lists every new doc with link and one-line description. +- [ ] Prometheus vs ClickHouse (and modes) are clearly separated in backends and guides. +- [ ] Setup, troubleshooting, and cleanup live under `docs/operations/`. +- [ ] Mage is documented in `docs/reference/magefile.md` and linked from root README. +- [ ] Optional: `docs/design/architecture.md` exists and is linked from index. + +This plan gives you a single place to extend when you add VictoriaDB/VictoriaMetrics or another backend, and keeps the root README short while all detailed docs live under `docs/` with a clear structure by topic and backend. diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..5f944d4 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,66 @@ +# Epimetheus Documentation + +Documentation for Epimetheus: a Go tool for pushing metrics to Prometheus (and Prometheus-compatible backends) and ClickHouse. 
+ +## Index + +### Guides + +| Document | Description | +|----------|-------------| +| [Quick Start](guides/quickstart.md) | Minimal path to first push: build, run, view in Prometheus or ClickHouse | +| [Operating Modes](guides/modes.md) | Realtime, historic, backfill, auto, and watch modes with examples | +| [Data Formats](guides/data-formats.md) | Epimetheus CSV and JSON input formats | +| [CSV Format Flexibility](guides/csv-format-flexibility.md) | Use any tabular CSV; numeric vs string columns; sanitization and examples | +| [DNS Resolution](guides/dns-resolution.md) | IP-to-hostname resolution for watch mode labels | +| [Dtail Metrics Example](guides/dtail-metrics-example.md) | Walkthrough using dtail.csv | + +### Ingestion Backends + +| Document | Description | +|----------|-------------| +| [Prometheus (and Prometheus-compatible)](backends/prometheus.md) | Pushgateway, Remote Write, time ranges; VictoriaMetrics via same URL | +| [ClickHouse](backends/clickhouse.md) | Watch-mode ingestion; table schema; verify script | + +### Operations + +| Document | Description | +|----------|-------------| +| [Setup: Prometheus](operations/setup-prometheus.md) | Enable Remote Write receiver, scrape config, retention | +| [Setup: ClickHouse](operations/setup-clickhouse.md) | Table creation, verification | +| [Troubleshooting](operations/troubleshooting.md) | Connection issues, metrics not appearing, out-of-order errors | +| [Cleanup](operations/cleanup.md) | Benchmark cleanup, Pushgateway delete, port-forwards | +| [macOS Setup](operations/macos-setup.md) | Homebrew, Prometheus args, Remote Write on macOS | +| [Kubernetes](operations/kubernetes.md) | Port-forwards, Helm, ConfigMaps | + +### Reference + +| Document | Description | +|----------|-------------| +| [CLI Reference](reference/cli.md) | All flags by mode with defaults | +| [Test Metrics](reference/test-metrics.md) | epimetheus_test_* metrics and types | +| [Grafana Dashboard](reference/grafana-dashboard.md) | 
Panels, deployment options, datasource | +| [Example Queries](reference/example-queries.md) | PromQL and curl examples | +| [Magefile](reference/magefile.md) | Mage build and run targets | + +### Design + +| Document | Description | +|----------|-------------| +| [Architecture](design/architecture.md) | Data flow, when to use Pushgateway vs Remote Write, backend choice | + +### Helper scripts + +Helper shell scripts live in **`scripts/`** at the repo root. Run them from the repo root (e.g. `./scripts/verify-clickhouse.sh`). + +| Script | Purpose | +|--------|---------| +| `verify-clickhouse.sh` | Verify ClickHouse ingestion (row count, sample data) | +| `generate-test-data.sh` | Generate `test-all-ages.csv` for auto mode | +| `cleanup-benchmark-data.sh` | Delete benchmark metrics from Prometheus (Admin API) | +| `cleanup-benchmark-metrics.sh` | Same + starts port-forward, then cleans up | +| `benchmark-100mb.sh` | 100MB ingestion benchmark | +| `benchmark-1gb.sh` | 1GB ingestion benchmark | +| `backfill-historic-data.sh` | Backfill 7 days of historic data to Prometheus | +| `run.sh` | Port-forward Pushgateway and run epimetheus in realtime mode | +| `deploy-dashboard.sh` | Deploy Grafana dashboard via API (if present) | diff --git a/docs/backends/clickhouse.md b/docs/backends/clickhouse.md new file mode 100644 index 0000000..ad1b5f0 --- /dev/null +++ b/docs/backends/clickhouse.md @@ -0,0 +1,92 @@ +# ClickHouse + +Epimetheus can ingest metrics into ClickHouse in **watch mode** only. ClickHouse is optional: you can use it in addition to Prometheus or as the only backend (by setting `-prometheus=` to disable Prometheus ingestion). 
+ +## Data flow (watch mode only) + +``` +┌─────────────────┐ poll (1s) ┌─────────────────────────────────────┐ +│ CSV file(s) │ ─────────────────▶ │ Epimetheus (watch mode) │ +│ (mtime = │ │ • Parse tabular CSV │ +│ timestamp) │ │ • -metric-name + columns → metrics │ +└─────────────────┘ └─────────────────────────────────────┘ + │ + ┌────────────────────┼────────────────────┐ + │ │ │ + ▼ ▼ │ + ┌───────────────┐ ┌───────────────┐ │ + │ Prometheus │ │ ClickHouse │ │ + │ (optional) │ │ (optional) │ │ + │ -prometheus= │ │ -clickhouse= │ │ + │ Remote Write │ │ HTTP insert │ │ + └───────────────┘ └───────────────┘ │ + │ + At least one of -prometheus or -clickhouse │ +``` + +## When It's Used + +- **Mode:** Watch only. Other modes (realtime, historic, backfill, auto) do not write to ClickHouse. +- **Flags:** + - `-clickhouse` – ClickHouse HTTP URL (e.g. `http://localhost:8123`). If empty, no ClickHouse ingestion. + - `-clickhouse-table` – Table name (default: `epimetheus_metrics`). + +At least one of `-prometheus` or `-clickhouse` must be set for watch mode. + +## Table Schema + +Epimetheus creates the table if it does not exist. Schema: + +```sql +CREATE TABLE IF NOT EXISTS epimetheus_metrics ( + metric String, + labels Map(String, String), + value Float64, + timestamp DateTime64(3) +) ENGINE = MergeTree() +ORDER BY (metric, timestamp) +``` + +- `metric` – metric name (e.g. from `-metric-name` and column headers in tabular CSV). +- `labels` – key-value map of label names and values. +- `value` – sample value. +- `timestamp` – sample time (millisecond precision). 
+ +## Examples + +**Prometheus and ClickHouse:** + +```bash +./epimetheus -mode=watch -file=data.csv -metric-name=myapp \ + -prometheus=http://localhost:9090/api/v1/write \ + -clickhouse=http://localhost:8123 +``` + +**ClickHouse only:** + +```bash +./epimetheus -mode=watch -file=test-data/watch-clickhouse-test.csv \ + -metric-name=watch_test \ + -clickhouse=http://localhost:8123 \ + -prometheus= +``` + +**Custom table:** + +```bash +./epimetheus -mode=watch -file=data.csv -metric-name=myapp \ + -clickhouse=http://localhost:8123 \ + -clickhouse-table=my_metrics +``` + +## Verification + +Use the provided script to check that data landed in ClickHouse: + +```bash +./scripts/verify-clickhouse.sh +# Or with custom URL/table: +./scripts/verify-clickhouse.sh http://localhost:8123 epimetheus_metrics +``` + +The script checks connectivity, row count, distinct metrics, sample rows, and rows per metric. See [Setup: ClickHouse](../operations/setup-clickhouse.md) for getting ClickHouse running. diff --git a/docs/backends/prometheus.md b/docs/backends/prometheus.md new file mode 100644 index 0000000..f8d2a9b --- /dev/null +++ b/docs/backends/prometheus.md @@ -0,0 +1,76 @@ +# Prometheus (and Prometheus-Compatible Backends) + +Epimetheus can ingest metrics into Prometheus via two paths. Any backend that exposes the Prometheus Remote Write API (including **VictoriaMetrics**) is supported by pointing `-prometheus=` at that backend's write URL (e.g. `http://victoriametrics:8428/api/v1/write`). 
+ +## Ingestion paths (overview) + +``` + Epimetheus + │ + ┌───────────────┼───────────────┐ + │ │ │ + ▼ ▼ ▼ + Realtime mode Historic/Backfill Watch mode + (current data) (old data) (CSV file mtime) + │ │ │ + ▼ │ │ + ┌───────────┐ │ │ + │Pushgateway │ │ │ + │ (HTTP POST)│ │ │ + └─────┬─────┘ │ │ + │ Scrape │ │ + │ (15–30s) │ │ + ▼ ▼ ▼ + ┌─────────────────────────────────────────────┐ + │ Prometheus / VictoriaMetrics │ + │ Remote Write API: /api/v1/write │ + │ (realtime: via Pushgateway scrape; │ + │ historic/watch: direct POST) │ + └─────────────────────────────────────────────┘ +``` + +## Ingestion Paths + +### Realtime: Pushgateway + +- **Used by:** realtime mode, and auto mode for samples < 5 minutes old. +- **Flow:** Epimetheus pushes to Pushgateway (HTTP POST); Prometheus scrapes Pushgateway on its schedule. Timestamps become "now" at scrape time. +- **Flags:** `-pushgateway` (default `http://localhost:9091`), `-job`, `-continuous`. + +### Historic: Remote Write API + +- **Used by:** historic mode, backfill mode, auto mode for samples ≥ 5 minutes old, and watch mode (when `-prometheus` is set). +- **Flow:** Epimetheus sends samples to the Remote Write endpoint (e.g. `/api/v1/write`). Timestamps from the data are preserved. +- **Flags:** `-prometheus` (default `http://localhost:9090/api/v1/write`). + +The Remote Write receiver must be enabled on Prometheus for historic/watch/backfill/auto with old data. See [Setup: Prometheus](../operations/setup-prometheus.md). + +## Prometheus-Compatible Backends (e.g. VictoriaMetrics) + +Backends that implement the [Prometheus Remote Write](https://prometheus.io/docs/concepts/remote_write_spec/) API work with Epimetheus without any code changes. Use their write endpoint as the `-prometheus=` URL. 
+ +**Example (VictoriaMetrics):** + +```bash +./epimetheus -mode=watch -file=data.csv -metric-name=myapp \ + -prometheus=http://victoriametrics:8428/api/v1/write +``` + +Replace host/port with your VictoriaMetrics (or other compatible) write URL. Realtime mode still uses Pushgateway (scraped by your Prometheus or VictoriaMetrics); for watch/historic/backfill/auto, only the `-prometheus=` target changes. + +## Time Ranges + +| Time range | Status | Method | +|------------|--------|--------| +| Current (< 5 min) | Supported | Pushgateway | +| 1 hour old | Supported | Remote Write | +| 1 day to 1 month old | Supported | Remote Write | +| 6+ months | May be rejected (retention) | Remote Write | +| Years old | Likely rejected; use `promtool tsdb create-blocks-from` | — | +| Future (> 5 min ahead) | Rejected | — | + +Out-of-order samples (older than existing data for the same series) require out-of-order ingestion to be enabled on the backend, or use different labels. See [Troubleshooting](../operations/troubleshooting.md). + +## Retention and Configuration + +Check your backend's retention (e.g. Prometheus `retention`, VictoriaMetrics settings). For very old data you may need to increase retention or enable out-of-order ingestion. See [Setup: Prometheus](../operations/setup-prometheus.md) for Prometheus-specific options. diff --git a/docs/design/architecture.md b/docs/design/architecture.md new file mode 100644 index 0000000..2a01e09 --- /dev/null +++ b/docs/design/architecture.md @@ -0,0 +1,101 @@ +# Architecture + +High-level data flow and when to use each ingestion path or backend. 
+ +## Data flow + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ Epimetheus │ +│ (Metrics Ingestion Tool) │ +│ │ +│ Modes: │ +│ • Realtime - Current metrics (< 5 min old) │ +│ • Historic - Historic metrics (≥ 5 min old) │ +│ • Backfill - Range of historic data │ +│ • Auto - Automatic routing based on timestamp age │ +│ • Watch - CSV file monitoring (Prometheus and/or ClickHouse) │ +└─────────────────────────────────────────────────────────────────────────┘ + │ │ + │ Realtime Data │ Historic Data + │ (via HTTP POST) │ (via Remote Write API) + │ Uses "now" timestamp │ Preserves timestamps + ▼ ▼ +┌─────────────────────┐ ┌─────────────────────┐ +│ Pushgateway │ │ Prometheus / │ +│ (Port 9091) │ │ VictoriaMetrics │ +│ │ │ (Remote Write) │ +│ • Buffers metrics │ │ │ +│ • Scraped by │──── Scraped ─────▶ │ /api/v1/write │ +│ Prometheus │ every 15-30s │ │ +└─────────────────────┘ └─────────────────────┘ + │ + │ Query API + ▼ + ┌─────────────────────┐ + │ Grafana │ + │ Dashboards │ + └─────────────────────┘ +``` + +**Watch mode** can also write to **ClickHouse** (separate path; see [ClickHouse backend](../backends/clickhouse.md)). + +## Watch mode (CSV file watcher) + +Watch mode polls CSV file(s), uses file modification time as the sample timestamp, and can push to Prometheus (Remote Write) and/or ClickHouse. 
+ +``` +┌─────────────────┐ poll (1s) ┌─────────────────────────────────────┐ +│ CSV file(s) │ ─────────────────▶ │ Epimetheus (watch mode) │ +│ │ │ • Parse tabular CSV │ +│ File mtime = │ │ • Numeric columns → metrics │ +│ sample time │ │ • String columns → labels │ +└─────────────────┘ │ • Optional DNS resolution (IPs) │ + └─────────────────────────────────────┘ + │ + ┌────────────────────┼────────────────────┐ + │ │ │ + ▼ ▼ │ + ┌───────────────┐ ┌───────────────┐ │ + │ Prometheus │ │ ClickHouse │ │ + │ (optional) │ │ (optional) │ │ + │ Remote Write │ │ HTTP insert │ │ + │ /api/v1/write│ │ (batched) │ │ + └───────────────┘ └───────────────┘ │ +``` + +At least one of `-prometheus` or `-clickhouse` must be set. See [Operating Modes](../guides/modes.md) and [ClickHouse backend](../backends/clickhouse.md). + +## When to use Pushgateway vs Remote Write + +**Use Pushgateway (realtime mode):** + +- Short-lived batch jobs +- Service-level metrics +- Jobs behind firewalls +- Current/recent data (< 5 minutes old) + +**Use Remote Write (historic, backfill, watch, or auto with old data):** + +- Historic data import +- Backfilling gaps +- Data migration +- Data older than 5 minutes +- Watch mode (to preserve file mtime as timestamp) + +**Use Auto mode:** + +- Mixed current and historic data in one file +- Unknown timestamp ages +- General-purpose file import + +## When to use which backend + +- **Prometheus or VictoriaMetrics:** Set `-prometheus=` to the backend’s Remote Write URL. Use for realtime (via Pushgateway scraped by Prometheus/VM), historic, backfill, auto, and watch. +- **ClickHouse:** Set `-clickhouse=` in watch mode for analytics/long-term storage. Can be used together with Prometheus or alone (with `-prometheus=` empty). + +## Metric design (best practices) + +- **Types:** Counter for cumulative values (requests, errors); Gauge for point-in-time (temperature, connections); Histogram for distributions (latency). 
+- **Labels:** Meaningful labels; avoid high cardinality (user IDs, raw timestamps); keep combinations reasonable (< 1000 per metric). +- **Naming:** Descriptive names; units in gauge names (e.g. `_celsius`, `_bytes`); `_total` suffix for counters. diff --git a/docs/guides/csv-format-flexibility.md b/docs/guides/csv-format-flexibility.md new file mode 100644 index 0000000..180dc28 --- /dev/null +++ b/docs/guides/csv-format-flexibility.md @@ -0,0 +1,52 @@ +# CSV Format Flexibility + +Watch mode works with **any tabular CSV**. You do not need a fixed schema; Epimetheus infers metric names and labels from column headers and value types. + +## How It Works + +- **First row:** Column headers (automatically sanitized for Prometheus label/metric names). +- **Numeric columns:** Treated as metric values. Each gets a metric name derived from the base metric name and the column header. +- **String columns:** Treated as labels. Each row’s value becomes the label value for that series. +- **Metric name:** Set with `-metric-name` (e.g. `web`, `food`, `network`). It is used as a prefix for all numeric columns. + +Column names can contain characters that are invalid in Prometheus (e.g. parentheses, spaces). They are sanitized: for example `min(potatoes)` becomes a valid metric suffix like `min_potatoes`. 
+ +## Examples + +### Web metrics + +```csv +avg(response_time),p99(latency),endpoint,method +45.2,120.5,/api/users,GET +52.1,135.8,/api/orders,POST +``` + +With `-metric-name=web` this produces series such as: + +- `web_avg_response_time{endpoint="/api/users",method="GET"} 45.2` +- `web_p99_latency{endpoint="/api/users",method="GET"} 120.5` +- `web_avg_response_time{endpoint="/api/orders",method="POST"} 52.1` +- `web_p99_latency{endpoint="/api/orders",method="POST"} 135.8` + +### Food / business metrics + +```csv +min(potatoes),last(coke),avg(price),country,store_type +5.2,10.5,12.99,USA,grocery +3.8,8.2,9.99,Canada,convenience +``` + +With `-metric-name=food` this produces series such as: + +- `food_min_potatoes{country="USA",store_type="grocery"} 5.2` +- `food_last_coke{country="USA",store_type="grocery"} 10.5` +- `food_avg_price{country="USA",store_type="grocery"} 12.99` +- and the same metrics with `country="Canada",store_type="convenience"`. + +### Summary + +- Each **row** becomes one or more samples (one per numeric column). +- **Numeric columns** → different metrics (same labels for that row). +- **String columns** → labels shared by all those metrics for that row. + +For the standard Epimetheus CSV format (explicit metric name, labels, value, timestamp) see [Data Formats](data-formats.md). For modes and watch options see [Operating Modes](modes.md). diff --git a/docs/guides/data-formats.md b/docs/guides/data-formats.md new file mode 100644 index 0000000..24d7755 --- /dev/null +++ b/docs/guides/data-formats.md @@ -0,0 +1,49 @@ +# Data Formats + +Epimetheus accepts CSV and JSON input in **auto mode**. **Watch mode** instead uses tabular CSV (see [CSV Format Flexibility](csv-format-flexibility.md)). + +## Epimetheus CSV (auto mode) + +Format: one metric per line with explicit metric name, labels, value, and optional timestamp.
+ +```csv +# Format: metric_name,labels,value,timestamp_ms +# Labels: key1=value1;key2=value2 +epimetheus_test_requests_total,instance=web1;env=prod,100,1767125148000 +epimetheus_test_temperature_celsius,instance=web2,22.5,1767038748000 + +# Timestamp optional (uses "now" if omitted) +epimetheus_test_active_connections,instance=web3,42, +``` + +- **metric_name** – Prometheus metric name. +- **labels** – Semicolon-separated `key=value` pairs. +- **value** – Numeric value. +- **timestamp_ms** – Unix milliseconds. Omit or leave empty for "now". + +## JSON (auto mode) + +Array of objects with `metric`, `labels`, `value`, and optional `timestamp_ms`: + +```json +[ + { + "metric": "epimetheus_test_requests_total", + "labels": {"instance": "web1", "env": "prod"}, + "value": 100, + "timestamp_ms": 1767125148000 + }, + { + "metric": "epimetheus_test_temperature_celsius", + "labels": {"instance": "web2"}, + "value": 22.5, + "timestamp_ms": 1767038748000 + } +] +``` + +Omit `timestamp_ms` for "now". + +## Watch mode CSV + +Watch mode uses **tabular CSV**: first row = headers, following rows = data. Numeric columns become metrics (with `-metric-name` as prefix), string columns become labels. See [CSV Format Flexibility](csv-format-flexibility.md). diff --git a/docs/guides/dns-resolution.md b/docs/guides/dns-resolution.md new file mode 100644 index 0000000..42478a2 --- /dev/null +++ b/docs/guides/dns-resolution.md @@ -0,0 +1,42 @@ +# DNS Resolution (Watch Mode) + +In watch mode, Epimetheus can resolve IP addresses in label values to hostnames. This improves readability in Grafana and other tools that display label values. + +## Default Behaviour + +- The label **`ip`** is always resolved by default (when present). +- Resolution is done via reverse DNS. The result is used as the label value (e.g. `10.50.52.61` → `foo.example.lan`). +- Failed lookups leave the original IP unchanged. +- Results are cached in memory to avoid repeated DNS lookups. 
+ +## Additional Labels + +To resolve other IP-carrying labels, use `-resolve-ip-labels` with a comma-separated list of label names: + +```bash +./epimetheus -mode=watch \ + -file=network.csv \ + -metric-name=network \ + -resolve-ip-labels=source_ip,dest_ip +``` + +This resolves: + +- `ip` (always, if present) +- `source_ip` +- `dest_ip` + +Duplicate or empty entries (e.g. listing `ip` again) are ignored. + +## Example + +- **Input label:** `ip="10.50.52.61"` +- **After resolution:** `ip="foo.example.lan"` (if reverse DNS returns that name) +- **If resolution fails:** `ip="10.50.52.61"` (unchanged) + +## When to Use + +- CSV columns that contain IPs and are used as labels (e.g. `ip`, `host`, `source_ip`, `dest_ip`). +- When you want dashboards to show hostnames instead of raw IPs. + +DNS resolution only applies in **watch mode**. Other modes do not use this feature. See [Operating Modes](modes.md) and [CLI Reference](../reference/cli.md) for full options. diff --git a/docs/guides/dtail-metrics-example.md b/docs/guides/dtail-metrics-example.md new file mode 100644 index 0000000..5416726 --- /dev/null +++ b/docs/guides/dtail-metrics-example.md @@ -0,0 +1,49 @@ +# Dtail Metrics Example + +This page walks through using Epimetheus watch mode with a CSV that could come from a tool like [Dtail](https://dtail.dev/) or any similar log/aggregation export. + +## Scenario + +You have a CSV file (e.g. `dtail.csv`) with columns that mix numeric stats and identifiers (host, service, etc.). You want to turn those into Prometheus metrics so you can graph them in Grafana. + +## Steps + +1. **Ensure the CSV has a header row** + First line = column names. Epimetheus will sanitize them for use as metric names and labels. + +2. **Identify numeric vs string columns** + - Numeric columns (e.g. `count`, `avg_latency_ms`, `p99`) become metric values. + - String columns (e.g. `host`, `service`, `region`) become labels. + +3. 
**Run watch mode** with a base metric name and your Prometheus (or Prometheus-compatible) write URL: + + ```bash + ./epimetheus -mode=watch \ + -file=dtail.csv \ + -metric-name=dtail \ + -prometheus=http://localhost:9090/api/v1/write + ``` + +4. **Optional: resolve IPs to hostnames** + If one of your label columns contains IPs (e.g. `host` or `ip`), you can resolve them: + + ```bash + ./epimetheus -mode=watch \ + -file=dtail.csv \ + -metric-name=dtail \ + -prometheus=http://localhost:9090/api/v1/write \ + -resolve-ip-labels=host + ``` + +5. **Query in Prometheus / Grafana** + Metrics will appear as `dtail_<column_name>` with your string columns as labels, e.g.: + + ```promql + dtail_avg_latency_ms{service="api", region="eu"} + ``` + +## References + +- [CSV Format Flexibility](csv-format-flexibility.md) – how column types and names are interpreted. +- [DNS Resolution](dns-resolution.md) – IP-to-hostname resolution. +- [Operating Modes](modes.md) – all watch mode options. diff --git a/docs/guides/modes.md b/docs/guides/modes.md new file mode 100644 index 0000000..bcbbb6b --- /dev/null +++ b/docs/guides/modes.md @@ -0,0 +1,130 @@ +# Operating Modes + +Epimetheus has five modes. Backend support: + +| Mode | Prometheus (Pushgateway) | Prometheus (Remote Write) | ClickHouse | +|-----------|--------------------------|---------------------------|------------| +| Realtime | Yes | No | No | +| Historic | No | Yes | No | +| Backfill | No | Yes | No | +| Auto | Yes (samples < 5 min) | Yes (samples ≥ 5 min) | No | +| Watch | Optional | Optional | Optional | + +At least one of Prometheus or ClickHouse must be configured for watch mode. + +--- + +## Watch Mode + +Monitor CSV files and push metrics using file modification time as the timestamp. Works with any tabular CSV; numeric columns become metrics, string columns become labels. 
+ +### Watch mode data flow + +``` +┌─────────────────┐ poll (1s) ┌─────────────────────────────────────┐ +│ CSV file(s) │ ─────────────────▶ │ Epimetheus (watch mode) │ +│ │ │ • Parse tabular CSV │ +│ File mtime = │ │ • Numeric columns → metrics │ +│ sample time │ │ • String columns → labels │ +└─────────────────┘ │ • Optional DNS resolution (IPs) │ + └─────────────────────────────────────┘ + │ + ┌────────────────────┼────────────────────┐ + │ │ │ + ▼ ▼ │ + ┌───────────────┐ ┌───────────────┐ │ + │ Prometheus │ │ ClickHouse │ │ + │ (optional) │ │ (optional) │ │ + │ │ │ │ │ + │ Remote Write │ │ HTTP insert │ │ + │ /api/v1/write│ │ (batched) │ │ + └───────────────┘ └───────────────┘ │ + │ │ │ + └────────────────────┴────────────────────┘ + At least one of -prometheus or -clickhouse +``` + +```bash +./epimetheus -mode=watch -file=mydata.csv -metric-name=myapp \ + -prometheus=http://localhost:9090/api/v1/write +``` + +**Options:** `-file`, `-metric-name`, `-prometheus`, `-clickhouse`, `-clickhouse-table`, `-job`, `-resolve-ip-labels`. See [CLI Reference](../reference/cli.md). + +**Features:** Format-agnostic CSV, automatic numeric/string detection, label name sanitization, optional DNS resolution for IP labels, timestamp from file mtime, continuous polling (1s), Remote Write (and optionally ClickHouse). See [CSV Format Flexibility](csv-format-flexibility.md) and [DNS Resolution](dns-resolution.md). + +--- + +## Realtime Mode (default) + +Push current metrics to Pushgateway with "now" timestamp. + +```bash +./epimetheus -mode=realtime -continuous +``` + +**Options:** `-pushgateway` (default `http://localhost:9091`), `-job`, `-continuous`. Pushes every 15 seconds when `-continuous` is set. + +--- + +## Historic Mode + +Push a single historic datapoint via Remote Write. + +```bash +./epimetheus -mode=historic -hours-ago=24 +``` + +**Options:** `-prometheus` (default `http://localhost:9090/api/v1/write`), `-hours-ago` (default 24). Requires Remote Write receiver. 
See [Backends: Prometheus](../backends/prometheus.md). + +--- + +## Backfill Mode + +Import a range of historic data points. + +```bash +./epimetheus -mode=backfill -start-hours=48 -end-hours=0 -interval=1 +./epimetheus -mode=backfill -start-hours=168 -end-hours=0 -interval=6 +``` + +**Options:** `-start-hours`, `-end-hours` (0 = now), `-interval` (hours between points). Requires Remote Write receiver. + +--- + +## Auto Mode + +Route samples by timestamp age: < 5 minutes → Pushgateway; ≥ 5 minutes → Remote Write. Use for mixed or unknown-age data. + +### Auto mode data flow + +``` +┌─────────────────┐ ┌─────────────────────────────────────┐ +│ CSV/JSON file │ ─────────────────▶ │ Epimetheus (auto mode) │ +│ (per-sample │ │ • Parse file (csv or json) │ +│ timestamps) │ │ • Route by sample age: │ +└─────────────────┘ │ < 5 min → Pushgateway │ + │ ≥ 5 min → Remote Write │ + └─────────────────────────────────────┘ + │ + ┌────────────────────┴────────────────────┐ + ▼ ▼ │ + ┌───────────────┐ ┌───────────────┐ │ + │ Pushgateway │ │ Prometheus │ │ + │ (realtime │ │ Remote Write │ │ + │ samples) │ │ (historic │ │ + └───────┬───────┘ │ samples) │ │ + │ └───────────────┘ │ + │ Scraped by Prometheus │ + ▼ │ + ┌───────────────┐ │ + │ Prometheus │◀──────────────────────────────────┘ + └───────────────┘ +``` + +```bash +./scripts/generate-test-data.sh +./epimetheus -mode=auto -file=test-all-ages.csv +``` + +**Options:** `-file`, `-format` (csv or json), `-pushgateway`, `-prometheus`. See [Data Formats](data-formats.md). diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md new file mode 100644 index 0000000..adeea2b --- /dev/null +++ b/docs/guides/quickstart.md @@ -0,0 +1,56 @@ +# Quick Start + +Minimal path to push metrics and see them in Prometheus or ClickHouse. + +## 1. Build + +```bash +go build -o epimetheus cmd/epimetheus/main.go +# Or: mage build +``` + +## 2. Run (Prometheus path) + +**Realtime mode** (Pushgateway + Prometheus): + +1. 
Deploy and expose Pushgateway (see [Kubernetes](../operations/kubernetes.md) or run Pushgateway locally). +2. Ensure Prometheus scrapes Pushgateway (see [Setup: Prometheus](../operations/setup-prometheus.md)). +3. Port-forward if needed, then run: + +```bash +kubectl port-forward -n monitoring svc/pushgateway 9091:9091 & +./epimetheus -mode=realtime -continuous +``` + +Metrics are pushed every 15 seconds. Stop with Ctrl+C. + +**Watch mode** (Remote Write; preserves timestamps): + +1. Enable the Prometheus Remote Write receiver (see [Setup: Prometheus](../operations/setup-prometheus.md)). +2. Port-forward Prometheus, then run: + +```bash +kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 & +./epimetheus -mode=watch -file=mydata.csv -metric-name=myapp \ + -prometheus=http://localhost:9090/api/v1/write +``` + +## 3. View + +- **Pushgateway:** http://localhost:9091 +- **Prometheus:** http://localhost:9090 (e.g. query `epimetheus_test_requests_total` or your metric name) +- **Grafana:** Add Prometheus as a datasource and import the Epimetheus dashboard (see [Grafana Dashboard](../reference/grafana-dashboard.md)). + +## ClickHouse path (watch only) + +1. Run ClickHouse (e.g. `sudo systemctl start clickhouse-server` or Docker). See [Setup: ClickHouse](../operations/setup-clickhouse.md). +2. Run watch mode with ClickHouse: + +```bash +./epimetheus -mode=watch -file=test-data/watch-clickhouse-test.csv \ + -metric-name=watch_test -clickhouse=http://localhost:8123 -prometheus= +``` + +3. Verify: `./scripts/verify-clickhouse.sh` + +For all modes and options see [Operating Modes](modes.md) and [CLI Reference](../reference/cli.md). 
diff --git a/docs/operations/cleanup.md b/docs/operations/cleanup.md new file mode 100644 index 0000000..7835b21 --- /dev/null +++ b/docs/operations/cleanup.md @@ -0,0 +1,48 @@ +# Cleanup + +## Benchmark data in Prometheus + +To remove benchmark metrics from Prometheus, use the provided script: + +```bash +# Port-forward to Prometheus if needed +kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 & + +./scripts/cleanup-benchmark-data.sh +``` + +The script deletes all `epimetheus_benchmark_*` series via the Admin API and runs clean_tombstones. + +**Manual deletion:** + +```bash +# Delete a specific metric +curl -X POST 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]=epimetheus_benchmark_cpu_usage' + +# Clean tombstones +curl -X POST http://localhost:9090/api/v1/admin/tsdb/clean_tombstones +``` + +The Admin API must be enabled on Prometheus (see [Setup: Prometheus](setup-prometheus.md)). + +## Other cleanup + +**Stop port-forwards:** + +```bash +pkill -f "port-forward.*9091" +pkill -f "port-forward.*9090" +pkill -f "port-forward.*3000" +``` + +**Remove test metrics from Pushgateway:** + +```bash +curl -X DELETE http://localhost:9091/metrics/job/example_metrics_pusher +``` + +**Uninstall Pushgateway (Helm):** + +```bash +helm uninstall pushgateway -n monitoring +``` diff --git a/docs/operations/kubernetes.md b/docs/operations/kubernetes.md new file mode 100644 index 0000000..20b8b07 --- /dev/null +++ b/docs/operations/kubernetes.md @@ -0,0 +1,51 @@ +# Kubernetes + +Common tasks when running Epimetheus against Prometheus, Pushgateway, and Grafana in Kubernetes. 
+ +## Port-forwards + +To run Epimetheus on your laptop against cluster services: + +```bash +# Pushgateway (realtime mode) +kubectl port-forward -n monitoring svc/pushgateway 9091:9091 & + +# Prometheus (historic/watch, queries) +kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 & + +# Grafana (dashboards) +kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80 +``` + +Then use `http://localhost:9091`, `http://localhost:9090`, and `http://localhost:3000` in Epimetheus flags and in the browser. Adjust service names and namespaces to match your cluster (e.g. `prometheus-kube-prometheus-prometheus` for kube-prometheus-stack). + +## Deploying Pushgateway + +Example using the official Helm chart: + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm install pushgateway prometheus-community/prometheus-pushgateway -n monitoring --create-namespace +``` + +Alternatively use your own chart (e.g. from the [conf repository](https://codeberg.org/snonux/conf) at `f3s/pushgateway/helm-chart`). + +## Deploying the Epimetheus Grafana dashboard + +**ConfigMap (recommended):** If you have a manifest that creates a ConfigMap with the dashboard JSON and the Grafana label for auto-discovery: + +```bash +kubectl apply -f ../prometheus/epimetheus-dashboard.yaml +``` + +**Script:** From the repo, with Grafana reachable (e.g. after port-forward): + +```bash +./scripts/deploy-dashboard.sh +# Or with credentials: +GRAFANA_URL="http://localhost:3000" GRAFANA_USER="admin" GRAFANA_PASSWORD="yourpassword" ./scripts/deploy-dashboard.sh +``` + +## Namespace and service names + +Replace `monitoring` and the Prometheus/Pushgateway/Grafana service names with whatever your Helm release or manifests use. Epimetheus only needs the URLs; it does not need to run inside the cluster. 
diff --git a/docs/operations/macos-setup.md b/docs/operations/macos-setup.md new file mode 100644 index 0000000..8ed47c9 --- /dev/null +++ b/docs/operations/macos-setup.md @@ -0,0 +1,91 @@ +# macOS Setup + +## Basic installation + +```bash +brew install prometheus +brew install grafana +go install github.com/prometheus/pushgateway@latest +brew services start grafana +brew services start prometheus +~/go/bin/pushgateway & +``` + +Log in to Grafana at http://localhost:3000 (default admin:admin; you will be prompted to change the password). Add http://localhost:9090 as a Prometheus datasource. + +## Enable Remote Write receiver (required for watch/historic/backfill/auto) + +Watch mode, historic mode, backfill mode, and auto mode with old data require the Prometheus Remote Write receiver. + +### Option 1: Permanent configuration + +Edit the Prometheus arguments file (Homebrew example): + +```bash +nano /opt/homebrew/etc/prometheus.args +``` + +Add at the end: + +``` +--web.enable-remote-write-receiver +--web.enable-admin-api +``` + +Example full file: + +``` +--config.file /opt/homebrew/etc/prometheus.yml +--web.listen-address=127.0.0.1:9090 +--storage.tsdb.path /opt/homebrew/var/prometheus +--web.enable-remote-write-receiver +--web.enable-admin-api +``` + +Restart Prometheus: + +```bash +brew services restart prometheus +``` + +Verify: + +```bash +curl http://localhost:9090/-/healthy +curl -X POST http://localhost:9090/api/v1/write # expect 400, not 404 +``` + +### Option 2: Temporary (testing only) + +```bash +brew services stop prometheus +prometheus --web.enable-remote-write-receiver +``` + +Keep that terminal open; use another for Epimetheus. This stops when you close the terminal. 
+ +## Clearing old metrics (optional) + +If the Admin API is enabled: + +```bash +# Delete metrics by name pattern +curl -X POST -g 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]={__name__=~"blockstore_.*"}' +curl -X POST http://localhost:9090/api/v1/admin/tsdb/clean_tombstones +sleep 2 +``` + +## Verify watch mode + +```bash +cat > /tmp/test.csv << EOF +status,count,method +200,100,GET +404,50,POST +EOF + +./epimetheus -mode=watch -file=/tmp/test.csv -metric-name=test \ + -prometheus=http://localhost:9090/api/v1/write +``` + +You should see a success message. In Prometheus (http://localhost:9090), query `{__name__=~"test_.*"}`. diff --git a/docs/operations/setup-clickhouse.md b/docs/operations/setup-clickhouse.md new file mode 100644 index 0000000..acc8247 --- /dev/null +++ b/docs/operations/setup-clickhouse.md @@ -0,0 +1,43 @@ +# Setup: ClickHouse + +ClickHouse is only used in **watch mode**. Epimetheus creates the metrics table automatically if it does not exist. + +## Running ClickHouse + +- **Linux (systemd):** `sudo systemctl start clickhouse-server` +- **Docker:** Use the official [ClickHouse image](https://hub.docker.com/r/clickhouse/clickhouse-server) and expose the HTTP interface (default port 8123). +- **Kubernetes:** Deploy ClickHouse and expose a Service; use the HTTP URL (e.g. `http://clickhouse.monitoring.svc.cluster.local:8123`) as `-clickhouse`. + +Default HTTP port is **8123**. Epimetheus uses the HTTP interface, not the native protocol. + +## Table Creation + +You do not need to create the table manually. On first ingest, Epimetheus runs: + +```sql +CREATE TABLE IF NOT EXISTS epimetheus_metrics ( + metric String, + labels Map(String, String), + value Float64, + timestamp DateTime64(3) +) ENGINE = MergeTree() +ORDER BY (metric, timestamp) +``` + +To use a different table name, set `-clickhouse-table`. 
+ +## Verification + +After running watch mode with `-clickhouse` set, verify ingestion: + +```bash +./scripts/verify-clickhouse.sh +``` + +With custom URL or table: + +```bash +./scripts/verify-clickhouse.sh http://localhost:8123 epimetheus_metrics +``` + +The script checks connectivity (`/ping`), row count, distinct metrics, sample rows, and rows per metric. If the table is empty or missing, it prints a reminder command to run Epimetheus in watch mode with `-clickhouse`. See [ClickHouse backend](../backends/clickhouse.md) for usage. diff --git a/docs/operations/setup-prometheus.md b/docs/operations/setup-prometheus.md new file mode 100644 index 0000000..294ce20 --- /dev/null +++ b/docs/operations/setup-prometheus.md @@ -0,0 +1,82 @@ +# Setup: Prometheus + +To use historic mode, backfill mode, auto mode with old data, or watch mode with `-prometheus`, you must enable the Prometheus Remote Write receiver. Without it, Epimetheus can only push realtime data via Pushgateway. + +## 1. Enable Remote Write Receiver and Admin API + +Example configuration (Prometheus 3.x style). Adjust paths and stack to match your environment (e.g. [conf repository](https://codeberg.org/snonux/conf) at `f3s/prometheus/persistence-values.yaml`): + +```yaml +prometheus: + prometheusSpec: + additionalArgs: + - name: web.enable-remote-write-receiver + value: "" + - name: web.enable-admin-api + value: "" + + enableFeatures: + - exemplar-storage + - otlp-write-receiver + + tsdb: + outOfOrderTimeWindow: 744h # 31 days for backfilling +``` + +This provides: + +- **Remote Write API** at `/api/v1/write` for ingesting metrics with custom timestamps. +- **Admin API** at `/api/v1/admin/tsdb/*` for deleting series and cleaning tombstones. +- **Out-of-order ingestion** so older points can be written for existing series (within the time window). + +After changing config, upgrade Prometheus (e.g. `helm upgrade` or your usual apply). 
+ +### Verify + +```bash +# Remote Write receiver +kubectl get pod -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 \ + -o jsonpath='{.spec.containers[0].args}' | grep -o "web.enable-remote-write-receiver" + +# Out-of-order window +kubectl get prometheus -n monitoring prometheus-kube-prometheus-prometheus \ + -o jsonpath='{.spec.tsdb.outOfOrderTimeWindow}' + +# Admin API +kubectl get pod -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 \ + -o jsonpath='{.spec.containers[0].args}' | grep -o "web.enable-admin-api" +``` + +**Note:** In Prometheus 3.x use `additionalArgs` for `web.enable-remote-write-receiver`; the older `enableFeatures: [remote-write-receiver]` is deprecated. + +## 2. Scrape Config for Pushgateway + +For realtime mode, Prometheus must scrape Pushgateway. Example: + +```yaml +# additional-scrape-configs.yaml +- job_name: 'pushgateway' + honor_labels: true + static_configs: + - targets: + - 'pushgateway.monitoring.svc.cluster.local:9091' +``` + +Apply as a Secret (example): + +```bash +kubectl create secret generic additional-scrape-configs \ + --from-file=additional-scrape-configs.yaml \ + --dry-run=client -o yaml -n monitoring | kubectl apply -f - +``` + +## 3. Retention + +Check retention so you know how far back Epimetheus can write: + +```bash +kubectl get prometheus -n monitoring prometheus-kube-prometheus-prometheus \ + -o jsonpath='{.spec.retention}' +``` + +For very old data, increase retention or use a dedicated dev/test Prometheus. Enabling out-of-order ingestion and a large `outOfOrderTimeWindow` has memory and I/O trade-offs; see [Prometheus backend](../backends/prometheus.md) and keep production config conservative. 
diff --git a/docs/operations/troubleshooting.md b/docs/operations/troubleshooting.md new file mode 100644 index 0000000..9446508 --- /dev/null +++ b/docs/operations/troubleshooting.md @@ -0,0 +1,43 @@ +# Troubleshooting + +## Binary can't connect to Pushgateway + +- Confirm a port-forward or route to Pushgateway is running, e.g. `ps aux | grep "port-forward.*9091"`. +- Restart port-forward: `kubectl port-forward -n monitoring svc/pushgateway 9091:9091`. +- Ensure `-pushgateway` points at the URL you use (e.g. `http://localhost:9091`). + +## Metrics not appearing in Prometheus + +- **Pushgateway:** `curl http://localhost:9091/metrics | grep "epimetheus_test"` (or your job/metric name). If empty, Epimetheus may not be pushing or the job name may differ. +- **Scrape:** In Prometheus UI (e.g. http://localhost:9090/targets), check that the Pushgateway job exists and is up. +- **Logs:** `kubectl logs -n monitoring -l app.kubernetes.io/name=prometheus` (or your Prometheus pod) for scrape/remote-write errors. + +## "Remote write receiver not enabled" error + +Prometheus must be started with the Remote Write receiver enabled. Verify: + +```bash +kubectl logs -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 | grep "remote-write-receiver" +``` + +You should see the feature listed in the enabled features. If not, add `web.enable-remote-write-receiver` (see [Setup: Prometheus](setup-prometheus.md)) and restart Prometheus. + +## "Out of order sample" error + +You are writing a sample older than existing data for the same series. + +- Use different labels for historic data (e.g. `job="historic_data"`), or +- Enable out-of-order ingestion on Prometheus and set `tsdb.outOfOrderTimeWindow` (see [Setup: Prometheus](setup-prometheus.md)), or +- Run backfills from oldest to newest. + +## Dashboard not appearing in Grafana + +- Check the dashboard ConfigMap exists: `kubectl get configmap -n monitoring | grep epimetheus`.
+- Ensure the ConfigMap has the label Grafana uses for dashboard discovery (e.g. `grafana_dashboard: "1"`): `kubectl get configmap epimetheus-dashboard -n monitoring -o yaml | grep "grafana_dashboard"`. +- Restart Grafana to reload dashboards: `kubectl rollout restart deployment/prometheus-grafana -n monitoring` (adjust deployment name to your setup). + +## ClickHouse connection failed + +- Ensure ClickHouse is listening on HTTP (default port 8123): `curl -sS http://localhost:8123/ping`. +- If using Kubernetes, check Service and port-forwards. Use the same URL as `-clickhouse`. +- See [Setup: ClickHouse](setup-clickhouse.md) and [ClickHouse backend](../backends/clickhouse.md). diff --git a/docs/reference/cli.md b/docs/reference/cli.md new file mode 100644 index 0000000..83d02b0 --- /dev/null +++ b/docs/reference/cli.md @@ -0,0 +1,57 @@ +# CLI Reference + +All flags and defaults. Modes: `realtime`, `historic`, `backfill`, `auto`, `watch`. + +## Global + +| Flag | Default | Description | +|------|---------|-------------| +| `-version` | — | Print version and exit | +| `-mode` | `realtime` | Mode: realtime, historic, backfill, auto, or watch | + +## Realtime + +| Flag | Default | Description | +|------|---------|-------------| +| `-pushgateway` | `http://localhost:9091` | Pushgateway URL | +| `-job` | `example_metrics_pusher` | Job name for metrics | +| `-continuous` | `false` | Push every 15s | + +## Historic + +| Flag | Default | Description | +|------|---------|-------------| +| `-prometheus` | `http://localhost:9090/api/v1/write` | Prometheus Remote Write URL | +| `-hours-ago` | `24` | Hours in the past (single datapoint) | + +## Backfill + +| Flag | Default | Description | +|------|---------|-------------| +| `-prometheus` | `http://localhost:9090/api/v1/write` | Prometheus Remote Write URL | +| `-start-hours` | `48` | Start time in hours ago | +| `-end-hours` | `0` | End time in hours ago (0 = now) | +| `-interval` | `1` | Interval between points in hours | + 
+## Auto + +| Flag | Default | Description | +|------|---------|-------------| +| `-file` | — | Input file path (required) | +| `-format` | `csv` | Input format: csv or json | +| `-pushgateway` | `http://localhost:9091` | Pushgateway URL | +| `-prometheus` | `http://localhost:9090/api/v1/write` | Prometheus Remote Write URL | + +## Watch + +| Flag | Default | Description | +|------|---------|-------------| +| `-file` | — | CSV file(s) to watch (comma-separated for multiple); required | +| `-metric-name` | — | Base metric name (e.g. myapp, food); required | +| `-prometheus` | `http://localhost:9090/api/v1/write` | Prometheus Remote Write URL (set to empty to disable) | +| `-clickhouse` | — | ClickHouse HTTP URL (e.g. http://localhost:8123) | +| `-clickhouse-table` | `epimetheus_metrics` | ClickHouse table name | +| `-job` | `example_metrics_pusher` | Job name for metrics | +| `-resolve-ip-labels` | (ip only) | Comma-separated additional IP labels to resolve via DNS | + +Watch mode requires at least one of `-prometheus` or `-clickhouse`. Use `-prometheus=` to ingest only to ClickHouse. diff --git a/docs/reference/example-queries.md b/docs/reference/example-queries.md new file mode 100644 index 0000000..e78aaec --- /dev/null +++ b/docs/reference/example-queries.md @@ -0,0 +1,66 @@ +# Example Queries + +PromQL and curl examples for Epimetheus test metrics. Use your Prometheus (or Prometheus-compatible) query URL; after port-forward, that is often http://localhost:9090. 
+ +## Basic PromQL + +```promql +# Total requests +epimetheus_test_requests_total + +# Request rate (last 5 minutes) +rate(epimetheus_test_requests_total[5m]) + +# Active connections +epimetheus_test_active_connections + +# Temperature +epimetheus_test_temperature_celsius +``` + +## Histogram + +```promql +# 95th percentile request duration +histogram_quantile(0.95, rate(epimetheus_test_request_duration_seconds_bucket[5m])) + +# Median (50th percentile) +histogram_quantile(0.50, rate(epimetheus_test_request_duration_seconds_bucket[5m])) + +# Average request duration +rate(epimetheus_test_request_duration_seconds_sum[5m]) / +rate(epimetheus_test_request_duration_seconds_count[5m]) +``` + +## Labeled counter + +```promql +# Failed jobs by type +epimetheus_test_jobs_processed_total{status="failed"} + +# Job success rate +rate(epimetheus_test_jobs_processed_total{status="success"}[5m]) / +rate(epimetheus_test_jobs_processed_total[5m]) + +# Total jobs by type +sum by (job_type) (epimetheus_test_jobs_processed_total) +``` + +## Curl (HTTP API) + +```bash +# Port-forward if needed +kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 & + +# Total requests +curl -s "http://localhost:9090/api/v1/query?query=epimetheus_test_requests_total" | jq . + +# Temperature +curl -s "http://localhost:9090/api/v1/query?query=epimetheus_test_temperature_celsius" | jq . + +# Request rate +curl -s "http://localhost:9090/api/v1/query?query=rate(epimetheus_test_requests_total[5m])" | jq . + +# Histogram p95 +curl -s "http://localhost:9090/api/v1/query?query=histogram_quantile(0.95,rate(epimetheus_test_request_duration_seconds_bucket[5m]))" | jq . +``` diff --git a/docs/reference/grafana-dashboard.md b/docs/reference/grafana-dashboard.md new file mode 100644 index 0000000..b7f2030 --- /dev/null +++ b/docs/reference/grafana-dashboard.md @@ -0,0 +1,50 @@ +# Grafana Dashboard + +A dashboard is provided that shows all Epimetheus test metrics. + +## Panels + +1. 
Request Rate (line graph) +2. Total Requests (stat) +3. Active Connections (gauge with thresholds) +4. Temperature (gauge with thresholds) +5. Request Duration Histogram (p50, p90, p99) +6. Average Request Duration (stat) +7. Jobs Processed by Type (bar gauge) +8. Jobs Status Breakdown (table) + +Auto-refresh: 10 seconds. Time range: last 15 minutes (configurable). Optimized for dark theme. + +## Deployment + +### Option 1: Kubernetes ConfigMap (recommended) + +If you have a manifest that defines the dashboard as a ConfigMap with Grafana’s discovery label: + +```bash +kubectl apply -f ../prometheus/epimetheus-dashboard.yaml +``` + +Grafana will pick it up automatically. + +### Option 2: Manual import + +1. Port-forward Grafana: `kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80` +2. Open http://localhost:3000 +3. Dashboards → Import → Upload `grafana-dashboard.json` + +### Option 3: Deploy script + +```bash +./scripts/deploy-dashboard.sh +# Or with credentials: +GRAFANA_URL="http://localhost:3000" GRAFANA_USER="admin" GRAFANA_PASSWORD="yourpassword" ./scripts/deploy-dashboard.sh +``` + +## Datasource + +Use Prometheus (or a Prometheus-compatible backend such as VictoriaMetrics) as the datasource. Point it at the same instance Epimetheus writes to (e.g. http://localhost:9090 after port-forward). + +## Panel guidelines + +When creating or updating Grafana panels, follow the project’s [AGENT.md](../../AGENT.md) (Grafana dashboard guidelines): e.g. sort time series by last value descending, use `sort_desc()` in bar gauges, set table sort options as specified. diff --git a/docs/reference/magefile.md b/docs/reference/magefile.md new file mode 100644 index 0000000..0ce0b0d --- /dev/null +++ b/docs/reference/magefile.md @@ -0,0 +1,67 @@ +# Magefile Reference + +Epimetheus uses [Mage](https://magefile.org/) for build, test, and run targets. The build logic lives in `Magefile.go` at the repo root. 
+ +## Prerequisites + +```bash +go install github.com/magefile/mage@latest +``` + +## Default Target + +Running `mage` with no arguments runs **Build**. + +## Targets + +| Target | Description | Example | +|--------|-------------|---------| +| `build` | Compile the epimetheus binary | `mage build` | +| `install` | Install binary to `$GOPATH/bin` | `mage install` | +| `run` | Build and run in realtime mode (continuous) | `mage run` | +| `runHistoric` | Build and run historic mode (24h ago) | `mage runHistoric` | +| `runAuto <file>` | Build and run auto mode with a file | `mage runAuto test-all-ages.csv` | +| `runWatchClickHouse [file]` | Build and run watch mode with ClickHouse only | `mage runWatchClickHouse` or `mage runWatchClickHouse my.csv` | +| `test` | Run all tests | `mage test` | +| `testCoverage` | Run tests and open coverage report | `mage testCoverage` | +| `testRace` | Run tests with race detector | `mage testRace` | +| `benchmark` | Run Go benchmarks | `mage benchmark` | +| `lint` | Run golangci-lint | `mage lint` | +| `fmt` | Format all Go code | `mage fmt` | +| `vet` | Run go vet | `mage vet` | +| `tidy` | Run go mod tidy | `mage tidy` | +| `clean` | Remove binary and coverage artifacts | `mage clean` | +| `generate` | Run go generate | `mage generate` | +| `version` | Build and print version | `mage version` | +| `all` | Run fmt, vet, test, and build | `mage all` | +| `ci` | Tidy, vet, test, and build (CI pipeline) | `mage ci` | +| `dev` | Build, port-forward Pushgateway, run realtime mode | `mage dev` | +| `generateTestData` | Generate test data files | `mage generateTestData` | +| `backfill` | Run backfill for last 48 hours | `mage backfill` | +| `benchmark100MB` | Run 100MB benchmark script | `mage benchmark100MB` | +| `benchmark1GB` | Run 1GB benchmark script | `mage benchmark1GB` | +| `cleanupBenchmarkData` | Clean benchmark data from Prometheus | `mage cleanupBenchmarkData` | +| `cleanupBenchmarkMetrics` | Clean benchmark metric files | `mage 
cleanupBenchmarkMetrics` | +| `deployDashboard` | Deploy Grafana dashboard via script | `mage deployDashboard` | +| `help` | Print list of targets | `mage help` | + +## Examples + +```bash +# Build and run realtime mode +mage run + +# Run tests with coverage +mage testCoverage + +# Run watch mode with ClickHouse (default test file) +mage runWatchClickHouse + +# Run watch mode with your CSV +mage runWatchClickHouse /path/to/data.csv + +# Full CI checks +mage ci +``` + +See [Quick Start](../guides/quickstart.md) and [CLI Reference](cli.md) for more on running Epimetheus. diff --git a/docs/reference/test-metrics.md b/docs/reference/test-metrics.md new file mode 100644 index 0000000..a1af41e --- /dev/null +++ b/docs/reference/test-metrics.md @@ -0,0 +1,35 @@ +# Test Metrics + +Generated metrics use the `epimetheus_test_` prefix so they are easy to identify as test data. + +## Counter: `epimetheus_test_requests_total` + +- **Type:** Counter (monotonically increasing) +- **Description:** Total number of requests processed +- **Use case:** Total events, requests, errors + +## Gauge: `epimetheus_test_active_connections` + +- **Type:** Gauge (can increase or decrease) +- **Description:** Current number of active connections (0–100) +- **Use case:** Current state, capacity + +## Gauge: `epimetheus_test_temperature_celsius` + +- **Type:** Gauge +- **Description:** Current temperature in Celsius (0–50°C) +- **Use case:** Environmental monitoring + +## Histogram: `epimetheus_test_request_duration_seconds` + +- **Type:** Histogram (distribution) +- **Description:** Request duration distribution +- **Buckets:** 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10 seconds +- **Use case:** Latency, SLO tracking + +## Labeled counter: `epimetheus_test_jobs_processed_total` + +- **Type:** Counter with labels +- **Description:** Jobs processed by type and status +- **Labels:** `job_type` (email, report, backup), `status` (success, failed) +- **Use case:** Categorized counting, 
multi-dimensional metrics diff --git a/backfill-historic-data.sh b/scripts/backfill-historic-data.sh index fa0e065..c755da7 100755..100644 --- a/backfill-historic-data.sh +++ b/scripts/backfill-historic-data.sh @@ -1,8 +1,13 @@ #!/bin/bash # Backfill historic data to Prometheus for Epimetheus dashboard +# Run from repo root: ./scripts/backfill-historic-data.sh set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +cd "$REPO_ROOT" + echo "=== Epimetheus Historic Data Backfill ===" echo "" echo "This script will populate Prometheus with historic test data" @@ -49,8 +54,6 @@ if [ $EXIT_CODE -eq 0 ]; then echo " - 1 day ago" echo " - 12 hours ago" echo " - Now (from previous realtime push)" - echo "" - echo "View the dashboard at: https://grafana.f3s.buetow.org/d/epimetheus-test/epimetheus-test-metrics" else echo "" echo "❌ Backfill failed with exit code $EXIT_CODE" diff --git a/benchmark-100mb.sh b/scripts/benchmark-100mb.sh index 1d3fad0..bda6476 100755..100644 --- a/benchmark-100mb.sh +++ b/scripts/benchmark-100mb.sh @@ -1,9 +1,14 @@ #!/bin/bash # Benchmark script: Generate and ingest 100MB of historic metrics # This tests Epimetheus performance with large-scale data ingestion +# Run from repo root: ./scripts/benchmark-100mb.sh set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +cd "$REPO_ROOT" + # Optimize Go GC for better performance (Phase 3 optimization) export GOGC=200 # Reduce GC frequency (default 100) export GOMEMLIMIT=3GiB # Set memory limit for Go 1.19+ @@ -34,10 +39,6 @@ echo "Estimated lines needed: $TARGET_LINES" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" # Generate data going back 7 days with 1-minute intervals -# This gives us ~10,080 data points across 7 days -# We'll generate multiple metrics per timestamp to reach 100MB -# All data is historic (> 5 minutes old) to use Remote Write API exclusively - GENERATION_START=$(date +%s) NOW=$(date +%s)000 # Current time in milliseconds @@ -51,7 +52,6 @@ cat > benchmark-data-100mb.csv << 'EOF' EOF # Generate metrics -# We'll create ~150 unique time series, each with ~10,000 data points = 1.5M samples METRICS=( "epimetheus_benchmark_cpu_usage" "epimetheus_benchmark_memory_bytes" @@ -80,7 +80,6 @@ LINES_GENERATED=0 for ((i=0; i<TOTAL_INTERVALS; i++)); do TIMESTAMP=$((SEVEN_DAYS_AGO + (i * INTERVAL_MS))) - # Generate a sample for each metric x instance combination for METRIC in "${METRICS[@]}"; do for INSTANCE in "${INSTANCES[@]}"; do VALUE=$((RANDOM % 1000)) @@ -89,7 +88,6 @@ for ((i=0; i<TOTAL_INTERVALS; i++)); do done done - # Progress indicator every 1000 intervals if [ $((i % 1000)) -eq 0 ]; then PROGRESS=$((i * 100 / TOTAL_INTERVALS)) echo -ne "\rProgress: $PROGRESS% ($LINES_GENERATED lines)" | tee -a "$RESULT_FILE" @@ -101,7 +99,6 @@ echo "" | tee -a "$RESULT_FILE" GENERATION_END=$(date +%s) GENERATION_TIME=$((GENERATION_END - GENERATION_START)) -# Get actual file size FILE_SIZE=$(stat -f%z benchmark-data-100mb.csv 2>/dev/null || stat -c%s benchmark-data-100mb.csv 2>/dev/null) FILE_SIZE_MB=$((FILE_SIZE / 1024 / 1024)) @@ -117,18 +114,15 @@ echo "Step 2: Setting up port-forward to Prometheus..." | tee -a "$RESULT_FILE" kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 > /tmp/benchmark-pf.log 2>&1 & PF_PID=$! 
echo "Port-forward started (PID: $PF_PID)" | tee -a "$RESULT_FILE" -sleep 8 # Wait for port-forward to be ready +sleep 8 echo "" | tee -a "$RESULT_FILE" # Step 3: Get baseline Prometheus metrics echo "Step 3: Collecting baseline Prometheus metrics..." | tee -a "$RESULT_FILE" PROM_POD=$(kubectl get pod -n monitoring -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].metadata.name}') echo "Prometheus pod: $PROM_POD" | tee -a "$RESULT_FILE" - -# Get memory and CPU usage before ingestion BASELINE_MEMORY=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $3}') BASELINE_CPU=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $2}') - echo " Baseline memory: $BASELINE_MEMORY" | tee -a "$RESULT_FILE" echo " Baseline CPU: $BASELINE_CPU" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" @@ -136,18 +130,9 @@ echo "" | tee -a "$RESULT_FILE" # Step 4: Run ingestion benchmark echo "Step 4: Running ingestion benchmark..." | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" - INGEST_START=$(date +%s.%N) -# Run epimetheus with time measurement -# Use CSV mode with Remote Write API (all data is historic) -# Note: We can't use auto mode because it requires both Pushgateway and Remote Write -# Instead, we'll implement a direct CSV->Remote Write ingestion - echo "Parsing CSV and preparing for Remote Write ingestion..." | tee -a "$RESULT_FILE" - -# For now, use backfill mode to process the CSV data -# We'll need to enhance epimetheus to support pure CSV->RemoteWrite mode echo "WARNING: Using auto mode - this may fail if data is too recent" | tee -a "$RESULT_FILE" echo "Continuing with Remote Write API for historic data..." | tee -a "$RESULT_FILE" @@ -157,37 +142,30 @@ echo "Continuing with Remote Write API for historic data..." 
| tee -a "$RESULT_F -format=csv \ -prometheus=http://localhost:9090/api/v1/write \ -pushgateway=http://localhost:9091 \ - 2>&1 | tee -a "$RESULT_FILE" || true # Continue even if pushgateway fails + 2>&1 | tee -a "$RESULT_FILE" || true INGEST_END=$(date +%s.%N) - -# Calculate ingestion time INGEST_TIME=$(echo "$INGEST_END - $INGEST_START" | bc) echo "" | tee -a "$RESULT_FILE" echo "Ingestion complete:" | tee -a "$RESULT_FILE" echo " Total time: ${INGEST_TIME}s" | tee -a "$RESULT_FILE" - -# Calculate throughput SAMPLES_PER_SECOND=$(echo "scale=2; $LINES_GENERATED / $INGEST_TIME" | bc) MB_PER_SECOND=$(echo "scale=2; $FILE_SIZE_MB / $INGEST_TIME" | bc) - echo " Samples/second: $SAMPLES_PER_SECOND" | tee -a "$RESULT_FILE" echo " MB/second: $MB_PER_SECOND" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" -# Step 5: Get post-ingestion Prometheus metrics +# Step 5: Post-ingestion metrics echo "Step 5: Collecting post-ingestion Prometheus metrics..." | tee -a "$RESULT_FILE" -sleep 5 # Wait for metrics to stabilize - +sleep 5 POST_MEMORY=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $3}') POST_CPU=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $2}') - echo " Post-ingestion memory: $POST_MEMORY" | tee -a "$RESULT_FILE" echo " Post-ingestion CPU: $POST_CPU" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" -# Step 6: Query some data to verify ingestion +# Step 6: Verify echo "Step 6: Verifying data ingestion..." | tee -a "$RESULT_FILE" QUERY_RESULT=$(curl -s "http://localhost:9090/api/v1/query?query=count(epimetheus_benchmark_cpu_usage)" | jq -r '.data.result[0].value[1]') echo " Samples found for epimetheus_benchmark_cpu_usage: $QUERY_RESULT" | tee -a "$RESULT_FILE" @@ -198,7 +176,6 @@ echo "Step 7: Cleaning up..." 
| tee -a "$RESULT_FILE" kill $PF_PID 2>/dev/null || true echo "" | tee -a "$RESULT_FILE" -# Summary echo "=== BENCHMARK SUMMARY ===" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" echo "Dataset:" | tee -a "$RESULT_FILE" diff --git a/benchmark-1gb.sh b/scripts/benchmark-1gb.sh index f715376..35176b0 100755..100644 --- a/benchmark-1gb.sh +++ b/scripts/benchmark-1gb.sh @@ -1,9 +1,14 @@ #!/bin/bash # Benchmark script: Generate and ingest 1GB of historic metrics # This tests Epimetheus performance with large-scale data ingestion +# Run from repo root: ./scripts/benchmark-1gb.sh set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +cd "$REPO_ROOT" + # Optimize Go GC for better performance (Phase 3 optimization) export GOGC=200 # Reduce GC frequency (default 100) export GOMEMLIMIT=3GiB # Set memory limit for Go 1.19+ @@ -23,7 +28,6 @@ echo "" | tee -a "$RESULT_FILE" echo "Step 1: Generating 1GB of test data..." | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" -# Calculate: ~80 bytes per line, 1GB = ~13M lines TARGET_SIZE_MB=1000 TARGET_BYTES=$((TARGET_SIZE_MB * 1024 * 1024)) BYTES_PER_LINE=80 @@ -33,25 +37,17 @@ echo "Target size: ${TARGET_SIZE_MB}MB" | tee -a "$RESULT_FILE" echo "Estimated lines needed: $TARGET_LINES" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" -# Generate data going back 30 days with 30-second intervals -# This gives us ~86,400 data points across 30 days (respects Prometheus 720h out-of-order limit) -# We'll generate multiple metrics per timestamp to reach 1GB -# All data is historic (> 5 minutes old) to use Remote Write API exclusively - GENERATION_START=$(date +%s) -NOW=$(date +%s)000 # Current time in milliseconds -ONE_HOUR_AGO=$((NOW - 3600000)) # Start from 1 hour ago to ensure all data is historic -THIRTY_DAYS_AGO=$((ONE_HOUR_AGO - 2592000000)) # 30 days before that (30 * 24 * 60 * 60 * 1000) +NOW=$(date +%s)000 +ONE_HOUR_AGO=$((NOW - 3600000)) 
+THIRTY_DAYS_AGO=$((ONE_HOUR_AGO - 2592000000)) -# CSV header cat > benchmark-data-1gb.csv << 'EOF' # Prometheus metrics - 1GB benchmark dataset # Format: metric_name,labels,value,timestamp_ms EOF -# Generate metrics -# We'll create ~150 unique time series, each with ~86,400 data points = 13M samples METRICS=( "epimetheus_benchmark_cpu_usage" "epimetheus_benchmark_memory_bytes" @@ -71,8 +67,8 @@ INSTANCES=( "db-01" "db-02" "db-03" "worker-01" "worker-02" ) -INTERVAL_MS=30000 # 30 second interval (to maintain 1GB size with 30 days) -TOTAL_INTERVALS=86400 # 30 days of 30-second intervals +INTERVAL_MS=30000 +TOTAL_INTERVALS=86400 echo "Generating data..." | tee -a "$RESULT_FILE" LINES_GENERATED=0 @@ -80,7 +76,6 @@ LINES_GENERATED=0 for ((i=0; i<TOTAL_INTERVALS; i++)); do TIMESTAMP=$((THIRTY_DAYS_AGO + (i * INTERVAL_MS))) - # Generate a sample for each metric x instance combination for METRIC in "${METRICS[@]}"; do for INSTANCE in "${INSTANCES[@]}"; do VALUE=$((RANDOM % 1000)) @@ -89,7 +84,6 @@ for ((i=0; i<TOTAL_INTERVALS; i++)); do done done - # Progress indicator every 5000 intervals if [ $((i % 5000)) -eq 0 ]; then PROGRESS=$((i * 100 / TOTAL_INTERVALS)) echo -ne "\rProgress: $PROGRESS% ($LINES_GENERATED lines)" | tee -a "$RESULT_FILE" @@ -101,7 +95,6 @@ echo "" | tee -a "$RESULT_FILE" GENERATION_END=$(date +%s) GENERATION_TIME=$((GENERATION_END - GENERATION_START)) -# Get actual file size FILE_SIZE=$(stat -f%z benchmark-data-1gb.csv 2>/dev/null || stat -c%s benchmark-data-1gb.csv 2>/dev/null) FILE_SIZE_MB=$((FILE_SIZE / 1024 / 1024)) @@ -112,42 +105,30 @@ echo " File size: ${FILE_SIZE_MB}MB ($FILE_SIZE bytes)" | tee -a "$RESULT_FILE" echo " Generation time: ${GENERATION_TIME}s" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" -# Step 2: Start port-forward to Prometheus +# Step 2: Port-forward echo "Step 2: Setting up port-forward to Prometheus..." 
| tee -a "$RESULT_FILE" kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 > /tmp/benchmark-pf.log 2>&1 & PF_PID=$! echo "Port-forward started (PID: $PF_PID)" | tee -a "$RESULT_FILE" -sleep 8 # Wait for port-forward to be ready +sleep 8 echo "" | tee -a "$RESULT_FILE" -# Step 3: Get baseline Prometheus metrics +# Step 3: Baseline echo "Step 3: Collecting baseline Prometheus metrics..." | tee -a "$RESULT_FILE" PROM_POD=$(kubectl get pod -n monitoring -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].metadata.name}') echo "Prometheus pod: $PROM_POD" | tee -a "$RESULT_FILE" - -# Get memory and CPU usage before ingestion BASELINE_MEMORY=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $3}') BASELINE_CPU=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $2}') - echo " Baseline memory: $BASELINE_MEMORY" | tee -a "$RESULT_FILE" echo " Baseline CPU: $BASELINE_CPU" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" -# Step 4: Run ingestion benchmark +# Step 4: Ingest echo "Step 4: Running ingestion benchmark..." | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" - INGEST_START=$(date +%s.%N) -# Run epimetheus with time measurement -# Use CSV mode with Remote Write API (all data is historic) -# Note: We can't use auto mode because it requires both Pushgateway and Remote Write -# Instead, we'll implement a direct CSV->Remote Write ingestion - echo "Parsing CSV and preparing for Remote Write ingestion..." | tee -a "$RESULT_FILE" - -# For now, use backfill mode to process the CSV data -# We'll need to enhance epimetheus to support pure CSV->RemoteWrite mode echo "WARNING: Using auto mode - this may fail if data is too recent" | tee -a "$RESULT_FILE" echo "Continuing with Remote Write API for historic data..." | tee -a "$RESULT_FILE" @@ -157,37 +138,30 @@ echo "Continuing with Remote Write API for historic data..." 
| tee -a "$RESULT_F -format=csv \ -prometheus=http://localhost:9090/api/v1/write \ -pushgateway=http://localhost:9091 \ - 2>&1 | tee -a "$RESULT_FILE" || true # Continue even if pushgateway fails + 2>&1 | tee -a "$RESULT_FILE" || true INGEST_END=$(date +%s.%N) - -# Calculate ingestion time INGEST_TIME=$(echo "$INGEST_END - $INGEST_START" | bc) echo "" | tee -a "$RESULT_FILE" echo "Ingestion complete:" | tee -a "$RESULT_FILE" echo " Total time: ${INGEST_TIME}s" | tee -a "$RESULT_FILE" - -# Calculate throughput SAMPLES_PER_SECOND=$(echo "scale=2; $LINES_GENERATED / $INGEST_TIME" | bc) MB_PER_SECOND=$(echo "scale=2; $FILE_SIZE_MB / $INGEST_TIME" | bc) - echo " Samples/second: $SAMPLES_PER_SECOND" | tee -a "$RESULT_FILE" echo " MB/second: $MB_PER_SECOND" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" -# Step 5: Get post-ingestion Prometheus metrics +# Step 5: Post-ingestion echo "Step 5: Collecting post-ingestion Prometheus metrics..." | tee -a "$RESULT_FILE" -sleep 5 # Wait for metrics to stabilize - +sleep 5 POST_MEMORY=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $3}') POST_CPU=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $2}') - echo " Post-ingestion memory: $POST_MEMORY" | tee -a "$RESULT_FILE" echo " Post-ingestion CPU: $POST_CPU" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" -# Step 6: Query some data to verify ingestion +# Step 6: Verify echo "Step 6: Verifying data ingestion..." | tee -a "$RESULT_FILE" QUERY_RESULT=$(curl -s "http://localhost:9090/api/v1/query?query=count(epimetheus_benchmark_cpu_usage)" | jq -r '.data.result[0].value[1]') echo " Samples found for epimetheus_benchmark_cpu_usage: $QUERY_RESULT" | tee -a "$RESULT_FILE" @@ -198,7 +172,6 @@ echo "Step 7: Cleaning up..." 
| tee -a "$RESULT_FILE" kill $PF_PID 2>/dev/null || true echo "" | tee -a "$RESULT_FILE" -# Summary echo "=== BENCHMARK SUMMARY ===" | tee -a "$RESULT_FILE" echo "" | tee -a "$RESULT_FILE" echo "Dataset:" | tee -a "$RESULT_FILE" diff --git a/cleanup-benchmark-data.sh b/scripts/cleanup-benchmark-data.sh index a5409f1..48ba187 100755..100644 --- a/cleanup-benchmark-data.sh +++ b/scripts/cleanup-benchmark-data.sh @@ -1,6 +1,7 @@ #!/bin/bash # Cleanup script: Delete benchmark data from Prometheus # This uses the Prometheus Admin API to selectively remove benchmark metrics +# Run from repo root: ./scripts/cleanup-benchmark-data.sh [prometheus_url] set -e diff --git a/cleanup-benchmark-metrics.sh b/scripts/cleanup-benchmark-metrics.sh index d70aa95..7b1ce4e 100755..100644 --- a/cleanup-benchmark-metrics.sh +++ b/scripts/cleanup-benchmark-metrics.sh @@ -1,6 +1,7 @@ #!/bin/bash # Cleanup benchmark metrics from Prometheus # This allows running benchmarks from a clean state +# Run from repo root: ./scripts/cleanup-benchmark-metrics.sh set -e @@ -63,8 +64,7 @@ elif [ "$ADMIN_CHECK" = "405" ]; then echo " value: \"\"" echo "" echo "Then upgrade Prometheus:" - echo " cd /home/paul/git/conf/f3s/prometheus" - echo " just upgrade" + echo " helm upgrade ... (or: just upgrade in your conf repo)" echo "" echo "WARNING: Admin API should only be enabled in development/test environments!" 
echo "" diff --git a/generate-test-data.sh b/scripts/generate-test-data.sh index a4a0b1b..4db332e 100755..100644 --- a/generate-test-data.sh +++ b/scripts/generate-test-data.sh @@ -1,6 +1,7 @@ #!/bin/bash # Generate test data with actual timestamps for different time ranges +# Run from repo root: ./scripts/generate-test-data.sh NOW=$(date +%s)000 # Current time in milliseconds ONE_HOUR_AGO=$((NOW - 3600000)) @@ -8,6 +9,10 @@ ONE_DAY_AGO=$((NOW - 86400000)) ONE_WEEK_AGO=$((NOW - 604800000)) ONE_MONTH_AGO=$((NOW - 2592000000)) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +cd "$REPO_ROOT" + cat > test-all-ages.csv << EOF # Prometheus metrics in CSV format demonstrating all time ranges # Format: metric_name,labels,value,timestamp_ms @@ -39,8 +44,8 @@ app_temperature_celsius,instance=1m_ago;zone=africa,28.7,$ONE_MONTH_AGO EOF echo "Generated test-all-ages.csv with the following timestamps:" -echo " Current: $NOW ($(date -d @$((NOW/1000)) '+%Y-%m-%d %H:%M:%S'))" -echo " 1h ago: $ONE_HOUR_AGO ($(date -d @$((ONE_HOUR_AGO/1000)) '+%Y-%m-%d %H:%M:%S'))" -echo " 1d ago: $ONE_DAY_AGO ($(date -d @$((ONE_DAY_AGO/1000)) '+%Y-%m-%d %H:%M:%S'))" -echo " 1w ago: $ONE_WEEK_AGO ($(date -d @$((ONE_WEEK_AGO/1000)) '+%Y-%m-%d %H:%M:%S'))" -echo " 1m ago: $ONE_MONTH_AGO ($(date -d @$((ONE_MONTH_AGO/1000)) '+%Y-%m-%d %H:%M:%S'))" +echo " Current: $NOW ($(date -d @$((NOW/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -r $((NOW/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null))" +echo " 1h ago: $ONE_HOUR_AGO ($(date -d @$((ONE_HOUR_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -r $((ONE_HOUR_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null))" +echo " 1d ago: $ONE_DAY_AGO ($(date -d @$((ONE_DAY_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -r $((ONE_DAY_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null))" +echo " 1w ago: $ONE_WEEK_AGO ($(date -d @$((ONE_WEEK_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -r $((ONE_WEEK_AGO/1000)) 
'+%Y-%m-%d %H:%M:%S' 2>/dev/null))" +echo " 1m ago: $ONE_MONTH_AGO ($(date -d @$((ONE_MONTH_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -r $((ONE_MONTH_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null))" diff --git a/run.sh b/scripts/run.sh index 38637cf..d603639 100755..100644 --- a/run.sh +++ b/scripts/run.sh @@ -2,6 +2,7 @@ # Simple script to run Epimetheus # Automatically sets up port-forwarding and runs the binary +# Run from repo root: ./scripts/run.sh set -e @@ -18,6 +19,11 @@ echo "Step 2: Running epimetheus binary (realtime mode)..." echo "Press Ctrl+C to stop" echo "" +# Run from repo root so ./epimetheus resolves +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +cd "$REPO_ROOT" + # Run the binary in realtime mode and capture its exit status ./epimetheus -mode=realtime -continuous EXIT_CODE=$? diff --git a/verify-clickhouse.sh b/scripts/verify-clickhouse.sh index 5819f18..a9c3233 100755..100644 --- a/verify-clickhouse.sh +++ b/scripts/verify-clickhouse.sh @@ -1,6 +1,6 @@ #!/bin/bash # Verify that epimetheus metrics were successfully ingested into ClickHouse. -# Usage: ./verify-clickhouse.sh [clickhouse_url] [table_name] +# Usage: ./scripts/verify-clickhouse.sh [clickhouse_url] [table_name] # Default: http://localhost:8123, epimetheus_metrics set -e |
