summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore31
-rw-r--r--Magefile.go12
-rw-r--r--README.md999
-rw-r--r--docs/DOCS-RESTRUCTURE-PLAN.md235
-rw-r--r--docs/README.md66
-rw-r--r--docs/backends/clickhouse.md92
-rw-r--r--docs/backends/prometheus.md76
-rw-r--r--docs/design/architecture.md101
-rw-r--r--docs/guides/csv-format-flexibility.md52
-rw-r--r--docs/guides/data-formats.md49
-rw-r--r--docs/guides/dns-resolution.md42
-rw-r--r--docs/guides/dtail-metrics-example.md49
-rw-r--r--docs/guides/modes.md130
-rw-r--r--docs/guides/quickstart.md56
-rw-r--r--docs/operations/cleanup.md48
-rw-r--r--docs/operations/kubernetes.md51
-rw-r--r--docs/operations/macos-setup.md91
-rw-r--r--docs/operations/setup-clickhouse.md43
-rw-r--r--docs/operations/setup-prometheus.md82
-rw-r--r--docs/operations/troubleshooting.md43
-rw-r--r--docs/reference/cli.md57
-rw-r--r--docs/reference/example-queries.md66
-rw-r--r--docs/reference/grafana-dashboard.md50
-rw-r--r--docs/reference/magefile.md67
-rw-r--r--docs/reference/test-metrics.md35
-rw-r--r--[-rwxr-xr-x]scripts/backfill-historic-data.sh (renamed from backfill-historic-data.sh)7
-rw-r--r--[-rwxr-xr-x]scripts/benchmark-100mb.sh (renamed from benchmark-100mb.sh)43
-rw-r--r--[-rwxr-xr-x]scripts/benchmark-1gb.sh (renamed from benchmark-1gb.sh)63
-rw-r--r--[-rwxr-xr-x]scripts/cleanup-benchmark-data.sh (renamed from cleanup-benchmark-data.sh)1
-rw-r--r--[-rwxr-xr-x]scripts/cleanup-benchmark-metrics.sh (renamed from cleanup-benchmark-metrics.sh)4
-rw-r--r--[-rwxr-xr-x]scripts/generate-test-data.sh (renamed from generate-test-data.sh)15
-rw-r--r--[-rwxr-xr-x]scripts/run.sh (renamed from run.sh)6
-rw-r--r--[-rwxr-xr-x]scripts/verify-clickhouse.sh (renamed from verify-clickhouse.sh)2
33 files changed, 1715 insertions, 1049 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c0f69ee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,31 @@
+# Binaries
+/epimetheus
+prometheus-pusher
+
+# Test coverage
+coverage.out
+coverage.html
+*.prof
+
+# Logs
+*.log
+
+# OS files
+.DS_Store
+Thumbs.db
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Test data
+test-*.csv
+test-*.json
+
+# Temporary files
+tmp/
+benchmark-data-100mb.csv
+benchmark-results/
+OLD/
diff --git a/Magefile.go b/Magefile.go
index 3cce6e0..6a0b124 100644
--- a/Magefile.go
+++ b/Magefile.go
@@ -179,7 +179,7 @@ func Dev() error {
// GenerateTestData creates test data files
func GenerateTestData() error {
fmt.Println("Generating test data...")
- return sh.RunV("./generate-test-data.sh")
+ return sh.RunV("./scripts/generate-test-data.sh")
}
// Backfill runs backfill for the last 48 hours
@@ -192,31 +192,31 @@ func Backfill() error {
// Benchmark100MB runs the 100MB benchmark
func Benchmark100MB() error {
fmt.Println("Running 100MB benchmark...")
- return sh.RunV("./benchmark-100mb.sh")
+ return sh.RunV("./scripts/benchmark-100mb.sh")
}
// Benchmark1GB runs the 1GB benchmark
func Benchmark1GB() error {
fmt.Println("Running 1GB benchmark...")
- return sh.RunV("./benchmark-1gb.sh")
+ return sh.RunV("./scripts/benchmark-1gb.sh")
}
// CleanupBenchmarkData removes benchmark data from Prometheus
func CleanupBenchmarkData() error {
fmt.Println("Cleaning up benchmark data...")
- return sh.RunV("./cleanup-benchmark-data.sh")
+ return sh.RunV("./scripts/cleanup-benchmark-data.sh")
}
// CleanupBenchmarkMetrics removes benchmark metric files
func CleanupBenchmarkMetrics() error {
fmt.Println("Cleaning up benchmark metric files...")
- return sh.RunV("./cleanup-benchmark-metrics.sh")
+ return sh.RunV("./scripts/cleanup-benchmark-metrics.sh")
}
// DeployDashboard deploys the Grafana dashboard
func DeployDashboard() error {
fmt.Println("Deploying Grafana dashboard...")
- return sh.RunV("./deploy-dashboard.sh")
+ return sh.RunV("./scripts/deploy-dashboard.sh")
}
// Help prints available targets
diff --git a/README.md b/README.md
index ba10a76..10d7b97 100644
--- a/README.md
+++ b/README.md
@@ -4,993 +4,82 @@
# Epimetheus
-A versatile Go tool for pushing metrics to Prometheus with support for both realtime and historic data ingestion.
+A versatile Go tool for pushing metrics to Prometheus (and Prometheus-compatible backends like VictoriaMetrics) and ClickHouse, with support for realtime and historic data ingestion.
## Why "Epimetheus"?
-In Greek mythology, [Epimetheus](https://en.wikipedia.org/wiki/Epimetheus_(mythology)) is Prometheus's brother, whose name means "afterthought" or "hindsight" (while Prometheus means "forethought"). This name cleverly captures the tool's purpose: bringing data to Prometheus **after** collection, whether it's historic data from hours, days, or weeks ago, or realtime data pushed on-demand.
-
-While Epimetheus is sometimes depicted as foolish in myths (he accepted Pandora's box despite warnings), this tool embraces the "afterthought" aspect productively - it's never too late to bring your metrics home to Prometheus!
-
-## Architecture
-
-```
-┌─────────────────────────────────────────────────────────────────────────┐
-│ Epimetheus │
-│ (Metrics Ingestion Tool) │
-│ │
-│ Modes: │
-│ • Realtime - Current metrics (< 5 min old) │
-│ • Historic - Historic metrics (≥ 5 min old) │
-│ • Backfill - Range of historic data │
-│ • Auto - Automatic routing based on timestamp age │
-└─────────────────────────────────────────────────────────────────────────┘
- │ │
- │ Realtime Data │ Historic Data
- │ (via HTTP POST) │ (via Remote Write API)
- │ Uses "now" timestamp │ Preserves timestamps
- ▼ ▼
-┌─────────────────────┐ ┌─────────────────────┐
-│ Pushgateway │ │ Prometheus │
-│ (Port 9091) │ │ (Port 9090) │
-│ │ │ │
-│ • Buffers metrics │ │ Remote Write API: │
-│ • Scraped by │──── Scraped ─────▶ │ /api/v1/write │
-│ Prometheus │ every 15-30s │ │
-│ • No timestamp │ │ Feature Required: │
-│ preservation │ │ --enable-feature= │
-│ │ │ remote-write- │
-│ │ │ receiver │
-└─────────────────────┘ └─────────────────────┘
- │
- │ Prometheus Query API
- │ /api/v1/query
- ▼
- ┌─────────────────────┐
- │ Grafana │
- │ (Port 3000) │
- │ │
- │ • Prometheus as │
- │ datasource │
- │ • Dashboards: │
- │ - Epimetheus │
- │ Test Metrics │
- │ • Auto-refresh │
- └─────────────────────┘
-```
-
-### Data Flow
-
-1. **Realtime Path** (for current data):
- - Epimetheus → Pushgateway (HTTP POST)
- - Prometheus scrapes Pushgateway periodically
- - Timestamp = "now" when Prometheus scrapes
-
-2. **Historic Path** (for old data):
- - Epimetheus → Prometheus Remote Write API (HTTP POST)
- - Direct write to Prometheus TSDB
- - Timestamp preserved from original data
-
-3. **Visualization**:
- - Grafana queries Prometheus
- - Displays metrics in dashboards
- - Auto-refresh every 10 seconds
+In Greek mythology, [Epimetheus](https://en.wikipedia.org/wiki/Epimetheus_(mythology)) is Prometheus's brother, whose name means "afterthought" or "hindsight" (while Prometheus means "forethought"). This tool brings data to Prometheus **after** collection: historic data from hours or days ago, or realtime data pushed on-demand. It's never too late to bring your metrics home.
## Overview
-**epimetheus** is a standalone binary that:
-- **Generates** realistic example metrics simulating production applications
-- **Pushes** metrics via Pushgateway (realtime) or Remote Write API (historic)
-- **Automatically detects** timestamp age and chooses the optimal ingestion method
-- **Supports** multiple data formats (CSV, JSON) and all Prometheus metric types
-- **Provides** Grafana dashboard for visualizing test metrics
-
-## Quick Start
-
-### 1. Deploy Pushgateway (one-time setup)
-
-The Pushgateway Helm chart is available in the [conf repository](https://codeberg.org/snonux/conf) at `f3s/pushgateway/helm-chart`.
-
-```bash
-# Clone the conf repository if you haven't already
-git clone https://codeberg.org/snonux/conf.git
-cd conf/f3s/pushgateway/helm-chart
-
-# Deploy Pushgateway
-helm upgrade --install pushgateway . -n monitoring --create-namespace
-```
-
-Alternatively, deploy Pushgateway using the official chart:
-
-```bash
-helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
-helm install pushgateway prometheus-community/prometheus-pushgateway -n monitoring --create-namespace
-```
-
-### 2. Run in Realtime Mode
-
-```bash
-# Port-forward Pushgateway
-kubectl port-forward -n monitoring svc/pushgateway 9091:9091 &
-
-# Push test metrics continuously
-cd /home/paul/git/conf/f3s/epimetheus
-./epimetheus -mode=realtime -continuous
-```
-
-The binary pushes metrics every 15 seconds. Press Ctrl+C to stop.
-
-### 3. View Metrics
-
-```bash
-# Pushgateway UI
-open http://localhost:9091
-
-# Prometheus UI
-kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 &
-open http://localhost:9090
-```
-
-## Operating Modes
-
-### 👁️ Watch Mode
-Monitor CSV files for changes and push metrics to Prometheus with file modification timestamps.
-
-**Works with ANY CSV format** - automatically detects numeric vs string columns and sanitizes names.
-
-**NEW: Automatic DNS Resolution** - IP addresses are automatically resolved to hostnames for better observability in Grafana.
-
-```bash
-./epimetheus -mode=watch \
- -file=mydata.csv \
- -metric-name=myapp \
- -prometheus=http://localhost:9090/api/v1/write
-```
-
-**Features:**
-- 🔍 **Format-agnostic**: Works with any tabular CSV structure
-- 📊 **Automatic detection**: Numeric columns → metrics, String columns → labels
-- 🏷️ **Name sanitization**: `min(potatoes)`, `avg(time)`, `p99(latency)` → valid metric names
-- 🌐 **DNS Resolution**: IP addresses → hostnames (e.g., `10.50.52.61` → `foo.example.lan`)
-- 💾 **Smart Caching**: In-memory cache prevents redundant DNS lookups
-- ⏱️ **Timestamp preservation**: Uses file modification time
-- 🔄 **Continuous monitoring**: Polls file every 1 second
-- 💪 **Error resilient**: Continues watching despite failures
-- 🎯 **Remote Write**: Pushes to Prometheus (preserves timestamps)
-
-**CSV Format:**
-Works with any tabular CSV:
-- First row: column headers (automatically sanitized)
-- Subsequent rows: data values
-- Column names can be anything: `min(x)`, `avg(y)`, `p99(latency)`, etc.
-
-**Example 1** - Web metrics:
-```csv
-avg(response_time),p99(latency),endpoint,method
-45.2,120.5,/api/users,GET
-52.1,135.8,/api/orders,POST
-```
-
-Generates:
-```promql
-web_avg_response_time{endpoint="/api/users",method="GET"} 45.2
-web_p99_latency{endpoint="/api/users",method="GET"} 120.5
-web_avg_response_time{endpoint="/api/orders",method="POST"} 52.1
-web_p99_latency{endpoint="/api/orders",method="POST"} 135.8
-```
-
-**Example 2** - Food metrics:
-```csv
-min(potatoes),last(coke),avg(price),country,store_type
-5.2,10.5,12.99,USA,grocery
-3.8,8.2,9.99,Canada,convenience
-```
-
-Generates:
-```promql
-food_min_potatoes{country="USA",store_type="grocery"} 5.2
-food_last_coke{country="USA",store_type="grocery"} 10.5
-food_avg_price{country="USA",store_type="grocery"} 12.99
-# ... etc
-```
-
-Each row generates N samples (N = number of numeric columns).
-
-See [CSV-FORMAT-FLEXIBILITY.md](CSV-FORMAT-FLEXIBILITY.md) for more examples.
-
-**Options:**
-- `-file` - CSV file to watch (required)
-- `-metric-name` - Base metric name (required, e.g., `food`, `network`, `database`)
-- `-prometheus` - Prometheus Remote Write URL (default: http://localhost:9090/api/v1/write)
-- `-clickhouse` - ClickHouse HTTP URL (e.g. http://localhost:8123) to also ingest metrics
-- `-clickhouse-table` - ClickHouse table name (default: epimetheus_metrics)
-- `-job` - Job name for metrics (default: example_metrics_pusher)
-- `-resolve-ip-labels` - Additional IP labels to resolve via DNS (default: ip is always resolved)
-
-**ClickHouse Support:**
-Watch mode can ingest to ClickHouse in addition to (or instead of) Prometheus:
-
-```bash
-# Ingest to both Prometheus and ClickHouse
-./epimetheus -mode=watch -file=data.csv -metric-name=myapp \
- -prometheus=http://localhost:9090/api/v1/write \
- -clickhouse=http://localhost:8123
-
-# ClickHouse only (use -prometheus= to disable Prometheus)
-./epimetheus -mode=watch -file=test-data/watch-clickhouse-test.csv \
- -metric-name=watch_test -clickhouse=http://localhost:8123 -prometheus=
-
-# Verify data in ClickHouse
-./verify-clickhouse.sh
-```
-
-**DNS Resolution:**
-By default, the `ip` label is automatically resolved to a hostname. To resolve additional IP labels:
-
-```bash
-./epimetheus -mode=watch \
- -file=network.csv \
- -metric-name=network \
- -resolve-ip-labels=source_ip,dest_ip
-```
-
-This will resolve: `ip` (default) + `source_ip` + `dest_ip`
-
-**Example:**
-- Input: `ip="10.50.52.61"`
-- Output: `ip="foo.example.lan"`
-- Failed lookups: IP remains unchanged
-
-**Documentation:**
-- [DNS-RESOLUTION-FEATURE.md](DNS-RESOLUTION-FEATURE.md) - Complete DNS resolution guide
-- [CSV-FORMAT-FLEXIBILITY.md](CSV-FORMAT-FLEXIBILITY.md) - Works with ANY CSV format
-- [DTAIL-METRICS-EXAMPLE.md](DTAIL-METRICS-EXAMPLE.md) - Detailed dtail.csv example
-
-### 🔄 Realtime Mode (Default)
-Push current metrics to Pushgateway with "now" timestamp.
-
-```bash
-./epimetheus -mode=realtime -continuous
-```
-
-**Options:**
-- `-pushgateway` - Pushgateway URL (default: http://localhost:9091)
-- `-job` - Job name (default: example_metrics_pusher)
-- `-continuous` - Keep pushing every 15 seconds
-
-### ⏰ Historic Mode
-Push a single datapoint from the past using Remote Write API.
-
-```bash
-# Port-forward Prometheus
-kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 &
-
-# Push data from 24 hours ago
-./epimetheus -mode=historic -hours-ago=24
-```
-
-**Options:**
-- `-prometheus` - Prometheus URL (default: http://localhost:9090/api/v1/write)
-- `-hours-ago` - Hours in the past (default: 24)
-
-### 📦 Backfill Mode
-Import a range of historic data points.
-
-```bash
-# Backfill last 48 hours with 1-hour intervals
-./epimetheus -mode=backfill -start-hours=48 -end-hours=0 -interval=1
-
-# Backfill last week with 6-hour intervals
-./epimetheus -mode=backfill -start-hours=168 -end-hours=0 -interval=6
-```
-
-**Options:**
-- `-start-hours` - Start time in hours ago
-- `-end-hours` - End time in hours ago (0 = now)
-- `-interval` - Interval between points in hours
-
-### 🤖 Auto Mode (Recommended!)
-Automatically detect timestamp age and route to the correct ingestion method.
-
-```bash
-# Generate test data
-./generate-test-data.sh
-
-# Import mixed current and historic data
-./epimetheus -mode=auto -file=test-all-ages.csv
-```
-
-**Detection Logic:**
-- Data < 5 minutes old → Pushgateway (realtime)
-- Data ≥ 5 minutes old → Remote Write (historic)
-
-**Options:**
-- `-file` - Input file path
-- `-format` - Data format: csv or json (default: csv)
-- `-pushgateway` - Pushgateway URL
-- `-prometheus` - Prometheus Remote Write URL
-
-## Data Formats
-
-### CSV Format
-
-```csv
-# Format: metric_name,labels,value,timestamp_ms
-# Labels: key1=value1;key2=value2
-epimetheus_test_requests_total,instance=web1;env=prod,100,1767125148000
-epimetheus_test_temperature_celsius,instance=web2,22.5,1767038748000
-
-# Timestamp is optional (uses "now" if omitted)
-epimetheus_test_active_connections,instance=web3,42,
-```
-
-### JSON Format
-
-```json
-[
- {
- "metric": "epimetheus_test_requests_total",
- "labels": {"instance": "web1", "env": "prod"},
- "value": 100,
- "timestamp_ms": 1767125148000
- },
- {
- "metric": "epimetheus_test_temperature_celsius",
- "labels": {"instance": "web2"},
- "value": 22.5,
- "timestamp_ms": 1767038748000
- }
-]
-```
-
-## Test Metrics
-
-All generated metrics use the `epimetheus_test_` prefix to clearly identify them as test data.
+Epimetheus is a standalone binary that:
-### Counter: `epimetheus_test_requests_total`
-- **Type:** Counter (monotonically increasing)
-- **Description:** Total number of requests processed
-- **Use case:** Counting total events, requests, errors
+- Pushes metrics via **Pushgateway** (realtime) or **Remote Write API** (historic, watch)
+- Optionally ingests to **ClickHouse** in watch mode
+- Supports **Prometheus-compatible backends** (e.g. VictoriaMetrics) by using their Remote Write URL
+- Offers modes: realtime, historic, backfill, auto, and watch (CSV file monitoring)
+- Accepts CSV and JSON input and provides a Grafana dashboard for test metrics
-### Gauge: `epimetheus_test_active_connections`
-- **Type:** Gauge (can increase or decrease)
-- **Description:** Current number of active connections (0-100)
-- **Use case:** Current state measurements, capacity
-
-### Gauge: `epimetheus_test_temperature_celsius`
-- **Type:** Gauge
-- **Description:** Current temperature in Celsius (0-50°C)
-- **Use case:** Environmental monitoring
-
-### Histogram: `epimetheus_test_request_duration_seconds`
-- **Type:** Histogram (distribution)
-- **Description:** Request duration distribution
-- **Buckets:** 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10 seconds
-- **Use case:** Latency measurements, SLO tracking
-
-### Labeled Counter: `epimetheus_test_jobs_processed_total`
-- **Type:** Counter with labels
-- **Description:** Jobs processed by type and status
-- **Labels:**
- - `job_type`: email, report, backup
- - `status`: success, failed
-- **Use case:** Categorized counting, multi-dimensional metrics
-
-## Grafana Dashboard
-
-A comprehensive dashboard is available showcasing all test metrics.
-
-### Dashboard Features
-
-- **8 Panels:**
- 1. Request Rate (line graph)
- 2. Total Requests (stat panel)
- 3. Active Connections (gauge with thresholds)
- 4. Temperature (gauge with thresholds)
- 5. Request Duration Histogram (p50, p90, p99)
- 6. Average Request Duration (stat)
- 7. Jobs Processed by Type (bar gauge)
- 8. Jobs Status Breakdown (table)
-
-- **Auto-refresh:** Every 10 seconds
-- **Time range:** Last 15 minutes (customizable)
-- **Dark theme optimized**
-
-### Deploy Dashboard
-
-#### Option 1: Helm/Kubernetes ConfigMap (Recommended)
-
-```bash
-# Deploy via Kubernetes ConfigMap
-kubectl apply -f ../prometheus/epimetheus-dashboard.yaml
-```
-
-The dashboard will be automatically discovered by Grafana.
-
-#### Option 2: Manual Import
-
-```bash
-# Port-forward Grafana
-kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
-
-# Open Grafana
-open http://localhost:3000
-
-# Go to Dashboards → Import → Upload grafana-dashboard.json
-```
-
-#### Option 3: Automated Script
-
-```bash
-# Deploy via API
-./deploy-dashboard.sh
-
-# Or with custom credentials
-GRAFANA_URL="http://localhost:3000" \
-GRAFANA_USER="admin" \
-GRAFANA_PASSWORD="yourpassword" \
-./deploy-dashboard.sh
-```
-
-## Example Queries
-
-### Basic Queries
-
-```promql
-# View total requests
-epimetheus_test_requests_total
-
-# View request rate over last 5 minutes
-rate(epimetheus_test_requests_total[5m])
-
-# View current active connections
-epimetheus_test_active_connections
-
-# View current temperature
-epimetheus_test_temperature_celsius
-```
-
-### Histogram Queries
-
-```promql
-# 95th percentile request duration
-histogram_quantile(0.95, rate(epimetheus_test_request_duration_seconds_bucket[5m]))
-
-# 50th percentile (median)
-histogram_quantile(0.50, rate(epimetheus_test_request_duration_seconds_bucket[5m]))
-
-# Average request duration
-rate(epimetheus_test_request_duration_seconds_sum[5m]) /
-rate(epimetheus_test_request_duration_seconds_count[5m])
-```
-
-### Labeled Counter Queries
-
-```promql
-# Failed jobs by type
-epimetheus_test_jobs_processed_total{status="failed"}
-
-# Job success rate
-rate(epimetheus_test_jobs_processed_total{status="success"}[5m]) /
-rate(epimetheus_test_jobs_processed_total[5m])
-
-# Total jobs by type
-sum by (job_type) (epimetheus_test_jobs_processed_total)
-```
-
-### Curl Examples
-
-```bash
-# Port-forward Prometheus
-kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 &
-
-# Query total requests
-curl -s "http://localhost:9090/api/v1/query?query=epimetheus_test_requests_total" | jq .
-
-# Query temperature
-curl -s "http://localhost:9090/api/v1/query?query=epimetheus_test_temperature_celsius" | jq .
-
-# Query request rate
-curl -s "http://localhost:9090/api/v1/query?query=rate(epimetheus_test_requests_total[5m])" | jq .
-
-# Query histogram p95
-curl -s "http://localhost:9090/api/v1/query?query=histogram_quantile(0.95,rate(epimetheus_test_request_duration_seconds_bucket[5m]))" | jq .
-```
-
-## Time Range Limitations
-
-### ✅ Supported Time Ranges
-
-| Time Range | Status | Method |
-|------------|--------|--------|
-| Current (< 5 min) | ✅ Works | Pushgateway |
-| 1 hour old | ✅ Works | Remote Write |
-| 1 day old | ✅ Works | Remote Write |
-| 1 week old | ✅ Works | Remote Write |
-| 1 month old | ✅ Works | Remote Write |
-
-### ⚠️ Potential Issues
-
-- **Future timestamps:** Rejected (> 5 minutes in future)
-- **Very old data (6+ months):** May be rejected depending on Prometheus retention
-- **Years old:** Likely rejected - use `promtool tsdb create-blocks-from` instead
-- **Out-of-order samples:** Can't insert older data into existing time series (use different labels)
-
-### Prometheus Configuration
-
-Check your retention settings:
-
-```bash
-# View retention
-kubectl get prometheus -n monitoring prometheus-kube-prometheus-prometheus \
- -o jsonpath='{.spec.retention}'
-
-# Default is typically 15 days
-```
-
-For very old data:
-- Increase retention in Prometheus config
-- Enable out-of-order ingestion (experimental)
-- Use `promtool` for direct TSDB block creation
-
-## Project Structure
-
-```
-epimetheus/
-├── cmd/
-│ └── epimetheus/
-│ └── main.go # Main entry point
-├── internal/
-│ ├── config/ # Configuration
-│ ├── metrics/ # Metric generators
-│ ├── parser/ # CSV/JSON parsers (includes tabular CSV)
-│ ├── ingester/ # Pushgateway & Remote Write ingesters
-│ └── watcher/ # File watcher for watch mode
-├── epimetheus # Compiled binary
-├── grafana-dashboard.json # Grafana dashboard definition
-├── deploy-dashboard.sh # Dashboard deployment script
-├── generate-test-data.sh # Test data generator
-├── run.sh # Helper script
-└── README.md # This file
-```
-
-## Setup Requirements
-
-### 1. Enable Prometheus Remote Write Receiver ⚠️ **REQUIRED for Historic Data**
-
-**IMPORTANT**: To use historic mode, backfill mode, or auto mode with old data, you **must** enable the Prometheus Remote Write receiver. Without this feature, Epimetheus can only push realtime data via Pushgateway.
-
-The Remote Write receiver is configured in the [conf repository](https://codeberg.org/snonux/conf) at `f3s/prometheus/persistence-values.yaml`:
-
-```yaml
-# In prometheus/persistence-values.yaml (from conf repository)
-prometheus:
- prometheusSpec:
- # Enable Remote Write receiver endpoint and Admin API (Prometheus 3.x syntax)
- additionalArgs:
- - name: web.enable-remote-write-receiver
- value: ""
- - name: web.enable-admin-api
- value: ""
-
- # Enable out-of-order ingestion for backfilling
- # Allows writing data points older than existing data for the same time series
- enableFeatures:
- - exemplar-storage
- - otlp-write-receiver
-
- # Allow backfilling up to 31 days in the past (provides 1-day buffer for 30-day datasets)
- tsdb:
- outOfOrderTimeWindow: 744h # 31 days
-```
-
-**What This Enables:**
-- **Remote Write API**: HTTP endpoint at `/api/v1/write` for ingesting metrics with custom timestamps
-- **Admin API**: HTTP endpoints at `/api/v1/admin/tsdb/*` for data deletion and management
-- **Out-of-Order Ingestion**: Allows writing data points older than existing data for the same time series
-- **31-Day Window**: Can backfill data up to 31 days in the past (provides 1-day buffer for 30-day datasets)
-
-After updating the configuration, upgrade your Prometheus installation:
-
-```bash
-cd conf/f3s/prometheus
-just upgrade # Or manually:
-# helm upgrade prometheus prometheus-community/kube-prometheus-stack \
-# -n monitoring -f persistence-values.yaml
-```
-
-Verify the features are enabled:
-
-```bash
-# Check Remote Write receiver flag
-kubectl get pod -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 \
- -o jsonpath='{.spec.containers[0].args}' | grep -o "web.enable-remote-write-receiver"
-
-# Check out-of-order time window
-kubectl get prometheus -n monitoring prometheus-kube-prometheus-prometheus \
- -o jsonpath='{.spec.tsdb.outOfOrderTimeWindow}'
-# Should output: 744h
-
-# Check admin API flag
-kubectl get pod -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 \
- -o jsonpath='{.spec.containers[0].args}' | grep -o "web.enable-admin-api"
-```
-
-**Performance Considerations:**
-
-This configuration is designed for ad-hoc troubleshooting and development, **NOT production use**. Enabling these features has trade-offs:
-
-- **Increased Memory Usage**: Out-of-order ingestion requires additional memory for buffering and sorting time series
-- **Higher TSDB Overhead**: Prometheus TSDB needs to handle non-sequential writes, increasing disk I/O
-- **Query Performance**: Queries may be slower due to fragmented data blocks
-- **Storage Amplification**: Out-of-order samples can trigger additional compactions, increasing storage usage
-
-**Recommendation for Production:**
-- Keep `outOfOrderTimeWindow` as small as possible (or disabled)
-- Monitor Prometheus memory and disk usage closely
-- Use Remote Write only when necessary
-- Consider using dedicated testing/development Prometheus instances
-
-**Note**: The syntax changed in Prometheus 3.x - use `additionalArgs` with `web.enable-remote-write-receiver` instead of the deprecated `enableFeatures: [remote-write-receiver]`.
-
-### 2. Update Prometheus Scrape Config
+## Quick Start
-Ensure Pushgateway is in scrape targets:
+1. **Build:** `mage build` or `go build -o epimetheus cmd/epimetheus/main.go`
+2. **Realtime (Pushgateway):** Deploy Pushgateway and Prometheus, then run:
+ ```bash
+ ./epimetheus -mode=realtime -continuous
+ ```
+3. **Watch (Remote Write):** Enable [Remote Write receiver](docs/operations/setup-prometheus.md), then:
+ ```bash
+ ./epimetheus -mode=watch -file=mydata.csv -metric-name=myapp -prometheus=http://localhost:9090/api/v1/write
+ ```
+4. **View:** Prometheus at http://localhost:9090 (after port-forward if needed). For full steps see [Quick Start](docs/guides/quickstart.md).
-```yaml
-# additional-scrape-configs.yaml
-- job_name: 'pushgateway'
- honor_labels: true
- static_configs:
- - targets:
- - 'pushgateway.monitoring.svc.cluster.local:9091'
-```
+## Documentation
-Apply the configuration:
+Full documentation is in the [docs](docs/README.md) directory:
-```bash
-kubectl create secret generic additional-scrape-configs \
- --from-file=/home/paul/git/conf/f3s/prometheus/additional-scrape-configs.yaml \
- --dry-run=client -o yaml -n monitoring | kubectl apply -f -
-```
+| Section | Description |
+|---------|-------------|
+| [Guides](docs/guides/quickstart.md) | [Quick Start](docs/guides/quickstart.md), [Modes](docs/guides/modes.md), [Data Formats](docs/guides/data-formats.md), [CSV flexibility](docs/guides/csv-format-flexibility.md), [DNS resolution](docs/guides/dns-resolution.md), [Dtail example](docs/guides/dtail-metrics-example.md) |
+| [Backends](docs/backends/prometheus.md) | [Prometheus / VictoriaMetrics](docs/backends/prometheus.md), [ClickHouse](docs/backends/clickhouse.md) |
+| [Operations](docs/operations/setup-prometheus.md) | [Setup Prometheus](docs/operations/setup-prometheus.md), [Setup ClickHouse](docs/operations/setup-clickhouse.md), [Troubleshooting](docs/operations/troubleshooting.md), [Cleanup](docs/operations/cleanup.md), [macOS](docs/operations/macos-setup.md), [Kubernetes](docs/operations/kubernetes.md) |
+| [Reference](docs/reference/cli.md) | [CLI](docs/reference/cli.md), [Test metrics](docs/reference/test-metrics.md), [Grafana dashboard](docs/reference/grafana-dashboard.md), [Example queries](docs/reference/example-queries.md), [Magefile](docs/reference/magefile.md) |
+| [Design](docs/design/architecture.md) | [Architecture](docs/design/architecture.md) |
-## Building from Source
+[Documentation index](docs/README.md) — complete list with one-line descriptions.
-### Using Mage (Recommended)
+## Building
-This project includes a [Magefile](./MAGEFILE.md) for easy building, testing, and running:
+**Using Mage (recommended):**
```bash
-# Install Mage (one-time setup)
go install github.com/magefile/mage@latest
-
-# Build binary
mage build
-
-# Run tests
mage test
-
-# Run with coverage report
-mage testCoverage
-
-# Run in realtime mode
-mage run
-
-# See all available targets
-mage -l
+mage run # realtime mode
```
-See [MAGEFILE.md](./MAGEFILE.md) for complete documentation.
+See [Magefile reference](docs/reference/magefile.md) for all targets.
-### Using Go directly
+**Using Go:**
```bash
-# Build binary
go build -o epimetheus cmd/epimetheus/main.go
-
-# Run tests
-go test ./... -v
-
-# Check test coverage
-go test ./... -cover
-```
-
-## Troubleshooting
-
-### Binary can't connect to Pushgateway
-
-```bash
-# Check port-forward is running
-ps aux | grep "port-forward.*9091"
-
-# Restart port-forward
-kubectl port-forward -n monitoring svc/pushgateway 9091:9091
-```
-
-### Metrics not appearing in Prometheus
-
-```bash
-# Check Pushgateway has metrics
-curl http://localhost:9091/metrics | grep "prometheus_pusher_test"
-
-# Check Prometheus scrape targets
-# Open http://localhost:9090/targets - look for "pushgateway" job
-
-# Check Prometheus logs
-kubectl logs -n monitoring -l app.kubernetes.io/name=prometheus
+go test ./...
```
-### "Remote write receiver not enabled" error
-
-```bash
-# Verify feature is enabled
-kubectl logs -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 | grep "remote-write-receiver"
-
-# Should see: msg="Experimental features enabled" features=[remote-write-receiver]
-```
-
-### "Out of order sample" error
-
-This occurs when trying to insert data older than existing data for the same time series.
-
-**Solutions:**
-- Use different job labels for historic data (e.g., `job="historic_data"`)
-- Enable out-of-order ingestion in Prometheus (experimental)
-- Ensure backfill goes from oldest to newest
-
-### Dashboard not appearing in Grafana
-
-```bash
-# Check ConfigMap exists
-kubectl get configmap -n monitoring | grep epimetheus
-
-# Check labels
-kubectl get configmap epimetheus-dashboard -n monitoring -o yaml | grep "grafana_dashboard"
-
-# Restart Grafana to force reload
-kubectl rollout restart deployment/prometheus-grafana -n monitoring
-```
-
-## Architecture
-
-```
-┌─────────────────┐
-│ Go Binary │
-│ (prometheus- │──Push realtime──┐
-│ pusher) │ │
-└─────────────────┘ ▼
- │ ┌──────────────────┐
- │ │ Pushgateway │◄──Scrape──┐
- │ │ (Port 9091) │ │
- │ └──────────────────┘ │
- │ │
- └──Push historic──────────────────┐ │
- ▼ │
- ┌─────────────────┐ │
- │ Prometheus │◄────┘
- │ (Port 9090) │
- │ Remote Write API│
- └─────────────────┘
- │
- │ Datasource
- ▼
- ┌─────────────────┐
- │ Grafana │
- │ (Port 3000) │
- │ Dashboards │
- └─────────────────┘
-```
-
-## Best Practices
-
-### When to Use Pushgateway vs. Remote Write
-
-**Use Pushgateway (realtime mode):**
-- Short-lived batch jobs
-- Service-level metrics
-- Jobs behind firewalls
-- Current/recent data (< 5 minutes old)
-
-**Use Remote Write (historic mode):**
-- Historic data import
-- Backfilling gaps
-- Data migration
-- Data older than 5 minutes
-
-**Use Auto Mode:**
-- Mixed current and historic data
-- Importing from files
-- Unknown timestamp ages
-- General-purpose ingestion
-
-### Metric Design
-
-- **Use appropriate metric types:**
- - Counter for cumulative values (requests, errors)
- - Gauge for point-in-time values (temperature, connections)
- - Histogram for distributions (latency, sizes)
-
-- **Label cardinality:**
- - Include meaningful labels
- - Avoid high-cardinality labels (user IDs, timestamps)
- - Keep label combinations reasonable (< 1000 per metric)
-
-- **Naming conventions:**
- - Use descriptive names
- - Include units in gauge names (\_celsius, \_bytes)
- - Use \_total suffix for counters
-
-## Cleanup
-
-### Cleaning Up Benchmark Data from Prometheus
-
-For cleaning up benchmark metrics from Prometheus, use the provided cleanup script:
-
-```bash
-# Port-forward to Prometheus
-kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 &
-
-# Run the cleanup script
-./cleanup-benchmark-data.sh
-```
-
-The script will:
-1. Delete all `epimetheus_benchmark_*` metrics using the Prometheus Admin API
-2. Clean up tombstones to free disk space
-3. Provide clear success/error feedback
-
-**Manual cleanup** (if you prefer):
-
-```bash
-# Delete specific metric
-curl -X POST 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]=epimetheus_benchmark_cpu_usage'
-
-# Clean up tombstones
-curl -X POST 'http://localhost:9090/api/v1/admin/tsdb/clean_tombstones'
-```
-
-### Other Cleanup Tasks
-
-```bash
-# Stop port-forwards
-pkill -f "port-forward.*9091"
-pkill -f "port-forward.*9090"
-pkill -f "port-forward.*3000"
-
-# Delete test metrics from Pushgateway
-curl -X DELETE http://localhost:9091/metrics/job/example_metrics_pusher
-
-# Uninstall Pushgateway (if needed)
-helm uninstall pushgateway -n monitoring
-```
-
-## MacOS Setup
-
-### Basic Installation
-
-```bash
-brew install prometheus
-brew install grafana
-go install github.com/prometheus/pushgateway@latest
-brew services start grafana
-brew services start prometheus
-~/go/bin/pushgateway &
-```
-
-Once done, login to http://localhost:3000 as admin:admin, you will be prompted to change the password. Afterwards, add http://localhost:9090 as a Prometheus datasource.
-
-### Enable Remote Write Receiver (Required for Watch Mode)
-
-⚠️ **Important**: Watch mode, historic mode, backfill mode, and auto mode require the Prometheus Remote Write receiver to be enabled.
-
-#### Option 1: Permanent Configuration (Recommended)
-
-Edit the Prometheus arguments file:
-
-```bash
-# Edit the arguments file
-nano /opt/homebrew/etc/prometheus.args
-```
-
-Add this line at the end:
-```
---web.enable-remote-write-receiver
-```
-
-The complete file should look like:
-```
---config.file /opt/homebrew/etc/prometheus.yml
---web.listen-address=127.0.0.1:9090
---storage.tsdb.path /opt/homebrew/var/prometheus
---web.enable-remote-write-receiver
---web.enable-admin-api
-```
-
-**Note:** `--web.enable-admin-api` is optional but recommended for easier data management (allows deleting old metrics).
-
-Restart Prometheus:
-```bash
-brew services restart prometheus
-```
-
-Verify it's working:
-```bash
-# Check Prometheus is healthy
-curl http://localhost:9090/-/healthy
-
-# Test Remote Write endpoint (should return 400, not 404)
-curl -X POST http://localhost:9090/api/v1/write
-```
-
-#### Option 2: Temporary (For Testing)
-
-Stop the service and start manually:
-
-```bash
-# Stop brew service
-brew services stop prometheus
-
-# Start with Remote Write enabled
-prometheus --web.enable-remote-write-receiver
-```
-
-Keep this terminal open. In another terminal, run your epimetheus commands.
-
-**Note**: This only lasts until you stop the terminal. Use Option 1 for permanent setup.
-
-### Clearing Old Metrics (Optional)
-
-If you need to delete old metrics and start fresh:
-
-```bash
-# Delete specific metrics (e.g., blockstore)
-curl -X POST -g 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]={__name__=~"blockstore_.*"}'
-
-# Clean up deleted data
-curl -X POST http://localhost:9090/api/v1/admin/tsdb/clean_tombstones
-
-# Wait a moment for cleanup
-sleep 2
-```
-
-**Note:** Admin API must be enabled (add `--web.enable-admin-api` to prometheus.args).
-
-### Verify Setup
-
-Once Remote Write is enabled, test watch mode:
-
-```bash
-# Create a test CSV
-cat > /tmp/test.csv << EOF
-status,count,method
-200,100,GET
-404,50,POST
-EOF
-
-# Watch the file
-./epimetheus -mode=watch \
- -file=/tmp/test.csv \
- -metric-name=test \
- -prometheus=http://localhost:9090/api/v1/write
-```
+## Project Structure
-You should see:
-```
-✅ Successfully pushed X samples to Prometheus
```
-
-Query in Prometheus (http://localhost:9090):
-```promql
-{__name__=~"test_.*"}
+epimetheus/
+├── cmd/epimetheus/ # Main entry point
+├── internal/ # config, ingester, metrics, parser, resolver, watcher
+├── docs/ # Documentation
+├── scripts/ # Helper shell scripts (verify-clickhouse, generate-test-data, etc.)
+├── test-data/ # Test CSVs
+├── Magefile.go # Build and run targets
+└── README.md
```
-## Additional Resources
-
-- [Prometheus Documentation](https://prometheus.io/docs/)
-- [Pushgateway Documentation](https://github.com/prometheus/pushgateway)
-- [Prometheus Remote Write Spec](https://prometheus.io/docs/concepts/remote_write_spec/)
-- [Grafana Documentation](https://grafana.com/docs/)
-
## Version
Current version: 0.0.0
diff --git a/docs/DOCS-RESTRUCTURE-PLAN.md b/docs/DOCS-RESTRUCTURE-PLAN.md
new file mode 100644
index 0000000..c688993
--- /dev/null
+++ b/docs/DOCS-RESTRUCTURE-PLAN.md
@@ -0,0 +1,235 @@
+# Documentation Restructure Plan
+
+This plan addresses the current documentation sprawl and clarifies the **multiple ingestion backends** (Prometheus, ClickHouse, and future backends such as VictoriaMetrics) and **modes** (realtime, historic, backfill, auto, watch).
+
+---
+
+## 1. Current State Summary
+
+### 1.1 Existing Markdown Files
+
+| File | Purpose | Issues |
+|------------|-----------------------------------|--------|
+| `README.md` | Single ~995-line doc: intro, modes, backends, setup, troubleshooting, macOS, cleanup | Too long; mixes audiences and backends; hard to maintain |
+| `AGENT.md` | Agent rules (Grafana dashboard guidelines + ref to `~/git/conf/snippets/go/go-projects.md`) | Fine as-is; not user docs |
+| `CLAUDE.md` | One-line pointer to AGENT.md | Fine as-is |
+
+### 1.2 Broken or Missing References in README
+
+- `CSV-FORMAT-FLEXIBILITY.md` – linked, **does not exist**
+- `DNS-RESOLUTION-FEATURE.md` – linked, **does not exist**
+- `DTAIL-METRICS-EXAMPLE.md` – linked, **does not exist**
+- `MAGEFILE.md` – linked, **does not exist** (build logic lives in `Magefile.go`)
+
+### 1.3 Ingestion Backends (from codebase)
+
+| Backend | Modes | Notes |
+|-----------|---------------------------|--------|
+| **Prometheus** | realtime (Pushgateway), historic/backfill/auto (Remote Write), watch (Remote Write) | Primary; Remote Write requires feature flag |
+| **ClickHouse** | watch only | Optional; can run with Prometheus or alone |
+
+*VictoriaDB / VictoriaMetrics:* Not present in code today. Plan leaves room for a dedicated backend doc when added.
+
+---
+
+## 2. Goals
+
+1. **Separate by ingestion backend** so Prometheus vs ClickHouse (and future backends) have clear, non-redundant docs.
+2. **Split by audience and topic**: quick start vs reference vs operations (setup, troubleshooting, cleanup).
+3. **Fix broken links**: either add the missing docs or replace links with in-README sections / new doc paths.
+4. **Single source of truth** for each concept (e.g. “how watch mode works” and “how to configure Prometheus” in one place each).
+5. **Easier maintenance**: smaller, focused files; clear naming; one `docs/` tree.
+
+---
+
+## 3. Proposed Directory Layout
+
+```
+epimetheus/
+├── README.md # Short overview + quick start + doc index (slimmed)
+├── AGENT.md # Unchanged
+├── CLAUDE.md # Unchanged
+├── docs/
+│ ├── README.md # Documentation index (nav + short descriptions)
+│ │
+│ ├── guides/ # How-to and concepts
+│ │ ├── quickstart.md # Minimal path to first push (Prometheus or ClickHouse)
+│ │ ├── modes.md # All modes: realtime, historic, backfill, auto, watch
+│ │ ├── data-formats.md # CSV (epimetheus + tabular) and JSON
+│ │ ├── csv-format-flexibility.md # “Any CSV” + examples (replaces missing file)
+│ │ ├── dns-resolution.md # IP → hostname resolution (replaces missing file)
+│ │ └── dtail-metrics-example.md # Optional: dtail.csv walkthrough (replaces missing file)
+│ │
+│ ├── backends/ # One doc per ingestion backend
+│ │ ├── prometheus.md # Pushgateway + Remote Write, config, limits
+│ │ ├── clickhouse.md # Watch-only; schema; verify script
+│ │ └── (future) victoriametrics.md # When/if added
+│ │
+│ ├── operations/ # Setup, runbooks, platform-specific
+│ │ ├── setup-prometheus.md # Remote Write receiver, scrape config, retention
+│ │ ├── setup-clickhouse.md # Table creation, verify-clickhouse.sh
+│ │ ├── troubleshooting.md # Connection issues, “no metrics”, out-of-order, etc.
+│ │ ├── cleanup.md # Benchmark cleanup, Pushgateway delete, port-forwards
+│ │ ├── macos-setup.md # Brew, Prometheus args, Remote Write on macOS
+│ │ └── kubernetes.md # Port-forwards, Helm, ConfigMaps (from current README)
+│ │
+│ ├── reference/ # Reference material
+│ │ ├── cli.md # All flags by mode
+│ │ ├── test-metrics.md # epimetheus_test_* metrics and types
+│ │ ├── grafana-dashboard.md # Panels, deploy options, datasource
+│ │ ├── example-queries.md # PromQL and curl examples
+│ │ └── magefile.md # Mage targets (replaces missing MAGEFILE.md)
+│ │
+│ └── design/ # Optional, for contributors
+│ └── architecture.md # High-level data flow (current ASCII diagrams)
+```
+
+---
+
+## 4. File-by-File Plan
+
+### 4.1 Root `README.md` (slimmed)
+
+- **Keep:** Project name, tagline, “Why Epimetheus”, **one** high-level architecture diagram (simplified).
+- **Keep:** Very short “Overview” (1 paragraph) and **Quick Start** (3–5 steps pointing at `docs/guides/quickstart.md` for details).
+- **Add:** **Documentation index** – bullet list with links to:
+ - `docs/README.md`
+ - `docs/guides/quickstart.md`, `docs/guides/modes.md`
+ - `docs/backends/prometheus.md`, `docs/backends/clickhouse.md`
+ - `docs/operations/setup-prometheus.md`, `docs/operations/troubleshooting.md`
+ - `docs/reference/cli.md`, `docs/reference/magefile.md`
+- **Move out of README into `docs/`:**
+ - All mode details → `docs/guides/modes.md`
+ - Backend-specific behaviour → `docs/backends/*.md`
+ - Setup (Prometheus, ClickHouse, k8s, macOS) → `docs/operations/*.md`
+ - Data formats → `docs/guides/data-formats.md` (+ csv-format-flexibility, dns-resolution, dtail example)
+ - Test metrics, Grafana, example queries → `docs/reference/*.md`
+ - Troubleshooting, cleanup → `docs/operations/*.md`
+ - Time range / retention → `docs/backends/prometheus.md` and `docs/operations/setup-prometheus.md`
+- **Fix links:** Remove links to `CSV-FORMAT-FLEXIBILITY.md`, `DNS-RESOLUTION-FEATURE.md`, `DTAIL-METRICS-EXAMPLE.md`, `MAGEFILE.md` from README; point to `docs/guides/...` and `docs/reference/magefile.md` instead.
+
+**Target:** README under ~150–200 lines.
+
+---
+
+### 4.2 `docs/README.md` (new)
+
+- Title: “Epimetheus Documentation”.
+- Short intro (2–3 sentences).
+- **Structured index** with sections:
+ - **Guides:** quickstart, modes, data formats, CSV flexibility, DNS resolution, dtail example.
+ - **Ingestion backends:** Prometheus, ClickHouse (and placeholder for Victoria* if desired).
+ - **Operations:** setup (Prometheus, ClickHouse), troubleshooting, cleanup, macOS, Kubernetes.
+ - **Reference:** CLI, test metrics, Grafana, example queries, Mage.
+- Each entry: link + one-line description.
+
+---
+
+### 4.3 Guides
+
+| Doc | Content | Source |
+|-----|--------|--------|
+| `guides/quickstart.md` | Minimal steps: build/run, push to Prometheus or ClickHouse, view (Prometheus UI or verify-clickhouse.sh). | Current README “Quick Start” + “Run in Realtime Mode” + one watch example. |
+| `guides/modes.md` | Table: mode name, purpose, which backends, main flags. Then one subsection per mode (realtime, historic, backfill, auto, watch) with short description and example command. | Current README “Operating Modes”. |
+| `guides/data-formats.md` | Epimetheus CSV (metric_name, labels, value, timestamp_ms), JSON format, optional timestamp. Link to csv-format-flexibility for tabular CSV. | Current README “Data Formats”. |
+| `guides/csv-format-flexibility.md` | “Works with any CSV”: headers → metric names/labels, numeric vs string columns, sanitization, examples (web, food). | New content; replaces missing `CSV-FORMAT-FLEXIBILITY.md`. |
+| `guides/dns-resolution.md` | Default `ip` resolution; `-resolve-ip-labels`; behaviour on failure. | New content; replaces missing `DNS-RESOLUTION-FEATURE.md`. |
+| `guides/dtail-metrics-example.md` | Optional: step-by-step dtail.csv example. | New content; replaces missing `DTAIL-METRICS-EXAMPLE.md`; can be short. |
+
+---
+
+### 4.4 Backends
+
+| Doc | Content | Source |
+|-----|--------|--------|
+| `backends/prometheus.md` | Pushgateway (realtime) vs Remote Write (historic/watch); URLs; time range and retention limits; out-of-order; link to setup-prometheus. | README Prometheus bits + “Time Range Limitations” + “Setup Requirements” (Remote Write). |
+| `backends/clickhouse.md` | Watch-only; `-clickhouse`, `-clickhouse-table`; table schema (from code/comments); `verify-clickhouse.sh`; Prometheus + ClickHouse together. | README “ClickHouse Support” + verify-clickhouse.sh + internal/ingester/clickhouse.go. |
+
+---
+
+### 4.5 Operations
+
+| Doc | Content | Source |
+|-----|--------|--------|
+| `operations/setup-prometheus.md` | Enable Remote Write receiver (and Admin API); scrape config for Pushgateway; retention; Prometheus 3.x syntax; verify commands. | Current README “Setup Requirements” (Prometheus). |
+| `operations/setup-clickhouse.md` | Ensure table exists (e.g. from ingester); run verify script; optional Docker/systemd. | From README + scripts + code. |
+| `operations/troubleshooting.md` | Pushgateway connection; metrics not in Prometheus; “Remote write receiver not enabled”; out-of-order errors; dashboard not in Grafana; ClickHouse connection. | Current README “Troubleshooting”. |
+| `operations/cleanup.md` | Cleanup benchmark data script; manual Prometheus delete/tombstones; Pushgateway delete; stop port-forwards; uninstall Pushgateway. | Current README “Cleanup”. |
+| `operations/macos-setup.md` | Brew install; prometheus.args (Remote Write, Admin API); verify; optional “temporary” run. | Current README “MacOS Setup”. |
+| `operations/kubernetes.md` | Port-forwards (Pushgateway, Prometheus, Grafana); Helm/ConfigMap for dashboard; namespace. | Extracted from README examples. |
+
+---
+
+### 4.6 Reference
+
+| Doc | Content | Source |
+|-----|--------|--------|
+| `reference/cli.md` | Table or list of all flags by mode (realtime, historic, backfill, auto, watch); default values. | From README + `cmd/epimetheus/main.go`. |
+| `reference/test-metrics.md` | Each `epimetheus_test_*` metric: type, description, labels, use case. | Current README “Test Metrics”. |
+| `reference/grafana-dashboard.md` | Panels list; deploy (ConfigMap, manual import, script); datasource; link to AGENT.md for panel guidelines. | Current README “Grafana Dashboard”. |
+| `reference/example-queries.md` | PromQL and curl examples (basic, histogram, labeled counter). | Current README “Example Queries”. |
+| `reference/magefile.md` | List of Mage targets (build, test, run, RunWatchClickHouse, cleanup, etc.) with one-line description and example. | From `Magefile.go`; replaces missing `MAGEFILE.md`. |
+
+---
+
+### 4.7 Design (optional)
+
+| Doc | Content | Source |
+|-----|--------|--------|
+| `design/architecture.md` | High-level data flow; ASCII diagrams (current README); “when to use Pushgateway vs Remote Write” and “when to use which backend”. | Current README “Architecture” and “Best Practices”. |
+
+---
+
+## 5. Implementation Order
+
+1. **Create `docs/` and index**
+ - Create `docs/README.md` with the full index (links can target paths that don’t exist yet).
+2. **Fix broken links and add missing content**
+ - Add `docs/guides/csv-format-flexibility.md`, `docs/guides/dns-resolution.md`, `docs/guides/dtail-metrics-example.md`, `docs/reference/magefile.md` so all current README links resolve.
+3. **Backend-centric docs**
+ - Add `docs/backends/prometheus.md` and `docs/backends/clickhouse.md`; move/duplicate content from README.
+4. **Operations**
+ - Add `docs/operations/setup-prometheus.md`, `setup-clickhouse.md`, `troubleshooting.md`, `cleanup.md`, `macos-setup.md`, `kubernetes.md`; move content from README.
+5. **Guides**
+ - Add `docs/guides/quickstart.md`, `modes.md`, `data-formats.md`; move content from README.
+6. **Reference**
+ - Add `docs/reference/cli.md`, `test-metrics.md`, `grafana-dashboard.md`, `example-queries.md`; move content from README.
+7. **Slim README**
+ - Cut README down to overview, quick start, and doc index; replace old links with `docs/...` links.
+8. **Optional**
+ - Add `docs/design/architecture.md` and link from `docs/README.md`.
+
+---
+
+## 6. Cross-Cutting Conventions
+
+- **Links:** Prefer relative links from repo root (e.g. `[Modes](docs/guides/modes.md)`) or from `docs/` (e.g. `[Prometheus](backends/prometheus.md)` inside docs).
+- **Backend mentions:** In mode/CLI docs, use a short table or sentence: “Supported backends: Prometheus (all modes), ClickHouse (watch only).”
+- **One diagram:** Keep one high-level diagram in README or `design/architecture.md`; avoid duplicating large ASCII art in multiple files.
+- **CLI and defaults:** Single source of truth in `reference/cli.md`; guides and backend docs can quote the relevant subset.
+- **Version/legal:** Keep “Version” and “License” in root README (or CONTRIBUTING.md if you add one).
+
+---
+
+## 7. Future: VictoriaMetrics / VictoriaDB
+
+When adding a new backend (e.g. VictoriaMetrics, which speaks Prometheus Remote Write):
+
+- Add `docs/backends/victoriametrics.md` (or `victoriadb.md`) with URL format, any extra flags, and differences from Prometheus.
+- In `docs/README.md` and root README, add one line to the “Ingestion backends” section.
+- In `docs/guides/modes.md` and `reference/cli.md`, extend the “which backends support which mode” table and flags.
+- No need to duplicate full setup/troubleshooting if it matches Prometheus; link to `backends/prometheus.md` and note compatibility where relevant.
+
+---
+
+## 8. Checklist Before Calling Done
+
+- [ ] All current README links resolve (no 404s).
+- [ ] README is under ~200 lines and ends with doc index.
+- [ ] `docs/README.md` lists every new doc with link and one-line description.
+- [ ] Prometheus vs ClickHouse (and modes) are clearly separated in backends and guides.
+- [ ] Setup, troubleshooting, and cleanup live under `docs/operations/`.
+- [ ] Mage is documented in `docs/reference/magefile.md` and linked from root README.
+- [ ] Optional: `docs/design/architecture.md` exists and is linked from index.
+
+This plan gives you a single place to extend when you add VictoriaDB/VictoriaMetrics or another backend, and keeps the root README short while all detailed docs live under `docs/` with a clear structure by topic and backend.
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..5f944d4
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,66 @@
+# Epimetheus Documentation
+
+Documentation for Epimetheus: a Go tool for pushing metrics to Prometheus (and Prometheus-compatible backends) and ClickHouse.
+
+## Index
+
+### Guides
+
+| Document | Description |
+|----------|-------------|
+| [Quick Start](guides/quickstart.md) | Minimal path to first push: build, run, view in Prometheus or ClickHouse |
+| [Operating Modes](guides/modes.md) | Realtime, historic, backfill, auto, and watch modes with examples |
+| [Data Formats](guides/data-formats.md) | Epimetheus CSV and JSON input formats |
+| [CSV Format Flexibility](guides/csv-format-flexibility.md) | Use any tabular CSV; numeric vs string columns; sanitization and examples |
+| [DNS Resolution](guides/dns-resolution.md) | IP-to-hostname resolution for watch mode labels |
+| [Dtail Metrics Example](guides/dtail-metrics-example.md) | Walkthrough using dtail.csv |
+
+### Ingestion Backends
+
+| Document | Description |
+|----------|-------------|
+| [Prometheus (and Prometheus-compatible)](backends/prometheus.md) | Pushgateway, Remote Write, time ranges; VictoriaMetrics via same URL |
+| [ClickHouse](backends/clickhouse.md) | Watch-mode ingestion; table schema; verify script |
+
+### Operations
+
+| Document | Description |
+|----------|-------------|
+| [Setup: Prometheus](operations/setup-prometheus.md) | Enable Remote Write receiver, scrape config, retention |
+| [Setup: ClickHouse](operations/setup-clickhouse.md) | Table creation, verification |
+| [Troubleshooting](operations/troubleshooting.md) | Connection issues, metrics not appearing, out-of-order errors |
+| [Cleanup](operations/cleanup.md) | Benchmark cleanup, Pushgateway delete, port-forwards |
+| [macOS Setup](operations/macos-setup.md) | Homebrew, Prometheus args, Remote Write on macOS |
+| [Kubernetes](operations/kubernetes.md) | Port-forwards, Helm, ConfigMaps |
+
+### Reference
+
+| Document | Description |
+|----------|-------------|
+| [CLI Reference](reference/cli.md) | All flags by mode with defaults |
+| [Test Metrics](reference/test-metrics.md) | `epimetheus_test_*` metrics and types |
+| [Grafana Dashboard](reference/grafana-dashboard.md) | Panels, deployment options, datasource |
+| [Example Queries](reference/example-queries.md) | PromQL and curl examples |
+| [Magefile](reference/magefile.md) | Mage build and run targets |
+
+### Design
+
+| Document | Description |
+|----------|-------------|
+| [Architecture](design/architecture.md) | Data flow, when to use Pushgateway vs Remote Write, backend choice |
+
+### Helper scripts
+
+Helper shell scripts live in **`scripts/`** at the repo root. Run them from the repo root (e.g. `./scripts/verify-clickhouse.sh`).
+
+| Script | Purpose |
+|--------|---------|
+| `verify-clickhouse.sh` | Verify ClickHouse ingestion (row count, sample data) |
+| `generate-test-data.sh` | Generate `test-all-ages.csv` for auto mode |
+| `cleanup-benchmark-data.sh` | Delete benchmark metrics from Prometheus (Admin API) |
+| `cleanup-benchmark-metrics.sh` | Same + starts port-forward, then cleans up |
+| `benchmark-100mb.sh` | 100MB ingestion benchmark |
+| `benchmark-1gb.sh` | 1GB ingestion benchmark |
+| `backfill-historic-data.sh` | Backfill 7 days of historic data to Prometheus |
+| `run.sh` | Port-forward Pushgateway and run epimetheus in realtime mode |
+| `deploy-dashboard.sh` | Deploy Grafana dashboard via API (if present) |
diff --git a/docs/backends/clickhouse.md b/docs/backends/clickhouse.md
new file mode 100644
index 0000000..ad1b5f0
--- /dev/null
+++ b/docs/backends/clickhouse.md
@@ -0,0 +1,92 @@
+# ClickHouse
+
+Epimetheus can ingest metrics into ClickHouse in **watch mode** only. ClickHouse is optional: you can use it in addition to Prometheus or as the only backend (by setting `-prometheus=` to disable Prometheus ingestion).
+
+## Data flow (watch mode only)
+
+```
+┌─────────────────┐ poll (1s) ┌─────────────────────────────────────┐
+│ CSV file(s) │ ─────────────────▶ │ Epimetheus (watch mode) │
+│ (mtime = │ │ • Parse tabular CSV │
+│ timestamp) │ │ • -metric-name + columns → metrics │
+└─────────────────┘ └─────────────────────────────────────┘
+ │
+ ┌────────────────────┼────────────────────┐
+ │ │ │
+ ▼ ▼ │
+ ┌───────────────┐ ┌───────────────┐ │
+ │ Prometheus │ │ ClickHouse │ │
+ │ (optional) │ │ (optional) │ │
+ │ -prometheus= │ │ -clickhouse= │ │
+ │ Remote Write │ │ HTTP insert │ │
+ └───────────────┘ └───────────────┘ │
+ │
+ At least one of -prometheus or -clickhouse │
+```
+
+## When It's Used
+
+- **Mode:** Watch only. Other modes (realtime, historic, backfill, auto) do not write to ClickHouse.
+- **Flags:**
+ - `-clickhouse` – ClickHouse HTTP URL (e.g. `http://localhost:8123`). If empty, no ClickHouse ingestion.
+ - `-clickhouse-table` – Table name (default: `epimetheus_metrics`).
+
+At least one of `-prometheus` or `-clickhouse` must be set for watch mode.
+
+## Table Schema
+
+Epimetheus creates the table if it does not exist. Schema:
+
+```sql
+CREATE TABLE IF NOT EXISTS epimetheus_metrics (
+ metric String,
+ labels Map(String, String),
+ value Float64,
+ timestamp DateTime64(3)
+) ENGINE = MergeTree()
+ORDER BY (metric, timestamp)
+```
+
+- `metric` – metric name (e.g. from `-metric-name` and column headers in tabular CSV).
+- `labels` – key-value map of label names and values.
+- `value` – sample value.
+- `timestamp` – sample time (millisecond precision).
+
+## Examples
+
+**Prometheus and ClickHouse:**
+
+```bash
+./epimetheus -mode=watch -file=data.csv -metric-name=myapp \
+ -prometheus=http://localhost:9090/api/v1/write \
+ -clickhouse=http://localhost:8123
+```
+
+**ClickHouse only:**
+
+```bash
+./epimetheus -mode=watch -file=test-data/watch-clickhouse-test.csv \
+ -metric-name=watch_test \
+ -clickhouse=http://localhost:8123 \
+ -prometheus=
+```
+
+**Custom table:**
+
+```bash
+./epimetheus -mode=watch -file=data.csv -metric-name=myapp \
+ -clickhouse=http://localhost:8123 \
+ -clickhouse-table=my_metrics
+```
+
+## Verification
+
+Use the provided script to check that data landed in ClickHouse:
+
+```bash
+./scripts/verify-clickhouse.sh
+# Or with custom URL/table:
+./scripts/verify-clickhouse.sh http://localhost:8123 epimetheus_metrics
+```
+
+The script checks connectivity, row count, distinct metrics, sample rows, and rows per metric. See [Setup: ClickHouse](../operations/setup-clickhouse.md) for getting ClickHouse running.
diff --git a/docs/backends/prometheus.md b/docs/backends/prometheus.md
new file mode 100644
index 0000000..f8d2a9b
--- /dev/null
+++ b/docs/backends/prometheus.md
@@ -0,0 +1,76 @@
+# Prometheus (and Prometheus-Compatible Backends)
+
+Epimetheus can ingest metrics into Prometheus via two paths. Any backend that exposes the Prometheus Remote Write API (including **VictoriaMetrics**) is supported by pointing `-prometheus=` at that backend's write URL (e.g. `http://victoriametrics:8428/api/v1/write`).
+
+## Ingestion paths (overview)
+
+```
+ Epimetheus
+ │
+ ┌───────────────┼───────────────┐
+ │ │ │
+ ▼ ▼ ▼
+ Realtime mode Historic/Backfill Watch mode
+ (current data) (old data) (CSV file mtime)
+ │ │ │
+ ▼ │ │
+ ┌───────────┐ │ │
+ │Pushgateway │ │ │
+ │ (HTTP POST)│ │ │
+ └─────┬─────┘ │ │
+ │ Scrape │ │
+ │ (15–30s) │ │
+ ▼ ▼ ▼
+ ┌─────────────────────────────────────────────┐
+ │ Prometheus / VictoriaMetrics │
+ │ Remote Write API: /api/v1/write │
+ │ (realtime: via Pushgateway scrape; │
+ │ historic/watch: direct POST) │
+ └─────────────────────────────────────────────┘
+```
+
+## Ingestion Paths
+
+### Realtime: Pushgateway
+
+- **Used by:** realtime mode, and auto mode for samples &lt; 5 minutes old.
+- **Flow:** Epimetheus pushes to Pushgateway (HTTP POST); Prometheus scrapes Pushgateway on its schedule. Timestamps become "now" at scrape time.
+- **Flags:** `-pushgateway` (default `http://localhost:9091`), `-job`, `-continuous`.
+
+### Historic: Remote Write API
+
+- **Used by:** historic mode, backfill mode, auto mode for samples ≥ 5 minutes old, and watch mode (when `-prometheus` is set).
+- **Flow:** Epimetheus sends samples to the Remote Write endpoint (e.g. `/api/v1/write`). Timestamps from the data are preserved.
+- **Flags:** `-prometheus` (default `http://localhost:9090/api/v1/write`).
+
+The Remote Write receiver must be enabled on Prometheus for historic/watch/backfill/auto with old data. See [Setup: Prometheus](../operations/setup-prometheus.md).
+
+## Prometheus-Compatible Backends (e.g. VictoriaMetrics)
+
+Backends that implement the [Prometheus Remote Write](https://prometheus.io/docs/concepts/remote_write_spec/) API work with Epimetheus without any code changes. Use their write endpoint as the `-prometheus=` URL.
+
+**Example (VictoriaMetrics):**
+
+```bash
+./epimetheus -mode=watch -file=data.csv -metric-name=myapp \
+ -prometheus=http://victoriametrics:8428/api/v1/write
+```
+
+Replace host/port with your VictoriaMetrics (or other compatible) write URL. Realtime mode still uses Pushgateway (scraped by your Prometheus or VictoriaMetrics); for watch/historic/backfill/auto, only the `-prometheus=` target changes.
+
+## Time Ranges
+
+| Time range | Status | Method |
+|------------|--------|--------|
+| Current (&lt; 5 min) | Supported | Pushgateway |
+| 1 hour old | Supported | Remote Write |
+| 1 day to 1 month old | Supported | Remote Write |
+| 6+ months | May be rejected (retention) | Remote Write |
+| Years old | Likely rejected; use `promtool tsdb create-blocks-from openmetrics` | — |
+| Future (&gt; 5 min ahead) | Rejected | — |
+
+Out-of-order samples (older than existing data for the same series) require out-of-order ingestion to be enabled on the backend, or use different labels. See [Troubleshooting](../operations/troubleshooting.md).
+
+## Retention and Configuration
+
+Check your backend's retention (e.g. Prometheus `--storage.tsdb.retention.time`, VictoriaMetrics `-retentionPeriod`). For very old data you may need to increase retention or enable out-of-order ingestion. See [Setup: Prometheus](../operations/setup-prometheus.md) for Prometheus-specific options.
diff --git a/docs/design/architecture.md b/docs/design/architecture.md
new file mode 100644
index 0000000..2a01e09
--- /dev/null
+++ b/docs/design/architecture.md
@@ -0,0 +1,101 @@
+# Architecture
+
+High-level data flow and when to use each ingestion path or backend.
+
+## Data flow
+
+```
+┌─────────────────────────────────────────────────────────────────────────┐
+│ Epimetheus │
+│ (Metrics Ingestion Tool) │
+│ │
+│ Modes: │
+│ • Realtime - Current metrics (< 5 min old) │
+│ • Historic - Historic metrics (≥ 5 min old) │
+│ • Backfill - Range of historic data │
+│ • Auto - Automatic routing based on timestamp age │
+│ • Watch - CSV file monitoring (Prometheus and/or ClickHouse) │
+└─────────────────────────────────────────────────────────────────────────┘
+ │ │
+ │ Realtime Data │ Historic Data
+ │ (via HTTP POST) │ (via Remote Write API)
+ │ Uses "now" timestamp │ Preserves timestamps
+ ▼ ▼
+┌─────────────────────┐ ┌─────────────────────┐
+│ Pushgateway │ │ Prometheus / │
+│ (Port 9091) │ │ VictoriaMetrics │
+│ │ │ (Remote Write) │
+│ • Buffers metrics │ │ │
+│ • Scraped by │──── Scraped ─────▶ │ /api/v1/write │
+│ Prometheus │ every 15-30s │ │
+└─────────────────────┘ └─────────────────────┘
+ │
+ │ Query API
+ ▼
+ ┌─────────────────────┐
+ │ Grafana │
+ │ Dashboards │
+ └─────────────────────┘
+```
+
+**Watch mode** can also write to **ClickHouse** (separate path; see [ClickHouse backend](../backends/clickhouse.md)).
+
+## Watch mode (CSV file watcher)
+
+Watch mode polls CSV file(s), uses file modification time as the sample timestamp, and can push to Prometheus (Remote Write) and/or ClickHouse.
+
+```
+┌─────────────────┐ poll (1s) ┌─────────────────────────────────────┐
+│ CSV file(s) │ ─────────────────▶ │ Epimetheus (watch mode) │
+│ │ │ • Parse tabular CSV │
+│ File mtime = │ │ • Numeric columns → metrics │
+│ sample time │ │ • String columns → labels │
+└─────────────────┘ │ • Optional DNS resolution (IPs) │
+ └─────────────────────────────────────┘
+ │
+ ┌────────────────────┼────────────────────┐
+ │ │ │
+ ▼ ▼ │
+ ┌───────────────┐ ┌───────────────┐ │
+ │ Prometheus │ │ ClickHouse │ │
+ │ (optional) │ │ (optional) │ │
+ │ Remote Write │ │ HTTP insert │ │
+ │ /api/v1/write│ │ (batched) │ │
+ └───────────────┘ └───────────────┘ │
+```
+
+At least one of `-prometheus` or `-clickhouse` must be set. See [Operating Modes](../guides/modes.md) and [ClickHouse backend](../backends/clickhouse.md).
+
+## When to use Pushgateway vs Remote Write
+
+**Use Pushgateway (realtime mode):**
+
+- Short-lived batch jobs
+- Service-level metrics
+- Jobs behind firewalls
+- Current/recent data (< 5 minutes old)
+
+**Use Remote Write (historic, backfill, watch, or auto with old data):**
+
+- Historic data import
+- Backfilling gaps
+- Data migration
+- Data older than 5 minutes
+- Watch mode (to preserve file mtime as timestamp)
+
+**Use Auto mode:**
+
+- Mixed current and historic data in one file
+- Unknown timestamp ages
+- General-purpose file import
+
+## When to use which backend
+
+- **Prometheus or VictoriaMetrics:** Set `-prometheus=` to the backend’s Remote Write URL. Use for realtime (via Pushgateway scraped by Prometheus/VM), historic, backfill, auto, and watch.
+- **ClickHouse:** Set `-clickhouse=` in watch mode for analytics/long-term storage. Can be used together with Prometheus or alone (with `-prometheus=` empty).
+
+## Metric design (best practices)
+
+- **Types:** Counter for cumulative values (requests, errors); Gauge for point-in-time (temperature, connections); Histogram for distributions (latency).
+- **Labels:** Meaningful labels; avoid high cardinality (user IDs, raw timestamps); keep combinations reasonable (< 1000 per metric).
+- **Naming:** Descriptive names; units in gauge names (e.g. `_celsius`, `_bytes`); `_total` suffix for counters.
diff --git a/docs/guides/csv-format-flexibility.md b/docs/guides/csv-format-flexibility.md
new file mode 100644
index 0000000..180dc28
--- /dev/null
+++ b/docs/guides/csv-format-flexibility.md
@@ -0,0 +1,52 @@
+# CSV Format Flexibility
+
+Watch mode works with **any tabular CSV**. You do not need a fixed schema; Epimetheus infers metric names and labels from column headers and value types.
+
+## How It Works
+
+- **First row:** Column headers (automatically sanitized for Prometheus label/metric names).
+- **Numeric columns:** Treated as metric values. Each gets a metric name derived from the base metric name and the column header.
+- **String columns:** Treated as labels. Each row’s value becomes the label value for that series.
+- **Metric name:** Set with `-metric-name` (e.g. `web`, `food`, `network`). It is used as a prefix for all numeric columns.
+
+Column names can contain characters that are invalid in Prometheus (e.g. parentheses, spaces). They are sanitized: for example `min(potatoes)` becomes a valid metric suffix like `min_potatoes`.
+
+## Examples
+
+### Web metrics
+
+```csv
+avg(response_time),p99(latency),endpoint,method
+45.2,120.5,/api/users,GET
+52.1,135.8,/api/orders,POST
+```
+
+With `-metric-name=web` this produces series such as:
+
+- `web_avg_response_time{endpoint="/api/users",method="GET"} 45.2`
+- `web_p99_latency{endpoint="/api/users",method="GET"} 120.5`
+- `web_avg_response_time{endpoint="/api/orders",method="POST"} 52.1`
+- `web_p99_latency{endpoint="/api/orders",method="POST"} 135.8`
+
+### Food / business metrics
+
+```csv
+min(potatoes),last(coke),avg(price),country,store_type
+5.2,10.5,12.99,USA,grocery
+3.8,8.2,9.99,Canada,convenience
+```
+
+With `-metric-name=food` this produces series such as:
+
+- `food_min_potatoes{country="USA",store_type="grocery"} 5.2`
+- `food_last_coke{country="USA",store_type="grocery"} 10.5`
+- `food_avg_price{country="USA",store_type="grocery"} 12.99`
+- and the same metrics with `country="Canada",store_type="convenience"`.
+
+### Summary
+
+- Each **row** becomes one or more samples (one per numeric column).
+- **Numeric columns** → different metrics (same labels for that row).
+- **String columns** → labels shared by all those metrics for that row.
+
+For the standard Epimetheus CSV format (explicit metric name, labels, value, timestamp) see [Data Formats](data-formats.md). For modes and watch options see [Operating Modes](modes.md).
diff --git a/docs/guides/data-formats.md b/docs/guides/data-formats.md
new file mode 100644
index 0000000..24d7755
--- /dev/null
+++ b/docs/guides/data-formats.md
@@ -0,0 +1,49 @@
+# Data Formats
+
+Epimetheus accepts CSV and JSON input in **auto mode**. **Watch mode** instead uses tabular CSV; see [CSV Format Flexibility](csv-format-flexibility.md).
+
+## Epimetheus CSV (auto mode)
+
+Format: one metric per line with explicit metric name, labels, value, and optional timestamp.
+
+```csv
+# Format: metric_name,labels,value,timestamp_ms
+# Labels: key1=value1;key2=value2
+epimetheus_test_requests_total,instance=web1;env=prod,100,1767125148000
+epimetheus_test_temperature_celsius,instance=web2,22.5,1767038748000
+
+# Timestamp optional (uses "now" if omitted)
+epimetheus_test_active_connections,instance=web3,42,
+```
+
+- **metric_name** – Prometheus metric name.
+- **labels** – Semicolon-separated `key=value` pairs.
+- **value** – Numeric value.
+- **timestamp_ms** – Unix milliseconds. Omit or leave empty for "now".
+
+## JSON (auto mode)
+
+Array of objects with `metric`, `labels`, `value`, and optional `timestamp_ms`:
+
+```json
+[
+ {
+ "metric": "epimetheus_test_requests_total",
+ "labels": {"instance": "web1", "env": "prod"},
+ "value": 100,
+ "timestamp_ms": 1767125148000
+ },
+ {
+ "metric": "epimetheus_test_temperature_celsius",
+ "labels": {"instance": "web2"},
+ "value": 22.5,
+ "timestamp_ms": 1767038748000
+ }
+]
+```
+
+Omit `timestamp_ms` for "now".
+
+## Watch mode CSV
+
+Watch mode uses **tabular CSV**: first row = headers, following rows = data. Numeric columns become metrics (with `-metric-name` as prefix), string columns become labels. See [CSV Format Flexibility](csv-format-flexibility.md).
diff --git a/docs/guides/dns-resolution.md b/docs/guides/dns-resolution.md
new file mode 100644
index 0000000..42478a2
--- /dev/null
+++ b/docs/guides/dns-resolution.md
@@ -0,0 +1,42 @@
+# DNS Resolution (Watch Mode)
+
+In watch mode, Epimetheus can resolve IP addresses in label values to hostnames. This improves readability in Grafana and other tools that display label values.
+
+## Default Behaviour
+
+- The label **`ip`** is always resolved by default (when present).
+- Resolution is done via reverse DNS. The result is used as the label value (e.g. `10.50.52.61` → `foo.example.lan`).
+- Failed lookups leave the original IP unchanged.
+- Results are cached in memory to avoid repeated DNS lookups.
+
+## Additional Labels
+
+To resolve other IP-carrying labels, use `-resolve-ip-labels` with a comma-separated list of label names:
+
+```bash
+./epimetheus -mode=watch \
+ -file=network.csv \
+ -metric-name=network \
+ -resolve-ip-labels=source_ip,dest_ip
+```
+
+This resolves:
+
+- `ip` (always, if present)
+- `source_ip`
+- `dest_ip`
+
+Duplicate or empty entries (e.g. listing `ip` again) are ignored.
+
+## Example
+
+- **Input label:** `ip="10.50.52.61"`
+- **After resolution:** `ip="foo.example.lan"` (if reverse DNS returns that name)
+- **If resolution fails:** `ip="10.50.52.61"` (unchanged)
+
+## When to Use
+
+- CSV columns that contain IPs and are used as labels (e.g. `ip`, `host`, `source_ip`, `dest_ip`).
+- When you want dashboards to show hostnames instead of raw IPs.
+
+DNS resolution only applies in **watch mode**. Other modes do not use this feature. See [Operating Modes](modes.md) and [CLI Reference](../reference/cli.md) for full options.
diff --git a/docs/guides/dtail-metrics-example.md b/docs/guides/dtail-metrics-example.md
new file mode 100644
index 0000000..5416726
--- /dev/null
+++ b/docs/guides/dtail-metrics-example.md
@@ -0,0 +1,49 @@
+# Dtail Metrics Example
+
+This page walks through using Epimetheus watch mode with a CSV that could come from a tool like [Dtail](https://dtail.dev/) or any similar log/aggregation export.
+
+## Scenario
+
+You have a CSV file (e.g. `dtail.csv`) with columns that mix numeric stats and identifiers (host, service, etc.). You want to turn those into Prometheus metrics so you can graph them in Grafana.
+
+## Steps
+
+1. **Ensure the CSV has a header row**
+ First line = column names. Epimetheus will sanitize them for use as metric names and labels.
+
+2. **Identify numeric vs string columns**
+ - Numeric columns (e.g. `count`, `avg_latency_ms`, `p99`) become metric values.
+ - String columns (e.g. `host`, `service`, `region`) become labels.
+
+3. **Run watch mode** with a base metric name and your Prometheus (or Prometheus-compatible) write URL:
+
+ ```bash
+ ./epimetheus -mode=watch \
+ -file=dtail.csv \
+ -metric-name=dtail \
+ -prometheus=http://localhost:9090/api/v1/write
+ ```
+
+4. **Optional: resolve IPs to hostnames**
+ If one of your label columns contains IPs (e.g. `host` or `ip`), you can resolve them:
+
+ ```bash
+ ./epimetheus -mode=watch \
+ -file=dtail.csv \
+ -metric-name=dtail \
+ -prometheus=http://localhost:9090/api/v1/write \
+ -resolve-ip-labels=host
+ ```
+
+5. **Query in Prometheus / Grafana**
+ Metrics will appear as `dtail_<column_name>` with your string columns as labels, e.g.:
+
+ ```promql
+ dtail_avg_latency_ms{service="api", region="eu"}
+ ```
+
+## References
+
+- [CSV Format Flexibility](csv-format-flexibility.md) – how column types and names are interpreted.
+- [DNS Resolution](dns-resolution.md) – IP-to-hostname resolution.
+- [Operating Modes](modes.md) – all watch mode options.
diff --git a/docs/guides/modes.md b/docs/guides/modes.md
new file mode 100644
index 0000000..bcbbb6b
--- /dev/null
+++ b/docs/guides/modes.md
@@ -0,0 +1,130 @@
+# Operating Modes
+
+Epimetheus has five modes. Backend support:
+
+| Mode | Prometheus (Pushgateway) | Prometheus (Remote Write) | ClickHouse |
+|-----------|--------------------------|---------------------------|------------|
+| Realtime | Yes | No | No |
+| Historic | No | Yes | No |
+| Backfill | No | Yes | No |
+| Auto | Yes (samples &lt; 5 min) | Yes (samples ≥ 5 min) | No |
+| Watch | Optional | Optional | Optional |
+
+At least one of Prometheus or ClickHouse must be configured for watch mode.
+
+---
+
+## Watch Mode
+
+Monitor CSV files and push metrics using file modification time as the timestamp. Works with any tabular CSV; numeric columns become metrics, string columns become labels.
+
+### Watch mode data flow
+
+```
+┌─────────────────┐ poll (1s) ┌─────────────────────────────────────┐
+│ CSV file(s) │ ─────────────────▶ │ Epimetheus (watch mode) │
+│ │ │ • Parse tabular CSV │
+│ File mtime = │ │ • Numeric columns → metrics │
+│ sample time │ │ • String columns → labels │
+└─────────────────┘ │ • Optional DNS resolution (IPs) │
+ └─────────────────────────────────────┘
+ │
+ ┌────────────────────┼────────────────────┐
+ │ │ │
+ ▼ ▼ │
+ ┌───────────────┐ ┌───────────────┐ │
+ │ Prometheus │ │ ClickHouse │ │
+ │ (optional) │ │ (optional) │ │
+ │ │ │ │ │
+ │ Remote Write │ │ HTTP insert │ │
+ │ /api/v1/write│ │ (batched) │ │
+ └───────────────┘ └───────────────┘ │
+ │ │ │
+ └────────────────────┴────────────────────┘
+ At least one of -prometheus or -clickhouse
+```
+
+```bash
+./epimetheus -mode=watch -file=mydata.csv -metric-name=myapp \
+ -prometheus=http://localhost:9090/api/v1/write
+```
+
+**Options:** `-file`, `-metric-name`, `-prometheus`, `-clickhouse`, `-clickhouse-table`, `-job`, `-resolve-ip-labels`. See [CLI Reference](../reference/cli.md).
+
+**Features:** Format-agnostic CSV, automatic numeric/string detection, label name sanitization, optional DNS resolution for IP labels, timestamp from file mtime, continuous polling (1s), Remote Write (and optionally ClickHouse). See [CSV Format Flexibility](csv-format-flexibility.md) and [DNS Resolution](dns-resolution.md).
+
+---
+
+## Realtime Mode (default)
+
+Push current metrics to Pushgateway with "now" timestamp.
+
+```bash
+./epimetheus -mode=realtime -continuous
+```
+
+**Options:** `-pushgateway` (default `http://localhost:9091`), `-job`, `-continuous`. Pushes every 15 seconds when `-continuous` is set.
+
+---
+
+## Historic Mode
+
+Push a single historic datapoint via Remote Write.
+
+```bash
+./epimetheus -mode=historic -hours-ago=24
+```
+
+**Options:** `-prometheus` (default `http://localhost:9090/api/v1/write`), `-hours-ago` (default 24). Requires Remote Write receiver. See [Backends: Prometheus](../backends/prometheus.md).
+
+---
+
+## Backfill Mode
+
+Import a range of historic data points.
+
+```bash
+./epimetheus -mode=backfill -start-hours=48 -end-hours=0 -interval=1
+./epimetheus -mode=backfill -start-hours=168 -end-hours=0 -interval=6
+```
+
+**Options:** `-start-hours`, `-end-hours` (0 = now), `-interval` (hours between points). Requires Remote Write receiver.
+
+---
+
+## Auto Mode
+
+Route samples by timestamp age: &lt; 5 minutes → Pushgateway; ≥ 5 minutes → Remote Write. Use for mixed or unknown-age data.
+
+### Auto mode data flow
+
+```
+┌─────────────────┐ ┌─────────────────────────────────────┐
+│ CSV/JSON file │ ─────────────────▶ │ Epimetheus (auto mode) │
+│ (per-sample │ │ • Parse file (csv or json) │
+│ timestamps) │ │ • Route by sample age: │
+└─────────────────┘ │ < 5 min → Pushgateway │
+ │ ≥ 5 min → Remote Write │
+ └─────────────────────────────────────┘
+ │
+ ┌────────────────────┴────────────────────┐
+ ▼ ▼ │
+ ┌───────────────┐ ┌───────────────┐ │
+ │ Pushgateway │ │ Prometheus │ │
+ │ (realtime │ │ Remote Write │ │
+ │ samples) │ │ (historic │ │
+ └───────┬───────┘ │ samples) │ │
+ │ └───────────────┘ │
+ │ Scraped by Prometheus │
+ ▼ │
+ ┌───────────────┐ │
+ │ Prometheus │◀──────────────────────────────────┘
+ └───────────────┘
+```
+
+```bash
+./scripts/generate-test-data.sh
+./epimetheus -mode=auto -file=test-all-ages.csv
+```
+
+**Options:** `-file`, `-format` (csv or json), `-pushgateway`, `-prometheus`. See [Data Formats](data-formats.md).
diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md
new file mode 100644
index 0000000..adeea2b
--- /dev/null
+++ b/docs/guides/quickstart.md
@@ -0,0 +1,56 @@
+# Quick Start
+
+Minimal path to push metrics and see them in Prometheus or ClickHouse.
+
+## 1. Build
+
+```bash
+go build -o epimetheus cmd/epimetheus/main.go
+# Or: mage build
+```
+
+## 2. Run (Prometheus path)
+
+**Realtime mode** (Pushgateway + Prometheus):
+
+1. Deploy and expose Pushgateway (see [Kubernetes](../operations/kubernetes.md) or run Pushgateway locally).
+2. Ensure Prometheus scrapes Pushgateway (see [Setup: Prometheus](../operations/setup-prometheus.md)).
+3. Port-forward if needed, then run:
+
+```bash
+kubectl port-forward -n monitoring svc/pushgateway 9091:9091 &
+./epimetheus -mode=realtime -continuous
+```
+
+Metrics are pushed every 15 seconds. Stop with Ctrl+C.
+
+**Watch mode** (Remote Write; preserves timestamps):
+
+1. Enable the Prometheus Remote Write receiver (see [Setup: Prometheus](../operations/setup-prometheus.md)).
+2. Port-forward Prometheus, then run:
+
+```bash
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 &
+./epimetheus -mode=watch -file=mydata.csv -metric-name=myapp \
+ -prometheus=http://localhost:9090/api/v1/write
+```
+
+## 3. View
+
+- **Pushgateway:** http://localhost:9091
+- **Prometheus:** http://localhost:9090 (e.g. query `epimetheus_test_requests_total` or your metric name)
+- **Grafana:** Add Prometheus as a datasource and import the Epimetheus dashboard (see [Grafana Dashboard](../reference/grafana-dashboard.md)).
+
+## ClickHouse path (watch only)
+
+1. Run ClickHouse (e.g. `sudo systemctl start clickhouse-server` or Docker). See [Setup: ClickHouse](../operations/setup-clickhouse.md).
+2. Run watch mode with ClickHouse:
+
+```bash
+./epimetheus -mode=watch -file=test-data/watch-clickhouse-test.csv \
+ -metric-name=watch_test -clickhouse=http://localhost:8123 -prometheus=
+```
+
+3. Verify: `./scripts/verify-clickhouse.sh`
+
+For all modes and options see [Operating Modes](modes.md) and [CLI Reference](../reference/cli.md).
diff --git a/docs/operations/cleanup.md b/docs/operations/cleanup.md
new file mode 100644
index 0000000..7835b21
--- /dev/null
+++ b/docs/operations/cleanup.md
@@ -0,0 +1,48 @@
+# Cleanup
+
+## Benchmark data in Prometheus
+
+To remove benchmark metrics from Prometheus, use the provided script:
+
+```bash
+# Port-forward to Prometheus if needed
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 &
+
+./scripts/cleanup-benchmark-data.sh
+```
+
+The script deletes all `epimetheus_benchmark_*` series via the Admin API and runs `clean_tombstones`.
+
+**Manual deletion:**
+
+```bash
+# Delete a specific metric
+curl -X POST 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]=epimetheus_benchmark_cpu_usage'
+
+# Clean tombstones
+curl -X POST http://localhost:9090/api/v1/admin/tsdb/clean_tombstones
+```
+
+The Admin API must be enabled on Prometheus (see [Setup: Prometheus](setup-prometheus.md)).
+
+## Other cleanup
+
+**Stop port-forwards:**
+
+```bash
+pkill -f "port-forward.*9091"
+pkill -f "port-forward.*9090"
+pkill -f "port-forward.*3000"
+```
+
+**Remove test metrics from Pushgateway:**
+
+```bash
+curl -X DELETE http://localhost:9091/metrics/job/example_metrics_pusher
+```
+
+**Uninstall Pushgateway (Helm):**
+
+```bash
+helm uninstall pushgateway -n monitoring
+```
diff --git a/docs/operations/kubernetes.md b/docs/operations/kubernetes.md
new file mode 100644
index 0000000..20b8b07
--- /dev/null
+++ b/docs/operations/kubernetes.md
@@ -0,0 +1,51 @@
+# Kubernetes
+
+Common tasks when running Epimetheus against Prometheus, Pushgateway, and Grafana in Kubernetes.
+
+## Port-forwards
+
+To run Epimetheus on your laptop against cluster services:
+
+```bash
+# Pushgateway (realtime mode)
+kubectl port-forward -n monitoring svc/pushgateway 9091:9091 &
+
+# Prometheus (historic/watch, queries)
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 &
+
+# Grafana (dashboards)
+kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80
+```
+
+Then use `http://localhost:9091`, `http://localhost:9090`, and `http://localhost:3000` in Epimetheus flags and in the browser. Adjust service names and namespaces to match your cluster (e.g. `prometheus-kube-prometheus-prometheus` for kube-prometheus-stack).
+
+## Deploying Pushgateway
+
+Example using the official Helm chart:
+
+```bash
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm install pushgateway prometheus-community/prometheus-pushgateway -n monitoring --create-namespace
+```
+
+Alternatively use your own chart (e.g. from the [conf repository](https://codeberg.org/snonux/conf) at `f3s/pushgateway/helm-chart`).
+
+## Deploying the Epimetheus Grafana dashboard
+
+**ConfigMap (recommended):** If you have a manifest that creates a ConfigMap with the dashboard JSON and the Grafana label for auto-discovery:
+
+```bash
+kubectl apply -f ../prometheus/epimetheus-dashboard.yaml
+```
+
+**Script:** From the repo, with Grafana reachable (e.g. after port-forward):
+
+```bash
+./scripts/deploy-dashboard.sh
+# Or with credentials:
+GRAFANA_URL="http://localhost:3000" GRAFANA_USER="admin" GRAFANA_PASSWORD="yourpassword" ./scripts/deploy-dashboard.sh
+```
+
+## Namespace and service names
+
+Replace `monitoring` and the Prometheus/Pushgateway/Grafana service names with whatever your Helm release or manifests use. Epimetheus only needs the URLs; it does not need to run inside the cluster.
diff --git a/docs/operations/macos-setup.md b/docs/operations/macos-setup.md
new file mode 100644
index 0000000..8ed47c9
--- /dev/null
+++ b/docs/operations/macos-setup.md
@@ -0,0 +1,91 @@
+# macOS Setup
+
+## Basic installation
+
+```bash
+brew install prometheus
+brew install grafana
+go install github.com/prometheus/pushgateway@latest
+brew services start grafana
+brew services start prometheus
+~/go/bin/pushgateway &
+```
+
+Log in to Grafana at http://localhost:3000 (default admin:admin; you will be prompted to change the password). Add http://localhost:9090 as a Prometheus datasource.
+
+## Enable Remote Write receiver (required for watch/historic/backfill/auto)
+
+Watch mode, historic mode, backfill mode, and auto mode with old data require the Prometheus Remote Write receiver.
+
+### Option 1: Permanent configuration
+
+Edit the Prometheus arguments file (Homebrew example):
+
+```bash
+nano /opt/homebrew/etc/prometheus.args
+```
+
+Add at the end:
+
+```
+--web.enable-remote-write-receiver
+--web.enable-admin-api
+```
+
+Example full file:
+
+```
+--config.file /opt/homebrew/etc/prometheus.yml
+--web.listen-address=127.0.0.1:9090
+--storage.tsdb.path /opt/homebrew/var/prometheus
+--web.enable-remote-write-receiver
+--web.enable-admin-api
+```
+
+Restart Prometheus:
+
+```bash
+brew services restart prometheus
+```
+
+Verify:
+
+```bash
+curl http://localhost:9090/-/healthy
+curl -X POST http://localhost:9090/api/v1/write # expect 400, not 404
+```
+
+### Option 2: Temporary (testing only)
+
+```bash
+brew services stop prometheus
+prometheus --web.enable-remote-write-receiver
+```
+
+Keep that terminal open; use another for Epimetheus. This stops when you close the terminal.
+
+## Clearing old metrics (optional)
+
+If the Admin API is enabled:
+
+```bash
+# Delete metrics by name pattern
+curl -X POST -g 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]={__name__=~"blockstore_.*"}'
+curl -X POST http://localhost:9090/api/v1/admin/tsdb/clean_tombstones
+sleep 2
+```
+
+## Verify watch mode
+
+```bash
+cat > /tmp/test.csv << EOF
+status,count,method
+200,100,GET
+404,50,POST
+EOF
+
+./epimetheus -mode=watch -file=/tmp/test.csv -metric-name=test \
+ -prometheus=http://localhost:9090/api/v1/write
+```
+
+You should see a success message. In Prometheus (http://localhost:9090), query `{__name__=~"test_.*"}`.
diff --git a/docs/operations/setup-clickhouse.md b/docs/operations/setup-clickhouse.md
new file mode 100644
index 0000000..acc8247
--- /dev/null
+++ b/docs/operations/setup-clickhouse.md
@@ -0,0 +1,43 @@
+# Setup: ClickHouse
+
+ClickHouse is only used in **watch mode**. Epimetheus creates the metrics table automatically if it does not exist.
+
+## Running ClickHouse
+
+- **Linux (systemd):** `sudo systemctl start clickhouse-server`
+- **Docker:** Use the official [ClickHouse image](https://hub.docker.com/r/clickhouse/clickhouse-server) and expose the HTTP interface (default port 8123).
+- **Kubernetes:** Deploy ClickHouse and expose a Service; use the HTTP URL (e.g. `http://clickhouse.monitoring.svc.cluster.local:8123`) as `-clickhouse`.
+
+Default HTTP port is **8123**. Epimetheus uses the HTTP interface, not the native protocol.
+
+## Table Creation
+
+You do not need to create the table manually. On first ingest, Epimetheus runs:
+
+```sql
+CREATE TABLE IF NOT EXISTS epimetheus_metrics (
+ metric String,
+ labels Map(String, String),
+ value Float64,
+ timestamp DateTime64(3)
+) ENGINE = MergeTree()
+ORDER BY (metric, timestamp)
+```
+
+To use a different table name, set `-clickhouse-table`.
+
+## Verification
+
+After running watch mode with `-clickhouse` set, verify ingestion:
+
+```bash
+./scripts/verify-clickhouse.sh
+```
+
+With custom URL or table:
+
+```bash
+./scripts/verify-clickhouse.sh http://localhost:8123 epimetheus_metrics
+```
+
+The script checks connectivity (`/ping`), row count, distinct metrics, sample rows, and rows per metric. If the table is empty or missing, it prints a reminder command to run Epimetheus in watch mode with `-clickhouse`. See [ClickHouse backend](../backends/clickhouse.md) for usage.
diff --git a/docs/operations/setup-prometheus.md b/docs/operations/setup-prometheus.md
new file mode 100644
index 0000000..294ce20
--- /dev/null
+++ b/docs/operations/setup-prometheus.md
@@ -0,0 +1,82 @@
+# Setup: Prometheus
+
+To use historic mode, backfill mode, auto mode with old data, or watch mode with `-prometheus`, you must enable the Prometheus Remote Write receiver. Without it, Epimetheus can only push realtime data via Pushgateway.
+
+## 1. Enable Remote Write Receiver and Admin API
+
+Example configuration (Prometheus 3.x style). Adjust paths and stack to match your environment (e.g. [conf repository](https://codeberg.org/snonux/conf) at `f3s/prometheus/persistence-values.yaml`):
+
+```yaml
+prometheus:
+ prometheusSpec:
+ additionalArgs:
+ - name: web.enable-remote-write-receiver
+ value: ""
+ - name: web.enable-admin-api
+ value: ""
+
+ enableFeatures:
+ - exemplar-storage
+ - otlp-write-receiver
+
+ tsdb:
+ outOfOrderTimeWindow: 744h # 31 days for backfilling
+```
+
+This provides:
+
+- **Remote Write API** at `/api/v1/write` for ingesting metrics with custom timestamps.
+- **Admin API** at `/api/v1/admin/tsdb/*` for deleting series and cleaning tombstones.
+- **Out-of-order ingestion** so older points can be written for existing series (within the time window).
+
+After changing config, upgrade Prometheus (e.g. `helm upgrade` or your usual apply).
+
+### Verify
+
+```bash
+# Remote Write receiver
+kubectl get pod -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 \
+ -o jsonpath='{.spec.containers[0].args}' | grep -o "web.enable-remote-write-receiver"
+
+# Out-of-order window
+kubectl get prometheus -n monitoring prometheus-kube-prometheus-prometheus \
+ -o jsonpath='{.spec.tsdb.outOfOrderTimeWindow}'
+
+# Admin API
+kubectl get pod -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 \
+ -o jsonpath='{.spec.containers[0].args}' | grep -o "web.enable-admin-api"
+```
+
+**Note:** In Prometheus 3.x use `additionalArgs` for `web.enable-remote-write-receiver`; the older `enableFeatures: [remote-write-receiver]` is deprecated.
+
+## 2. Scrape Config for Pushgateway
+
+For realtime mode, Prometheus must scrape Pushgateway. Example:
+
+```yaml
+# additional-scrape-configs.yaml
+- job_name: 'pushgateway'
+ honor_labels: true
+ static_configs:
+ - targets:
+ - 'pushgateway.monitoring.svc.cluster.local:9091'
+```
+
+Apply as a Secret (example):
+
+```bash
+kubectl create secret generic additional-scrape-configs \
+ --from-file=additional-scrape-configs.yaml \
+ --dry-run=client -o yaml -n monitoring | kubectl apply -f -
+```
+
+## 3. Retention
+
+Check retention so you know how far back Epimetheus can write:
+
+```bash
+kubectl get prometheus -n monitoring prometheus-kube-prometheus-prometheus \
+ -o jsonpath='{.spec.retention}'
+```
+
+For very old data, increase retention or use a dedicated dev/test Prometheus. Enabling out-of-order ingestion and a large `outOfOrderTimeWindow` has memory and I/O trade-offs; see [Prometheus backend](../backends/prometheus.md) and keep production config conservative.
diff --git a/docs/operations/troubleshooting.md b/docs/operations/troubleshooting.md
new file mode 100644
index 0000000..9446508
--- /dev/null
+++ b/docs/operations/troubleshooting.md
@@ -0,0 +1,43 @@
+# Troubleshooting
+
+## Binary can't connect to Pushgateway
+
+- Confirm a port-forward or route to Pushgateway is running, e.g. `ps aux | grep "port-forward.*9091"`.
+- Restart port-forward: `kubectl port-forward -n monitoring svc/pushgateway 9091:9091`.
+- Ensure `-pushgateway` points at the URL you use (e.g. `http://localhost:9091`).
+
+## Metrics not appearing in Prometheus
+
+- **Pushgateway:** `curl http://localhost:9091/metrics | grep "epimetheus_test"` (adjust the pattern to your job/metric name). If the output is empty, Epimetheus may not be pushing or the job name may differ.
+- **Scrape:** In Prometheus UI (e.g. http://localhost:9090/targets), check that the Pushgateway job exists and is up.
+- **Logs:** `kubectl logs -n monitoring -l app.kubernetes.io/name=prometheus` (or your Prometheus pod) for scrape/remote-write errors.
+
+## "Remote write receiver not enabled" error
+
+Prometheus must be started with the Remote Write receiver enabled. Verify:
+
+```bash
+kubectl logs -n monitoring prometheus-prometheus-kube-prometheus-prometheus-0 | grep "remote-write-receiver"
+```
+
+You should see the feature listed in the enabled features. If not, add `web.enable-remote-write-receiver` (see [Setup: Prometheus](setup-prometheus.md)) and restart Prometheus.
+
+## "Out of order sample" error
+
+You are writing a sample older than existing data for the same series.
+
+- Use different labels for historic data (e.g. `job="historic_data"`), or
+- Enable out-of-order ingestion on Prometheus and set `tsdb.outOfOrderTimeWindow` (see [Setup: Prometheus](setup-prometheus.md)), or
+- Run backfills from oldest to newest.
+
+## Dashboard not appearing in Grafana
+
+- Check the dashboard ConfigMap exists: `kubectl get configmap -n monitoring | grep epimetheus`.
+- Ensure the ConfigMap has the label Grafana uses for dashboard discovery (e.g. `grafana_dashboard: "1"`): `kubectl get configmap epimetheus-dashboard -n monitoring -o yaml | grep "grafana_dashboard"`.
+- Restart Grafana to reload dashboards: `kubectl rollout restart deployment/prometheus-grafana -n monitoring` (adjust deployment name to your setup).
+
+## ClickHouse connection failed
+
+- Ensure ClickHouse is listening on HTTP (default port 8123): `curl -sS http://localhost:8123/ping`.
+- If using Kubernetes, check Service and port-forwards. Use the same URL as `-clickhouse`.
+- See [Setup: ClickHouse](setup-clickhouse.md) and [ClickHouse backend](../backends/clickhouse.md).
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
new file mode 100644
index 0000000..83d02b0
--- /dev/null
+++ b/docs/reference/cli.md
@@ -0,0 +1,57 @@
+# CLI Reference
+
+All flags and defaults. Modes: `realtime`, `historic`, `backfill`, `auto`, `watch`.
+
+## Global
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `-version` | — | Print version and exit |
+| `-mode` | `realtime` | Mode: realtime, historic, backfill, auto, or watch |
+
+## Realtime
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `-pushgateway` | `http://localhost:9091` | Pushgateway URL |
+| `-job` | `example_metrics_pusher` | Job name for metrics |
+| `-continuous` | `false` | Push every 15s |
+
+## Historic
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `-prometheus` | `http://localhost:9090/api/v1/write` | Prometheus Remote Write URL |
+| `-hours-ago` | `24` | Hours in the past (single datapoint) |
+
+## Backfill
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `-prometheus` | `http://localhost:9090/api/v1/write` | Prometheus Remote Write URL |
+| `-start-hours` | `48` | Start time in hours ago |
+| `-end-hours` | `0` | End time in hours ago (0 = now) |
+| `-interval` | `1` | Interval between points in hours |
+
+## Auto
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `-file` | — | Input file path (required) |
+| `-format` | `csv` | Input format: csv or json |
+| `-pushgateway` | `http://localhost:9091` | Pushgateway URL |
+| `-prometheus` | `http://localhost:9090/api/v1/write` | Prometheus Remote Write URL |
+
+## Watch
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `-file` | — | CSV file(s) to watch (comma-separated for multiple); required |
+| `-metric-name` | — | Base metric name (e.g. myapp, food); required |
+| `-prometheus` | `http://localhost:9090/api/v1/write` | Prometheus Remote Write URL (set to empty to disable) |
+| `-clickhouse` | — | ClickHouse HTTP URL (e.g. http://localhost:8123) |
+| `-clickhouse-table` | `epimetheus_metrics` | ClickHouse table name |
+| `-job` | `example_metrics_pusher` | Job name for metrics |
+| `-resolve-ip-labels` | (ip only) | Comma-separated additional IP labels to resolve via DNS |
+
+Watch mode requires at least one of `-prometheus` or `-clickhouse`. Use `-prometheus=` to ingest only to ClickHouse.
diff --git a/docs/reference/example-queries.md b/docs/reference/example-queries.md
new file mode 100644
index 0000000..e78aaec
--- /dev/null
+++ b/docs/reference/example-queries.md
@@ -0,0 +1,66 @@
+# Example Queries
+
+PromQL and curl examples for Epimetheus test metrics. Use your Prometheus (or Prometheus-compatible) query URL; after port-forward, that is often http://localhost:9090.
+
+## Basic PromQL
+
+```promql
+# Total requests
+epimetheus_test_requests_total
+
+# Request rate (last 5 minutes)
+rate(epimetheus_test_requests_total[5m])
+
+# Active connections
+epimetheus_test_active_connections
+
+# Temperature
+epimetheus_test_temperature_celsius
+```
+
+## Histogram
+
+```promql
+# 95th percentile request duration
+histogram_quantile(0.95, rate(epimetheus_test_request_duration_seconds_bucket[5m]))
+
+# Median (50th percentile)
+histogram_quantile(0.50, rate(epimetheus_test_request_duration_seconds_bucket[5m]))
+
+# Average request duration
+rate(epimetheus_test_request_duration_seconds_sum[5m]) /
+rate(epimetheus_test_request_duration_seconds_count[5m])
+```
+
+## Labeled counter
+
+```promql
+# Failed jobs by type
+epimetheus_test_jobs_processed_total{status="failed"}
+
+# Job success rate
+rate(epimetheus_test_jobs_processed_total{status="success"}[5m]) /
+rate(epimetheus_test_jobs_processed_total[5m])
+
+# Total jobs by type
+sum by (job_type) (epimetheus_test_jobs_processed_total)
+```
+
+## Curl (HTTP API)
+
+```bash
+# Port-forward if needed
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 &
+
+# Total requests
+curl -s "http://localhost:9090/api/v1/query?query=epimetheus_test_requests_total" | jq .
+
+# Temperature
+curl -s "http://localhost:9090/api/v1/query?query=epimetheus_test_temperature_celsius" | jq .
+
+# Request rate
+curl -s "http://localhost:9090/api/v1/query?query=rate(epimetheus_test_requests_total[5m])" | jq .
+
+# Histogram p95
+curl -s "http://localhost:9090/api/v1/query?query=histogram_quantile(0.95,rate(epimetheus_test_request_duration_seconds_bucket[5m]))" | jq .
+```
diff --git a/docs/reference/grafana-dashboard.md b/docs/reference/grafana-dashboard.md
new file mode 100644
index 0000000..b7f2030
--- /dev/null
+++ b/docs/reference/grafana-dashboard.md
@@ -0,0 +1,50 @@
+# Grafana Dashboard
+
+A dashboard is provided that shows all Epimetheus test metrics.
+
+## Panels
+
+1. Request Rate (line graph)
+2. Total Requests (stat)
+3. Active Connections (gauge with thresholds)
+4. Temperature (gauge with thresholds)
+5. Request Duration Histogram (p50, p90, p99)
+6. Average Request Duration (stat)
+7. Jobs Processed by Type (bar gauge)
+8. Jobs Status Breakdown (table)
+
+Auto-refresh: 10 seconds. Time range: last 15 minutes (configurable). Optimized for dark theme.
+
+## Deployment
+
+### Option 1: Kubernetes ConfigMap (recommended)
+
+If you have a manifest that defines the dashboard as a ConfigMap with Grafana’s discovery label:
+
+```bash
+kubectl apply -f ../prometheus/epimetheus-dashboard.yaml
+```
+
+Grafana will pick it up automatically.
+
+### Option 2: Manual import
+
+1. Port-forward Grafana: `kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80`
+2. Open http://localhost:3000
+3. Dashboards → Import → Upload `grafana-dashboard.json`
+
+### Option 3: Deploy script
+
+```bash
+./scripts/deploy-dashboard.sh
+# Or with credentials:
+GRAFANA_URL="http://localhost:3000" GRAFANA_USER="admin" GRAFANA_PASSWORD="yourpassword" ./scripts/deploy-dashboard.sh
+```
+
+## Datasource
+
+Use Prometheus (or a Prometheus-compatible backend such as VictoriaMetrics) as the datasource. Point it at the same instance Epimetheus writes to (e.g. http://localhost:9090 after port-forward).
+
+## Panel guidelines
+
+When creating or updating Grafana panels, follow the project’s [AGENT.md](../../AGENT.md) (Grafana dashboard guidelines): e.g. sort time series by last value descending, use `sort_desc()` in bar gauges, set table sort options as specified.
diff --git a/docs/reference/magefile.md b/docs/reference/magefile.md
new file mode 100644
index 0000000..0ce0b0d
--- /dev/null
+++ b/docs/reference/magefile.md
@@ -0,0 +1,67 @@
+# Magefile Reference
+
+Epimetheus uses [Mage](https://magefile.org/) for build, test, and run targets. The build logic lives in `Magefile.go` at the repo root.
+
+## Prerequisites
+
+```bash
+go install github.com/magefile/mage@latest
+```
+
+## Default Target
+
+Running `mage` with no arguments runs **Build**.
+
+## Targets
+
+| Target | Description | Example |
+|--------|-------------|---------|
+| `build` | Compile the epimetheus binary | `mage build` |
+| `install` | Install binary to `$GOPATH/bin` | `mage install` |
+| `run` | Build and run in realtime mode (continuous) | `mage run` |
+| `runHistoric` | Build and run historic mode (24h ago) | `mage runHistoric` |
+| `runAuto <file>` | Build and run auto mode with a file | `mage runAuto test-all-ages.csv` |
+| `runWatchClickHouse [file]` | Build and run watch mode with ClickHouse only | `mage runWatchClickHouse` or `mage runWatchClickHouse my.csv` |
+| `test` | Run all tests | `mage test` |
+| `testCoverage` | Run tests and open coverage report | `mage testCoverage` |
+| `testRace` | Run tests with race detector | `mage testRace` |
+| `benchmark` | Run Go benchmarks | `mage benchmark` |
+| `lint` | Run golangci-lint | `mage lint` |
+| `fmt` | Format all Go code | `mage fmt` |
+| `vet` | Run go vet | `mage vet` |
+| `tidy` | Run go mod tidy | `mage tidy` |
+| `clean` | Remove binary and coverage artifacts | `mage clean` |
+| `generate` | Run go generate | `mage generate` |
+| `version` | Build and print version | `mage version` |
+| `all` | Run fmt, vet, test, and build | `mage all` |
+| `ci` | Tidy, vet, test, and build (CI pipeline) | `mage ci` |
+| `dev` | Build, port-forward Pushgateway, run realtime mode | `mage dev` |
+| `generateTestData` | Generate test data files | `mage generateTestData` |
+| `backfill` | Run backfill for last 48 hours | `mage backfill` |
+| `benchmark100MB` | Run 100MB benchmark script | `mage benchmark100MB` |
+| `benchmark1GB` | Run 1GB benchmark script | `mage benchmark1GB` |
+| `cleanupBenchmarkData` | Clean benchmark data from Prometheus | `mage cleanupBenchmarkData` |
+| `cleanupBenchmarkMetrics` | Clean benchmark metric files | `mage cleanupBenchmarkMetrics` |
+| `deployDashboard` | Deploy Grafana dashboard via script | `mage deployDashboard` |
+| `help` | Print list of targets | `mage help` |
+
+## Examples
+
+```bash
+# Build and run realtime mode
+mage run
+
+# Run tests with coverage
+mage testCoverage
+
+# Run watch mode with ClickHouse (default test file)
+mage runWatchClickHouse
+
+# Run watch mode with your CSV
+mage runWatchClickHouse /path/to/data.csv
+
+# Full CI checks
+mage ci
+```
+
+See [Quick Start](../guides/quickstart.md) and [CLI Reference](cli.md) for more on running Epimetheus.
diff --git a/docs/reference/test-metrics.md b/docs/reference/test-metrics.md
new file mode 100644
index 0000000..a1af41e
--- /dev/null
+++ b/docs/reference/test-metrics.md
@@ -0,0 +1,35 @@
+# Test Metrics
+
+Generated metrics use the `epimetheus_test_` prefix so they are easy to identify as test data.
+
+## Counter: `epimetheus_test_requests_total`
+
+- **Type:** Counter (monotonically increasing)
+- **Description:** Total number of requests processed
+- **Use case:** Total events, requests, errors
+
+## Gauge: `epimetheus_test_active_connections`
+
+- **Type:** Gauge (can increase or decrease)
+- **Description:** Current number of active connections (0–100)
+- **Use case:** Current state, capacity
+
+## Gauge: `epimetheus_test_temperature_celsius`
+
+- **Type:** Gauge
+- **Description:** Current temperature in Celsius (0–50°C)
+- **Use case:** Environmental monitoring
+
+## Histogram: `epimetheus_test_request_duration_seconds`
+
+- **Type:** Histogram (distribution)
+- **Description:** Request duration distribution
+- **Buckets:** 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10 seconds
+- **Use case:** Latency, SLO tracking
+
+## Labeled counter: `epimetheus_test_jobs_processed_total`
+
+- **Type:** Counter with labels
+- **Description:** Jobs processed by type and status
+- **Labels:** `job_type` (email, report, backup), `status` (success, failed)
+- **Use case:** Categorized counting, multi-dimensional metrics
diff --git a/backfill-historic-data.sh b/scripts/backfill-historic-data.sh
index fa0e065..c755da7 100755..100644
--- a/backfill-historic-data.sh
+++ b/scripts/backfill-historic-data.sh
@@ -1,8 +1,13 @@
#!/bin/bash
# Backfill historic data to Prometheus for Epimetheus dashboard
+# Run from repo root: ./scripts/backfill-historic-data.sh
set -e
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$REPO_ROOT"
+
echo "=== Epimetheus Historic Data Backfill ==="
echo ""
echo "This script will populate Prometheus with historic test data"
@@ -49,8 +54,6 @@ if [ $EXIT_CODE -eq 0 ]; then
echo " - 1 day ago"
echo " - 12 hours ago"
echo " - Now (from previous realtime push)"
- echo ""
- echo "View the dashboard at: https://grafana.f3s.buetow.org/d/epimetheus-test/epimetheus-test-metrics"
else
echo ""
echo "❌ Backfill failed with exit code $EXIT_CODE"
diff --git a/benchmark-100mb.sh b/scripts/benchmark-100mb.sh
index 1d3fad0..bda6476 100755..100644
--- a/benchmark-100mb.sh
+++ b/scripts/benchmark-100mb.sh
@@ -1,9 +1,14 @@
#!/bin/bash
# Benchmark script: Generate and ingest 100MB of historic metrics
# This tests Epimetheus performance with large-scale data ingestion
+# Run from repo root: ./scripts/benchmark-100mb.sh
set -e
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$REPO_ROOT"
+
# Optimize Go GC for better performance (Phase 3 optimization)
export GOGC=200 # Reduce GC frequency (default 100)
export GOMEMLIMIT=3GiB # Set memory limit for Go 1.19+
@@ -34,10 +39,6 @@ echo "Estimated lines needed: $TARGET_LINES" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
# Generate data going back 7 days with 1-minute intervals
-# This gives us ~10,080 data points across 7 days
-# We'll generate multiple metrics per timestamp to reach 100MB
-# All data is historic (> 5 minutes old) to use Remote Write API exclusively
-
GENERATION_START=$(date +%s)
NOW=$(date +%s)000 # Current time in milliseconds
@@ -51,7 +52,6 @@ cat > benchmark-data-100mb.csv << 'EOF'
EOF
# Generate metrics
-# We'll create ~150 unique time series, each with ~10,000 data points = 1.5M samples
METRICS=(
"epimetheus_benchmark_cpu_usage"
"epimetheus_benchmark_memory_bytes"
@@ -80,7 +80,6 @@ LINES_GENERATED=0
for ((i=0; i<TOTAL_INTERVALS; i++)); do
TIMESTAMP=$((SEVEN_DAYS_AGO + (i * INTERVAL_MS)))
- # Generate a sample for each metric x instance combination
for METRIC in "${METRICS[@]}"; do
for INSTANCE in "${INSTANCES[@]}"; do
VALUE=$((RANDOM % 1000))
@@ -89,7 +88,6 @@ for ((i=0; i<TOTAL_INTERVALS; i++)); do
done
done
- # Progress indicator every 1000 intervals
if [ $((i % 1000)) -eq 0 ]; then
PROGRESS=$((i * 100 / TOTAL_INTERVALS))
echo -ne "\rProgress: $PROGRESS% ($LINES_GENERATED lines)" | tee -a "$RESULT_FILE"
@@ -101,7 +99,6 @@ echo "" | tee -a "$RESULT_FILE"
GENERATION_END=$(date +%s)
GENERATION_TIME=$((GENERATION_END - GENERATION_START))
-# Get actual file size
FILE_SIZE=$(stat -f%z benchmark-data-100mb.csv 2>/dev/null || stat -c%s benchmark-data-100mb.csv 2>/dev/null)
FILE_SIZE_MB=$((FILE_SIZE / 1024 / 1024))
@@ -117,18 +114,15 @@ echo "Step 2: Setting up port-forward to Prometheus..." | tee -a "$RESULT_FILE"
kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 > /tmp/benchmark-pf.log 2>&1 &
PF_PID=$!
echo "Port-forward started (PID: $PF_PID)" | tee -a "$RESULT_FILE"
-sleep 8 # Wait for port-forward to be ready
+sleep 8
echo "" | tee -a "$RESULT_FILE"
# Step 3: Get baseline Prometheus metrics
echo "Step 3: Collecting baseline Prometheus metrics..." | tee -a "$RESULT_FILE"
PROM_POD=$(kubectl get pod -n monitoring -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].metadata.name}')
echo "Prometheus pod: $PROM_POD" | tee -a "$RESULT_FILE"
-
-# Get memory and CPU usage before ingestion
BASELINE_MEMORY=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $3}')
BASELINE_CPU=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $2}')
-
echo " Baseline memory: $BASELINE_MEMORY" | tee -a "$RESULT_FILE"
echo " Baseline CPU: $BASELINE_CPU" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
@@ -136,18 +130,9 @@ echo "" | tee -a "$RESULT_FILE"
# Step 4: Run ingestion benchmark
echo "Step 4: Running ingestion benchmark..." | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
-
INGEST_START=$(date +%s.%N)
-# Run epimetheus with time measurement
-# Use CSV mode with Remote Write API (all data is historic)
-# Note: We can't use auto mode because it requires both Pushgateway and Remote Write
-# Instead, we'll implement a direct CSV->Remote Write ingestion
-
echo "Parsing CSV and preparing for Remote Write ingestion..." | tee -a "$RESULT_FILE"
-
-# For now, use backfill mode to process the CSV data
-# We'll need to enhance epimetheus to support pure CSV->RemoteWrite mode
echo "WARNING: Using auto mode - this may fail if data is too recent" | tee -a "$RESULT_FILE"
echo "Continuing with Remote Write API for historic data..." | tee -a "$RESULT_FILE"
@@ -157,37 +142,30 @@ echo "Continuing with Remote Write API for historic data..." | tee -a "$RESULT_F
-format=csv \
-prometheus=http://localhost:9090/api/v1/write \
-pushgateway=http://localhost:9091 \
- 2>&1 | tee -a "$RESULT_FILE" || true # Continue even if pushgateway fails
+ 2>&1 | tee -a "$RESULT_FILE" || true
INGEST_END=$(date +%s.%N)
-
-# Calculate ingestion time
INGEST_TIME=$(echo "$INGEST_END - $INGEST_START" | bc)
echo "" | tee -a "$RESULT_FILE"
echo "Ingestion complete:" | tee -a "$RESULT_FILE"
echo " Total time: ${INGEST_TIME}s" | tee -a "$RESULT_FILE"
-
-# Calculate throughput
SAMPLES_PER_SECOND=$(echo "scale=2; $LINES_GENERATED / $INGEST_TIME" | bc)
MB_PER_SECOND=$(echo "scale=2; $FILE_SIZE_MB / $INGEST_TIME" | bc)
-
echo " Samples/second: $SAMPLES_PER_SECOND" | tee -a "$RESULT_FILE"
echo " MB/second: $MB_PER_SECOND" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
-# Step 5: Get post-ingestion Prometheus metrics
+# Step 5: Post-ingestion metrics
echo "Step 5: Collecting post-ingestion Prometheus metrics..." | tee -a "$RESULT_FILE"
-sleep 5 # Wait for metrics to stabilize
-
+sleep 5
POST_MEMORY=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $3}')
POST_CPU=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $2}')
-
echo " Post-ingestion memory: $POST_MEMORY" | tee -a "$RESULT_FILE"
echo " Post-ingestion CPU: $POST_CPU" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
-# Step 6: Query some data to verify ingestion
+# Step 6: Verify
echo "Step 6: Verifying data ingestion..." | tee -a "$RESULT_FILE"
QUERY_RESULT=$(curl -s "http://localhost:9090/api/v1/query?query=count(epimetheus_benchmark_cpu_usage)" | jq -r '.data.result[0].value[1]')
echo " Samples found for epimetheus_benchmark_cpu_usage: $QUERY_RESULT" | tee -a "$RESULT_FILE"
@@ -198,7 +176,6 @@ echo "Step 7: Cleaning up..." | tee -a "$RESULT_FILE"
kill $PF_PID 2>/dev/null || true
echo "" | tee -a "$RESULT_FILE"
-# Summary
echo "=== BENCHMARK SUMMARY ===" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
echo "Dataset:" | tee -a "$RESULT_FILE"
diff --git a/benchmark-1gb.sh b/scripts/benchmark-1gb.sh
index f715376..35176b0 100755..100644
--- a/benchmark-1gb.sh
+++ b/scripts/benchmark-1gb.sh
@@ -1,9 +1,14 @@
#!/bin/bash
# Benchmark script: Generate and ingest 1GB of historic metrics
# This tests Epimetheus performance with large-scale data ingestion
+# Run from repo root: ./scripts/benchmark-1gb.sh
set -e
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$REPO_ROOT"
+
# Optimize Go GC for better performance (Phase 3 optimization)
export GOGC=200 # Reduce GC frequency (default 100)
export GOMEMLIMIT=3GiB # Set memory limit for Go 1.19+
@@ -23,7 +28,6 @@ echo "" | tee -a "$RESULT_FILE"
echo "Step 1: Generating 1GB of test data..." | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
-# Calculate: ~80 bytes per line, 1GB = ~13M lines
TARGET_SIZE_MB=1000
TARGET_BYTES=$((TARGET_SIZE_MB * 1024 * 1024))
BYTES_PER_LINE=80
@@ -33,25 +37,17 @@ echo "Target size: ${TARGET_SIZE_MB}MB" | tee -a "$RESULT_FILE"
echo "Estimated lines needed: $TARGET_LINES" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
-# Generate data going back 30 days with 30-second intervals
-# This gives us ~86,400 data points across 30 days (respects Prometheus 720h out-of-order limit)
-# We'll generate multiple metrics per timestamp to reach 1GB
-# All data is historic (> 5 minutes old) to use Remote Write API exclusively
-
GENERATION_START=$(date +%s)
-NOW=$(date +%s)000 # Current time in milliseconds
-ONE_HOUR_AGO=$((NOW - 3600000)) # Start from 1 hour ago to ensure all data is historic
-THIRTY_DAYS_AGO=$((ONE_HOUR_AGO - 2592000000)) # 30 days before that (30 * 24 * 60 * 60 * 1000)
+NOW=$(date +%s)000
+ONE_HOUR_AGO=$((NOW - 3600000))
+THIRTY_DAYS_AGO=$((ONE_HOUR_AGO - 2592000000))
-# CSV header
cat > benchmark-data-1gb.csv << 'EOF'
# Prometheus metrics - 1GB benchmark dataset
# Format: metric_name,labels,value,timestamp_ms
EOF
-# Generate metrics
-# We'll create ~150 unique time series, each with ~86,400 data points = 13M samples
METRICS=(
"epimetheus_benchmark_cpu_usage"
"epimetheus_benchmark_memory_bytes"
@@ -71,8 +67,8 @@ INSTANCES=(
"db-01" "db-02" "db-03" "worker-01" "worker-02"
)
-INTERVAL_MS=30000 # 30 second interval (to maintain 1GB size with 30 days)
-TOTAL_INTERVALS=86400 # 30 days of 30-second intervals
+INTERVAL_MS=30000
+TOTAL_INTERVALS=86400
echo "Generating data..." | tee -a "$RESULT_FILE"
LINES_GENERATED=0
@@ -80,7 +76,6 @@ LINES_GENERATED=0
for ((i=0; i<TOTAL_INTERVALS; i++)); do
TIMESTAMP=$((THIRTY_DAYS_AGO + (i * INTERVAL_MS)))
- # Generate a sample for each metric x instance combination
for METRIC in "${METRICS[@]}"; do
for INSTANCE in "${INSTANCES[@]}"; do
VALUE=$((RANDOM % 1000))
@@ -89,7 +84,6 @@ for ((i=0; i<TOTAL_INTERVALS; i++)); do
done
done
- # Progress indicator every 5000 intervals
if [ $((i % 5000)) -eq 0 ]; then
PROGRESS=$((i * 100 / TOTAL_INTERVALS))
echo -ne "\rProgress: $PROGRESS% ($LINES_GENERATED lines)" | tee -a "$RESULT_FILE"
@@ -101,7 +95,6 @@ echo "" | tee -a "$RESULT_FILE"
GENERATION_END=$(date +%s)
GENERATION_TIME=$((GENERATION_END - GENERATION_START))
-# Get actual file size
FILE_SIZE=$(stat -f%z benchmark-data-1gb.csv 2>/dev/null || stat -c%s benchmark-data-1gb.csv 2>/dev/null)
FILE_SIZE_MB=$((FILE_SIZE / 1024 / 1024))
@@ -112,42 +105,30 @@ echo " File size: ${FILE_SIZE_MB}MB ($FILE_SIZE bytes)" | tee -a "$RESULT_FILE"
echo " Generation time: ${GENERATION_TIME}s" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
-# Step 2: Start port-forward to Prometheus
+# Step 2: Port-forward
echo "Step 2: Setting up port-forward to Prometheus..." | tee -a "$RESULT_FILE"
kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 > /tmp/benchmark-pf.log 2>&1 &
PF_PID=$!
echo "Port-forward started (PID: $PF_PID)" | tee -a "$RESULT_FILE"
-sleep 8 # Wait for port-forward to be ready
+sleep 8
echo "" | tee -a "$RESULT_FILE"
-# Step 3: Get baseline Prometheus metrics
+# Step 3: Baseline
echo "Step 3: Collecting baseline Prometheus metrics..." | tee -a "$RESULT_FILE"
PROM_POD=$(kubectl get pod -n monitoring -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].metadata.name}')
echo "Prometheus pod: $PROM_POD" | tee -a "$RESULT_FILE"
-
-# Get memory and CPU usage before ingestion
BASELINE_MEMORY=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $3}')
BASELINE_CPU=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $2}')
-
echo " Baseline memory: $BASELINE_MEMORY" | tee -a "$RESULT_FILE"
echo " Baseline CPU: $BASELINE_CPU" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
-# Step 4: Run ingestion benchmark
+# Step 4: Ingest
echo "Step 4: Running ingestion benchmark..." | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
-
INGEST_START=$(date +%s.%N)
-# Run epimetheus with time measurement
-# Use CSV mode with Remote Write API (all data is historic)
-# Note: We can't use auto mode because it requires both Pushgateway and Remote Write
-# Instead, we'll implement a direct CSV->Remote Write ingestion
-
echo "Parsing CSV and preparing for Remote Write ingestion..." | tee -a "$RESULT_FILE"
-
-# For now, use backfill mode to process the CSV data
-# We'll need to enhance epimetheus to support pure CSV->RemoteWrite mode
echo "WARNING: Using auto mode - this may fail if data is too recent" | tee -a "$RESULT_FILE"
echo "Continuing with Remote Write API for historic data..." | tee -a "$RESULT_FILE"
@@ -157,37 +138,30 @@ echo "Continuing with Remote Write API for historic data..." | tee -a "$RESULT_F
-format=csv \
-prometheus=http://localhost:9090/api/v1/write \
-pushgateway=http://localhost:9091 \
- 2>&1 | tee -a "$RESULT_FILE" || true # Continue even if pushgateway fails
+ 2>&1 | tee -a "$RESULT_FILE" || true
INGEST_END=$(date +%s.%N)
-
-# Calculate ingestion time
INGEST_TIME=$(echo "$INGEST_END - $INGEST_START" | bc)
echo "" | tee -a "$RESULT_FILE"
echo "Ingestion complete:" | tee -a "$RESULT_FILE"
echo " Total time: ${INGEST_TIME}s" | tee -a "$RESULT_FILE"
-
-# Calculate throughput
SAMPLES_PER_SECOND=$(echo "scale=2; $LINES_GENERATED / $INGEST_TIME" | bc)
MB_PER_SECOND=$(echo "scale=2; $FILE_SIZE_MB / $INGEST_TIME" | bc)
-
echo " Samples/second: $SAMPLES_PER_SECOND" | tee -a "$RESULT_FILE"
echo " MB/second: $MB_PER_SECOND" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
-# Step 5: Get post-ingestion Prometheus metrics
+# Step 5: Post-ingestion
echo "Step 5: Collecting post-ingestion Prometheus metrics..." | tee -a "$RESULT_FILE"
-sleep 5 # Wait for metrics to stabilize
-
+sleep 5
POST_MEMORY=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $3}')
POST_CPU=$(kubectl top pod -n monitoring "$PROM_POD" --no-headers | awk '{print $2}')
-
echo " Post-ingestion memory: $POST_MEMORY" | tee -a "$RESULT_FILE"
echo " Post-ingestion CPU: $POST_CPU" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
-# Step 6: Query some data to verify ingestion
+# Step 6: Verify
echo "Step 6: Verifying data ingestion..." | tee -a "$RESULT_FILE"
QUERY_RESULT=$(curl -s "http://localhost:9090/api/v1/query?query=count(epimetheus_benchmark_cpu_usage)" | jq -r '.data.result[0].value[1]')
echo " Samples found for epimetheus_benchmark_cpu_usage: $QUERY_RESULT" | tee -a "$RESULT_FILE"
@@ -198,7 +172,6 @@ echo "Step 7: Cleaning up..." | tee -a "$RESULT_FILE"
kill $PF_PID 2>/dev/null || true
echo "" | tee -a "$RESULT_FILE"
-# Summary
echo "=== BENCHMARK SUMMARY ===" | tee -a "$RESULT_FILE"
echo "" | tee -a "$RESULT_FILE"
echo "Dataset:" | tee -a "$RESULT_FILE"
diff --git a/cleanup-benchmark-data.sh b/scripts/cleanup-benchmark-data.sh
index a5409f1..48ba187 100755..100644
--- a/cleanup-benchmark-data.sh
+++ b/scripts/cleanup-benchmark-data.sh
@@ -1,6 +1,7 @@
#!/bin/bash
# Cleanup script: Delete benchmark data from Prometheus
# This uses the Prometheus Admin API to selectively remove benchmark metrics
+# Run from repo root: ./scripts/cleanup-benchmark-data.sh [prometheus_url]
set -e
diff --git a/cleanup-benchmark-metrics.sh b/scripts/cleanup-benchmark-metrics.sh
index d70aa95..7b1ce4e 100755..100644
--- a/cleanup-benchmark-metrics.sh
+++ b/scripts/cleanup-benchmark-metrics.sh
@@ -1,6 +1,7 @@
#!/bin/bash
# Cleanup benchmark metrics from Prometheus
# This allows running benchmarks from a clean state
+# Run from repo root: ./scripts/cleanup-benchmark-metrics.sh
set -e
@@ -63,8 +64,7 @@ elif [ "$ADMIN_CHECK" = "405" ]; then
echo " value: \"\""
echo ""
echo "Then upgrade Prometheus:"
- echo " cd /home/paul/git/conf/f3s/prometheus"
- echo " just upgrade"
+ echo " helm upgrade ... (or: just upgrade in your conf repo)"
echo ""
echo "WARNING: Admin API should only be enabled in development/test environments!"
echo ""
diff --git a/generate-test-data.sh b/scripts/generate-test-data.sh
index a4a0b1b..4db332e 100755..100644
--- a/generate-test-data.sh
+++ b/scripts/generate-test-data.sh
@@ -1,6 +1,7 @@
#!/bin/bash
# Generate test data with actual timestamps for different time ranges
+# Run from repo root: ./scripts/generate-test-data.sh
NOW=$(date +%s)000 # Current time in milliseconds
ONE_HOUR_AGO=$((NOW - 3600000))
@@ -8,6 +9,10 @@ ONE_DAY_AGO=$((NOW - 86400000))
ONE_WEEK_AGO=$((NOW - 604800000))
ONE_MONTH_AGO=$((NOW - 2592000000))
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$REPO_ROOT"
+
cat > test-all-ages.csv << EOF
# Prometheus metrics in CSV format demonstrating all time ranges
# Format: metric_name,labels,value,timestamp_ms
@@ -39,8 +44,8 @@ app_temperature_celsius,instance=1m_ago;zone=africa,28.7,$ONE_MONTH_AGO
EOF
echo "Generated test-all-ages.csv with the following timestamps:"
-echo " Current: $NOW ($(date -d @$((NOW/1000)) '+%Y-%m-%d %H:%M:%S'))"
-echo " 1h ago: $ONE_HOUR_AGO ($(date -d @$((ONE_HOUR_AGO/1000)) '+%Y-%m-%d %H:%M:%S'))"
-echo " 1d ago: $ONE_DAY_AGO ($(date -d @$((ONE_DAY_AGO/1000)) '+%Y-%m-%d %H:%M:%S'))"
-echo " 1w ago: $ONE_WEEK_AGO ($(date -d @$((ONE_WEEK_AGO/1000)) '+%Y-%m-%d %H:%M:%S'))"
-echo " 1m ago: $ONE_MONTH_AGO ($(date -d @$((ONE_MONTH_AGO/1000)) '+%Y-%m-%d %H:%M:%S'))"
+echo " Current: $NOW ($(date -d @$((NOW/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -r $((NOW/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null))"
+echo " 1h ago: $ONE_HOUR_AGO ($(date -d @$((ONE_HOUR_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -r $((ONE_HOUR_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null))"
+echo " 1d ago: $ONE_DAY_AGO ($(date -d @$((ONE_DAY_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -r $((ONE_DAY_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null))"
+echo " 1w ago: $ONE_WEEK_AGO ($(date -d @$((ONE_WEEK_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -r $((ONE_WEEK_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null))"
+echo " 1m ago: $ONE_MONTH_AGO ($(date -d @$((ONE_MONTH_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -r $((ONE_MONTH_AGO/1000)) '+%Y-%m-%d %H:%M:%S' 2>/dev/null))"
diff --git a/run.sh b/scripts/run.sh
index 38637cf..d603639 100755..100644
--- a/run.sh
+++ b/scripts/run.sh
@@ -2,6 +2,7 @@
# Simple script to run Epimetheus
# Automatically sets up port-forwarding and runs the binary
+# Run from repo root: ./scripts/run.sh
set -e
@@ -18,6 +19,11 @@ echo "Step 2: Running epimetheus binary (realtime mode)..."
echo "Press Ctrl+C to stop"
echo ""
+# Run from repo root so ./epimetheus resolves
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$REPO_ROOT"
+
# Run the binary in realtime mode and capture its exit status
./epimetheus -mode=realtime -continuous
EXIT_CODE=$?
diff --git a/verify-clickhouse.sh b/scripts/verify-clickhouse.sh
index 5819f18..a9c3233 100755..100644
--- a/verify-clickhouse.sh
+++ b/scripts/verify-clickhouse.sh
@@ -1,6 +1,6 @@
#!/bin/bash
# Verify that epimetheus metrics were successfully ingested into ClickHouse.
-# Usage: ./verify-clickhouse.sh [clickhouse_url] [table_name]
+# Usage: ./scripts/verify-clickhouse.sh [clickhouse_url] [table_name]
# Default: http://localhost:8123, epimetheus_metrics
set -e