Add ior crash/timeout harness tests (task 343)

- Add waitBoth unit tests: ior exit error, ior timeout, both timeout, both succeed — using real processes (true/false/sleep)
- Add TestIorCrashReportsError: full harness test with fake ior binary that exits with error, verifying error mentions 'ior' and workload PID is returned
- Add TestIorStartFailureCleansUpWorkload: verifies workload process is killed when ior binary doesn't exist, checking with signal 0
- Refactor waitBoth to accept grace duration parameter for testability (production code passes iorShutdownGrace, tests use 500ms)
- Fix pipe drain in startWorkload: drain remaining stdout after reading PID so cmd.Wait() doesn't block on pending I/O
- Add writeScript helper to helpers_test.go for creating fake binaries

Co-authored-by: Amp <amp@ampcode.com>
Amp-Thread-ID: https://ampcode.com/threads/T-019c8162-c1cf-7612-b8f5-84c61e3d2021
author: Paul Buetow <paul@buetow.org> 2026-02-21 20:16:19 +0200
committer: Paul Buetow <paul@buetow.org> 2026-02-21 20:18:41 +0200
commit: e51b8571bc192e7122f25a3d05a6407dfa8a6998 (patch)
tree: 81e7e1fbe1c9e0a91d033b1aded00b4273502313 /integrationtests
parent: 2f0ac27ec92840cab408e5f5a71d225be070cc0f (diff)
3 files changed, 173 insertions, 3 deletions
diff --git a/integrationtests/harness.go b/integrationtests/harness.go
index 7edde44..fde52e6 100644
--- a/integrationtests/harness.go
+++ b/integrationtests/harness.go
@@ -3,6 +3,7 @@ package integrationtests
 import (
 	"bufio"
 	"fmt"
+	"io"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -41,7 +42,7 @@ func (h *TestHarness) Run(scenario string, duration int) (TestResult, int, error
 		return TestResult{}, workloadPID, err
 	}
 
-	workloadErr, iorErr := waitBoth(workloadCmd, iorCmd, duration)
+	workloadErr, iorErr := waitBoth(workloadCmd, iorCmd, duration, iorShutdownGrace)
 
 	if iorErr != nil {
 		return TestResult{}, workloadPID, fmt.Errorf("ior: %w", iorErr)
@@ -92,6 +93,8 @@ func (h *TestHarness) startWorkload(scenario string) (*exec.Cmd, int, error) {
 		} else {
 			errCh <- fmt.Errorf("workload produced no output")
 		}
+		// Drain remaining pipe data so cmd.Wait() does not block.
+		io.Copy(io.Discard, stdout) //nolint:errcheck
 	}()
 
 	select {
@@ -132,14 +135,14 @@ func (h *TestHarness) startIor(pid int, scenario string, duration int) (*exec.Cm
 
 // waitBoth waits for both the workload and ior commands concurrently.
 // If ior does not finish within duration + grace period, it is killed.
-func waitBoth(workloadCmd, iorCmd *exec.Cmd, duration int) (workloadErr, iorErr error) {
+func waitBoth(workloadCmd, iorCmd *exec.Cmd, duration int, grace time.Duration) (workloadErr, iorErr error) {
 	workloadDone := make(chan error, 1)
 	iorDone := make(chan error, 1)
 
 	go func() { workloadDone <- workloadCmd.Wait() }()
 	go func() { iorDone <- iorCmd.Wait() }()
 
-	timeout := time.After(time.Duration(duration)*time.Second + iorShutdownGrace)
+	timeout := time.After(time.Duration(duration)*time.Second + grace)
 
 	for workloadDone != nil || iorDone != nil {
 		select {
diff --git a/integrationtests/harness_test.go b/integrationtests/harness_test.go
index 813e9d6..6e076ad 100644
--- a/integrationtests/harness_test.go
+++ b/integrationtests/harness_test.go
@@ -1,8 +1,13 @@
 package integrationtests
 
 import (
+	"os"
+	"os/exec"
+	"path/filepath"
 	"strings"
+	"syscall"
 	"testing"
+	"time"
 )
 
 func TestWorkloadCrashReportsError(t *testing.T) {
@@ -21,3 +26,155 @@ func TestWorkloadCrashReportsError(t *testing.T) {
 		t.Errorf("expected no records from crashed workload, got %d", len(result.Records))
 	}
 }
+
+func TestWaitBothIorExitError(t *testing.T) {
+	workloadCmd := exec.Command("true")
+	iorCmd := exec.Command("false")
+	if err := workloadCmd.Start(); err != nil {
+		t.Fatalf("start workload: %v", err)
+	}
+	if err := iorCmd.Start(); err != nil {
+		t.Fatalf("start ior: %v", err)
+	}
+
+	workloadErr, iorErr := waitBoth(workloadCmd, iorCmd, 5, iorShutdownGrace)
+	if iorErr == nil {
+		t.Fatal("expected ior error, got nil")
+	}
+	if workloadErr != nil {
+		t.Errorf("expected nil workload error, got: %v", workloadErr)
+	}
+}
+
+func TestWaitBothIorTimeout(t *testing.T) {
+	workloadCmd := exec.Command("true")
+	iorCmd := exec.Command("sleep", "60")
+	if err := workloadCmd.Start(); err != nil {
+		t.Fatalf("start workload: %v", err)
+	}
+	if err := iorCmd.Start(); err != nil {
+		t.Fatalf("start ior: %v", err)
+	}
+
+	// Use duration=0 and a short grace period so timeout fires quickly.
+	// Workload ("true") exits instantly; ior ("sleep 60") exceeds the timeout.
+	workloadErr, iorErr := waitBoth(workloadCmd, iorCmd, 0, 500*time.Millisecond)
+	if workloadErr != nil {
+		t.Errorf("expected nil workload error, got: %v", workloadErr)
+	}
+	if iorErr == nil {
+		t.Fatal("expected ior error from timeout, got nil")
+	}
+	if !strings.Contains(iorErr.Error(), "timed out") {
+		t.Errorf("expected timeout error, got: %v", iorErr)
+	}
+}
+
+func TestWaitBothBothTimeout(t *testing.T) {
+	workloadCmd := exec.Command("sleep", "60")
+	iorCmd := exec.Command("sleep", "60")
+	if err := workloadCmd.Start(); err != nil {
+		t.Fatalf("start workload: %v", err)
+	}
+	if err := iorCmd.Start(); err != nil {
+		t.Fatalf("start ior: %v", err)
+	}
+
+	workloadErr, iorErr := waitBoth(workloadCmd, iorCmd, 0, 500*time.Millisecond)
+	if workloadErr == nil {
+		t.Fatal("expected workload timeout error, got nil")
+	}
+	if !strings.Contains(workloadErr.Error(), "timed out") {
+		t.Errorf("expected workload timeout error, got: %v", workloadErr)
+	}
+	if iorErr == nil {
+		t.Fatal("expected ior timeout error, got nil")
+	}
+	if !strings.Contains(iorErr.Error(), "timed out") {
+		t.Errorf("expected ior timeout error, got: %v", iorErr)
+	}
+}
+
+func TestWaitBothBothSucceed(t *testing.T) {
+	workloadCmd := exec.Command("true")
+	iorCmd := exec.Command("true")
+	if err := workloadCmd.Start(); err != nil {
+		t.Fatalf("start workload: %v", err)
+	}
+	if err := iorCmd.Start(); err != nil {
+		t.Fatalf("start ior: %v", err)
+	}
+
+	workloadErr, iorErr := waitBoth(workloadCmd, iorCmd, 5, iorShutdownGrace)
+	if workloadErr != nil {
+		t.Errorf("expected nil workload error, got: %v", workloadErr)
+	}
+	if iorErr != nil {
+		t.Errorf("expected nil ior error, got: %v", iorErr)
+	}
+}
+
+func TestIorCrashReportsError(t *testing.T) {
+	tmpDir := t.TempDir()
+	outputDir := t.TempDir()
+
+	// Create a fake workload that prints its PID and exits cleanly.
+	workloadBin := writeScript(t, tmpDir, "workload", `echo $$`)
+
+	// Create a fake ior that exits with error immediately.
+	iorBin := writeScript(t, tmpDir, "ior", `exit 1`)
+
+	h := TestHarness{
+		IorBinary:      iorBin,
+		WorkloadBinary: workloadBin,
+		BpfObject:      filepath.Join(tmpDir, "fake.bpf.o"),
+		OutputDir:      outputDir,
+	}
+
+	result, pid, err := h.Run("test", 5)
+	if err == nil {
+		t.Fatal("expected error when ior crashes, got nil")
+	}
+	if !strings.Contains(err.Error(), "ior") {
+		t.Errorf("error should mention ior, got: %v", err)
+	}
+	if pid == 0 {
+		t.Fatal("expected non-zero workload PID")
+	}
+	if len(result.Records) != 0 {
+		t.Errorf("expected no records from crashed ior, got %d", len(result.Records))
+	}
+}
+
+func TestIorStartFailureCleansUpWorkload(t *testing.T) {
+	tmpDir := t.TempDir()
+	outputDir := t.TempDir()
+
+	// Create a fake workload that prints PID and sleeps.
+	// Use exec to replace the shell so killing the process kills the sleep too.
+	workloadBin := writeScript(t, tmpDir, "workload", `echo $$; exec sleep 30`)
+
+	h := TestHarness{
+		IorBinary:      "/nonexistent/ior",
+		WorkloadBinary: workloadBin,
+		BpfObject:      filepath.Join(tmpDir, "fake.bpf.o"),
+		OutputDir:      outputDir,
+	}
+
+	_, pid, err := h.Run("test", 5)
+	if err == nil {
+		t.Fatal("expected error when ior binary doesn't exist, got nil")
+	}
+	if pid == 0 {
+		t.Fatal("expected non-zero workload PID even when ior fails to start")
+	}
+	// Verify the workload process was cleaned up (killed).
+	// After Run returns, the workload should no longer be running.
+	// On Unix systems, FindProcess always succeeds, so we check with signal 0.
+	proc, procErr := os.FindProcess(pid)
+	if procErr == nil {
+		if signalErr := proc.Signal(syscall.Signal(0)); signalErr == nil {
+			t.Error("workload process is still running after ior start failure")
+		}
+	}
+}
diff --git a/integrationtests/helpers_test.go b/integrationtests/helpers_test.go
index edf57b9..7db54b2 100644
--- a/integrationtests/helpers_test.go
+++ b/integrationtests/helpers_test.go
@@ -36,6 +36,16 @@ func absPath(t *testing.T, rel string) string {
 	return p
 }
 
+// writeScript creates an executable shell script in dir and returns its path.
+func writeScript(t *testing.T, dir, name, content string) string {
+	t.Helper()
+	path := filepath.Join(dir, name)
+	if err := os.WriteFile(path, []byte("#!/bin/sh\n"+content+"\n"), 0o755); err != nil {
+		t.Fatalf("write script %s: %v", name, err)
+	}
+	return path
+}
+
 func runScenario(t *testing.T, scenario string, expected []ExpectedEvent) {
 	t.Helper()
 	h := newTestHarness(t)
author	Paul Buetow <paul@buetow.org>	2026-02-21 20:16:19 +0200
committer	Paul Buetow <paul@buetow.org>	2026-02-21 20:18:41 +0200
commit	e51b8571bc192e7122f25a3d05a6407dfa8a6998 (patch)
tree	81e7e1fbe1c9e0a91d033b1aded00b4273502313 /integrationtests
parent	2f0ac27ec92840cab408e5f5a71d225be070cc0f (diff)