summaryrefslogtreecommitdiff
path: root/internal
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-03-13 22:12:22 +0200
committerPaul Buetow <paul@buetow.org>2026-03-13 22:12:22 +0200
commitfd97584060cdc3d88da0fab060ad50cf9fc651c4 (patch)
treeb589d318b97970f9351335bb1431cc67082cddbd /internal
parent81fdd28081922aaeb355a1f87cebaf85f93622c5 (diff)
Handle procfs lookup errors in event loop (task 392)
Diffstat (limited to 'internal')
-rw-r--r--internal/eventloop.go3
-rw-r--r--internal/eventloop_comm.go65
-rw-r--r--internal/eventloop_commresolver_test.go56
-rw-r--r--internal/eventloop_exit.go6
4 files changed, 113 insertions, 17 deletions
diff --git a/internal/eventloop.go b/internal/eventloop.go
index 7a7236c..f15c344 100644
--- a/internal/eventloop.go
+++ b/internal/eventloop.go
@@ -146,6 +146,9 @@ func (e *eventLoop) commState() *commResolver {
if e.commResolver.pending == nil {
e.commResolver.pending = make(map[uint32]struct{})
}
+ if e.commResolver.warningFn == nil {
+ e.commResolver.warningFn = e.notifyWarning
+ }
e.commResolver.ensureLookupConfig()
return e.commResolver
}
diff --git a/internal/eventloop_comm.go b/internal/eventloop_comm.go
index 6b77023..4d49ef2 100644
--- a/internal/eventloop_comm.go
+++ b/internal/eventloop_comm.go
@@ -1,10 +1,13 @@
package internal
import (
+ "errors"
+ "fmt"
"os"
"path/filepath"
"strconv"
"sync"
+ "syscall"
)
type commResolver struct {
@@ -16,7 +19,8 @@ type commResolver struct {
lookupQueue chan uint32
lookupWorkers int
- resolveFn func(uint32) string
+ resolveFn func(uint32) (string, error)
+ warningFn func(string)
startWorkersOnce sync.Once
workersWG sync.WaitGroup
shutdownOnce sync.Once
@@ -42,7 +46,7 @@ func (r *commResolver) ensureLookupConfig() {
r.lookupQueue = make(chan uint32, defaultCommLookupQueueSize)
}
if r.resolveFn == nil {
- r.resolveFn = resolveCommFromProc
+ r.resolveFn = resolveCommFromProcWithError
}
}
@@ -65,13 +69,14 @@ func (r *commResolver) startLookupWorkers() {
func (r *commResolver) lookupWorker() {
defer r.workersWG.Done()
for tid := range r.lookupQueue {
- comm := r.resolveFn(tid)
+ comm, err := r.resolveFn(tid)
r.mu.Lock()
delete(r.pending, tid)
if comm != "" {
r.comms[tid] = comm
}
r.mu.Unlock()
+ r.notifyResolveFailure(tid, err)
}
}
@@ -90,10 +95,12 @@ func (r *commResolver) seedTrackedPidComm(pidFilter int) {
continue
}
seen[tid] = struct{}{}
- if comm := resolveCommFromProc(tid); comm != "" {
+ comm, err := r.resolveFn(tid)
+ if comm != "" {
r.setCached(tid, comm)
continue
}
+ r.notifyResolveFailure(tid, err)
r.queueLookup(tid)
}
}
@@ -167,6 +174,20 @@ func (r *commResolver) shutdown() {
})
}
+func (r *commResolver) notifyResolveFailure(tid uint32, err error) {
+ if err == nil {
+ return
+ }
+ r.notifyWarning(fmt.Sprintf("failed to resolve comm for tid %d: %v", tid, err))
+}
+
+func (r *commResolver) notifyWarning(message string) {
+ if r.warningFn == nil || message == "" {
+ return
+ }
+ r.warningFn(message)
+}
+
func (e *eventLoop) shutdownCommResolver() {
if e.commResolver == nil {
return
@@ -195,19 +216,43 @@ func procTidPathPrefix(tid uint32) string {
}
func resolveCommFromProc(tid uint32) string {
+ comm, _ := resolveCommFromProcWithError(tid)
+ return comm
+}
+
+func resolveCommFromProcWithError(tid uint32) (string, error) {
procPath := procTidPathPrefix(tid)
- if data, err := os.ReadFile(procPath + "/comm"); err == nil {
+ commPath := procPath + "/comm"
+ data, commErr := os.ReadFile(commPath)
+ if commErr == nil {
comm := string(data)
if len(comm) > 0 && comm[len(comm)-1] == '\n' {
comm = comm[:len(comm)-1]
}
if comm != "" {
- return comm
+ return comm, nil
}
+ } else if isTransientProcError(commErr) {
+ commErr = nil
+ } else {
+ commErr = fmt.Errorf("read %s: %w", commPath, commErr)
}
- if linkName, err := os.Readlink(procPath + "/exe"); err == nil {
- linkName = filepath.Base(linkName)
- return linkName
+
+ exePath := procPath + "/exe"
+ linkName, linkErr := os.Readlink(exePath)
+ if linkErr == nil {
+ if base := filepath.Base(linkName); base != "" {
+ return base, nil
+ }
+ } else if isTransientProcError(linkErr) {
+ linkErr = nil
+ } else {
+ linkErr = fmt.Errorf("readlink %s: %w", exePath, linkErr)
}
- return ""
+
+ return "", errors.Join(commErr, linkErr)
+}
+
+func isTransientProcError(err error) bool {
+ return errors.Is(err, os.ErrNotExist) || errors.Is(err, syscall.ENOENT) || errors.Is(err, syscall.ESRCH)
}
diff --git a/internal/eventloop_commresolver_test.go b/internal/eventloop_commresolver_test.go
index 4d3b193..351db70 100644
--- a/internal/eventloop_commresolver_test.go
+++ b/internal/eventloop_commresolver_test.go
@@ -1,7 +1,9 @@
package internal
import (
+ "errors"
"fmt"
+ "strings"
"sync"
"sync/atomic"
"testing"
@@ -24,14 +26,14 @@ func TestCommResolverQueueLookupRespectsWorkerLimit(t *testing.T) {
defer resolver.shutdown()
resolver.lookupWorkers = workers
resolver.lookupQueue = make(chan uint32, lookups)
- resolver.resolveFn = func(tid uint32) string {
+ resolver.resolveFn = func(tid uint32) (string, error) {
current := atomic.AddInt32(&running, 1)
setMaxInt32(&maxRunning, current)
started <- struct{}{}
<-release
atomic.AddInt32(&running, -1)
wg.Done()
- return fmt.Sprintf("comm-%d", tid)
+ return fmt.Sprintf("comm-%d", tid), nil
}
for i := 1; i <= lookups; i++ {
@@ -84,13 +86,13 @@ func TestCommResolverQueueLookupQueueFullClearsPending(t *testing.T) {
defer resolver.shutdown()
resolver.lookupWorkers = 1
resolver.lookupQueue = make(chan uint32, 1)
- resolver.resolveFn = func(tid uint32) string {
+ resolver.resolveFn = func(tid uint32) (string, error) {
select {
case started <- struct{}{}:
default:
}
<-release
- return fmt.Sprintf("comm-%d", tid)
+ return fmt.Sprintf("comm-%d", tid), nil
}
const tid1 uint32 = 101
@@ -139,10 +141,10 @@ func TestCommResolverShutdownStopsWorkersAndPreventsNewLookups(t *testing.T) {
resolver := newCommResolver(nil)
resolver.lookupWorkers = 1
resolver.lookupQueue = make(chan uint32, 1)
- resolver.resolveFn = func(tid uint32) string {
+ resolver.resolveFn = func(tid uint32) (string, error) {
started <- struct{}{}
<-release
- return fmt.Sprintf("comm-%d", tid)
+ return fmt.Sprintf("comm-%d", tid), nil
}
const activeTID uint32 = 201
@@ -182,6 +184,48 @@ func TestCommResolverShutdownStopsWorkersAndPreventsNewLookups(t *testing.T) {
}
}
+func TestCommResolverLookupWarnsOnUnexpectedResolveError(t *testing.T) {
+ const tid uint32 = 301
+
+ warnings := make(chan string, 1)
+ resolver := newCommResolver(nil)
+ defer resolver.shutdown()
+ resolver.lookupWorkers = 1
+ resolver.lookupQueue = make(chan uint32, 1)
+ resolver.warningFn = func(message string) { warnings <- message }
+ resolver.resolveFn = func(uint32) (string, error) {
+ return "", errors.New("boom")
+ }
+
+ resolver.queueLookup(tid)
+
+ waitForCondition(t, 2*time.Second, "expected failed lookup to clear pending state", func() bool {
+ return pendingCount(resolver) == 0
+ })
+ if _, ok := resolver.cached(tid); ok {
+ t.Fatalf("did not expect tid %d to be cached after resolve failure", tid)
+ }
+
+ select {
+ case message := <-warnings:
+ if message == "" || !strings.Contains(message, "boom") {
+ t.Fatalf("expected warning to mention boom, got %q", message)
+ }
+ case <-time.After(2 * time.Second):
+ t.Fatal("timed out waiting for resolve warning")
+ }
+}
+
+func TestResolveCommFromProcWithErrorIgnoresMissingProcess(t *testing.T) {
+ comm, err := resolveCommFromProcWithError(^uint32(0))
+ if err != nil {
+ t.Fatalf("expected missing procfs entries to be handled without error, got %v", err)
+ }
+ if comm != "" {
+ t.Fatalf("expected no comm for missing pid, got %q", comm)
+ }
+}
+
func hasPending(r *commResolver, tid uint32) bool {
r.mu.RLock()
defer r.mu.RUnlock()
diff --git a/internal/eventloop_exit.go b/internal/eventloop_exit.go
index 7fe2469..b0c0256 100644
--- a/internal/eventloop_exit.go
+++ b/internal/eventloop_exit.go
@@ -251,8 +251,12 @@ func (e *eventLoop) handleNullExit(ep *event.Pair, nullEv *types.NullEvent) bool
return false
}
if retEvent.Ret > 0 {
- if cwd, err := os.Readlink(procTidPathPrefix(nullEv.GetTid()) + "/cwd"); err == nil {
+ cwd, err := os.Readlink(procTidPathPrefix(nullEv.GetTid()) + "/cwd")
+ switch {
+ case err == nil:
ep.File = file.NewPathname([]byte(cwd))
+ case !isTransientProcError(err):
+ e.notifyWarning(fmt.Sprintf("failed to resolve cwd for tid %d: %v", nullEv.GetTid(), err))
}
}
}