diff options
| author | Paul Buetow <paul@buetow.org> | 2026-03-13 22:12:22 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-03-13 22:12:22 +0200 |
| commit | fd97584060cdc3d88da0fab060ad50cf9fc651c4 (patch) | |
| tree | b589d318b97970f9351335bb1431cc67082cddbd /internal | |
| parent | 81fdd28081922aaeb355a1f87cebaf85f93622c5 (diff) | |
Handle procfs lookup errors in event loop (task 392)
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/eventloop.go | 3 | ||||
| -rw-r--r-- | internal/eventloop_comm.go | 65 | ||||
| -rw-r--r-- | internal/eventloop_commresolver_test.go | 56 | ||||
| -rw-r--r-- | internal/eventloop_exit.go | 6 |
4 files changed, 113 insertions, 17 deletions
diff --git a/internal/eventloop.go b/internal/eventloop.go index 7a7236c..f15c344 100644 --- a/internal/eventloop.go +++ b/internal/eventloop.go @@ -146,6 +146,9 @@ func (e *eventLoop) commState() *commResolver { if e.commResolver.pending == nil { e.commResolver.pending = make(map[uint32]struct{}) } + if e.commResolver.warningFn == nil { + e.commResolver.warningFn = e.notifyWarning + } e.commResolver.ensureLookupConfig() return e.commResolver } diff --git a/internal/eventloop_comm.go b/internal/eventloop_comm.go index 6b77023..4d49ef2 100644 --- a/internal/eventloop_comm.go +++ b/internal/eventloop_comm.go @@ -1,10 +1,13 @@ package internal import ( + "errors" + "fmt" "os" "path/filepath" "strconv" "sync" + "syscall" ) type commResolver struct { @@ -16,7 +19,8 @@ type commResolver struct { lookupQueue chan uint32 lookupWorkers int - resolveFn func(uint32) string + resolveFn func(uint32) (string, error) + warningFn func(string) startWorkersOnce sync.Once workersWG sync.WaitGroup shutdownOnce sync.Once @@ -42,7 +46,7 @@ func (r *commResolver) ensureLookupConfig() { r.lookupQueue = make(chan uint32, defaultCommLookupQueueSize) } if r.resolveFn == nil { - r.resolveFn = resolveCommFromProc + r.resolveFn = resolveCommFromProcWithError } } @@ -65,13 +69,14 @@ func (r *commResolver) startLookupWorkers() { func (r *commResolver) lookupWorker() { defer r.workersWG.Done() for tid := range r.lookupQueue { - comm := r.resolveFn(tid) + comm, err := r.resolveFn(tid) r.mu.Lock() delete(r.pending, tid) if comm != "" { r.comms[tid] = comm } r.mu.Unlock() + r.notifyResolveFailure(tid, err) } } @@ -90,10 +95,12 @@ func (r *commResolver) seedTrackedPidComm(pidFilter int) { continue } seen[tid] = struct{}{} - if comm := resolveCommFromProc(tid); comm != "" { + comm, err := r.resolveFn(tid) + if comm != "" { r.setCached(tid, comm) continue } + r.notifyResolveFailure(tid, err) r.queueLookup(tid) } } @@ -167,6 +174,20 @@ func (r *commResolver) shutdown() { }) } +func (r *commResolver) notifyResolveFailure(tid uint32, err error) { + if err == nil { + return + } + r.notifyWarning(fmt.Sprintf("failed to resolve comm for tid %d: %v", tid, err)) +} + +func (r *commResolver) notifyWarning(message string) { + if r.warningFn == nil || message == "" { + return + } + r.warningFn(message) +} + func (e *eventLoop) shutdownCommResolver() { if e.commResolver == nil { return @@ -195,19 +216,43 @@ func procTidPathPrefix(tid uint32) string { } func resolveCommFromProc(tid uint32) string { + comm, _ := resolveCommFromProcWithError(tid) + return comm +} + +func resolveCommFromProcWithError(tid uint32) (string, error) { procPath := procTidPathPrefix(tid) - if data, err := os.ReadFile(procPath + "/comm"); err == nil { + commPath := procPath + "/comm" + data, commErr := os.ReadFile(commPath) + if commErr == nil { comm := string(data) if len(comm) > 0 && comm[len(comm)-1] == '\n' { comm = comm[:len(comm)-1] } if comm != "" { - return comm + return comm, nil } + } else if isTransientProcError(commErr) { + commErr = nil + } else { + commErr = fmt.Errorf("read %s: %w", commPath, commErr) } - if linkName, err := os.Readlink(procPath + "/exe"); err == nil { - linkName = filepath.Base(linkName) - return linkName + + exePath := procPath + "/exe" + linkName, linkErr := os.Readlink(exePath) + if linkErr == nil { + if base := filepath.Base(linkName); base != "" { + return base, nil + } + } else if isTransientProcError(linkErr) { + linkErr = nil + } else { + linkErr = fmt.Errorf("readlink %s: %w", exePath, linkErr) } - return "" + + return "", errors.Join(commErr, linkErr) +} + +func isTransientProcError(err error) bool { + return errors.Is(err, os.ErrNotExist) || errors.Is(err, syscall.ENOENT) || errors.Is(err, syscall.ESRCH) } diff --git a/internal/eventloop_commresolver_test.go b/internal/eventloop_commresolver_test.go index 4d3b193..351db70 100644 --- a/internal/eventloop_commresolver_test.go +++ b/internal/eventloop_commresolver_test.go @@ -1,7 +1,9 @@ package internal import ( + "errors" "fmt" + "strings" "sync" "sync/atomic" "testing" @@ -24,14 +26,14 @@ func TestCommResolverQueueLookupRespectsWorkerLimit(t *testing.T) { defer resolver.shutdown() resolver.lookupWorkers = workers resolver.lookupQueue = make(chan uint32, lookups) - resolver.resolveFn = func(tid uint32) string { + resolver.resolveFn = func(tid uint32) (string, error) { current := atomic.AddInt32(&running, 1) setMaxInt32(&maxRunning, current) started <- struct{}{} <-release atomic.AddInt32(&running, -1) wg.Done() - return fmt.Sprintf("comm-%d", tid) + return fmt.Sprintf("comm-%d", tid), nil } for i := 1; i <= lookups; i++ { @@ -84,13 +86,13 @@ func TestCommResolverQueueLookupQueueFullClearsPending(t *testing.T) { defer resolver.shutdown() resolver.lookupWorkers = 1 resolver.lookupQueue = make(chan uint32, 1) - resolver.resolveFn = func(tid uint32) string { + resolver.resolveFn = func(tid uint32) (string, error) { select { case started <- struct{}{}: default: } <-release - return fmt.Sprintf("comm-%d", tid) + return fmt.Sprintf("comm-%d", tid), nil } const tid1 uint32 = 101 @@ -139,10 +141,10 @@ func TestCommResolverShutdownStopsWorkersAndPreventsNewLookups(t *testing.T) { resolver := newCommResolver(nil) resolver.lookupWorkers = 1 resolver.lookupQueue = make(chan uint32, 1) - resolver.resolveFn = func(tid uint32) string { + resolver.resolveFn = func(tid uint32) (string, error) { started <- struct{}{} <-release - return fmt.Sprintf("comm-%d", tid) + return fmt.Sprintf("comm-%d", tid), nil } const activeTID uint32 = 201 @@ -182,6 +184,48 @@ func TestCommResolverShutdownStopsWorkersAndPreventsNewLookups(t *testing.T) { } } +func TestCommResolverLookupWarnsOnUnexpectedResolveError(t *testing.T) { + const tid uint32 = 301 + + warnings := make(chan string, 1) + resolver := newCommResolver(nil) + defer resolver.shutdown() + resolver.lookupWorkers = 1 + resolver.lookupQueue = make(chan uint32, 1) + resolver.warningFn = func(message string) { warnings <- message } + resolver.resolveFn = func(uint32) (string, error) { + return "", errors.New("boom") + } + + resolver.queueLookup(tid) + + waitForCondition(t, 2*time.Second, "expected failed lookup to clear pending state", func() bool { + return pendingCount(resolver) == 0 + }) + if _, ok := resolver.cached(tid); ok { + t.Fatalf("did not expect tid %d to be cached after resolve failure", tid) + } + + select { + case message := <-warnings: + if message == "" || !strings.Contains(message, "boom") { + t.Fatalf("expected warning to mention boom, got %q", message) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for resolve warning") + } +} + +func TestResolveCommFromProcWithErrorIgnoresMissingProcess(t *testing.T) { + comm, err := resolveCommFromProcWithError(^uint32(0)) + if err != nil { + t.Fatalf("expected missing procfs entries to be handled without error, got %v", err) + } + if comm != "" { + t.Fatalf("expected no comm for missing pid, got %q", comm) + } +} + func hasPending(r *commResolver, tid uint32) bool { r.mu.RLock() defer r.mu.RUnlock() diff --git a/internal/eventloop_exit.go b/internal/eventloop_exit.go index 7fe2469..b0c0256 100644 --- a/internal/eventloop_exit.go +++ b/internal/eventloop_exit.go @@ -251,8 +251,12 @@ func (e *eventLoop) handleNullExit(ep *event.Pair, nullEv *types.NullEvent) bool return false } if retEvent.Ret > 0 { - if cwd, err := os.Readlink(procTidPathPrefix(nullEv.GetTid()) + "/cwd"); err == nil { + cwd, err := os.Readlink(procTidPathPrefix(nullEv.GetTid()) + "/cwd") + switch { + case err == nil: ep.File = file.NewPathname([]byte(cwd)) + case !isTransientProcError(err): + e.notifyWarning(fmt.Sprintf("failed to resolve cwd for tid %d: %v", nullEv.GetTid(), err)) } } } |
