summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-03-21 22:39:00 +0200
committerPaul Buetow <paul@buetow.org>2026-03-21 22:39:00 +0200
commitf5c2125d1c1cbf3adde917747aba61cbc3a0f228 (patch)
tree99e6279a3c9825114d90993adccf7eb177d09a9d
parent0b4bbe047af8222ba0cc5d8100e2ef60ee8093bd (diff)
Fix nemotron-3-super vLLM OOM: cap context and add --enforce-eager
The [vllm] defaults had max_model_len=262144 without --enforce-eager, causing the vLLM container to OOM on startup (CUDA graph capture costs ~3-4 GB on top of ~60 GB nemotron weights on the A100 80GB). Also switch flavor to n3-H100x1 since n3-A100x1 is out of stock in CANADA-1. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
-rw-r--r--hyperstack-vm1.toml13
1 file changed, 8 insertions, 5 deletions
diff --git a/hyperstack-vm1.toml b/hyperstack-vm1.toml
index e101bec..a495dd2 100644
--- a/hyperstack-vm1.toml
+++ b/hyperstack-vm1.toml
@@ -13,9 +13,9 @@ name_prefix = "hyperstack1"
hostname = "hyperstack1"
environment_name = "snonux-ollama"
-# A100-80GB is the cost-first default for nemotron-3-super inference.
-# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom.
-flavor_name = "n3-A100x1"
+# H100-80GB: switched from n3-A100x1 which was out of stock in CANADA-1.
+# H100 also provides safer throughput and compatibility headroom for nemotron-3-super.
+flavor_name = "n3-H100x1"
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
@@ -66,13 +66,16 @@ model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
container_name = "vllm_nemotron_super"
-max_model_len = 262144
+# Capped at 131072 to keep the KV cache within the 80 GB VRAM budget (same on A100 and the current H100 flavor).
+# 262144 OOMs without --enforce-eager (CUDA graph capture costs ~3-4 GB on top of ~60 GB weights).
+max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# NVIDIA Nemotron-3-Super uses the same XML tool call format as Qwen3 XML.
tool_call_parser = "qwen3_xml"
trust_remote_code = true
-extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
+# --enforce-eager disables CUDA graph capture, freeing ~3-4 GB needed to fit within an 80 GB GPU (A100/H100).
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1.toml model switch <name>'.
# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.