summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-03-21 22:39:00 +0200
committerPaul Buetow <paul@buetow.org>2026-03-21 22:39:00 +0200
commitf5c2125d1c1cbf3adde917747aba61cbc3a0f228 (patch)
tree99e6279a3c9825114d90993adccf7eb177d09a9d
parent0b4bbe047af8222ba0cc5d8100e2ef60ee8093bd (diff)
Fix nemotron-3-super vLLM OOM: cap context and add --enforce-eager
The [vllm] defaults had max_model_len=262144 without --enforce-eager, causing the vLLM container to OOM on startup (CUDA graph capture costs ~3-4 GB on top of ~60 GB nemotron weights on the A100 80GB). Also switch flavor to n3-H100x1 since n3-A100x1 is out of stock in CANADA-1. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
-rw-r--r--hyperstack-vm1.toml13
1 file changed, 8 insertions, 5 deletions
diff --git a/hyperstack-vm1.toml b/hyperstack-vm1.toml
index e101bec..a495dd2 100644
--- a/hyperstack-vm1.toml
+++ b/hyperstack-vm1.toml
@@ -13,9 +13,9 @@ name_prefix = "hyperstack1"
hostname = "hyperstack1"
environment_name = "snonux-ollama"
-# A100-80GB is the cost-first default for nemotron-3-super inference.
-# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom.
-flavor_name = "n3-A100x1"
+# H100-80GB: switched from n3-A100x1 which was out of stock in CANADA-1.
+# H100 also provides safer throughput and compatibility headroom for nemotron-3-super.
+flavor_name = "n3-H100x1"
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
@@ -66,13 +66,16 @@ model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
container_name = "vllm_nemotron_super"
-max_model_len = 262144
+# Capped at 131072 to keep the KV cache within the 80 GB VRAM budget (same on A100 and the current H100 flavor).
+# 262144 OOMs without --enforce-eager (CUDA graph capture costs ~3-4 GB on top of ~60 GB weights).
+max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# NVIDIA Nemotron-3-Super uses the same XML tool call format as Qwen3 XML.
tool_call_parser = "qwen3_xml"
trust_remote_code = true
-extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
+# --enforce-eager disables CUDA graph capture, freeing ~3-4 GB needed to fit within an 80 GB GPU (A100/H100).
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1.toml model switch <name>'.
# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.