diff options
| author | Paul Buetow <paul@buetow.org> | 2026-03-21 22:39:00 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-03-21 22:39:00 +0200 |
| commit | f5c2125d1c1cbf3adde917747aba61cbc3a0f228 (patch) | |
| tree | 99e6279a3c9825114d90993adccf7eb177d09a9d | |
| parent | 0b4bbe047af8222ba0cc5d8100e2ef60ee8093bd (diff) | |
Fix nemotron-3-super vLLM OOM: cap context and add --enforce-eager

The [vllm] defaults had max_model_len=262144 without --enforce-eager,
causing the vLLM container to OOM on startup (CUDA graph capture costs
~3-4 GB on top of ~60 GB nemotron weights on the A100 80GB).

Also switch flavor to n3-H100x1 since n3-A100x1 is out of stock in
CANADA-1.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
| -rw-r--r-- | hyperstack-vm1.toml | 13 |
1 file changed, 8 insertions, 5 deletions
```diff
diff --git a/hyperstack-vm1.toml b/hyperstack-vm1.toml
index e101bec..a495dd2 100644
--- a/hyperstack-vm1.toml
+++ b/hyperstack-vm1.toml
@@ -13,9 +13,9 @@ name_prefix = "hyperstack1"
 hostname = "hyperstack1"
 environment_name = "snonux-ollama"
 
-# A100-80GB is the cost-first default for nemotron-3-super inference.
-# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom.
-flavor_name = "n3-A100x1"
+# H100-80GB: switched from n3-A100x1 which was out of stock in CANADA-1.
+# H100 also provides safer throughput and compatibility headroom for nemotron-3-super.
+flavor_name = "n3-H100x1"
 image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
 assign_floating_ip = true
 create_bootable_volume = false
@@ -66,13 +66,16 @@ model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
 # HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
 hug_cache_dir = "/ephemeral/hug"
 container_name = "vllm_nemotron_super"
-max_model_len = 262144
+# Capped at 131072 to keep KV cache within VRAM budget on A100 80GB.
+# 262144 OOMs without --enforce-eager (CUDA graph capture costs ~3-4 GB on top of ~60 GB weights).
+max_model_len = 131072
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 # NVIDIA Nemotron-3-Super uses the same XML tool call format as Qwen3 XML.
 tool_call_parser = "qwen3_xml"
 trust_remote_code = true
-extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
+# --enforce-eager disables CUDA graph capture, freeing ~3-4 GB needed to fit within A100 80GB.
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
 
 # Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1.toml model switch <name>'.
 # Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
```
