author    Paul Buetow <paul@buetow.org>  2026-03-26 09:15:57 +0200
committer Paul Buetow <paul@buetow.org>  2026-03-26 09:15:57 +0200
commit    117874f6707a448f4ac46ba86f69a0fd45cb04a9 (patch)
tree      c9f05056ab8de446715d6c7ce284e00217d44850
parent    b97d703ea1b0ca825ed4bcc1325fafebe5e8534d (diff)
hyperstack: tune nemotron-super preset for single A100-80GB
Model weights occupy ~73.6 GiB, leaving ~5.6 GiB for KV cache. Reduce
max_model_len to 32768 and raise gpu_memory_utilization to 0.98 to fit.
Add --enforce-eager to disable CUDA graph capture, whose profiling phase
requires ~2 GiB of headroom that simply isn't available on a single A100.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
-rw-r--r--  hyperstack-vm1-gptoss.toml  13
1 file changed, 9 insertions, 4 deletions
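For reference, a back-of-envelope sketch of the VRAM budget described in the
commit message. The total and weight figures come from the message itself; the
per-token KV footprint is a purely illustrative assumption (Nemotron's hybrid
Mamba+MoE layout keeps attention KV state for only a subset of layers, so the
real number differs):

    # VRAM budget for the single-A100 nemotron-super preset.
    # TOTAL_VRAM_GIB and WEIGHTS_GIB are from the commit message;
    # KV_BYTES_PER_TOKEN is a rough illustrative assumption.

    TOTAL_VRAM_GIB = 79.25   # A100-80GB as vLLM reports it
    WEIGHTS_GIB = 73.6       # AWQ 4-bit model weights
    GPU_MEM_UTIL = 0.98      # gpu_memory_utilization

    usable = TOTAL_VRAM_GIB * GPU_MEM_UTIL
    kv_budget = usable - WEIGHTS_GIB
    print(f"usable: {usable:.2f} GiB, KV budget: {kv_budget:.2f} GiB")
    # usable: 77.67 GiB, KV budget: 4.07 GiB

    # Hypothetical ~100 KiB/token KV footprint; one full 32k sequence:
    KV_BYTES_PER_TOKEN = 100 * 1024
    ctx_gib = 32768 * KV_BYTES_PER_TOKEN / 2**30
    print(f"one 32k-token sequence: {ctx_gib:.2f} GiB of KV cache")
    # one 32k-token sequence: 3.12 GiB

With those assumed numbers a single full-length sequence fits inside the
~4 GiB KV budget, while the old 1M-token max_model_len plainly could not.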
diff --git a/hyperstack-vm1-gptoss.toml b/hyperstack-vm1-gptoss.toml
index ce489ee..af25248 100644
--- a/hyperstack-vm1-gptoss.toml
+++ b/hyperstack-vm1-gptoss.toml
@@ -87,17 +87,22 @@ tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"
# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
+# Single-GPU (A100-80GB) config: tensor_parallel_size=1, context capped at 32k to fit in VRAM.
+# Model weights occupy ~73.6 GiB of the 79.25 GiB A100; very little VRAM remains for KV cache.
+# enforce_eager=true disables CUDA graph capture, which avoids the large profiling-phase OOM.
+# gpu_memory_utilization=0.98 lets vLLM use nearly all available VRAM for KV blocks.
+# max_model_len reduced to 32768 to keep the KV cache footprint small enough to fit.
[vllm.presets.nemotron-super]
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
container_name = "vllm_nemotron_super"
-max_model_len = 1048576
-gpu_memory_utilization = 0.85
-tensor_parallel_size = 2
+max_model_len = 32768
+gpu_memory_utilization = 0.98
+tensor_parallel_size = 1
tool_call_parser = "qwen3_xml"
trust_remote_code = true
enable_prefix_caching = false
extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"]
-extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
[vllm.presets.gpt-oss-20b]
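To make the preset-to-flag relationship concrete, here is a minimal sketch of
how a launcher might translate the [vllm.presets.nemotron-super] table into a
vllm serve invocation. The mapping logic is an assumption about this repo's
tooling, not its actual code; the flag names themselves are standard vLLM CLI
options:

    import tomllib  # Python 3.11+

    # Hypothetical launcher sketch: read the preset table and build the
    # equivalent `vllm serve` argument list. The preset-to-flag mapping
    # is an assumption; the flags are standard vLLM options.
    with open("hyperstack-vm1-gptoss.toml", "rb") as f:
        preset = tomllib.load(f)["vllm"]["presets"]["nemotron-super"]

    args = [
        "vllm", "serve", preset["model"],
        "--max-model-len", str(preset["max_model_len"]),
        "--gpu-memory-utilization", str(preset["gpu_memory_utilization"]),
        "--tensor-parallel-size", str(preset["tensor_parallel_size"]),
        "--tool-call-parser", preset["tool_call_parser"],
    ]
    if preset.get("trust_remote_code"):
        args.append("--trust-remote-code")
    if not preset.get("enable_prefix_caching", True):
        args.append("--no-enable-prefix-caching")
    args += preset.get("extra_vllm_args", [])  # carries --enforce-eager
    print(" ".join(args))

Keeping --enforce-eager in extra_vllm_args rather than a dedicated key means
the launcher needs no special handling for it, at the cost of it being less
discoverable than the named settings above it.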