 hyperstack-vm1-gptoss.toml | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/hyperstack-vm1-gptoss.toml b/hyperstack-vm1-gptoss.toml
index ce489ee..af25248 100644
--- a/hyperstack-vm1-gptoss.toml
+++ b/hyperstack-vm1-gptoss.toml
@@ -87,17 +87,22 @@ tensor_parallel_size = 1
 tool_call_parser = "qwen3_coder"
 
 # NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
+# Single-GPU (A100-80GB) config: tensor_parallel_size=1, context capped at 32k to fit in VRAM.
+# Model weights occupy ~73.6 GiB of the 79.25 GiB A100; very little VRAM remains for KV cache.
+# enforce_eager=true disables CUDA graph capture, which avoids the large profiling-phase OOM.
+# gpu_memory_utilization=0.98 lets vLLM use nearly all available VRAM for KV blocks.
+# max_model_len reduced to 32768 to keep the KV cache footprint small enough to fit.
 [vllm.presets.nemotron-super]
 model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
 container_name = "vllm_nemotron_super"
-max_model_len = 1048576
-gpu_memory_utilization = 0.85
-tensor_parallel_size = 2
+max_model_len = 32768
+gpu_memory_utilization = 0.98
+tensor_parallel_size = 1
 tool_call_parser = "qwen3_xml"
 trust_remote_code = true
 enable_prefix_caching = false
 extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"]
-extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
 
 # OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
 [vllm.presets.gpt-oss-20b]
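
For context, a back-of-envelope check of the VRAM budget behind this change, using only the figures cited in the new comments (79.25 GiB total, ~73.6 GiB of weights, gpu_memory_utilization=0.98). This is a rough sketch, not part of the commit; the per-token KV cost below is a placeholder assumption, since the real figure depends on the hybrid Mamba+MoE layer layout, which the commit does not spell out.

    GIB = 1024 ** 3
    KIB = 1024

    total_vram_gib = 79.25   # A100-80GB as reported in the commit comment
    weights_gib = 73.6       # AWQ 4-bit weights, from the commit comment
    gpu_mem_util = 0.98      # gpu_memory_utilization in the new preset

    usable_gib = total_vram_gib * gpu_mem_util
    kv_budget_gib = usable_gib - weights_gib
    print(f"usable: {usable_gib:.2f} GiB -> ~{kv_budget_gib:.2f} GiB left for KV cache")

    # HYPOTHETICAL per-token KV cost, for scaling illustration only; the true
    # value for this model is not given in the commit.
    kv_per_token_kib = 100
    for ctx_len in (32_768, 1_048_576):
        need_gib = ctx_len * kv_per_token_kib * KIB / GIB
        fits = "fits" if need_gib <= kv_budget_gib else "does not fit"
        print(f"context {ctx_len:>9,}: ~{need_gib:,.1f} GiB KV -> {fits}")

With only ~4 GiB left after weights, the old 1,048,576-token limit could never fit on a single GPU under any plausible per-token KV cost, which is why the preset drops max_model_len to 32768 and pushes gpu_memory_utilization up to 0.98 to reclaim what headroom remains for KV blocks.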
