b6190357ba
Cross-host Gemma 4 throughput comparison across three architectures.

Harness at scripts/gpu-bakeoff/; writeup at docs/reference/gpu-bakeoff-2026-04-20.md.

Key findings:
- RTX 3090 Ti wins decode decisively (128 tok/s on gemma4:26b MoE Q4, ~4.7× faster than gemma4:31b dense on the same card).
- AMD Strix Halo iGPU lands at ~42% of 3090 Ti decode on ~25% of the memory bandwidth — good SIMD utilization, especially for MoE.
- V100 numbers are DEGRADED: CT 167 ai-visualizer SDXL consumes 31/32 GB of its VRAM, forcing Gemma 4 models 95% onto CPU. Isolated V100 run requires SDXL eviction — left as follow-up.
- MoE vs dense is the dominant latency factor across all GPUs: ~4 B active params of gemma4:26b beats 31.3 B active of gemma4:31b by the same ratio (~4.7×) on every card tested.

Methodology: 1 warmup + 3 measurement runs per (host × model × prompt-length), Ollama's canonical timing fields, temp=0 greedy, num_predict=256. All three Ollama servers accessed via HTTP (Strix via Tailscale).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
81 lines
1.7 KiB
JSON
81 lines
1.7 KiB
JSON
{
  "host": "steel141",
  "gpu": "RTX 3090 Ti",
  "vram_gb": 24,
  "model_alias": "gemma4:31b",
  "model_tag": "gemma4:31b-it-q4_K_M",
  "prompt_key": "long",
  "prompt_chars": 1614,
  "num_predict": 256,
  "num_ctx": 4096,
  "runs": [
    {
      "prompt_tokens": 318,
      "prompt_eval_ms": 38.2,
      "prefill_tok_per_s": 8318.85,
      "output_tokens": 256,
      "eval_ms": 9390.5,
      "decode_tok_per_s": 27.26,
      "load_ms": 317.4,
      "total_ms": 9886.3,
      "harness_wall_s": 9.89,
      "done_reason": "length"
    },
    {
      "prompt_tokens": 318,
      "prompt_eval_ms": 42.7,
      "prefill_tok_per_s": 7454.7,
      "output_tokens": 256,
      "eval_ms": 9429.0,
      "decode_tok_per_s": 27.15,
      "load_ms": 316.0,
      "total_ms": 9929.8,
      "harness_wall_s": 9.933,
      "done_reason": "length"
    },
    {
      "prompt_tokens": 318,
      "prompt_eval_ms": 41.2,
      "prefill_tok_per_s": 7716.07,
      "output_tokens": 256,
      "eval_ms": 9477.4,
      "decode_tok_per_s": 27.01,
      "load_ms": 334.3,
      "total_ms": 9996.2,
      "harness_wall_s": 10.0,
      "done_reason": "length"
    }
  ],
  "warmup": {
    "prompt_tokens": 318,
    "prompt_eval_ms": 967.7,
    "prefill_tok_per_s": 328.62,
    "output_tokens": 256,
    "eval_ms": 9339.6,
    "decode_tok_per_s": 27.41,
    "load_ms": 324.2,
    "total_ms": 10774.3,
    "harness_wall_s": 10.778,
    "done_reason": "length"
  },
  "summary": {
    "prefill_tok_per_s": {
      "min": 7454.7,
      "median": 7716.07,
      "max": 8318.85,
      "n": 3
    },
    "decode_tok_per_s": {
      "min": 27.01,
      "median": 27.15,
      "max": 27.26,
      "n": 3
    },
    "total_ms": {
      "min": 9886.3,
      "median": 9929.8,
      "max": 9996.2,
      "n": 3
    }
  }
}