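# Q5_K_M Unsloth quantization on Linux llama.cpp
# context 81k, flash attention on, 8-bit K/V caches
# prompt processing (pp) 625 t/s, token generation (tg) 30 t/s
# NOTE(review): the service definition mounts a Q4_K_M GGUF and passes
# -c 98304; confirm which quantization and context size these figures
# were measured with.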
# Docker Compose definition for a single llama.cpp server container
# (Vulkan build) serving a local GGUF model plus its multimodal projector.
services:
  llama:
    image: ghcr.io/ggml-org/llama.cpp:server-vulkan
    container_name: llama-qwen3.6-27b-dense
    ports:
      # host:container — quoted so YAML never retypes the mapping.
      - "4201:8080"
    volumes:
      # Model files are mounted read-only (ro); z applies a shared
      # SELinux label so the container may read them.
      # NOTE(review): the mounted file is a Q4_K_M quantization; confirm
      # it matches the quantization named in the file header.
      - ./Qwen3.6-27B-Q4_K_M.gguf:/models/model.gguf:ro,z
      - ./mmproj-BF16.gguf:/models/mmproj.gguf:ro,z
    # Expose the host GPU render nodes to the container and join the
    # video group so the server process can open them.
    devices:
      - /dev/dri
    group_add:
      - video
    # Server arguments, folded into one line at parse time. Lines are
    # grouped as: model files and alias; context/batch sizing; network
    # binding; slot and KV-cache behaviour; server features; KV-cache
    # quantization (q8_0 for both K and V); image token floor; sampling
    # defaults; reasoning mode.
    command: >-
      -m /models/model.gguf
      --mmproj /models/mmproj.gguf
      --alias "Qwen3.6 27b Dense"
      -ngl 99
      -c 98304
      -b 2048
      --host 0.0.0.0
      --port 8080
      --parallel 2
      --kv-unified
      --ubatch-size 2048
      --flash-attn on
      -cb
      --jinja
      --no-webui
      -ctk q8_0
      -ctv q8_0
      --image-min-tokens 1024
      --temp 0.6
      --top-k 20
      --top-p 0.95
      --repeat-penalty 1
      --presence-penalty 1.5
      --reasoning auto
    restart: unless-stopped