Add AI server audio model profiles

This commit is contained in:
Grendgi
2026-06-09 12:50:56 +03:00
parent aaecbb1bed
commit f49ba7abd5
3 changed files with 128 additions and 4 deletions

View File

@@ -0,0 +1,110 @@
# Docker Compose definitions for the audio-capable vLLM model servers.
#
# Profiles:
#   qwen-audio     - start only the Qwen2-Audio server (host port 8003)
#   voxtral-small  - start only the Voxtral Small server (host port 8004)
#   audio-compare  - start both servers together
#
# Required variables:
#   VLLM_API_KEY   - API key the servers require from clients. Compose aborts
#                    when it is unset or empty, so a server can never come up
#                    unauthenticated by accident.
#
# Optional variables (defaults reproduce the previously hard-coded values):
#   HF_TOKEN                 - Hugging Face access token (empty by default)
#   VLLM_IMAGE_TAG           - vllm/vllm-openai image tag (default: latest);
#                              pin a concrete version for reproducible deploys
#   QWEN_AUDIO_GPU_MEM_UTIL  - --gpu-memory-utilization for qwen-audio (0.45)
#   VOXTRAL_GPU_MEM_UTIL     - --gpu-memory-utilization for voxtral-small (0.72)
#
# NOTE(review): both services reserve every GPU (count: all) and the default
# memory fractions add up to 1.17. --gpu-memory-utilization is a per-instance
# fraction of a GPU's memory, so if both instances are placed on the same GPU
# under the audio-compare profile they cannot both fit. Lower the two
# *_GPU_MEM_UTIL values so they sum to less than 1.0, or give each service its
# own GPU, before relying on audio-compare.
services:
  qwen-audio:
    image: "vllm/vllm-openai:${VLLM_IMAGE_TAG:-latest}"
    container_name: qwen-audio
    profiles:
      - qwen-audio
      - audio-compare
    restart: unless-stopped
    ipc: host
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # HUGGING_FACE_HUB_TOKEN is the legacy variable name; HF_TOKEN is the
      # current one. Both are exported so either client version finds it.
      HUGGING_FACE_HUB_TOKEN: "${HF_TOKEN:-}"
      HF_TOKEN: "${HF_TOKEN:-}"
      # vLLM reads the API key from this variable, so it is not repeated as an
      # --api-key argument (which would expose it in the process arguments).
      VLLM_API_KEY: "${VLLM_API_KEY:?VLLM_API_KEY must be set to a non-empty value}"
      HF_HOME: /cache
    volumes:
      # Model download cache, shared with the other vLLM service in this file.
      - ./data/vllm-cache:/cache
    ports:
      - "10.2.3.5:8003:8000"
    command:
      - "--model"
      - "Qwen/Qwen2-Audio-7B-Instruct"
      - "--served-model-name"
      - "Qwen/Qwen2-Audio-7B-Instruct"
      - "--trust-remote-code"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--max-model-len"
      - "8192"
      - "--gpu-memory-utilization"
      - "${QWEN_AUDIO_GPU_MEM_UTIL:-0.45}"
      - "--max-num-seqs"
      - "4"
      - "--max-num-batched-tokens"
      - "4096"
    healthcheck:
      # urlopen raises on a connection error or non-success status, which makes
      # python3 exit non-zero and marks the check as failed.
      test:
        - "CMD"
        - "python3"
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"
      interval: 30s
      timeout: 5s
      retries: 5
      # Long grace period to cover the first-start model download and load.
      start_period: 900s

  voxtral-small:
    image: "vllm/vllm-openai:${VLLM_IMAGE_TAG:-latest}"
    container_name: voxtral-small
    profiles:
      - voxtral-small
      - audio-compare
    restart: unless-stopped
    ipc: host
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # HUGGING_FACE_HUB_TOKEN is the legacy variable name; HF_TOKEN is the
      # current one. Both are exported so either client version finds it.
      HUGGING_FACE_HUB_TOKEN: "${HF_TOKEN:-}"
      HF_TOKEN: "${HF_TOKEN:-}"
      # vLLM reads the API key from this variable, so it is not repeated as an
      # --api-key argument (which would expose it in the process arguments).
      VLLM_API_KEY: "${VLLM_API_KEY:?VLLM_API_KEY must be set to a non-empty value}"
      HF_HOME: /cache
    volumes:
      # Model download cache, shared with the other vLLM service in this file.
      - ./data/vllm-cache:/cache
    ports:
      - "10.2.3.5:8004:8000"
    command:
      - "--model"
      - "mistralai/Voxtral-Small-24B-2507"
      - "--served-model-name"
      - "mistralai/Voxtral-Small-24B-2507"
      # Load tokenizer, config and weights in the Mistral-native formats.
      - "--tokenizer-mode"
      - "mistral"
      - "--config-format"
      - "mistral"
      - "--load-format"
      - "mistral"
      - "--tool-call-parser"
      - "mistral"
      - "--enable-auto-tool-choice"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--max-model-len"
      - "32768"
      - "--gpu-memory-utilization"
      - "${VOXTRAL_GPU_MEM_UTIL:-0.72}"
      - "--max-num-seqs"
      - "2"
      - "--max-num-batched-tokens"
      - "8192"
    healthcheck:
      # urlopen raises on a connection error or non-success status, which makes
      # python3 exit non-zero and marks the check as failed.
      test:
        - "CMD"
        - "python3"
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"
      interval: 30s
      timeout: 5s
      retries: 5
      # Longer grace period than qwen-audio: this is the larger (24B) model.
      start_period: 1200s