Add AI server audio model profiles

2026-06-09 12:50:56 +03:00
parent aaecbb1bed
commit f49ba7abd5
3 changed files with 128 additions and 4 deletions
--- a/deploy/ai-server/docker-compose.audio.yml
+++ b/deploy/ai-server/docker-compose.audio.yml
@@ -0,0 +1,110 @@
+services:
+  # Qwen2-Audio-7B-Instruct served by vLLM (vllm/vllm-openai image). Starts with the
+  # `qwen-audio` or `audio-compare` profile; published on host address 10.2.3.5:8003.
+  qwen-audio:
+    image: vllm/vllm-openai:latest  # NOTE(review): floating tag; pin a release for reproducible deploys
+    container_name: qwen-audio
+    profiles:
+      - qwen-audio
+      - audio-compare
+    restart: unless-stopped
+    ipc: host
+    runtime: nvidia
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    environment:
+      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
+      VLLM_API_KEY: "${VLLM_API_KEY:?VLLM_API_KEY must be set}"  # vLLM reads the key from env, keeping it off argv
+      HF_HOME: /cache
+    volumes:
+      - ./data/vllm-cache:/cache
+    ports:
+      - "10.2.3.5:8003:8000"
+    command:
+      - "--model"
+      - "Qwen/Qwen2-Audio-7B-Instruct"
+      - "--served-model-name"
+      - "Qwen/Qwen2-Audio-7B-Instruct"
+      - "--trust-remote-code"
+      - "--host"
+      - "0.0.0.0"
+      - "--port"
+      - "8000"
+      - "--max-model-len"
+      - "8192"
+      - "--gpu-memory-utilization"
+      - "0.45"  # NOTE(review): plus voxtral-small's 0.72 this exceeds 1.0; confirm they never share a GPU under audio-compare
+      - "--max-num-seqs"
+      - "4"
+      - "--max-num-batched-tokens"
+      - "4096"
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
+      interval: 30s
+      timeout: 5s
+      retries: 5
+      start_period: 900s  # probe failures inside this window do not count toward `retries`
+
+  # Voxtral-Small-24B-2507 served by vLLM (vllm/vllm-openai image). Starts with the
+  # `voxtral-small` or `audio-compare` profile; published on host address 10.2.3.5:8004.
+  voxtral-small:
+    image: vllm/vllm-openai:latest  # NOTE(review): floating tag; pin a release for reproducible deploys
+    container_name: voxtral-small
+    profiles:
+      - voxtral-small
+      - audio-compare
+    restart: unless-stopped
+    ipc: host
+    runtime: nvidia
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    environment:
+      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
+      VLLM_API_KEY: "${VLLM_API_KEY:?VLLM_API_KEY must be set}"  # vLLM reads the key from env, keeping it off argv
+      HF_HOME: /cache
+    volumes:
+      - ./data/vllm-cache:/cache
+    ports:
+      - "10.2.3.5:8004:8000"
+    command:
+      - "--model"
+      - "mistralai/Voxtral-Small-24B-2507"
+      - "--served-model-name"
+      - "mistralai/Voxtral-Small-24B-2507"
+      - "--tokenizer-mode"
+      - "mistral"
+      - "--config-format"
+      - "mistral"
+      - "--load-format"
+      - "mistral"
+      - "--tool-call-parser"
+      - "mistral"
+      - "--enable-auto-tool-choice"
+      - "--host"
+      - "0.0.0.0"
+      - "--port"
+      - "8000"
+      - "--max-model-len"
+      - "32768"
+      - "--gpu-memory-utilization"
+      - "0.72"  # NOTE(review): plus qwen-audio's 0.45 this exceeds 1.0; confirm they never share a GPU under audio-compare
+      - "--max-num-seqs"
+      - "2"
+      - "--max-num-batched-tokens"
+      - "8192"
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
+      interval: 30s
+      timeout: 5s
+      retries: 5
+      start_period: 1200s  # probe failures inside this window do not count toward `retries`