Add AI server audio model profiles

2026-06-09 12:50:56 +03:00
parent aaecbb1bed
commit f49ba7abd5
3 changed files with 128 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -62,6 +62,18 @@ Qwen2-Audio and Voxtral are called through an OpenAI-compatible
 `/v1/chat/completions` endpoint with `input_audio`; set their endpoint URLs only
 after the models are actually exposed on the AI server.

+AI-server compose snippets for these temporary comparison endpoints live in
+`deploy/ai-server/docker-compose.audio.yml`. They are profile-gated because the
+single GPU cannot keep the production text vLLM, two WhisperX instances,
+Qwen2-Audio and Voxtral loaded at the same time:
+
+- Qwen2-Audio endpoint: `http://10.2.3.5:8003`
+- Voxtral endpoint: `http://10.2.3.5:8004`
+- Start Qwen2-Audio only:
+  `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile qwen-audio up -d qwen-audio`
+- Start Voxtral only:
+  `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small`
+
 ## API

 - `POST /api/v1/jobs` creates one job.
@@ -104,10 +116,12 @@ for Kubernetes probes.
 - `WHISPERX_URL`, WhisperX endpoint for transcription jobs
 - `QWEN_AUDIO_BASE_URL`, OpenAI-compatible endpoint for Qwen2-Audio
 - `QWEN_AUDIO_MODEL`, default `Qwen/Qwen2-Audio-7B-Instruct`
- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio
+- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio; falls back to
+  `AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
 - `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
 - `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
- `VOXTRAL_API_KEY`, optional bearer token for Voxtral
+- `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to
+  `AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
 - `AUDIO_LLM_PROMPT`, transcription instruction for audio LLM providers
 - `AUDIO_LLM_MAX_TOKENS`, default `4096`
 - `WORKER_ID`, default hostname
--- a/deploy/ai-server/docker-compose.audio.yml
+++ b/deploy/ai-server/docker-compose.audio.yml
@@ -0,0 +1,110 @@
+services:
+  qwen-audio:
+    image: vllm/vllm-openai:latest
+    container_name: qwen-audio
+    profiles:
+      - qwen-audio
+      - audio-compare
+    restart: unless-stopped
+    ipc: host
+    runtime: nvidia
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    environment:
+      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
+      VLLM_API_KEY: ${VLLM_API_KEY}
+      HF_HOME: /cache
+    volumes:
+      - ./data/vllm-cache:/cache
+    ports:
+      - "10.2.3.5:8003:8000"
+    command:
+      - "--model"
+      - "Qwen/Qwen2-Audio-7B-Instruct"
+      - "--served-model-name"
+      - "Qwen/Qwen2-Audio-7B-Instruct"
+      - "--trust-remote-code"
+      - "--host"
+      - "0.0.0.0"
+      - "--port"
+      - "8000"
+      - "--max-model-len"
+      - "8192"
+      - "--gpu-memory-utilization"
+      - "0.45"
+      - "--api-key"
+      - "${VLLM_API_KEY}"
+      - "--max-num-seqs"
+      - "4"
+      - "--max-num-batched-tokens"
+      - "4096"
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
+      interval: 30s
+      timeout: 5s
+      retries: 5
+      start_period: 900s
+
+  voxtral-small:
+    image: vllm/vllm-openai:latest
+    container_name: voxtral-small
+    profiles:
+      - voxtral-small
+      - audio-compare
+    restart: unless-stopped
+    ipc: host
+    runtime: nvidia
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    environment:
+      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
+      VLLM_API_KEY: ${VLLM_API_KEY}
+      HF_HOME: /cache
+    volumes:
+      - ./data/vllm-cache:/cache
+    ports:
+      - "10.2.3.5:8004:8000"
+    command:
+      - "--model"
+      - "mistralai/Voxtral-Small-24B-2507"
+      - "--served-model-name"
+      - "mistralai/Voxtral-Small-24B-2507"
+      - "--tokenizer-mode"
+      - "mistral"
+      - "--config-format"
+      - "mistral"
+      - "--load-format"
+      - "mistral"
+      - "--tool-call-parser"
+      - "mistral"
+      - "--enable-auto-tool-choice"
+      - "--host"
+      - "0.0.0.0"
+      - "--port"
+      - "8000"
+      - "--max-model-len"
+      - "32768"
+      - "--gpu-memory-utilization"
+      - "0.72"
+      - "--api-key"
+      - "${VLLM_API_KEY}"
+      - "--max-num-seqs"
+      - "2"
+      - "--max-num-batched-tokens"
+      - "8192"
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
+      interval: 30s
+      timeout: 5s
+      retries: 5
+      start_period: 1200s
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -63,11 +63,11 @@ func Load() Config {
 		WhisperXTimeout:        envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
 		WhisperXLeadSilence:    envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond),
 		QwenAudioBaseURL:       envString("QWEN_AUDIO_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
-		QwenAudioAPIKey:        envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", "")),
+		QwenAudioAPIKey:        envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
 		QwenAudioModel:         envString("QWEN_AUDIO_MODEL", "Qwen/Qwen2-Audio-7B-Instruct"),
 		QwenAudioTimeout:       envDuration("QWEN_AUDIO_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
 		VoxtralBaseURL:         envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
-		VoxtralAPIKey:          envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", "")),
+		VoxtralAPIKey:          envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
 		VoxtralModel:           envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
 		VoxtralTimeout:         envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
 		AudioLLMMaxTokens:      envInt("AUDIO_LLM_MAX_TOKENS", 4096),