Add AI server audio model profiles

2026-06-09 12:50:56 +03:00
parent aaecbb1bed
commit f49ba7abd5
3 changed files with 128 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -62,6 +62,18 @@ Qwen2-Audio and Voxtral are called through an OpenAI-compatible
 `/v1/chat/completions` endpoint with `input_audio`; set their endpoint URLs only
 after the models are actually exposed on the AI server.
 AI-server compose snippets for these temporary comparison endpoints live in
 `deploy/ai-server/docker-compose.audio.yml`. They are profile-gated because the
 single GPU cannot keep the production text vLLM, two WhisperX instances,
 Qwen2-Audio and Voxtral loaded at the same time:
 - Qwen2-Audio endpoint: `http://10.2.3.5:8003`
 - Voxtral endpoint: `http://10.2.3.5:8004`
 - Start Qwen only:
  `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile qwen-audio up -d qwen-audio`
 - Start Voxtral only:
  `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small`
 ## API
 - `POST /api/v1/jobs` creates one job.
@@ -104,10 +116,12 @@ for Kubernetes probes.
 - `WHISPERX_URL`, WhisperX endpoint for transcription jobs
 - `QWEN_AUDIO_BASE_URL`, OpenAI-compatible endpoint for Qwen2-Audio
 - `QWEN_AUDIO_MODEL`, default `Qwen/Qwen2-Audio-7B-Instruct`
- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio
+- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio; falls back to
  `AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
 - `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
 - `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
- `VOXTRAL_API_KEY`, optional bearer token for Voxtral
+- `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to
  `AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
 - `AUDIO_LLM_PROMPT`, transcription instruction for audio LLM providers
 - `AUDIO_LLM_MAX_TOKENS`, default `4096`
 - `WORKER_ID`, default hostname
--- a/deploy/ai-server/docker-compose.audio.yml
+++ b/deploy/ai-server/docker-compose.audio.yml
@@ -0,0 +1,110 @@
 services:
  # Qwen2-Audio served through vLLM's OpenAI-compatible API server.
  # Started by the `qwen-audio` profile, or together with voxtral-small by
  # the `audio-compare` profile. Published on 10.2.3.5:8003.
  qwen-audio:
    # NOTE(review): `latest` is a moving tag; consider pinning a vLLM release
    # so restarts cannot silently change server behaviour.
    image: vllm/vllm-openai:latest
    container_name: qwen-audio
    profiles:
      - qwen-audio
      - audio-compare
    restart: unless-stopped
    # Share the host IPC namespace (vLLM's Docker docs recommend this so
    # PyTorch can use shared memory between processes).
    ipc: host
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # `:-` makes an unset variable expand to an empty string without a
      # Compose warning. HF_TOKEN is the current Hugging Face variable name;
      # HUGGING_FACE_HUB_TOKEN is kept for older huggingface_hub releases.
      HF_TOKEN: "${HF_TOKEN:-}"
      HUGGING_FACE_HUB_TOKEN: "${HF_TOKEN:-}"
      # vLLM reads the bearer token from this variable, so it is not repeated
      # as `--api-key` below; that keeps the secret out of the process
      # argument list. An empty value leaves the endpoint unauthenticated.
      VLLM_API_KEY: "${VLLM_API_KEY:-}"
      # Hugging Face cache root, backed by the bind mount below.
      HF_HOME: /cache
    volumes:
      - ./data/vllm-cache:/cache
    ports:
      # Bind only to the AI server's 10.2.3.5 address, not to all interfaces.
      - "10.2.3.5:8003:8000"
    command:
      - "--model"
      - "Qwen/Qwen2-Audio-7B-Instruct"
      - "--served-model-name"
      - "Qwen/Qwen2-Audio-7B-Instruct"
      - "--trust-remote-code"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--max-model-len"
      - "8192"
      # NOTE(review): the `audio-compare` profile also starts voxtral-small,
      # which requests 0.72; 0.45 + 0.72 exceeds a single GPU. Confirm that
      # profile is only used where both models actually fit.
      - "--gpu-memory-utilization"
      - "0.45"
      - "--max-num-seqs"
      - "4"
      - "--max-num-batched-tokens"
      - "4096"
    healthcheck:
      # Probe vLLM's /health endpoint from inside the container; urlopen
      # raises (non-zero exit) on connection errors and HTTP error statuses.
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 5s
      retries: 5
      # Probe failures during the first 15 minutes do not count towards
      # `retries` (presumably to cover the initial model download and load).
      start_period: 900s
  # Voxtral Small served through vLLM's OpenAI-compatible API server.
  # Started by the `voxtral-small` profile, or together with qwen-audio by
  # the `audio-compare` profile. Published on 10.2.3.5:8004.
  voxtral-small:
    # NOTE(review): `latest` is a moving tag; consider pinning a vLLM release
    # so restarts cannot silently change server behaviour.
    image: vllm/vllm-openai:latest
    container_name: voxtral-small
    profiles:
      - voxtral-small
      - audio-compare
    restart: unless-stopped
    # Share the host IPC namespace (vLLM's Docker docs recommend this so
    # PyTorch can use shared memory between processes).
    ipc: host
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # `:-` makes an unset variable expand to an empty string without a
      # Compose warning. HF_TOKEN is the current Hugging Face variable name;
      # HUGGING_FACE_HUB_TOKEN is kept for older huggingface_hub releases.
      HF_TOKEN: "${HF_TOKEN:-}"
      HUGGING_FACE_HUB_TOKEN: "${HF_TOKEN:-}"
      # vLLM reads the bearer token from this variable, so it is not repeated
      # as `--api-key` below; that keeps the secret out of the process
      # argument list. An empty value leaves the endpoint unauthenticated.
      VLLM_API_KEY: "${VLLM_API_KEY:-}"
      # Hugging Face cache root, backed by the bind mount below.
      HF_HOME: /cache
    volumes:
      - ./data/vllm-cache:/cache
    ports:
      # Bind only to the AI server's 10.2.3.5 address, not to all interfaces.
      - "10.2.3.5:8004:8000"
    command:
      - "--model"
      - "mistralai/Voxtral-Small-24B-2507"
      - "--served-model-name"
      - "mistralai/Voxtral-Small-24B-2507"
      # Load the model using Mistral's native tokenizer, config and weight
      # formats.
      - "--tokenizer-mode"
      - "mistral"
      - "--config-format"
      - "mistral"
      - "--load-format"
      - "mistral"
      # Enable automatic tool choice, parsed with the Mistral tool-call parser.
      - "--tool-call-parser"
      - "mistral"
      - "--enable-auto-tool-choice"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--max-model-len"
      - "32768"
      # NOTE(review): the `audio-compare` profile also starts qwen-audio,
      # which requests 0.45; 0.45 + 0.72 exceeds a single GPU. Confirm that
      # profile is only used where both models actually fit.
      - "--gpu-memory-utilization"
      - "0.72"
      - "--max-num-seqs"
      - "2"
      - "--max-num-batched-tokens"
      - "8192"
    healthcheck:
      # Probe vLLM's /health endpoint from inside the container; urlopen
      # raises (non-zero exit) on connection errors and HTTP error statuses.
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 5s
      retries: 5
      # Probe failures during the first 20 minutes do not count towards
      # `retries` (presumably to cover the initial model download and load).
      start_period: 1200s
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -63,11 +63,11 @@ func Load() Config {
 		WhisperXTimeout:        envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
 		WhisperXLeadSilence:    envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond),
 		QwenAudioBaseURL:       envString("QWEN_AUDIO_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
-		QwenAudioAPIKey:        envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", "")),
+		QwenAudioAPIKey:        envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
 		QwenAudioModel:         envString("QWEN_AUDIO_MODEL", "Qwen/Qwen2-Audio-7B-Instruct"),
 		QwenAudioTimeout:       envDuration("QWEN_AUDIO_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
 		VoxtralBaseURL:         envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
-		VoxtralAPIKey:          envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", "")),
+		VoxtralAPIKey:          envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
 		VoxtralModel:           envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
 		VoxtralTimeout:         envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
 		AudioLLMMaxTokens:      envInt("AUDIO_LLM_MAX_TOKENS", 4096),