diff --git a/README.md b/README.md index e0ea826..be1740d 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,18 @@ Qwen2-Audio and Voxtral are called through an OpenAI-compatible `/v1/chat/completions` endpoint with `input_audio`; set their endpoint URLs only after the models are actually exposed on the AI server. +AI-server compose snippets for these temporary comparison endpoints live in +`deploy/ai-server/docker-compose.audio.yml`. They are profile-gated because the +single GPU cannot keep the production text vLLM, two WhisperX instances, +Qwen2-Audio and Voxtral loaded at the same time: + +- Qwen2-Audio endpoint: `http://10.2.3.5:8003` +- Voxtral endpoint: `http://10.2.3.5:8004` +- Start Qwen only: + `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile qwen-audio up -d qwen-audio` +- Start Voxtral only: + `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small` + ## API - `POST /api/v1/jobs` creates one job. @@ -104,10 +116,12 @@ for Kubernetes probes. 
- `WHISPERX_URL`, WhisperX endpoint for transcription jobs - `QWEN_AUDIO_BASE_URL`, OpenAI-compatible endpoint for Qwen2-Audio - `QWEN_AUDIO_MODEL`, default `Qwen/Qwen2-Audio-7B-Instruct` -- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio +- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio; falls back to + `AUDIO_LLM_API_KEY`, then `LLM_API_KEY` - `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral - `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507` -- `VOXTRAL_API_KEY`, optional bearer token for Voxtral +- `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to + `AUDIO_LLM_API_KEY`, then `LLM_API_KEY` - `AUDIO_LLM_PROMPT`, transcription instruction for audio LLM providers - `AUDIO_LLM_MAX_TOKENS`, default `4096` - `WORKER_ID`, default hostname diff --git a/deploy/ai-server/docker-compose.audio.yml b/deploy/ai-server/docker-compose.audio.yml new file mode 100644 index 0000000..9db527b --- /dev/null +++ b/deploy/ai-server/docker-compose.audio.yml @@ -0,0 +1,110 @@ +services: + qwen-audio: + image: vllm/vllm-openai:latest + container_name: qwen-audio + profiles: + - qwen-audio + - audio-compare + restart: unless-stopped + ipc: host + runtime: nvidia + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + environment: + HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN} + VLLM_API_KEY: ${VLLM_API_KEY} + HF_HOME: /cache + volumes: + - ./data/vllm-cache:/cache + ports: + - "10.2.3.5:8003:8000" + command: + - "--model" + - "Qwen/Qwen2-Audio-7B-Instruct" + - "--served-model-name" + - "Qwen/Qwen2-Audio-7B-Instruct" + - "--trust-remote-code" + - "--host" + - "0.0.0.0" + - "--port" + - "8000" + - "--max-model-len" + - "8192" + - "--gpu-memory-utilization" + - "0.45" + - "--api-key" + - "${VLLM_API_KEY}" + - "--max-num-seqs" + - "4" + - "--max-num-batched-tokens" + - "4096" + healthcheck: + test: ["CMD", "python3", "-c", "import urllib.request; 
urllib.request.urlopen('http://localhost:8000/health')"] + interval: 30s + timeout: 5s + retries: 5 + start_period: 900s + + voxtral-small: + image: vllm/vllm-openai:latest + container_name: voxtral-small + profiles: + - voxtral-small + - audio-compare + restart: unless-stopped + ipc: host + runtime: nvidia + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + environment: + HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN} + VLLM_API_KEY: ${VLLM_API_KEY} + HF_HOME: /cache + volumes: + - ./data/vllm-cache:/cache + ports: + - "10.2.3.5:8004:8000" + command: + - "--model" + - "mistralai/Voxtral-Small-24B-2507" + - "--served-model-name" + - "mistralai/Voxtral-Small-24B-2507" + - "--tokenizer-mode" + - "mistral" + - "--config-format" + - "mistral" + - "--load-format" + - "mistral" + - "--tool-call-parser" + - "mistral" + - "--enable-auto-tool-choice" + - "--host" + - "0.0.0.0" + - "--port" + - "8000" + - "--max-model-len" + - "32768" + - "--gpu-memory-utilization" + - "0.72" + - "--api-key" + - "${VLLM_API_KEY}" + - "--max-num-seqs" + - "2" + - "--max-num-batched-tokens" + - "8192" + healthcheck: + test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] + interval: 30s + timeout: 5s + retries: 5 + start_period: 1200s diff --git a/internal/config/config.go b/internal/config/config.go index 082c076..34279f4 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -63,11 +63,11 @@ func Load() Config { WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute), WhisperXLeadSilence: envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond), QwenAudioBaseURL: envString("QWEN_AUDIO_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")), - QwenAudioAPIKey: envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", "")), + QwenAudioAPIKey: envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))), QwenAudioModel: 
envString("QWEN_AUDIO_MODEL", "Qwen/Qwen2-Audio-7B-Instruct"), QwenAudioTimeout: envDuration("QWEN_AUDIO_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)), VoxtralBaseURL: envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")), - VoxtralAPIKey: envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", "")), + VoxtralAPIKey: envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))), VoxtralModel: envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"), VoxtralTimeout: envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)), AudioLLMMaxTokens: envInt("AUDIO_LLM_MAX_TOKENS", 4096),