Add AI server audio model profiles
This commit is contained in:
18
README.md
18
README.md
@@ -62,6 +62,18 @@ Qwen2-Audio and Voxtral are called through an OpenAI-compatible
|
|||||||
`/v1/chat/completions` endpoint with `input_audio`; set their endpoint URLs only
|
`/v1/chat/completions` endpoint with `input_audio`; set their endpoint URLs only
|
||||||
after the models are actually exposed on the AI server.
|
after the models are actually exposed on the AI server.
|
||||||
|
|
||||||
|
AI-server compose snippets for these temporary comparison endpoints live in
|
||||||
|
`deploy/ai-server/docker-compose.audio.yml`. They are profile-gated because the
|
||||||
|
single GPU cannot keep the production text vLLM, two WhisperX instances,
|
||||||
|
Qwen2-Audio, and Voxtral loaded at the same time:
|
||||||
|
|
||||||
|
- Qwen2-Audio endpoint: `http://10.2.3.5:8003`
|
||||||
|
- Voxtral endpoint: `http://10.2.3.5:8004`
|
||||||
|
- Start Qwen only:
|
||||||
|
`docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile qwen-audio up -d qwen-audio`
|
||||||
|
- Start Voxtral only:
|
||||||
|
`docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small`
|
||||||
|
|
||||||
## API
|
## API
|
||||||
|
|
||||||
- `POST /api/v1/jobs` creates one job.
|
- `POST /api/v1/jobs` creates one job.
|
||||||
@@ -104,10 +116,12 @@ for Kubernetes probes.
|
|||||||
- `WHISPERX_URL`, WhisperX endpoint for transcription jobs
|
- `WHISPERX_URL`, WhisperX endpoint for transcription jobs
|
||||||
- `QWEN_AUDIO_BASE_URL`, OpenAI-compatible endpoint for Qwen2-Audio
|
- `QWEN_AUDIO_BASE_URL`, OpenAI-compatible endpoint for Qwen2-Audio
|
||||||
- `QWEN_AUDIO_MODEL`, default `Qwen/Qwen2-Audio-7B-Instruct`
|
- `QWEN_AUDIO_MODEL`, default `Qwen/Qwen2-Audio-7B-Instruct`
|
||||||
- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio
|
- `QWEN_AUDIO_API_KEY`, optional bearer token for Qwen2-Audio; falls back to
|
||||||
|
`AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
|
||||||
- `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
|
- `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
|
||||||
- `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
|
- `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
|
||||||
- `VOXTRAL_API_KEY`, optional bearer token for Voxtral
|
- `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to
|
||||||
|
`AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
|
||||||
- `AUDIO_LLM_PROMPT`, transcription instruction for audio LLM providers
|
- `AUDIO_LLM_PROMPT`, transcription instruction for audio LLM providers
|
||||||
- `AUDIO_LLM_MAX_TOKENS`, default `4096`
|
- `AUDIO_LLM_MAX_TOKENS`, default `4096`
|
||||||
- `WORKER_ID`, default hostname
|
- `WORKER_ID`, default hostname
|
||||||
|
|||||||
110
deploy/ai-server/docker-compose.audio.yml
Normal file
110
deploy/ai-server/docker-compose.audio.yml
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
---
# Temporary audio-model comparison endpoints served by vLLM.
#
# Both services are profile-gated, so a plain `docker compose up` starts
# neither of them. Enable one explicitly, for example:
#   docker compose -f docker-compose.yml -f docker-compose.audio.yml \
#     --profile qwen-audio up -d qwen-audio
#
# NOTE(review): both services also share the `audio-compare` profile, but
# their --gpu-memory-utilization fractions (0.45 + 0.72) add up to more than
# 1.0, so presumably they cannot both load on one GPU. Confirm the target
# hardware before starting them together.
services:
  # Qwen2-Audio 7B, published on 10.2.3.5:8003.
  qwen-audio:
    # Set VLLM_IMAGE to pin a tested release; defaults to the moving `latest`.
    image: "${VLLM_IMAGE:-vllm/vllm-openai:latest}"
    container_name: qwen-audio
    profiles:
      - qwen-audio
      - audio-compare
    restart: unless-stopped
    ipc: host
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      HUGGING_FACE_HUB_TOKEN: "${HF_TOKEN}"
      # vLLM reads the bearer token from VLLM_API_KEY, so it is not repeated
      # as an `--api-key` argument where it would show up in the process
      # command line.
      VLLM_API_KEY: "${VLLM_API_KEY}"
      # Hugging Face cache directory; bind-mounted below.
      HF_HOME: /cache
    volumes:
      - ./data/vllm-cache:/cache
    ports:
      - "10.2.3.5:8003:8000"
    command:
      - "--model"
      - "Qwen/Qwen2-Audio-7B-Instruct"
      - "--served-model-name"
      - "Qwen/Qwen2-Audio-7B-Instruct"
      - "--trust-remote-code"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--max-model-len"
      - "8192"
      - "--gpu-memory-utilization"
      - "0.45"
      - "--max-num-seqs"
      - "4"
      - "--max-num-batched-tokens"
      - "4096"
    healthcheck:
      # Probes /health with the image's python3.
      test:
        - CMD
        - python3
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"
      interval: 30s
      timeout: 5s
      retries: 5
      # Long grace period before failed probes count against the container.
      start_period: 900s

  # Voxtral Small 24B, published on 10.2.3.5:8004.
  voxtral-small:
    # Set VLLM_IMAGE to pin a tested release; defaults to the moving `latest`.
    image: "${VLLM_IMAGE:-vllm/vllm-openai:latest}"
    container_name: voxtral-small
    profiles:
      - voxtral-small
      - audio-compare
    restart: unless-stopped
    ipc: host
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      HUGGING_FACE_HUB_TOKEN: "${HF_TOKEN}"
      # vLLM reads the bearer token from VLLM_API_KEY, so it is not repeated
      # as an `--api-key` argument where it would show up in the process
      # command line.
      VLLM_API_KEY: "${VLLM_API_KEY}"
      # Hugging Face cache directory; bind-mounted below.
      HF_HOME: /cache
    volumes:
      - ./data/vllm-cache:/cache
    ports:
      - "10.2.3.5:8004:8000"
    command:
      - "--model"
      - "mistralai/Voxtral-Small-24B-2507"
      - "--served-model-name"
      - "mistralai/Voxtral-Small-24B-2507"
      - "--tokenizer-mode"
      - "mistral"
      - "--config-format"
      - "mistral"
      - "--load-format"
      - "mistral"
      - "--tool-call-parser"
      - "mistral"
      - "--enable-auto-tool-choice"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--max-model-len"
      - "32768"
      - "--gpu-memory-utilization"
      - "0.72"
      - "--max-num-seqs"
      - "2"
      - "--max-num-batched-tokens"
      - "8192"
    healthcheck:
      # Probes /health with the image's python3.
      test:
        - CMD
        - python3
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"
      interval: 30s
      timeout: 5s
      retries: 5
      # Longer grace period than qwen-audio before failed probes count.
      start_period: 1200s
|
||||||
@@ -63,11 +63,11 @@ func Load() Config {
|
|||||||
WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
|
WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
|
||||||
WhisperXLeadSilence: envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond),
|
WhisperXLeadSilence: envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond),
|
||||||
QwenAudioBaseURL: envString("QWEN_AUDIO_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
|
QwenAudioBaseURL: envString("QWEN_AUDIO_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
|
||||||
QwenAudioAPIKey: envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", "")),
|
QwenAudioAPIKey: envString("QWEN_AUDIO_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
|
||||||
QwenAudioModel: envString("QWEN_AUDIO_MODEL", "Qwen/Qwen2-Audio-7B-Instruct"),
|
QwenAudioModel: envString("QWEN_AUDIO_MODEL", "Qwen/Qwen2-Audio-7B-Instruct"),
|
||||||
QwenAudioTimeout: envDuration("QWEN_AUDIO_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
|
QwenAudioTimeout: envDuration("QWEN_AUDIO_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
|
||||||
VoxtralBaseURL: envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
|
VoxtralBaseURL: envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
|
||||||
VoxtralAPIKey: envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", "")),
|
VoxtralAPIKey: envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
|
||||||
VoxtralModel: envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
|
VoxtralModel: envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
|
||||||
VoxtralTimeout: envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
|
VoxtralTimeout: envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
|
||||||
AudioLLMMaxTokens: envInt("AUDIO_LLM_MAX_TOKENS", 4096),
|
AudioLLMMaxTokens: envInt("AUDIO_LLM_MAX_TOKENS", 4096),
|
||||||
|
|||||||
Reference in New Issue
Block a user