# ai-service/deploy/ai-server/docker-compose.audio.yml

services:
  # vLLM server for mistralai/Voxtral-Small-24B-2507.
  # Built locally from vllm-audio.Dockerfile and only started when the
  # `voxtral-small` profile is enabled.
  voxtral-small:
    build:
      context: .
      dockerfile: vllm-audio.Dockerfile
    image: vllm-audio:local
    container_name: voxtral-small
    profiles:
      - voxtral-small
    restart: unless-stopped
    # Share the host IPC namespace (vLLM's Docker docs recommend this, or a
    # larger --shm-size, for PyTorch shared memory).
    ipc: host
    # NOTE(review): `runtime: nvidia` and the device reservation below both
    # request GPU access; presumably kept together for compatibility with
    # different Docker/Compose setups -- confirm before removing either.
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      HUGGING_FACE_HUB_TOKEN: "${HF_TOKEN}"
      # `:?` makes Compose abort when the variable is unset or empty, so the
      # server can never come up with a blank (i.e. ineffective) API key.
      # vLLM reads the key from this environment variable, so it is not
      # repeated as `--api-key` in the command line, where it would show up in
      # process listings.
      VLLM_API_KEY: "${VLLM_API_KEY:?VLLM_API_KEY must be set to a non-empty value}"
      # Hugging Face cache root; persisted through the bind mount below.
      HF_HOME: /cache
    volumes:
      - ./data/vllm-cache:/cache
    networks:
      - audio-models
    ports:
      # Publish container port 8000 on host address 10.2.3.5, port 8004 only.
      - "10.2.3.5:8004:8000"
    # Arguments passed to the image entrypoint (presumably the vLLM
    # OpenAI-compatible server defined in vllm-audio.Dockerfile -- confirm).
    command:
      - "--model"
      - "mistralai/Voxtral-Small-24B-2507"
      - "--served-model-name"
      - "mistralai/Voxtral-Small-24B-2507"
      - "--tokenizer-mode"
      - "mistral"
      - "--config-format"
      - "mistral"
      - "--load-format"
      - "mistral"
      - "--tool-call-parser"
      - "mistral"
      - "--enable-auto-tool-choice"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      - "--max-model-len"
      - "16384"
      - "--gpu-memory-utilization"
      - "0.55"
      - "--max-num-seqs"
      - "1"
      - "--max-num-batched-tokens"
      - "4096"
    healthcheck:
      # urlopen raises on connection failure or an HTTP error status, which
      # makes python3 exit non-zero and marks the probe as failed.
      test:
        - "CMD"
        - "python3"
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"
      interval: 30s
      timeout: 5s
      retries: 5
      # Failures during the first 20 minutes do not count towards `retries`.
      start_period: 1200s

# User-defined bridge network that the voxtral-small service attaches to.
networks:
  audio-models:
    driver: bridge