# ai-service/deploy/ai-server/docker-compose.audio.yml

services:
  # Speech-to-text endpoint: serves openai/whisper-large-v3 from a locally
  # built vLLM image. Only started when the `whisper-large-v3` profile is
  # enabled (e.g. `docker compose --profile whisper-large-v3 up -d`).
  whisper-large-v3:
    build:
      context: .
      dockerfile: vllm-audio.Dockerfile
    image: vllm-audio:local
    container_name: whisper-large-v3
    profiles:
      - whisper-large-v3
    restart: unless-stopped
    # vLLM's Docker instructions call for host IPC (or a large --shm-size)
    # because PyTorch shares data between processes through shared memory.
    ipc: host
    # GPU access is requested twice: through the `nvidia` runtime and through
    # the device reservation below.
    # NOTE(review): one of the two is probably sufficient - confirm against
    # the host's Docker/NVIDIA toolkit setup before removing either.
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # The Hugging Face token is optional here: it expands to an empty string
      # when HF_TOKEN is unset. HF_TOKEN is the current variable name;
      # HUGGING_FACE_HUB_TOKEN is the deprecated one, kept for older
      # huggingface_hub releases.
      HF_TOKEN: "${HF_TOKEN:-}"
      HUGGING_FACE_HUB_TOKEN: "${HF_TOKEN:-}"
      # Required: abort at `docker compose` time when the key is unset or
      # empty, instead of silently starting an unauthenticated server.
      # This applies to every compose command that loads this file.
      VLLM_API_KEY: "${VLLM_API_KEY:?VLLM_API_KEY must be set and non-empty}"
      # Hugging Face cache root; persisted on the host via the volume below.
      HF_HOME: /cache
    volumes:
      - ./data/vllm-cache:/cache
    networks:
      - audio-models
    ports:
      # Published only on host address 10.2.3.5: host port 8004 -> container
      # port 8000.
      - "10.2.3.5:8004:8000"
    # Arguments for the image's entrypoint. No `--api-key` flag is passed:
    # the vLLM API server reads VLLM_API_KEY from the environment set above,
    # which keeps the secret out of the process argument list.
    command:
      - "--model"
      - "openai/whisper-large-v3"
      - "--served-model-name"
      - "openai/whisper-large-v3"
      - "--task"
      - "transcription"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
      # Fraction of GPU memory this vLLM instance may use.
      - "--gpu-memory-utilization"
      - "0.55"
    healthcheck:
      # urlopen raises (non-zero exit) on connection failure or an HTTP error
      # status, which marks the probe as failed.
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 5s
      retries: 5
      # Probe failures during the first 20 minutes do not count toward
      # `retries` - presumably to cover model download and load time.
      start_period: 1200s

networks:
  # Dedicated user-defined bridge network that the audio model service
  # attaches to.
  audio-models:
    driver: bridge