diff --git a/README.md b/README.md index fa4bf9c..dbe5153 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,11 @@ AI-server compose snippet for Voxtral lives in - Start Voxtral: `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small` +In Kubernetes, the dedicated transcription worker may claim more than one +`voxtral-small` job at a time (set by `WORKER_CLAIM_LIMIT`). This keeps download/upload/wait overhead from +serializing the queue, while Voxtral/vLLM still controls the actual GPU +scheduling. + ## API - `POST /api/v1/jobs` creates one job. diff --git a/k8s/worker-deployment.yaml b/k8s/worker-deployment.yaml index 8dc8191..c512578 100644 --- a/k8s/worker-deployment.yaml +++ b/k8s/worker-deployment.yaml @@ -100,7 +100,7 @@ spec: - name: WORKER_MODEL_PROFILES value: "voxtral-small" - name: WORKER_CLAIM_LIMIT - value: "1" + value: "2" envFrom: - configMapRef: name: ai-service-config