From e074f6b226cf383888b201f6547c1f8d98268905 Mon Sep 17 00:00:00 2001 From: Grendgi Date: Tue, 9 Jun 2026 17:16:24 +0300 Subject: [PATCH] Run Voxtral transcription worker with two jobs --- README.md | 5 +++++ k8s/worker-deployment.yaml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fa4bf9c..dbe5153 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,11 @@ AI-server compose snippet for Voxtral lives in - Start Voxtral: `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small` +In Kubernetes the dedicated transcription worker may claim more than one +`voxtral-small` job at a time. This keeps download/upload/wait overhead from +serializing the queue while Voxtral/vLLM still controls the actual GPU +scheduling. + ## API - `POST /api/v1/jobs` creates one job. diff --git a/k8s/worker-deployment.yaml b/k8s/worker-deployment.yaml index 8dc8191..c512578 100644 --- a/k8s/worker-deployment.yaml +++ b/k8s/worker-deployment.yaml @@ -100,7 +100,7 @@ spec: - name: WORKER_MODEL_PROFILES value: "voxtral-small" - name: WORKER_CLAIM_LIMIT - value: "1" + value: "2" envFrom: - configMapRef: name: ai-service-config