From e074f6b226cf383888b201f6547c1f8d98268905 Mon Sep 17 00:00:00 2001
From: Grendgi <barsagaev39@gmail.com>
Date: Tue, 9 Jun 2026 17:16:24 +0300
Subject: [PATCH] Run Voxtral transcription worker with two jobs

---
 README.md                  | 5 +++++
 k8s/worker-deployment.yaml | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fa4bf9c..dbe5153 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,11 @@ AI-server compose snippet for Voxtral lives in
 - Start Voxtral:
   `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small`
 
+In Kubernetes the dedicated transcription worker claims up to two
+`voxtral-small` jobs at a time (`WORKER_CLAIM_LIMIT=2`). This keeps
+download/upload/wait overhead from serializing the queue, while Voxtral/vLLM
+still controls the actual GPU scheduling.
+
 ## API
 
 - `POST /api/v1/jobs` creates one job.
diff --git a/k8s/worker-deployment.yaml b/k8s/worker-deployment.yaml
index 8dc8191..c512578 100644
--- a/k8s/worker-deployment.yaml
+++ b/k8s/worker-deployment.yaml
@@ -100,7 +100,7 @@ spec:
             - name: WORKER_MODEL_PROFILES
               value: "voxtral-small"
             - name: WORKER_CLAIM_LIMIT
-              value: "1"
+              value: "2"
           envFrom:
             - configMapRef:
                 name: ai-service-config