Switch transcription to Whisper large v3

2026-06-10 10:10:13 +03:00
parent 1b63dcdbf5
commit 8d6cd84403
12 changed files with 85 additions and 93 deletions
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ The service is intentionally domain-agnostic:
  `beeline/{call_id}` or `channel/{message_id}`.
 - `task_type` describes the technical task class, for example
  `transcribe`, `call_analysis`, `tg_analysis`, `pf_competitor_analysis`.
- `model_profile` selects a runtime profile, for example `voxtral-small`,
+- `model_profile` selects a runtime profile, for example `whisper-large-v3`,
  `qwen2.5-14b`, `vision`, or a future provider profile.
 - `input` and `result` are JSON payloads owned by the caller and worker.

@@ -46,23 +46,22 @@ or compact `system` / `user` fields. The completed job result contains
 domain metadata fields in `input`, but the worker only reads chat fields such as
 `system`, `user`, `messages`, `max_tokens` and `response_format`.

-`transcription` jobs are processed only by Voxtral Small
-(`mistralai/Voxtral-Small-24B-2507`) through an OpenAI-compatible
+`transcription` jobs are processed only by Whisper Large v3
+(`openai/whisper-large-v3`) through an OpenAI-compatible
 `/v1/audio/transcriptions` endpoint. The returned `segments` field stays
 compatible with telephony. If the provider returns one long segment, AI Service
-splits it into smaller transcript segments and adds heuristic speaker labels
-when diarization is requested.
+splits it into smaller transcript segments without inventing speaker labels.

-AI-server compose snippet for Voxtral lives in
+AI-server compose snippet for Whisper Large v3 lives in
 `deploy/ai-server/docker-compose.audio.yml`:

- Voxtral endpoint: `http://10.2.3.5:8004`
- Start Voxtral:
-  `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile voxtral-small up -d voxtral-small`
+- Whisper endpoint: `http://10.2.3.5:8004`
+- Start Whisper:
+  `docker compose -f docker-compose.yml -f docker-compose.audio.yml --profile whisper-large-v3 up -d whisper-large-v3`

 In Kubernetes the dedicated transcription worker may claim more than one
-`voxtral-small` job at a time. This keeps download/upload/wait overhead from
-serializing the queue while Voxtral/vLLM still controls the actual GPU
+`whisper-large-v3` job at a time. This keeps download/upload/wait overhead from
+serializing the queue while Whisper/vLLM still controls the actual GPU
 scheduling.

 ## API
@@ -102,11 +101,11 @@ for Kubernetes probes.
 - `LLM_API_KEY`, primary LLM API key
 - `LLM_MODEL`, default `qwen2.5-14b`
 - `LLM_TIMEOUT`, default `5m`
- `VOXTRAL_BASE_URL`, OpenAI-compatible endpoint for Voxtral
- `VOXTRAL_MODEL`, default `mistralai/Voxtral-Small-24B-2507`
- `VOXTRAL_API_KEY`, optional bearer token for Voxtral; falls back to
+- `AUDIO_TRANSCRIPTION_BASE_URL`, OpenAI-compatible transcription endpoint; falls back to `AUDIO_LLM_BASE_URL`
+- `AUDIO_TRANSCRIPTION_MODEL`, default `openai/whisper-large-v3`
+- `AUDIO_TRANSCRIPTION_API_KEY`, optional bearer token; falls back to
  `AUDIO_LLM_API_KEY`, then `LLM_API_KEY`
- `AUDIO_LLM_PROMPT`, transcription instruction for Voxtral
+- `AUDIO_TRANSCRIPTION_PROMPT`, transcription instruction; falls back to `AUDIO_LLM_PROMPT`, then a built-in default
 - `WORKER_ID`, default hostname
 - `WORKER_HTTP_HOST`, default `0.0.0.0`
 - `WORKER_HTTP_PORT`, default `8081`
--- a/cmd/worker/main.go
+++ b/cmd/worker/main.go
@@ -49,11 +49,11 @@ func main() {

 	llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
 	transcriber := transcription.NewWithOptions(transcription.Options{
-		VoxtralBaseURL: cfg.VoxtralBaseURL,
-		VoxtralAPIKey:  cfg.VoxtralAPIKey,
-		VoxtralModel:   cfg.VoxtralModel,
-		VoxtralTimeout: cfg.VoxtralTimeout,
-		AudioLLMPrompt: cfg.AudioLLMPrompt,
+		AudioBaseURL: cfg.AudioBaseURL,
+		AudioAPIKey:  cfg.AudioAPIKey,
+		AudioModel:   cfg.AudioModel,
+		AudioTimeout: cfg.AudioTimeout,
+		AudioPrompt:  cfg.AudioPrompt,
 	})
 	w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
 	healthSrv := startHealthServer(ctx, db, cfg)
@@ -62,8 +62,8 @@ func main() {
 		"worker_id", cfg.WorkerID,
 		"model", cfg.LLMModel,
 		"transcription_enabled", transcriber != nil,
-		"transcription_provider", "voxtral-small",
-		"transcription_model", cfg.VoxtralModel,
+		"transcription_provider", transcription.ProviderWhisperLargeV3,
+		"transcription_model", cfg.AudioModel,
 		"task_types", cfg.WorkerTaskTypes,
 		"model_profiles", cfg.WorkerModelProfiles,
 		"poll_interval", cfg.WorkerPollInterval.String(),
@@ -134,8 +134,8 @@ func (h workerHealth) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 			"worker_id":              h.cfg.WorkerID,
 			"task_types":             h.cfg.WorkerTaskTypes,
 			"model_profiles":         h.cfg.WorkerModelProfiles,
-			"transcription_provider": "voxtral-small",
-			"transcription_model":    h.cfg.VoxtralModel,
+			"transcription_provider": transcription.ProviderWhisperLargeV3,
+			"transcription_model":    h.cfg.AudioModel,
 			"claim_limit":            h.cfg.WorkerClaimLimit,
 			"poll_interval":          h.cfg.WorkerPollInterval.String(),
 			"lease_timeout":          h.cfg.WorkerLeaseTimeout.String(),
--- a/deploy/ai-server/docker-compose.audio.yml
+++ b/deploy/ai-server/docker-compose.audio.yml
@@ -1,12 +1,12 @@
 services:
-  voxtral-small:
+  whisper-large-v3:
    build:
      context: .
      dockerfile: vllm-audio.Dockerfile
    image: vllm-audio:local
-    container_name: voxtral-small
+    container_name: whisper-large-v3
    profiles:
-      - voxtral-small
+      - whisper-large-v3
    restart: unless-stopped
    ipc: host
    runtime: nvidia
@@ -29,32 +29,19 @@ services:
      - "10.2.3.5:8004:8000"
    command:
      - "--model"
-      - "mistralai/Voxtral-Small-24B-2507"
+      - "openai/whisper-large-v3"
      - "--served-model-name"
-      - "mistralai/Voxtral-Small-24B-2507"
-      - "--tokenizer-mode"
-      - "mistral"
-      - "--config-format"
-      - "mistral"
-      - "--load-format"
-      - "mistral"
-      - "--tool-call-parser"
-      - "mistral"
-      - "--enable-auto-tool-choice"
+      - "openai/whisper-large-v3"
+      - "--task"
+      - "transcription"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8000"
-      - "--max-model-len"
-      - "16384"
      - "--gpu-memory-utilization"
      - "0.55"
      - "--api-key"
      - "${VLLM_API_KEY}"
-      - "--max-num-seqs"
-      - "1"
-      - "--max-num-batched-tokens"
-      - "4096"
    healthcheck:
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -18,11 +18,11 @@ type Config struct {
 	LLMAPIKey         string
 	LLMModel          string
 	LLMTimeout        time.Duration
-	VoxtralBaseURL    string
-	VoxtralAPIKey     string
-	VoxtralModel      string
-	VoxtralTimeout    time.Duration
-	AudioLLMPrompt    string
+	AudioBaseURL      string
+	AudioAPIKey       string
+	AudioModel        string
+	AudioTimeout      time.Duration
+	AudioPrompt       string
 	AIStatsSidecarURL string
 	AIStatsTimeout    time.Duration

@@ -48,11 +48,11 @@ func Load() Config {
 		LLMAPIKey:         envString("LLM_API_KEY", ""),
 		LLMModel:          envString("LLM_MODEL", "qwen2.5-14b"),
 		LLMTimeout:        envDuration("LLM_TIMEOUT", 5*time.Minute),
-		VoxtralBaseURL:    envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
-		VoxtralAPIKey:     envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
-		VoxtralModel:      envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
-		VoxtralTimeout:    envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
-		AudioLLMPrompt:    envString("AUDIO_LLM_PROMPT", defaultAudioLLMPrompt()),
+		AudioBaseURL:      envString("AUDIO_TRANSCRIPTION_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
+		AudioAPIKey:       envString("AUDIO_TRANSCRIPTION_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
+		AudioModel:        envString("AUDIO_TRANSCRIPTION_MODEL", "openai/whisper-large-v3"),
+		AudioTimeout:      envDuration("AUDIO_TRANSCRIPTION_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
+		AudioPrompt:       envString("AUDIO_TRANSCRIPTION_PROMPT", envString("AUDIO_LLM_PROMPT", defaultAudioPrompt())),
 		AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""),
 		AIStatsTimeout:    envDuration("AI_STATS_TIMEOUT", 8*time.Second),

@@ -132,7 +132,7 @@ func envCSVDefault(key string, fallback []string) []string {
 	return fallback
 }

-func defaultAudioLLMPrompt() string {
+func defaultAudioPrompt() string {
 	return "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки."
 }

--- a/internal/httpapi/dashboard.go
+++ b/internal/httpapi/dashboard.go
@@ -5,6 +5,7 @@ import (
 	"time"

 	"ai-service/internal/model"
+	"ai-service/internal/transcription"
 )

 type dashboardResponse struct {
@@ -51,7 +52,7 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
 			At: now,
 			Providers: []providerStatus{
 				s.checkLLM(ctx),
-				s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
+				s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
 			},
 		},
 		Infra: loadInfraSnapshot(r, s.cfg),
--- a/internal/httpapi/providers.go
+++ b/internal/httpapi/providers.go
@@ -8,6 +8,8 @@ import (
 	"net/http"
 	"strings"
 	"time"
+
+	"ai-service/internal/transcription"
 )

 type providerStatus struct {
@@ -42,7 +44,7 @@ func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
 		At: time.Now().UTC(),
 		Providers: []providerStatus{
 			s.checkLLM(ctx),
-			s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
+			s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
 		},
 	}
 	writeJSON(w, http.StatusOK, resp)
--- a/internal/transcription/client.go
+++ b/internal/transcription/client.go
@@ -19,16 +19,19 @@ type Client struct {
 	http     *http.Client
 }

-const ProviderVoxtral = "voxtral-small"
+const (
+	ProviderWhisperLargeV3 = "whisper-large-v3"
+	defaultWhisperModel    = "openai/whisper-large-v3"
+)

 var speakerLabelPattern = regexp.MustCompile(`(?i)(?:^|[\n\r ]+)((?:speaker|спикер|говорящий)\s*\d+)\s*[:：-]`)

 type Options struct {
-	VoxtralBaseURL string
-	VoxtralAPIKey  string
-	VoxtralModel   string
-	VoxtralTimeout time.Duration
-	AudioLLMPrompt string
+	AudioBaseURL string
+	AudioAPIKey  string
+	AudioModel   string
+	AudioTimeout time.Duration
+	AudioPrompt  string
 }

 type ProviderConfig struct {
@@ -102,17 +105,17 @@ type audioTranscriptionSegment struct {

 func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
 	return NewWithOptions(Options{
-		VoxtralBaseURL: baseURL,
-		VoxtralTimeout: timeout,
+		AudioBaseURL: baseURL,
+		AudioTimeout: timeout,
 	})
 }

 func NewWithOptions(opts Options) *Client {
-	audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt)
-	if audioLLMPrompt == "" {
-		audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text."
+	audioPrompt := strings.TrimSpace(opts.AudioPrompt)
+	if audioPrompt == "" {
+		audioPrompt = "Transcribe the audio exactly. Return only the transcript text."
 	}
-	provider := buildVoxtralProvider(opts, audioLLMPrompt)
+	provider := buildAudioProvider(opts, audioPrompt)
 	if provider.BaseURL == "" {
 		return nil
 	}
@@ -122,18 +125,18 @@ func NewWithOptions(opts Options) *Client {
 	}
 }

-func buildVoxtralProvider(opts Options, prompt string) ProviderConfig {
-	baseURL := strings.TrimRight(strings.TrimSpace(opts.VoxtralBaseURL), "/")
+func buildAudioProvider(opts Options, prompt string) ProviderConfig {
+	baseURL := strings.TrimRight(strings.TrimSpace(opts.AudioBaseURL), "/")
 	if baseURL == "" {
 		return ProviderConfig{}
 	}
-	model := firstNonEmpty(opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507")
+	model := firstNonEmpty(opts.AudioModel, defaultWhisperModel)
 	return ProviderConfig{
-		Name:    ProviderVoxtral,
+		Name:    ProviderWhisperLargeV3,
 		BaseURL: baseURL,
-		APIKey:  strings.TrimSpace(opts.VoxtralAPIKey),
+		APIKey:  strings.TrimSpace(opts.AudioAPIKey),
 		Model:   model,
-		Timeout: defaultDuration(opts.VoxtralTimeout, 10*time.Minute),
+		Timeout: defaultDuration(opts.AudioTimeout, 10*time.Minute),
 		Prompt:  prompt,
 	}
 }
@@ -147,7 +150,7 @@ func defaultDuration(v, fallback time.Duration) time.Duration {

 func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
 	if c == nil || c.provider.BaseURL == "" {
-		return nil, fmt.Errorf("voxtral transcription provider not configured")
+		return nil, fmt.Errorf("audio transcription provider not configured")
 	}
 	if strings.TrimSpace(in.AudioURL) == "" {
 		return nil, fmt.Errorf("audio_url is required")
--- a/internal/transcription/client_test.go
+++ b/internal/transcription/client_test.go
@@ -7,22 +7,22 @@ import (
 	"testing"
 )

-func TestNewWithOptionsBuildsVoxtralProvider(t *testing.T) {
+func TestNewWithOptionsBuildsWhisperProvider(t *testing.T) {
 	client := NewWithOptions(Options{
-		VoxtralBaseURL: "http://voxtral",
+		AudioBaseURL: "http://whisper",
 	})
 	if client == nil {
 		t.Fatal("client is nil")
 	}
-	if client.provider.Name != ProviderVoxtral {
-		t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderVoxtral)
+	if client.provider.Name != ProviderWhisperLargeV3 {
+		t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderWhisperLargeV3)
 	}
-	if client.provider.Model != "mistralai/Voxtral-Small-24B-2507" {
+	if client.provider.Model != "openai/whisper-large-v3" {
 		t.Fatalf("model = %q", client.provider.Model)
 	}
 }

-func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
+func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
 	audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		_, _ = w.Write([]byte("fake audio"))
 	}))
@@ -50,8 +50,8 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
 	defer providerSrv.Close()

 	client := NewWithOptions(Options{
-		VoxtralBaseURL: providerSrv.URL,
-		VoxtralModel:   "mistralai/Voxtral-Small-24B-2507",
+		AudioBaseURL: providerSrv.URL,
+		AudioModel:   "openai/whisper-large-v3",
 	})
 	if client == nil {
 		t.Fatal("client is nil")
@@ -63,7 +63,7 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
 	if gotPath != "/v1/audio/transcriptions" {
 		t.Fatalf("path = %q, want /v1/audio/transcriptions", gotPath)
 	}
-	if gotModel != "mistralai/Voxtral-Small-24B-2507" {
+	if gotModel != "openai/whisper-large-v3" {
 		t.Fatalf("model = %q", gotModel)
 	}
 	if gotResponseFormat != "json" {
--- a/internal/worker/worker.go
+++ b/internal/worker/worker.go
@@ -20,7 +20,7 @@ const (
 	TaskCallAnalysis   = "call_analysis"
 	TaskTranscription  = "transcription"

-	TranscriptionProfile = "voxtral-small"
+	TranscriptionProfile = "whisper-large-v3"
 )

 type Worker struct {
--- a/k8s/configmap.yaml
+++ b/k8s/configmap.yaml
@@ -11,11 +11,11 @@ data:
  LLM_BASE_URL: "http://10.2.3.5:8002"
  LLM_MODEL: "qwen2.5-14b"
  LLM_TIMEOUT: "5m"
-  # Voxtral Small is the only transcription provider. It is exposed on the AI
-  # server through an OpenAI-compatible /v1/audio/transcriptions endpoint.
-  VOXTRAL_BASE_URL: "http://10.2.3.5:8004"
-  VOXTRAL_MODEL: "mistralai/Voxtral-Small-24B-2507"
-  VOXTRAL_TIMEOUT: "30m"
+  # Whisper Large v3 is exposed on the AI server through an OpenAI-compatible
+  # /v1/audio/transcriptions endpoint.
+  AUDIO_TRANSCRIPTION_BASE_URL: "http://10.2.3.5:8004"
+  AUDIO_TRANSCRIPTION_MODEL: "openai/whisper-large-v3"
+  AUDIO_TRANSCRIPTION_TIMEOUT: "30m"
  AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090"
  AI_STATS_TIMEOUT: "8s"
  WORKER_POLL_INTERVAL: "2s"
--- a/k8s/secrets.yaml
+++ b/k8s/secrets.yaml
@@ -18,5 +18,5 @@ type: Opaque
 stringData:
  DATABASE_URL: "postgres://ai_service:ai_service@postgres:5432/ai_service?sslmode=disable"
  LLM_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
-  VOXTRAL_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
+  AUDIO_TRANSCRIPTION_API_KEY: "sk-111f838ccec43406e078cd9094b6797307cb895236179f32"
  AI_SERVICE_TOKEN: "d18bcacf9e02bae1806ee6b6eeda62b95be6a915c0a22936d9a700128b275442"
--- a/k8s/worker-deployment.yaml
+++ b/k8s/worker-deployment.yaml
@@ -98,7 +98,7 @@ spec:
            - name: WORKER_TASK_TYPES
              value: "transcription"
            - name: WORKER_MODEL_PROFILES
-              value: "voxtral-small"
+              value: "whisper-large-v3"
            - name: WORKER_CLAIM_LIMIT
              value: "2"
            - name: WORKER_LEASE_TIMEOUT