Switch transcription to Whisper large v3

2026-06-10 10:10:13 +03:00
parent 1b63dcdbf5
commit 8d6cd84403
12 changed files with 85 additions and 93 deletions
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -18,11 +18,11 @@ type Config struct {
 	LLMAPIKey         string
 	LLMModel          string
 	LLMTimeout        time.Duration
-	VoxtralBaseURL    string
-	VoxtralAPIKey     string
-	VoxtralModel      string
-	VoxtralTimeout    time.Duration
-	AudioLLMPrompt    string
+	AudioBaseURL      string
+	AudioAPIKey       string
+	AudioModel        string
+	AudioTimeout      time.Duration
+	AudioPrompt       string
 	AIStatsSidecarURL string
 	AIStatsTimeout    time.Duration

@@ -48,11 +48,11 @@ func Load() Config {
 		LLMAPIKey:         envString("LLM_API_KEY", ""),
 		LLMModel:          envString("LLM_MODEL", "qwen2.5-14b"),
 		LLMTimeout:        envDuration("LLM_TIMEOUT", 5*time.Minute),
-		VoxtralBaseURL:    envString("VOXTRAL_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
-		VoxtralAPIKey:     envString("VOXTRAL_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
-		VoxtralModel:      envString("VOXTRAL_MODEL", "mistralai/Voxtral-Small-24B-2507"),
-		VoxtralTimeout:    envDuration("VOXTRAL_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
-		AudioLLMPrompt:    envString("AUDIO_LLM_PROMPT", defaultAudioLLMPrompt()),
+		AudioBaseURL:      envString("AUDIO_TRANSCRIPTION_BASE_URL", envString("AUDIO_LLM_BASE_URL", "")),
+		AudioAPIKey:       envString("AUDIO_TRANSCRIPTION_API_KEY", envString("AUDIO_LLM_API_KEY", envString("LLM_API_KEY", ""))),
+		AudioModel:        envString("AUDIO_TRANSCRIPTION_MODEL", "openai/whisper-large-v3"),
+		AudioTimeout:      envDuration("AUDIO_TRANSCRIPTION_TIMEOUT", envDuration("AUDIO_LLM_TIMEOUT", 10*time.Minute)),
+		AudioPrompt:       envString("AUDIO_TRANSCRIPTION_PROMPT", envString("AUDIO_LLM_PROMPT", defaultAudioPrompt())),
 		AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""),
 		AIStatsTimeout:    envDuration("AI_STATS_TIMEOUT", 8*time.Second),

@@ -132,7 +132,7 @@ func envCSVDefault(key string, fallback []string) []string {
 	return fallback
 }

-func defaultAudioLLMPrompt() string {
+func defaultAudioPrompt() string {
 	return "Расшифруй речь из аудио максимально точно. Сохрани русский язык, имена, телефоны, суммы и смысловые паузы. Не добавляй комментарии, анализ, Markdown или JSON. Верни только чистый текст расшифровки."
 }

--- a/internal/httpapi/dashboard.go
+++ b/internal/httpapi/dashboard.go
@@ -5,6 +5,7 @@ import (
 	"time"

 	"ai-service/internal/model"
+	"ai-service/internal/transcription"
 )

 type dashboardResponse struct {
@@ -51,7 +52,7 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
 			At: now,
 			Providers: []providerStatus{
 				s.checkLLM(ctx),
-				s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
+				s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
 			},
 		},
 		Infra: loadInfraSnapshot(r, s.cfg),
--- a/internal/httpapi/providers.go
+++ b/internal/httpapi/providers.go
@@ -8,6 +8,8 @@ import (
 	"net/http"
 	"strings"
 	"time"
+
+	"ai-service/internal/transcription"
 )

 type providerStatus struct {
@@ -42,7 +44,7 @@ func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
 		At: time.Now().UTC(),
 		Providers: []providerStatus{
 			s.checkLLM(ctx),
-			s.checkAudioLLM(ctx, "voxtral-small", s.cfg.VoxtralBaseURL, s.cfg.VoxtralAPIKey, s.cfg.VoxtralModel, s.cfg.VoxtralTimeout),
+			s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
 		},
 	}
 	writeJSON(w, http.StatusOK, resp)
--- a/internal/transcription/client.go
+++ b/internal/transcription/client.go
@@ -19,16 +19,19 @@ type Client struct {
 	http     *http.Client
 }

-const ProviderVoxtral = "voxtral-small"
+const (
+	ProviderWhisperLargeV3 = "whisper-large-v3"
+	defaultWhisperModel    = "openai/whisper-large-v3"
+)

 var speakerLabelPattern = regexp.MustCompile(`(?i)(?:^|[\n\r ]+)((?:speaker|спикер|говорящий)\s*\d+)\s*[:：-]`)

 type Options struct {
-	VoxtralBaseURL string
-	VoxtralAPIKey  string
-	VoxtralModel   string
-	VoxtralTimeout time.Duration
-	AudioLLMPrompt string
+	AudioBaseURL string
+	AudioAPIKey  string
+	AudioModel   string
+	AudioTimeout time.Duration
+	AudioPrompt  string
 }

 type ProviderConfig struct {
@@ -102,17 +105,17 @@ type audioTranscriptionSegment struct {

 func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
 	return NewWithOptions(Options{
-		VoxtralBaseURL: baseURL,
-		VoxtralTimeout: timeout,
+		AudioBaseURL: baseURL,
+		AudioTimeout: timeout,
 	})
 }

 func NewWithOptions(opts Options) *Client {
-	audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt)
-	if audioLLMPrompt == "" {
-		audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text."
+	audioPrompt := strings.TrimSpace(opts.AudioPrompt)
+	if audioPrompt == "" {
+		audioPrompt = "Transcribe the audio exactly. Return only the transcript text."
 	}
-	provider := buildVoxtralProvider(opts, audioLLMPrompt)
+	provider := buildAudioProvider(opts, audioPrompt)
 	if provider.BaseURL == "" {
 		return nil
 	}
@@ -122,18 +125,18 @@ func NewWithOptions(opts Options) *Client {
 	}
 }

-func buildVoxtralProvider(opts Options, prompt string) ProviderConfig {
-	baseURL := strings.TrimRight(strings.TrimSpace(opts.VoxtralBaseURL), "/")
+func buildAudioProvider(opts Options, prompt string) ProviderConfig {
+	baseURL := strings.TrimRight(strings.TrimSpace(opts.AudioBaseURL), "/")
 	if baseURL == "" {
 		return ProviderConfig{}
 	}
-	model := firstNonEmpty(opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507")
+	model := firstNonEmpty(opts.AudioModel, defaultWhisperModel)
 	return ProviderConfig{
-		Name:    ProviderVoxtral,
+		Name:    ProviderWhisperLargeV3,
 		BaseURL: baseURL,
-		APIKey:  strings.TrimSpace(opts.VoxtralAPIKey),
+		APIKey:  strings.TrimSpace(opts.AudioAPIKey),
 		Model:   model,
-		Timeout: defaultDuration(opts.VoxtralTimeout, 10*time.Minute),
+		Timeout: defaultDuration(opts.AudioTimeout, 10*time.Minute),
 		Prompt:  prompt,
 	}
 }
@@ -147,7 +150,7 @@ func defaultDuration(v, fallback time.Duration) time.Duration {

 func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
 	if c == nil || c.provider.BaseURL == "" {
-		return nil, fmt.Errorf("voxtral transcription provider not configured")
+		return nil, fmt.Errorf("audio transcription provider not configured")
 	}
 	if strings.TrimSpace(in.AudioURL) == "" {
 		return nil, fmt.Errorf("audio_url is required")
--- a/internal/transcription/client_test.go
+++ b/internal/transcription/client_test.go
@@ -7,22 +7,22 @@ import (
 	"testing"
 )

-func TestNewWithOptionsBuildsVoxtralProvider(t *testing.T) {
+func TestNewWithOptionsBuildsWhisperProvider(t *testing.T) {
 	client := NewWithOptions(Options{
-		VoxtralBaseURL: "http://voxtral",
+		AudioBaseURL: "http://whisper",
 	})
 	if client == nil {
 		t.Fatal("client is nil")
 	}
-	if client.provider.Name != ProviderVoxtral {
-		t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderVoxtral)
+	if client.provider.Name != ProviderWhisperLargeV3 {
+		t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderWhisperLargeV3)
 	}
-	if client.provider.Model != "mistralai/Voxtral-Small-24B-2507" {
+	if client.provider.Model != "openai/whisper-large-v3" {
 		t.Fatalf("model = %q", client.provider.Model)
 	}
 }

-func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
+func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
 	audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		_, _ = w.Write([]byte("fake audio"))
 	}))
@@ -50,8 +50,8 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
 	defer providerSrv.Close()

 	client := NewWithOptions(Options{
-		VoxtralBaseURL: providerSrv.URL,
-		VoxtralModel:   "mistralai/Voxtral-Small-24B-2507",
+		AudioBaseURL: providerSrv.URL,
+		AudioModel:   "openai/whisper-large-v3",
 	})
 	if client == nil {
 		t.Fatal("client is nil")
@@ -63,7 +63,7 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
 	if gotPath != "/v1/audio/transcriptions" {
 		t.Fatalf("path = %q, want /v1/audio/transcriptions", gotPath)
 	}
-	if gotModel != "mistralai/Voxtral-Small-24B-2507" {
+	if gotModel != "openai/whisper-large-v3" {
 		t.Fatalf("model = %q", gotModel)
 	}
 	if gotResponseFormat != "json" {
--- a/internal/worker/worker.go
+++ b/internal/worker/worker.go
@@ -20,7 +20,7 @@ const (
 	TaskCallAnalysis   = "call_analysis"
 	TaskTranscription  = "transcription"

-	TranscriptionProfile = "voxtral-small"
+	TranscriptionProfile = "whisper-large-v3"
 )

 type Worker struct {