Switch transcription to Whisper large v3
This commit is contained in:
@@ -19,16 +19,19 @@ type Client struct {
|
||||
http *http.Client
|
||||
}
|
||||
|
||||
const ProviderVoxtral = "voxtral-small"
|
||||
const (
|
||||
ProviderWhisperLargeV3 = "whisper-large-v3"
|
||||
defaultWhisperModel = "openai/whisper-large-v3"
|
||||
)
|
||||
|
||||
var speakerLabelPattern = regexp.MustCompile(`(?i)(?:^|[\n\r ]+)((?:speaker|спикер|говорящий)\s*\d+)\s*[::-]`)
|
||||
|
||||
type Options struct {
|
||||
VoxtralBaseURL string
|
||||
VoxtralAPIKey string
|
||||
VoxtralModel string
|
||||
VoxtralTimeout time.Duration
|
||||
AudioLLMPrompt string
|
||||
AudioBaseURL string
|
||||
AudioAPIKey string
|
||||
AudioModel string
|
||||
AudioTimeout time.Duration
|
||||
AudioPrompt string
|
||||
}
|
||||
|
||||
type ProviderConfig struct {
|
||||
@@ -102,17 +105,17 @@ type audioTranscriptionSegment struct {
|
||||
|
||||
func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
|
||||
return NewWithOptions(Options{
|
||||
VoxtralBaseURL: baseURL,
|
||||
VoxtralTimeout: timeout,
|
||||
AudioBaseURL: baseURL,
|
||||
AudioTimeout: timeout,
|
||||
})
|
||||
}
|
||||
|
||||
func NewWithOptions(opts Options) *Client {
|
||||
audioLLMPrompt := strings.TrimSpace(opts.AudioLLMPrompt)
|
||||
if audioLLMPrompt == "" {
|
||||
audioLLMPrompt = "Transcribe the audio exactly. Return only the transcript text."
|
||||
audioPrompt := strings.TrimSpace(opts.AudioPrompt)
|
||||
if audioPrompt == "" {
|
||||
audioPrompt = "Transcribe the audio exactly. Return only the transcript text."
|
||||
}
|
||||
provider := buildVoxtralProvider(opts, audioLLMPrompt)
|
||||
provider := buildAudioProvider(opts, audioPrompt)
|
||||
if provider.BaseURL == "" {
|
||||
return nil
|
||||
}
|
||||
@@ -122,18 +125,18 @@ func NewWithOptions(opts Options) *Client {
|
||||
}
|
||||
}
|
||||
|
||||
func buildVoxtralProvider(opts Options, prompt string) ProviderConfig {
|
||||
baseURL := strings.TrimRight(strings.TrimSpace(opts.VoxtralBaseURL), "/")
|
||||
func buildAudioProvider(opts Options, prompt string) ProviderConfig {
|
||||
baseURL := strings.TrimRight(strings.TrimSpace(opts.AudioBaseURL), "/")
|
||||
if baseURL == "" {
|
||||
return ProviderConfig{}
|
||||
}
|
||||
model := firstNonEmpty(opts.VoxtralModel, "mistralai/Voxtral-Small-24B-2507")
|
||||
model := firstNonEmpty(opts.AudioModel, defaultWhisperModel)
|
||||
return ProviderConfig{
|
||||
Name: ProviderVoxtral,
|
||||
Name: ProviderWhisperLargeV3,
|
||||
BaseURL: baseURL,
|
||||
APIKey: strings.TrimSpace(opts.VoxtralAPIKey),
|
||||
APIKey: strings.TrimSpace(opts.AudioAPIKey),
|
||||
Model: model,
|
||||
Timeout: defaultDuration(opts.VoxtralTimeout, 10*time.Minute),
|
||||
Timeout: defaultDuration(opts.AudioTimeout, 10*time.Minute),
|
||||
Prompt: prompt,
|
||||
}
|
||||
}
|
||||
@@ -147,7 +150,7 @@ func defaultDuration(v, fallback time.Duration) time.Duration {
|
||||
|
||||
func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
|
||||
if c == nil || c.provider.BaseURL == "" {
|
||||
return nil, fmt.Errorf("voxtral transcription provider not configured")
|
||||
return nil, fmt.Errorf("audio transcription provider not configured")
|
||||
}
|
||||
if strings.TrimSpace(in.AudioURL) == "" {
|
||||
return nil, fmt.Errorf("audio_url is required")
|
||||
|
||||
@@ -7,22 +7,22 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNewWithOptionsBuildsVoxtralProvider(t *testing.T) {
|
||||
func TestNewWithOptionsBuildsWhisperProvider(t *testing.T) {
|
||||
client := NewWithOptions(Options{
|
||||
VoxtralBaseURL: "http://voxtral",
|
||||
AudioBaseURL: "http://whisper",
|
||||
})
|
||||
if client == nil {
|
||||
t.Fatal("client is nil")
|
||||
}
|
||||
if client.provider.Name != ProviderVoxtral {
|
||||
t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderVoxtral)
|
||||
if client.provider.Name != ProviderWhisperLargeV3 {
|
||||
t.Fatalf("provider = %q, want %q", client.provider.Name, ProviderWhisperLargeV3)
|
||||
}
|
||||
if client.provider.Model != "mistralai/Voxtral-Small-24B-2507" {
|
||||
if client.provider.Model != "openai/whisper-large-v3" {
|
||||
t.Fatalf("model = %q", client.provider.Model)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte("fake audio"))
|
||||
}))
|
||||
@@ -50,8 +50,8 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
defer providerSrv.Close()
|
||||
|
||||
client := NewWithOptions(Options{
|
||||
VoxtralBaseURL: providerSrv.URL,
|
||||
VoxtralModel: "mistralai/Voxtral-Small-24B-2507",
|
||||
AudioBaseURL: providerSrv.URL,
|
||||
AudioModel: "openai/whisper-large-v3",
|
||||
})
|
||||
if client == nil {
|
||||
t.Fatal("client is nil")
|
||||
@@ -63,7 +63,7 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
if gotPath != "/v1/audio/transcriptions" {
|
||||
t.Fatalf("path = %q, want /v1/audio/transcriptions", gotPath)
|
||||
}
|
||||
if gotModel != "mistralai/Voxtral-Small-24B-2507" {
|
||||
if gotModel != "openai/whisper-large-v3" {
|
||||
t.Fatalf("model = %q", gotModel)
|
||||
}
|
||||
if gotResponseFormat != "json" {
|
||||
|
||||
Reference in New Issue
Block a user