package transcription

import (
	"encoding/json"
	"math"
	"net/http"
	"net/http/httptest"
	"testing"
	"time"
)

// TestAdjustLeadSilence checks the timestamps produced by adjustLeadSilence
// for an 800ms offset. The expected values correspond to moving every
// timestamp 0.8s earlier, with the first start ending up at zero.
func TestAdjustLeadSilence(t *testing.T) {
	got := adjustLeadSilence([]Segment{
		{Start: 0.2, End: 1.1, Text: "first"},
		{Start: 1.4, End: 2.0, Text: "second"},
	}, 800*time.Millisecond)

	// Guard the index accesses below so a short result is reported as a test
	// failure rather than an index-out-of-range panic.
	if len(got) < 2 {
		t.Fatalf("segments = %#v, want at least 2", got)
	}
	if got[0].Start != 0 {
		t.Fatalf("first start = %v, want 0", got[0].Start)
	}
	if !near(got[0].End, 0.3) {
		t.Fatalf("first end = %v, want 0.3", got[0].End)
	}
	if !near(got[1].Start, 0.6) {
		t.Fatalf("second start = %v, want 0.6", got[1].Start)
	}
}

// TestNormalizeProviderOrder checks that normalizeProviderOrder maps the
// given provider spellings to the canonical Provider* names; "qwen" and
// "qwen2-audio" are expected to yield a single ProviderQwenAudio entry.
func TestNormalizeProviderOrder(t *testing.T) {
	got := normalizeProviderOrder([]string{"whisperx", "qwen", "voxtral", "qwen2-audio"})
	want := []string{ProviderWhisperX, ProviderQwenAudio, ProviderVoxtral}

	if len(got) != len(want) {
		t.Fatalf("providers = %#v, want %#v", got, want)
	}
	for i := range want {
		if got[i] != want[i] {
			t.Fatalf("providers = %#v, want %#v", got, want)
		}
	}
}

// TestNewWithOptionsBuildsComparisonProviders checks that NewWithOptions
// builds one provider per requested name, in the requested order, exposing
// the canonical Provider* names.
func TestNewWithOptionsBuildsComparisonProviders(t *testing.T) {
	client := NewWithOptions(Options{
		Providers:        []string{"whisperx", "qwen2-audio", "voxtral-small"},
		WhisperXURL:      "http://whisperx",
		QwenAudioBaseURL: "http://qwen",
		VoxtralBaseURL:   "http://voxtral",
	})
	if client == nil {
		t.Fatal("client is nil")
	}

	got := make([]string, 0, len(client.providers))
	for _, provider := range client.providers {
		got = append(got, provider.Name)
	}
	want := []string{ProviderWhisperX, ProviderQwenAudio, ProviderVoxtral}

	// Compare lengths first: without this a missing provider would panic on
	// got[i] and an extra provider would go unnoticed.
	if len(got) != len(want) {
		t.Fatalf("providers = %#v, want %#v", got, want)
	}
	for i := range want {
		if got[i] != want[i] {
			t.Fatalf("providers = %#v, want %#v", got, want)
		}
	}
}

// TestAudioDataURLUsesVLLMAudioURLFormat checks that audioDataURL renders the
// bytes "abc" with filename "call.wav" as a base64 audio/wav data URL.
func TestAudioDataURLUsesVLLMAudioURLFormat(t *testing.T) {
	got := audioDataURL([]byte("abc"), "call.wav")
	want := "data:audio/wav;base64,YWJj"
	if got != want {
		t.Fatalf("audio data url = %q, want %q", got, want)
	}
}

// TestVoxtralUsesAudioTranscriptionsEndpoint runs Transcribe against two fake
// servers: one serving the audio bytes and one standing in for the Voxtral
// provider. It checks the request path, the "model" and "response_format"
// form values, the presence of the "file" upload, and the decoded segments.
func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
	audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		_, _ = w.Write([]byte("fake audio"))
	}))
	defer audioSrv.Close()

	var gotPath, gotModel, gotResponseFormat string
	providerSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The handler runs on a server goroutine, where t.Fatalf must not be
		// called. Record failures with t.Errorf and answer with an error
		// status instead.
		gotPath = r.URL.Path
		if err := r.ParseMultipartForm(16 << 20); err != nil {
			t.Errorf("ParseMultipartForm: %v", err)
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		gotModel = r.FormValue("model")
		gotResponseFormat = r.FormValue("response_format")
		if _, _, err := r.FormFile("file"); err != nil {
			t.Errorf("FormFile: %v", err)
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}

		resp := map[string]any{
			"text": "Алло, тест. Да, слышно.",
			"segments": []map[string]any{
				{"start": 0, "end": 1.2, "text": "Алло, тест."},
				{"start": 1.2, "end": 2.4, "text": "Да, слышно."},
			},
		}
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer providerSrv.Close()

	client := NewWithOptions(Options{
		Providers:      []string{"voxtral-small"},
		VoxtralBaseURL: providerSrv.URL,
		VoxtralModel:   "mistralai/Voxtral-Small-24B-2507",
	})
	if client == nil {
		t.Fatal("client is nil")
	}

	got, err := client.Transcribe(t.Context(), Input{AudioURL: audioSrv.URL, Filename: "call.mp3"})
	if err != nil {
		t.Fatalf("Transcribe: %v", err)
	}
	if gotPath != "/v1/audio/transcriptions" {
		t.Fatalf("path = %q, want /v1/audio/transcriptions", gotPath)
	}
	if gotModel != "mistralai/Voxtral-Small-24B-2507" {
		t.Fatalf("model = %q", gotModel)
	}
	if gotResponseFormat != "verbose_json" {
		t.Fatalf("response_format = %q, want verbose_json", gotResponseFormat)
	}
	if len(got.Segments) != 2 || got.Segments[0].Text != "Алло, тест." || got.Segments[1].Start != 1.2 {
		t.Fatalf("segments = %#v", got.Segments)
	}
}

// TestSegmentTranscriptTextAddsHeuristicSpeakers checks that
// segmentTranscriptText, called with its second argument set to true, splits
// a multi-sentence transcript into at least two segments whose first two
// speakers are SPEAKER_00 and SPEAKER_01 and whose start times increase.
func TestSegmentTranscriptTextAddsHeuristicSpeakers(t *testing.T) {
	got := segmentTranscriptText("Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается.", true)
	if len(got) < 2 {
		t.Fatalf("segments = %#v, want multiple", got)
	}
	if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
		t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
	}
	if got[1].Start <= got[0].Start {
		t.Fatalf("segment times did not advance: %#v", got)
	}
}

// near reports whether got and want differ by less than 1e-6, so float
// timestamps can be compared without requiring bit-exact equality.
func near(got, want float64) bool {
	return math.Abs(got-want) < 0.000001
}