Use Voxtral JSON transcription without fake speakers
This commit is contained in:
@@ -66,21 +66,21 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
if gotModel != "mistralai/Voxtral-Small-24B-2507" {
|
||||
t.Fatalf("model = %q", gotModel)
|
||||
}
|
||||
if gotResponseFormat != "verbose_json" {
|
||||
t.Fatalf("response_format = %q, want verbose_json", gotResponseFormat)
|
||||
if gotResponseFormat != "json" {
|
||||
t.Fatalf("response_format = %q, want json", gotResponseFormat)
|
||||
}
|
||||
if len(got.Segments) != 2 || got.Segments[0].Text != "Алло, тест." || got.Segments[1].Start != 1.2 {
|
||||
t.Fatalf("segments = %#v", got.Segments)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSegmentTranscriptTextAddsHeuristicSpeakers(t *testing.T) {
|
||||
got := segmentTranscriptText("Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается.", true)
|
||||
func TestSegmentTranscriptTextDoesNotInventSpeakers(t *testing.T) {
|
||||
got := segmentTranscriptText("Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается.")
|
||||
if len(got) < 2 {
|
||||
t.Fatalf("segments = %#v, want multiple", got)
|
||||
}
|
||||
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
|
||||
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
|
||||
if got[0].Speaker != "" || got[1].Speaker != "" {
|
||||
t.Fatalf("speakers = %q/%q, want empty", got[0].Speaker, got[1].Speaker)
|
||||
}
|
||||
if got[1].Start <= got[0].Start {
|
||||
t.Fatalf("segment times did not advance: %#v", got)
|
||||
@@ -93,12 +93,12 @@ func TestNormalizeAudioLLMSegmentsSplitsSingleLongSegment(t *testing.T) {
|
||||
if len(got) < 2 {
|
||||
t.Fatalf("segments = %#v, want heuristic split", got)
|
||||
}
|
||||
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
|
||||
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
|
||||
if got[0].Speaker != "" || got[1].Speaker != "" {
|
||||
t.Fatalf("speakers = %q/%q, want empty", got[0].Speaker, got[1].Speaker)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeAudioLLMSegmentsKeepsSegmentsAndAddsSpeakers(t *testing.T) {
|
||||
func TestNormalizeAudioLLMSegmentsKeepsSegmentsWithoutFakeSpeakers(t *testing.T) {
|
||||
got := normalizeAudioLLMSegments([]Segment{
|
||||
{Start: 0, End: 1, Text: "Алло."},
|
||||
{Start: 1, End: 2, Text: "Да, слушаю."},
|
||||
@@ -106,7 +106,21 @@ func TestNormalizeAudioLLMSegmentsKeepsSegmentsAndAddsSpeakers(t *testing.T) {
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("segments = %#v", got)
|
||||
}
|
||||
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
|
||||
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
|
||||
if got[0].Speaker != "" || got[1].Speaker != "" {
|
||||
t.Fatalf("speakers = %q/%q, want empty", got[0].Speaker, got[1].Speaker)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeAudioLLMSegmentsUsesExplicitSpeakerLabels(t *testing.T) {
|
||||
text := "Спикер 1: Алло, добрый день. Спикер 2: Да, слушаю. Спикер 1: Скажите, квартира продается?"
|
||||
got := normalizeAudioLLMSegments(nil, text, true)
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("segments = %#v, want 3", got)
|
||||
}
|
||||
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" || got[2].Speaker != "SPEAKER_00" {
|
||||
t.Fatalf("speakers = %q/%q/%q", got[0].Speaker, got[1].Speaker, got[2].Speaker)
|
||||
}
|
||||
if got[0].Text != "Алло, добрый день." || got[1].Text != "Да, слушаю." {
|
||||
t.Fatalf("texts = %#v", got)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user