Split single Voxtral transcript segments
This commit is contained in:
@@ -377,10 +377,7 @@ func (c *Client) transcribeWithProvider(ctx context.Context, provider ProviderCo
|
|||||||
return nil, attempt, err
|
return nil, attempt, err
|
||||||
}
|
}
|
||||||
text := strings.TrimSpace(resp.Text)
|
text := strings.TrimSpace(resp.Text)
|
||||||
segments := resp.Segments
|
segments := normalizeAudioLLMSegments(resp.Segments, text, in.Diarize)
|
||||||
if len(segments) == 0 {
|
|
||||||
segments = segmentTranscriptText(text, in.Diarize)
|
|
||||||
}
|
|
||||||
attempt.Status = "ok"
|
attempt.Status = "ok"
|
||||||
attempt.Model = resp.Model
|
attempt.Model = resp.Model
|
||||||
attempt.Text = text
|
attempt.Text = text
|
||||||
@@ -749,6 +746,42 @@ func convertAudioSegments(in []audioTranscriptionSegment) []Segment {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func normalizeAudioLLMSegments(segments []Segment, text string, diarize bool) []Segment {
|
||||||
|
text = strings.TrimSpace(text)
|
||||||
|
if len(segments) <= 1 && text != "" {
|
||||||
|
heuristic := segmentTranscriptText(text, diarize)
|
||||||
|
if len(heuristic) > len(segments) {
|
||||||
|
segments = heuristic
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ensureHeuristicSpeakers(segments, diarize)
|
||||||
|
}
|
||||||
|
|
||||||
|
func ensureHeuristicSpeakers(segments []Segment, diarize bool) []Segment {
|
||||||
|
if !diarize || len(segments) < 2 || segmentsHaveSpeakers(segments) {
|
||||||
|
return segments
|
||||||
|
}
|
||||||
|
out := make([]Segment, len(segments))
|
||||||
|
copy(out, segments)
|
||||||
|
for i := range out {
|
||||||
|
if i%2 == 0 {
|
||||||
|
out[i].Speaker = "SPEAKER_00"
|
||||||
|
} else {
|
||||||
|
out[i].Speaker = "SPEAKER_01"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func segmentsHaveSpeakers(segments []Segment) bool {
|
||||||
|
for _, segment := range segments {
|
||||||
|
if strings.TrimSpace(segment.Speaker) != "" {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
func segmentTranscriptText(text string, diarize bool) []Segment {
|
func segmentTranscriptText(text string, diarize bool) []Segment {
|
||||||
parts := splitTranscriptSentences(text)
|
parts := splitTranscriptSentences(text)
|
||||||
out := make([]Segment, 0, len(parts))
|
out := make([]Segment, 0, len(parts))
|
||||||
|
|||||||
@@ -135,6 +135,30 @@ func TestSegmentTranscriptTextAddsHeuristicSpeakers(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNormalizeAudioLLMSegmentsSplitsSingleLongSegment(t *testing.T) {
|
||||||
|
text := "Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается."
|
||||||
|
got := normalizeAudioLLMSegments([]Segment{{Start: 0, End: 12, Text: text}}, text, true)
|
||||||
|
if len(got) < 2 {
|
||||||
|
t.Fatalf("segments = %#v, want heuristic split", got)
|
||||||
|
}
|
||||||
|
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
|
||||||
|
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizeAudioLLMSegmentsKeepsSegmentsAndAddsSpeakers(t *testing.T) {
|
||||||
|
got := normalizeAudioLLMSegments([]Segment{
|
||||||
|
{Start: 0, End: 1, Text: "Алло."},
|
||||||
|
{Start: 1, End: 2, Text: "Да, слушаю."},
|
||||||
|
}, "Алло. Да, слушаю.", true)
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("segments = %#v", got)
|
||||||
|
}
|
||||||
|
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
|
||||||
|
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// near reports whether got and want differ by strictly less than 0.000001.
func near(got, want float64) bool {
	const tolerance = 0.000001
	delta := math.Abs(got - want)
	return delta < tolerance
}
|
||||||
|
|||||||
Reference in New Issue
Block a user