diff --git a/internal/transcription/client.go b/internal/transcription/client.go
index 44917be..bbfd2ca 100644
--- a/internal/transcription/client.go
+++ b/internal/transcription/client.go
@@ -377,10 +377,7 @@ func (c *Client) transcribeWithProvider(ctx context.Context, provider ProviderCo
 		return nil, attempt, err
 	}
 	text := strings.TrimSpace(resp.Text)
-	segments := resp.Segments
-	if len(segments) == 0 {
-		segments = segmentTranscriptText(text, in.Diarize)
-	}
+	segments := normalizeAudioLLMSegments(resp.Segments, text, in.Diarize)
 	attempt.Status = "ok"
 	attempt.Model = resp.Model
 	attempt.Text = text
@@ -749,6 +746,58 @@ func convertAudioSegments(in []audioTranscriptionSegment) []Segment {
 	return out
 }
 
+// normalizeAudioLLMSegments chooses the segment list reported for an
+// audio-LLM response. When the provider returned at most one segment and the
+// trimmed transcript is non-empty, the text is re-split with
+// segmentTranscriptText, and that split is used only if it yields more
+// segments than the provider did. The chosen list is then passed through
+// ensureHeuristicSpeakers.
+//
+// NOTE(review): when the heuristic split replaces a lone provider segment, that
+// segment's Start/End are not carried over here; confirm the timings produced
+// by segmentTranscriptText are acceptable to callers.
+func normalizeAudioLLMSegments(segments []Segment, text string, diarize bool) []Segment {
+	text = strings.TrimSpace(text)
+	if len(segments) <= 1 && text != "" {
+		heuristic := segmentTranscriptText(text, diarize)
+		if len(heuristic) > len(segments) {
+			segments = heuristic
+		}
+	}
+	return ensureHeuristicSpeakers(segments, diarize)
+}
+
+// ensureHeuristicSpeakers labels segments with alternating "SPEAKER_00" and
+// "SPEAKER_01" by index when diarization was requested but no segment carries
+// a speaker. The input is returned unchanged when diarize is false, when there
+// are fewer than two segments, or when any segment already has a non-blank
+// Speaker. Labels are written to a copy, so the input slice's elements are
+// left as they were.
+func ensureHeuristicSpeakers(segments []Segment, diarize bool) []Segment {
+	if !diarize || len(segments) < 2 || segmentsHaveSpeakers(segments) {
+		return segments
+	}
+	// Labels alternate purely by position; even indexes get the first label.
+	labels := [2]string{"SPEAKER_00", "SPEAKER_01"}
+	out := make([]Segment, len(segments))
+	copy(out, segments)
+	for i := range out {
+		out[i].Speaker = labels[i%2]
+	}
+	return out
+}
+
+// segmentsHaveSpeakers reports whether at least one segment has a Speaker that
+// is non-empty after trimming whitespace.
+func segmentsHaveSpeakers(segments []Segment) bool {
+	for _, segment := range segments {
+		if strings.TrimSpace(segment.Speaker) != "" {
+			return true
+		}
+	}
+	return false
+}
+
 func segmentTranscriptText(text string, diarize bool) []Segment {
 	parts := splitTranscriptSentences(text)
 	out := make([]Segment, 0, len(parts))
diff --git a/internal/transcription/client_test.go b/internal/transcription/client_test.go
index a769bc3..5258a8b 100644
--- a/internal/transcription/client_test.go
+++ b/internal/transcription/client_test.go
@@ -135,6 +135,35 @@ func TestSegmentTranscriptTextAddsHeuristicSpeakers(t *testing.T) {
 	}
 }
 
+// TestNormalizeAudioLLMSegmentsSplitsSingleLongSegment checks that one segment
+// holding a multi-sentence transcript comes back as two or more labeled segments.
+func TestNormalizeAudioLLMSegmentsSplitsSingleLongSegment(t *testing.T) {
+	text := "Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается."
+	got := normalizeAudioLLMSegments([]Segment{{Start: 0, End: 12, Text: text}}, text, true)
+	if len(got) < 2 {
+		t.Fatalf("segments = %#v, want heuristic split", got)
+	}
+	if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
+		t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
+	}
+}
+
+// TestNormalizeAudioLLMSegmentsKeepsSegmentsAndAddsSpeakers checks that two
+// speakerless provider segments stay two segments and receive alternating
+// speaker labels.
+func TestNormalizeAudioLLMSegmentsKeepsSegmentsAndAddsSpeakers(t *testing.T) {
+	got := normalizeAudioLLMSegments([]Segment{
+		{Start: 0, End: 1, Text: "Алло."},
+		{Start: 1, End: 2, Text: "Да, слушаю."},
+	}, "Алло. Да, слушаю.", true)
+	if len(got) != 2 {
+		t.Fatalf("segments = %#v", got)
+	}
+	if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
+		t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
+	}
+}
+
 func near(got, want float64) bool {
 	return math.Abs(got-want) < 0.000001
 }