Use Voxtral JSON transcription without fake speakers
Some checks failed
CI / test (push) Failing after 9s
Build and Deploy / build-and-deploy (push) Successful in 23s

This commit is contained in:
Grendgi
2026-06-09 22:09:27 +03:00
parent e074f6b226
commit bf945e05e3
2 changed files with 68 additions and 53 deletions

View File

@@ -9,6 +9,7 @@ import (
"mime/multipart" "mime/multipart"
"net/http" "net/http"
"path/filepath" "path/filepath"
"regexp"
"strings" "strings"
"time" "time"
) )
@@ -20,6 +21,8 @@ type Client struct {
const ProviderVoxtral = "voxtral-small" const ProviderVoxtral = "voxtral-small"
var speakerLabelPattern = regexp.MustCompile(`(?i)(?:^|[\n\r ]+)((?:speaker|спикер|говорящий)\s*\d+)\s*[:-]`)
type Options struct { type Options struct {
VoxtralBaseURL string VoxtralBaseURL string
VoxtralAPIKey string VoxtralAPIKey string
@@ -230,18 +233,7 @@ func clampTime(v float64) float64 {
} }
func (c *Client) transcribeOpenAIAudio(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*audioLLMResponse, time.Duration, error) { func (c *Client) transcribeOpenAIAudio(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*audioLLMResponse, time.Duration, error) {
resp, duration, err := c.doOpenAIAudioTranscription(ctx, provider, audio, filename, in, "verbose_json") return c.doOpenAIAudioTranscription(ctx, provider, audio, filename, in, "json")
if err == nil {
return resp, duration, nil
}
if !strings.Contains(strings.ToLower(err.Error()), "http 4") {
return nil, duration, err
}
fallback, fallbackDuration, fallbackErr := c.doOpenAIAudioTranscription(ctx, provider, audio, filename, in, "json")
if fallbackErr != nil {
return nil, duration + fallbackDuration, err
}
return fallback, duration + fallbackDuration, nil
} }
func (c *Client) doOpenAIAudioTranscription(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input, responseFormat string) (*audioLLMResponse, time.Duration, error) { func (c *Client) doOpenAIAudioTranscription(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input, responseFormat string) (*audioLLMResponse, time.Duration, error) {
@@ -336,45 +328,61 @@ func convertAudioSegments(in []audioTranscriptionSegment) []Segment {
func normalizeAudioLLMSegments(segments []Segment, text string, diarize bool) []Segment { func normalizeAudioLLMSegments(segments []Segment, text string, diarize bool) []Segment {
text = strings.TrimSpace(text) text = strings.TrimSpace(text)
if text != "" {
if labeled := segmentSpeakerLabeledText(text); len(labeled) > 0 {
return labeled
}
}
if len(segments) <= 1 && text != "" { if len(segments) <= 1 && text != "" {
heuristic := segmentTranscriptText(text, diarize) heuristic := segmentTranscriptText(text)
if len(heuristic) > len(segments) { if len(heuristic) > len(segments) {
segments = heuristic segments = heuristic
} }
} }
return ensureHeuristicSpeakers(segments, diarize) return segments
} }
func ensureHeuristicSpeakers(segments []Segment, diarize bool) []Segment { func segmentSpeakerLabeledText(text string) []Segment {
if !diarize || len(segments) < 2 || segmentsHaveSpeakers(segments) { matches := speakerLabelPattern.FindAllStringSubmatchIndex(text, -1)
return segments if len(matches) == 0 {
return nil
} }
out := make([]Segment, len(segments)) speakerIDs := map[string]string{}
copy(out, segments) var out []Segment
for i := range out { var t float64
if i%2 == 0 { for i, match := range matches {
out[i].Speaker = "SPEAKER_00" label := strings.ToLower(strings.TrimSpace(text[match[2]:match[3]]))
} else { speaker, ok := speakerIDs[label]
out[i].Speaker = "SPEAKER_01" if !ok {
speaker = fmt.Sprintf("SPEAKER_%02d", len(speakerIDs))
speakerIDs[label] = speaker
} }
start := match[1]
end := len(text)
if i+1 < len(matches) {
end = matches[i+1][0]
}
part := strings.TrimSpace(text[start:end])
part = strings.Trim(part, ":-— ")
if part == "" {
continue
}
words := len(strings.Fields(part))
duration := float64(words) * 0.42
if duration < 1.2 {
duration = 1.2
}
out = append(out, Segment{Start: t, End: t + duration, Text: part, Speaker: speaker})
t += duration
} }
return out return out
} }
func segmentsHaveSpeakers(segments []Segment) bool { func segmentTranscriptText(text string) []Segment {
for _, segment := range segments {
if strings.TrimSpace(segment.Speaker) != "" {
return true
}
}
return false
}
func segmentTranscriptText(text string, diarize bool) []Segment {
parts := splitTranscriptSentences(text) parts := splitTranscriptSentences(text)
out := make([]Segment, 0, len(parts)) out := make([]Segment, 0, len(parts))
var t float64 var t float64
for i, part := range parts { for _, part := range parts {
words := len(strings.Fields(part)) words := len(strings.Fields(part))
if words == 0 { if words == 0 {
continue continue
@@ -384,13 +392,6 @@ func segmentTranscriptText(text string, diarize bool) []Segment {
duration = 1.2 duration = 1.2
} }
segment := Segment{Start: t, End: t + duration, Text: part} segment := Segment{Start: t, End: t + duration, Text: part}
if diarize && len(parts) > 1 {
if i%2 == 0 {
segment.Speaker = "SPEAKER_00"
} else {
segment.Speaker = "SPEAKER_01"
}
}
out = append(out, segment) out = append(out, segment)
t = segment.End t = segment.End
} }

View File

@@ -66,21 +66,21 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
if gotModel != "mistralai/Voxtral-Small-24B-2507" { if gotModel != "mistralai/Voxtral-Small-24B-2507" {
t.Fatalf("model = %q", gotModel) t.Fatalf("model = %q", gotModel)
} }
if gotResponseFormat != "verbose_json" { if gotResponseFormat != "json" {
t.Fatalf("response_format = %q, want verbose_json", gotResponseFormat) t.Fatalf("response_format = %q, want json", gotResponseFormat)
} }
if len(got.Segments) != 2 || got.Segments[0].Text != "Алло, тест." || got.Segments[1].Start != 1.2 { if len(got.Segments) != 2 || got.Segments[0].Text != "Алло, тест." || got.Segments[1].Start != 1.2 {
t.Fatalf("segments = %#v", got.Segments) t.Fatalf("segments = %#v", got.Segments)
} }
} }
func TestSegmentTranscriptTextAddsHeuristicSpeakers(t *testing.T) { func TestSegmentTranscriptTextDoesNotInventSpeakers(t *testing.T) {
got := segmentTranscriptText("Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается.", true) got := segmentTranscriptText("Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается.")
if len(got) < 2 { if len(got) < 2 {
t.Fatalf("segments = %#v, want multiple", got) t.Fatalf("segments = %#v, want multiple", got)
} }
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" { if got[0].Speaker != "" || got[1].Speaker != "" {
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker) t.Fatalf("speakers = %q/%q, want empty", got[0].Speaker, got[1].Speaker)
} }
if got[1].Start <= got[0].Start { if got[1].Start <= got[0].Start {
t.Fatalf("segment times did not advance: %#v", got) t.Fatalf("segment times did not advance: %#v", got)
@@ -93,12 +93,12 @@ func TestNormalizeAudioLLMSegmentsSplitsSingleLongSegment(t *testing.T) {
if len(got) < 2 { if len(got) < 2 {
t.Fatalf("segments = %#v, want heuristic split", got) t.Fatalf("segments = %#v, want heuristic split", got)
} }
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" { if got[0].Speaker != "" || got[1].Speaker != "" {
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker) t.Fatalf("speakers = %q/%q, want empty", got[0].Speaker, got[1].Speaker)
} }
} }
func TestNormalizeAudioLLMSegmentsKeepsSegmentsAndAddsSpeakers(t *testing.T) { func TestNormalizeAudioLLMSegmentsKeepsSegmentsWithoutFakeSpeakers(t *testing.T) {
got := normalizeAudioLLMSegments([]Segment{ got := normalizeAudioLLMSegments([]Segment{
{Start: 0, End: 1, Text: "Алло."}, {Start: 0, End: 1, Text: "Алло."},
{Start: 1, End: 2, Text: "Да, слушаю."}, {Start: 1, End: 2, Text: "Да, слушаю."},
@@ -106,7 +106,21 @@ func TestNormalizeAudioLLMSegmentsKeepsSegmentsAndAddsSpeakers(t *testing.T) {
if len(got) != 2 { if len(got) != 2 {
t.Fatalf("segments = %#v", got) t.Fatalf("segments = %#v", got)
} }
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" { if got[0].Speaker != "" || got[1].Speaker != "" {
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker) t.Fatalf("speakers = %q/%q, want empty", got[0].Speaker, got[1].Speaker)
}
}
func TestNormalizeAudioLLMSegmentsUsesExplicitSpeakerLabels(t *testing.T) {
text := "Спикер 1: Алло, добрый день. Спикер 2: Да, слушаю. Спикер 1: Скажите, квартира продается?"
got := normalizeAudioLLMSegments(nil, text, true)
if len(got) != 3 {
t.Fatalf("segments = %#v, want 3", got)
}
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" || got[2].Speaker != "SPEAKER_00" {
t.Fatalf("speakers = %q/%q/%q", got[0].Speaker, got[1].Speaker, got[2].Speaker)
}
if got[0].Text != "Алло, добрый день." || got[1].Text != "Да, слушаю." {
t.Fatalf("texts = %#v", got)
} }
} }