Use Voxtral JSON transcription without fake speakers
This commit is contained in:
@@ -9,6 +9,7 @@ import (
|
|||||||
"mime/multipart"
|
"mime/multipart"
|
||||||
"net/http"
|
"net/http"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@@ -20,6 +21,8 @@ type Client struct {
|
|||||||
|
|
||||||
const ProviderVoxtral = "voxtral-small"
|
const ProviderVoxtral = "voxtral-small"
|
||||||
|
|
||||||
|
var speakerLabelPattern = regexp.MustCompile(`(?i)(?:^|[\n\r ]+)((?:speaker|спикер|говорящий)\s*\d+)\s*[::-]`)
|
||||||
|
|
||||||
type Options struct {
|
type Options struct {
|
||||||
VoxtralBaseURL string
|
VoxtralBaseURL string
|
||||||
VoxtralAPIKey string
|
VoxtralAPIKey string
|
||||||
@@ -230,18 +233,7 @@ func clampTime(v float64) float64 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *Client) transcribeOpenAIAudio(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*audioLLMResponse, time.Duration, error) {
|
func (c *Client) transcribeOpenAIAudio(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*audioLLMResponse, time.Duration, error) {
|
||||||
resp, duration, err := c.doOpenAIAudioTranscription(ctx, provider, audio, filename, in, "verbose_json")
|
return c.doOpenAIAudioTranscription(ctx, provider, audio, filename, in, "json")
|
||||||
if err == nil {
|
|
||||||
return resp, duration, nil
|
|
||||||
}
|
|
||||||
if !strings.Contains(strings.ToLower(err.Error()), "http 4") {
|
|
||||||
return nil, duration, err
|
|
||||||
}
|
|
||||||
fallback, fallbackDuration, fallbackErr := c.doOpenAIAudioTranscription(ctx, provider, audio, filename, in, "json")
|
|
||||||
if fallbackErr != nil {
|
|
||||||
return nil, duration + fallbackDuration, err
|
|
||||||
}
|
|
||||||
return fallback, duration + fallbackDuration, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Client) doOpenAIAudioTranscription(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input, responseFormat string) (*audioLLMResponse, time.Duration, error) {
|
func (c *Client) doOpenAIAudioTranscription(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input, responseFormat string) (*audioLLMResponse, time.Duration, error) {
|
||||||
@@ -336,45 +328,61 @@ func convertAudioSegments(in []audioTranscriptionSegment) []Segment {
|
|||||||
|
|
||||||
func normalizeAudioLLMSegments(segments []Segment, text string, diarize bool) []Segment {
|
func normalizeAudioLLMSegments(segments []Segment, text string, diarize bool) []Segment {
|
||||||
text = strings.TrimSpace(text)
|
text = strings.TrimSpace(text)
|
||||||
|
if text != "" {
|
||||||
|
if labeled := segmentSpeakerLabeledText(text); len(labeled) > 0 {
|
||||||
|
return labeled
|
||||||
|
}
|
||||||
|
}
|
||||||
if len(segments) <= 1 && text != "" {
|
if len(segments) <= 1 && text != "" {
|
||||||
heuristic := segmentTranscriptText(text, diarize)
|
heuristic := segmentTranscriptText(text)
|
||||||
if len(heuristic) > len(segments) {
|
if len(heuristic) > len(segments) {
|
||||||
segments = heuristic
|
segments = heuristic
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ensureHeuristicSpeakers(segments, diarize)
|
return segments
|
||||||
}
|
}
|
||||||
|
|
||||||
func ensureHeuristicSpeakers(segments []Segment, diarize bool) []Segment {
|
func segmentSpeakerLabeledText(text string) []Segment {
|
||||||
if !diarize || len(segments) < 2 || segmentsHaveSpeakers(segments) {
|
matches := speakerLabelPattern.FindAllStringSubmatchIndex(text, -1)
|
||||||
return segments
|
if len(matches) == 0 {
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
out := make([]Segment, len(segments))
|
speakerIDs := map[string]string{}
|
||||||
copy(out, segments)
|
var out []Segment
|
||||||
for i := range out {
|
var t float64
|
||||||
if i%2 == 0 {
|
for i, match := range matches {
|
||||||
out[i].Speaker = "SPEAKER_00"
|
label := strings.ToLower(strings.TrimSpace(text[match[2]:match[3]]))
|
||||||
} else {
|
speaker, ok := speakerIDs[label]
|
||||||
out[i].Speaker = "SPEAKER_01"
|
if !ok {
|
||||||
|
speaker = fmt.Sprintf("SPEAKER_%02d", len(speakerIDs))
|
||||||
|
speakerIDs[label] = speaker
|
||||||
}
|
}
|
||||||
|
start := match[1]
|
||||||
|
end := len(text)
|
||||||
|
if i+1 < len(matches) {
|
||||||
|
end = matches[i+1][0]
|
||||||
|
}
|
||||||
|
part := strings.TrimSpace(text[start:end])
|
||||||
|
part = strings.Trim(part, ":-— ")
|
||||||
|
if part == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
words := len(strings.Fields(part))
|
||||||
|
duration := float64(words) * 0.42
|
||||||
|
if duration < 1.2 {
|
||||||
|
duration = 1.2
|
||||||
|
}
|
||||||
|
out = append(out, Segment{Start: t, End: t + duration, Text: part, Speaker: speaker})
|
||||||
|
t += duration
|
||||||
}
|
}
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
func segmentsHaveSpeakers(segments []Segment) bool {
|
func segmentTranscriptText(text string) []Segment {
|
||||||
for _, segment := range segments {
|
|
||||||
if strings.TrimSpace(segment.Speaker) != "" {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
func segmentTranscriptText(text string, diarize bool) []Segment {
|
|
||||||
parts := splitTranscriptSentences(text)
|
parts := splitTranscriptSentences(text)
|
||||||
out := make([]Segment, 0, len(parts))
|
out := make([]Segment, 0, len(parts))
|
||||||
var t float64
|
var t float64
|
||||||
for i, part := range parts {
|
for _, part := range parts {
|
||||||
words := len(strings.Fields(part))
|
words := len(strings.Fields(part))
|
||||||
if words == 0 {
|
if words == 0 {
|
||||||
continue
|
continue
|
||||||
@@ -384,13 +392,6 @@ func segmentTranscriptText(text string, diarize bool) []Segment {
|
|||||||
duration = 1.2
|
duration = 1.2
|
||||||
}
|
}
|
||||||
segment := Segment{Start: t, End: t + duration, Text: part}
|
segment := Segment{Start: t, End: t + duration, Text: part}
|
||||||
if diarize && len(parts) > 1 {
|
|
||||||
if i%2 == 0 {
|
|
||||||
segment.Speaker = "SPEAKER_00"
|
|
||||||
} else {
|
|
||||||
segment.Speaker = "SPEAKER_01"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
out = append(out, segment)
|
out = append(out, segment)
|
||||||
t = segment.End
|
t = segment.End
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -66,21 +66,21 @@ func TestVoxtralUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
|||||||
if gotModel != "mistralai/Voxtral-Small-24B-2507" {
|
if gotModel != "mistralai/Voxtral-Small-24B-2507" {
|
||||||
t.Fatalf("model = %q", gotModel)
|
t.Fatalf("model = %q", gotModel)
|
||||||
}
|
}
|
||||||
if gotResponseFormat != "verbose_json" {
|
if gotResponseFormat != "json" {
|
||||||
t.Fatalf("response_format = %q, want verbose_json", gotResponseFormat)
|
t.Fatalf("response_format = %q, want json", gotResponseFormat)
|
||||||
}
|
}
|
||||||
if len(got.Segments) != 2 || got.Segments[0].Text != "Алло, тест." || got.Segments[1].Start != 1.2 {
|
if len(got.Segments) != 2 || got.Segments[0].Text != "Алло, тест." || got.Segments[1].Start != 1.2 {
|
||||||
t.Fatalf("segments = %#v", got.Segments)
|
t.Fatalf("segments = %#v", got.Segments)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestSegmentTranscriptTextAddsHeuristicSpeakers(t *testing.T) {
|
func TestSegmentTranscriptTextDoesNotInventSpeakers(t *testing.T) {
|
||||||
got := segmentTranscriptText("Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается.", true)
|
got := segmentTranscriptText("Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается.")
|
||||||
if len(got) < 2 {
|
if len(got) < 2 {
|
||||||
t.Fatalf("segments = %#v, want multiple", got)
|
t.Fatalf("segments = %#v, want multiple", got)
|
||||||
}
|
}
|
||||||
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
|
if got[0].Speaker != "" || got[1].Speaker != "" {
|
||||||
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
|
t.Fatalf("speakers = %q/%q, want empty", got[0].Speaker, got[1].Speaker)
|
||||||
}
|
}
|
||||||
if got[1].Start <= got[0].Start {
|
if got[1].Start <= got[0].Start {
|
||||||
t.Fatalf("segment times did not advance: %#v", got)
|
t.Fatalf("segment times did not advance: %#v", got)
|
||||||
@@ -93,12 +93,12 @@ func TestNormalizeAudioLLMSegmentsSplitsSingleLongSegment(t *testing.T) {
|
|||||||
if len(got) < 2 {
|
if len(got) < 2 {
|
||||||
t.Fatalf("segments = %#v, want heuristic split", got)
|
t.Fatalf("segments = %#v, want heuristic split", got)
|
||||||
}
|
}
|
||||||
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
|
if got[0].Speaker != "" || got[1].Speaker != "" {
|
||||||
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
|
t.Fatalf("speakers = %q/%q, want empty", got[0].Speaker, got[1].Speaker)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNormalizeAudioLLMSegmentsKeepsSegmentsAndAddsSpeakers(t *testing.T) {
|
func TestNormalizeAudioLLMSegmentsKeepsSegmentsWithoutFakeSpeakers(t *testing.T) {
|
||||||
got := normalizeAudioLLMSegments([]Segment{
|
got := normalizeAudioLLMSegments([]Segment{
|
||||||
{Start: 0, End: 1, Text: "Алло."},
|
{Start: 0, End: 1, Text: "Алло."},
|
||||||
{Start: 1, End: 2, Text: "Да, слушаю."},
|
{Start: 1, End: 2, Text: "Да, слушаю."},
|
||||||
@@ -106,7 +106,21 @@ func TestNormalizeAudioLLMSegmentsKeepsSegmentsAndAddsSpeakers(t *testing.T) {
|
|||||||
if len(got) != 2 {
|
if len(got) != 2 {
|
||||||
t.Fatalf("segments = %#v", got)
|
t.Fatalf("segments = %#v", got)
|
||||||
}
|
}
|
||||||
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" {
|
if got[0].Speaker != "" || got[1].Speaker != "" {
|
||||||
t.Fatalf("speakers = %q/%q", got[0].Speaker, got[1].Speaker)
|
t.Fatalf("speakers = %q/%q, want empty", got[0].Speaker, got[1].Speaker)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizeAudioLLMSegmentsUsesExplicitSpeakerLabels(t *testing.T) {
|
||||||
|
text := "Спикер 1: Алло, добрый день. Спикер 2: Да, слушаю. Спикер 1: Скажите, квартира продается?"
|
||||||
|
got := normalizeAudioLLMSegments(nil, text, true)
|
||||||
|
if len(got) != 3 {
|
||||||
|
t.Fatalf("segments = %#v, want 3", got)
|
||||||
|
}
|
||||||
|
if got[0].Speaker != "SPEAKER_00" || got[1].Speaker != "SPEAKER_01" || got[2].Speaker != "SPEAKER_00" {
|
||||||
|
t.Fatalf("speakers = %q/%q/%q", got[0].Speaker, got[1].Speaker, got[2].Speaker)
|
||||||
|
}
|
||||||
|
if got[0].Text != "Алло, добрый день." || got[1].Text != "Да, слушаю." {
|
||||||
|
t.Fatalf("texts = %#v", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user