Use verbose Whisper transcription output

2026-06-11 11:40:52 +03:00
parent b536877181
commit 92ac01d8b5
2 changed files with 83 additions and 5 deletions
--- a/internal/transcription/client_test.go
+++ b/internal/transcription/client_test.go
@@ -28,7 +28,7 @@ func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
 	}))
 	defer audioSrv.Close()

-	var gotPath, gotModel, gotResponseFormat, gotPrompt, gotTemperature string
+	var gotPath, gotModel, gotResponseFormat, gotPrompt, gotTemperature, gotTimestampGranularity string
 	providerSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		gotPath = r.URL.Path
 		if err := r.ParseMultipartForm(16 << 20); err != nil {
@@ -38,6 +38,7 @@ func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
 		gotResponseFormat = r.FormValue("response_format")
 		gotPrompt = r.FormValue("prompt")
 		gotTemperature = r.FormValue("temperature")
+		gotTimestampGranularity = r.FormValue("timestamp_granularities[]")
 		if _, _, err := r.FormFile("file"); err != nil {
 			t.Fatalf("FormFile: %v", err)
 		}
@@ -68,12 +69,15 @@ func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
 	if gotModel != "openai/whisper-large-v3" {
 		t.Fatalf("model = %q", gotModel)
 	}
-	if gotResponseFormat != "json" {
-		t.Fatalf("response_format = %q, want json", gotResponseFormat)
+	if gotResponseFormat != "verbose_json" {
+		t.Fatalf("response_format = %q, want verbose_json", gotResponseFormat)
 	}
 	if gotTemperature != "0" {
 		t.Fatalf("temperature = %q, want 0", gotTemperature)
 	}
+	if gotTimestampGranularity != "segment" {
+		t.Fatalf("timestamp_granularities[] = %q, want segment", gotTimestampGranularity)
+	}
 	if gotPrompt != "" {
 		t.Fatalf("prompt = %q, want empty", gotPrompt)
 	}
@@ -82,6 +86,48 @@ func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
 	}
 }

+// TestWhisperFallsBackToJSONWhenVerboseJSONUnsupported verifies that Transcribe
+// retries with response_format=json after the provider rejects verbose_json.
+func TestWhisperFallsBackToJSONWhenVerboseJSONUnsupported(t *testing.T) {
+	audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte("fake audio"))
+	}))
+	defer audioSrv.Close()
+
+	var formats []string
+	providerSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if err := r.ParseMultipartForm(16 << 20); err != nil {
+			t.Errorf("ParseMultipartForm: %v", err) // t.Fatalf must not run off the test goroutine
+			http.Error(w, err.Error(), http.StatusBadRequest)
+			return
+		}
+		format := r.FormValue("response_format")
+		formats = append(formats, format)
+		if format == "verbose_json" {
+			w.WriteHeader(http.StatusBadRequest)
+			_ = json.NewEncoder(w).Encode(map[string]any{
+				"error": map[string]any{"message": "unsupported response_format verbose_json"},
+			})
+			return
+		}
+		_ = json.NewEncoder(w).Encode(map[string]any{"text": "Алло, fallback работает."})
+	}))
+	defer providerSrv.Close()
+
+	client := NewWithOptions(Options{AudioBaseURL: providerSrv.URL, AudioModel: "openai/whisper-large-v3"})
+	got, err := client.Transcribe(t.Context(), Input{AudioURL: audioSrv.URL, Filename: "call.mp3"})
+	if err != nil {
+		t.Fatalf("Transcribe: %v", err)
+	}
+	if len(formats) != 2 || formats[0] != "verbose_json" || formats[1] != "json" {
+		t.Fatalf("formats = %#v, want verbose_json then json", formats)
+	}
+	// Guard the index so an empty result fails the test instead of panicking.
+	if len(got.Segments) == 0 || got.Segments[0].Text != "Алло, fallback работает." {
+		t.Fatalf("segments = %#v", got.Segments)
+	}
+}
+
 func TestSegmentTranscriptTextDoesNotInventSpeakers(t *testing.T) {
 	got := segmentTranscriptText("Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается.")
 	if len(got) < 2 {