Use verbose Whisper transcription output
This commit is contained in:
@@ -28,7 +28,7 @@ func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
}))
|
||||
defer audioSrv.Close()
|
||||
|
||||
var gotPath, gotModel, gotResponseFormat, gotPrompt, gotTemperature string
|
||||
var gotPath, gotModel, gotResponseFormat, gotPrompt, gotTemperature, gotTimestampGranularity string
|
||||
providerSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
gotPath = r.URL.Path
|
||||
if err := r.ParseMultipartForm(16 << 20); err != nil {
|
||||
@@ -38,6 +38,7 @@ func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
gotResponseFormat = r.FormValue("response_format")
|
||||
gotPrompt = r.FormValue("prompt")
|
||||
gotTemperature = r.FormValue("temperature")
|
||||
gotTimestampGranularity = r.FormValue("timestamp_granularities[]")
|
||||
if _, _, err := r.FormFile("file"); err != nil {
|
||||
t.Fatalf("FormFile: %v", err)
|
||||
}
|
||||
@@ -68,12 +69,15 @@ func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
if gotModel != "openai/whisper-large-v3" {
|
||||
t.Fatalf("model = %q", gotModel)
|
||||
}
|
||||
if gotResponseFormat != "json" {
|
||||
t.Fatalf("response_format = %q, want json", gotResponseFormat)
|
||||
if gotResponseFormat != "verbose_json" {
|
||||
t.Fatalf("response_format = %q, want verbose_json", gotResponseFormat)
|
||||
}
|
||||
if gotTemperature != "0" {
|
||||
t.Fatalf("temperature = %q, want 0", gotTemperature)
|
||||
}
|
||||
if gotTimestampGranularity != "segment" {
|
||||
t.Fatalf("timestamp_granularities[] = %q, want segment", gotTimestampGranularity)
|
||||
}
|
||||
if gotPrompt != "" {
|
||||
t.Fatalf("prompt = %q, want empty", gotPrompt)
|
||||
}
|
||||
@@ -82,6 +86,48 @@ func TestWhisperUsesAudioTranscriptionsEndpoint(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestWhisperFallsBackToJSONWhenVerboseJSONUnsupported(t *testing.T) {
|
||||
audioSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte("fake audio"))
|
||||
}))
|
||||
defer audioSrv.Close()
|
||||
|
||||
var formats []string
|
||||
providerSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if err := r.ParseMultipartForm(16 << 20); err != nil {
|
||||
t.Fatalf("ParseMultipartForm: %v", err)
|
||||
}
|
||||
format := r.FormValue("response_format")
|
||||
formats = append(formats, format)
|
||||
if format == "verbose_json" {
|
||||
w.WriteHeader(http.StatusBadRequest)
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"error": map[string]any{"message": "unsupported response_format verbose_json"},
|
||||
})
|
||||
return
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"text": "Алло, fallback работает.",
|
||||
})
|
||||
}))
|
||||
defer providerSrv.Close()
|
||||
|
||||
client := NewWithOptions(Options{
|
||||
AudioBaseURL: providerSrv.URL,
|
||||
AudioModel: "openai/whisper-large-v3",
|
||||
})
|
||||
got, err := client.Transcribe(t.Context(), Input{AudioURL: audioSrv.URL, Filename: "call.mp3"})
|
||||
if err != nil {
|
||||
t.Fatalf("Transcribe: %v", err)
|
||||
}
|
||||
if len(formats) != 2 || formats[0] != "verbose_json" || formats[1] != "json" {
|
||||
t.Fatalf("formats = %#v, want verbose_json then json", formats)
|
||||
}
|
||||
if got.Segments[0].Text != "Алло, fallback работает." {
|
||||
t.Fatalf("segments = %#v", got.Segments)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSegmentTranscriptTextDoesNotInventSpeakers(t *testing.T) {
|
||||
got := segmentTranscriptText("Алло, добрый день. Да, слушаю. Скажите, квартира продается? Да, продается.")
|
||||
if len(got) < 2 {
|
||||
|
||||
Reference in New Issue
Block a user