Use verbose Whisper transcription output
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
@@ -103,6 +104,15 @@ type audioTranscriptionSegment struct {
|
||||
Text string `json:"text"`
|
||||
}
|
||||
|
||||
// audioTranscriptionStatusError is a typed error holding an HTTP status code
// together with the accompanying response body text. Being a distinct type,
// callers can pull it out of an error chain with errors.As and inspect the
// status and body instead of parsing the formatted message.
type audioTranscriptionStatusError struct {
	// status is the numeric HTTP status code reported with the failure.
	status int
	// body is the response body text reported with the failure.
	body string
}

// Error implements the error interface, rendering the status code followed
// by the body text.
func (e audioTranscriptionStatusError) Error() string {
	const layout = "audio transcription HTTP %d: %s"
	return fmt.Sprintf(layout, e.status, e.body)
}
|
||||
|
||||
func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
|
||||
return NewWithOptions(Options{
|
||||
AudioBaseURL: baseURL,
|
||||
@@ -233,7 +243,15 @@ func clampTime(v float64) float64 {
|
||||
}
|
||||
|
||||
func (c *Client) transcribeOpenAIAudio(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input) (*audioLLMResponse, time.Duration, error) {
|
||||
return c.doOpenAIAudioTranscription(ctx, provider, audio, filename, in, "json")
|
||||
resp, duration, err := c.doOpenAIAudioTranscription(ctx, provider, audio, filename, in, "verbose_json")
|
||||
if err == nil {
|
||||
return resp, duration, nil
|
||||
}
|
||||
if !isVerboseJSONUnsupported(err) {
|
||||
return nil, duration, err
|
||||
}
|
||||
fallbackResp, fallbackDuration, fallbackErr := c.doOpenAIAudioTranscription(ctx, provider, audio, filename, in, "json")
|
||||
return fallbackResp, duration + fallbackDuration, fallbackErr
|
||||
}
|
||||
|
||||
func (c *Client) doOpenAIAudioTranscription(ctx context.Context, provider ProviderConfig, audio []byte, filename string, in Input, responseFormat string) (*audioLLMResponse, time.Duration, error) {
|
||||
@@ -295,7 +313,7 @@ func (c *Client) doOpenAIAudioTranscription(ctx context.Context, provider Provid
|
||||
return nil, duration, fmt.Errorf("audio transcription read: %w", err)
|
||||
}
|
||||
if resp.StatusCode >= 300 {
|
||||
return nil, duration, fmt.Errorf("audio transcription HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(raw)))
|
||||
return nil, duration, audioTranscriptionStatusError{status: resp.StatusCode, body: strings.TrimSpace(string(raw))}
|
||||
}
|
||||
var out audioTranscriptionResponse
|
||||
if err := json.Unmarshal(raw, &out); err != nil {
|
||||
@@ -313,6 +331,20 @@ func (c *Client) doOpenAIAudioTranscription(ctx context.Context, provider Provid
|
||||
}, duration, nil
|
||||
}
|
||||
|
||||
func isVerboseJSONUnsupported(err error) bool {
|
||||
var statusErr audioTranscriptionStatusError
|
||||
if !errors.As(err, &statusErr) {
|
||||
return false
|
||||
}
|
||||
if statusErr.status != http.StatusBadRequest && statusErr.status != http.StatusUnprocessableEntity {
|
||||
return false
|
||||
}
|
||||
body := strings.ToLower(statusErr.body)
|
||||
return strings.Contains(body, "verbose_json") ||
|
||||
strings.Contains(body, "response_format") ||
|
||||
strings.Contains(body, "timestamp_granularities")
|
||||
}
|
||||
|
||||
func convertAudioSegments(in []audioTranscriptionSegment) []Segment {
|
||||
out := make([]Segment, 0, len(in))
|
||||
for _, s := range in {
|
||||
|
||||
Reference in New Issue
Block a user