169 lines
4.6 KiB
Go
169 lines
4.6 KiB
Go
package transcription
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Client sends audio to a WhisperX-style transcription service over HTTP.
//
// Values are normally obtained from New. Transcribe treats a nil *Client, or
// one whose baseURL is empty, as "not configured".
type Client struct {
	// baseURL is the service root; New stores it without trailing slashes.
	baseURL string
	// http carries out both the audio download and the transcription request.
	http *http.Client
}
|
|
|
|
// Input describes a single transcription job.
type Input struct {
	// AudioURL is fetched with an HTTP GET; Transcribe rejects a blank value.
	AudioURL string `json:"audio_url"`
	// Filename names the uploaded file part. Only its base name is used, and
	// "audio.mp3" is substituted when no usable name remains.
	Filename string `json:"filename,omitempty"`
	// Language is forwarded as the "language" form field when non-empty.
	Language string `json:"language,omitempty"`
	// Diarize is always forwarded as the "diarize" form field ("true" or "false").
	Diarize bool `json:"diarize"`
	// MinSpeakers is forwarded as "min_speakers" only when Diarize is set and
	// the value is positive.
	MinSpeakers int `json:"min_speakers,omitempty"`
	// MaxSpeakers is forwarded as "max_speakers" only when Diarize is set and
	// the value is positive.
	MaxSpeakers int `json:"max_speakers,omitempty"`
}
|
|
|
|
// Segment is one timed piece of transcript text as reported by the service.
type Segment struct {
	// Start and End bound the segment in the audio.
	// NOTE(review): units are whatever the service emits (presumably seconds) — confirm.
	Start float64 `json:"start"`
	End   float64 `json:"end"`
	// Text is the transcribed content of the segment.
	Text string `json:"text"`
	// Speaker is the speaker label; it is left out of the JSON when empty.
	Speaker string `json:"speaker,omitempty"`
}
|
|
|
|
type Result struct {
|
|
Language string `json:"language"`
|
|
Segments []Segment `json:"segments"`
|
|
DiarizeError *string `json:"diarize_error,omitempty"`
|
|
AlignError *string `json:"align_error,omitempty"`
|
|
DurationMS int64 `json:"duration_ms"`
|
|
}
|
|
|
|
type whisperResponse struct {
|
|
Language string `json:"language"`
|
|
Segments []Segment `json:"segments"`
|
|
DiarizeError *string `json:"diarize_error,omitempty"`
|
|
AlignError *string `json:"align_error,omitempty"`
|
|
}
|
|
|
|
func New(baseURL string, timeout time.Duration) *Client {
|
|
baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
|
|
if baseURL == "" {
|
|
return nil
|
|
}
|
|
if timeout <= 0 {
|
|
timeout = 10 * time.Minute
|
|
}
|
|
return &Client{
|
|
baseURL: baseURL,
|
|
http: &http.Client{Timeout: timeout},
|
|
}
|
|
}
|
|
|
|
func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
|
|
if c == nil || c.baseURL == "" {
|
|
return nil, fmt.Errorf("whisperx not configured")
|
|
}
|
|
if strings.TrimSpace(in.AudioURL) == "" {
|
|
return nil, fmt.Errorf("audio_url is required")
|
|
}
|
|
audio, filename, err := c.downloadAudio(ctx, in)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
resp, duration, err := c.transcribeAudio(ctx, audio, filename, in)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &Result{
|
|
Language: resp.Language,
|
|
Segments: resp.Segments,
|
|
DiarizeError: resp.DiarizeError,
|
|
AlignError: resp.AlignError,
|
|
DurationMS: duration.Milliseconds(),
|
|
}, nil
|
|
}
|
|
|
|
func (c *Client) downloadAudio(ctx context.Context, in Input) ([]byte, string, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, in.AudioURL, nil)
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("audio request: %w", err)
|
|
}
|
|
resp, err := c.http.Do(req)
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("audio download: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode >= 300 {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
|
|
return nil, "", fmt.Errorf("audio HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body)))
|
|
}
|
|
audio, err := io.ReadAll(io.LimitReader(resp.Body, 512<<20))
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("audio read: %w", err)
|
|
}
|
|
if len(audio) == 0 {
|
|
return nil, "", fmt.Errorf("audio is empty")
|
|
}
|
|
filename := filepath.Base(strings.TrimSpace(in.Filename))
|
|
if filename == "." || filename == "/" || filename == "" {
|
|
filename = "audio.mp3"
|
|
}
|
|
return audio, filename, nil
|
|
}
|
|
|
|
func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) {
|
|
body := &bytes.Buffer{}
|
|
mw := multipart.NewWriter(body)
|
|
fw, err := mw.CreateFormFile("file", filename)
|
|
if err != nil {
|
|
return nil, 0, fmt.Errorf("create form file: %w", err)
|
|
}
|
|
if _, err := fw.Write(audio); err != nil {
|
|
return nil, 0, fmt.Errorf("copy audio: %w", err)
|
|
}
|
|
if in.Language != "" {
|
|
_ = mw.WriteField("language", in.Language)
|
|
}
|
|
if in.Diarize {
|
|
_ = mw.WriteField("diarize", "true")
|
|
if in.MinSpeakers > 0 {
|
|
_ = mw.WriteField("min_speakers", fmt.Sprintf("%d", in.MinSpeakers))
|
|
}
|
|
if in.MaxSpeakers > 0 {
|
|
_ = mw.WriteField("max_speakers", fmt.Sprintf("%d", in.MaxSpeakers))
|
|
}
|
|
} else {
|
|
_ = mw.WriteField("diarize", "false")
|
|
}
|
|
if err := mw.Close(); err != nil {
|
|
return nil, 0, fmt.Errorf("close form: %w", err)
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/transcribe", body)
|
|
if err != nil {
|
|
return nil, 0, fmt.Errorf("whisperx request: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", mw.FormDataContentType())
|
|
|
|
start := time.Now()
|
|
resp, err := c.http.Do(req)
|
|
duration := time.Since(start)
|
|
if err != nil {
|
|
return nil, duration, fmt.Errorf("whisperx do: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode >= 300 {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
|
|
return nil, duration, fmt.Errorf("whisperx HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body)))
|
|
}
|
|
var out whisperResponse
|
|
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
|
return nil, duration, fmt.Errorf("whisperx decode: %w", err)
|
|
}
|
|
return &out, duration, nil
|
|
}
|