Pad audio before WhisperX transcription
All checks were successful
CI / test (push) Successful in 16s
Build and Deploy / build-and-deploy (push) Successful in 28s

This commit is contained in:
Grendgi
2026-06-08 23:09:02 +03:00
parent 16ff6ecb5e
commit 8a2fef5a24
6 changed files with 166 additions and 24 deletions

View File

@@ -13,7 +13,7 @@ RUN CGO_ENABLED=0 GOOS=linux go build -o /out/ai-service ./cmd/server \
FROM alpine:3.22
RUN apk add --no-cache ca-certificates tini
RUN apk add --no-cache ca-certificates ffmpeg tini
WORKDIR /app
COPY --from=builder /out/ai-service /usr/local/bin/ai-service

View File

@@ -42,13 +42,14 @@ func main() {
}
llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
transcriber := transcription.New(cfg.WhisperXURL, cfg.WhisperXTimeout)
transcriber := transcription.New(cfg.WhisperXURL, cfg.WhisperXTimeout, cfg.FfmpegPath, cfg.WhisperXLeadSilence)
w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
slog.Info("ai_worker_started",
"worker_id", cfg.WorkerID,
"model", cfg.LLMModel,
"whisperx_enabled", transcriber != nil,
"whisperx_lead_silence", cfg.WhisperXLeadSilence.String(),
"task_types", cfg.WorkerTaskTypes,
"model_profiles", cfg.WorkerModelProfiles,
"poll_interval", cfg.WorkerPollInterval.String(),

View File

@@ -14,14 +14,16 @@ type Config struct {
MigrateOnStart bool
APIAuthToken string
LLMBaseURL string
LLMAPIKey string
LLMModel string
LLMTimeout time.Duration
WhisperXURL string
WhisperXTimeout time.Duration
AIStatsSidecarURL string
AIStatsTimeout time.Duration
LLMBaseURL string
LLMAPIKey string
LLMModel string
LLMTimeout time.Duration
WhisperXURL string
WhisperXTimeout time.Duration
WhisperXLeadSilence time.Duration
FfmpegPath string
AIStatsSidecarURL string
AIStatsTimeout time.Duration
WorkerID string
WorkerPollInterval time.Duration
@@ -39,14 +41,16 @@ func Load() Config {
MigrateOnStart: envBool("MIGRATE_ON_START", true),
APIAuthToken: envString("AI_SERVICE_TOKEN", ""),
LLMBaseURL: envString("LLM_BASE_URL", ""),
LLMAPIKey: envString("LLM_API_KEY", ""),
LLMModel: envString("LLM_MODEL", "qwen2.5-14b"),
LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute),
WhisperXURL: envString("WHISPERX_URL", ""),
WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""),
AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second),
LLMBaseURL: envString("LLM_BASE_URL", ""),
LLMAPIKey: envString("LLM_API_KEY", ""),
LLMModel: envString("LLM_MODEL", "qwen2.5-14b"),
LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute),
WhisperXURL: envString("WHISPERX_URL", ""),
WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
WhisperXLeadSilence: envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond),
FfmpegPath: envString("FFMPEG_PATH", "/usr/bin/ffmpeg"),
AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""),
AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second),
WorkerID: envString("WORKER_ID", hostname()),
WorkerPollInterval: envDuration("WORKER_POLL_INTERVAL", 2*time.Second),

View File

@@ -8,14 +8,18 @@ import (
"io"
"mime/multipart"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
type Client struct {
baseURL string
http *http.Client
baseURL string
http *http.Client
ffmpegPath string
leadSilence time.Duration
}
type Input struct {
@@ -49,7 +53,7 @@ type whisperResponse struct {
AlignError *string `json:"align_error,omitempty"`
}
func New(baseURL string, timeout time.Duration) *Client {
func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client {
baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
if baseURL == "" {
return nil
@@ -57,9 +61,21 @@ func New(baseURL string, timeout time.Duration) *Client {
if timeout <= 0 {
timeout = 10 * time.Minute
}
if leadSilence < 0 {
leadSilence = 0
}
if leadSilence > 5*time.Second {
leadSilence = 5 * time.Second
}
ffmpegPath = strings.TrimSpace(ffmpegPath)
if ffmpegPath == "" {
ffmpegPath = "ffmpeg"
}
return &Client{
baseURL: baseURL,
http: &http.Client{Timeout: timeout},
baseURL: baseURL,
http: &http.Client{Timeout: timeout},
ffmpegPath: ffmpegPath,
leadSilence: leadSilence,
}
}
@@ -74,13 +90,20 @@ func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
if err != nil {
return nil, err
}
if c.leadSilence > 0 {
audio, filename, err = c.addLeadSilence(ctx, audio, filename)
if err != nil {
return nil, err
}
}
resp, duration, err := c.transcribeAudio(ctx, audio, filename, in)
if err != nil {
return nil, err
}
segments := adjustLeadSilence(resp.Segments, c.leadSilence)
return &Result{
Language: resp.Language,
Segments: resp.Segments,
Segments: segments,
DiarizeError: resp.DiarizeError,
AlignError: resp.AlignError,
DurationMS: duration.Milliseconds(),
@@ -115,6 +138,90 @@ func (c *Client) downloadAudio(ctx context.Context, in Input) ([]byte, string, e
return audio, filename, nil
}
// addLeadSilence prepends c.leadSilence of silence to audio by running ffmpeg
// with the adelay filter and re-encoding the result as MP3.
//
// The input bytes are written into a private temporary directory which is
// removed before the function returns. On success it returns the padded audio
// together with a filename of the form "<base>-padded.mp3", where <base> is
// the original filename without directory or extension ("audio" when nothing
// usable remains).
//
// When the configured silence is shorter than one whole millisecond the
// original audio and filename are returned unchanged and nothing is written
// to disk.
//
// Errors from temp-file handling or from ffmpeg are wrapped with context; the
// ffmpeg error also carries a trimmed copy of the command's combined output.
func (c *Client) addLeadSilence(ctx context.Context, audio []byte, filename string) ([]byte, string, error) {
	// adelay is given whole milliseconds. Decide whether there is anything to
	// do before touching the filesystem, so a no-op cannot fail on temp I/O.
	delayMS := int(c.leadSilence.Milliseconds())
	if delayMS <= 0 {
		return audio, filename, nil
	}

	tmpDir, err := os.MkdirTemp("", "ai-transcribe-*")
	if err != nil {
		return nil, "", fmt.Errorf("prepare audio temp dir: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	// The on-disk names are fixed; only a whitelisted extension is taken from
	// the caller-supplied filename.
	inputPath := filepath.Join(tmpDir, "input"+safeExt(filename))
	outputPath := filepath.Join(tmpDir, "padded.mp3")
	if err := os.WriteFile(inputPath, audio, 0o600); err != nil {
		return nil, "", fmt.Errorf("write audio temp file: %w", err)
	}

	// CommandContext ties the ffmpeg process lifetime to ctx.
	cmd := exec.CommandContext(ctx, c.ffmpegPath,
		"-nostdin", "-y",
		"-i", inputPath,
		"-af", fmt.Sprintf("adelay=%d:all=1", delayMS),
		"-codec:a", "libmp3lame",
		"-qscale:a", "5",
		outputPath,
	)
	out, err := cmd.CombinedOutput()
	if err != nil {
		return nil, "", fmt.Errorf("ffmpeg lead silence: %w (%s)", err, trimOutput(out))
	}

	padded, err := os.ReadFile(outputPath)
	if err != nil {
		return nil, "", fmt.Errorf("read padded audio: %w", err)
	}
	if len(padded) == 0 {
		return nil, "", fmt.Errorf("padded audio is empty")
	}

	base := strings.TrimSuffix(filepath.Base(filename), filepath.Ext(filename))
	if base == "" || base == "." || base == "/" {
		base = "audio"
	}
	return padded, base + "-padded.mp3", nil
}
// safeExt returns the lower-cased extension of filename when it is one of the
// recognised audio extensions, and the placeholder ".audio" for anything else
// (including a missing extension).
func safeExt(filename string) string {
	candidate := strings.ToLower(filepath.Ext(filename))
	for _, known := range [...]string{".mp3", ".wav", ".m4a", ".ogg", ".opus", ".webm"} {
		if candidate == known {
			return candidate
		}
	}
	return ".audio"
}
// trimOutput converts command output to a string with surrounding whitespace
// removed, capped at 600 bytes so it can be embedded in an error message.
//
// If the cap would fall inside a multi-byte UTF-8 sequence, the cut is moved
// back to the start of that sequence so the result does not end in a partial
// rune.
func trimOutput(out []byte) string {
	const maxLen = 600

	s := strings.TrimSpace(string(out))
	if len(s) <= maxLen {
		return s
	}

	cut := maxLen
	// UTF-8 continuation bytes have the form 10xxxxxx. A rune has at most
	// three of them, so never step back further than that.
	for cut > maxLen-3 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut]
}
// adjustLeadSilence shifts segment timestamps back by silence.
//
// Each segment is copied into a new slice with silence (in seconds)
// subtracted from Start and End; values that would go negative become 0, and
// End is never left smaller than Start. When there are no segments or silence
// is not positive, the input slice is returned as is.
func adjustLeadSilence(segments []Segment, silence time.Duration) []Segment {
	if silence <= 0 || len(segments) == 0 {
		return segments
	}

	offset := silence.Seconds()
	shifted := make([]Segment, len(segments))
	for i, seg := range segments {
		start := clampTime(seg.Start - offset)
		end := clampTime(seg.End - offset)
		if end < start {
			end = start
		}
		seg.Start, seg.End = start, end
		shifted[i] = seg
	}
	return shifted
}
// clampTime returns 0 for negative values and v itself otherwise.
func clampTime(v float64) float64 {
	switch {
	case v < 0:
		return 0
	default:
		return v
	}
}
func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) {
body := &bytes.Buffer{}
mw := multipart.NewWriter(body)

View File

@@ -0,0 +1,28 @@
package transcription
import (
"math"
"testing"
"time"
)
func TestAdjustLeadSilence(t *testing.T) {
got := adjustLeadSilence([]Segment{
{Start: 0.2, End: 1.1, Text: "first"},
{Start: 1.4, End: 2.0, Text: "second"},
}, 800*time.Millisecond)
if got[0].Start != 0 {
t.Fatalf("first start = %v, want 0", got[0].Start)
}
if !near(got[0].End, 0.3) {
t.Fatalf("first end = %v, want 0.3", got[0].End)
}
if !near(got[1].Start, 0.6) {
t.Fatalf("second start = %v, want 0.6", got[1].Start)
}
}
// near reports whether got and want differ by less than one millionth, which
// lets tests compare float timestamps without exact equality.
func near(got, want float64) bool {
	const tolerance = 0.000001
	delta := math.Abs(got - want)
	return delta < tolerance
}

View File

@@ -13,6 +13,8 @@ data:
LLM_TIMEOUT: "5m"
WHISPERX_URL: "http://10.2.3.5:8001"
WHISPERX_TIMEOUT: "10m"
WHISPERX_LEAD_SILENCE: "800ms"
FFMPEG_PATH: "/usr/bin/ffmpeg"
AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090"
AI_STATS_TIMEOUT: "8s"
WORKER_POLL_INTERVAL: "2s"