From 8a2fef5a2409d68b622bb7e8faa55085d31d423c Mon Sep 17 00:00:00 2001 From: Grendgi Date: Mon, 8 Jun 2026 23:09:02 +0300 Subject: [PATCH] Pad audio before WhisperX transcription --- Dockerfile | 2 +- cmd/worker/main.go | 3 +- internal/config/config.go | 36 ++++---- internal/transcription/client.go | 119 ++++++++++++++++++++++++-- internal/transcription/client_test.go | 28 ++++++ k8s/configmap.yaml | 2 + 6 files changed, 166 insertions(+), 24 deletions(-) create mode 100644 internal/transcription/client_test.go diff --git a/Dockerfile b/Dockerfile index a6004f1..eadeca2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ RUN CGO_ENABLED=0 GOOS=linux go build -o /out/ai-service ./cmd/server \ FROM alpine:3.22 -RUN apk add --no-cache ca-certificates tini +RUN apk add --no-cache ca-certificates ffmpeg tini WORKDIR /app COPY --from=builder /out/ai-service /usr/local/bin/ai-service diff --git a/cmd/worker/main.go b/cmd/worker/main.go index abff1de..3b4da2b 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -42,13 +42,14 @@ func main() { } llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout) - transcriber := transcription.New(cfg.WhisperXURL, cfg.WhisperXTimeout) + transcriber := transcription.New(cfg.WhisperXURL, cfg.WhisperXTimeout, cfg.FfmpegPath, cfg.WhisperXLeadSilence) w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerTaskTypes, cfg.WorkerModelProfiles, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit) slog.Info("ai_worker_started", "worker_id", cfg.WorkerID, "model", cfg.LLMModel, "whisperx_enabled", transcriber != nil, + "whisperx_lead_silence", cfg.WhisperXLeadSilence.String(), "task_types", cfg.WorkerTaskTypes, "model_profiles", cfg.WorkerModelProfiles, "poll_interval", cfg.WorkerPollInterval.String(), diff --git a/internal/config/config.go b/internal/config/config.go index d56da79..e337231 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -14,14 +14,16 @@ type Config struct { MigrateOnStart bool APIAuthToken string - LLMBaseURL string - LLMAPIKey string - LLMModel string - LLMTimeout time.Duration - WhisperXURL string - WhisperXTimeout time.Duration - AIStatsSidecarURL string - AIStatsTimeout time.Duration + LLMBaseURL string + LLMAPIKey string + LLMModel string + LLMTimeout time.Duration + WhisperXURL string + WhisperXTimeout time.Duration + WhisperXLeadSilence time.Duration + FfmpegPath string + AIStatsSidecarURL string + AIStatsTimeout time.Duration WorkerID string WorkerPollInterval time.Duration @@ -39,14 +41,16 @@ func Load() Config { MigrateOnStart: envBool("MIGRATE_ON_START", true), APIAuthToken: envString("AI_SERVICE_TOKEN", ""), - LLMBaseURL: envString("LLM_BASE_URL", ""), - LLMAPIKey: envString("LLM_API_KEY", ""), - LLMModel: envString("LLM_MODEL", "qwen2.5-14b"), - LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute), - WhisperXURL: envString("WHISPERX_URL", ""), - WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute), - AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""), - AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second), + LLMBaseURL: envString("LLM_BASE_URL", ""), + LLMAPIKey: envString("LLM_API_KEY", ""), + LLMModel: envString("LLM_MODEL", "qwen2.5-14b"), + LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute), + WhisperXURL: envString("WHISPERX_URL", ""), + WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute), + WhisperXLeadSilence: envDuration("WHISPERX_LEAD_SILENCE", 800*time.Millisecond), + FfmpegPath: envString("FFMPEG_PATH", "/usr/bin/ffmpeg"), + AIStatsSidecarURL: envString("AI_STATS_SIDECAR_URL", ""), + AIStatsTimeout: envDuration("AI_STATS_TIMEOUT", 8*time.Second), WorkerID: envString("WORKER_ID", hostname()), WorkerPollInterval: envDuration("WORKER_POLL_INTERVAL", 2*time.Second), diff --git a/internal/transcription/client.go b/internal/transcription/client.go index 5770093..de18059 100644 --- a/internal/transcription/client.go +++ b/internal/transcription/client.go @@ -8,14 +8,18 @@ import ( "io" "mime/multipart" "net/http" + "os" + "os/exec" "path/filepath" "strings" "time" ) type Client struct { - baseURL string - http *http.Client + baseURL string + http *http.Client + ffmpegPath string + leadSilence time.Duration } type Input struct { @@ -49,7 +53,7 @@ type whisperResponse struct { AlignError *string `json:"align_error,omitempty"` } -func New(baseURL string, timeout time.Duration) *Client { +func New(baseURL string, timeout time.Duration, ffmpegPath string, leadSilence time.Duration) *Client { baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/") if baseURL == "" { return nil @@ -57,9 +61,21 @@ func New(baseURL string, timeout time.Duration) *Client { if timeout <= 0 { timeout = 10 * time.Minute } + if leadSilence < 0 { + leadSilence = 0 + } + if leadSilence > 5*time.Second { + leadSilence = 5 * time.Second + } + ffmpegPath = strings.TrimSpace(ffmpegPath) + if ffmpegPath == "" { + ffmpegPath = "ffmpeg" + } return &Client{ - baseURL: baseURL, - http: &http.Client{Timeout: timeout}, + baseURL: baseURL, + http: &http.Client{Timeout: timeout}, + ffmpegPath: ffmpegPath, + leadSilence: leadSilence, } } @@ -74,13 +90,20 @@ func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) { if err != nil { return nil, err } + if c.leadSilence > 0 { + audio, filename, err = c.addLeadSilence(ctx, audio, filename) + if err != nil { + return nil, err + } + } resp, duration, err := c.transcribeAudio(ctx, audio, filename, in) if err != nil { return nil, err } + segments := adjustLeadSilence(resp.Segments, c.leadSilence) return &Result{ Language: resp.Language, - Segments: resp.Segments, + Segments: segments, DiarizeError: resp.DiarizeError, AlignError: resp.AlignError, DurationMS: duration.Milliseconds(), @@ -115,6 +138,90 @@ func (c *Client) downloadAudio(ctx context.Context, in Input) ([]byte, string, e return audio, filename, nil } +func (c *Client) addLeadSilence(ctx context.Context, audio []byte, filename string) ([]byte, string, error) { + tmpDir, err := os.MkdirTemp("", "ai-transcribe-*") + if err != nil { + return nil, "", fmt.Errorf("prepare audio temp dir: %w", err) + } + defer os.RemoveAll(tmpDir) + + inputPath := filepath.Join(tmpDir, "input"+safeExt(filename)) + outputPath := filepath.Join(tmpDir, "padded.mp3") + if err := os.WriteFile(inputPath, audio, 0o600); err != nil { + return nil, "", fmt.Errorf("write audio temp file: %w", err) + } + delayMS := int(c.leadSilence.Milliseconds()) + if delayMS <= 0 { + return audio, filename, nil + } + cmd := exec.CommandContext(ctx, c.ffmpegPath, + "-nostdin", "-y", + "-i", inputPath, + "-af", fmt.Sprintf("adelay=%d:all=1", delayMS), + "-codec:a", "libmp3lame", + "-qscale:a", "5", + outputPath, + ) + out, err := cmd.CombinedOutput() + if err != nil { + return nil, "", fmt.Errorf("ffmpeg lead silence: %w (%s)", err, trimOutput(out)) + } + padded, err := os.ReadFile(outputPath) + if err != nil { + return nil, "", fmt.Errorf("read padded audio: %w", err) + } + if len(padded) == 0 { + return nil, "", fmt.Errorf("padded audio is empty") + } + base := strings.TrimSuffix(filepath.Base(filename), filepath.Ext(filename)) + if base == "" || base == "." || base == "/" { + base = "audio" + } + return padded, base + "-padded.mp3", nil +} + +func safeExt(filename string) string { + ext := strings.ToLower(filepath.Ext(filename)) + switch ext { + case ".mp3", ".wav", ".m4a", ".ogg", ".opus", ".webm": + return ext + default: + return ".audio" + } +} + +func trimOutput(out []byte) string { + s := strings.TrimSpace(string(out)) + if len(s) > 600 { + return s[:600] + } + return s +} + +func adjustLeadSilence(segments []Segment, silence time.Duration) []Segment { + if len(segments) == 0 || silence <= 0 { + return segments + } + shift := silence.Seconds() + out := make([]Segment, 0, len(segments)) + for _, segment := range segments { + segment.Start = clampTime(segment.Start - shift) + segment.End = clampTime(segment.End - shift) + if segment.End < segment.Start { + segment.End = segment.Start + } + out = append(out, segment) + } + return out +} + +func clampTime(v float64) float64 { + if v < 0 { + return 0 + } + return v +} + func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) { body := &bytes.Buffer{} mw := multipart.NewWriter(body) diff --git a/internal/transcription/client_test.go b/internal/transcription/client_test.go new file mode 100644 index 0000000..fb31b72 --- /dev/null +++ b/internal/transcription/client_test.go @@ -0,0 +1,28 @@ +package transcription + +import ( + "math" + "testing" + "time" +) + +func TestAdjustLeadSilence(t *testing.T) { + got := adjustLeadSilence([]Segment{ + {Start: 0.2, End: 1.1, Text: "first"}, + {Start: 1.4, End: 2.0, Text: "second"}, + }, 800*time.Millisecond) + + if got[0].Start != 0 { + t.Fatalf("first start = %v, want 0", got[0].Start) + } + if !near(got[0].End, 0.3) { + t.Fatalf("first end = %v, want 0.3", got[0].End) + } + if !near(got[1].Start, 0.6) { + t.Fatalf("second start = %v, want 0.6", got[1].Start) + } +} + +func near(got, want float64) bool { + return math.Abs(got-want) < 0.000001 +} diff --git a/k8s/configmap.yaml b/k8s/configmap.yaml index b4b32c6..31494f1 100644 --- a/k8s/configmap.yaml +++ b/k8s/configmap.yaml @@ -13,6 +13,8 @@ data: LLM_TIMEOUT: "5m" WHISPERX_URL: "http://10.2.3.5:8001" WHISPERX_TIMEOUT: "10m" + WHISPERX_LEAD_SILENCE: "800ms" + FFMPEG_PATH: "/usr/bin/ffmpeg" AI_STATS_SIDECAR_URL: "http://10.2.3.5:9090" AI_STATS_TIMEOUT: "8s" WORKER_POLL_INTERVAL: "2s"