Add transcription jobs to AI service
All checks were successful
CI / test (push) Successful in 15s
Build and Deploy / build-and-deploy (push) Successful in 25s

This commit is contained in:
Grendgi
2026-06-08 15:39:26 +03:00
parent e9792274a4
commit 17cca2a99a
6 changed files with 252 additions and 14 deletions

View File

@@ -11,6 +11,7 @@ import (
"ai-service/internal/llm" "ai-service/internal/llm"
"ai-service/internal/migrate" "ai-service/internal/migrate"
"ai-service/internal/store" "ai-service/internal/store"
"ai-service/internal/transcription"
"ai-service/internal/worker" "ai-service/internal/worker"
) )
@@ -41,11 +42,13 @@ func main() {
} }
llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout) llmClient := llm.New(cfg.LLMBaseURL, cfg.LLMAPIKey, cfg.LLMModel, cfg.LLMTimeout)
w := worker.New(db, llmClient, cfg.WorkerID, cfg.LLMModel, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit) transcriber := transcription.New(cfg.WhisperXURL, cfg.WhisperXTimeout)
w := worker.New(db, llmClient, transcriber, cfg.WorkerID, cfg.LLMModel, cfg.WorkerPollInterval, cfg.WorkerLeaseTimeout, cfg.WorkerClaimLimit)
slog.Info("ai_worker_started", slog.Info("ai_worker_started",
"worker_id", cfg.WorkerID, "worker_id", cfg.WorkerID,
"model", cfg.LLMModel, "model", cfg.LLMModel,
"whisperx_enabled", transcriber != nil,
"poll_interval", cfg.WorkerPollInterval.String(), "poll_interval", cfg.WorkerPollInterval.String(),
"lease_timeout", cfg.WorkerLeaseTimeout.String(), "lease_timeout", cfg.WorkerLeaseTimeout.String(),
"claim_limit", cfg.WorkerClaimLimit, "claim_limit", cfg.WorkerClaimLimit,

View File

@@ -13,11 +13,12 @@ type Config struct {
MigrateOnStart bool MigrateOnStart bool
APIAuthToken string APIAuthToken string
LLMBaseURL string LLMBaseURL string
LLMAPIKey string LLMAPIKey string
LLMModel string LLMModel string
LLMTimeout time.Duration LLMTimeout time.Duration
WhisperXURL string WhisperXURL string
WhisperXTimeout time.Duration
WorkerID string WorkerID string
WorkerPollInterval time.Duration WorkerPollInterval time.Duration
@@ -33,11 +34,12 @@ func Load() Config {
MigrateOnStart: envBool("MIGRATE_ON_START", true), MigrateOnStart: envBool("MIGRATE_ON_START", true),
APIAuthToken: envString("AI_SERVICE_TOKEN", ""), APIAuthToken: envString("AI_SERVICE_TOKEN", ""),
LLMBaseURL: envString("LLM_BASE_URL", ""), LLMBaseURL: envString("LLM_BASE_URL", ""),
LLMAPIKey: envString("LLM_API_KEY", ""), LLMAPIKey: envString("LLM_API_KEY", ""),
LLMModel: envString("LLM_MODEL", "qwen2.5-14b"), LLMModel: envString("LLM_MODEL", "qwen2.5-14b"),
LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute), LLMTimeout: envDuration("LLM_TIMEOUT", 5*time.Minute),
WhisperXURL: envString("WHISPERX_URL", ""), WhisperXURL: envString("WHISPERX_URL", ""),
WhisperXTimeout: envDuration("WHISPERX_TIMEOUT", 10*time.Minute),
WorkerID: envString("WORKER_ID", hostname()), WorkerID: envString("WORKER_ID", hostname()),
WorkerPollInterval: envDuration("WORKER_POLL_INTERVAL", 2*time.Second), WorkerPollInterval: envDuration("WORKER_POLL_INTERVAL", 2*time.Second),

View File

@@ -0,0 +1,168 @@
package transcription
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"path/filepath"
"strings"
"time"
)
// Client talks to a WhisperX HTTP service. It first downloads the audio
// referenced by a job and then uploads it to the service for transcription.
// Construct it with New; a nil *Client is treated as "not configured" by
// Transcribe.
type Client struct {
	// baseURL is the service root. New stores it with surrounding whitespace
	// and trailing slashes removed, so paths can be appended directly.
	baseURL string
	// http is shared by the audio download and the transcription request; its
	// Timeout therefore bounds each of those calls individually.
	http *http.Client
}
// Input is the JSON payload describing one transcription request.
type Input struct {
	// AudioURL is fetched with an HTTP GET to obtain the audio bytes. It is
	// required: Transcribe rejects a blank value.
	AudioURL string `json:"audio_url"`
	// Filename supplies the multipart file name sent to WhisperX. Only its base
	// name is used, and "audio.mp3" is substituted when it is empty.
	Filename string `json:"filename,omitempty"`
	// Language is forwarded as the "language" form field when non-empty.
	Language string `json:"language,omitempty"`
	// Diarize is always forwarded, as "diarize" = "true" or "false".
	Diarize bool `json:"diarize"`
	// MinSpeakers and MaxSpeakers are forwarded only when Diarize is true and
	// the value is greater than zero.
	MinSpeakers int `json:"min_speakers,omitempty"`
	MaxSpeakers int `json:"max_speakers,omitempty"`
}
// Segment is one timed piece of transcribed text as returned by WhisperX.
type Segment struct {
	// Start and End bound the segment in the audio.
	// NOTE(review): presumably offsets in seconds — confirm against the
	// WhisperX service contract.
	Start float64 `json:"start"`
	End   float64 `json:"end"`
	// Text is the transcribed content of the segment.
	Text string `json:"text"`
	// Speaker is left out of the JSON encoding when empty.
	Speaker string `json:"speaker,omitempty"`
}
// Result is what Transcribe returns: the fields of the WhisperX response
// copied through unchanged, plus the measured request duration.
type Result struct {
	Language string    `json:"language"`
	Segments []Segment `json:"segments"`
	// DiarizeError and AlignError are passed through from the WhisperX
	// response; they are nil when the response did not include them.
	DiarizeError *string `json:"diarize_error,omitempty"`
	AlignError   *string `json:"align_error,omitempty"`
	// DurationMS is the time, in milliseconds, measured around the HTTP call to
	// the WhisperX endpoint. The audio download is not included.
	DurationMS int64 `json:"duration_ms"`
}
// whisperResponse is the JSON body decoded from a successful response of the
// WhisperX /transcribe endpoint.
type whisperResponse struct {
	Language string    `json:"language"`
	Segments []Segment `json:"segments"`
	// Optional error strings; they stay nil when absent from the response.
	DiarizeError *string `json:"diarize_error,omitempty"`
	AlignError   *string `json:"align_error,omitempty"`
}
// New builds a Client for the WhisperX service rooted at baseURL.
//
// Surrounding whitespace and trailing slashes are stripped from baseURL. When
// nothing is left, New returns nil, which callers can use to detect that
// transcription is not configured. A timeout that is zero or negative is
// replaced by 10 minutes; the value is installed as the http.Client timeout.
func New(baseURL string, timeout time.Duration) *Client {
	endpoint := strings.TrimSpace(baseURL)
	endpoint = strings.TrimRight(endpoint, "/")
	if endpoint == "" {
		return nil
	}

	// Start from the default timeout and override it only with a usable value.
	client := &Client{
		baseURL: endpoint,
		http:    &http.Client{Timeout: 10 * time.Minute},
	}
	if timeout > 0 {
		client.http.Timeout = timeout
	}
	return client
}
// Transcribe downloads the audio referenced by in.AudioURL, submits it to
// WhisperX and returns the decoded transcription.
//
// It fails with "whisperx not configured" when called on a nil or URL-less
// Client, and with "audio_url is required" when in.AudioURL is blank. Errors
// from the download or the transcription request are returned unchanged.
func (c *Client) Transcribe(ctx context.Context, in Input) (*Result, error) {
	switch {
	case c == nil || c.baseURL == "":
		return nil, fmt.Errorf("whisperx not configured")
	case strings.TrimSpace(in.AudioURL) == "":
		return nil, fmt.Errorf("audio_url is required")
	}

	payload, name, err := c.downloadAudio(ctx, in)
	if err != nil {
		return nil, err
	}

	decoded, elapsed, err := c.transcribeAudio(ctx, payload, name, in)
	if err != nil {
		return nil, err
	}

	// Copy the service response through and attach the measured duration.
	out := new(Result)
	out.Language = decoded.Language
	out.Segments = decoded.Segments
	out.DiarizeError = decoded.DiarizeError
	out.AlignError = decoded.AlignError
	out.DurationMS = elapsed.Milliseconds()
	return out, nil
}
// downloadAudio fetches the audio at in.AudioURL and returns its bytes together
// with the file name to use when uploading it.
//
// The URL is trimmed of surrounding whitespace before the request is built,
// matching the trimmed check done by Transcribe. Any status of 300 or above is
// reported as "audio HTTP <code>" with up to 4 KiB of the response body. An
// empty body is an error, and so is a body larger than 512 MiB: oversized
// audio is rejected instead of being silently cut off and transcribed as if
// it were complete.
//
// The returned file name is the base name of in.Filename, or "audio.mp3" when
// that yields nothing usable.
func (c *Client) downloadAudio(ctx context.Context, in Input) ([]byte, string, error) {
	// Upper bound on the audio held in memory for one job.
	const maxAudioBytes = 512 << 20

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimSpace(in.AudioURL), nil)
	if err != nil {
		return nil, "", fmt.Errorf("audio request: %w", err)
	}
	resp, err := c.http.Do(req)
	if err != nil {
		return nil, "", fmt.Errorf("audio download: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best effort: the snippet only enriches the error message.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return nil, "", fmt.Errorf("audio HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(snippet)))
	}

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit.
	audio, err := io.ReadAll(io.LimitReader(resp.Body, maxAudioBytes+1))
	if err != nil {
		return nil, "", fmt.Errorf("audio read: %w", err)
	}
	if len(audio) == 0 {
		return nil, "", fmt.Errorf("audio is empty")
	}
	if len(audio) > maxAudioBytes {
		return nil, "", fmt.Errorf("audio exceeds %d bytes", maxAudioBytes)
	}

	// filepath.Base returns "." for an empty path and "/" for a bare root.
	filename := filepath.Base(strings.TrimSpace(in.Filename))
	if filename == "." || filename == "/" || filename == "" {
		filename = "audio.mp3"
	}
	return audio, filename, nil
}
// transcribeAudio uploads audio to the WhisperX /transcribe endpoint as a
// multipart form and decodes the JSON response.
//
// The form carries the audio as "file" (named filename), "language" when
// in.Language is set, and "diarize" as "true" or "false". "min_speakers" and
// "max_speakers" are added only when diarization is on and the value is
// positive.
//
// The returned duration is measured around the HTTP call and is reported even
// when that call fails; it is zero for errors raised while building the
// request. A status of 300 or above becomes "whisperx HTTP <code>" with up to
// 4 KiB of the response body.
func (c *Client) transcribeAudio(ctx context.Context, audio []byte, filename string, in Input) (*whisperResponse, time.Duration, error) {
	// Pre-size the buffer: the form is the audio plus a small amount of
	// framing, so this avoids repeated growth copies of a large payload.
	body := bytes.NewBuffer(make([]byte, 0, len(audio)+1024))
	mw := multipart.NewWriter(body)

	fw, err := mw.CreateFormFile("file", filename)
	if err != nil {
		return nil, 0, fmt.Errorf("create form file: %w", err)
	}
	if _, err := fw.Write(audio); err != nil {
		return nil, 0, fmt.Errorf("copy audio: %w", err)
	}

	// Collect the text fields in the order they are sent, then write them in
	// one place so that no write error is dropped.
	type formField struct{ name, value string }
	var fields []formField
	if in.Language != "" {
		fields = append(fields, formField{"language", in.Language})
	}
	if in.Diarize {
		fields = append(fields, formField{"diarize", "true"})
		if in.MinSpeakers > 0 {
			fields = append(fields, formField{"min_speakers", fmt.Sprintf("%d", in.MinSpeakers)})
		}
		if in.MaxSpeakers > 0 {
			fields = append(fields, formField{"max_speakers", fmt.Sprintf("%d", in.MaxSpeakers)})
		}
	} else {
		fields = append(fields, formField{"diarize", "false"})
	}
	for _, f := range fields {
		if err := mw.WriteField(f.name, f.value); err != nil {
			return nil, 0, fmt.Errorf("write form field %q: %w", f.name, err)
		}
	}

	// Close writes the trailing boundary; the form is incomplete without it.
	if err := mw.Close(); err != nil {
		return nil, 0, fmt.Errorf("close form: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/transcribe", body)
	if err != nil {
		return nil, 0, fmt.Errorf("whisperx request: %w", err)
	}
	req.Header.Set("Content-Type", mw.FormDataContentType())

	start := time.Now()
	resp, err := c.http.Do(req)
	duration := time.Since(start)
	if err != nil {
		return nil, duration, fmt.Errorf("whisperx do: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best effort: the snippet only enriches the error message.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return nil, duration, fmt.Errorf("whisperx HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(snippet)))
	}

	var out whisperResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		return nil, duration, fmt.Errorf("whisperx decode: %w", err)
	}
	return &out, duration, nil
}

View File

@@ -10,17 +10,22 @@ import (
"ai-service/internal/llm" "ai-service/internal/llm"
"ai-service/internal/model" "ai-service/internal/model"
"ai-service/internal/store" "ai-service/internal/store"
"ai-service/internal/transcription"
) )
const ( const (
TaskLLMChat = "llm_chat" TaskLLMChat = "llm_chat"
TaskChatCompletion = "chat_completion" TaskChatCompletion = "chat_completion"
TaskCallAnalysis = "call_analysis" TaskCallAnalysis = "call_analysis"
TaskTranscription = "transcription"
TranscriptionProfile = "whisperx"
) )
type Worker struct { type Worker struct {
store *store.Store store *store.Store
llm *llm.Client llm *llm.Client
transcriber *transcription.Client
workerID string workerID string
modelProfile string modelProfile string
pollInterval time.Duration pollInterval time.Duration
@@ -28,7 +33,7 @@ type Worker struct {
leaseTimeout time.Duration leaseTimeout time.Duration
} }
func New(store *store.Store, llmClient *llm.Client, workerID, modelProfile string, pollInterval, leaseTimeout time.Duration, claimLimit int) *Worker { func New(store *store.Store, llmClient *llm.Client, transcriber *transcription.Client, workerID, modelProfile string, pollInterval, leaseTimeout time.Duration, claimLimit int) *Worker {
if pollInterval <= 0 { if pollInterval <= 0 {
pollInterval = 2 * time.Second pollInterval = 2 * time.Second
} }
@@ -44,6 +49,7 @@ func New(store *store.Store, llmClient *llm.Client, workerID, modelProfile strin
return &Worker{ return &Worker{
store: store, store: store,
llm: llmClient, llm: llmClient,
transcriber: transcriber,
workerID: workerID, workerID: workerID,
modelProfile: modelProfile, modelProfile: modelProfile,
pollInterval: pollInterval, pollInterval: pollInterval,
@@ -73,8 +79,8 @@ func (w *Worker) tick(ctx context.Context) {
} }
jobs, err := w.store.ClaimJobs(ctx, model.ClaimJobs{ jobs, err := w.store.ClaimJobs(ctx, model.ClaimJobs{
WorkerID: w.workerID, WorkerID: w.workerID,
TaskTypes: []string{TaskLLMChat, TaskChatCompletion, TaskCallAnalysis}, TaskTypes: []string{TaskLLMChat, TaskChatCompletion, TaskCallAnalysis, TaskTranscription},
ModelProfiles: []string{w.modelProfile}, ModelProfiles: []string{w.modelProfile, TranscriptionProfile},
Limit: w.claimLimit, Limit: w.claimLimit,
}) })
if err != nil { if err != nil {
@@ -87,6 +93,10 @@ func (w *Worker) tick(ctx context.Context) {
} }
func (w *Worker) process(ctx context.Context, job *model.Job) { func (w *Worker) process(ctx context.Context, job *model.Job) {
if job.TaskType == TaskTranscription {
w.processTranscription(ctx, job)
return
}
var input llm.ChatInput var input llm.ChatInput
if err := json.Unmarshal(job.Input, &input); err != nil { if err := json.Unmarshal(job.Input, &input); err != nil {
w.fail(ctx, job, "bad_input", err.Error()) w.fail(ctx, job, "bad_input", err.Error())
@@ -107,12 +117,62 @@ func (w *Worker) process(ctx context.Context, job *model.Job) {
} }
} }
// processTranscription runs one transcription job from start to finish.
//
// The job is failed with "provider_unavailable" when the worker has no
// transcriber, with "bad_input" when the job input is not valid JSON for
// transcription.Input, with the code chosen by classifyTranscriptionError when
// transcription fails, and with "bad_response" when the result cannot be
// encoded. On success the encoded result is stored through CompleteJob; a
// failure to store it is only logged.
func (w *Worker) processTranscription(ctx context.Context, job *model.Job) {
	if w.transcriber == nil {
		w.fail(ctx, job, "provider_unavailable", "whisperx not configured")
		return
	}

	var req transcription.Input
	if decodeErr := json.Unmarshal(job.Input, &req); decodeErr != nil {
		w.fail(ctx, job, "bad_input", decodeErr.Error())
		return
	}

	transcript, runErr := w.transcriber.Transcribe(ctx, req)
	if runErr != nil {
		w.fail(ctx, job, classifyTranscriptionError(runErr), runErr.Error())
		return
	}

	encoded, encodeErr := json.Marshal(transcript)
	if encodeErr != nil {
		w.fail(ctx, job, "bad_response", encodeErr.Error())
		return
	}

	_, storeErr := w.store.CompleteJob(ctx, job.ID, model.CompleteJob{Result: encoded})
	if storeErr != nil {
		slog.Error("complete transcription job failed", "job_id", job.ID, "error", storeErr)
	}
}
func (w *Worker) fail(ctx context.Context, job *model.Job, code, message string) { func (w *Worker) fail(ctx context.Context, job *model.Job, code, message string) {
if _, err := w.store.FailJob(ctx, job.ID, model.FailJob{ErrorCode: code, ErrorMessage: message}); err != nil { if _, err := w.store.FailJob(ctx, job.ID, model.FailJob{ErrorCode: code, ErrorMessage: message}); err != nil {
slog.Error("fail job failed", "job_id", job.ID, "error", err) slog.Error("fail job failed", "job_id", job.ID, "error", err)
} }
} }
// classifyTranscriptionError maps a transcription failure to a job error code
// by looking for known fragments in the lower-cased error text.
//
// The rules are checked from top to bottom and the first one with a matching
// fragment wins, so their order is significant. A nil error, or one that
// matches no rule, yields "unknown".
func classifyTranscriptionError(err error) string {
	if err == nil {
		return "unknown"
	}
	msg := strings.ToLower(err.Error())

	rules := []struct {
		code      string
		fragments []string
	}{
		{"timeout", []string{"context deadline exceeded", "timeout"}},
		{"bad_input", []string{"audio_url is required"}},
		{"bad_audio", []string{"audio http 4", "audio is empty"}},
		{"storage_error", []string{"audio download", "audio http 5"}},
		{"bad_audio", []string{"whisperx http 4", "ffmpeg", "invalid data", "could not decode"}},
		{"provider_unavailable", []string{"whisperx do", "connection refused", "connection reset", "closed network connection"}},
		{"bad_response", []string{"decode"}},
	}
	for _, rule := range rules {
		for _, fragment := range rule.fragments {
			if strings.Contains(msg, fragment) {
				return rule.code
			}
		}
	}
	return "unknown"
}
func classifyLLMError(err error) string { func classifyLLMError(err error) string {
if err == nil { if err == nil {
return "unknown" return "unknown"

View File

@@ -12,6 +12,7 @@ data:
LLM_MODEL: "qwen2.5-14b" LLM_MODEL: "qwen2.5-14b"
LLM_TIMEOUT: "5m" LLM_TIMEOUT: "5m"
WHISPERX_URL: "http://10.2.3.5:8001" WHISPERX_URL: "http://10.2.3.5:8001"
WHISPERX_TIMEOUT: "10m"
WORKER_POLL_INTERVAL: "2s" WORKER_POLL_INTERVAL: "2s"
WORKER_CLAIM_LIMIT: "4" WORKER_CLAIM_LIMIT: "4"
WORKER_LEASE_TIMEOUT: "15m" WORKER_LEASE_TIMEOUT: "15m"

View File

@@ -14,6 +14,10 @@ spec:
app: ai-service-worker app: ai-service-worker
spec: spec:
terminationGracePeriodSeconds: 20 terminationGracePeriodSeconds: 20
hostAliases:
- ip: "77.105.173.42"
hostnames:
- "s3-minio.estateliga.work"
containers: containers:
- name: worker - name: worker
image: localhost:30300/admin/ai-service:latest image: localhost:30300/admin/ai-service:latest