From 3c124c5f5a7bbaec26b79ce27fa49a944a511475 Mon Sep 17 00:00:00 2001
From: Grendgi
Date: Wed, 17 Jun 2026 16:31:22 +0300
Subject: [PATCH] feat: expose ai service health detail

---
 internal/httpapi/health.go | 271 +++++++++++++++++++++++++++++++++++++
 internal/httpapi/server.go |   2 +
 2 files changed, 273 insertions(+)
 create mode 100644 internal/httpapi/health.go

diff --git a/internal/httpapi/health.go b/internal/httpapi/health.go
new file mode 100644
index 0000000..50fd634
--- /dev/null
+++ b/internal/httpapi/health.go
@@ -0,0 +1,271 @@
+package httpapi
+
+import (
+	"context"
+	"net/http"
+	"sort"
+	"strings"
+	"time"
+
+	"ai-service/internal/model"
+	"ai-service/internal/transcription"
+)
+
+// healthDetailResponse is the JSON body written by handleHealthDetail.
+type healthDetailResponse struct {
+	Status     string            `json:"status"`
+	Generated  time.Time         `json:"generated_at"`
+	Components []healthComponent `json:"components"`
+}
+
+// healthComponent reports one checked subsystem. Status is "healthy",
+// "degraded" or "unhealthy"; Message and Data are omitted when empty.
+type healthComponent struct {
+	Name    string         `json:"name"`
+	Status  string         `json:"status"`
+	Message string         `json:"message,omitempty"`
+	Data    map[string]any `json:"data,omitempty"`
+}
+
+// handleHealthDetail serves GET /health/detail.
+//
+// It pings the store and loads queue statistics first; if either call fails
+// the response is sent immediately with 503 and only the components gathered
+// so far. Otherwise every component is evaluated and the overall status is the
+// worst component status: 503 when "unhealthy", 200 otherwise.
+func (s *Server) handleHealthDetail(w http.ResponseWriter, r *http.Request) {
+	ctx, cancel := contextWithTimeout(r, 12*time.Second)
+	defer cancel()
+
+	resp := healthDetailResponse{
+		Status:    "healthy",
+		Generated: time.Now().UTC(),
+	}
+
+	if err := s.store.Ping(ctx); err != nil {
+		resp.Components = append(resp.Components, healthComponent{
+			Name:    "postgres",
+			Status:  "unhealthy",
+			Message: err.Error(),
+		})
+		resp.Status = worseHealthStatus(resp.Status, "unhealthy")
+		writeJSON(w, http.StatusServiceUnavailable, resp)
+		return
+	}
+	resp.Components = append(resp.Components, healthComponent{Name: "postgres", Status: "healthy"})
+
+	stats, err := s.store.Stats(ctx, s.cfg.WorkerLeaseTimeout)
+	if err != nil {
+		resp.Components = append(resp.Components, healthComponent{
+			Name:    "queue",
+			Status:  "unhealthy",
+			Message: err.Error(),
+		})
+		resp.Status = worseHealthStatus(resp.Status, "unhealthy")
+		writeJSON(w, http.StatusServiceUnavailable, resp)
+		return
+	}
+
+	for _, component := range []healthComponent{
+		s.healthProviders(ctx),
+		healthQueue(stats),
+		healthErrors(stats),
+		healthThroughput(stats),
+		healthInfra(loadInfraSnapshot(r, s.cfg)),
+	} {
+		resp.Components = append(resp.Components, component)
+		resp.Status = worseHealthStatus(resp.Status, component.Status)
+	}
+
+	statusCode := http.StatusOK
+	if resp.Status == "unhealthy" {
+		statusCode = http.StatusServiceUnavailable
+	}
+	writeJSON(w, statusCode, resp)
+}
+
+// healthProviders checks the LLM and audio providers and folds their results
+// into a single "providers" component: an unconfigured or stale provider
+// degrades the component, one that is not OK makes it unhealthy.
+func (s *Server) healthProviders(ctx context.Context) healthComponent {
+	providers := []providerStatus{
+		s.checkLLM(ctx),
+		s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
+	}
+	status := "healthy"
+	messages := make([]string, 0)
+	for _, provider := range providers {
+		switch {
+		case !provider.Configured:
+			status = worseHealthStatus(status, "degraded")
+			messages = append(messages, provider.Name+" not configured")
+		case !provider.OK:
+			status = worseHealthStatus(status, "unhealthy")
+			if provider.Error != "" {
+				messages = append(messages, provider.Name+": "+provider.Error)
+			} else {
+				messages = append(messages, provider.Name+" unavailable")
+			}
+		case provider.Stale:
+			status = worseHealthStatus(status, "degraded")
+			if provider.Error != "" {
+				messages = append(messages, provider.Name+": "+provider.Error)
+			}
+		}
+	}
+	return healthComponent{
+		Name:    "providers",
+		Status:  status,
+		Message: strings.Join(messages, "; "),
+		Data: map[string]any{
+			"providers": providers,
+		},
+	}
+}
+
+// healthQueue sums the backlog rows into queue totals and reports the
+// component as degraded when any stale running jobs exist. stats must be
+// non-nil.
+func healthQueue(stats *model.Stats) healthComponent {
+	var pending, running, staleRunning int64
+	var oldestPendingAgeSeconds, oldestRunningAgeSeconds int64
+	for _, row := range stats.Backlog {
+		pending += row.Pending
+		running += row.Running
+		staleRunning += row.StaleRunning
+		if row.OldestPendingAgeSeconds > oldestPendingAgeSeconds {
+			oldestPendingAgeSeconds = row.OldestPendingAgeSeconds
+		}
+		if row.OldestRunningAgeSeconds > oldestRunningAgeSeconds {
+			oldestRunningAgeSeconds = row.OldestRunningAgeSeconds
+		}
+	}
+	status := "healthy"
+	message := ""
+	if staleRunning > 0 {
+		status = "degraded"
+		message = "there are stale running jobs"
+	}
+	return healthComponent{
+		Name:    "queue",
+		Status:  status,
+		Message: message,
+		Data: map[string]any{
+			"pending":                    pending,
+			"running":                    running,
+			"stale_running":              staleRunning,
+			"oldest_pending_age_seconds": oldestPendingAgeSeconds,
+			"oldest_running_age_seconds": oldestRunningAgeSeconds,
+			"backlog":                    stats.Backlog,
+			"queue_status_totals":        stats.Queues,
+			"owner_status_totals":        stats.Owners,
+		},
+	}
+}
+
+// healthErrors totals failed jobs across error codes and reports the
+// component as degraded when any job failed in the last 24 hours.
+func healthErrors(stats *model.Stats) healthComponent {
+	var failedTotal, failed24h int64
+	for _, row := range stats.Errors {
+		failedTotal += row.Total
+		failed24h += row.Last24h
+	}
+	status := "healthy"
+	message := ""
+	if failed24h > 0 {
+		status = "degraded"
+		message = "there are failed jobs in the last 24 hours"
+	}
+	return healthComponent{
+		Name:    "errors",
+		Status:  status,
+		Message: message,
+		Data: map[string]any{
+			"failed_total": failedTotal,
+			"failed_24h":   failed24h,
+			"by_code":      stats.Errors,
+		},
+	}
+}
+
+// healthThroughput reports completed/retried totals for the last 24 hours and
+// flags "stuck" stages: task type / model profile pairs that have pending or
+// running jobs but no completed job in the last 24 hours. Any stuck stage
+// degrades the component.
+func healthThroughput(stats *model.Stats) healthComponent {
+	var done24h, retried24h int64
+	for _, row := range stats.Stages {
+		done24h += row.Done24h
+		retried24h += row.Retried24h
+	}
+
+	pendingByStage := make(map[string]int64)
+	for _, row := range stats.Backlog {
+		pendingByStage[row.TaskType+"|"+row.ModelProfile] += row.Pending + row.Running
+	}
+	doneByStage := make(map[string]int64)
+	for _, row := range stats.Stages {
+		doneByStage[row.TaskType+"|"+row.ModelProfile] += row.Done24h
+	}
+
+	stuckStages := make([]string, 0)
+	for key, total := range pendingByStage {
+		if total > 0 && doneByStage[key] == 0 {
+			stuckStages = append(stuckStages, key)
+		}
+	}
+	// Map iteration order is random; sort so repeated calls report the same
+	// stuck_stages order.
+	sort.Strings(stuckStages)
+
+	status := "healthy"
+	message := ""
+	if len(stuckStages) > 0 {
+		status = "degraded"
+		message = "some active queues have no completed jobs in the last 24 hours"
+	}
+	return healthComponent{
+		Name:    "throughput",
+		Status:  status,
+		Message: message,
+		Data: map[string]any{
+			"done_24h":     done24h,
+			"retried_24h":  retried24h,
+			"stuck_stages": stuckStages,
+			"stages":       stats.Stages,
+		},
+	}
+}
+
+// healthInfra reports the sidecar snapshot and degrades the component when
+// the snapshot carries a sidecar error.
+func healthInfra(infra infraStatusResponse) healthComponent {
+	status := "healthy"
+	message := ""
+	if infra.SidecarError != "" {
+		status = "degraded"
+		message = infra.SidecarError
+	}
+	return healthComponent{
+		Name:    "infra",
+		Status:  status,
+		Message: message,
+		Data: map[string]any{
+			"sidecar": infra.Sidecar,
+		},
+	}
+}
+
+// worseHealthStatus returns the more severe of two statuses, ordered
+// "unhealthy" > "degraded" > "healthy". Any unrecognised value is treated as
+// "healthy".
+func worseHealthStatus(current, next string) string {
+	if current == "unhealthy" || next == "unhealthy" {
+		return "unhealthy"
+	}
+	if current == "degraded" || next == "degraded" {
+		return "degraded"
+	}
+	return "healthy"
+}
diff --git a/internal/httpapi/server.go b/internal/httpapi/server.go
index 0c27491..5d4df11 100644
--- a/internal/httpapi/server.go
+++ b/internal/httpapi/server.go
@@ -41,6 +41,8 @@ func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 		writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
 	case r.Method == http.MethodGet && path == "/readyz":
 		s.handleReady(w, r)
+	case r.Method == http.MethodGet && path == "/health/detail":
+		s.handleHealthDetail(w, r)
 	case r.Method == http.MethodGet && path == "/":
 		writeJSON(w, http.StatusOK, map[string]string{"service": "ai-service"})
 	case r.Method == http.MethodPost && path == "/api/v1/jobs":