package httpapi import ( "context" "net/http" "strings" "time" "ai-service/internal/model" "ai-service/internal/transcription" ) type healthDetailResponse struct { Status string `json:"status"` Generated time.Time `json:"generated_at"` Components []healthComponent `json:"components"` } type healthComponent struct { Name string `json:"name"` Status string `json:"status"` Message string `json:"message,omitempty"` Data map[string]any `json:"data,omitempty"` } func (s *Server) handleHealthDetail(w http.ResponseWriter, r *http.Request) { ctx, cancel := contextWithTimeout(r, 12*time.Second) defer cancel() resp := healthDetailResponse{ Status: "healthy", Generated: time.Now().UTC(), } if err := s.store.Ping(ctx); err != nil { resp.Components = append(resp.Components, healthComponent{ Name: "postgres", Status: "unhealthy", Message: err.Error(), }) resp.Status = worseHealthStatus(resp.Status, "unhealthy") writeJSON(w, http.StatusServiceUnavailable, resp) return } resp.Components = append(resp.Components, healthComponent{Name: "postgres", Status: "healthy"}) stats, err := s.store.Stats(ctx, s.cfg.WorkerLeaseTimeout) if err != nil { resp.Components = append(resp.Components, healthComponent{ Name: "queue", Status: "unhealthy", Message: err.Error(), }) resp.Status = worseHealthStatus(resp.Status, "unhealthy") writeJSON(w, http.StatusServiceUnavailable, resp) return } for _, component := range []healthComponent{ s.healthProviders(ctx), healthQueue(stats), healthErrors(stats), healthThroughput(stats), healthInfra(loadInfraSnapshot(r, s.cfg)), } { resp.Components = append(resp.Components, component) resp.Status = worseHealthStatus(resp.Status, component.Status) } statusCode := http.StatusOK if resp.Status == "unhealthy" { statusCode = http.StatusServiceUnavailable } writeJSON(w, statusCode, resp) } func (s *Server) healthProviders(ctx context.Context) healthComponent { providers := []providerStatus{ s.checkLLM(ctx), s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout), } status := "healthy" messages := make([]string, 0) for _, provider := range providers { switch { case !provider.Configured: status = worseHealthStatus(status, "degraded") messages = append(messages, provider.Name+" not configured") case !provider.OK: status = worseHealthStatus(status, "unhealthy") if provider.Error != "" { messages = append(messages, provider.Name+": "+provider.Error) } else { messages = append(messages, provider.Name+" unavailable") } case provider.Stale: status = worseHealthStatus(status, "degraded") if provider.Error != "" { messages = append(messages, provider.Name+": "+provider.Error) } } } return healthComponent{ Name: "providers", Status: status, Message: strings.Join(messages, "; "), Data: map[string]any{ "providers": providers, }, } } func healthQueue(stats *model.Stats) healthComponent { var pending, running, staleRunning int64 var oldestPendingAgeSeconds, oldestRunningAgeSeconds int64 for _, row := range stats.Backlog { pending += row.Pending running += row.Running staleRunning += row.StaleRunning if row.OldestPendingAgeSeconds > oldestPendingAgeSeconds { oldestPendingAgeSeconds = row.OldestPendingAgeSeconds } if row.OldestRunningAgeSeconds > oldestRunningAgeSeconds { oldestRunningAgeSeconds = row.OldestRunningAgeSeconds } } status := "healthy" message := "" if staleRunning > 0 { status = "degraded" message = "there are stale running jobs" } return healthComponent{ Name: "queue", Status: status, Message: message, Data: map[string]any{ "pending": pending, "running": running, "stale_running": staleRunning, "oldest_pending_age_seconds": oldestPendingAgeSeconds, "oldest_running_age_seconds": oldestRunningAgeSeconds, "backlog": stats.Backlog, "queue_status_totals": stats.Queues, "owner_status_totals": stats.Owners, }, } } func healthErrors(stats *model.Stats) healthComponent { var failedTotal, failed24h int64 for _, row := range stats.Errors { failedTotal += row.Total failed24h += row.Last24h } status := "healthy" message := "" if failed24h > 0 { status = "degraded" message = "there are failed jobs in the last 24 hours" } return healthComponent{ Name: "errors", Status: status, Message: message, Data: map[string]any{ "failed_total": failedTotal, "failed_24h": failed24h, "by_code": stats.Errors, }, } } func healthThroughput(stats *model.Stats) healthComponent { var done24h, retried24h int64 for _, row := range stats.Stages { done24h += row.Done24h retried24h += row.Retried24h } pendingByStage := make(map[string]int64) for _, row := range stats.Backlog { pendingByStage[row.TaskType+"|"+row.ModelProfile] += row.Pending + row.Running } doneByStage := make(map[string]int64) for _, row := range stats.Stages { doneByStage[row.TaskType+"|"+row.ModelProfile] += row.Done24h } stuckStages := make([]string, 0) for key, total := range pendingByStage { if total > 0 && doneByStage[key] == 0 { stuckStages = append(stuckStages, key) } } status := "healthy" message := "" if len(stuckStages) > 0 { status = "degraded" message = "some active queues have no completed jobs in the last 24 hours" } return healthComponent{ Name: "throughput", Status: status, Message: message, Data: map[string]any{ "done_24h": done24h, "retried_24h": retried24h, "stuck_stages": stuckStages, "stages": stats.Stages, }, } } func healthInfra(infra infraStatusResponse) healthComponent { status := "healthy" message := "" if infra.SidecarError != "" { status = "degraded" message = infra.SidecarError } return healthComponent{ Name: "infra", Status: status, Message: message, Data: map[string]any{ "sidecar": infra.Sidecar, }, } } func worseHealthStatus(current, next string) string { if current == "unhealthy" || next == "unhealthy" { return "unhealthy" } if current == "degraded" || next == "degraded" { return "degraded" } return "healthy" }