feat: expose ai service health detail
This commit is contained in:
241
internal/httpapi/health.go
Normal file
241
internal/httpapi/health.go
Normal file
@@ -0,0 +1,241 @@
|
||||
package httpapi
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"ai-service/internal/model"
|
||||
"ai-service/internal/transcription"
|
||||
)
|
||||
|
||||
type healthDetailResponse struct {
|
||||
Status string `json:"status"`
|
||||
Generated time.Time `json:"generated_at"`
|
||||
Components []healthComponent `json:"components"`
|
||||
}
|
||||
|
||||
// healthComponent is the result of a single health check.
type healthComponent struct {
	// Name identifies the check (for example "postgres" or "queue").
	Name string `json:"name"`
	// Status is the check outcome; this file uses "healthy", "degraded"
	// and "unhealthy".
	Status string `json:"status"`
	// Message is an optional explanation and is left out of the JSON when
	// empty.
	Message string `json:"message,omitempty"`
	// Data carries optional check-specific details and is left out of the
	// JSON when empty.
	Data map[string]any `json:"data,omitempty"`
}
|
||||
|
||||
func (s *Server) handleHealthDetail(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := contextWithTimeout(r, 12*time.Second)
|
||||
defer cancel()
|
||||
|
||||
resp := healthDetailResponse{
|
||||
Status: "healthy",
|
||||
Generated: time.Now().UTC(),
|
||||
}
|
||||
|
||||
if err := s.store.Ping(ctx); err != nil {
|
||||
resp.Components = append(resp.Components, healthComponent{
|
||||
Name: "postgres",
|
||||
Status: "unhealthy",
|
||||
Message: err.Error(),
|
||||
})
|
||||
resp.Status = worseHealthStatus(resp.Status, "unhealthy")
|
||||
writeJSON(w, http.StatusServiceUnavailable, resp)
|
||||
return
|
||||
}
|
||||
resp.Components = append(resp.Components, healthComponent{Name: "postgres", Status: "healthy"})
|
||||
|
||||
stats, err := s.store.Stats(ctx, s.cfg.WorkerLeaseTimeout)
|
||||
if err != nil {
|
||||
resp.Components = append(resp.Components, healthComponent{
|
||||
Name: "queue",
|
||||
Status: "unhealthy",
|
||||
Message: err.Error(),
|
||||
})
|
||||
resp.Status = worseHealthStatus(resp.Status, "unhealthy")
|
||||
writeJSON(w, http.StatusServiceUnavailable, resp)
|
||||
return
|
||||
}
|
||||
|
||||
for _, component := range []healthComponent{
|
||||
s.healthProviders(ctx),
|
||||
healthQueue(stats),
|
||||
healthErrors(stats),
|
||||
healthThroughput(stats),
|
||||
healthInfra(loadInfraSnapshot(r, s.cfg)),
|
||||
} {
|
||||
resp.Components = append(resp.Components, component)
|
||||
resp.Status = worseHealthStatus(resp.Status, component.Status)
|
||||
}
|
||||
|
||||
statusCode := http.StatusOK
|
||||
if resp.Status == "unhealthy" {
|
||||
statusCode = http.StatusServiceUnavailable
|
||||
}
|
||||
writeJSON(w, statusCode, resp)
|
||||
}
|
||||
|
||||
func (s *Server) healthProviders(ctx context.Context) healthComponent {
|
||||
providers := []providerStatus{
|
||||
s.checkLLM(ctx),
|
||||
s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
|
||||
}
|
||||
status := "healthy"
|
||||
messages := make([]string, 0)
|
||||
for _, provider := range providers {
|
||||
switch {
|
||||
case !provider.Configured:
|
||||
status = worseHealthStatus(status, "degraded")
|
||||
messages = append(messages, provider.Name+" not configured")
|
||||
case !provider.OK:
|
||||
status = worseHealthStatus(status, "unhealthy")
|
||||
if provider.Error != "" {
|
||||
messages = append(messages, provider.Name+": "+provider.Error)
|
||||
} else {
|
||||
messages = append(messages, provider.Name+" unavailable")
|
||||
}
|
||||
case provider.Stale:
|
||||
status = worseHealthStatus(status, "degraded")
|
||||
if provider.Error != "" {
|
||||
messages = append(messages, provider.Name+": "+provider.Error)
|
||||
}
|
||||
}
|
||||
}
|
||||
return healthComponent{
|
||||
Name: "providers",
|
||||
Status: status,
|
||||
Message: strings.Join(messages, "; "),
|
||||
Data: map[string]any{
|
||||
"providers": providers,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func healthQueue(stats *model.Stats) healthComponent {
|
||||
var pending, running, staleRunning int64
|
||||
var oldestPendingAgeSeconds, oldestRunningAgeSeconds int64
|
||||
for _, row := range stats.Backlog {
|
||||
pending += row.Pending
|
||||
running += row.Running
|
||||
staleRunning += row.StaleRunning
|
||||
if row.OldestPendingAgeSeconds > oldestPendingAgeSeconds {
|
||||
oldestPendingAgeSeconds = row.OldestPendingAgeSeconds
|
||||
}
|
||||
if row.OldestRunningAgeSeconds > oldestRunningAgeSeconds {
|
||||
oldestRunningAgeSeconds = row.OldestRunningAgeSeconds
|
||||
}
|
||||
}
|
||||
status := "healthy"
|
||||
message := ""
|
||||
if staleRunning > 0 {
|
||||
status = "degraded"
|
||||
message = "there are stale running jobs"
|
||||
}
|
||||
return healthComponent{
|
||||
Name: "queue",
|
||||
Status: status,
|
||||
Message: message,
|
||||
Data: map[string]any{
|
||||
"pending": pending,
|
||||
"running": running,
|
||||
"stale_running": staleRunning,
|
||||
"oldest_pending_age_seconds": oldestPendingAgeSeconds,
|
||||
"oldest_running_age_seconds": oldestRunningAgeSeconds,
|
||||
"backlog": stats.Backlog,
|
||||
"queue_status_totals": stats.Queues,
|
||||
"owner_status_totals": stats.Owners,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func healthErrors(stats *model.Stats) healthComponent {
|
||||
var failedTotal, failed24h int64
|
||||
for _, row := range stats.Errors {
|
||||
failedTotal += row.Total
|
||||
failed24h += row.Last24h
|
||||
}
|
||||
status := "healthy"
|
||||
message := ""
|
||||
if failed24h > 0 {
|
||||
status = "degraded"
|
||||
message = "there are failed jobs in the last 24 hours"
|
||||
}
|
||||
return healthComponent{
|
||||
Name: "errors",
|
||||
Status: status,
|
||||
Message: message,
|
||||
Data: map[string]any{
|
||||
"failed_total": failedTotal,
|
||||
"failed_24h": failed24h,
|
||||
"by_code": stats.Errors,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func healthThroughput(stats *model.Stats) healthComponent {
|
||||
var done24h, retried24h int64
|
||||
for _, row := range stats.Stages {
|
||||
done24h += row.Done24h
|
||||
retried24h += row.Retried24h
|
||||
}
|
||||
|
||||
pendingByStage := make(map[string]int64)
|
||||
for _, row := range stats.Backlog {
|
||||
pendingByStage[row.TaskType+"|"+row.ModelProfile] += row.Pending + row.Running
|
||||
}
|
||||
doneByStage := make(map[string]int64)
|
||||
for _, row := range stats.Stages {
|
||||
doneByStage[row.TaskType+"|"+row.ModelProfile] += row.Done24h
|
||||
}
|
||||
|
||||
stuckStages := make([]string, 0)
|
||||
for key, total := range pendingByStage {
|
||||
if total > 0 && doneByStage[key] == 0 {
|
||||
stuckStages = append(stuckStages, key)
|
||||
}
|
||||
}
|
||||
|
||||
status := "healthy"
|
||||
message := ""
|
||||
if len(stuckStages) > 0 {
|
||||
status = "degraded"
|
||||
message = "some active queues have no completed jobs in the last 24 hours"
|
||||
}
|
||||
return healthComponent{
|
||||
Name: "throughput",
|
||||
Status: status,
|
||||
Message: message,
|
||||
Data: map[string]any{
|
||||
"done_24h": done24h,
|
||||
"retried_24h": retried24h,
|
||||
"stuck_stages": stuckStages,
|
||||
"stages": stats.Stages,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func healthInfra(infra infraStatusResponse) healthComponent {
|
||||
status := "healthy"
|
||||
message := ""
|
||||
if infra.SidecarError != "" {
|
||||
status = "degraded"
|
||||
message = infra.SidecarError
|
||||
}
|
||||
return healthComponent{
|
||||
Name: "infra",
|
||||
Status: status,
|
||||
Message: message,
|
||||
Data: map[string]any{
|
||||
"sidecar": infra.Sidecar,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// worseHealthStatus returns the more severe of two health statuses.
//
// Severity order is "unhealthy" > "degraded" > "healthy". Any value that is
// neither "unhealthy" nor "degraded" is treated as "healthy".
func worseHealthStatus(current, next string) string {
	// Walk the levels from most to least severe and return the first one
	// that either argument matches.
	for _, level := range [...]string{"unhealthy", "degraded"} {
		if current == level || next == level {
			return level
		}
	}
	return "healthy"
}
|
||||
@@ -41,6 +41,8 @@ func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
|
||||
case r.Method == http.MethodGet && path == "/readyz":
|
||||
s.handleReady(w, r)
|
||||
case r.Method == http.MethodGet && path == "/health/detail":
|
||||
s.handleHealthDetail(w, r)
|
||||
case r.Method == http.MethodGet && path == "/":
|
||||
writeJSON(w, http.StatusOK, map[string]string{"service": "ai-service"})
|
||||
case r.Method == http.MethodPost && path == "/api/v1/jobs":
|
||||
|
||||
Reference in New Issue
Block a user