feat: expose ai service health detail
This commit is contained in:
241
internal/httpapi/health.go
Normal file
241
internal/httpapi/health.go
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
package httpapi
|
||||||
|
|
||||||
|
import (
	"context"
	"net/http"
	"sort"
	"strings"
	"time"

	"ai-service/internal/model"
	"ai-service/internal/transcription"
)
|
||||||
|
|
||||||
|
type healthDetailResponse struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
Generated time.Time `json:"generated_at"`
|
||||||
|
Components []healthComponent `json:"components"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// healthComponent describes the health of a single subsystem in the detailed
// health report.
type healthComponent struct {
	// Name identifies the subsystem (for example "postgres" or "queue").
	Name string `json:"name"`
	// Status is one of "healthy", "degraded" or "unhealthy".
	Status string `json:"status"`
	// Message is an optional human-readable explanation; omitted when empty.
	Message string `json:"message,omitempty"`
	// Data carries optional component-specific details; omitted when empty.
	Data map[string]any `json:"data,omitempty"`
}
|
||||||
|
|
||||||
|
func (s *Server) handleHealthDetail(w http.ResponseWriter, r *http.Request) {
|
||||||
|
ctx, cancel := contextWithTimeout(r, 12*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
resp := healthDetailResponse{
|
||||||
|
Status: "healthy",
|
||||||
|
Generated: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := s.store.Ping(ctx); err != nil {
|
||||||
|
resp.Components = append(resp.Components, healthComponent{
|
||||||
|
Name: "postgres",
|
||||||
|
Status: "unhealthy",
|
||||||
|
Message: err.Error(),
|
||||||
|
})
|
||||||
|
resp.Status = worseHealthStatus(resp.Status, "unhealthy")
|
||||||
|
writeJSON(w, http.StatusServiceUnavailable, resp)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
resp.Components = append(resp.Components, healthComponent{Name: "postgres", Status: "healthy"})
|
||||||
|
|
||||||
|
stats, err := s.store.Stats(ctx, s.cfg.WorkerLeaseTimeout)
|
||||||
|
if err != nil {
|
||||||
|
resp.Components = append(resp.Components, healthComponent{
|
||||||
|
Name: "queue",
|
||||||
|
Status: "unhealthy",
|
||||||
|
Message: err.Error(),
|
||||||
|
})
|
||||||
|
resp.Status = worseHealthStatus(resp.Status, "unhealthy")
|
||||||
|
writeJSON(w, http.StatusServiceUnavailable, resp)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, component := range []healthComponent{
|
||||||
|
s.healthProviders(ctx),
|
||||||
|
healthQueue(stats),
|
||||||
|
healthErrors(stats),
|
||||||
|
healthThroughput(stats),
|
||||||
|
healthInfra(loadInfraSnapshot(r, s.cfg)),
|
||||||
|
} {
|
||||||
|
resp.Components = append(resp.Components, component)
|
||||||
|
resp.Status = worseHealthStatus(resp.Status, component.Status)
|
||||||
|
}
|
||||||
|
|
||||||
|
statusCode := http.StatusOK
|
||||||
|
if resp.Status == "unhealthy" {
|
||||||
|
statusCode = http.StatusServiceUnavailable
|
||||||
|
}
|
||||||
|
writeJSON(w, statusCode, resp)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Server) healthProviders(ctx context.Context) healthComponent {
|
||||||
|
providers := []providerStatus{
|
||||||
|
s.checkLLM(ctx),
|
||||||
|
s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
|
||||||
|
}
|
||||||
|
status := "healthy"
|
||||||
|
messages := make([]string, 0)
|
||||||
|
for _, provider := range providers {
|
||||||
|
switch {
|
||||||
|
case !provider.Configured:
|
||||||
|
status = worseHealthStatus(status, "degraded")
|
||||||
|
messages = append(messages, provider.Name+" not configured")
|
||||||
|
case !provider.OK:
|
||||||
|
status = worseHealthStatus(status, "unhealthy")
|
||||||
|
if provider.Error != "" {
|
||||||
|
messages = append(messages, provider.Name+": "+provider.Error)
|
||||||
|
} else {
|
||||||
|
messages = append(messages, provider.Name+" unavailable")
|
||||||
|
}
|
||||||
|
case provider.Stale:
|
||||||
|
status = worseHealthStatus(status, "degraded")
|
||||||
|
if provider.Error != "" {
|
||||||
|
messages = append(messages, provider.Name+": "+provider.Error)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return healthComponent{
|
||||||
|
Name: "providers",
|
||||||
|
Status: status,
|
||||||
|
Message: strings.Join(messages, "; "),
|
||||||
|
Data: map[string]any{
|
||||||
|
"providers": providers,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func healthQueue(stats *model.Stats) healthComponent {
|
||||||
|
var pending, running, staleRunning int64
|
||||||
|
var oldestPendingAgeSeconds, oldestRunningAgeSeconds int64
|
||||||
|
for _, row := range stats.Backlog {
|
||||||
|
pending += row.Pending
|
||||||
|
running += row.Running
|
||||||
|
staleRunning += row.StaleRunning
|
||||||
|
if row.OldestPendingAgeSeconds > oldestPendingAgeSeconds {
|
||||||
|
oldestPendingAgeSeconds = row.OldestPendingAgeSeconds
|
||||||
|
}
|
||||||
|
if row.OldestRunningAgeSeconds > oldestRunningAgeSeconds {
|
||||||
|
oldestRunningAgeSeconds = row.OldestRunningAgeSeconds
|
||||||
|
}
|
||||||
|
}
|
||||||
|
status := "healthy"
|
||||||
|
message := ""
|
||||||
|
if staleRunning > 0 {
|
||||||
|
status = "degraded"
|
||||||
|
message = "there are stale running jobs"
|
||||||
|
}
|
||||||
|
return healthComponent{
|
||||||
|
Name: "queue",
|
||||||
|
Status: status,
|
||||||
|
Message: message,
|
||||||
|
Data: map[string]any{
|
||||||
|
"pending": pending,
|
||||||
|
"running": running,
|
||||||
|
"stale_running": staleRunning,
|
||||||
|
"oldest_pending_age_seconds": oldestPendingAgeSeconds,
|
||||||
|
"oldest_running_age_seconds": oldestRunningAgeSeconds,
|
||||||
|
"backlog": stats.Backlog,
|
||||||
|
"queue_status_totals": stats.Queues,
|
||||||
|
"owner_status_totals": stats.Owners,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func healthErrors(stats *model.Stats) healthComponent {
|
||||||
|
var failedTotal, failed24h int64
|
||||||
|
for _, row := range stats.Errors {
|
||||||
|
failedTotal += row.Total
|
||||||
|
failed24h += row.Last24h
|
||||||
|
}
|
||||||
|
status := "healthy"
|
||||||
|
message := ""
|
||||||
|
if failed24h > 0 {
|
||||||
|
status = "degraded"
|
||||||
|
message = "there are failed jobs in the last 24 hours"
|
||||||
|
}
|
||||||
|
return healthComponent{
|
||||||
|
Name: "errors",
|
||||||
|
Status: status,
|
||||||
|
Message: message,
|
||||||
|
Data: map[string]any{
|
||||||
|
"failed_total": failedTotal,
|
||||||
|
"failed_24h": failed24h,
|
||||||
|
"by_code": stats.Errors,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func healthThroughput(stats *model.Stats) healthComponent {
|
||||||
|
var done24h, retried24h int64
|
||||||
|
for _, row := range stats.Stages {
|
||||||
|
done24h += row.Done24h
|
||||||
|
retried24h += row.Retried24h
|
||||||
|
}
|
||||||
|
|
||||||
|
pendingByStage := make(map[string]int64)
|
||||||
|
for _, row := range stats.Backlog {
|
||||||
|
pendingByStage[row.TaskType+"|"+row.ModelProfile] += row.Pending + row.Running
|
||||||
|
}
|
||||||
|
doneByStage := make(map[string]int64)
|
||||||
|
for _, row := range stats.Stages {
|
||||||
|
doneByStage[row.TaskType+"|"+row.ModelProfile] += row.Done24h
|
||||||
|
}
|
||||||
|
|
||||||
|
stuckStages := make([]string, 0)
|
||||||
|
for key, total := range pendingByStage {
|
||||||
|
if total > 0 && doneByStage[key] == 0 {
|
||||||
|
stuckStages = append(stuckStages, key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
status := "healthy"
|
||||||
|
message := ""
|
||||||
|
if len(stuckStages) > 0 {
|
||||||
|
status = "degraded"
|
||||||
|
message = "some active queues have no completed jobs in the last 24 hours"
|
||||||
|
}
|
||||||
|
return healthComponent{
|
||||||
|
Name: "throughput",
|
||||||
|
Status: status,
|
||||||
|
Message: message,
|
||||||
|
Data: map[string]any{
|
||||||
|
"done_24h": done24h,
|
||||||
|
"retried_24h": retried24h,
|
||||||
|
"stuck_stages": stuckStages,
|
||||||
|
"stages": stats.Stages,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func healthInfra(infra infraStatusResponse) healthComponent {
|
||||||
|
status := "healthy"
|
||||||
|
message := ""
|
||||||
|
if infra.SidecarError != "" {
|
||||||
|
status = "degraded"
|
||||||
|
message = infra.SidecarError
|
||||||
|
}
|
||||||
|
return healthComponent{
|
||||||
|
Name: "infra",
|
||||||
|
Status: status,
|
||||||
|
Message: message,
|
||||||
|
Data: map[string]any{
|
||||||
|
"sidecar": infra.Sidecar,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// worseHealthStatus returns the more severe of two health statuses, ordered
// "unhealthy" > "degraded" > "healthy". Any value other than "unhealthy" or
// "degraded" is treated as "healthy".
func worseHealthStatus(current, next string) string {
	// Walk the severities from worst to best and return the first one present.
	for _, level := range [...]string{"unhealthy", "degraded"} {
		if current == level || next == level {
			return level
		}
	}
	return "healthy"
}
|
||||||
@@ -41,6 +41,8 @@ func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
|||||||
writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
|
writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
|
||||||
case r.Method == http.MethodGet && path == "/readyz":
|
case r.Method == http.MethodGet && path == "/readyz":
|
||||||
s.handleReady(w, r)
|
s.handleReady(w, r)
|
||||||
|
case r.Method == http.MethodGet && path == "/health/detail":
|
||||||
|
s.handleHealthDetail(w, r)
|
||||||
case r.Method == http.MethodGet && path == "/":
|
case r.Method == http.MethodGet && path == "/":
|
||||||
writeJSON(w, http.StatusOK, map[string]string{"service": "ai-service"})
|
writeJSON(w, http.StatusOK, map[string]string{"service": "ai-service"})
|
||||||
case r.Method == http.MethodPost && path == "/api/v1/jobs":
|
case r.Method == http.MethodPost && path == "/api/v1/jobs":
|
||||||
|
|||||||
Reference in New Issue
Block a user