242 lines
6.2 KiB
Go
242 lines
6.2 KiB
Go
package httpapi
|
|
|
|
import (
	"context"
	"net/http"
	"sort"
	"strings"
	"time"

	"ai-service/internal/model"
	"ai-service/internal/transcription"
)
|
|
|
|
type healthDetailResponse struct {
|
|
Status string `json:"status"`
|
|
Generated time.Time `json:"generated_at"`
|
|
Components []healthComponent `json:"components"`
|
|
}
|
|
|
|
// healthComponent describes the state of a single subsystem inside a health
// report.
type healthComponent struct {
	// Name identifies the subsystem, e.g. "postgres" or "queue".
	Name string `json:"name"`
	// Status is one of "ok", "degraded" or "down".
	Status string `json:"status"`
	// Error is a human-readable explanation; it is omitted from JSON when empty.
	Error string `json:"error,omitempty"`
	// Data carries component-specific details; it is omitted from JSON when empty.
	Data map[string]any `json:"data,omitempty"`
}
|
|
|
|
func (s *Server) handleHealthDetail(w http.ResponseWriter, r *http.Request) {
|
|
ctx, cancel := contextWithTimeout(r, 12*time.Second)
|
|
defer cancel()
|
|
|
|
resp := healthDetailResponse{
|
|
Status: "ok",
|
|
Generated: time.Now().UTC(),
|
|
}
|
|
|
|
if err := s.store.Ping(ctx); err != nil {
|
|
resp.Components = append(resp.Components, healthComponent{
|
|
Name: "postgres",
|
|
Status: "down",
|
|
Error: err.Error(),
|
|
})
|
|
resp.Status = worseHealthStatus(resp.Status, "down")
|
|
writeJSON(w, http.StatusServiceUnavailable, resp)
|
|
return
|
|
}
|
|
resp.Components = append(resp.Components, healthComponent{Name: "postgres", Status: "ok"})
|
|
|
|
stats, err := s.store.Stats(ctx, s.cfg.WorkerLeaseTimeout)
|
|
if err != nil {
|
|
resp.Components = append(resp.Components, healthComponent{
|
|
Name: "queue",
|
|
Status: "down",
|
|
Error: err.Error(),
|
|
})
|
|
resp.Status = worseHealthStatus(resp.Status, "down")
|
|
writeJSON(w, http.StatusServiceUnavailable, resp)
|
|
return
|
|
}
|
|
|
|
for _, component := range []healthComponent{
|
|
s.healthProviders(ctx),
|
|
healthQueue(stats),
|
|
healthErrors(stats),
|
|
healthThroughput(stats),
|
|
healthInfra(loadInfraSnapshot(r, s.cfg)),
|
|
} {
|
|
resp.Components = append(resp.Components, component)
|
|
resp.Status = worseHealthStatus(resp.Status, component.Status)
|
|
}
|
|
|
|
statusCode := http.StatusOK
|
|
if resp.Status == "down" {
|
|
statusCode = http.StatusServiceUnavailable
|
|
}
|
|
writeJSON(w, statusCode, resp)
|
|
}
|
|
|
|
func (s *Server) healthProviders(ctx context.Context) healthComponent {
|
|
providers := []providerStatus{
|
|
s.checkLLM(ctx),
|
|
s.checkAudioLLM(ctx, transcription.ProviderWhisperLargeV3, s.cfg.AudioBaseURL, s.cfg.AudioAPIKey, s.cfg.AudioModel, s.cfg.AudioTimeout),
|
|
}
|
|
status := "ok"
|
|
messages := make([]string, 0)
|
|
for _, provider := range providers {
|
|
switch {
|
|
case !provider.Configured:
|
|
status = worseHealthStatus(status, "degraded")
|
|
messages = append(messages, provider.Name+" not configured")
|
|
case !provider.OK:
|
|
status = worseHealthStatus(status, "down")
|
|
if provider.Error != "" {
|
|
messages = append(messages, provider.Name+": "+provider.Error)
|
|
} else {
|
|
messages = append(messages, provider.Name+" unavailable")
|
|
}
|
|
case provider.Stale:
|
|
status = worseHealthStatus(status, "degraded")
|
|
if provider.Error != "" {
|
|
messages = append(messages, provider.Name+": "+provider.Error)
|
|
}
|
|
}
|
|
}
|
|
return healthComponent{
|
|
Name: "providers",
|
|
Status: status,
|
|
Error: strings.Join(messages, "; "),
|
|
Data: map[string]any{
|
|
"providers": providers,
|
|
},
|
|
}
|
|
}
|
|
|
|
func healthQueue(stats *model.Stats) healthComponent {
|
|
var pending, running, staleRunning int64
|
|
var oldestPendingAgeSeconds, oldestRunningAgeSeconds int64
|
|
for _, row := range stats.Backlog {
|
|
pending += row.Pending
|
|
running += row.Running
|
|
staleRunning += row.StaleRunning
|
|
if row.OldestPendingAgeSeconds > oldestPendingAgeSeconds {
|
|
oldestPendingAgeSeconds = row.OldestPendingAgeSeconds
|
|
}
|
|
if row.OldestRunningAgeSeconds > oldestRunningAgeSeconds {
|
|
oldestRunningAgeSeconds = row.OldestRunningAgeSeconds
|
|
}
|
|
}
|
|
status := "ok"
|
|
message := ""
|
|
if staleRunning > 0 {
|
|
status = "degraded"
|
|
message = "there are stale running jobs"
|
|
}
|
|
return healthComponent{
|
|
Name: "queue",
|
|
Status: status,
|
|
Error: message,
|
|
Data: map[string]any{
|
|
"pending": pending,
|
|
"running": running,
|
|
"stale_running": staleRunning,
|
|
"oldest_pending_age_seconds": oldestPendingAgeSeconds,
|
|
"oldest_running_age_seconds": oldestRunningAgeSeconds,
|
|
"backlog": stats.Backlog,
|
|
"queue_status_totals": stats.Queues,
|
|
"owner_status_totals": stats.Owners,
|
|
},
|
|
}
|
|
}
|
|
|
|
func healthErrors(stats *model.Stats) healthComponent {
|
|
var failedTotal, failed24h int64
|
|
for _, row := range stats.Errors {
|
|
failedTotal += row.Total
|
|
failed24h += row.Last24h
|
|
}
|
|
status := "ok"
|
|
message := ""
|
|
if failed24h > 0 {
|
|
status = "degraded"
|
|
message = "there are failed jobs in the last 24 hours"
|
|
}
|
|
return healthComponent{
|
|
Name: "errors",
|
|
Status: status,
|
|
Error: message,
|
|
Data: map[string]any{
|
|
"failed_total": failedTotal,
|
|
"failed_24h": failed24h,
|
|
"by_code": stats.Errors,
|
|
},
|
|
}
|
|
}
|
|
|
|
func healthThroughput(stats *model.Stats) healthComponent {
|
|
var done24h, retried24h int64
|
|
for _, row := range stats.Stages {
|
|
done24h += row.Done24h
|
|
retried24h += row.Retried24h
|
|
}
|
|
|
|
pendingByStage := make(map[string]int64)
|
|
for _, row := range stats.Backlog {
|
|
pendingByStage[row.TaskType+"|"+row.ModelProfile] += row.Pending + row.Running
|
|
}
|
|
doneByStage := make(map[string]int64)
|
|
for _, row := range stats.Stages {
|
|
doneByStage[row.TaskType+"|"+row.ModelProfile] += row.Done24h
|
|
}
|
|
|
|
stuckStages := make([]string, 0)
|
|
for key, total := range pendingByStage {
|
|
if total > 0 && doneByStage[key] == 0 {
|
|
stuckStages = append(stuckStages, key)
|
|
}
|
|
}
|
|
|
|
status := "ok"
|
|
message := ""
|
|
if len(stuckStages) > 0 {
|
|
status = "degraded"
|
|
message = "some active queues have no completed jobs in the last 24 hours"
|
|
}
|
|
return healthComponent{
|
|
Name: "throughput",
|
|
Status: status,
|
|
Error: message,
|
|
Data: map[string]any{
|
|
"done_24h": done24h,
|
|
"retried_24h": retried24h,
|
|
"stuck_stages": stuckStages,
|
|
"stages": stats.Stages,
|
|
},
|
|
}
|
|
}
|
|
|
|
func healthInfra(infra infraStatusResponse) healthComponent {
|
|
status := "ok"
|
|
message := ""
|
|
if infra.SidecarError != "" {
|
|
status = "degraded"
|
|
message = infra.SidecarError
|
|
}
|
|
return healthComponent{
|
|
Name: "infra",
|
|
Status: status,
|
|
Error: message,
|
|
Data: map[string]any{
|
|
"sidecar": infra.Sidecar,
|
|
},
|
|
}
|
|
}
|
|
|
|
// worseHealthStatus returns the more severe of two health statuses.
//
// Severity order is "down" > "degraded" > "ok". Any value other than "down"
// or "degraded" is treated as "ok", so the result is always one of those
// three strings.
func worseHealthStatus(current, next string) string {
	// Checked from most to least severe; the first level either side has wins.
	for _, level := range [...]string{"down", "degraded"} {
		if current == level || next == level {
			return level
		}
	}
	return "ok"
}
|