Stabilize AI provider health checks
All checks were successful
CI / test (push) Successful in 15s
Build and Deploy / build-and-deploy (push) Successful in 25s

This commit is contained in:
Grendgi
2026-06-08 16:23:50 +03:00
parent 039bcdb2b2
commit 04e463d03f
2 changed files with 70 additions and 18 deletions

View File

@@ -14,9 +14,11 @@ type providerStatus struct {
Name string `json:"name"` Name string `json:"name"`
Configured bool `json:"configured"` Configured bool `json:"configured"`
OK bool `json:"ok"` OK bool `json:"ok"`
Stale bool `json:"stale,omitempty"`
URL string `json:"url,omitempty"` URL string `json:"url,omitempty"`
Model string `json:"model,omitempty"` Model string `json:"model,omitempty"`
LatencyMS int64 `json:"latency_ms,omitempty"` LatencyMS int64 `json:"latency_ms,omitempty"`
LastOKAt string `json:"last_ok_at,omitempty"`
Error string `json:"error,omitempty"` Error string `json:"error,omitempty"`
} }
@@ -25,6 +27,13 @@ type providersStatusResponse struct {
Providers []providerStatus `json:"providers"` Providers []providerStatus `json:"providers"`
} }
// providerHealthSnapshot records the most recent successful health check of a
// provider. Snapshots are stored per provider name in Server.providerOKs.
type providerHealthSnapshot struct {
// at is when the check succeeded; rememberProviderOK stores time.Now().UTC() here.
at time.Time
// latencyMS is the latency, in milliseconds, measured by that successful check.
latencyMS int64
}
// providerStaleOKWindow is how long after the last successful check a failing
// check is still reported as OK (flagged Stale) by withStaleProviderOK.
const providerStaleOKWindow = 2 * time.Minute
func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) { func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
ctx, cancel := contextWithTimeout(r, 8*time.Second) ctx, cancel := contextWithTimeout(r, 8*time.Second)
defer cancel() defer cancel()
@@ -63,14 +72,15 @@ func (s *Server) checkLLM(ctx context.Context) providerStatus {
st.LatencyMS = time.Since(start).Milliseconds() st.LatencyMS = time.Since(start).Milliseconds()
if err != nil { if err != nil {
st.Error = err.Error() st.Error = err.Error()
return st return s.withStaleProviderOK("llm", st)
} }
defer res.Body.Close() defer res.Body.Close()
if res.StatusCode >= 300 { if res.StatusCode >= 300 {
st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body)) st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body))
return st return s.withStaleProviderOK("llm", st)
} }
st.OK = true st.OK = true
s.rememberProviderOK("llm", st.LatencyMS)
return st return st
} }
@@ -80,24 +90,63 @@ func (s *Server) checkWhisperX(ctx context.Context) providerStatus {
if !st.Configured { if !st.Configured {
return st return st
} }
start := time.Now() paths := []string{"/health", "/healthz", "/readyz", "/"}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/health", nil) var lastErr string
if err != nil { for _, path := range paths {
st.Error = err.Error() cctx, cancel := context.WithTimeout(ctx, 2*time.Second)
start := time.Now()
req, err := http.NewRequestWithContext(cctx, http.MethodGet, baseURL+path, nil)
if err != nil {
cancel()
lastErr = err.Error()
continue
}
res, err := (&http.Client{Timeout: 2 * time.Second}).Do(req)
st.LatencyMS = time.Since(start).Milliseconds()
cancel()
if err != nil {
lastErr = err.Error()
continue
}
body := ""
if res.StatusCode >= 300 {
body = readSmallBody(res.Body)
}
_ = res.Body.Close()
if res.StatusCode >= 300 {
lastErr = fmt.Sprintf("%s http %d: %s", path, res.StatusCode, body)
continue
}
st.OK = true
s.rememberProviderOK("whisperx", st.LatencyMS)
return st return st
} }
res, err := http.DefaultClient.Do(req) st.Error = lastErr
st.LatencyMS = time.Since(start).Milliseconds() return s.withStaleProviderOK("whisperx", st)
if err != nil { }
st.Error = err.Error()
return st func (s *Server) rememberProviderOK(name string, latencyMS int64) {
} s.providerMu.Lock()
defer res.Body.Close() defer s.providerMu.Unlock()
if res.StatusCode >= 300 { s.providerOKs[name] = providerHealthSnapshot{at: time.Now().UTC(), latencyMS: latencyMS}
st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body)) }
func (s *Server) withStaleProviderOK(name string, st providerStatus) providerStatus {
s.providerMu.Lock()
snap, ok := s.providerOKs[name]
s.providerMu.Unlock()
if !ok || time.Since(snap.at) > providerStaleOKWindow {
return st return st
} }
st.OK = true st.OK = true
st.Stale = true
if st.LatencyMS == 0 {
st.LatencyMS = snap.latencyMS
}
st.LastOKAt = snap.at.Format(time.RFC3339)
if st.Error != "" {
st.Error = "последняя проверка не ответила: " + st.Error
}
return st return st
} }

View File

@@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"net/http" "net/http"
"strings" "strings"
"sync"
"time" "time"
"ai-service/internal/config" "ai-service/internal/config"
@@ -16,12 +17,14 @@ import (
) )
type Server struct { type Server struct {
store *store.Store store *store.Store
cfg config.Config cfg config.Config
providerMu sync.Mutex
providerOKs map[string]providerHealthSnapshot
} }
func NewServer(store *store.Store, cfg config.Config) http.Handler { func NewServer(store *store.Store, cfg config.Config) http.Handler {
return &Server{store: store, cfg: cfg} return &Server{store: store, cfg: cfg, providerOKs: make(map[string]providerHealthSnapshot)}
} }
func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) { func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {