From 04e463d03f643ba873c84affa54dc33aeeba4fb3 Mon Sep 17 00:00:00 2001 From: Grendgi Date: Mon, 8 Jun 2026 16:23:50 +0300 Subject: [PATCH] Stabilize AI provider health checks --- internal/httpapi/providers.go | 79 ++++++++++++++++++++++++++++------- internal/httpapi/server.go | 9 ++-- 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/internal/httpapi/providers.go b/internal/httpapi/providers.go index be14a38..8237658 100644 --- a/internal/httpapi/providers.go +++ b/internal/httpapi/providers.go @@ -14,9 +14,11 @@ type providerStatus struct { Name string `json:"name"` Configured bool `json:"configured"` OK bool `json:"ok"` + Stale bool `json:"stale,omitempty"` URL string `json:"url,omitempty"` Model string `json:"model,omitempty"` LatencyMS int64 `json:"latency_ms,omitempty"` + LastOKAt string `json:"last_ok_at,omitempty"` Error string `json:"error,omitempty"` } @@ -25,6 +27,13 @@ type providersStatusResponse struct { Providers []providerStatus `json:"providers"` } +type providerHealthSnapshot struct { + at time.Time + latencyMS int64 +} + +const providerStaleOKWindow = 2 * time.Minute + func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) { ctx, cancel := contextWithTimeout(r, 8*time.Second) defer cancel() @@ -63,14 +72,15 @@ func (s *Server) checkLLM(ctx context.Context) providerStatus { st.LatencyMS = time.Since(start).Milliseconds() if err != nil { st.Error = err.Error() - return st + return s.withStaleProviderOK("llm", st) } defer res.Body.Close() if res.StatusCode >= 300 { st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body)) - return st + return s.withStaleProviderOK("llm", st) } st.OK = true + s.rememberProviderOK("llm", st.LatencyMS) return st } @@ -80,24 +90,63 @@ func (s *Server) checkWhisperX(ctx context.Context) providerStatus { if !st.Configured { return st } - start := time.Now() - req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/health", nil) - if err != nil { - st.Error = err.Error() + paths := []string{"/health", "/healthz", "/readyz", "/"} + var lastErr string + for _, path := range paths { + cctx, cancel := context.WithTimeout(ctx, 2*time.Second) + start := time.Now() + req, err := http.NewRequestWithContext(cctx, http.MethodGet, baseURL+path, nil) + if err != nil { + cancel() + lastErr = err.Error() + continue + } + res, err := (&http.Client{Timeout: 2 * time.Second}).Do(req) + st.LatencyMS = time.Since(start).Milliseconds() + cancel() + if err != nil { + lastErr = err.Error() + continue + } + body := "" + if res.StatusCode >= 300 { + body = readSmallBody(res.Body) + } + _ = res.Body.Close() + if res.StatusCode >= 300 { + lastErr = fmt.Sprintf("%s http %d: %s", path, res.StatusCode, body) + continue + } + st.OK = true + s.rememberProviderOK("whisperx", st.LatencyMS) return st } - res, err := http.DefaultClient.Do(req) - st.LatencyMS = time.Since(start).Milliseconds() - if err != nil { - st.Error = err.Error() - return st - } - defer res.Body.Close() - if res.StatusCode >= 300 { - st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body)) + st.Error = lastErr + return s.withStaleProviderOK("whisperx", st) +} + +func (s *Server) rememberProviderOK(name string, latencyMS int64) { + s.providerMu.Lock() + defer s.providerMu.Unlock() + s.providerOKs[name] = providerHealthSnapshot{at: time.Now().UTC(), latencyMS: latencyMS} +} + +func (s *Server) withStaleProviderOK(name string, st providerStatus) providerStatus { + s.providerMu.Lock() + snap, ok := s.providerOKs[name] + s.providerMu.Unlock() + if !ok || time.Since(snap.at) > providerStaleOKWindow { return st } st.OK = true + st.Stale = true + if st.LatencyMS == 0 { + st.LatencyMS = snap.latencyMS + } + st.LastOKAt = snap.at.Format(time.RFC3339) + if st.Error != "" { + st.Error = "последняя проверка не ответила: " + st.Error + } return st } diff --git a/internal/httpapi/server.go b/internal/httpapi/server.go index 58cb5e8..d81b866 100644 --- a/internal/httpapi/server.go +++ b/internal/httpapi/server.go @@ -6,6 +6,7 @@ import ( "fmt" "net/http" "strings" + "sync" "time" "ai-service/internal/config" @@ -16,12 +17,14 @@ import ( ) type Server struct { - store *store.Store - cfg config.Config + store *store.Store + cfg config.Config + providerMu sync.Mutex + providerOKs map[string]providerHealthSnapshot } func NewServer(store *store.Store, cfg config.Config) http.Handler { - return &Server{store: store, cfg: cfg} + return &Server{store: store, cfg: cfg, providerOKs: make(map[string]providerHealthSnapshot)} } func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {