Stabilize AI provider health checks
All checks were successful
CI / test (push) Successful in 15s
Build and Deploy / build-and-deploy (push) Successful in 25s

This commit is contained in:
Grendgi
2026-06-08 16:23:50 +03:00
parent 039bcdb2b2
commit 04e463d03f
2 changed files with 70 additions and 18 deletions

View File

@@ -14,9 +14,11 @@ type providerStatus struct {
Name string `json:"name"`
Configured bool `json:"configured"`
OK bool `json:"ok"`
Stale bool `json:"stale,omitempty"`
URL string `json:"url,omitempty"`
Model string `json:"model,omitempty"`
LatencyMS int64 `json:"latency_ms,omitempty"`
LastOKAt string `json:"last_ok_at,omitempty"`
Error string `json:"error,omitempty"`
}
@@ -25,6 +27,13 @@ type providersStatusResponse struct {
Providers []providerStatus `json:"providers"`
}
// providerHealthSnapshot is the record kept for a provider's most recent
// successful health check.
type providerHealthSnapshot struct {
// at is the time the successful check was recorded; it is compared against
// providerStaleOKWindow and rendered into providerStatus.LastOKAt.
at time.Time
// latencyMS is the latency of that successful check in milliseconds; it is
// used as a fallback when a later failed check reports a zero latency.
latencyMS int64
}
// providerStaleOKWindow is the maximum age of a provider's last successful
// check for which a failing check is still reported as OK (marked stale).
const providerStaleOKWindow = 2 * time.Minute
func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
ctx, cancel := contextWithTimeout(r, 8*time.Second)
defer cancel()
@@ -63,14 +72,15 @@ func (s *Server) checkLLM(ctx context.Context) providerStatus {
st.LatencyMS = time.Since(start).Milliseconds()
if err != nil {
st.Error = err.Error()
return st
return s.withStaleProviderOK("llm", st)
}
defer res.Body.Close()
if res.StatusCode >= 300 {
st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body))
return st
return s.withStaleProviderOK("llm", st)
}
st.OK = true
s.rememberProviderOK("llm", st.LatencyMS)
return st
}
@@ -80,24 +90,63 @@ func (s *Server) checkWhisperX(ctx context.Context) providerStatus {
if !st.Configured {
return st
}
start := time.Now()
req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/health", nil)
if err != nil {
st.Error = err.Error()
paths := []string{"/health", "/healthz", "/readyz", "/"}
var lastErr string
for _, path := range paths {
cctx, cancel := context.WithTimeout(ctx, 2*time.Second)
start := time.Now()
req, err := http.NewRequestWithContext(cctx, http.MethodGet, baseURL+path, nil)
if err != nil {
cancel()
lastErr = err.Error()
continue
}
res, err := (&http.Client{Timeout: 2 * time.Second}).Do(req)
st.LatencyMS = time.Since(start).Milliseconds()
cancel()
if err != nil {
lastErr = err.Error()
continue
}
body := ""
if res.StatusCode >= 300 {
body = readSmallBody(res.Body)
}
_ = res.Body.Close()
if res.StatusCode >= 300 {
lastErr = fmt.Sprintf("%s http %d: %s", path, res.StatusCode, body)
continue
}
st.OK = true
s.rememberProviderOK("whisperx", st.LatencyMS)
return st
}
res, err := http.DefaultClient.Do(req)
st.LatencyMS = time.Since(start).Milliseconds()
if err != nil {
st.Error = err.Error()
return st
}
defer res.Body.Close()
if res.StatusCode >= 300 {
st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body))
st.Error = lastErr
return s.withStaleProviderOK("whisperx", st)
}
// rememberProviderOK records that the provider identified by name has just
// passed a health check, storing the current UTC time together with the
// measured latency in milliseconds. Any earlier record for the same name is
// overwritten.
//
// The map is created on first use, so a Server whose providerOKs field was
// never initialised does not panic on the write (assigning to an entry of a
// nil map is a run-time panic in Go).
func (s *Server) rememberProviderOK(name string, latencyMS int64) {
	s.providerMu.Lock()
	defer s.providerMu.Unlock()

	if s.providerOKs == nil {
		s.providerOKs = make(map[string]providerHealthSnapshot)
	}
	s.providerOKs[name] = providerHealthSnapshot{at: time.Now().UTC(), latencyMS: latencyMS}
}
// withStaleProviderOK softens a failed health check using the last success
// recorded for the provider identified by name.
//
// When no success is recorded for name, or the recorded one is older than
// providerStaleOKWindow, st is returned unchanged. Otherwise the returned
// status is marked both OK and Stale, LastOKAt carries the time of the
// recorded success in RFC 3339 form, a zero LatencyMS is replaced by the
// recorded latency, and a non-empty Error is kept behind a prefix stating
// that the latest check did not respond.
func (s *Server) withStaleProviderOK(name string, st providerStatus) providerStatus {
	// Hold the lock only long enough to copy the snapshot out of the map.
	s.providerMu.Lock()
	last, found := s.providerOKs[name]
	s.providerMu.Unlock()

	if fresh := found && time.Since(last.at) <= providerStaleOKWindow; !fresh {
		return st
	}

	st.OK, st.Stale = true, true
	st.LastOKAt = last.at.Format(time.RFC3339)
	if st.LatencyMS == 0 {
		// The failed check produced no latency; report the last good one.
		st.LatencyMS = last.latencyMS
	}
	if st.Error == "" {
		return st
	}
	st.Error = "последняя проверка не ответила: " + st.Error
	return st
}