Stabilize AI provider health checks
This commit is contained in:
@@ -14,9 +14,11 @@ type providerStatus struct {
|
||||
Name string `json:"name"`
|
||||
Configured bool `json:"configured"`
|
||||
OK bool `json:"ok"`
|
||||
Stale bool `json:"stale,omitempty"`
|
||||
URL string `json:"url,omitempty"`
|
||||
Model string `json:"model,omitempty"`
|
||||
LatencyMS int64 `json:"latency_ms,omitempty"`
|
||||
LastOKAt string `json:"last_ok_at,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
@@ -25,6 +27,13 @@ type providersStatusResponse struct {
|
||||
Providers []providerStatus `json:"providers"`
|
||||
}
|
||||
|
||||
// providerHealthSnapshot is the remembered result of one successful provider
// health check.
type providerHealthSnapshot struct {
	// at is the moment the successful check was recorded.
	at time.Time
	// latencyMS is the latency of that check, in milliseconds.
	latencyMS int64
}
|
||||
|
||||
// providerStaleOKWindow is the maximum age of a remembered successful health
// check for which a currently failing provider is still reported as OK
// (flagged as stale) by withStaleProviderOK.
const providerStaleOKWindow = 2 * time.Minute
|
||||
|
||||
func (s *Server) handleProviderStatus(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := contextWithTimeout(r, 8*time.Second)
|
||||
defer cancel()
|
||||
@@ -63,14 +72,15 @@ func (s *Server) checkLLM(ctx context.Context) providerStatus {
|
||||
st.LatencyMS = time.Since(start).Milliseconds()
|
||||
if err != nil {
|
||||
st.Error = err.Error()
|
||||
return st
|
||||
return s.withStaleProviderOK("llm", st)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode >= 300 {
|
||||
st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body))
|
||||
return st
|
||||
return s.withStaleProviderOK("llm", st)
|
||||
}
|
||||
st.OK = true
|
||||
s.rememberProviderOK("llm", st.LatencyMS)
|
||||
return st
|
||||
}
|
||||
|
||||
@@ -80,24 +90,63 @@ func (s *Server) checkWhisperX(ctx context.Context) providerStatus {
|
||||
if !st.Configured {
|
||||
return st
|
||||
}
|
||||
start := time.Now()
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/health", nil)
|
||||
if err != nil {
|
||||
st.Error = err.Error()
|
||||
paths := []string{"/health", "/healthz", "/readyz", "/"}
|
||||
var lastErr string
|
||||
for _, path := range paths {
|
||||
cctx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
||||
start := time.Now()
|
||||
req, err := http.NewRequestWithContext(cctx, http.MethodGet, baseURL+path, nil)
|
||||
if err != nil {
|
||||
cancel()
|
||||
lastErr = err.Error()
|
||||
continue
|
||||
}
|
||||
res, err := (&http.Client{Timeout: 2 * time.Second}).Do(req)
|
||||
st.LatencyMS = time.Since(start).Milliseconds()
|
||||
cancel()
|
||||
if err != nil {
|
||||
lastErr = err.Error()
|
||||
continue
|
||||
}
|
||||
body := ""
|
||||
if res.StatusCode >= 300 {
|
||||
body = readSmallBody(res.Body)
|
||||
}
|
||||
_ = res.Body.Close()
|
||||
if res.StatusCode >= 300 {
|
||||
lastErr = fmt.Sprintf("%s http %d: %s", path, res.StatusCode, body)
|
||||
continue
|
||||
}
|
||||
st.OK = true
|
||||
s.rememberProviderOK("whisperx", st.LatencyMS)
|
||||
return st
|
||||
}
|
||||
res, err := http.DefaultClient.Do(req)
|
||||
st.LatencyMS = time.Since(start).Milliseconds()
|
||||
if err != nil {
|
||||
st.Error = err.Error()
|
||||
return st
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode >= 300 {
|
||||
st.Error = fmt.Sprintf("http %d: %s", res.StatusCode, readSmallBody(res.Body))
|
||||
st.Error = lastErr
|
||||
return s.withStaleProviderOK("whisperx", st)
|
||||
}
|
||||
|
||||
func (s *Server) rememberProviderOK(name string, latencyMS int64) {
|
||||
s.providerMu.Lock()
|
||||
defer s.providerMu.Unlock()
|
||||
s.providerOKs[name] = providerHealthSnapshot{at: time.Now().UTC(), latencyMS: latencyMS}
|
||||
}
|
||||
|
||||
func (s *Server) withStaleProviderOK(name string, st providerStatus) providerStatus {
|
||||
s.providerMu.Lock()
|
||||
snap, ok := s.providerOKs[name]
|
||||
s.providerMu.Unlock()
|
||||
if !ok || time.Since(snap.at) > providerStaleOKWindow {
|
||||
return st
|
||||
}
|
||||
st.OK = true
|
||||
st.Stale = true
|
||||
if st.LatencyMS == 0 {
|
||||
st.LatencyMS = snap.latencyMS
|
||||
}
|
||||
st.LastOKAt = snap.at.Format(time.RFC3339)
|
||||
if st.Error != "" {
|
||||
st.Error = "последняя проверка не ответила: " + st.Error
|
||||
}
|
||||
return st
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"ai-service/internal/config"
|
||||
@@ -16,12 +17,14 @@ import (
|
||||
)
|
||||
|
||||
type Server struct {
|
||||
store *store.Store
|
||||
cfg config.Config
|
||||
store *store.Store
|
||||
cfg config.Config
|
||||
providerMu sync.Mutex
|
||||
providerOKs map[string]providerHealthSnapshot
|
||||
}
|
||||
|
||||
func NewServer(store *store.Store, cfg config.Config) http.Handler {
|
||||
return &Server{store: store, cfg: cfg}
|
||||
return &Server{store: store, cfg: cfg, providerOKs: make(map[string]providerHealthSnapshot)}
|
||||
}
|
||||
|
||||
func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
Reference in New Issue
Block a user