subcog/llm/resilience.rs

//! LLM resilience wrapper with circuit breaking and budget instrumentation.

use super::{CaptureAnalysis, LlmProvider};
use crate::{Error, Result};
use std::collections::VecDeque;
use std::sync::Mutex;
use std::time::{Duration, Instant};

/// Resilience configuration for LLM calls.
#[derive(Debug, Clone)]
pub struct LlmResilienceConfig {
    /// Maximum number of retries for retryable failures.
    pub max_retries: u32,
    /// Backoff between retries, in milliseconds.
    pub retry_backoff_ms: u64,
    /// Number of consecutive failures before the circuit opens.
    pub breaker_failure_threshold: u32,
    /// How long the circuit stays open before transitioning to half-open.
    pub breaker_reset_timeout_ms: u64,
    /// Maximum number of trial calls allowed while half-open.
    pub breaker_half_open_max_calls: u32,
    /// Latency budget (SLO) in milliseconds for LLM calls.
    pub latency_slo_ms: u64,
    /// Error budget ratio threshold (0.0 to 1.0).
    pub error_budget_ratio: f64,
    /// Sliding window for the error budget, in seconds.
    pub error_budget_window_secs: u64,
}

impl Default for LlmResilienceConfig {
    fn default() -> Self {
        Self {
            max_retries: 3,
            retry_backoff_ms: 100,
            breaker_failure_threshold: 3,
            breaker_reset_timeout_ms: 30_000,
            breaker_half_open_max_calls: 1,
            latency_slo_ms: 2_000,
            error_budget_ratio: 0.05,
            error_budget_window_secs: 300,
        }
    }
}

impl LlmResilienceConfig {
    /// Builds the default configuration with environment variable overrides applied.
    #[must_use]
    pub fn from_env() -> Self {
        Self::default().with_env_overrides()
    }

    /// Builds resilience configuration from config file settings, falling back
    /// to defaults for unset fields.
    #[must_use]
    pub fn from_config(config: &crate::config::LlmConfig) -> Self {
        let mut settings = Self::default();
        if let Some(max_retries) = config.max_retries {
            settings.max_retries = max_retries;
        }
        if let Some(retry_backoff_ms) = config.retry_backoff_ms {
            settings.retry_backoff_ms = retry_backoff_ms;
        }
        if let Some(threshold) = config.breaker_failure_threshold {
            settings.breaker_failure_threshold = threshold.max(1);
        }
        if let Some(reset_ms) = config.breaker_reset_ms {
            settings.breaker_reset_timeout_ms = reset_ms;
        }
        if let Some(half_open) = config.breaker_half_open_max_calls {
            settings.breaker_half_open_max_calls = half_open.max(1);
        }
        if let Some(latency_slo_ms) = config.latency_slo_ms {
            settings.latency_slo_ms = latency_slo_ms;
        }
        if let Some(ratio) = config.error_budget_ratio {
            settings.error_budget_ratio = ratio.clamp(0.0, 1.0);
        }
        if let Some(window_secs) = config.error_budget_window_secs {
            settings.error_budget_window_secs = window_secs.max(1);
        }
        settings
    }

    /// Applies environment variable overrides; values that fail to parse are ignored.
    #[must_use]
    pub fn with_env_overrides(mut self) -> Self {
        if let Some(parsed) = std::env::var("SUBCOG_LLM_MAX_RETRIES")
            .ok()
            .and_then(|v| v.parse::<u32>().ok())
        {
            self.max_retries = parsed;
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_RETRY_BACKOFF_MS")
            .ok()
            .and_then(|v| v.parse::<u64>().ok())
        {
            self.retry_backoff_ms = parsed;
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_BREAKER_FAILURE_THRESHOLD")
            .ok()
            .and_then(|v| v.parse::<u32>().ok())
        {
            self.breaker_failure_threshold = parsed.max(1);
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_BREAKER_RESET_MS")
            .ok()
            .and_then(|v| v.parse::<u64>().ok())
        {
            self.breaker_reset_timeout_ms = parsed;
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_BREAKER_HALF_OPEN_MAX_CALLS")
            .ok()
            .and_then(|v| v.parse::<u32>().ok())
        {
            self.breaker_half_open_max_calls = parsed.max(1);
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_LATENCY_SLO_MS")
            .ok()
            .and_then(|v| v.parse::<u64>().ok())
        {
            self.latency_slo_ms = parsed;
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_ERROR_BUDGET_RATIO")
            .ok()
            .and_then(|v| v.parse::<f64>().ok())
        {
            self.error_budget_ratio = parsed.clamp(0.0, 1.0);
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_ERROR_BUDGET_WINDOW_SECS")
            .ok()
            .and_then(|v| v.parse::<u64>().ok())
        {
            self.error_budget_window_secs = parsed.max(1);
        }

        self
    }
}
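
// Environment overrides recognized by `with_env_overrides` (see above):
//   SUBCOG_LLM_MAX_RETRIES, SUBCOG_LLM_RETRY_BACKOFF_MS,
//   SUBCOG_LLM_BREAKER_FAILURE_THRESHOLD, SUBCOG_LLM_BREAKER_RESET_MS,
//   SUBCOG_LLM_BREAKER_HALF_OPEN_MAX_CALLS, SUBCOG_LLM_LATENCY_SLO_MS,
//   SUBCOG_LLM_ERROR_BUDGET_RATIO, SUBCOG_LLM_ERROR_BUDGET_WINDOW_SECS
//
// A typical resolution order (a sketch, not a prescribed API; `llm_config`
// stands in for the loaded file config): start from the file, then let the
// environment win:
//
//     let settings = LlmResilienceConfig::from_config(&llm_config).with_env_overrides();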

/// Circuit breaker state machine.
///
/// Transitions:
/// - `Closed` -> `Open` once consecutive failures reach the threshold
/// - `Open` -> `HalfOpen` after the reset timeout elapses
/// - `HalfOpen` -> `Closed` on success, or back to `Open` on failure
#[derive(Debug)]
enum BreakerState {
    Closed { failures: u32 },
    Open { opened_at: Instant },
    HalfOpen { attempts: u32 },
}

#[derive(Debug)]
struct CircuitBreaker {
    state: BreakerState,
    failure_threshold: u32,
    reset_timeout: Duration,
    half_open_max_calls: u32,
}

impl CircuitBreaker {
    fn new(config: &LlmResilienceConfig) -> Self {
        Self {
            state: BreakerState::Closed { failures: 0 },
            failure_threshold: config.breaker_failure_threshold.max(1),
            reset_timeout: Duration::from_millis(config.breaker_reset_timeout_ms),
            half_open_max_calls: config.breaker_half_open_max_calls.max(1),
        }
    }

    /// Returns whether a call may proceed, transitioning `Open` -> `HalfOpen`
    /// once the reset timeout has elapsed.
    fn allow(&mut self) -> bool {
        match self.state {
            BreakerState::Closed { .. } => true,
            BreakerState::Open { opened_at } => {
                if opened_at.elapsed() >= self.reset_timeout {
                    self.state = BreakerState::HalfOpen { attempts: 0 };
                    true
                } else {
                    false
                }
            },
            BreakerState::HalfOpen { ref mut attempts } => {
                if *attempts >= self.half_open_max_calls {
                    false
                } else {
                    *attempts += 1;
                    true
                }
            },
        }
    }

    const fn on_success(&mut self) {
        self.state = BreakerState::Closed { failures: 0 };
    }

    /// Records a failure; returns `true` if this failure tripped the breaker open.
    fn on_failure(&mut self) -> bool {
        match self.state {
            BreakerState::Closed { ref mut failures } => {
                *failures += 1;
                if *failures >= self.failure_threshold {
                    self.state = BreakerState::Open {
                        opened_at: Instant::now(),
                    };
                    return true;
                }
            },
            BreakerState::HalfOpen { .. } => {
                self.state = BreakerState::Open {
                    opened_at: Instant::now(),
                };
                return true;
            },
            BreakerState::Open { .. } => {},
        }
        false
    }

    /// Encodes the state for the `llm_circuit_breaker_state` gauge:
    /// 0 = closed, 1 = open, 2 = half-open.
    const fn state_value(&self) -> u8 {
        match self.state {
            BreakerState::Closed { .. } => 0,
            BreakerState::Open { .. } => 1,
            BreakerState::HalfOpen { .. } => 2,
        }
    }
}

/// Sliding-window tracker for the error budget: timestamps of recent requests
/// and errors, evicted once they age past the window.
#[derive(Debug)]
struct BudgetTracker {
    window: Duration,
    requests: VecDeque<Instant>,
    errors: VecDeque<Instant>,
}

impl BudgetTracker {
    const fn new(window: Duration) -> Self {
        Self {
            window,
            requests: VecDeque::new(),
            errors: VecDeque::new(),
        }
    }

    /// Records a request (and optionally an error) at `now` and returns the
    /// current error ratio within the window.
    fn record(&mut self, now: Instant, is_error: bool) -> f64 {
        self.requests.push_back(now);
        if is_error {
            self.errors.push_back(now);
        }
        self.evict_expired(now);
        if self.requests.is_empty() {
            0.0
        } else {
            let error_count = u32::try_from(self.errors.len()).unwrap_or(u32::MAX);
            let request_count = u32::try_from(self.requests.len()).unwrap_or(u32::MAX);
            f64::from(error_count) / f64::from(request_count)
        }
    }

    fn evict_expired(&mut self, now: Instant) {
        // If the window reaches further back than the clock's origin, nothing
        // can have expired yet. (Falling back to `now` as the threshold would
        // wrongly evict the entire history.)
        let Some(threshold) = now.checked_sub(self.window) else {
            return;
        };
        while self
            .requests
            .front()
            .is_some_and(|timestamp| *timestamp < threshold)
        {
            self.requests.pop_front();
        }
        while self
            .errors
            .front()
            .is_some_and(|timestamp| *timestamp < threshold)
        {
            self.errors.pop_front();
        }
    }
}
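
// Worked example: three requests land in the same window and one of them is an
// error, so `record` reports 1/3 ≈ 0.33; once the older entries age past the
// window, the ratio is recomputed from whatever remains.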

/// LLM provider wrapper with circuit breaker and budget instrumentation.
pub struct ResilientLlmProvider<P: LlmProvider> {
    inner: P,
    config: LlmResilienceConfig,
    breaker: Mutex<CircuitBreaker>,
    budget: Mutex<BudgetTracker>,
}

/// Outcome of a failed attempt: retry after backoff, or fail immediately.
enum FailureAction {
    Retry(Error),
    Fail(Error),
}

impl<P: LlmProvider> ResilientLlmProvider<P> {
    /// Creates a new resilient LLM provider wrapper.
    #[must_use]
    pub fn new(inner: P, config: LlmResilienceConfig) -> Self {
        let window = Duration::from_secs(config.error_budget_window_secs.max(1));
        let breaker = CircuitBreaker::new(&config);
        Self {
            inner,
            config,
            breaker: Mutex::new(breaker),
            budget: Mutex::new(BudgetTracker::new(window)),
        }
    }

    fn execute<T, F>(&self, operation: &'static str, mut call: F) -> Result<T>
    where
        F: FnMut() -> Result<T>,
    {
        let provider: &'static str = self.inner.name();
        let span = tracing::info_span!(
            "llm.request",
            provider = provider,
            operation = operation,
            status = tracing::field::Empty,
            error = tracing::field::Empty
        );
        let _enter = span.enter();

        let mut breaker = self
            .breaker
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        if !breaker.allow() {
            let breaker_state = breaker.state_value();
            drop(breaker);
            Self::record_breaker_state(provider, breaker_state);
            span.record("status", "circuit_open");
            metrics::counter!(
                "llm_requests_total",
                "provider" => provider,
                "operation" => operation,
                "status" => "circuit_open"
            )
            .increment(1);
            metrics::counter!(
                "llm_circuit_breaker_rejections_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
            return Err(Error::OperationFailed {
                operation: format!("llm_{operation}"),
                cause: "circuit breaker open".to_string(),
            });
        }
        drop(breaker);

        let mut attempts = 0;
        let max_attempts = self.config.max_retries + 1;
        let mut last_error = None;

        while attempts < max_attempts {
            attempts += 1;
            let attempt_start = Instant::now();
            let result = call();
            let elapsed = attempt_start.elapsed();

            // `match` (rather than `if let Ok`) so the error arm can still
            // consume `result`; an `if let` pattern would move it either way.
            match result {
                Ok(value) => {
                    self.record_success(provider, operation, elapsed);
                    let breaker_state = self.record_breaker_success_state();
                    Self::record_breaker_state(provider, breaker_state);
                    span.record("status", "success");
                    return Ok(value);
                },
                Err(err) => match self
                    .handle_failure(provider, operation, err, elapsed, attempts, max_attempts)
                {
                    FailureAction::Retry(err) => {
                        last_error = Some(err);
                    },
                    FailureAction::Fail(err) => return Err(err),
                },
            }
        }

        Err(last_error.unwrap_or_else(|| Error::OperationFailed {
            operation: format!("llm_{operation}"),
            cause: "exhausted retries".to_string(),
        }))
    }

    fn record_success(&self, provider: &'static str, operation: &'static str, elapsed: Duration) {
        self.record_request_metrics(provider, operation, elapsed, false, false);
        let ratio = self
            .budget
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .record(Instant::now(), false);
        self.record_budget_metrics(provider, operation, ratio);
    }

    fn handle_failure(
        &self,
        provider: &'static str,
        operation: &'static str,
        err: Error,
        elapsed: Duration,
        attempts: u32,
        max_attempts: u32,
    ) -> FailureAction {
        let is_timeout = is_timeout_error(&err);
        // CHAOS-CRIT-001/002/003: Retry on all transient errors, not just timeouts
        let retryable = is_retryable_error(&err) && attempts < max_attempts;

        self.record_failure(provider, operation, elapsed, is_timeout, retryable);
        let mut breaker = self
            .breaker
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        let tripped = breaker.on_failure();
        let breaker_state = breaker.state_value();
        drop(breaker);
        Self::record_breaker_state(provider, breaker_state);
        if tripped {
            metrics::counter!(
                "llm_circuit_breaker_trips_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
            tracing::warn!(
                "LLM circuit breaker opened for provider={provider} operation={operation}"
            );
        }

        let status = if is_timeout { "timeout" } else { "error" };
        let span = tracing::Span::current();
        span.record("status", status);
        span.record("error", tracing::field::display(&err));

        if retryable {
            metrics::counter!(
                "llm_retries_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
            if self.config.retry_backoff_ms > 0 {
                let delay = Self::calculate_retry_delay(self.config.retry_backoff_ms, attempts);
                std::thread::sleep(Duration::from_millis(delay));
            }
            return FailureAction::Retry(err);
        }

        FailureAction::Fail(err)
    }

    fn record_failure(
        &self,
        provider: &'static str,
        operation: &'static str,
        elapsed: Duration,
        is_timeout: bool,
        retryable: bool,
    ) {
        self.record_request_metrics(provider, operation, elapsed, true, is_timeout);
        let ratio = self
            .budget
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .record(Instant::now(), true);
        self.record_budget_metrics(provider, operation, ratio);

        if retryable {
            tracing::warn!(
                "Retrying LLM call provider={provider} operation={operation} elapsed_ms={}",
                elapsed.as_millis()
            );
        }
    }

    fn record_breaker_success_state(&self) -> u8 {
        let mut breaker = self
            .breaker
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        breaker.on_success();
        breaker.state_value()
    }

    fn record_request_metrics(
        &self,
        provider: &'static str,
        operation: &'static str,
        elapsed: Duration,
        is_error: bool,
        is_timeout: bool,
    ) {
        let status = if is_timeout {
            "timeout"
        } else if is_error {
            "error"
        } else {
            "success"
        };

        metrics::counter!(
            "llm_requests_total",
            "provider" => provider,
            "operation" => operation,
            "status" => status
        )
        .increment(1);
        metrics::histogram!(
            "llm_request_duration_ms",
            "provider" => provider,
            "operation" => operation,
            "status" => status
        )
        .record(elapsed.as_secs_f64() * 1000.0);

        if is_timeout {
            metrics::counter!(
                "llm_timeouts_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
        }

        let elapsed_ms = u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX);
        if self.config.latency_slo_ms > 0 && elapsed_ms > self.config.latency_slo_ms {
            metrics::counter!(
                "llm_latency_budget_exceeded_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
        }
    }

    fn record_budget_metrics(&self, provider: &'static str, operation: &'static str, ratio: f64) {
        metrics::gauge!(
            "llm_error_budget_ratio",
            "provider" => provider,
            "operation" => operation
        )
        .set(ratio);

        if ratio > self.config.error_budget_ratio {
            metrics::counter!(
                "llm_error_budget_exceeded_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
        }
    }

    fn record_breaker_state(provider: &'static str, breaker_state: u8) {
        metrics::gauge!("llm_circuit_breaker_state", "provider" => provider)
            .set(f64::from(breaker_state));
    }

    /// Calculates the retry delay with exponential backoff and jitter
    /// (CHAOS-CRIT-001/002/003).
    ///
    /// Formula: `base_delay * 2^(attempt-1) + jitter`
    /// - Exponential growth prevents overwhelming a recovering service
    /// - Jitter (0-50% of the delay) prevents a thundering herd
    /// - The exponential component is capped at 10 seconds, so the worst-case
    ///   total (cap plus jitter) is 15 seconds
    fn calculate_retry_delay(base_delay_ms: u64, attempt: u32) -> u64 {
        // Exponential: base * 2^(attempt-1), so attempt 1 = base, attempt 2 = 2x, attempt 3 = 4x
        let exponent = attempt.saturating_sub(1);
        let exponential_delay = base_delay_ms.saturating_mul(1u64 << exponent.min(10));

        // Cap the exponential component at 10 seconds
        let capped_delay = exponential_delay.min(10_000);

        // Add jitter: use system time nanoseconds as pseudo-random source
        // Jitter range: 0-50% of capped delay to prevent thundering herd
        let jitter = Self::calculate_jitter(capped_delay);
        let total_delay = capped_delay.saturating_add(jitter);

        tracing::debug!(
            "Retry backoff: attempt={}, base={}ms, exponential={}ms, jitter={}ms, total={}ms",
            attempt,
            base_delay_ms,
            capped_delay,
            jitter,
            total_delay
        );

        total_delay
    }

    /// Calculates jitter for retry backoff using system time as a pseudo-random source.
    fn calculate_jitter(delay_ms: u64) -> u64 {
        let jitter_max = delay_ms / 2;
        if jitter_max == 0 {
            return 0;
        }

        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.subsec_nanos())
            .unwrap_or(0);

        u64::from(nanos) % jitter_max
    }
}

impl<P: LlmProvider> LlmProvider for ResilientLlmProvider<P> {
    fn name(&self) -> &'static str {
        self.inner.name()
    }

    fn complete(&self, prompt: &str) -> Result<String> {
        self.execute("complete", || self.inner.complete(prompt))
    }

    fn complete_with_system(&self, system: &str, user: &str) -> Result<String> {
        self.execute("complete_with_system", || {
            self.inner.complete_with_system(system, user)
        })
    }

    fn analyze_for_capture(&self, content: &str) -> Result<CaptureAnalysis> {
        self.execute("analyze_for_capture", || {
            self.inner.analyze_for_capture(content)
        })
    }
}
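
// A minimal usage sketch (the concrete provider value here is hypothetical;
// any `LlmProvider` works the same way):
//
//     let provider = ResilientLlmProvider::new(
//         some_provider, // e.g. a configured API-backed provider
//         LlmResilienceConfig::from_env(),
//     );
//     let reply = provider.complete("Summarize this capture")?;
//
// Calls pass through the circuit breaker and retry loop transparently;
// callers see only the final `Result`.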

/// Checks if an error is a timeout error.
fn is_timeout_error(err: &Error) -> bool {
    match err {
        Error::OperationFailed { cause, .. } => {
            let lower = cause.to_lowercase();
            lower.contains("timeout")
                || lower.contains("timed out")
                || lower.contains("deadline")
                || lower.contains("elapsed")
        },
        _ => false,
    }
}

/// Checks if an error is retryable (transient failures that may succeed on retry).
///
/// Retryable errors include:
/// - Timeouts
/// - Connection errors (network issues, DNS failures)
/// - Server errors (5xx)
/// - Rate limiting (429)
fn is_retryable_error(err: &Error) -> bool {
    match err {
        Error::OperationFailed { cause, .. } => {
            let lower = cause.to_lowercase();
            // Timeout errors
            lower.contains("timeout")
                || lower.contains("timed out")
                || lower.contains("deadline")
                || lower.contains("elapsed")
                // Connection errors
                || lower.contains("connect")
                || lower.contains("connection")
                || lower.contains("network")
                || lower.contains("dns")
                || lower.contains("resolve")
                // Server errors (5xx)
                || lower.contains("500")
                || lower.contains("502")
                || lower.contains("503")
                || lower.contains("504")
                || lower.contains("internal server error")
                || lower.contains("bad gateway")
                || lower.contains("service unavailable")
                || lower.contains("gateway timeout")
                // Rate limiting
                || lower.contains("429")
                || lower.contains("rate limit")
                || lower.contains("too many requests")
                || lower.contains("overloaded")
        },
        _ => false,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // =========================================================================
    // Circuit Breaker Tests
    // =========================================================================

    #[test]
    fn test_circuit_breaker_starts_closed() {
        let config = LlmResilienceConfig::default();
        let breaker = CircuitBreaker::new(&config);
        assert_eq!(breaker.state_value(), 0); // Closed = 0
    }

    #[test]
    fn test_circuit_breaker_allows_calls_when_closed() {
        let config = LlmResilienceConfig::default();
        let mut breaker = CircuitBreaker::new(&config);
        assert!(breaker.allow());
        assert!(breaker.allow());
        assert!(breaker.allow());
    }

    #[test]
    fn test_circuit_breaker_opens_after_threshold_failures() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 3,
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        // First two failures don't trip the breaker
        breaker.on_failure();
        assert_eq!(breaker.state_value(), 0); // Still closed
        breaker.on_failure();
        assert_eq!(breaker.state_value(), 0); // Still closed

        // Third failure trips the breaker
        let tripped = breaker.on_failure();
        assert!(tripped);
        assert_eq!(breaker.state_value(), 1); // Open = 1
    }

    #[test]
    fn test_circuit_breaker_rejects_when_open() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 10_000, // Long timeout
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        // Trip the breaker
        breaker.on_failure();
        assert_eq!(breaker.state_value(), 1); // Open

        // Should reject calls
        assert!(!breaker.allow());
        assert!(!breaker.allow());
    }

    #[test]
    fn test_circuit_breaker_transitions_to_half_open_after_timeout() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 0, // Immediate reset
            breaker_half_open_max_calls: 1,
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        // Trip the breaker
        breaker.on_failure();
        assert_eq!(breaker.state_value(), 1); // Open

        // Should allow a call after the timeout (immediate since reset_timeout_ms = 0)
        std::thread::sleep(Duration::from_millis(1));
        assert!(breaker.allow());
        assert_eq!(breaker.state_value(), 2); // Half-open = 2
    }

    #[test]
    fn test_circuit_breaker_half_open_limits_calls() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 0,
            breaker_half_open_max_calls: 2, // Allow 2 more calls in half-open state
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        // Trip and transition to half-open
        breaker.on_failure();
        std::thread::sleep(Duration::from_millis(1));

        // First call transitions to half-open and is allowed (free transition call)
        assert!(breaker.allow());
        assert_eq!(breaker.state_value(), 2); // Half-open

        // Two more calls allowed (attempts = 1, 2)
        assert!(breaker.allow()); // attempts = 1
        assert!(breaker.allow()); // attempts = 2

        // Next call rejected (attempts >= max_calls)
        assert!(!breaker.allow());
    }

    #[test]
    fn test_circuit_breaker_closes_on_success() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 0,
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        // Trip and transition to half-open
        breaker.on_failure();
        std::thread::sleep(Duration::from_millis(1));
        breaker.allow();

        // Success closes the breaker
        breaker.on_success();
        assert_eq!(breaker.state_value(), 0); // Closed
    }

    #[test]
    fn test_circuit_breaker_reopens_on_failure_when_half_open() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 0,
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        // Trip and transition to half-open
        breaker.on_failure();
        std::thread::sleep(Duration::from_millis(1));
        breaker.allow();
        assert_eq!(breaker.state_value(), 2); // Half-open

        // Failure reopens the breaker
        let tripped = breaker.on_failure();
        assert!(tripped);
        assert_eq!(breaker.state_value(), 1); // Open
    }

    // =========================================================================
    // Budget Tracker Tests
    // =========================================================================

    #[test]
    fn test_budget_tracker_starts_empty() {
        let tracker = BudgetTracker::new(Duration::from_secs(60));
        assert!(tracker.requests.is_empty());
        assert!(tracker.errors.is_empty());
    }

    #[test]
    fn test_budget_tracker_records_successful_requests() {
        let mut tracker = BudgetTracker::new(Duration::from_secs(60));
        let now = Instant::now();

        let ratio = tracker.record(now, false);
        assert!(ratio.abs() < f64::EPSILON); // No errors

        let ratio = tracker.record(now, false);
        assert!(ratio.abs() < f64::EPSILON); // Still no errors
    }

    #[test]
    fn test_budget_tracker_records_error_requests() {
        let mut tracker = BudgetTracker::new(Duration::from_secs(60));
        let now = Instant::now();

        tracker.record(now, false);
        tracker.record(now, false);
        let ratio = tracker.record(now, true);

        // 1 error out of 3 requests = 0.333...
        assert!((ratio - 0.333).abs() < 0.01);
    }

    #[test]
    fn test_budget_tracker_calculates_error_ratio() {
        let mut tracker = BudgetTracker::new(Duration::from_secs(60));
        let now = Instant::now();

        let ratio = tracker.record(now, true);
        assert!((ratio - 1.0).abs() < f64::EPSILON); // 1/1

        let ratio = tracker.record(now, false);
        assert!((ratio - 0.5).abs() < f64::EPSILON); // 1/2

        let ratio = tracker.record(now, true);
        // 2 errors / 3 total = 0.666...
        assert!((ratio - 0.666).abs() < 0.01);
    }

    #[test]
    fn test_budget_tracker_evicts_expired_entries() {
        // Very short window for testing
        let mut tracker = BudgetTracker::new(Duration::from_millis(10));
        let now = Instant::now();

        tracker.record(now, true);
        assert_eq!(tracker.requests.len(), 1);
        assert_eq!(tracker.errors.len(), 1);

        // Wait for entries to expire
        std::thread::sleep(Duration::from_millis(15));

        // Recording a new request should evict old entries
        let new_now = Instant::now();
        tracker.record(new_now, false);

        // Old entries should be evicted
        assert_eq!(tracker.requests.len(), 1);
        assert_eq!(tracker.errors.len(), 0);
    }

    // =========================================================================
    // Configuration Tests
    // =========================================================================

    #[test]
    fn test_config_default_values() {
        let config = LlmResilienceConfig::default();
        assert_eq!(config.max_retries, 3); // CHAOS-CRIT-001/002/003: 3 retries by default
        assert_eq!(config.retry_backoff_ms, 100);
        assert_eq!(config.breaker_failure_threshold, 3);
        assert_eq!(config.breaker_reset_timeout_ms, 30_000);
        assert_eq!(config.breaker_half_open_max_calls, 1);
        assert_eq!(config.latency_slo_ms, 2_000);
        assert!((config.error_budget_ratio - 0.05).abs() < 0.001);
        assert_eq!(config.error_budget_window_secs, 300);
    }

    #[test]
    fn test_config_from_config_file() {
        let llm_config = crate::config::LlmConfig {
            max_retries: Some(5),
            retry_backoff_ms: Some(200),
            breaker_failure_threshold: Some(10),
            breaker_reset_ms: Some(60_000),
            breaker_half_open_max_calls: Some(3),
            latency_slo_ms: Some(5_000),
            error_budget_ratio: Some(0.10),
            error_budget_window_secs: Some(600),
            ..Default::default()
        };

        let config = LlmResilienceConfig::from_config(&llm_config);
        assert_eq!(config.max_retries, 5);
        assert_eq!(config.retry_backoff_ms, 200);
        assert_eq!(config.breaker_failure_threshold, 10);
        assert_eq!(config.breaker_reset_timeout_ms, 60_000);
        assert_eq!(config.breaker_half_open_max_calls, 3);
        assert_eq!(config.latency_slo_ms, 5_000);
        assert!((config.error_budget_ratio - 0.10).abs() < 0.001);
        assert_eq!(config.error_budget_window_secs, 600);
    }

    #[test]
    fn test_config_clamps_values() {
        let llm_config = crate::config::LlmConfig {
            breaker_failure_threshold: Some(0),   // Should be clamped to 1
            breaker_half_open_max_calls: Some(0), // Should be clamped to 1
            error_budget_ratio: Some(2.0),        // Should be clamped to 1.0
            error_budget_window_secs: Some(0),    // Should be clamped to 1
            ..Default::default()
        };

        let config = LlmResilienceConfig::from_config(&llm_config);
        assert_eq!(config.breaker_failure_threshold, 1);
        assert_eq!(config.breaker_half_open_max_calls, 1);
        assert!((config.error_budget_ratio - 1.0).abs() < 0.001);
        assert_eq!(config.error_budget_window_secs, 1);
    }

    // =========================================================================
    // Error Detection Tests
    // =========================================================================

    #[test]
    fn test_is_timeout_error_detects_timeout() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Connection timeout".to_string(),
        };
        assert!(is_timeout_error(&err));
    }

    #[test]
    fn test_is_timeout_error_detects_timed_out() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Request timed out".to_string(),
        };
        assert!(is_timeout_error(&err));
    }

    #[test]
    fn test_is_timeout_error_detects_deadline() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Deadline exceeded".to_string(),
        };
        assert!(is_timeout_error(&err));
    }

    #[test]
    fn test_is_timeout_error_detects_elapsed() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Time elapsed".to_string(),
        };
        assert!(is_timeout_error(&err));
    }

    #[test]
    fn test_is_timeout_error_returns_false_for_other_errors() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Connection refused".to_string(),
        };
        assert!(!is_timeout_error(&err));
    }

    #[test]
    fn test_is_timeout_error_is_case_insensitive() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "TIMEOUT ERROR".to_string(),
        };
        assert!(is_timeout_error(&err));
    }

    // =========================================================================
    // Retryable Error Detection Tests (CHAOS-CRIT-001/002/003)
    // =========================================================================

    #[test]
    fn test_is_retryable_error_detects_connection_errors() {
        let test_cases = [
            "connect error: connection refused",
            "Connection reset by peer",
            "network error: no route to host",
            "DNS lookup failed",
            "Failed to resolve hostname",
        ];

        for cause in test_cases {
            let err = Error::OperationFailed {
                operation: "test".to_string(),
                cause: cause.to_string(),
            };
            assert!(is_retryable_error(&err), "Should be retryable: {cause}");
        }
    }

    #[test]
    fn test_is_retryable_error_detects_server_errors() {
        let test_cases = [
            "API returned status: 500",
            "502 Bad Gateway",
            "503 Service Unavailable",
            "504 Gateway Timeout",
            "Internal Server Error",
            "Bad Gateway error",
            "Service unavailable",
            "Gateway timeout exceeded",
        ];

        for cause in test_cases {
            let err = Error::OperationFailed {
                operation: "test".to_string(),
                cause: cause.to_string(),
            };
            assert!(is_retryable_error(&err), "Should be retryable: {cause}");
        }
    }

    #[test]
    fn test_is_retryable_error_detects_rate_limiting() {
        let test_cases = [
            "API returned status: 429",
            "Rate limit exceeded",
            "Too many requests",
            "Server overloaded",
        ];

        for cause in test_cases {
            let err = Error::OperationFailed {
                operation: "test".to_string(),
                cause: cause.to_string(),
            };
            assert!(is_retryable_error(&err), "Should be retryable: {cause}");
        }
    }

    #[test]
    fn test_is_retryable_error_includes_timeout_errors() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Request timeout".to_string(),
        };
        assert!(is_retryable_error(&err));
    }

    #[test]
    fn test_is_retryable_error_returns_false_for_client_errors() {
        let test_cases = [
            "API returned status: 400 - Bad Request",
            "401 Unauthorized",
            "403 Forbidden",
            "404 Not Found",
            "Invalid API key format",
            "Malformed JSON in request",
        ];

        for cause in test_cases {
            let err = Error::OperationFailed {
                operation: "test".to_string(),
                cause: cause.to_string(),
            };
            assert!(
                !is_retryable_error(&err),
                "Should NOT be retryable: {cause}"
            );
        }
    }

    #[test]
    fn test_is_retryable_error_is_case_insensitive() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "CONNECTION REFUSED".to_string(),
        };
        assert!(is_retryable_error(&err));
    }
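
    // =========================================================================
    // Resilient Wrapper Tests (illustrative sketch)
    // =========================================================================
    //
    // The tests below are a minimal end-to-end sketch of the wrapper, assuming
    // `LlmProvider` requires no methods beyond the four implemented above.
    // `FlakyProvider` is a hypothetical mock that fails a configurable number
    // of times before succeeding.

    use std::sync::atomic::{AtomicU32, Ordering};

    struct FlakyProvider {
        failures_remaining: AtomicU32,
        calls: AtomicU32,
    }

    impl FlakyProvider {
        fn new(failures: u32) -> Self {
            Self {
                failures_remaining: AtomicU32::new(failures),
                calls: AtomicU32::new(0),
            }
        }
    }

    impl LlmProvider for FlakyProvider {
        fn name(&self) -> &'static str {
            "flaky"
        }

        fn complete(&self, _prompt: &str) -> Result<String> {
            self.calls.fetch_add(1, Ordering::SeqCst);
            if self.failures_remaining.load(Ordering::SeqCst) > 0 {
                self.failures_remaining.fetch_sub(1, Ordering::SeqCst);
                return Err(Error::OperationFailed {
                    operation: "complete".to_string(),
                    cause: "503 Service Unavailable".to_string(), // retryable
                });
            }
            Ok("ok".to_string())
        }

        fn complete_with_system(&self, _system: &str, user: &str) -> Result<String> {
            self.complete(user)
        }

        fn analyze_for_capture(&self, _content: &str) -> Result<CaptureAnalysis> {
            Err(Error::OperationFailed {
                operation: "analyze_for_capture".to_string(),
                cause: "not supported by this mock".to_string(),
            })
        }
    }

    #[test]
    fn test_wrapper_retries_transient_errors_until_success() {
        let config = LlmResilienceConfig {
            max_retries: 3,
            retry_backoff_ms: 0, // no sleeping in tests
            ..Default::default()
        };
        let provider = ResilientLlmProvider::new(FlakyProvider::new(2), config);

        assert_eq!(provider.complete("hello").unwrap(), "ok");
        // Two failed attempts plus the final success
        assert_eq!(provider.inner.calls.load(Ordering::SeqCst), 3);
    }

    #[test]
    fn test_wrapper_rejects_calls_while_circuit_is_open() {
        let config = LlmResilienceConfig {
            max_retries: 0,
            retry_backoff_ms: 0,
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 60_000,
            ..Default::default()
        };
        let provider = ResilientLlmProvider::new(FlakyProvider::new(u32::MAX), config);

        // The first call fails and trips the breaker.
        assert!(provider.complete("hello").is_err());
        let calls_after_trip = provider.inner.calls.load(Ordering::SeqCst);

        // The second call is rejected without reaching the inner provider.
        assert!(provider.complete("hello").is_err());
        assert_eq!(provider.inner.calls.load(Ordering::SeqCst), calls_after_trip);
    }

    #[test]
    fn test_calculate_retry_delay_grows_and_stays_capped() {
        // Exponential growth from the base, plus up to 50% jitter; the
        // exponential part is capped at 10s (worst case 15s total).
        let d1 = ResilientLlmProvider::<FlakyProvider>::calculate_retry_delay(100, 1);
        let d3 = ResilientLlmProvider::<FlakyProvider>::calculate_retry_delay(100, 3);
        assert!((100..150).contains(&d1)); // base 100ms + jitter < 50ms
        assert!((400..600).contains(&d3)); // 4x base + jitter < 200ms
        let capped = ResilientLlmProvider::<FlakyProvider>::calculate_retry_delay(10_000, 10);
        assert!(capped <= 15_000);
    }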
}