use super::{CaptureAnalysis, LlmProvider};
use crate::{Error, Result};
use std::collections::VecDeque;
use std::sync::Mutex;
use std::time::{Duration, Instant};

/// Resilience settings applied around every LLM call: retry policy, circuit
/// breaker thresholds, and latency/error-budget SLO tracking.
#[derive(Debug, Clone)]
pub struct LlmResilienceConfig {
    /// Maximum number of retries after the initial attempt.
    pub max_retries: u32,
    /// Base backoff between retries, in milliseconds; doubled per attempt with jitter.
    pub retry_backoff_ms: u64,
    /// Consecutive failures before the circuit breaker opens.
    pub breaker_failure_threshold: u32,
    /// How long the breaker stays open before admitting half-open probes, in milliseconds.
    pub breaker_reset_timeout_ms: u64,
    /// Maximum number of probe calls admitted while the breaker is half-open.
    pub breaker_half_open_max_calls: u32,
    /// Per-request latency objective, in milliseconds; breaches increment a counter.
    pub latency_slo_ms: u64,
    /// Acceptable error ratio (0.0..=1.0) within the tracking window.
    pub error_budget_ratio: f64,
    /// Length of the sliding window used for error-budget accounting, in seconds.
    pub error_budget_window_secs: u64,
}

impl Default for LlmResilienceConfig {
    fn default() -> Self {
        Self {
            max_retries: 3,
            retry_backoff_ms: 100,
            breaker_failure_threshold: 3,
            breaker_reset_timeout_ms: 30_000,
            breaker_half_open_max_calls: 1,
            latency_slo_ms: 2_000,
            error_budget_ratio: 0.05,
            error_budget_window_secs: 300,
        }
    }
}

impl LlmResilienceConfig {
    /// Builds the default configuration with environment overrides applied.
    #[must_use]
    pub fn from_env() -> Self {
        Self::default().with_env_overrides()
    }

    /// Builds a configuration from the loaded file config, falling back to
    /// defaults for unset fields and clamping values to sane ranges.
    #[must_use]
    pub fn from_config(config: &crate::config::LlmConfig) -> Self {
        let mut settings = Self::default();
        if let Some(max_retries) = config.max_retries {
            settings.max_retries = max_retries;
        }
        if let Some(retry_backoff_ms) = config.retry_backoff_ms {
            settings.retry_backoff_ms = retry_backoff_ms;
        }
        if let Some(threshold) = config.breaker_failure_threshold {
            settings.breaker_failure_threshold = threshold.max(1);
        }
        if let Some(reset_ms) = config.breaker_reset_ms {
            settings.breaker_reset_timeout_ms = reset_ms;
        }
        if let Some(half_open) = config.breaker_half_open_max_calls {
            settings.breaker_half_open_max_calls = half_open.max(1);
        }
        if let Some(latency_slo_ms) = config.latency_slo_ms {
            settings.latency_slo_ms = latency_slo_ms;
        }
        if let Some(ratio) = config.error_budget_ratio {
            settings.error_budget_ratio = ratio.clamp(0.0, 1.0);
        }
        if let Some(window_secs) = config.error_budget_window_secs {
            settings.error_budget_window_secs = window_secs.max(1);
        }
        settings
    }

    /// Applies `SUBCOG_LLM_*` environment variable overrides where set and parseable.
    #[must_use]
    pub fn with_env_overrides(mut self) -> Self {
        if let Some(parsed) = std::env::var("SUBCOG_LLM_MAX_RETRIES")
            .ok()
            .and_then(|v| v.parse::<u32>().ok())
        {
            self.max_retries = parsed;
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_RETRY_BACKOFF_MS")
            .ok()
            .and_then(|v| v.parse::<u64>().ok())
        {
            self.retry_backoff_ms = parsed;
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_BREAKER_FAILURE_THRESHOLD")
            .ok()
            .and_then(|v| v.parse::<u32>().ok())
        {
            self.breaker_failure_threshold = parsed.max(1);
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_BREAKER_RESET_MS")
            .ok()
            .and_then(|v| v.parse::<u64>().ok())
        {
            self.breaker_reset_timeout_ms = parsed;
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_BREAKER_HALF_OPEN_MAX_CALLS")
            .ok()
            .and_then(|v| v.parse::<u32>().ok())
        {
            self.breaker_half_open_max_calls = parsed.max(1);
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_LATENCY_SLO_MS")
            .ok()
            .and_then(|v| v.parse::<u64>().ok())
        {
            self.latency_slo_ms = parsed;
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_ERROR_BUDGET_RATIO")
            .ok()
            .and_then(|v| v.parse::<f64>().ok())
        {
            self.error_budget_ratio = parsed.clamp(0.0, 1.0);
        }
        if let Some(parsed) = std::env::var("SUBCOG_LLM_ERROR_BUDGET_WINDOW_SECS")
            .ok()
            .and_then(|v| v.parse::<u64>().ok())
        {
            self.error_budget_window_secs = parsed.max(1);
        }

        self
    }
}
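
// How the three configuration sources combine, as a sketch (`llm_config`
// below stands in for the loaded `crate::config::LlmConfig` value):
//
//     let resilience = LlmResilienceConfig::from_config(&llm_config)
//         .with_env_overrides();
//
// Defaults are overridden by file configuration, and `SUBCOG_LLM_*`
// environment variables take precedence over both.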

/// Circuit-breaker state: `Closed` counts consecutive failures, `Open`
/// rejects calls until the reset timeout elapses, and `HalfOpen` admits a
/// limited number of probe calls.
#[derive(Debug)]
enum BreakerState {
    Closed { failures: u32 },
    Open { opened_at: Instant },
    HalfOpen { attempts: u32 },
}

/// Minimal circuit breaker guarding calls to the underlying provider.
#[derive(Debug)]
struct CircuitBreaker {
    state: BreakerState,
    failure_threshold: u32,
    reset_timeout: Duration,
    half_open_max_calls: u32,
}

impl CircuitBreaker {
    fn new(config: &LlmResilienceConfig) -> Self {
        Self {
            state: BreakerState::Closed { failures: 0 },
            failure_threshold: config.breaker_failure_threshold.max(1),
            reset_timeout: Duration::from_millis(config.breaker_reset_timeout_ms),
            half_open_max_calls: config.breaker_half_open_max_calls.max(1),
        }
    }

    /// Returns `true` if a call may proceed, moving `Open` to `HalfOpen` once
    /// the reset timeout has elapsed.
    fn allow(&mut self) -> bool {
        match self.state {
            BreakerState::Closed { .. } => true,
            BreakerState::Open { opened_at } => {
                if opened_at.elapsed() >= self.reset_timeout {
                    self.state = BreakerState::HalfOpen { attempts: 0 };
                    true
                } else {
                    false
                }
            },
            BreakerState::HalfOpen { ref mut attempts } => {
                if *attempts >= self.half_open_max_calls {
                    false
                } else {
                    *attempts += 1;
                    true
                }
            },
        }
    }

    /// Records a success, closing the breaker.
    const fn on_success(&mut self) {
        self.state = BreakerState::Closed { failures: 0 };
    }

    /// Records a failure; returns `true` if this failure opened the breaker.
    fn on_failure(&mut self) -> bool {
        match self.state {
            BreakerState::Closed { ref mut failures } => {
                *failures += 1;
                if *failures >= self.failure_threshold {
                    self.state = BreakerState::Open {
                        opened_at: Instant::now(),
                    };
                    return true;
                }
            },
            BreakerState::HalfOpen { .. } => {
                self.state = BreakerState::Open {
                    opened_at: Instant::now(),
                };
                return true;
            },
            BreakerState::Open { .. } => {},
        }
        false
    }

    /// Numeric encoding for the state gauge: 0 = closed, 1 = open, 2 = half-open.
    const fn state_value(&self) -> u8 {
        match self.state {
            BreakerState::Closed { .. } => 0,
            BreakerState::Open { .. } => 1,
            BreakerState::HalfOpen { .. } => 2,
        }
    }
}
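
// How the breaker is driven by `execute`, as a minimal sketch (the
// `call_provider` closure below is illustrative, not a real function):
//
//     if breaker.allow() {
//         match call_provider() {
//             Ok(_) => breaker.on_success(),
//             Err(_) => {
//                 let _tripped = breaker.on_failure();
//             },
//         }
//     } else {
//         // fail fast with a "circuit breaker open" error
//     }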

/// Sliding-window tracker for the error-budget ratio (errors / requests).
#[derive(Debug)]
struct BudgetTracker {
    window: Duration,
    requests: VecDeque<Instant>,
    errors: VecDeque<Instant>,
}

impl BudgetTracker {
    const fn new(window: Duration) -> Self {
        Self {
            window,
            requests: VecDeque::new(),
            errors: VecDeque::new(),
        }
    }

    /// Records a request (and error, if any) at `now`, evicts entries older
    /// than the window, and returns the current error ratio.
    fn record(&mut self, now: Instant, is_error: bool) -> f64 {
        self.requests.push_back(now);
        if is_error {
            self.errors.push_back(now);
        }
        self.evict_expired(now);
        if self.requests.is_empty() {
            0.0
        } else {
            let error_count = u32::try_from(self.errors.len()).unwrap_or(u32::MAX);
            let request_count = u32::try_from(self.requests.len()).unwrap_or(u32::MAX);
            f64::from(error_count) / f64::from(request_count)
        }
    }

    fn evict_expired(&mut self, now: Instant) {
        let threshold = now.checked_sub(self.window).unwrap_or(now);
        while self
            .requests
            .front()
            .is_some_and(|timestamp| *timestamp < threshold)
        {
            self.requests.pop_front();
        }
        while self
            .errors
            .front()
            .is_some_and(|timestamp| *timestamp < threshold)
        {
            self.errors.pop_front();
        }
    }
}
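
// Worked example of the ratio arithmetic: with the default 300-second window,
// 38 successes followed by 2 errors makes `record` return 2.0 / 40.0 = 0.05,
// exactly the default `error_budget_ratio`; one more error raises the ratio
// to 3/41 ≈ 0.073 and the budget counts as exceeded.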

/// Decorator that adds retries, circuit breaking, error-budget tracking, and
/// request metrics around any [`LlmProvider`].
pub struct ResilientLlmProvider<P: LlmProvider> {
    inner: P,
    config: LlmResilienceConfig,
    breaker: Mutex<CircuitBreaker>,
    budget: Mutex<BudgetTracker>,
}

/// Outcome of a failed attempt: back off and retry, or give up with the error.
enum FailureAction {
    Retry(Error),
    Fail(Error),
}

impl<P: LlmProvider> ResilientLlmProvider<P> {
    /// Wraps `inner` with the given resilience settings.
    #[must_use]
    pub fn new(inner: P, config: LlmResilienceConfig) -> Self {
        let window = Duration::from_secs(config.error_budget_window_secs.max(1));
        let breaker = CircuitBreaker::new(&config);
        Self {
            inner,
            config,
            breaker: Mutex::new(breaker),
            budget: Mutex::new(BudgetTracker::new(window)),
        }
    }

    /// Runs `call` behind the circuit breaker, retrying retryable failures and
    /// recording metrics and span fields along the way.
    fn execute<T, F>(&self, operation: &'static str, mut call: F) -> Result<T>
    where
        F: FnMut() -> Result<T>,
    {
        let provider: &'static str = self.inner.name();
        let span = tracing::info_span!(
            "llm.request",
            provider = provider,
            operation = operation,
            status = tracing::field::Empty,
            error = tracing::field::Empty
        );
        let _enter = span.enter();

        let mut breaker = self
            .breaker
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        if !breaker.allow() {
            let breaker_state = breaker.state_value();
            drop(breaker);
            Self::record_breaker_state(provider, breaker_state);
            span.record("status", "circuit_open");
            metrics::counter!(
                "llm_requests_total",
                "provider" => provider,
                "operation" => operation,
                "status" => "circuit_open"
            )
            .increment(1);
            metrics::counter!(
                "llm_circuit_breaker_rejections_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
            return Err(Error::OperationFailed {
                operation: format!("llm_{operation}"),
                cause: "circuit breaker open".to_string(),
            });
        }
        drop(breaker);

        let mut attempts = 0;
        let max_attempts = self.config.max_retries + 1;
        let mut last_error = None;

        while attempts < max_attempts {
            attempts += 1;
            let attempt_start = Instant::now();
            let result = call();
            let elapsed = attempt_start.elapsed();

            // A single match avoids re-inspecting `result` after it has been
            // moved (the previous `if let Ok(..)` / `let Err(..) = result`
            // pairing was a use-after-move).
            match result {
                Ok(value) => {
                    self.record_success(provider, operation, elapsed);
                    let breaker_state = self.record_breaker_success_state();
                    Self::record_breaker_state(provider, breaker_state);
                    span.record("status", "success");
                    return Ok(value);
                },
                Err(err) => match self.handle_failure(
                    provider, operation, err, elapsed, attempts, max_attempts,
                ) {
                    FailureAction::Retry(err) => {
                        last_error = Some(err);
                    },
                    FailureAction::Fail(err) => return Err(err),
                },
            }
        }

        Err(last_error.unwrap_or_else(|| Error::OperationFailed {
            operation: format!("llm_{operation}"),
            cause: "exhausted retries".to_string(),
        }))
    }

    fn record_success(&self, provider: &'static str, operation: &'static str, elapsed: Duration) {
        self.record_request_metrics(provider, operation, elapsed, false, false);
        let ratio = self
            .budget
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .record(Instant::now(), false);
        self.record_budget_metrics(provider, operation, ratio);
    }

    /// Records a failed attempt, updates the breaker, and decides whether the
    /// call should be retried or failed.
    fn handle_failure(
        &self,
        provider: &'static str,
        operation: &'static str,
        err: Error,
        elapsed: Duration,
        attempts: u32,
        max_attempts: u32,
    ) -> FailureAction {
        let is_timeout = is_timeout_error(&err);
        let retryable = is_retryable_error(&err) && attempts < max_attempts;

        self.record_failure(provider, operation, elapsed, is_timeout, retryable);
        let mut breaker = self
            .breaker
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        let tripped = breaker.on_failure();
        let breaker_state = breaker.state_value();
        drop(breaker);
        Self::record_breaker_state(provider, breaker_state);
        if tripped {
            metrics::counter!(
                "llm_circuit_breaker_trips_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
            tracing::warn!(
                "LLM circuit breaker opened for provider={provider} operation={operation}"
            );
        }

        let status = if is_timeout { "timeout" } else { "error" };
        let span = tracing::Span::current();
        span.record("status", status);
        span.record("error", tracing::field::display(&err));

        if retryable {
            metrics::counter!(
                "llm_retries_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
            if self.config.retry_backoff_ms > 0 {
                let delay = Self::calculate_retry_delay(self.config.retry_backoff_ms, attempts);
                std::thread::sleep(Duration::from_millis(delay));
            }
            return FailureAction::Retry(err);
        }

        FailureAction::Fail(err)
    }

    fn record_failure(
        &self,
        provider: &'static str,
        operation: &'static str,
        elapsed: Duration,
        is_timeout: bool,
        retryable: bool,
    ) {
        self.record_request_metrics(provider, operation, elapsed, true, is_timeout);
        let ratio = self
            .budget
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .record(Instant::now(), true);
        self.record_budget_metrics(provider, operation, ratio);

        if retryable {
            tracing::warn!(
                "Retrying LLM call provider={provider} operation={operation} elapsed_ms={}",
                elapsed.as_millis()
            );
        }
    }

    fn record_breaker_success_state(&self) -> u8 {
        let mut breaker = self
            .breaker
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        breaker.on_success();
        breaker.state_value()
    }

    fn record_request_metrics(
        &self,
        provider: &'static str,
        operation: &'static str,
        elapsed: Duration,
        is_error: bool,
        is_timeout: bool,
    ) {
        let status = if is_timeout {
            "timeout"
        } else if is_error {
            "error"
        } else {
            "success"
        };

        metrics::counter!(
            "llm_requests_total",
            "provider" => provider,
            "operation" => operation,
            "status" => status
        )
        .increment(1);
        metrics::histogram!(
            "llm_request_duration_ms",
            "provider" => provider,
            "operation" => operation,
            "status" => status
        )
        .record(elapsed.as_secs_f64() * 1000.0);

        if is_timeout {
            metrics::counter!(
                "llm_timeouts_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
        }

        let elapsed_ms = u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX);
        if self.config.latency_slo_ms > 0 && elapsed_ms > self.config.latency_slo_ms {
            metrics::counter!(
                "llm_latency_budget_exceeded_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
        }
    }

    fn record_budget_metrics(&self, provider: &'static str, operation: &'static str, ratio: f64) {
        metrics::gauge!(
            "llm_error_budget_ratio",
            "provider" => provider,
            "operation" => operation
        )
        .set(ratio);

        if ratio > self.config.error_budget_ratio {
            metrics::counter!(
                "llm_error_budget_exceeded_total",
                "provider" => provider,
                "operation" => operation
            )
            .increment(1);
        }
    }

    fn record_breaker_state(provider: &'static str, breaker_state: u8) {
        metrics::gauge!("llm_circuit_breaker_state", "provider" => provider)
            .set(f64::from(breaker_state));
    }

    /// Computes the retry delay for `attempt` (1-based) using exponential
    /// backoff with jitter: `base * 2^(attempt - 1)`, capped at 10 seconds,
    /// plus up to 50% jitter.
    fn calculate_retry_delay(base_delay_ms: u64, attempt: u32) -> u64 {
        // Shift is capped at 10 (x1024) so the multiplier cannot overflow.
        let exponent = attempt.saturating_sub(1);
        let exponential_delay = base_delay_ms.saturating_mul(1u64 << exponent.min(10));

        // Cap the exponential component at 10 seconds.
        let capped_delay = exponential_delay.min(10_000);

        // Add jitter so concurrent callers don't retry in lockstep.
        let jitter = Self::calculate_jitter(capped_delay);
        let total_delay = capped_delay.saturating_add(jitter);

        tracing::debug!(
            "Retry backoff: attempt={}, base={}ms, exponential={}ms, jitter={}ms, total={}ms",
            attempt,
            base_delay_ms,
            capped_delay,
            jitter,
            total_delay
        );

        total_delay
    }
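
    // Worked example with the default 100 ms base: attempt 1 sleeps 100 ms
    // plus jitter in [0, 50) ms; attempt 2 sleeps 200 ms plus jitter in
    // [0, 100) ms; attempt 3 sleeps 400 ms plus jitter in [0, 200) ms. From
    // attempt 8 onward the exponential component saturates at the 10 s cap.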

    /// Derives up to 50% jitter from the current time's sub-second
    /// nanoseconds: a cheap source of variation without an RNG dependency.
    fn calculate_jitter(delay_ms: u64) -> u64 {
        let jitter_max = delay_ms / 2;
        if jitter_max == 0 {
            return 0;
        }

        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.subsec_nanos())
            .unwrap_or(0);

        u64::from(nanos) % jitter_max
    }
}

impl<P: LlmProvider> LlmProvider for ResilientLlmProvider<P> {
    fn name(&self) -> &'static str {
        self.inner.name()
    }

    fn complete(&self, prompt: &str) -> Result<String> {
        self.execute("complete", || self.inner.complete(prompt))
    }

    fn complete_with_system(&self, system: &str, user: &str) -> Result<String> {
        self.execute("complete_with_system", || {
            self.inner.complete_with_system(system, user)
        })
    }

    fn analyze_for_capture(&self, content: &str) -> Result<CaptureAnalysis> {
        self.execute("analyze_for_capture", || {
            self.inner.analyze_for_capture(content)
        })
    }
}
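
// Minimal usage sketch; `SomeProvider` is a placeholder for any concrete
// `LlmProvider` implementation, not a type defined in this module:
//
//     let provider = ResilientLlmProvider::new(
//         SomeProvider::new(),
//         LlmResilienceConfig::from_env(),
//     );
//     let analysis = provider.analyze_for_capture("transcript text")?;
//
// Since the wrapper implements `LlmProvider` itself, it can be substituted
// anywhere the bare provider is accepted.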

/// Returns `true` when the error's cause string looks like a timeout.
fn is_timeout_error(err: &Error) -> bool {
    match err {
        Error::OperationFailed { cause, .. } => {
            let lower = cause.to_lowercase();
            lower.contains("timeout")
                || lower.contains("timed out")
                || lower.contains("deadline")
                || lower.contains("elapsed")
        },
        _ => false,
    }
}

/// Returns `true` when the error is worth retrying: timeouts, connection and
/// network failures, 5xx server errors, and rate limiting. Client errors such
/// as 400/401/403/404 are not retryable.
fn is_retryable_error(err: &Error) -> bool {
    match err {
        Error::OperationFailed { cause, .. } => {
            let lower = cause.to_lowercase();
            // Timeouts (mirrors `is_timeout_error`).
            lower.contains("timeout")
                || lower.contains("timed out")
                || lower.contains("deadline")
                || lower.contains("elapsed")
                // Connection and network failures.
                || lower.contains("connect")
                || lower.contains("connection")
                || lower.contains("network")
                || lower.contains("dns")
                || lower.contains("resolve")
                // Server-side (5xx) errors.
                || lower.contains("500")
                || lower.contains("502")
                || lower.contains("503")
                || lower.contains("504")
                || lower.contains("internal server error")
                || lower.contains("bad gateway")
                || lower.contains("service unavailable")
                || lower.contains("gateway timeout")
                // Rate limiting and overload.
                || lower.contains("429")
                || lower.contains("rate limit")
                || lower.contains("too many requests")
                || lower.contains("overloaded")
        },
        _ => false,
    }
}
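
// Note that classification is a substring heuristic over the cause text:
// "API returned status: 503" and "Connection timeout" are retried, while
// "401 Unauthorized" or "Malformed JSON in request" fail immediately (the
// tests below enumerate the covered phrasings).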

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_circuit_breaker_starts_closed() {
        let config = LlmResilienceConfig::default();
        let breaker = CircuitBreaker::new(&config);
        assert_eq!(breaker.state_value(), 0); // closed
    }

    #[test]
    fn test_circuit_breaker_allows_calls_when_closed() {
        let config = LlmResilienceConfig::default();
        let mut breaker = CircuitBreaker::new(&config);
        assert!(breaker.allow());
        assert!(breaker.allow());
        assert!(breaker.allow());
    }

    #[test]
    fn test_circuit_breaker_opens_after_threshold_failures() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 3,
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        breaker.on_failure();
        assert_eq!(breaker.state_value(), 0); // still closed
        breaker.on_failure();
        assert_eq!(breaker.state_value(), 0); // still closed
        let tripped = breaker.on_failure();
        assert!(tripped);
        assert_eq!(breaker.state_value(), 1); // open
    }

    #[test]
    fn test_circuit_breaker_rejects_when_open() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 10_000, // long enough to stay open for the whole test
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        breaker.on_failure();
        assert_eq!(breaker.state_value(), 1); // open

        assert!(!breaker.allow());
        assert!(!breaker.allow());
    }

    #[test]
    fn test_circuit_breaker_transitions_to_half_open_after_timeout() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 0, // reset immediately
            breaker_half_open_max_calls: 1,
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        breaker.on_failure();
        assert_eq!(breaker.state_value(), 1); // open

        std::thread::sleep(Duration::from_millis(1));
        assert!(breaker.allow());
        assert_eq!(breaker.state_value(), 2); // half-open
    }

    #[test]
    fn test_circuit_breaker_half_open_limits_calls() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 0,
            breaker_half_open_max_calls: 2, // allow two probe calls
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        breaker.on_failure();
        std::thread::sleep(Duration::from_millis(1));

        assert!(breaker.allow()); // transitions open -> half-open
        assert_eq!(breaker.state_value(), 2); // half-open
        assert!(breaker.allow()); // first probe
        assert!(breaker.allow()); // second probe
        assert!(!breaker.allow()); // probe limit reached
    }

    #[test]
    fn test_circuit_breaker_closes_on_success() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 0,
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        breaker.on_failure();
        std::thread::sleep(Duration::from_millis(1));
        breaker.allow();

        breaker.on_success();
        assert_eq!(breaker.state_value(), 0); // closed again
    }

    #[test]
    fn test_circuit_breaker_reopens_on_failure_when_half_open() {
        let config = LlmResilienceConfig {
            breaker_failure_threshold: 1,
            breaker_reset_timeout_ms: 0,
            ..Default::default()
        };
        let mut breaker = CircuitBreaker::new(&config);

        breaker.on_failure();
        std::thread::sleep(Duration::from_millis(1));
        breaker.allow();
        assert_eq!(breaker.state_value(), 2); // half-open

        let tripped = breaker.on_failure();
        assert!(tripped);
        assert_eq!(breaker.state_value(), 1); // open again
    }

    #[test]
    fn test_budget_tracker_starts_empty() {
        let tracker = BudgetTracker::new(Duration::from_secs(60));
        assert!(tracker.requests.is_empty());
        assert!(tracker.errors.is_empty());
    }

    #[test]
    fn test_budget_tracker_records_successful_requests() {
        let mut tracker = BudgetTracker::new(Duration::from_secs(60));
        let now = Instant::now();

        let ratio = tracker.record(now, false);
        assert!(ratio.abs() < f64::EPSILON); // 0 errors / 1 request

        let ratio = tracker.record(now, false);
        assert!(ratio.abs() < f64::EPSILON); // 0 errors / 2 requests
    }

    #[test]
    fn test_budget_tracker_records_error_requests() {
        let mut tracker = BudgetTracker::new(Duration::from_secs(60));
        let now = Instant::now();

        tracker.record(now, false);
        tracker.record(now, false);
        let ratio = tracker.record(now, true);

        // 1 error / 3 requests.
        assert!((ratio - 0.333).abs() < 0.01);
    }

    #[test]
    fn test_budget_tracker_calculates_error_ratio() {
        let mut tracker = BudgetTracker::new(Duration::from_secs(60));
        let now = Instant::now();

        let ratio = tracker.record(now, true);
        assert!((ratio - 1.0).abs() < f64::EPSILON); // 1 error / 1 request

        let ratio = tracker.record(now, false);
        assert!((ratio - 0.5).abs() < f64::EPSILON); // 1 error / 2 requests

        let ratio = tracker.record(now, true);
        assert!((ratio - 0.666).abs() < 0.01); // 2 errors / 3 requests
    }

    #[test]
    fn test_budget_tracker_evicts_expired_entries() {
        // Very short window so entries expire quickly.
        let mut tracker = BudgetTracker::new(Duration::from_millis(10));
        let now = Instant::now();

        tracker.record(now, true);
        assert_eq!(tracker.requests.len(), 1);
        assert_eq!(tracker.errors.len(), 1);

        // Wait for the window to pass.
        std::thread::sleep(Duration::from_millis(15));

        // Recording again evicts the expired entries.
        let new_now = Instant::now();
        tracker.record(new_now, false);

        // Only the fresh request remains.
        assert_eq!(tracker.requests.len(), 1);
        assert_eq!(tracker.errors.len(), 0);
    }

    #[test]
    fn test_config_default_values() {
        let config = LlmResilienceConfig::default();
        assert_eq!(config.max_retries, 3);
        assert_eq!(config.retry_backoff_ms, 100);
        assert_eq!(config.breaker_failure_threshold, 3);
        assert_eq!(config.breaker_reset_timeout_ms, 30_000);
        assert_eq!(config.breaker_half_open_max_calls, 1);
        assert_eq!(config.latency_slo_ms, 2_000);
        assert!((config.error_budget_ratio - 0.05).abs() < 0.001);
        assert_eq!(config.error_budget_window_secs, 300);
    }

    #[test]
    fn test_config_from_config_file() {
        let llm_config = crate::config::LlmConfig {
            max_retries: Some(5),
            retry_backoff_ms: Some(200),
            breaker_failure_threshold: Some(10),
            breaker_reset_ms: Some(60_000),
            breaker_half_open_max_calls: Some(3),
            latency_slo_ms: Some(5_000),
            error_budget_ratio: Some(0.10),
            error_budget_window_secs: Some(600),
            ..Default::default()
        };

        let config = LlmResilienceConfig::from_config(&llm_config);
        assert_eq!(config.max_retries, 5);
        assert_eq!(config.retry_backoff_ms, 200);
        assert_eq!(config.breaker_failure_threshold, 10);
        assert_eq!(config.breaker_reset_timeout_ms, 60_000);
        assert_eq!(config.breaker_half_open_max_calls, 3);
        assert_eq!(config.latency_slo_ms, 5_000);
        assert!((config.error_budget_ratio - 0.10).abs() < 0.001);
        assert_eq!(config.error_budget_window_secs, 600);
    }

    #[test]
    fn test_config_clamps_values() {
        let llm_config = crate::config::LlmConfig {
            breaker_failure_threshold: Some(0),   // clamped up to 1
            breaker_half_open_max_calls: Some(0), // clamped up to 1
            error_budget_ratio: Some(2.0),        // clamped down to 1.0
            error_budget_window_secs: Some(0),    // clamped up to 1
            ..Default::default()
        };

        let config = LlmResilienceConfig::from_config(&llm_config);
        assert_eq!(config.breaker_failure_threshold, 1);
        assert_eq!(config.breaker_half_open_max_calls, 1);
        assert!((config.error_budget_ratio - 1.0).abs() < 0.001);
        assert_eq!(config.error_budget_window_secs, 1);
    }

    #[test]
    fn test_is_timeout_error_detects_timeout() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Connection timeout".to_string(),
        };
        assert!(is_timeout_error(&err));
    }

    #[test]
    fn test_is_timeout_error_detects_timed_out() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Request timed out".to_string(),
        };
        assert!(is_timeout_error(&err));
    }

    #[test]
    fn test_is_timeout_error_detects_deadline() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Deadline exceeded".to_string(),
        };
        assert!(is_timeout_error(&err));
    }

    #[test]
    fn test_is_timeout_error_detects_elapsed() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Time elapsed".to_string(),
        };
        assert!(is_timeout_error(&err));
    }

    #[test]
    fn test_is_timeout_error_returns_false_for_other_errors() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Connection refused".to_string(),
        };
        assert!(!is_timeout_error(&err));
    }

    #[test]
    fn test_is_timeout_error_is_case_insensitive() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "TIMEOUT ERROR".to_string(),
        };
        assert!(is_timeout_error(&err));
    }

    #[test]
    fn test_is_retryable_error_detects_connection_errors() {
        let test_cases = [
            "connect error: connection refused",
            "Connection reset by peer",
            "network error: no route to host",
            "DNS lookup failed",
            "Failed to resolve hostname",
        ];

        for cause in test_cases {
            let err = Error::OperationFailed {
                operation: "test".to_string(),
                cause: cause.to_string(),
            };
            assert!(is_retryable_error(&err), "Should be retryable: {cause}");
        }
    }

    #[test]
    fn test_is_retryable_error_detects_server_errors() {
        let test_cases = [
            "API returned status: 500",
            "502 Bad Gateway",
            "503 Service Unavailable",
            "504 Gateway Timeout",
            "Internal Server Error",
            "Bad Gateway error",
            "Service unavailable",
            "Gateway timeout exceeded",
        ];

        for cause in test_cases {
            let err = Error::OperationFailed {
                operation: "test".to_string(),
                cause: cause.to_string(),
            };
            assert!(is_retryable_error(&err), "Should be retryable: {cause}");
        }
    }

    #[test]
    fn test_is_retryable_error_detects_rate_limiting() {
        let test_cases = [
            "API returned status: 429",
            "Rate limit exceeded",
            "Too many requests",
            "Server overloaded",
        ];

        for cause in test_cases {
            let err = Error::OperationFailed {
                operation: "test".to_string(),
                cause: cause.to_string(),
            };
            assert!(is_retryable_error(&err), "Should be retryable: {cause}");
        }
    }

    #[test]
    fn test_is_retryable_error_includes_timeout_errors() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "Request timeout".to_string(),
        };
        assert!(is_retryable_error(&err));
    }

    #[test]
    fn test_is_retryable_error_returns_false_for_client_errors() {
        let test_cases = [
            "API returned status: 400 - Bad Request",
            "401 Unauthorized",
            "403 Forbidden",
            "404 Not Found",
            "Invalid API key format",
            "Malformed JSON in request",
        ];

        for cause in test_cases {
            let err = Error::OperationFailed {
                operation: "test".to_string(),
                cause: cause.to_string(),
            };
            assert!(
                !is_retryable_error(&err),
                "Should NOT be retryable: {cause}"
            );
        }
    }

    #[test]
    fn test_is_retryable_error_is_case_insensitive() {
        let err = Error::OperationFailed {
            operation: "test".to_string(),
            cause: "CONNECTION REFUSED".to_string(),
        };
        assert!(is_retryable_error(&err));
    }
}