Skip to main content

subcog/services/deduplication/
config.rs

1//! Deduplication configuration.
2//!
3//! This module defines configuration for the deduplication service,
4//! including per-namespace similarity thresholds and cache settings.
5//!
6//! # Threshold Rationale
7//!
8//! Per-namespace thresholds balance precision (avoiding false duplicates) against
9//! recall (catching true duplicates). The defaults are tuned based on:
10//!
11//! ## Decisions (0.92 - High Threshold)
12//!
13//! Architectural decisions are high-value captures where even slightly different
14//! phrasings may represent distinct rationale. A false positive (marking a unique
15//! decision as duplicate) is worse than a false negative (allowing similar decisions).
16//!
17//! **Example**: "Use PostgreSQL for persistence" vs "Use PostgreSQL for ACID guarantees"
18//! are semantically similar (~91%) but capture different reasoning.
19//!
20//! ## Patterns (0.90 - Standard Threshold)
21//!
22//! Code patterns have moderate variation. Similar patterns often represent the same
23//! concept, but edge cases exist where context differs meaningfully.
24//!
25//! ## Learnings (0.88 - Lower Threshold)
26//!
27//! Learnings are frequently paraphrased differently when rediscovered. A lower threshold
28//! catches these reformulations while still allowing genuinely distinct learnings.
29//!
30//! **Example**: "TIL: Rust closures capture by reference by default" vs
31//! "Learned that closures in Rust borrow by default" are the same learning (~87% similar).
32//!
33//! ## Other Namespaces (0.90 - Default)
34//!
35//! Unconfigured namespaces use 90% as a balanced default that works well for most content.
36//!
37//! # Tuning Guidelines
38//!
39//! | Symptom | Adjustment |
40//! |---------|------------|
41//! | Unique content wrongly skipped as duplicate | Raise threshold (e.g., 0.95) |
42//! | Duplicate content still captured | Lower threshold (e.g., 0.85) |
43//! | Short content triggers false positives | Increase `min_semantic_length` |
44//! | Same content captured repeatedly in session | Extend `recent_window` |
45
46use crate::models::Namespace;
47use std::collections::HashMap;
48use std::time::Duration;
49
50/// Configuration for the deduplication service.
51///
52/// # Environment Variables
53///
54/// | Variable | Type | Default | Description |
55/// |----------|------|---------|-------------|
56/// | `SUBCOG_DEDUP_ENABLED` | bool | `true` | Enable deduplication |
57/// | `SUBCOG_DEDUP_THRESHOLD_DECISIONS` | f32 | `0.92` | Threshold for decisions namespace |
58/// | `SUBCOG_DEDUP_THRESHOLD_PATTERNS` | f32 | `0.90` | Threshold for patterns namespace |
59/// | `SUBCOG_DEDUP_THRESHOLD_LEARNINGS` | f32 | `0.88` | Threshold for learnings namespace |
60/// | `SUBCOG_DEDUP_THRESHOLD_DEFAULT` | f32 | `0.90` | Default threshold |
61/// | `SUBCOG_DEDUP_TIME_WINDOW_SECS` | u64 | `300` | Recent capture window |
62/// | `SUBCOG_DEDUP_CACHE_CAPACITY` | usize | `1000` | LRU cache size |
63/// | `SUBCOG_DEDUP_MIN_SEMANTIC_LENGTH` | usize | `50` | Min content length for semantic check |
64///
65/// # Example
66///
67/// ```rust
68/// use subcog::services::deduplication::DeduplicationConfig;
69/// use subcog::models::Namespace;
70///
71/// let config = DeduplicationConfig::default();
72/// assert!(config.enabled);
73/// assert_eq!(config.default_threshold, 0.90);
74/// assert_eq!(config.get_threshold(Namespace::Decisions), 0.92);
75/// ```
76#[derive(Debug, Clone)]
77pub struct DeduplicationConfig {
78    /// Enable/disable entire deduplication.
79    pub enabled: bool,
80
81    /// Per-namespace similarity thresholds.
82    pub similarity_thresholds: HashMap<Namespace, f32>,
83
84    /// Default threshold when namespace not configured.
85    pub default_threshold: f32,
86
87    /// Recent capture time window.
88    pub recent_window: Duration,
89
90    /// Recent capture cache capacity.
91    pub cache_capacity: usize,
92
93    /// Minimum content length for semantic check.
94    ///
95    /// Content shorter than this will skip semantic similarity checking
96    /// and rely only on exact match and recent capture detection.
97    pub min_semantic_length: usize,
98}
99
100impl DeduplicationConfig {
101    /// Creates a new configuration from environment variables.
102    ///
103    /// Falls back to defaults for any unset variables.
104    ///
105    /// # Example
106    ///
107    /// ```rust
108    /// use subcog::services::deduplication::DeduplicationConfig;
109    ///
110    /// let config = DeduplicationConfig::from_env();
111    /// // Config is populated from environment with defaults
112    /// ```
113    #[must_use]
114    pub fn from_env() -> Self {
115        let enabled = std::env::var("SUBCOG_DEDUP_ENABLED")
116            .map(|v| v.to_lowercase() != "false" && v != "0")
117            .unwrap_or(true);
118
119        let default_threshold = std::env::var("SUBCOG_DEDUP_THRESHOLD_DEFAULT")
120            .ok()
121            .and_then(|v| v.parse().ok())
122            .unwrap_or(0.90);
123
124        let recent_window_secs = std::env::var("SUBCOG_DEDUP_TIME_WINDOW_SECS")
125            .ok()
126            .and_then(|v| v.parse().ok())
127            .unwrap_or(300);
128
129        let cache_capacity = std::env::var("SUBCOG_DEDUP_CACHE_CAPACITY")
130            .ok()
131            .and_then(|v| v.parse().ok())
132            .unwrap_or(1000);
133
134        let min_semantic_length = std::env::var("SUBCOG_DEDUP_MIN_SEMANTIC_LENGTH")
135            .ok()
136            .and_then(|v| v.parse().ok())
137            .unwrap_or(50);
138
139        let mut thresholds = HashMap::new();
140
141        // Load per-namespace thresholds
142        if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_DECISIONS")
143            .ok()
144            .and_then(|v| v.parse::<f32>().ok())
145        {
146            thresholds.insert(Namespace::Decisions, threshold);
147        }
148
149        if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_PATTERNS")
150            .ok()
151            .and_then(|v| v.parse::<f32>().ok())
152        {
153            thresholds.insert(Namespace::Patterns, threshold);
154        }
155
156        if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_LEARNINGS")
157            .ok()
158            .and_then(|v| v.parse::<f32>().ok())
159        {
160            thresholds.insert(Namespace::Learnings, threshold);
161        }
162
163        if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_BLOCKERS")
164            .ok()
165            .and_then(|v| v.parse::<f32>().ok())
166        {
167            thresholds.insert(Namespace::Blockers, threshold);
168        }
169
170        if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_TECHDEBT")
171            .ok()
172            .and_then(|v| v.parse::<f32>().ok())
173        {
174            thresholds.insert(Namespace::TechDebt, threshold);
175        }
176
177        if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_CONTEXT")
178            .ok()
179            .and_then(|v| v.parse::<f32>().ok())
180        {
181            thresholds.insert(Namespace::Context, threshold);
182        }
183
184        Self {
185            enabled,
186            similarity_thresholds: thresholds,
187            default_threshold,
188            recent_window: Duration::from_secs(recent_window_secs),
189            cache_capacity,
190            min_semantic_length,
191        }
192    }
193
194    /// Gets the similarity threshold for a namespace.
195    ///
196    /// Returns the namespace-specific threshold if configured,
197    /// otherwise returns the default threshold.
198    ///
199    /// # Arguments
200    ///
201    /// * `namespace` - The namespace to get the threshold for
202    ///
203    /// # Example
204    ///
205    /// ```rust
206    /// use subcog::services::deduplication::DeduplicationConfig;
207    /// use subcog::models::Namespace;
208    ///
209    /// let config = DeduplicationConfig::default();
210    /// assert_eq!(config.get_threshold(Namespace::Decisions), 0.92);
211    /// assert_eq!(config.get_threshold(Namespace::Patterns), 0.90);
212    /// ```
213    #[must_use]
214    pub fn get_threshold(&self, namespace: Namespace) -> f32 {
215        self.similarity_thresholds
216            .get(&namespace)
217            .copied()
218            .unwrap_or(self.default_threshold)
219    }
220
221    /// Builder method to set enabled state.
222    #[must_use]
223    pub const fn with_enabled(mut self, enabled: bool) -> Self {
224        self.enabled = enabled;
225        self
226    }
227
228    /// Builder method to set a namespace threshold.
229    #[must_use]
230    pub fn with_threshold(mut self, namespace: Namespace, threshold: f32) -> Self {
231        self.similarity_thresholds.insert(namespace, threshold);
232        self
233    }
234
235    /// Builder method to set the default threshold.
236    #[must_use]
237    pub const fn with_default_threshold(mut self, threshold: f32) -> Self {
238        self.default_threshold = threshold;
239        self
240    }
241
242    /// Builder method to set the recent window duration.
243    #[must_use]
244    pub const fn with_recent_window(mut self, duration: Duration) -> Self {
245        self.recent_window = duration;
246        self
247    }
248
249    /// Builder method to set the cache capacity.
250    #[must_use]
251    pub const fn with_cache_capacity(mut self, capacity: usize) -> Self {
252        self.cache_capacity = capacity;
253        self
254    }
255
256    /// Builder method to set the minimum semantic length.
257    #[must_use]
258    pub const fn with_min_semantic_length(mut self, length: usize) -> Self {
259        self.min_semantic_length = length;
260        self
261    }
262}
263
264impl Default for DeduplicationConfig {
265    fn default() -> Self {
266        let mut thresholds = HashMap::new();
267
268        // Per ADR-003: Per-Namespace Similarity Thresholds
269        thresholds.insert(Namespace::Decisions, 0.92); // High value, avoid losing unique decisions
270        thresholds.insert(Namespace::Patterns, 0.90); // Standard threshold
271        thresholds.insert(Namespace::Learnings, 0.88); // Learnings often phrased differently
272
273        Self {
274            enabled: true,
275            similarity_thresholds: thresholds,
276            default_threshold: 0.90,
277            recent_window: Duration::from_secs(300), // 5 minutes
278            cache_capacity: 1000,
279            min_semantic_length: 50,
280        }
281    }
282}
283
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that two `f32` values differ by less than `f32::EPSILON`.
    #[track_caller]
    fn assert_close(actual: f32, expected: f32) {
        let delta = (actual - expected).abs();
        assert!(delta < f32::EPSILON);
    }

    #[test]
    fn test_default_config() {
        let DeduplicationConfig {
            enabled,
            default_threshold,
            recent_window,
            cache_capacity,
            min_semantic_length,
            ..
        } = DeduplicationConfig::default();

        assert!(enabled);
        assert_close(default_threshold, 0.90);
        assert_eq!(recent_window, Duration::from_secs(300));
        assert_eq!(cache_capacity, 1000);
        assert_eq!(min_semantic_length, 50);
    }

    #[test]
    fn test_namespace_thresholds() {
        let config = DeduplicationConfig::default();

        // Namespaces with a dedicated entry report that entry.
        let configured = [
            (Namespace::Decisions, 0.92),
            (Namespace::Patterns, 0.90),
            (Namespace::Learnings, 0.88),
        ];
        for (namespace, expected) in configured {
            assert_close(config.get_threshold(namespace), expected);
        }

        // Namespaces without an entry fall back to the default threshold.
        for namespace in [Namespace::Blockers, Namespace::TechDebt] {
            assert_close(config.get_threshold(namespace), 0.90);
        }
    }

    #[test]
    fn test_builder_methods() {
        let ten_minutes = Duration::from_secs(600);

        let config = DeduplicationConfig::default()
            .with_enabled(false)
            .with_default_threshold(0.85)
            .with_threshold(Namespace::Context, 0.95)
            .with_recent_window(ten_minutes)
            .with_cache_capacity(500)
            .with_min_semantic_length(100);

        assert!(!config.enabled);
        assert_close(config.default_threshold, 0.85);
        assert_close(config.get_threshold(Namespace::Context), 0.95);
        assert_eq!(config.recent_window, ten_minutes);
        assert_eq!(config.cache_capacity, 500);
        assert_eq!(config.min_semantic_length, 100);
    }
}
335}