subcog/services/deduplication/config.rs
1//! Deduplication configuration.
2//!
3//! This module defines configuration for the deduplication service,
4//! including per-namespace similarity thresholds and cache settings.
5//!
6//! # Threshold Rationale
7//!
8//! Per-namespace thresholds balance precision (avoiding false duplicates) against
9//! recall (catching true duplicates). The defaults are tuned based on:
10//!
11//! ## Decisions (0.92 - High Threshold)
12//!
13//! Architectural decisions are high-value captures where even slightly different
14//! phrasings may represent distinct rationale. A false positive (marking a unique
15//! decision as duplicate) is worse than a false negative (allowing similar decisions).
16//!
17//! **Example**: "Use PostgreSQL for persistence" vs "Use PostgreSQL for ACID guarantees"
18//! are semantically similar (~91%) but capture different reasoning.
19//!
20//! ## Patterns (0.90 - Standard Threshold)
21//!
22//! Code patterns have moderate variation. Similar patterns often represent the same
23//! concept, but edge cases exist where context differs meaningfully.
24//!
25//! ## Learnings (0.88 - Lower Threshold)
26//!
27//! Learnings are frequently paraphrased differently when rediscovered. A lower threshold
28//! catches these reformulations while still allowing genuinely distinct learnings.
29//!
30//! **Example**: "TIL: Rust closures capture by reference by default" vs
31//! "Learned that closures in Rust borrow by default" are the same learning (~87% similar).
32//!
33//! ## Other Namespaces (0.90 - Default)
34//!
35//! Unconfigured namespaces use 90% as a balanced default that works well for most content.
36//!
37//! # Tuning Guidelines
38//!
39//! | Symptom | Adjustment |
40//! |---------|------------|
//! | Distinct content wrongly skipped as duplicate | Raise threshold (e.g., 0.95) |
//! | Duplicate content still captured | Lower threshold (e.g., 0.85) |
43//! | Short content triggers false positives | Increase `min_semantic_length` |
44//! | Same content captured repeatedly in session | Extend `recent_window` |
45
46use crate::models::Namespace;
47use std::collections::HashMap;
48use std::time::Duration;
49
50/// Configuration for the deduplication service.
51///
52/// # Environment Variables
53///
54/// | Variable | Type | Default | Description |
55/// |----------|------|---------|-------------|
56/// | `SUBCOG_DEDUP_ENABLED` | bool | `true` | Enable deduplication |
57/// | `SUBCOG_DEDUP_THRESHOLD_DECISIONS` | f32 | `0.92` | Threshold for decisions namespace |
58/// | `SUBCOG_DEDUP_THRESHOLD_PATTERNS` | f32 | `0.90` | Threshold for patterns namespace |
59/// | `SUBCOG_DEDUP_THRESHOLD_LEARNINGS` | f32 | `0.88` | Threshold for learnings namespace |
60/// | `SUBCOG_DEDUP_THRESHOLD_DEFAULT` | f32 | `0.90` | Default threshold |
61/// | `SUBCOG_DEDUP_TIME_WINDOW_SECS` | u64 | `300` | Recent capture window |
62/// | `SUBCOG_DEDUP_CACHE_CAPACITY` | usize | `1000` | LRU cache size |
63/// | `SUBCOG_DEDUP_MIN_SEMANTIC_LENGTH` | usize | `50` | Min content length for semantic check |
64///
65/// # Example
66///
67/// ```rust
68/// use subcog::services::deduplication::DeduplicationConfig;
69/// use subcog::models::Namespace;
70///
71/// let config = DeduplicationConfig::default();
72/// assert!(config.enabled);
73/// assert_eq!(config.default_threshold, 0.90);
74/// assert_eq!(config.get_threshold(Namespace::Decisions), 0.92);
75/// ```
76#[derive(Debug, Clone)]
77pub struct DeduplicationConfig {
78 /// Enable/disable entire deduplication.
79 pub enabled: bool,
80
81 /// Per-namespace similarity thresholds.
82 pub similarity_thresholds: HashMap<Namespace, f32>,
83
84 /// Default threshold when namespace not configured.
85 pub default_threshold: f32,
86
87 /// Recent capture time window.
88 pub recent_window: Duration,
89
90 /// Recent capture cache capacity.
91 pub cache_capacity: usize,
92
93 /// Minimum content length for semantic check.
94 ///
95 /// Content shorter than this will skip semantic similarity checking
96 /// and rely only on exact match and recent capture detection.
97 pub min_semantic_length: usize,
98}
99
100impl DeduplicationConfig {
101 /// Creates a new configuration from environment variables.
102 ///
103 /// Falls back to defaults for any unset variables.
104 ///
105 /// # Example
106 ///
107 /// ```rust
108 /// use subcog::services::deduplication::DeduplicationConfig;
109 ///
110 /// let config = DeduplicationConfig::from_env();
111 /// // Config is populated from environment with defaults
112 /// ```
113 #[must_use]
114 pub fn from_env() -> Self {
115 let enabled = std::env::var("SUBCOG_DEDUP_ENABLED")
116 .map(|v| v.to_lowercase() != "false" && v != "0")
117 .unwrap_or(true);
118
119 let default_threshold = std::env::var("SUBCOG_DEDUP_THRESHOLD_DEFAULT")
120 .ok()
121 .and_then(|v| v.parse().ok())
122 .unwrap_or(0.90);
123
124 let recent_window_secs = std::env::var("SUBCOG_DEDUP_TIME_WINDOW_SECS")
125 .ok()
126 .and_then(|v| v.parse().ok())
127 .unwrap_or(300);
128
129 let cache_capacity = std::env::var("SUBCOG_DEDUP_CACHE_CAPACITY")
130 .ok()
131 .and_then(|v| v.parse().ok())
132 .unwrap_or(1000);
133
134 let min_semantic_length = std::env::var("SUBCOG_DEDUP_MIN_SEMANTIC_LENGTH")
135 .ok()
136 .and_then(|v| v.parse().ok())
137 .unwrap_or(50);
138
139 let mut thresholds = HashMap::new();
140
141 // Load per-namespace thresholds
142 if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_DECISIONS")
143 .ok()
144 .and_then(|v| v.parse::<f32>().ok())
145 {
146 thresholds.insert(Namespace::Decisions, threshold);
147 }
148
149 if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_PATTERNS")
150 .ok()
151 .and_then(|v| v.parse::<f32>().ok())
152 {
153 thresholds.insert(Namespace::Patterns, threshold);
154 }
155
156 if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_LEARNINGS")
157 .ok()
158 .and_then(|v| v.parse::<f32>().ok())
159 {
160 thresholds.insert(Namespace::Learnings, threshold);
161 }
162
163 if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_BLOCKERS")
164 .ok()
165 .and_then(|v| v.parse::<f32>().ok())
166 {
167 thresholds.insert(Namespace::Blockers, threshold);
168 }
169
170 if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_TECHDEBT")
171 .ok()
172 .and_then(|v| v.parse::<f32>().ok())
173 {
174 thresholds.insert(Namespace::TechDebt, threshold);
175 }
176
177 if let Some(threshold) = std::env::var("SUBCOG_DEDUP_THRESHOLD_CONTEXT")
178 .ok()
179 .and_then(|v| v.parse::<f32>().ok())
180 {
181 thresholds.insert(Namespace::Context, threshold);
182 }
183
184 Self {
185 enabled,
186 similarity_thresholds: thresholds,
187 default_threshold,
188 recent_window: Duration::from_secs(recent_window_secs),
189 cache_capacity,
190 min_semantic_length,
191 }
192 }
193
194 /// Gets the similarity threshold for a namespace.
195 ///
196 /// Returns the namespace-specific threshold if configured,
197 /// otherwise returns the default threshold.
198 ///
199 /// # Arguments
200 ///
201 /// * `namespace` - The namespace to get the threshold for
202 ///
203 /// # Example
204 ///
205 /// ```rust
206 /// use subcog::services::deduplication::DeduplicationConfig;
207 /// use subcog::models::Namespace;
208 ///
209 /// let config = DeduplicationConfig::default();
210 /// assert_eq!(config.get_threshold(Namespace::Decisions), 0.92);
211 /// assert_eq!(config.get_threshold(Namespace::Patterns), 0.90);
212 /// ```
213 #[must_use]
214 pub fn get_threshold(&self, namespace: Namespace) -> f32 {
215 self.similarity_thresholds
216 .get(&namespace)
217 .copied()
218 .unwrap_or(self.default_threshold)
219 }
220
221 /// Builder method to set enabled state.
222 #[must_use]
223 pub const fn with_enabled(mut self, enabled: bool) -> Self {
224 self.enabled = enabled;
225 self
226 }
227
228 /// Builder method to set a namespace threshold.
229 #[must_use]
230 pub fn with_threshold(mut self, namespace: Namespace, threshold: f32) -> Self {
231 self.similarity_thresholds.insert(namespace, threshold);
232 self
233 }
234
235 /// Builder method to set the default threshold.
236 #[must_use]
237 pub const fn with_default_threshold(mut self, threshold: f32) -> Self {
238 self.default_threshold = threshold;
239 self
240 }
241
242 /// Builder method to set the recent window duration.
243 #[must_use]
244 pub const fn with_recent_window(mut self, duration: Duration) -> Self {
245 self.recent_window = duration;
246 self
247 }
248
249 /// Builder method to set the cache capacity.
250 #[must_use]
251 pub const fn with_cache_capacity(mut self, capacity: usize) -> Self {
252 self.cache_capacity = capacity;
253 self
254 }
255
256 /// Builder method to set the minimum semantic length.
257 #[must_use]
258 pub const fn with_min_semantic_length(mut self, length: usize) -> Self {
259 self.min_semantic_length = length;
260 self
261 }
262}
263
264impl Default for DeduplicationConfig {
265 fn default() -> Self {
266 let mut thresholds = HashMap::new();
267
268 // Per ADR-003: Per-Namespace Similarity Thresholds
269 thresholds.insert(Namespace::Decisions, 0.92); // High value, avoid losing unique decisions
270 thresholds.insert(Namespace::Patterns, 0.90); // Standard threshold
271 thresholds.insert(Namespace::Learnings, 0.88); // Learnings often phrased differently
272
273 Self {
274 enabled: true,
275 similarity_thresholds: thresholds,
276 default_threshold: 0.90,
277 recent_window: Duration::from_secs(300), // 5 minutes
278 cache_capacity: 1000,
279 min_semantic_length: 50,
280 }
281 }
282}
283
#[cfg(test)]
mod tests {
    use super::*;

    /// Reports whether two `f32` values differ by less than `f32::EPSILON`.
    fn approx_eq(lhs: f32, rhs: f32) -> bool {
        let delta = (lhs - rhs).abs();
        delta < f32::EPSILON
    }

    /// The stock configuration exposes the documented scalar defaults.
    #[test]
    fn test_default_config() {
        let DeduplicationConfig {
            enabled,
            default_threshold,
            recent_window,
            cache_capacity,
            min_semantic_length,
            ..
        } = DeduplicationConfig::default();

        assert!(enabled);
        assert!(approx_eq(default_threshold, 0.90));
        assert_eq!(recent_window, Duration::from_secs(300));
        assert_eq!(cache_capacity, 1000);
        assert_eq!(min_semantic_length, 50);
    }

    /// Configured namespaces yield their own threshold; the rest yield the default.
    #[test]
    fn test_namespace_thresholds() {
        let config = DeduplicationConfig::default();

        let expectations = [
            // Configured namespaces return their specific thresholds
            (Namespace::Decisions, 0.92),
            (Namespace::Patterns, 0.90),
            (Namespace::Learnings, 0.88),
            // Unconfigured namespaces return the default
            (Namespace::Blockers, 0.90),
            (Namespace::TechDebt, 0.90),
        ];

        for (namespace, expected) in expectations {
            let actual = config.get_threshold(namespace);
            assert!(
                approx_eq(actual, expected),
                "expected threshold {expected}, got {actual}"
            );
        }
    }

    /// Each builder method overrides exactly the setting it names.
    #[test]
    fn test_builder_methods() {
        let window = Duration::from_secs(600);

        let config = DeduplicationConfig::default()
            .with_enabled(false)
            .with_default_threshold(0.85)
            .with_threshold(Namespace::Context, 0.95)
            .with_recent_window(window)
            .with_cache_capacity(500)
            .with_min_semantic_length(100);

        assert!(!config.enabled);
        assert!(approx_eq(config.default_threshold, 0.85));
        assert!(approx_eq(config.get_threshold(Namespace::Context), 0.95));
        assert_eq!(config.recent_window, window);
        assert_eq!(config.cache_capacity, 500);
        assert_eq!(config.min_semantic_length, 100);
    }
}
335}