subcog/services/deduplication/
semantic.rs

1//! Semantic similarity deduplication checker.
2//!
3//! Detects duplicates by comparing embedding vectors using cosine similarity.
4//! Uses configurable per-namespace similarity thresholds.
5
6use crate::Result;
7use crate::embedding::Embedder;
8use crate::models::{MemoryId, Namespace};
9use crate::storage::traits::{VectorBackend, VectorFilter};
10use std::sync::Arc;
11use std::time::Instant;
12use tracing::instrument;
13
14use super::config::DeduplicationConfig;
15
16// ============================================================================
17// Trait Aliases (RUST-M1)
18// ============================================================================
19// These trait aliases reduce repetition of bounds throughout the module.
20// They document the intent: backends must be thread-safe for concurrent access.
21
22/// Thread-safe embedder backend.
23///
24/// Trait alias for embedders that can be shared across threads.
25/// All embedders used with `SemanticSimilarityChecker` must implement this.
26///
27/// Note: This trait is used as a bound on generic parameters. The `dead_code`
28/// lint doesn't recognize trait bound usage, hence the allow attribute.
29#[allow(dead_code)]
30pub trait ThreadSafeEmbedder: Embedder + Send + Sync {}
31
32/// Blanket implementation for all thread-safe embedders.
33impl<T: Embedder + Send + Sync> ThreadSafeEmbedder for T {}
34
35/// Thread-safe vector backend.
36///
37/// Trait alias for vector backends that can be shared across threads.
38/// All vector backends used with `SemanticSimilarityChecker` must implement this.
39///
40/// Note: This trait is used as a bound on generic parameters. The `dead_code`
41/// lint doesn't recognize trait bound usage, hence the allow attribute.
42#[allow(dead_code)]
43pub trait ThreadSafeVectorBackend: VectorBackend + Send + Sync {}
44
45/// Blanket implementation for all thread-safe vector backends.
46impl<T: VectorBackend + Send + Sync> ThreadSafeVectorBackend for T {}
47
48/// Checker for semantic similarity using embeddings.
49///
50/// # How it works
51///
52/// 1. Generates embedding for the new content using the configured embedder
53/// 2. Searches the vector index for similar embeddings
54/// 3. Compares similarity scores against namespace-specific thresholds
55/// 4. Returns the first match that exceeds the threshold
56///
57/// # Thresholds
58///
59/// Per-namespace thresholds are configured in `DeduplicationConfig`:
60/// - Decisions: 0.92 (high - avoid losing unique decisions)
61/// - Patterns: 0.90 (standard threshold)
62/// - Learnings: 0.88 (lower - learnings are often phrased differently)
63/// - Default: 0.90
64///
65/// # Example
66///
67/// ```rust,ignore
68/// use subcog::services::deduplication::{SemanticSimilarityChecker, DeduplicationConfig};
69/// use subcog::embedding::FastEmbedEmbedder;
70/// use subcog::storage::vector::UsearchBackend;
71/// use std::sync::Arc;
72///
73/// let embedder = Arc::new(FastEmbedEmbedder::new());
74/// let vector = Arc::new(UsearchBackend::in_memory(384));
75/// let config = DeduplicationConfig::default();
76/// let checker = SemanticSimilarityChecker::new(embedder, vector, config);
77///
78/// let result = checker.check("Use PostgreSQL for storage", Namespace::Decisions, "project")?;
79/// if let Some((memory_id, urn, score)) = result {
80///     println!("Semantic match found: {} (score: {:.2})", urn, score);
81/// }
82/// ```
83pub struct SemanticSimilarityChecker<E: ThreadSafeEmbedder, V: ThreadSafeVectorBackend> {
84    /// Embedder for generating vectors.
85    embedder: Arc<E>,
86    /// Vector backend for similarity search.
87    vector: Arc<V>,
88    /// Configuration with thresholds.
89    config: DeduplicationConfig,
90}
91
92impl<E: ThreadSafeEmbedder, V: ThreadSafeVectorBackend> SemanticSimilarityChecker<E, V> {
93    /// Creates a new semantic similarity checker.
94    ///
95    /// # Arguments
96    ///
97    /// * `embedder` - The embedding generator
98    /// * `vector` - The vector backend for similarity search
99    /// * `config` - Configuration with per-namespace thresholds
100    #[must_use]
101    pub const fn new(embedder: Arc<E>, vector: Arc<V>, config: DeduplicationConfig) -> Self {
102        Self {
103            embedder,
104            vector,
105            config,
106        }
107    }
108
109    /// Checks if content has a semantic match in the given namespace.
110    ///
111    /// Skips check if content is shorter than `min_semantic_length` configuration.
112    ///
113    /// # Arguments
114    ///
115    /// * `content` - The content to check for duplicates
116    /// * `namespace` - The namespace to search within (determines threshold)
117    /// * `domain` - The domain string for URN construction
118    ///
119    /// # Returns
120    ///
121    /// Returns `Some((MemoryId, URN, score))` if a semantic match is found above threshold,
122    /// `None` otherwise.
123    ///
124    /// # Errors
125    ///
126    /// Returns an error if embedding generation or vector search fails.
127    ///
128    /// # Example
129    ///
130    /// ```rust,ignore
131    /// let result = checker.check("content", Namespace::Decisions, "project")?;
132    /// match result {
133    ///     Some((id, urn, score)) => println!("Similar: {} ({:.2})", urn, score),
134    ///     None => println!("No similar content found"),
135    /// }
136    /// ```
137    #[instrument(
138        skip(self, content),
139        fields(
140            operation = "semantic_similarity_check",
141            namespace = %namespace.as_str(),
142            content_length = content.len()
143        )
144    )]
145    pub fn check(
146        &self,
147        content: &str,
148        namespace: Namespace,
149        domain: &str,
150    ) -> Result<Option<(MemoryId, String, f32)>> {
151        let start = Instant::now();
152
153        // Skip if content is too short for meaningful semantic comparison
154        if content.len() < self.config.min_semantic_length {
155            tracing::debug!(
156                content_length = content.len(),
157                min_length = self.config.min_semantic_length,
158                "Content too short for semantic check"
159            );
160            return Ok(None);
161        }
162
163        // Get threshold for this namespace
164        let threshold = self.config.get_threshold(namespace);
165
166        tracing::debug!(
167            threshold = threshold,
168            namespace = %namespace.as_str(),
169            "Checking semantic similarity"
170        );
171
172        // Generate embedding for the content
173        let embedding = self.embedder.embed(content)?;
174
175        // Build filter for namespace
176        let filter = VectorFilter::new().with_namespace(namespace);
177
178        // Search for similar vectors
179        // Request only 3 results - we only need to find one above threshold (PERF-H2)
180        // Reducing from 10 to 3 improves performance while maintaining effectiveness
181        let results = self.vector.search(&embedding, &filter, 3)?;
182
183        // Record metrics
184        let duration_ms = start.elapsed().as_millis();
185
186        // Find first result above threshold
187        for (memory_id, score) in results {
188            if score >= threshold {
189                let urn = format!("subcog://{}/{}/{}", domain, namespace.as_str(), memory_id);
190
191                tracing::debug!(
192                    memory_id = %memory_id,
193                    urn = %urn,
194                    score = score,
195                    threshold = threshold,
196                    duration_ms = %duration_ms,
197                    "Semantic match found"
198                );
199
200                metrics::histogram!(
201                    "deduplication_check_duration_ms",
202                    "checker" => "semantic_similarity",
203                    "found" => "true"
204                )
205                .record(duration_ms as f64);
206
207                return Ok(Some((memory_id, urn, score)));
208            }
209        }
210
211        tracing::debug!(
212            threshold = threshold,
213            duration_ms = %duration_ms,
214            "No semantic match found above threshold"
215        );
216
217        metrics::histogram!(
218            "deduplication_check_duration_ms",
219            "checker" => "semantic_similarity",
220            "found" => "false"
221        )
222        .record(duration_ms as f64);
223
224        Ok(None)
225    }
226
227    /// Generates an embedding for the given content.
228    ///
229    /// Useful for recording captures - the embedding should be stored
230    /// in the vector index for future semantic matching.
231    ///
232    /// # Arguments
233    ///
234    /// * `content` - The content to embed
235    ///
236    /// # Returns
237    ///
238    /// The embedding vector.
239    ///
240    /// # Errors
241    ///
242    /// Returns an error if embedding generation fails.
243    #[cfg(test)]
244    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
245        self.embedder.embed(content)
246    }
247
248    /// Returns the configured threshold for a namespace.
249    ///
250    /// # Arguments
251    ///
252    /// * `namespace` - The namespace to get threshold for
253    #[cfg(test)]
254    #[must_use]
255    pub fn get_threshold(&self, namespace: Namespace) -> f32 {
256        self.config.get_threshold(namespace)
257    }
258}
259
260#[cfg(test)]
261mod tests {
262    use super::*;
263    use crate::embedding::FastEmbedEmbedder;
264    use crate::storage::vector::UsearchBackend;
265    use std::sync::RwLock;
266
267    /// Computes cosine similarity between two vectors.
268    ///
269    /// Used only for testing similarity calculations.
270    ///
271    /// # Arguments
272    ///
273    /// * `a` - First vector
274    /// * `b` - Second vector
275    ///
276    /// # Returns
277    ///
278    /// Cosine similarity normalized to [0, 1] range.
279    /// Returns 0.0 if vectors have different dimensions or zero magnitude.
280    fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
281        if a.len() != b.len() {
282            return 0.0;
283        }
284
285        let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
286        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
287        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
288
289        if norm_a == 0.0 || norm_b == 0.0 {
290            return 0.0;
291        }
292
293        // Cosine similarity ranges from -1 to 1, normalize to 0 to 1
294        f32::midpoint(dot_product / (norm_a * norm_b), 1.0)
295    }
296
297    /// Creates a usearch backend for tests.
298    /// Handles the Result return type when usearch-hnsw feature is enabled.
299    #[cfg(not(feature = "usearch-hnsw"))]
300    fn create_usearch_backend(dimensions: usize) -> UsearchBackend {
301        UsearchBackend::in_memory(dimensions)
302    }
303
304    /// Creates a usearch backend for tests.
305    /// Handles the Result return type when usearch-hnsw feature is enabled.
306    #[cfg(feature = "usearch-hnsw")]
307    fn create_usearch_backend(dimensions: usize) -> UsearchBackend {
308        UsearchBackend::in_memory(dimensions).expect("Failed to create usearch backend")
309    }
310
311    /// Helper to create a test checker with in-memory backend.
312    fn create_test_checker() -> SemanticSimilarityChecker<FastEmbedEmbedder, RwLockWrapper> {
313        let embedder = Arc::new(FastEmbedEmbedder::new());
314        let vector = Arc::new(RwLockWrapper::new(create_usearch_backend(
315            FastEmbedEmbedder::DEFAULT_DIMENSIONS,
316        )));
317        let config = DeduplicationConfig::default();
318        SemanticSimilarityChecker::new(embedder, vector, config)
319    }
320
321    /// Wrapper to make `UsearchBackend` work with Arc (needs interior mutability for tests).
322    struct RwLockWrapper {
323        inner: RwLock<UsearchBackend>,
324    }
325
326    impl RwLockWrapper {
327        fn new(backend: UsearchBackend) -> Self {
328            Self {
329                inner: RwLock::new(backend),
330            }
331        }
332    }
333
334    impl VectorBackend for RwLockWrapper {
335        fn dimensions(&self) -> usize {
336            self.inner.read().unwrap().dimensions()
337        }
338
339        fn upsert(&self, id: &MemoryId, embedding: &[f32]) -> Result<()> {
340            self.inner.write().unwrap().upsert(id, embedding)
341        }
342
343        fn remove(&self, id: &MemoryId) -> Result<bool> {
344            self.inner.write().unwrap().remove(id)
345        }
346
347        fn search(
348            &self,
349            query_embedding: &[f32],
350            filter: &VectorFilter,
351            limit: usize,
352        ) -> Result<Vec<(MemoryId, f32)>> {
353            self.inner
354                .read()
355                .unwrap()
356                .search(query_embedding, filter, limit)
357        }
358
359        fn count(&self) -> Result<usize> {
360            self.inner.read().unwrap().count()
361        }
362
363        fn clear(&self) -> Result<()> {
364            self.inner.write().unwrap().clear()
365        }
366    }
367
368    #[test]
369    fn test_cosine_similarity_same_vector() {
370        let v = vec![1.0, 0.0, 0.0];
371        let similarity = cosine_similarity(&v, &v);
372        assert!((similarity - 1.0).abs() < 0.001);
373    }
374
375    #[test]
376    fn test_cosine_similarity_orthogonal() {
377        let v1 = vec![1.0, 0.0, 0.0];
378        let v2 = vec![0.0, 1.0, 0.0];
379        let similarity = cosine_similarity(&v1, &v2);
380        // Normalized to [0, 1], so orthogonal = 0.5
381        assert!((similarity - 0.5).abs() < 0.001);
382    }
383
384    #[test]
385    fn test_cosine_similarity_opposite() {
386        let v1 = vec![1.0, 0.0, 0.0];
387        let v2 = vec![-1.0, 0.0, 0.0];
388        let similarity = cosine_similarity(&v1, &v2);
389        // Opposite vectors = 0 in [0, 1] range
390        assert!(similarity < 0.001);
391    }
392
393    #[test]
394    fn test_cosine_similarity_different_dimensions() {
395        let v1 = vec![1.0, 0.0];
396        let v2 = vec![1.0, 0.0, 0.0];
397        let similarity = cosine_similarity(&v1, &v2);
398        assert!(similarity < f32::EPSILON);
399    }
400
401    #[test]
402    fn test_cosine_similarity_zero_vector() {
403        let v1 = vec![0.0, 0.0, 0.0];
404        let v2 = vec![1.0, 0.0, 0.0];
405        let similarity = cosine_similarity(&v1, &v2);
406        assert!(similarity < f32::EPSILON);
407    }
408
409    #[test]
410    fn test_check_short_content_skipped() {
411        let checker = create_test_checker();
412
413        // Content shorter than min_semantic_length (50) should be skipped
414        let result = checker
415            .check("short", Namespace::Decisions, "project")
416            .unwrap();
417        assert!(result.is_none());
418    }
419
420    #[test]
421    fn test_check_no_match() {
422        let checker = create_test_checker();
423
424        // Content long enough but no vectors in the index
425        let content = "This is a sufficiently long piece of content that should trigger semantic similarity checking in the deduplication system.";
426        let result = checker
427            .check(content, Namespace::Decisions, "project")
428            .unwrap();
429        assert!(result.is_none());
430    }
431
432    #[test]
433    fn test_check_with_match() {
434        let embedder = Arc::new(FastEmbedEmbedder::new());
435        let vector = Arc::new(RwLockWrapper::new(create_usearch_backend(
436            FastEmbedEmbedder::DEFAULT_DIMENSIONS,
437        )));
438        let config = DeduplicationConfig::default();
439
440        // Add a vector to the index
441        let existing_content =
442            "Use PostgreSQL as the primary database for storing user data and application state.";
443        let existing_embedding = embedder.embed(existing_content).unwrap();
444        vector
445            .upsert(&MemoryId::new("existing-memory-123"), &existing_embedding)
446            .unwrap();
447
448        let checker = SemanticSimilarityChecker::new(embedder, vector, config);
449
450        // Check with identical content (should match with very high score)
451        let result = checker
452            .check(existing_content, Namespace::Decisions, "project")
453            .unwrap();
454
455        assert!(result.is_some());
456        let (id, urn, score) = result.unwrap();
457        assert_eq!(id.as_str(), "existing-memory-123");
458        assert_eq!(urn, "subcog://project/decisions/existing-memory-123");
459        assert!(score > 0.99); // Near-identical content
460    }
461
462    #[test]
463    fn test_check_below_threshold() {
464        let embedder = Arc::new(FastEmbedEmbedder::new());
465        let vector = Arc::new(RwLockWrapper::new(create_usearch_backend(
466            FastEmbedEmbedder::DEFAULT_DIMENSIONS,
467        )));
468
469        // Use a very high threshold
470        let config = DeduplicationConfig::default().with_default_threshold(0.99);
471
472        // Add a vector
473        let existing_content = "Use PostgreSQL as the primary database for storing user data.";
474        let existing_embedding = embedder.embed(existing_content).unwrap();
475        vector
476            .upsert(&MemoryId::new("existing-memory"), &existing_embedding)
477            .unwrap();
478
479        let checker = SemanticSimilarityChecker::new(embedder, vector, config);
480
481        // Check with different content - should be below threshold
482        let new_content =
483            "Use MongoDB for document storage in the application for maximum flexibility.";
484        let result = checker
485            .check(new_content, Namespace::Decisions, "project")
486            .unwrap();
487
488        // May or may not match depending on pseudo-embedding behavior
489        // With a 0.99 threshold, different content should not match
490        if let Some((_, _, score)) = result {
491            assert!(score >= 0.99);
492        }
493    }
494
495    #[test]
496    fn test_get_threshold() {
497        let checker = create_test_checker();
498
499        // Check namespace-specific thresholds
500        assert!((checker.get_threshold(Namespace::Decisions) - 0.92).abs() < f32::EPSILON);
501        assert!((checker.get_threshold(Namespace::Patterns) - 0.90).abs() < f32::EPSILON);
502        assert!((checker.get_threshold(Namespace::Learnings) - 0.88).abs() < f32::EPSILON);
503
504        // Unconfigured namespaces use default
505        assert!((checker.get_threshold(Namespace::Blockers) - 0.90).abs() < f32::EPSILON);
506    }
507
508    #[test]
509    fn test_embed() {
510        let checker = create_test_checker();
511
512        let content = "Test content for embedding generation";
513        let result = checker.embed(content);
514
515        assert!(result.is_ok());
516        let embedding = result.unwrap();
517        assert_eq!(embedding.len(), FastEmbedEmbedder::DEFAULT_DIMENSIONS);
518    }
519
520    mod property_tests {
521        use super::*;
522        use proptest::prelude::*;
523
524        /// Normalize a vector to unit length, or return a default unit vector if too small.
525        fn normalize_vector(v: Vec<f32>) -> Vec<f32> {
526            let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
527            if norm < f32::EPSILON {
528                default_unit_vector(v.len())
529            } else {
530                v.into_iter().map(|x| x / norm).collect()
531            }
532        }
533
534        /// Create a default unit vector of given dimension.
535        fn default_unit_vector(dim: usize) -> Vec<f32> {
536            let mut result = vec![0.0; dim];
537            if !result.is_empty() {
538                result[0] = 1.0;
539            }
540            result
541        }
542
543        /// Strategy for generating valid normalized vectors.
544        fn normalized_vec(dim: usize) -> impl Strategy<Value = Vec<f32>> {
545            prop::collection::vec(-1.0f32..1.0f32, dim).prop_map(normalize_vector)
546        }
547
548        proptest! {
549            /// Cosine similarity of a vector with itself is always 1.0.
550            #[test]
551            fn prop_similarity_identity(v in normalized_vec(10)) {
552                let sim = cosine_similarity(&v, &v);
553                prop_assert!((sim - 1.0).abs() < 0.001, "Self-similarity should be 1.0, got {sim}");
554            }
555
556            /// Cosine similarity is symmetric: sim(a, b) == sim(b, a).
557            #[test]
558            fn prop_similarity_symmetric(
559                v1 in normalized_vec(10),
560                v2 in normalized_vec(10)
561            ) {
562                let sim_ab = cosine_similarity(&v1, &v2);
563                let sim_ba = cosine_similarity(&v2, &v1);
564                prop_assert!(
565                    (sim_ab - sim_ba).abs() < 0.001,
566                    "Symmetry violated: sim(a,b)={sim_ab}, sim(b,a)={sim_ba}"
567                );
568            }
569
570            /// Cosine similarity is always in the range [0.0, 1.0].
571            #[test]
572            fn prop_similarity_bounded(
573                v1 in normalized_vec(10),
574                v2 in normalized_vec(10)
575            ) {
576                let sim = cosine_similarity(&v1, &v2);
577                prop_assert!(
578                    (0.0..=1.0).contains(&sim),
579                    "Similarity {sim} out of bounds [0, 1]"
580                );
581            }
582
583            /// Empty vectors should return 0.0.
584            #[test]
585            fn prop_empty_vectors_zero(_dummy: u8) {
586                let sim = cosine_similarity(&[], &[]);
587                prop_assert!(sim < f32::EPSILON, "Empty vectors should return 0.0, got {sim}");
588            }
589
590            /// Different dimension vectors should return 0.0.
591            #[test]
592            fn prop_different_dimensions_zero(
593                v1 in normalized_vec(5),
594                v2 in normalized_vec(10)
595            ) {
596                let sim = cosine_similarity(&v1, &v2);
597                prop_assert!(sim < f32::EPSILON, "Different dimension vectors should return 0.0, got {sim}");
598            }
599        }
600    }
601}