subcog/services/deduplication/
semantic.rs1use crate::Result;
7use crate::embedding::Embedder;
8use crate::models::{MemoryId, Namespace};
9use crate::storage::traits::{VectorBackend, VectorFilter};
10use std::sync::Arc;
11use std::time::Instant;
12use tracing::instrument;
13
14use super::config::DeduplicationConfig;
15
16#[allow(dead_code)]
30pub trait ThreadSafeEmbedder: Embedder + Send + Sync {}
31
32impl<T: Embedder + Send + Sync> ThreadSafeEmbedder for T {}
34
35#[allow(dead_code)]
43pub trait ThreadSafeVectorBackend: VectorBackend + Send + Sync {}
44
45impl<T: VectorBackend + Send + Sync> ThreadSafeVectorBackend for T {}
47
48pub struct SemanticSimilarityChecker<E: ThreadSafeEmbedder, V: ThreadSafeVectorBackend> {
84 embedder: Arc<E>,
86 vector: Arc<V>,
88 config: DeduplicationConfig,
90}
91
92impl<E: ThreadSafeEmbedder, V: ThreadSafeVectorBackend> SemanticSimilarityChecker<E, V> {
93 #[must_use]
101 pub const fn new(embedder: Arc<E>, vector: Arc<V>, config: DeduplicationConfig) -> Self {
102 Self {
103 embedder,
104 vector,
105 config,
106 }
107 }
108
109 #[instrument(
138 skip(self, content),
139 fields(
140 operation = "semantic_similarity_check",
141 namespace = %namespace.as_str(),
142 content_length = content.len()
143 )
144 )]
145 pub fn check(
146 &self,
147 content: &str,
148 namespace: Namespace,
149 domain: &str,
150 ) -> Result<Option<(MemoryId, String, f32)>> {
151 let start = Instant::now();
152
153 if content.len() < self.config.min_semantic_length {
155 tracing::debug!(
156 content_length = content.len(),
157 min_length = self.config.min_semantic_length,
158 "Content too short for semantic check"
159 );
160 return Ok(None);
161 }
162
163 let threshold = self.config.get_threshold(namespace);
165
166 tracing::debug!(
167 threshold = threshold,
168 namespace = %namespace.as_str(),
169 "Checking semantic similarity"
170 );
171
172 let embedding = self.embedder.embed(content)?;
174
175 let filter = VectorFilter::new().with_namespace(namespace);
177
178 let results = self.vector.search(&embedding, &filter, 3)?;
182
183 let duration_ms = start.elapsed().as_millis();
185
186 for (memory_id, score) in results {
188 if score >= threshold {
189 let urn = format!("subcog://{}/{}/{}", domain, namespace.as_str(), memory_id);
190
191 tracing::debug!(
192 memory_id = %memory_id,
193 urn = %urn,
194 score = score,
195 threshold = threshold,
196 duration_ms = %duration_ms,
197 "Semantic match found"
198 );
199
200 metrics::histogram!(
201 "deduplication_check_duration_ms",
202 "checker" => "semantic_similarity",
203 "found" => "true"
204 )
205 .record(duration_ms as f64);
206
207 return Ok(Some((memory_id, urn, score)));
208 }
209 }
210
211 tracing::debug!(
212 threshold = threshold,
213 duration_ms = %duration_ms,
214 "No semantic match found above threshold"
215 );
216
217 metrics::histogram!(
218 "deduplication_check_duration_ms",
219 "checker" => "semantic_similarity",
220 "found" => "false"
221 )
222 .record(duration_ms as f64);
223
224 Ok(None)
225 }
226
227 #[cfg(test)]
244 pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
245 self.embedder.embed(content)
246 }
247
248 #[cfg(test)]
254 #[must_use]
255 pub fn get_threshold(&self, namespace: Namespace) -> f32 {
256 self.config.get_threshold(namespace)
257 }
258}
259
260#[cfg(test)]
261mod tests {
262 use super::*;
263 use crate::embedding::FastEmbedEmbedder;
264 use crate::storage::vector::UsearchBackend;
265 use std::sync::RwLock;
266
/// Cosine similarity of `a` and `b`, rescaled from `[-1, 1]` to `[0, 1]`.
///
/// Returns `0.0` when the slices have different lengths or when either has a
/// zero norm (which also covers empty slices).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    let dot_product: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let norm_a = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b = b.iter().map(|x| x * x).sum::<f32>().sqrt();

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    // f32 rounding in sqrt/mul/div can push the raw cosine a hair outside [-1, 1]
    // (e.g. [1, 1] vs [-1, -1] gives -1.0000001), which would make the rescaled
    // value leave [0, 1]. Clamp first so the documented range always holds.
    let cosine = (dot_product / (norm_a * norm_b)).clamp(-1.0, 1.0);
    f32::midpoint(cosine, 1.0)
}
296
297 #[cfg(not(feature = "usearch-hnsw"))]
300 fn create_usearch_backend(dimensions: usize) -> UsearchBackend {
301 UsearchBackend::in_memory(dimensions)
302 }
303
304 #[cfg(feature = "usearch-hnsw")]
307 fn create_usearch_backend(dimensions: usize) -> UsearchBackend {
308 UsearchBackend::in_memory(dimensions).expect("Failed to create usearch backend")
309 }
310
311 fn create_test_checker() -> SemanticSimilarityChecker<FastEmbedEmbedder, RwLockWrapper> {
313 let embedder = Arc::new(FastEmbedEmbedder::new());
314 let vector = Arc::new(RwLockWrapper::new(create_usearch_backend(
315 FastEmbedEmbedder::DEFAULT_DIMENSIONS,
316 )));
317 let config = DeduplicationConfig::default();
318 SemanticSimilarityChecker::new(embedder, vector, config)
319 }
320
321 struct RwLockWrapper {
323 inner: RwLock<UsearchBackend>,
324 }
325
326 impl RwLockWrapper {
327 fn new(backend: UsearchBackend) -> Self {
328 Self {
329 inner: RwLock::new(backend),
330 }
331 }
332 }
333
334 impl VectorBackend for RwLockWrapper {
335 fn dimensions(&self) -> usize {
336 self.inner.read().unwrap().dimensions()
337 }
338
339 fn upsert(&self, id: &MemoryId, embedding: &[f32]) -> Result<()> {
340 self.inner.write().unwrap().upsert(id, embedding)
341 }
342
343 fn remove(&self, id: &MemoryId) -> Result<bool> {
344 self.inner.write().unwrap().remove(id)
345 }
346
347 fn search(
348 &self,
349 query_embedding: &[f32],
350 filter: &VectorFilter,
351 limit: usize,
352 ) -> Result<Vec<(MemoryId, f32)>> {
353 self.inner
354 .read()
355 .unwrap()
356 .search(query_embedding, filter, limit)
357 }
358
359 fn count(&self) -> Result<usize> {
360 self.inner.read().unwrap().count()
361 }
362
363 fn clear(&self) -> Result<()> {
364 self.inner.write().unwrap().clear()
365 }
366 }
367
/// A vector compared with itself scores 1.0.
#[test]
fn test_cosine_similarity_same_vector() {
    let unit_x = [1.0_f32, 0.0, 0.0];
    let delta = (cosine_similarity(&unit_x, &unit_x) - 1.0).abs();
    assert!(delta < 0.001);
}
374
/// Orthogonal vectors have a raw cosine of 0, which rescales to 0.5.
#[test]
fn test_cosine_similarity_orthogonal() {
    let unit_x = [1.0_f32, 0.0, 0.0];
    let unit_y = [0.0_f32, 1.0, 0.0];
    let delta = (cosine_similarity(&unit_x, &unit_y) - 0.5).abs();
    assert!(delta < 0.001);
}
383
/// Opposite vectors have a raw cosine of -1, which rescales to 0.0.
#[test]
fn test_cosine_similarity_opposite() {
    let v1 = vec![1.0, 0.0, 0.0];
    let v2 = vec![-1.0, 0.0, 0.0];
    let similarity = cosine_similarity(&v1, &v2);
    // Two-sided check: a bare `< 0.001` would also accept arbitrarily negative
    // results, which are outside the function's [0, 1] range.
    assert!(similarity.abs() < 0.001);
}
392
/// Slices of different lengths are treated as having no similarity.
#[test]
fn test_cosine_similarity_different_dimensions() {
    let v1 = vec![1.0, 0.0];
    let v2 = vec![1.0, 0.0, 0.0];
    let similarity = cosine_similarity(&v1, &v2);
    // Two-sided check so a negative result cannot slip through.
    assert!(similarity.abs() < f32::EPSILON);
}
400
/// A zero-norm vector is treated as having no similarity to anything.
#[test]
fn test_cosine_similarity_zero_vector() {
    let v1 = vec![0.0, 0.0, 0.0];
    let v2 = vec![1.0, 0.0, 0.0];
    let similarity = cosine_similarity(&v1, &v2);
    // Two-sided check so a negative result cannot slip through.
    assert!(similarity.abs() < f32::EPSILON);
}
408
/// Very short content yields no match from `check`.
#[test]
fn test_check_short_content_skipped() {
    let outcome = create_test_checker()
        .check("short", Namespace::Decisions, "project")
        .unwrap();

    assert!(outcome.is_none());
}
419
/// With an empty index, long content finds no duplicate.
#[test]
fn test_check_no_match() {
    let content = "This is a sufficiently long piece of content that should trigger semantic similarity checking in the deduplication system.";

    let outcome = create_test_checker()
        .check(content, Namespace::Decisions, "project")
        .unwrap();

    assert!(outcome.is_none());
}
431
/// Content identical to an indexed memory is reported as a duplicate of it.
#[test]
fn test_check_with_match() {
    let embedder = Arc::new(FastEmbedEmbedder::new());
    let vector = Arc::new(RwLockWrapper::new(create_usearch_backend(
        FastEmbedEmbedder::DEFAULT_DIMENSIONS,
    )));

    // Seed the index with the embedding of the content we will check again.
    let existing_content =
        "Use PostgreSQL as the primary database for storing user data and application state.";
    let stored_id = MemoryId::new("existing-memory-123");
    let stored_embedding = embedder.embed(existing_content).unwrap();
    vector.upsert(&stored_id, &stored_embedding).unwrap();

    let checker = SemanticSimilarityChecker::new(embedder, vector, DeduplicationConfig::default());

    let (id, urn, score) = checker
        .check(existing_content, Namespace::Decisions, "project")
        .unwrap()
        .expect("identical content should be reported as a duplicate");

    assert_eq!(id.as_str(), "existing-memory-123");
    assert_eq!(urn, "subcog://project/decisions/existing-memory-123");
    assert!(score > 0.99);
}
461
/// With a 0.99 default threshold, any match that is reported must score at least 0.99.
///
/// The test does not require a particular outcome; it only checks that a
/// reported match respects the configured threshold.
#[test]
fn test_check_below_threshold() {
    let embedder = Arc::new(FastEmbedEmbedder::new());
    let vector = Arc::new(RwLockWrapper::new(create_usearch_backend(
        FastEmbedEmbedder::DEFAULT_DIMENSIONS,
    )));
    let strict_config = DeduplicationConfig::default().with_default_threshold(0.99);

    // Seed the index with a related but different statement.
    let stored_embedding = embedder
        .embed("Use PostgreSQL as the primary database for storing user data.")
        .unwrap();
    vector
        .upsert(&MemoryId::new("existing-memory"), &stored_embedding)
        .unwrap();

    let checker = SemanticSimilarityChecker::new(embedder, vector, strict_config);
    let outcome = checker
        .check(
            "Use MongoDB for document storage in the application for maximum flexibility.",
            Namespace::Decisions,
            "project",
        )
        .unwrap();

    if let Some((.., score)) = outcome {
        assert!(score >= 0.99);
    }
}
494
/// The default configuration yields the expected threshold for each namespace.
#[test]
fn test_get_threshold() {
    let checker = create_test_checker();

    let expected = [
        (Namespace::Decisions, 0.92_f32),
        (Namespace::Patterns, 0.90),
        (Namespace::Learnings, 0.88),
        (Namespace::Blockers, 0.90),
    ];

    for (namespace, want) in expected {
        let got = checker.get_threshold(namespace);
        assert!(
            (got - want).abs() < f32::EPSILON,
            "threshold for {}: got {got}, want {want}",
            namespace.as_str()
        );
    }
}
507
/// The test-only `embed` passthrough returns a vector of the embedder's default dimensionality.
#[test]
fn test_embed() {
    let embedding = create_test_checker()
        .embed("Test content for embedding generation")
        .expect("embedding generation should succeed");

    assert_eq!(embedding.len(), FastEmbedEmbedder::DEFAULT_DIMENSIONS);
}
519
520 mod property_tests {
521 use super::*;
522 use proptest::prelude::*;
523
524 fn normalize_vector(v: Vec<f32>) -> Vec<f32> {
526 let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
527 if norm < f32::EPSILON {
528 default_unit_vector(v.len())
529 } else {
530 v.into_iter().map(|x| x / norm).collect()
531 }
532 }
533
/// Returns a `dim`-element vector with 1.0 in the first position and 0.0 elsewhere
/// (empty when `dim` is 0).
fn default_unit_vector(dim: usize) -> Vec<f32> {
    let mut axis = vec![0.0; dim];
    if let Some(first) = axis.first_mut() {
        *first = 1.0;
    }
    axis
}
542
543 fn normalized_vec(dim: usize) -> impl Strategy<Value = Vec<f32>> {
545 prop::collection::vec(-1.0f32..1.0f32, dim).prop_map(normalize_vector)
546 }
547
548 proptest! {
549 #[test]
551 fn prop_similarity_identity(v in normalized_vec(10)) {
552 let sim = cosine_similarity(&v, &v);
553 prop_assert!((sim - 1.0).abs() < 0.001, "Self-similarity should be 1.0, got {sim}");
554 }
555
556 #[test]
558 fn prop_similarity_symmetric(
559 v1 in normalized_vec(10),
560 v2 in normalized_vec(10)
561 ) {
562 let sim_ab = cosine_similarity(&v1, &v2);
563 let sim_ba = cosine_similarity(&v2, &v1);
564 prop_assert!(
565 (sim_ab - sim_ba).abs() < 0.001,
566 "Symmetry violated: sim(a,b)={sim_ab}, sim(b,a)={sim_ba}"
567 );
568 }
569
570 #[test]
572 fn prop_similarity_bounded(
573 v1 in normalized_vec(10),
574 v2 in normalized_vec(10)
575 ) {
576 let sim = cosine_similarity(&v1, &v2);
577 prop_assert!(
578 (0.0..=1.0).contains(&sim),
579 "Similarity {sim} out of bounds [0, 1]"
580 );
581 }
582
583 #[test]
585 fn prop_empty_vectors_zero(_dummy: u8) {
586 let sim = cosine_similarity(&[], &[]);
587 prop_assert!(sim < f32::EPSILON, "Empty vectors should return 0.0, got {sim}");
588 }
589
590 #[test]
592 fn prop_different_dimensions_zero(
593 v1 in normalized_vec(5),
594 v2 in normalized_vec(10)
595 ) {
596 let sim = cosine_similarity(&v1, &v2);
597 prop_assert!(sim < f32::EPSILON, "Different dimension vectors should return 0.0, got {sim}");
598 }
599 }
600 }
601}