Skip to main content

subcog/services/deduplication/
mod.rs

1//! Deduplication service for pre-compact hook.
2//!
3//! This module provides three-tier deduplication checking:
4//! 1. **Exact match**: SHA-256 hash comparison via tag search
5//! 2. **Semantic similarity**: `FastEmbed` embeddings with cosine similarity threshold
6//! 3. **Recent capture**: In-memory LRU cache with TTL-based expiration
7//!
8//! The service implements short-circuit evaluation: it returns as soon as any tier reports a match.
9//!
10//! # Architecture
11//!
12//! ```text
13//! ┌─────────────────────────────────────────────────────────────────┐
14//! │                    DeduplicationService                         │
15//! │  ┌──────────────┐  ┌──────────────┐  ┌────────────────────────┐ │
16//! │  │ ExactMatch   │  │ Semantic     │  │ RecentCapture          │ │
17//! │  │ Checker      │  │ Checker      │  │ Checker                │ │
18//! │  │              │  │              │  │                        │ │
19//! │  │ SHA256 hash  │  │ Embedding    │  │ LRU Cache with TTL     │ │
20//! │  │ comparison   │  │ similarity   │  │ (5 min window)         │ │
21//! │  └──────────────┘  └──────────────┘  └────────────────────────┘ │
22//! └─────────────────────────────────────────────────────────────────┘
23//! ```
24//!
25//! # Example
26//!
27//! ```rust,ignore
28//! use subcog::services::deduplication::{DeduplicationService, DeduplicationConfig};
29//!
30//! let config = DeduplicationConfig::default();
31//! let service = DeduplicationService::new(recall, embedder, config);
32//!
33//! let result = service.check_duplicate("Use PostgreSQL for primary storage", Namespace::Decisions)?;
34//! if result.is_duplicate {
35//!     println!("Skipping duplicate: {:?}", result.reason);
36//! }
37//! ```
38
39mod config;
40mod exact_match;
41mod hasher;
42mod recent;
43mod semantic;
44mod service;
45mod types;
46
47// Public API: Only expose what users need to interact with the service
48pub use config::DeduplicationConfig;
49pub use hasher::ContentHasher;
50pub use service::DeduplicationService;
51pub use types::{Deduplicator, DuplicateCheckResult, DuplicateReason};