subcog/services/deduplication/mod.rs
1//! Deduplication service for the pre-compact hook.
2//!
3//! This module provides three-tier deduplication checking:
4//! 1. **Exact match**: SHA256 hash comparison via tag search
5//! 2. **Semantic similarity**: `FastEmbed` embeddings with cosine similarity threshold
6//! 3. **Recent capture**: In-memory LRU cache with TTL-based expiration
7//!
8//! The service short-circuits: it stops at the first tier that reports a match.
9//!
10//! # Architecture
11//!
12//! ```text
13//! ┌─────────────────────────────────────────────────────────────────┐
14//! │                      DeduplicationService                       │
15//! │  ┌──────────────┐  ┌──────────────┐  ┌────────────────────────┐ │
16//! │  │ ExactMatch   │  │ Semantic     │  │ RecentCapture          │ │
17//! │  │ Checker      │  │ Checker      │  │ Checker                │ │
18//! │  │              │  │              │  │                        │ │
19//! │  │ SHA256 hash  │  │ Embedding    │  │ LRU Cache with TTL     │ │
20//! │  │ comparison   │  │ similarity   │  │ (5 min window)         │ │
21//! │  └──────────────┘  └──────────────┘  └────────────────────────┘ │
22//! └─────────────────────────────────────────────────────────────────┘
23//! ```
24//!
25//! # Example
26//!
27//! ```rust,ignore
28//! use subcog::services::deduplication::{DeduplicationService, DeduplicationConfig};
29//!
30//! let config = DeduplicationConfig::default();
31//! let service = DeduplicationService::new(recall, embedder, config);
32//!
33//! let result = service.check_duplicate("Use PostgreSQL for primary storage", Namespace::Decisions)?;
34//! if result.is_duplicate {
35//!     println!("Skipping duplicate: {:?}", result.reason);
36//! }
37//! ```
38
39mod config; // provides `DeduplicationConfig` (re-exported below)
40mod exact_match; // private; presumably the exact-match (SHA256 hash) tier -- confirm against the module
41mod hasher; // provides `ContentHasher` (re-exported below)
42mod recent; // private; presumably the recent-capture tier -- confirm against the module
43mod semantic; // private; presumably the semantic-similarity tier -- confirm against the module
44mod service; // provides `DeduplicationService` (re-exported below)
45mod types; // provides `Deduplicator`, `DuplicateCheckResult`, `DuplicateReason` (re-exported below)
46
47// Public API: re-export only the items callers need; every submodule above stays private.
48pub use config::DeduplicationConfig;
49pub use hasher::ContentHasher;
50pub use service::DeduplicationService;
51pub use types::{Deduplicator, DuplicateCheckResult, DuplicateReason};