Skip to main content

subcog/services/deduplication/
exact_match.rs

1//! Exact match deduplication checker.
2//!
3//! Detects duplicates by comparing SHA256 content hashes stored as tags.
4//! Uses `hash:sha256:<prefix>` tag format for efficient lookup.
5
6use crate::Result;
7use crate::models::{MemoryId, Namespace, SearchFilter};
8use crate::services::recall::RecallService;
9use std::sync::Arc;
10use std::time::Instant;
11use tracing::instrument;
12
13use super::hasher::ContentHasher;
14
15/// Checker for exact content match via SHA256 hash.
16///
17/// # How it works
18///
19/// 1. Computes SHA256 hash of normalized content
20/// 2. Converts hash to tag format: `hash:sha256:<16-char-prefix>`
21/// 3. Searches for memories with matching tag in the specified namespace
22/// 4. Returns the first matching memory ID if found
23///
24/// # Example
25///
26/// ```rust,ignore
27/// use subcog::services::deduplication::ExactMatchChecker;
28/// use subcog::services::recall::RecallService;
29/// use std::sync::Arc;
30///
31/// let recall = Arc::new(RecallService::default());
32/// let checker = ExactMatchChecker::new(recall);
33///
34/// let result = checker.check("Use PostgreSQL for storage", Namespace::Decisions)?;
35/// if let Some((memory_id, urn)) = result {
36///     println!("Exact match found: {}", urn);
37/// }
38/// ```
39pub struct ExactMatchChecker {
40    /// Recall service for searching memories.
41    recall: Arc<RecallService>,
42}
43
44impl ExactMatchChecker {
45    /// Creates a new exact match checker.
46    ///
47    /// # Arguments
48    ///
49    /// * `recall` - The recall service for searching memories
50    #[must_use]
51    pub const fn new(recall: Arc<RecallService>) -> Self {
52        Self { recall }
53    }
54
55    /// Checks if content has an exact match in the given namespace.
56    ///
57    /// # Arguments
58    ///
59    /// * `content` - The content to check for duplicates
60    /// * `namespace` - The namespace to search within
61    /// * `domain` - The domain string for URN construction
62    ///
63    /// # Returns
64    ///
65    /// Returns `Some((MemoryId, URN))` if an exact match is found, `None` otherwise.
66    ///
67    /// # Errors
68    ///
69    /// Returns an error if the search operation fails.
70    ///
71    /// # Example
72    ///
73    /// ```rust,ignore
74    /// let result = checker.check("content", Namespace::Decisions, "project")?;
75    /// match result {
76    ///     Some((id, urn)) => println!("Duplicate: {}", urn),
77    ///     None => println!("No duplicate found"),
78    /// }
79    /// ```
80    #[instrument(
81        skip(self, content),
82        fields(
83            operation = "exact_match_check",
84            namespace = %namespace.as_str(),
85            content_length = content.len()
86        )
87    )]
88    #[allow(clippy::cast_precision_loss)] // Precision loss acceptable for duration metrics
89    #[allow(clippy::option_if_let_else)] // if-let is clearer for this pattern
90    pub fn check(
91        &self,
92        content: &str,
93        namespace: Namespace,
94        domain: &str,
95    ) -> Result<Option<(MemoryId, String)>> {
96        let start = Instant::now();
97
98        // Compute hash and convert to tag
99        let hash = ContentHasher::hash(content);
100        let hash_tag = ContentHasher::hash_to_tag(&hash);
101
102        tracing::debug!(hash_tag = %hash_tag, "Searching for exact match");
103
104        // Build filter for namespace and hash tag
105        let filter = SearchFilter::new()
106            .with_namespace(namespace)
107            .with_tag(&hash_tag);
108
109        // Use list_all to find memories with matching tag
110        // We only need 1 result since exact match means identical
111        let result = self.recall.list_all(&filter, 1)?;
112
113        // Record metrics
114        let duration_ms = start.elapsed().as_millis();
115        metrics::histogram!(
116            "deduplication_check_duration_ms",
117            "checker" => "exact_match",
118            "found" => if result.memories.is_empty() { "false" } else { "true" }
119        )
120        .record(duration_ms as f64);
121
122        if let Some(hit) = result.memories.first() {
123            let memory_id = hit.memory.id.clone();
124            let urn = format!("subcog://{}/{}/{}", domain, namespace.as_str(), memory_id);
125
126            tracing::debug!(
127                memory_id = %memory_id,
128                urn = %urn,
129                duration_ms = %duration_ms,
130                "Exact match found"
131            );
132
133            Ok(Some((memory_id, urn)))
134        } else {
135            tracing::debug!(duration_ms = %duration_ms, "No exact match found");
136            Ok(None)
137        }
138    }
139
140    /// Returns the hash tag for the given content.
141    ///
142    /// Useful for recording captures - the hash tag should be added
143    /// to the memory's tags for future exact match detection.
144    ///
145    /// # Arguments
146    ///
147    /// * `content` - The content to hash
148    ///
149    /// # Returns
150    ///
151    /// The hash tag in format `hash:sha256:<16-char-prefix>`
152    #[must_use]
153    pub fn content_to_tag(content: &str) -> String {
154        let hash = ContentHasher::hash(content);
155        ContentHasher::hash_to_tag(&hash)
156    }
157}
158
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::{Domain, Memory, MemoryStatus};
    use crate::storage::index::SqliteBackend;
    use crate::storage::traits::IndexBackend;

    /// Builds an active `Memory` fixture with the given id, content, namespace and tags.
    /// Every other field is set to a fixed placeholder value.
    fn create_test_memory(
        id: &str,
        content: &str,
        namespace: Namespace,
        tags: Vec<String>,
    ) -> Memory {
        Memory {
            id: MemoryId::new(id),
            content: content.to_string(),
            namespace,
            domain: Domain::new(),
            project_id: None,
            branch: None,
            file_path: None,
            status: MemoryStatus::Active,
            created_at: 1_234_567_890,
            updated_at: 1_234_567_890,
            tombstoned_at: None,
            expires_at: None,
            embedding: None,
            tags,
            #[cfg(feature = "group-scope")]
            group_id: None,
            source: None,
            is_summary: false,
            source_memory_ids: None,
            consolidation_timestamp: None,
        }
    }

    /// Builds a fixture memory whose only tag is the hash tag of its own content.
    fn hash_tagged_memory(id: &str, content: &str, namespace: Namespace) -> Memory {
        let tags = vec![ExactMatchChecker::content_to_tag(content)];
        create_test_memory(id, content, namespace, tags)
    }

    /// Indexes `memories` into a fresh in-memory SQLite backend and returns a
    /// checker whose recall service is built on that backend.
    fn checker_over(memories: &[Memory]) -> ExactMatchChecker {
        let index = SqliteBackend::in_memory().unwrap();
        for memory in memories {
            index.index(memory).unwrap();
        }
        ExactMatchChecker::new(Arc::new(RecallService::with_index(index)))
    }

    #[test]
    fn test_content_to_tag() {
        let tag = ExactMatchChecker::content_to_tag("Use PostgreSQL for storage");
        let prefix = "hash:sha256:";

        assert!(tag.starts_with(prefix));
        // The tag carries a 16-character hash prefix after the scheme.
        assert_eq!(tag.len(), prefix.len() + 16);
    }

    #[test]
    fn test_content_to_tag_normalization() {
        // Whitespace differences must not change the resulting tag.
        let compact = ExactMatchChecker::content_to_tag("Use PostgreSQL for storage");
        let padded = ExactMatchChecker::content_to_tag("  Use  PostgreSQL   for   storage  ");

        assert_eq!(compact, padded);
    }

    #[test]
    fn test_content_to_tag_case_insensitive() {
        // Case differences must not change the resulting tag.
        let mixed_case = ExactMatchChecker::content_to_tag("Use PostgreSQL");
        let lower_case = ExactMatchChecker::content_to_tag("use postgresql");

        assert_eq!(mixed_case, lower_case);
    }

    #[test]
    fn test_check_no_match() {
        // Nothing is indexed, so no content can match.
        let checker = checker_over(&[]);

        let outcome = checker
            .check("Non-existent content", Namespace::Decisions, "project")
            .unwrap();

        assert!(outcome.is_none());
    }

    #[test]
    fn test_check_with_match() {
        let content = "Use PostgreSQL for storage";
        let stored = hash_tagged_memory("test-memory-123", content, Namespace::Decisions);
        let checker = checker_over(&[stored]);

        // Identical content in the same namespace must be reported as a duplicate.
        let outcome = checker
            .check(content, Namespace::Decisions, "project")
            .unwrap();

        let (id, urn) = outcome.unwrap();
        assert_eq!(id.as_str(), "test-memory-123");
        assert_eq!(urn, "subcog://project/decisions/test-memory-123");
    }

    #[test]
    fn test_check_different_namespace() {
        let content = "Use PostgreSQL for storage";
        let stored = hash_tagged_memory("test-memory-123", content, Namespace::Decisions);
        let checker = checker_over(&[stored]);

        // The memory lives in Decisions, so a lookup in Patterns must come back empty.
        let outcome = checker
            .check(content, Namespace::Patterns, "project")
            .unwrap();

        assert!(outcome.is_none());
    }

    #[test]
    fn test_check_normalized_content_matches() {
        let stored = hash_tagged_memory("test-memory-456", "Use PostgreSQL", Namespace::Decisions);
        let checker = checker_over(&[stored]);

        // Whitespace and case variations of the stored content must still match.
        let outcome = checker
            .check("  USE  postgresql  ", Namespace::Decisions, "project")
            .unwrap();

        let (id, _) = outcome.unwrap();
        assert_eq!(id.as_str(), "test-memory-456");
    }
}