Skip to main content

subcog/services/deduplication/
types.rs

//! Deduplication result types.
//!
//! This module defines the result types returned by deduplication checks and
//! the [`Deduplicator`] trait that allows different checker implementations.
4
5use crate::models::MemoryId;
6use serde::{Deserialize, Serialize};
7
8/// Result of a deduplication check.
9///
10/// Contains information about whether content was found to be a duplicate,
11/// the reason for duplication, and any matched memory information.
12///
13/// # Example
14///
15/// ```rust
16/// use subcog::services::deduplication::{DuplicateCheckResult, DuplicateReason};
17/// use subcog::models::MemoryId;
18///
19/// let result = DuplicateCheckResult {
20///     is_duplicate: true,
21///     reason: Some(DuplicateReason::ExactMatch),
22///     similarity_score: None,
23///     matched_memory_id: Some(MemoryId::new("abc123")),
24///     matched_urn: Some("subcog://project/decisions/abc123".to_string()),
25///     check_duration_ms: 5,
26/// };
27///
28/// assert!(result.is_duplicate);
29/// assert_eq!(result.reason, Some(DuplicateReason::ExactMatch));
30/// ```
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct DuplicateCheckResult {
33    /// Whether the content is a duplicate.
34    pub is_duplicate: bool,
35
36    /// The reason content was identified as a duplicate.
37    pub reason: Option<DuplicateReason>,
38
39    /// Similarity score for semantic matches (0.0 to 1.0).
40    pub similarity_score: Option<f32>,
41
42    /// The memory ID of the matched duplicate.
43    pub matched_memory_id: Option<MemoryId>,
44
45    /// Full URN of matched memory: `subcog://{domain}/{namespace}/{id}`.
46    ///
47    /// MUST be populated when `is_duplicate == true`.
48    /// All external outputs (logs, metrics labels, hook responses) MUST reference
49    /// memories by URN, not bare ID.
50    pub matched_urn: Option<String>,
51
52    /// Duration of the deduplication check in milliseconds.
53    pub check_duration_ms: u64,
54}
55
56impl DuplicateCheckResult {
57    /// Creates a result indicating no duplicate was found.
58    ///
59    /// # Arguments
60    ///
61    /// * `duration_ms` - Time taken for the check in milliseconds
62    ///
63    /// # Example
64    ///
65    /// ```rust
66    /// use subcog::services::deduplication::DuplicateCheckResult;
67    ///
68    /// let result = DuplicateCheckResult::not_duplicate(10);
69    /// assert!(!result.is_duplicate);
70    /// assert!(result.reason.is_none());
71    /// ```
72    #[must_use]
73    pub const fn not_duplicate(duration_ms: u64) -> Self {
74        Self {
75            is_duplicate: false,
76            reason: None,
77            similarity_score: None,
78            matched_memory_id: None,
79            matched_urn: None,
80            check_duration_ms: duration_ms,
81        }
82    }
83
84    /// Creates a result indicating an exact match was found.
85    ///
86    /// # Arguments
87    ///
88    /// * `memory_id` - The ID of the matched memory
89    /// * `urn` - The full URN of the matched memory
90    /// * `duration_ms` - Time taken for the check in milliseconds
91    ///
92    /// # Example
93    ///
94    /// ```rust
95    /// use subcog::services::deduplication::DuplicateCheckResult;
96    /// use subcog::models::MemoryId;
97    ///
98    /// let result = DuplicateCheckResult::exact_match(
99    ///     MemoryId::new("abc123"),
100    ///     "subcog://project/decisions/abc123".to_string(),
101    ///     5,
102    /// );
103    /// assert!(result.is_duplicate);
104    /// ```
105    #[must_use]
106    pub const fn exact_match(memory_id: MemoryId, urn: String, duration_ms: u64) -> Self {
107        Self {
108            is_duplicate: true,
109            reason: Some(DuplicateReason::ExactMatch),
110            similarity_score: None,
111            matched_memory_id: Some(memory_id),
112            matched_urn: Some(urn),
113            check_duration_ms: duration_ms,
114        }
115    }
116
117    /// Creates a result indicating a semantic similarity match was found.
118    ///
119    /// # Arguments
120    ///
121    /// * `memory_id` - The ID of the matched memory
122    /// * `urn` - The full URN of the matched memory
123    /// * `score` - The similarity score (0.0 to 1.0)
124    /// * `duration_ms` - Time taken for the check in milliseconds
125    ///
126    /// # Example
127    ///
128    /// ```rust
129    /// use subcog::services::deduplication::DuplicateCheckResult;
130    /// use subcog::models::MemoryId;
131    ///
132    /// let result = DuplicateCheckResult::semantic_match(
133    ///     MemoryId::new("abc123"),
134    ///     "subcog://project/decisions/abc123".to_string(),
135    ///     0.94,
136    ///     20,
137    /// );
138    /// assert!(result.is_duplicate);
139    /// assert_eq!(result.similarity_score, Some(0.94));
140    /// ```
141    #[must_use]
142    pub const fn semantic_match(
143        memory_id: MemoryId,
144        urn: String,
145        score: f32,
146        duration_ms: u64,
147    ) -> Self {
148        Self {
149            is_duplicate: true,
150            reason: Some(DuplicateReason::SemanticSimilar),
151            similarity_score: Some(score),
152            matched_memory_id: Some(memory_id),
153            matched_urn: Some(urn),
154            check_duration_ms: duration_ms,
155        }
156    }
157
158    /// Creates a result indicating the content was recently captured.
159    ///
160    /// # Arguments
161    ///
162    /// * `memory_id` - The ID of the matched memory
163    /// * `urn` - The full URN of the matched memory
164    /// * `duration_ms` - Time taken for the check in milliseconds
165    ///
166    /// # Example
167    ///
168    /// ```rust
169    /// use subcog::services::deduplication::DuplicateCheckResult;
170    /// use subcog::models::MemoryId;
171    ///
172    /// let result = DuplicateCheckResult::recent_capture(
173    ///     MemoryId::new("abc123"),
174    ///     "subcog://project/decisions/abc123".to_string(),
175    ///     1,
176    /// );
177    /// assert!(result.is_duplicate);
178    /// ```
179    #[must_use]
180    pub const fn recent_capture(memory_id: MemoryId, urn: String, duration_ms: u64) -> Self {
181        Self {
182            is_duplicate: true,
183            reason: Some(DuplicateReason::RecentCapture),
184            similarity_score: None,
185            matched_memory_id: Some(memory_id),
186            matched_urn: Some(urn),
187            check_duration_ms: duration_ms,
188        }
189    }
190}
191
192impl Default for DuplicateCheckResult {
193    fn default() -> Self {
194        Self::not_duplicate(0)
195    }
196}
197
198/// The reason content was identified as a duplicate.
199///
200/// # Variants
201///
202/// - `ExactMatch`: Content hash matches an existing memory exactly
203/// - `SemanticSimilar`: Embedding similarity exceeds the configured threshold
204/// - `RecentCapture`: Content was captured within the recent time window
205#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
206#[serde(rename_all = "snake_case")]
207pub enum DuplicateReason {
208    /// Content hash matches exactly (SHA256).
209    ExactMatch,
210
211    /// Semantic similarity exceeds threshold.
212    SemanticSimilar,
213
214    /// Content was captured within the recent time window.
215    RecentCapture,
216}
217
218impl std::fmt::Display for DuplicateReason {
219    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
220        match self {
221            Self::ExactMatch => write!(f, "exact_match"),
222            Self::SemanticSimilar => write!(f, "semantic_similar"),
223            Self::RecentCapture => write!(f, "recent_capture"),
224        }
225    }
226}
227
228/// Trait for deduplication checking.
229///
230/// Allows for different implementations (e.g., mock for testing).
231pub trait Deduplicator: Send + Sync {
232    /// Checks if content is a duplicate.
233    ///
234    /// # Arguments
235    ///
236    /// * `content` - The content to check
237    /// * `namespace` - The namespace to check within
238    ///
239    /// # Returns
240    ///
241    /// A result indicating whether the content is a duplicate and why.
242    ///
243    /// # Errors
244    ///
245    /// Returns an error if the check fails.
246    fn check_duplicate(
247        &self,
248        content: &str,
249        namespace: crate::models::Namespace,
250    ) -> crate::Result<DuplicateCheckResult>;
251
252    /// Records a successful capture for recent-capture tracking.
253    ///
254    /// # Arguments
255    ///
256    /// * `content_hash` - The SHA256 hash of the content
257    /// * `memory_id` - The ID of the captured memory
258    fn record_capture(&self, content_hash: &str, memory_id: &MemoryId);
259}
260
/// Unit tests for the result constructors, the `Default` impl, and the
/// `DuplicateReason` display labels.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_not_duplicate_result() {
        let result = DuplicateCheckResult::not_duplicate(10);
        assert!(!result.is_duplicate);
        assert!(result.reason.is_none());
        assert!(result.similarity_score.is_none());
        assert!(result.matched_memory_id.is_none());
        assert!(result.matched_urn.is_none());
        assert_eq!(result.check_duration_ms, 10);
    }

    #[test]
    fn test_exact_match_result() {
        let result = DuplicateCheckResult::exact_match(
            MemoryId::new("test123"),
            "subcog://project/decisions/test123".to_string(),
            5,
        );
        assert!(result.is_duplicate);
        assert_eq!(result.reason, Some(DuplicateReason::ExactMatch));
        assert!(result.similarity_score.is_none());
        assert_eq!(result.matched_memory_id, Some(MemoryId::new("test123")));
        assert_eq!(
            result.matched_urn,
            Some("subcog://project/decisions/test123".to_string())
        );
        assert_eq!(result.check_duration_ms, 5);
    }

    #[test]
    fn test_semantic_match_result() {
        let result = DuplicateCheckResult::semantic_match(
            MemoryId::new("test456"),
            "subcog://project/patterns/test456".to_string(),
            0.94,
            20,
        );
        assert!(result.is_duplicate);
        assert_eq!(result.reason, Some(DuplicateReason::SemanticSimilar));
        assert_eq!(result.similarity_score, Some(0.94));
        assert_eq!(result.matched_memory_id, Some(MemoryId::new("test456")));
        assert_eq!(
            result.matched_urn,
            Some("subcog://project/patterns/test456".to_string())
        );
        assert_eq!(result.check_duration_ms, 20);
    }

    #[test]
    fn test_recent_capture_result() {
        let result = DuplicateCheckResult::recent_capture(
            MemoryId::new("test789"),
            "subcog://project/learnings/test789".to_string(),
            1,
        );
        assert!(result.is_duplicate);
        assert_eq!(result.reason, Some(DuplicateReason::RecentCapture));
        assert!(result.similarity_score.is_none());
        assert_eq!(result.matched_memory_id, Some(MemoryId::new("test789")));
        assert_eq!(
            result.matched_urn,
            Some("subcog://project/learnings/test789".to_string())
        );
        assert_eq!(result.check_duration_ms, 1);
    }

    #[test]
    fn test_default_is_not_duplicate() {
        // `Default` delegates to `not_duplicate(0)`.
        let result = DuplicateCheckResult::default();
        assert!(!result.is_duplicate);
        assert!(result.reason.is_none());
        assert!(result.similarity_score.is_none());
        assert!(result.matched_memory_id.is_none());
        assert!(result.matched_urn.is_none());
        assert_eq!(result.check_duration_ms, 0);
    }

    #[test]
    fn test_duplicate_reason_display() {
        assert_eq!(DuplicateReason::ExactMatch.to_string(), "exact_match");
        assert_eq!(
            DuplicateReason::SemanticSimilar.to_string(),
            "semantic_similar"
        );
        assert_eq!(DuplicateReason::RecentCapture.to_string(), "recent_capture");
    }
}