subcog/services/deduplication/types.rs
1//! Deduplication result types.
2//!
3//! This module defines the result types returned by deduplication checks.
4
5use crate::models::MemoryId;
6use serde::{Deserialize, Serialize};
7
8/// Result of a deduplication check.
9///
10/// Contains information about whether content was found to be a duplicate,
11/// the reason for duplication, and any matched memory information.
12///
13/// # Example
14///
15/// ```rust
16/// use subcog::services::deduplication::{DuplicateCheckResult, DuplicateReason};
17/// use subcog::models::MemoryId;
18///
19/// let result = DuplicateCheckResult {
20/// is_duplicate: true,
21/// reason: Some(DuplicateReason::ExactMatch),
22/// similarity_score: None,
23/// matched_memory_id: Some(MemoryId::new("abc123")),
24/// matched_urn: Some("subcog://project/decisions/abc123".to_string()),
25/// check_duration_ms: 5,
26/// };
27///
28/// assert!(result.is_duplicate);
29/// assert_eq!(result.reason, Some(DuplicateReason::ExactMatch));
30/// ```
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct DuplicateCheckResult {
33 /// Whether the content is a duplicate.
34 pub is_duplicate: bool,
35
36 /// The reason content was identified as a duplicate.
37 pub reason: Option<DuplicateReason>,
38
39 /// Similarity score for semantic matches (0.0 to 1.0).
40 pub similarity_score: Option<f32>,
41
42 /// The memory ID of the matched duplicate.
43 pub matched_memory_id: Option<MemoryId>,
44
45 /// Full URN of matched memory: `subcog://{domain}/{namespace}/{id}`.
46 ///
47 /// MUST be populated when `is_duplicate == true`.
48 /// All external outputs (logs, metrics labels, hook responses) MUST reference
49 /// memories by URN, not bare ID.
50 pub matched_urn: Option<String>,
51
52 /// Duration of the deduplication check in milliseconds.
53 pub check_duration_ms: u64,
54}
55
56impl DuplicateCheckResult {
57 /// Creates a result indicating no duplicate was found.
58 ///
59 /// # Arguments
60 ///
61 /// * `duration_ms` - Time taken for the check in milliseconds
62 ///
63 /// # Example
64 ///
65 /// ```rust
66 /// use subcog::services::deduplication::DuplicateCheckResult;
67 ///
68 /// let result = DuplicateCheckResult::not_duplicate(10);
69 /// assert!(!result.is_duplicate);
70 /// assert!(result.reason.is_none());
71 /// ```
72 #[must_use]
73 pub const fn not_duplicate(duration_ms: u64) -> Self {
74 Self {
75 is_duplicate: false,
76 reason: None,
77 similarity_score: None,
78 matched_memory_id: None,
79 matched_urn: None,
80 check_duration_ms: duration_ms,
81 }
82 }
83
84 /// Creates a result indicating an exact match was found.
85 ///
86 /// # Arguments
87 ///
88 /// * `memory_id` - The ID of the matched memory
89 /// * `urn` - The full URN of the matched memory
90 /// * `duration_ms` - Time taken for the check in milliseconds
91 ///
92 /// # Example
93 ///
94 /// ```rust
95 /// use subcog::services::deduplication::DuplicateCheckResult;
96 /// use subcog::models::MemoryId;
97 ///
98 /// let result = DuplicateCheckResult::exact_match(
99 /// MemoryId::new("abc123"),
100 /// "subcog://project/decisions/abc123".to_string(),
101 /// 5,
102 /// );
103 /// assert!(result.is_duplicate);
104 /// ```
105 #[must_use]
106 pub const fn exact_match(memory_id: MemoryId, urn: String, duration_ms: u64) -> Self {
107 Self {
108 is_duplicate: true,
109 reason: Some(DuplicateReason::ExactMatch),
110 similarity_score: None,
111 matched_memory_id: Some(memory_id),
112 matched_urn: Some(urn),
113 check_duration_ms: duration_ms,
114 }
115 }
116
117 /// Creates a result indicating a semantic similarity match was found.
118 ///
119 /// # Arguments
120 ///
121 /// * `memory_id` - The ID of the matched memory
122 /// * `urn` - The full URN of the matched memory
123 /// * `score` - The similarity score (0.0 to 1.0)
124 /// * `duration_ms` - Time taken for the check in milliseconds
125 ///
126 /// # Example
127 ///
128 /// ```rust
129 /// use subcog::services::deduplication::DuplicateCheckResult;
130 /// use subcog::models::MemoryId;
131 ///
132 /// let result = DuplicateCheckResult::semantic_match(
133 /// MemoryId::new("abc123"),
134 /// "subcog://project/decisions/abc123".to_string(),
135 /// 0.94,
136 /// 20,
137 /// );
138 /// assert!(result.is_duplicate);
139 /// assert_eq!(result.similarity_score, Some(0.94));
140 /// ```
141 #[must_use]
142 pub const fn semantic_match(
143 memory_id: MemoryId,
144 urn: String,
145 score: f32,
146 duration_ms: u64,
147 ) -> Self {
148 Self {
149 is_duplicate: true,
150 reason: Some(DuplicateReason::SemanticSimilar),
151 similarity_score: Some(score),
152 matched_memory_id: Some(memory_id),
153 matched_urn: Some(urn),
154 check_duration_ms: duration_ms,
155 }
156 }
157
158 /// Creates a result indicating the content was recently captured.
159 ///
160 /// # Arguments
161 ///
162 /// * `memory_id` - The ID of the matched memory
163 /// * `urn` - The full URN of the matched memory
164 /// * `duration_ms` - Time taken for the check in milliseconds
165 ///
166 /// # Example
167 ///
168 /// ```rust
169 /// use subcog::services::deduplication::DuplicateCheckResult;
170 /// use subcog::models::MemoryId;
171 ///
172 /// let result = DuplicateCheckResult::recent_capture(
173 /// MemoryId::new("abc123"),
174 /// "subcog://project/decisions/abc123".to_string(),
175 /// 1,
176 /// );
177 /// assert!(result.is_duplicate);
178 /// ```
179 #[must_use]
180 pub const fn recent_capture(memory_id: MemoryId, urn: String, duration_ms: u64) -> Self {
181 Self {
182 is_duplicate: true,
183 reason: Some(DuplicateReason::RecentCapture),
184 similarity_score: None,
185 matched_memory_id: Some(memory_id),
186 matched_urn: Some(urn),
187 check_duration_ms: duration_ms,
188 }
189 }
190}
191
192impl Default for DuplicateCheckResult {
193 fn default() -> Self {
194 Self::not_duplicate(0)
195 }
196}
197
198/// The reason content was identified as a duplicate.
199///
200/// # Variants
201///
202/// - `ExactMatch`: Content hash matches an existing memory exactly
203/// - `SemanticSimilar`: Embedding similarity exceeds the configured threshold
204/// - `RecentCapture`: Content was captured within the recent time window
205#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
206#[serde(rename_all = "snake_case")]
207pub enum DuplicateReason {
208 /// Content hash matches exactly (SHA256).
209 ExactMatch,
210
211 /// Semantic similarity exceeds threshold.
212 SemanticSimilar,
213
214 /// Content was captured within the recent time window.
215 RecentCapture,
216}
217
218impl std::fmt::Display for DuplicateReason {
219 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
220 match self {
221 Self::ExactMatch => write!(f, "exact_match"),
222 Self::SemanticSimilar => write!(f, "semantic_similar"),
223 Self::RecentCapture => write!(f, "recent_capture"),
224 }
225 }
226}
227
228/// Trait for deduplication checking.
229///
230/// Allows for different implementations (e.g., mock for testing).
231pub trait Deduplicator: Send + Sync {
232 /// Checks if content is a duplicate.
233 ///
234 /// # Arguments
235 ///
236 /// * `content` - The content to check
237 /// * `namespace` - The namespace to check within
238 ///
239 /// # Returns
240 ///
241 /// A result indicating whether the content is a duplicate and why.
242 ///
243 /// # Errors
244 ///
245 /// Returns an error if the check fails.
246 fn check_duplicate(
247 &self,
248 content: &str,
249 namespace: crate::models::Namespace,
250 ) -> crate::Result<DuplicateCheckResult>;
251
252 /// Records a successful capture for recent-capture tracking.
253 ///
254 /// # Arguments
255 ///
256 /// * `content_hash` - The SHA256 hash of the content
257 /// * `memory_id` - The ID of the captured memory
258 fn record_capture(&self, content_hash: &str, memory_id: &MemoryId);
259}
260
#[cfg(test)]
mod tests {
    //! Unit tests for the deduplication result types.
    //!
    //! Each constructor test checks every field of the produced result, so a
    //! constructor that forgets the matched ID/URN or the duration is caught.

    use super::*;

    #[test]
    fn test_not_duplicate_result() {
        let result = DuplicateCheckResult::not_duplicate(10);
        assert!(!result.is_duplicate);
        assert!(result.reason.is_none());
        assert!(result.similarity_score.is_none());
        assert!(result.matched_memory_id.is_none());
        assert!(result.matched_urn.is_none());
        assert_eq!(result.check_duration_ms, 10);
    }

    #[test]
    fn test_default_is_not_duplicate() {
        // `Default` delegates to `not_duplicate(0)`; pin that contract.
        let result = DuplicateCheckResult::default();
        assert!(!result.is_duplicate);
        assert!(result.reason.is_none());
        assert!(result.similarity_score.is_none());
        assert!(result.matched_memory_id.is_none());
        assert!(result.matched_urn.is_none());
        assert_eq!(result.check_duration_ms, 0);
    }

    #[test]
    fn test_exact_match_result() {
        let result = DuplicateCheckResult::exact_match(
            MemoryId::new("test123"),
            "subcog://project/decisions/test123".to_string(),
            5,
        );
        assert!(result.is_duplicate);
        assert_eq!(result.reason, Some(DuplicateReason::ExactMatch));
        assert!(result.similarity_score.is_none());
        assert_eq!(result.matched_memory_id, Some(MemoryId::new("test123")));
        assert_eq!(
            result.matched_urn.as_deref(),
            Some("subcog://project/decisions/test123")
        );
        assert_eq!(result.check_duration_ms, 5);
    }

    #[test]
    fn test_semantic_match_result() {
        let result = DuplicateCheckResult::semantic_match(
            MemoryId::new("test456"),
            "subcog://project/patterns/test456".to_string(),
            0.94,
            20,
        );
        assert!(result.is_duplicate);
        assert_eq!(result.reason, Some(DuplicateReason::SemanticSimilar));
        assert_eq!(result.similarity_score, Some(0.94));
        assert_eq!(result.matched_memory_id, Some(MemoryId::new("test456")));
        assert_eq!(
            result.matched_urn.as_deref(),
            Some("subcog://project/patterns/test456")
        );
        assert_eq!(result.check_duration_ms, 20);
    }

    #[test]
    fn test_recent_capture_result() {
        let result = DuplicateCheckResult::recent_capture(
            MemoryId::new("test789"),
            "subcog://project/learnings/test789".to_string(),
            1,
        );
        assert!(result.is_duplicate);
        assert_eq!(result.reason, Some(DuplicateReason::RecentCapture));
        assert!(result.similarity_score.is_none());
        assert_eq!(result.matched_memory_id, Some(MemoryId::new("test789")));
        assert_eq!(
            result.matched_urn.as_deref(),
            Some("subcog://project/learnings/test789")
        );
        assert_eq!(result.check_duration_ms, 1);
    }

    #[test]
    fn test_duplicate_reason_display() {
        assert_eq!(DuplicateReason::ExactMatch.to_string(), "exact_match");
        assert_eq!(
            DuplicateReason::SemanticSimilar.to_string(),
            "semantic_similar"
        );
        assert_eq!(DuplicateReason::RecentCapture.to_string(), "recent_capture");
    }
}
325}