Skip to main content

subcog/services/deduplication/
hasher.rs

1//! Content hashing utility for deduplication.
2//!
3//! This module provides SHA256-based content hashing for exact match detection.
4//! Content is normalized before hashing to ensure consistent matches despite
5//! minor formatting differences.
6
7use sha2::{Digest, Sha256};
8
/// Content hasher for deduplication.
///
/// A stateless unit type: all functionality is exposed as associated
/// functions, so no instance is required. It normalizes content and produces
/// SHA256 hashes for exact match detection.
///
/// The common marker-type traits (`Debug`, `Clone`, `Copy`, `Default`) are
/// derived so the hasher can be embedded in other types that derive them.
///
/// # Normalization
///
/// Before hashing, content is normalized:
/// - Trimmed of leading/trailing whitespace
/// - Converted to lowercase
/// - Runs of whitespace (spaces, tabs, newlines) collapsed to single spaces
///
/// # Example
///
/// ```rust
/// use subcog::services::deduplication::ContentHasher;
///
/// let hash = ContentHasher::hash("Use PostgreSQL for primary storage");
/// assert_eq!(hash.len(), 64); // SHA256 produces 64 hex chars
///
/// // Content that differs only in case/whitespace produces the same hash
/// let hash2 = ContentHasher::hash("  Use  postgresql  for  primary  storage  ");
/// assert_eq!(hash, hash2);
/// ```
#[derive(Debug, Clone, Copy, Default)]
pub struct ContentHasher;
33
34impl ContentHasher {
35    /// Computes the SHA256 hash of normalized content.
36    ///
37    /// # Arguments
38    ///
39    /// * `content` - The content to hash
40    ///
41    /// # Returns
42    ///
43    /// The lowercase hex-encoded SHA256 hash (64 characters).
44    ///
45    /// # Example
46    ///
47    /// ```rust
48    /// use subcog::services::deduplication::ContentHasher;
49    ///
50    /// let hash = ContentHasher::hash("Hello, world!");
51    /// assert_eq!(hash.len(), 64);
52    /// ```
53    #[must_use]
54    pub fn hash(content: &str) -> String {
55        let normalized = Self::normalize(content);
56        let mut hasher = Sha256::new();
57        hasher.update(normalized.as_bytes());
58        hex::encode(hasher.finalize())
59    }
60
61    /// Converts a hash to a tag format.
62    ///
63    /// The tag format is `hash:sha256:<16-char-prefix>`.
64    ///
65    /// # Arguments
66    ///
67    /// * `hash` - The full SHA256 hash
68    ///
69    /// # Returns
70    ///
71    /// The hash tag string.
72    ///
73    /// # Panics
74    ///
75    /// Does not panic. If the hash is shorter than 16 chars, uses the full hash.
76    ///
77    /// # Example
78    ///
79    /// ```rust
80    /// use subcog::services::deduplication::ContentHasher;
81    ///
82    /// let hash = ContentHasher::hash("test content");
83    /// let tag = ContentHasher::hash_to_tag(&hash);
84    /// assert!(tag.starts_with("hash:sha256:"));
85    /// assert_eq!(tag.len(), "hash:sha256:".len() + 16);
86    /// ```
87    #[must_use]
88    pub fn hash_to_tag(hash: &str) -> String {
89        let prefix_len = hash.len().min(16);
90        format!("hash:sha256:{}", &hash[..prefix_len])
91    }
92
93    /// Computes a hash and returns it in tag format.
94    ///
95    /// Convenience method that combines `hash()` and `hash_to_tag()`.
96    ///
97    /// # Arguments
98    ///
99    /// * `content` - The content to hash
100    ///
101    /// # Returns
102    ///
103    /// The hash tag string.
104    ///
105    /// # Example
106    ///
107    /// ```rust
108    /// use subcog::services::deduplication::ContentHasher;
109    ///
110    /// let tag = ContentHasher::content_to_tag("Use PostgreSQL");
111    /// assert!(tag.starts_with("hash:sha256:"));
112    /// ```
113    #[must_use]
114    pub fn content_to_tag(content: &str) -> String {
115        let hash = Self::hash(content);
116        Self::hash_to_tag(&hash)
117    }
118
119    /// Normalizes content for consistent hashing.
120    ///
121    /// Normalization steps:
122    /// 1. Trim leading/trailing whitespace
123    /// 2. Convert to lowercase
124    /// 3. Collapse multiple whitespace to single space
125    ///
126    /// # Arguments
127    ///
128    /// * `content` - The content to normalize
129    ///
130    /// # Returns
131    ///
132    /// The normalized content string.
133    ///
134    /// # Example
135    ///
136    /// ```rust
137    /// use subcog::services::deduplication::ContentHasher;
138    ///
139    /// let normalized = ContentHasher::normalize("  Hello   WORLD  ");
140    /// assert_eq!(normalized, "hello world");
141    /// ```
142    #[must_use]
143    pub fn normalize(content: &str) -> String {
144        content
145            .trim()
146            .to_lowercase()
147            .split_whitespace()
148            .collect::<Vec<_>>()
149            .join(" ")
150    }
151}
152
#[cfg(test)]
mod tests {
    use super::*;

    /// SHA256 digest of the empty byte string (standard known-answer vector).
    const EMPTY_SHA256: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";

    /// SHA256 digest of the ASCII bytes `abc` (standard known-answer vector).
    const ABC_SHA256: &str = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad";

    #[test]
    fn test_hash_produces_64_char_hex() {
        let hash = ContentHasher::hash("test content");
        assert_eq!(hash.len(), 64);
        assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn test_known_sha256_vectors() {
        // Pins the algorithm and encoding: the digest must be the lowercase
        // hex SHA256 of the *normalized* content.
        assert_eq!(ContentHasher::hash(""), EMPTY_SHA256);
        assert_eq!(ContentHasher::hash("abc"), ABC_SHA256);
        assert_eq!(ContentHasher::hash("  ABC  "), ABC_SHA256);
    }

    #[test]
    fn test_same_content_same_hash() {
        let hash1 = ContentHasher::hash("Use PostgreSQL for storage");
        let hash2 = ContentHasher::hash("Use PostgreSQL for storage");
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_different_content_different_hash() {
        let hash1 = ContentHasher::hash("Use PostgreSQL");
        let hash2 = ContentHasher::hash("Use MySQL");
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_normalization_case_insensitive() {
        let hash1 = ContentHasher::hash("Use PostgreSQL");
        let hash2 = ContentHasher::hash("use postgresql");
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_normalization_whitespace_collapse() {
        let hash1 = ContentHasher::hash("Use PostgreSQL");
        let hash2 = ContentHasher::hash("  Use   PostgreSQL  ");
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_normalization_mixed() {
        let hash1 = ContentHasher::hash("use postgresql");
        let hash2 = ContentHasher::hash("  USE    POSTGRESQL  ");
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_hash_to_tag_format() {
        let hash = ContentHasher::hash("test");
        let tag = ContentHasher::hash_to_tag(&hash);

        assert!(tag.starts_with("hash:sha256:"));
        // Total length should be "hash:sha256:" (12) + 16 chars = 28
        assert_eq!(tag.len(), 28);
        // The tag suffix must be the leading 16 characters of the full hash.
        assert_eq!(&tag["hash:sha256:".len()..], &hash[..16]);
    }

    #[test]
    fn test_content_to_tag_convenience() {
        let content = "Use PostgreSQL for storage";
        let tag = ContentHasher::content_to_tag(content);
        assert!(tag.starts_with("hash:sha256:"));
        assert_eq!(tag.len(), 28);
        // Must agree with the two-step form it is documented to wrap.
        assert_eq!(
            tag,
            ContentHasher::hash_to_tag(&ContentHasher::hash(content))
        );
    }

    #[test]
    fn test_normalize_function() {
        assert_eq!(ContentHasher::normalize("  Hello  "), "hello");
        assert_eq!(ContentHasher::normalize("Hello   World"), "hello world");
        assert_eq!(ContentHasher::normalize("UPPER"), "upper");
        assert_eq!(ContentHasher::normalize("  a  b  c  "), "a b c");
    }

    #[test]
    fn test_empty_content() {
        let hash = ContentHasher::hash("");
        // Empty string should still produce a valid hash
        assert_eq!(hash.len(), 64);
        assert_eq!(hash, EMPTY_SHA256);

        // Whitespace-only content normalizes to the empty string.
        assert_eq!(ContentHasher::hash(" \t\n "), EMPTY_SHA256);

        let tag = ContentHasher::hash_to_tag(&hash);
        assert!(tag.starts_with("hash:sha256:"));
        assert_eq!(tag, "hash:sha256:e3b0c44298fc1c14");
    }

    #[test]
    fn test_unicode_content() {
        let hash = ContentHasher::hash("Use PostgreSQL for 数据库");
        assert_eq!(hash.len(), 64);

        // Unicode is preserved but lowercased where applicable
        let normalized = ContentHasher::normalize("Use POSTGRESQL for 数据库");
        assert!(normalized.contains("数据库"));
        assert_eq!(normalized, "use postgresql for 数据库");
    }

    #[test]
    fn test_hash_to_tag_short_hash() {
        // Edge case: if somehow given a short hash
        let tag = ContentHasher::hash_to_tag("abc");
        assert_eq!(tag, "hash:sha256:abc");
    }

    #[test]
    fn test_newline_handling() {
        // Newlines should be treated as whitespace
        let hash1 = ContentHasher::hash("line one\nline two");
        let hash2 = ContentHasher::hash("line one line two");
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_tab_handling() {
        // Tabs should be treated as whitespace
        let hash1 = ContentHasher::hash("col1\tcol2");
        let hash2 = ContentHasher::hash("col1 col2");
        assert_eq!(hash1, hash2);
    }

    mod property_tests {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            /// Hash output is always 64 hex characters.
            #[test]
            fn prop_hash_length(content in any::<String>()) {
                let hash = ContentHasher::hash(&content);
                prop_assert_eq!(hash.len(), 64, "Hash length should be 64, got {}", hash.len());
            }

            /// Same input always produces same hash (deterministic).
            #[test]
            fn prop_hash_deterministic(content in any::<String>()) {
                let hash1 = ContentHasher::hash(&content);
                let hash2 = ContentHasher::hash(&content);
                prop_assert_eq!(hash1, hash2, "Hash should be deterministic");
            }

            /// Normalization is idempotent: normalize(normalize(x)) == normalize(x).
            #[test]
            fn prop_normalize_idempotent(content in any::<String>()) {
                let once = ContentHasher::normalize(&content);
                let twice = ContentHasher::normalize(&once);
                prop_assert_eq!(once, twice, "Normalization should be idempotent");
            }

            /// Normalized content produces same hash regardless of whitespace/case.
            #[test]
            fn prop_normalized_hash_invariant(content in "[a-z ]{1,50}") {
                let with_spaces = format!("  {content}  ");
                let uppercased = content.to_uppercase();

                let hash_original = ContentHasher::hash(&content);
                let hash_spaces = ContentHasher::hash(&with_spaces);
                let hash_upper = ContentHasher::hash(&uppercased);

                prop_assert_eq!(
                    &hash_original, &hash_spaces,
                    "Extra whitespace should not affect hash"
                );
                prop_assert_eq!(
                    &hash_original, &hash_upper,
                    "Case should not affect hash"
                );
            }

            /// Tag format is always correct.
            #[test]
            fn prop_tag_format(content in any::<String>()) {
                let tag = ContentHasher::content_to_tag(&content);
                prop_assert!(
                    tag.starts_with("hash:sha256:"),
                    "Tag should start with 'hash:sha256:', got {tag}"
                );
                prop_assert_eq!(
                    tag.len(), 28,
                    "Tag length should be 28 (12 prefix + 16 hash), got {}", tag.len()
                );
            }
        }
    }
}