subcog/services/deduplication/
hasher.rs1use sha2::{Digest, Sha256};
8
/// Namespace type for content hashing used by deduplication.
///
/// The type carries no state: every operation is an associated function
/// (`ContentHasher::hash`, `ContentHasher::normalize`, ...). The common
/// traits are derived so the type can be embedded in other `Debug`/`Clone`
/// types and constructed via `Default` without ceremony.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ContentHasher;
33
34impl ContentHasher {
35 #[must_use]
54 pub fn hash(content: &str) -> String {
55 let normalized = Self::normalize(content);
56 let mut hasher = Sha256::new();
57 hasher.update(normalized.as_bytes());
58 hex::encode(hasher.finalize())
59 }
60
/// Builds a tag of the form `hash:sha256:<prefix>` from a hash string.
///
/// `<prefix>` is at most the first 16 bytes of `hash`; a shorter input is
/// used in full. For an ASCII hex digest this is exactly the first 16
/// characters, giving a 28-byte tag.
///
/// The function accepts any `&str`, so the cut point is moved back to the
/// nearest UTF-8 character boundary when byte 16 falls inside a multi-byte
/// character. Slicing there directly would panic.
#[must_use]
pub fn hash_to_tag(hash: &str) -> String {
    /// Maximum number of bytes of the hash kept in the tag.
    const MAX_PREFIX_BYTES: usize = 16;

    let mut prefix_len = hash.len().min(MAX_PREFIX_BYTES);
    // Index 0 is always a char boundary, so this loop terminates.
    while !hash.is_char_boundary(prefix_len) {
        prefix_len -= 1;
    }

    format!("hash:sha256:{}", &hash[..prefix_len])
}
92
93 #[must_use]
114 pub fn content_to_tag(content: &str) -> String {
115 let hash = Self::hash(content);
116 Self::hash_to_tag(&hash)
117 }
118
/// Canonicalizes `content` so that equivalent text compares equal.
///
/// Leading and trailing whitespace is removed, the text is lowercased, and
/// every run of whitespace (spaces, tabs, newlines, ...) is collapsed into
/// a single ASCII space.
#[must_use]
pub fn normalize(content: &str) -> String {
    let lowered = content.trim().to_lowercase();

    let mut collapsed = String::with_capacity(lowered.len());
    for word in lowered.split_whitespace() {
        // Words are never empty, so a non-empty buffer means a word was
        // already written and a separator is needed.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
151}
152
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of hex characters in a SHA-256 digest.
    const DIGEST_HEX_LEN: usize = 64;
    /// Prefix carried by every generated tag.
    const TAG_PREFIX: &str = "hash:sha256:";
    /// Length of a tag built from a full digest: 12-byte prefix + 16 hash characters.
    const FULL_TAG_LEN: usize = 28;

    #[test]
    fn test_hash_produces_64_char_hex() {
        let digest = ContentHasher::hash("test content");
        assert_eq!(digest.len(), DIGEST_HEX_LEN);
        assert!(digest.chars().all(|ch| ch.is_ascii_hexdigit()));
    }

    #[test]
    fn test_same_content_same_hash() {
        let text = "Use PostgreSQL for storage";
        assert_eq!(ContentHasher::hash(text), ContentHasher::hash(text));
    }

    #[test]
    fn test_different_content_different_hash() {
        assert_ne!(
            ContentHasher::hash("Use PostgreSQL"),
            ContentHasher::hash("Use MySQL")
        );
    }

    #[test]
    fn test_normalization_case_insensitive() {
        let mixed_case = ContentHasher::hash("Use PostgreSQL");
        let lower_case = ContentHasher::hash("use postgresql");
        assert_eq!(mixed_case, lower_case);
    }

    #[test]
    fn test_normalization_whitespace_collapse() {
        let compact = ContentHasher::hash("Use PostgreSQL");
        let padded = ContentHasher::hash(" Use PostgreSQL ");
        assert_eq!(compact, padded);
    }

    #[test]
    fn test_normalization_mixed() {
        let canonical = ContentHasher::hash("use postgresql");
        let noisy = ContentHasher::hash(" USE POSTGRESQL ");
        assert_eq!(canonical, noisy);
    }

    #[test]
    fn test_hash_to_tag_format() {
        let tag = ContentHasher::hash_to_tag(&ContentHasher::hash("test"));

        assert!(tag.starts_with(TAG_PREFIX));
        assert_eq!(tag.len(), FULL_TAG_LEN);
    }

    #[test]
    fn test_content_to_tag_convenience() {
        let tag = ContentHasher::content_to_tag("Use PostgreSQL for storage");
        assert!(tag.starts_with(TAG_PREFIX));
        assert_eq!(tag.len(), FULL_TAG_LEN);
    }

    #[test]
    fn test_normalize_function() {
        assert_eq!(ContentHasher::normalize(" Hello "), "hello");
        assert_eq!(ContentHasher::normalize("Hello World"), "hello world");
        assert_eq!(ContentHasher::normalize("UPPER"), "upper");
        assert_eq!(ContentHasher::normalize(" a b c "), "a b c");
    }

    #[test]
    fn test_empty_content() {
        // An empty input still yields a full-length digest.
        let digest = ContentHasher::hash("");
        assert_eq!(digest.len(), DIGEST_HEX_LEN);

        let tag = ContentHasher::hash_to_tag(&digest);
        assert!(tag.starts_with(TAG_PREFIX));
    }

    #[test]
    fn test_unicode_content() {
        let digest = ContentHasher::hash("Use PostgreSQL for 数据库");
        assert_eq!(digest.len(), DIGEST_HEX_LEN);

        // Non-Latin text must survive normalization.
        let canonical = ContentHasher::normalize("Use POSTGRESQL for 数据库");
        assert!(canonical.contains("数据库"));
    }

    #[test]
    fn test_hash_to_tag_short_hash() {
        // Inputs shorter than the 16-character prefix are kept whole.
        assert_eq!(ContentHasher::hash_to_tag("abc"), "hash:sha256:abc");
    }

    #[test]
    fn test_newline_handling() {
        // A newline counts as whitespace and collapses to a single space.
        let multiline = ContentHasher::hash("line one\nline two");
        let single_line = ContentHasher::hash("line one line two");
        assert_eq!(multiline, single_line);
    }

    #[test]
    fn test_tab_handling() {
        // A tab counts as whitespace and collapses to a single space.
        let tabbed = ContentHasher::hash("col1\tcol2");
        let spaced = ContentHasher::hash("col1 col2");
        assert_eq!(tabbed, spaced);
    }

    mod property_tests {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            /// Every input, however odd, hashes to a 64-character string.
            #[test]
            fn prop_hash_length(content in any::<String>()) {
                let digest = ContentHasher::hash(&content);
                prop_assert_eq!(
                    digest.len(), DIGEST_HEX_LEN,
                    "Hash length should be 64, got {}", digest.len()
                );
            }

            /// Hashing the same input twice gives the same digest.
            #[test]
            fn prop_hash_deterministic(content in any::<String>()) {
                let first = ContentHasher::hash(&content);
                let second = ContentHasher::hash(&content);
                prop_assert_eq!(first, second, "Hash should be deterministic");
            }

            /// Normalizing already-normalized text changes nothing.
            #[test]
            fn prop_normalize_idempotent(content in any::<String>()) {
                let once = ContentHasher::normalize(&content);
                let twice = ContentHasher::normalize(&once);
                prop_assert_eq!(once, twice, "Normalization should be idempotent");
            }

            /// Surrounding whitespace and letter case never affect the digest.
            #[test]
            fn prop_normalized_hash_invariant(content in "[a-z ]{1,50}") {
                let baseline = ContentHasher::hash(&content);
                let padded = ContentHasher::hash(&format!(" {content} "));
                let shouted = ContentHasher::hash(&content.to_uppercase());

                prop_assert_eq!(
                    &baseline, &padded,
                    "Extra whitespace should not affect hash"
                );
                prop_assert_eq!(
                    &baseline, &shouted,
                    "Case should not affect hash"
                );
            }

            /// Tags built from content always have the fixed prefix and length.
            #[test]
            fn prop_tag_format(content in any::<String>()) {
                let tag = ContentHasher::content_to_tag(&content);
                prop_assert!(
                    tag.starts_with(TAG_PREFIX),
                    "Tag should start with 'hash:sha256:', got {tag}"
                );
                prop_assert_eq!(
                    tag.len(), FULL_TAG_LEN,
                    "Tag length should be 28 (12 prefix + 16 hash), got {}", tag.len()
                );
            }
        }
    }
}