1#![allow(clippy::expect_used)]
4use regex::Regex;
8use std::sync::LazyLock;
9
/// One span of scanned text that a PII pattern matched.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct PiiMatch {
    /// Display name of the pattern that produced the match, e.g. `"Email Address"`.
    pub pii_type: String,
    /// Offset in the scanned content where the match begins.
    pub start: usize,
    /// Offset in the scanned content where the match ends.
    pub end: usize,
    /// Copy of the text that was matched.
    pub matched_text: String,
}
22
23struct PiiPattern {
25 name: &'static str,
26 regex: &'static LazyLock<Regex>,
27}
28
29static EMAIL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
32 Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
33 .expect("static regex: email pattern")
34});
35
36static SSN_REGEX: LazyLock<Regex> =
37 LazyLock::new(|| Regex::new(r"\b\d{3}-\d{2}-\d{4}\b").expect("static regex: SSN pattern"));
38
39static PHONE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
40 Regex::new(r"\b(?:\+?1[-.\s]?)?\(?[2-9]\d{2}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b")
41 .expect("static regex: phone pattern")
42});
43
44static CREDIT_CARD_REGEX: LazyLock<Regex> = LazyLock::new(|| {
45 Regex::new(
46 r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})\b",
47 )
48 .expect("static regex: credit card pattern")
49});
50
51static IP_REGEX: LazyLock<Regex> = LazyLock::new(|| {
52 Regex::new(
53 r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
54 )
55 .expect("static regex: IP address pattern")
56});
57
58static DOB_REGEX: LazyLock<Regex> = LazyLock::new(|| {
59 Regex::new(
60 r"(?i)\b(?:dob|date\s*of\s*birth|birth\s*date)\s*[:=]?\s*\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}\b",
61 )
62 .expect("static regex: date of birth pattern")
63});
64
65static ZIP_REGEX: LazyLock<Regex> =
66 LazyLock::new(|| Regex::new(r"\b\d{5}(?:-\d{4})?\b").expect("static regex: ZIP code pattern"));
67
68static DL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
69 Regex::new(r"(?i)\b(?:driver'?s?\s*license|dl)\s*#?\s*[:=]?\s*[A-Z0-9]{6,12}\b")
70 .expect("static regex: driver's license pattern")
71});
72
73static PASSPORT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
74 Regex::new(r"(?i)\bpassport\s*#?\s*[:=]?\s*[A-Z0-9]{6,9}\b")
75 .expect("static regex: passport pattern")
76});
77
78static UK_NIN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
83 Regex::new(r"\b[A-CEGHJ-PR-TW-Z]{2}[\s\-]?\d{2}[\s\-]?\d{2}[\s\-]?\d{2}[\s\-]?[A-D]\b")
84 .expect("static regex: UK NIN pattern")
85});
86
87static CA_SIN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
89 Regex::new(r"\b\d{3}[\s\-]?\d{3}[\s\-]?\d{3}\b").expect("static regex: Canada SIN pattern")
90});
91
92static EU_VAT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
95 Regex::new(
96 r"(?i)\b(?:ATU\d{8}|BE[01]\d{9}|DE\d{9}|DK\d{8}|EE\d{9}|EL\d{9}|ES[A-Z]\d{7}[A-Z0-9]|FI\d{8}|FR[A-Z0-9]{2}\d{9}|HR\d{11}|HU\d{8}|IE\d{7}[A-Z]{1,2}|IT\d{11}|LT\d{9,12}|LU\d{8}|LV\d{11}|MT\d{8}|NL\d{9}B\d{2}|PL\d{10}|PT\d{9}|RO\d{2,10}|SE\d{12}|SI\d{8}|SK\d{10}|CY\d{8}[A-Z]|CZ\d{8,10}|BG\d{9,10})\b",
97 )
98 .expect("static regex: EU VAT pattern")
99});
100
101static AU_TFN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
103 Regex::new(r"(?i)\b(?:tfn|tax\s*file\s*number)\s*[:=]?\s*\d{3}[\s\-]?\d{3}[\s\-]?\d{2,3}\b")
104 .expect("static regex: Australia TFN pattern")
105});
106
107static IN_AADHAAR_REGEX: LazyLock<Regex> = LazyLock::new(|| {
109 Regex::new(r"\b[2-9]\d{3}[\s\-]?\d{4}[\s\-]?\d{4}\b")
110 .expect("static regex: India Aadhaar pattern")
111});
112
113static IN_PAN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
115 Regex::new(r"\b[A-Z]{5}\d{4}[A-Z]\b").expect("static regex: India PAN pattern")
116});
117
118fn pii_patterns() -> Vec<PiiPattern> {
120 vec![
121 PiiPattern {
122 name: "Email Address",
123 regex: &EMAIL_REGEX,
124 },
125 PiiPattern {
126 name: "SSN",
127 regex: &SSN_REGEX,
128 },
129 PiiPattern {
130 name: "Phone Number",
131 regex: &PHONE_REGEX,
132 },
133 PiiPattern {
134 name: "Credit Card Number",
135 regex: &CREDIT_CARD_REGEX,
136 },
137 PiiPattern {
138 name: "IP Address",
139 regex: &IP_REGEX,
140 },
141 PiiPattern {
142 name: "Date of Birth",
143 regex: &DOB_REGEX,
144 },
145 PiiPattern {
146 name: "ZIP Code",
147 regex: &ZIP_REGEX,
148 },
149 PiiPattern {
150 name: "Driver's License",
151 regex: &DL_REGEX,
152 },
153 PiiPattern {
154 name: "Passport Number",
155 regex: &PASSPORT_REGEX,
156 },
157 PiiPattern {
159 name: "UK National Insurance Number",
160 regex: &UK_NIN_REGEX,
161 },
162 PiiPattern {
163 name: "Canada SIN",
164 regex: &CA_SIN_REGEX,
165 },
166 PiiPattern {
167 name: "EU VAT Number",
168 regex: &EU_VAT_REGEX,
169 },
170 PiiPattern {
171 name: "Australia TFN",
172 regex: &AU_TFN_REGEX,
173 },
174 PiiPattern {
175 name: "India Aadhaar",
176 regex: &IN_AADHAAR_REGEX,
177 },
178 PiiPattern {
179 name: "India PAN",
180 regex: &IN_PAN_REGEX,
181 },
182 ]
183}
184
/// Regex-based scanner that reports PII-looking spans in text.
#[derive(Debug, Clone)]
pub struct PiiDetector {
    /// When `true`, IP address matches classified as local are not reported.
    skip_local: bool,
}
190
191impl PiiDetector {
192 #[must_use]
194 pub const fn new() -> Self {
195 Self { skip_local: true }
196 }
197
198 #[must_use]
200 pub const fn include_local(mut self) -> Self {
201 self.skip_local = false;
202 self
203 }
204
205 #[must_use]
207 pub fn contains_pii(&self, content: &str) -> bool {
208 !self.detect(content).is_empty()
209 }
210
211 #[must_use]
213 pub fn detect(&self, content: &str) -> Vec<PiiMatch> {
214 let mut found_matches = Vec::new();
215
216 for pattern in pii_patterns() {
217 self.collect_pattern_matches(pattern.name, pattern.regex, content, &mut found_matches);
218 }
219
220 found_matches.sort_by_key(|m| m.start);
222
223 deduplicate_overlapping(found_matches)
225 }
226
227 fn collect_pattern_matches(
229 &self,
230 pattern_name: &str,
231 regex: &Regex,
232 content: &str,
233 matches: &mut Vec<PiiMatch>,
234 ) {
235 for m in regex.find_iter(content) {
236 if let Some(pii_match) = self.process_match(pattern_name, &m, content) {
237 matches.push(pii_match);
238 }
239 }
240 }
241
242 fn process_match(
244 &self,
245 pattern_name: &str,
246 m: ®ex::Match<'_>,
247 content: &str,
248 ) -> Option<PiiMatch> {
249 let match_str = m.as_str();
250
251 if self.skip_local && pattern_name == "IP Address" && is_local_ip(match_str) {
253 return None;
254 }
255
256 if pattern_name == "ZIP Code"
258 && match_str.len() == 5
259 && !is_zip_in_address_context(content, m.start())
260 {
261 return None;
262 }
263
264 Some(PiiMatch {
265 pii_type: pattern_name.to_string(),
266 start: m.start(),
267 end: m.end(),
268 matched_text: match_str.to_string(),
269 })
270 }
271
272 #[must_use]
274 pub fn detect_types(&self, content: &str) -> Vec<String> {
275 self.detect(content)
276 .into_iter()
277 .map(|m| m.pii_type)
278 .collect()
279 }
280
281 #[must_use]
283 pub fn count(&self, content: &str) -> usize {
284 self.detect(content).len()
285 }
286}
287
288fn deduplicate_overlapping(sorted_matches: Vec<PiiMatch>) -> Vec<PiiMatch> {
290 let mut result = Vec::new();
291 let mut last_end = 0;
292
293 for m in sorted_matches {
294 if m.start >= last_end {
295 last_end = m.end;
296 result.push(m);
297 }
298 }
299
300 result
301}
302
/// Returns `true` when the dotted-quad string `ip` is a loopback, private-range or
/// unspecified address.
///
/// Recognised ranges: `127.*`, `10.*`, `192.168.*`, `172.16.*` through `172.31.*` (the whole
/// 172.16.0.0/12 private block, not just `172.16.*`), and the literal `0.0.0.0`.
fn is_local_ip(ip: &str) -> bool {
    ip.starts_with("127.")
        || ip.starts_with("10.")
        || ip.starts_with("192.168.")
        || is_private_172(ip)
        || ip == "0.0.0.0"
}

/// Returns `true` when `ip` starts with `172.` and its second octet lies in `16..=31`.
fn is_private_172(ip: &str) -> bool {
    ip.strip_prefix("172.")
        .and_then(|rest| rest.split_once('.'))
        .and_then(|(second_octet, _)| second_octet.parse::<u8>().ok())
        .is_some_and(|second_octet| (16..=31).contains(&second_octet))
}
311
/// Decides whether the text just before a ZIP-like match looks like address context.
///
/// Inspects up to 20 bytes of `content` immediately preceding `match_start` and returns
/// `true` when that window contains `address` or `zip` (case-insensitively) or a comma.
///
/// The window start is moved forward to the nearest UTF-8 character boundary, so multi-byte
/// text in front of the match no longer causes a slicing panic. If `match_start` itself is
/// not a valid boundary inside `content`, the window is treated as empty and the result is
/// `false`.
fn is_zip_in_address_context(content: &str, match_start: usize) -> bool {
    const CONTEXT_WINDOW_BYTES: usize = 20;

    let earliest = match_start.saturating_sub(CONTEXT_WINDOW_BYTES);
    // Byte offsets may fall inside a multi-byte character; snap to the next boundary.
    let window_start = (earliest..=match_start)
        .find(|&idx| content.is_char_boundary(idx))
        .unwrap_or(match_start);
    let before = content.get(window_start..match_start).unwrap_or("");

    let before_lower = before.to_lowercase();
    before_lower.contains("address") || before_lower.contains("zip") || before.contains(',')
}
322
323impl Default for PiiDetector {
324 fn default() -> Self {
325 Self::new()
326 }
327}
328
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `content` with a detector built by `PiiDetector::new`.
    fn scan(content: &str) -> Vec<PiiMatch> {
        PiiDetector::new().detect(content)
    }

    /// Returns whether any match in `found` carries the label `pii_type`.
    fn has_type(found: &[PiiMatch], pii_type: &str) -> bool {
        found.iter().any(|m| m.pii_type == pii_type)
    }

    /// Asserts that scanning `content` yields a non-empty result containing `pii_type`.
    fn assert_detects(content: &str, pii_type: &str) {
        let found = scan(content);
        assert!(!found.is_empty());
        assert!(has_type(&found, pii_type));
    }

    #[test]
    fn test_detect_email() {
        let found = scan("Contact me at john.doe@example.com");

        assert_eq!(found.len(), 1);
        let only = &found[0];
        assert_eq!(only.pii_type, "Email Address");
        assert_eq!(only.matched_text, "john.doe@example.com");
    }

    #[test]
    fn test_detect_ssn() {
        assert_detects("SSN: 123-45-6789", "SSN");
    }

    #[test]
    fn test_detect_phone() {
        assert_detects("Call me at (555) 123-4567", "Phone Number");
    }

    #[test]
    fn test_detect_credit_card() {
        assert_detects("Card: 4111111111111111", "Credit Card Number");
    }

    #[test]
    fn test_detect_ip_address() {
        assert_detects("Server IP: 203.0.113.42", "IP Address");
    }

    #[test]
    fn test_skip_local_ip() {
        let found = scan("Localhost: 127.0.0.1");

        assert!(!has_type(&found, "IP Address"));
    }

    #[test]
    fn test_include_local_ip() {
        let found = PiiDetector::new()
            .include_local()
            .detect("Localhost: 127.0.0.1");

        assert!(has_type(&found, "IP Address"));
    }

    #[test]
    fn test_no_pii() {
        let detector = PiiDetector::new();

        assert!(!detector.contains_pii("This is just regular text without PII."));
    }

    #[test]
    fn test_multiple_pii() {
        let found = scan("Email: test@example.com, Phone: 555-123-4567");

        assert!(found.len() >= 2);
    }

    #[test]
    fn test_detect_types() {
        let labels = PiiDetector::new().detect_types("test@example.com");

        assert!(labels.iter().any(|label| label == "Email Address"));
    }

    #[test]
    fn test_detect_uk_nin() {
        assert_detects("NIN: AB123456C", "UK National Insurance Number");
    }

    #[test]
    fn test_detect_uk_nin_with_spaces() {
        assert_detects(
            "National Insurance: AB 12 34 56 C",
            "UK National Insurance Number",
        );
    }

    #[test]
    fn test_detect_canada_sin() {
        assert_detects("SIN: 123-456-789", "Canada SIN");
    }

    #[test]
    fn test_detect_canada_sin_no_dashes() {
        assert_detects("SIN: 123456789", "Canada SIN");
    }

    #[test]
    fn test_detect_eu_vat_german() {
        assert_detects("VAT: DE123456789", "EU VAT Number");
    }

    #[test]
    fn test_detect_eu_vat_french() {
        assert_detects("VAT: FR12123456789", "EU VAT Number");
    }

    #[test]
    fn test_detect_eu_vat_dutch() {
        assert_detects("VAT: NL123456789B01", "EU VAT Number");
    }

    #[test]
    fn test_detect_australia_tfn() {
        assert_detects("TFN: 123-456-789", "Australia TFN");
    }

    #[test]
    fn test_detect_india_aadhaar() {
        assert_detects("Aadhaar: 2345 6789 0123", "India Aadhaar");
    }

    #[test]
    fn test_detect_india_pan() {
        assert_detects("PAN: ABCDE1234F", "India PAN");
    }

    #[test]
    fn test_international_ids_case_insensitive() {
        // The VAT pattern is compiled case-insensitively, so lowercase input must match too.
        let found = scan("vat: de123456789");

        assert!(has_type(&found, "EU VAT Number"));
    }
}