Skip to main content

subcog/security/
pii.rs

1//! PII detection.
2// Allow expect() on static regex patterns - these are guaranteed to compile
3#![allow(clippy::expect_used)]
4//!
5//! Detects personally identifiable information in content.
6
7use regex::Regex;
8use std::sync::LazyLock;
9
/// A single occurrence of personally identifiable information found in scanned content.
///
/// Values are produced by [`PiiDetector::detect`]. `start` and `end` are copied
/// straight from the underlying regex match, so they are byte offsets into the
/// scanned string (not character counts).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PiiMatch {
    /// Human-readable name of the pattern that matched (e.g. `"Email Address"`, `"SSN"`).
    pub pii_type: String,
    /// Byte offset in the content where the match begins (inclusive).
    pub start: usize,
    /// Byte offset in the content where the match ends (exclusive).
    pub end: usize,
    /// The exact substring of the content that matched.
    pub matched_text: String,
}
22
/// Pairs a human-readable PII category name with the regex used to detect it.
struct PiiPattern {
    /// Label copied into `PiiMatch::pii_type` for every hit of this pattern.
    name: &'static str,
    /// Reference to the lazily compiled static regex for this category.
    regex: &'static LazyLock<Regex>,
}
28
// Regex patterns are defined as separate statics so that each one is compiled
// lazily, on first use, through `LazyLock`.
// Note: the pattern strings are fixed literals, so a compile failure would be a
// programming error; `expect()` reports it with a descriptive message.

/// Email address: local part, `@`, domain, and an alphabetic TLD of at least two letters.
static EMAIL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
        .expect("static regex: email pattern")
});

/// Social Security Number written as `NNN-NN-NNNN` (the dashes are required).
static SSN_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\b\d{3}-\d{2}-\d{4}\b").expect("static regex: SSN pattern"));

/// Ten-digit phone number: optional `+1`/`1` prefix, optional parentheses around a
/// three-digit area code whose first digit is 2-9, and `-`, `.`, or whitespace as
/// optional separators between the groups.
static PHONE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\b(?:\+?1[-.\s]?)?\(?[2-9]\d{2}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b")
        .expect("static regex: phone pattern")
});

/// Credit card number as one contiguous run of digits (no spaces or dashes):
/// 13 or 16 digits starting with 4, 16 digits starting with 51-55, 15 digits
/// starting with 34/37, or 16 digits starting with 6011/65.
/// NOTE(review): these prefixes presumably target Visa, Mastercard, American
/// Express and Discover — confirm if brand coverage matters.
static CREDIT_CARD_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})\b",
    )
    .expect("static regex: credit card pattern")
});

/// Dotted-quad IPv4 address with every octet restricted to 0-255.
static IP_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
    )
    .expect("static regex: IP address pattern")
});

/// Date of birth: a `dob` / `date of birth` / `birth date` label (case-insensitive),
/// an optional `:` or `=`, then a numeric date using `/` or `-` separators.
/// A bare date without the label does not match.
static DOB_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?i)\b(?:dob|date\s*of\s*birth|birth\s*date)\s*[:=]?\s*\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}\b",
    )
    .expect("static regex: date of birth pattern")
});

/// Five-digit ZIP code with an optional `-NNNN` extension.
static ZIP_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\b\d{5}(?:-\d{4})?\b").expect("static regex: ZIP code pattern"));

/// Driver's license: a `driver's license` / `dl` label (case-insensitive), optional
/// `#`, optional `:` or `=`, then 6-12 alphanumeric characters.
static DL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)\b(?:driver'?s?\s*license|dl)\s*#?\s*[:=]?\s*[A-Z0-9]{6,12}\b")
        .expect("static regex: driver's license pattern")
});

/// Passport number: the word `passport` (case-insensitive), optional `#`, optional
/// `:` or `=`, then 6-9 alphanumeric characters.
static PASSPORT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)\bpassport\s*#?\s*[:=]?\s*[A-Z0-9]{6,9}\b")
        .expect("static regex: passport pattern")
});
77
// HIGH-SEC: International tax/ID patterns

/// UK National Insurance Number: 2 letters + 6 digits + 1 letter (e.g., AB123456C).
/// The prefix letters, each pair of digits, and the suffix letter may be separated
/// by a single space or dash. Matching is case-sensitive (uppercase only) and the
/// suffix letter must be A-D.
static UK_NIN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\b[A-CEGHJ-PR-TW-Z]{2}[\s\-]?\d{2}[\s\-]?\d{2}[\s\-]?\d{2}[\s\-]?[A-D]\b")
        .expect("static regex: UK NIN pattern")
});

/// Canada Social Insurance Number: 9 digits, often XXX-XXX-XXX or XXX XXX XXX.
/// No label is required, so any nine-digit number delimited by word boundaries matches.
static CA_SIN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\b\d{3}[\s\-]?\d{3}[\s\-]?\d{3}\b").expect("static regex: Canada SIN pattern")
});

/// EU VAT Number: Country prefix (2 letters) + country-specific format.
/// Common formats: AT + U + 8 digits, BE + 10 digits, DE + 9 digits, etc.
/// Matching is case-insensitive.
static EU_VAT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?i)\b(?:ATU\d{8}|BE[01]\d{9}|DE\d{9}|DK\d{8}|EE\d{9}|EL\d{9}|ES[A-Z]\d{7}[A-Z0-9]|FI\d{8}|FR[A-Z0-9]{2}\d{9}|HR\d{11}|HU\d{8}|IE\d{7}[A-Z]{1,2}|IT\d{11}|LT\d{9,12}|LU\d{8}|LV\d{11}|MT\d{8}|NL\d{9}B\d{2}|PL\d{10}|PT\d{9}|RO\d{2,10}|SE\d{12}|SI\d{8}|SK\d{10}|CY\d{8}[A-Z]|CZ\d{8,10}|BG\d{9,10})\b",
    )
    .expect("static regex: EU VAT pattern")
});

/// Australian Tax File Number (TFN): 8-9 digits, optionally grouped 3-3-2 or 3-3-3.
/// Only matches when preceded by a `tfn` / `tax file number` label (case-insensitive).
static AU_TFN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)\b(?:tfn|tax\s*file\s*number)\s*[:=]?\s*\d{3}[\s\-]?\d{3}[\s\-]?\d{2,3}\b")
        .expect("static regex: Australia TFN pattern")
});

/// Indian Aadhaar Number: 12 digits, often formatted as XXXX XXXX XXXX.
/// The first digit must be 2-9; groups may be separated by a space or dash.
static IN_AADHAAR_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\b[2-9]\d{3}[\s\-]?\d{4}[\s\-]?\d{4}\b")
        .expect("static regex: India Aadhaar pattern")
});

/// Indian PAN (Permanent Account Number): 5 letters + 4 digits + 1 letter.
/// Matching is case-sensitive (uppercase letters only).
static IN_PAN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\b[A-Z]{5}\d{4}[A-Z]\b").expect("static regex: India PAN pattern")
});
117
118/// Returns the list of PII patterns to check.
119fn pii_patterns() -> Vec<PiiPattern> {
120    vec![
121        PiiPattern {
122            name: "Email Address",
123            regex: &EMAIL_REGEX,
124        },
125        PiiPattern {
126            name: "SSN",
127            regex: &SSN_REGEX,
128        },
129        PiiPattern {
130            name: "Phone Number",
131            regex: &PHONE_REGEX,
132        },
133        PiiPattern {
134            name: "Credit Card Number",
135            regex: &CREDIT_CARD_REGEX,
136        },
137        PiiPattern {
138            name: "IP Address",
139            regex: &IP_REGEX,
140        },
141        PiiPattern {
142            name: "Date of Birth",
143            regex: &DOB_REGEX,
144        },
145        PiiPattern {
146            name: "ZIP Code",
147            regex: &ZIP_REGEX,
148        },
149        PiiPattern {
150            name: "Driver's License",
151            regex: &DL_REGEX,
152        },
153        PiiPattern {
154            name: "Passport Number",
155            regex: &PASSPORT_REGEX,
156        },
157        // International tax/ID patterns
158        PiiPattern {
159            name: "UK National Insurance Number",
160            regex: &UK_NIN_REGEX,
161        },
162        PiiPattern {
163            name: "Canada SIN",
164            regex: &CA_SIN_REGEX,
165        },
166        PiiPattern {
167            name: "EU VAT Number",
168            regex: &EU_VAT_REGEX,
169        },
170        PiiPattern {
171            name: "Australia TFN",
172            regex: &AU_TFN_REGEX,
173        },
174        PiiPattern {
175            name: "India Aadhaar",
176            regex: &IN_AADHAAR_REGEX,
177        },
178        PiiPattern {
179            name: "India PAN",
180            regex: &IN_PAN_REGEX,
181        },
182    ]
183}
184
/// Detector for personally identifiable information.
///
/// Construct one with [`PiiDetector::new`] (or `Default`), optionally adjust it
/// with [`PiiDetector::include_local`], then call [`PiiDetector::detect`] on the
/// text to scan.
#[derive(Debug, Clone)]
pub struct PiiDetector {
    /// Skip common non-PII patterns (like local IPs).
    skip_local: bool,
}
190
191impl PiiDetector {
192    /// Creates a new PII detector.
193    #[must_use]
194    pub const fn new() -> Self {
195        Self { skip_local: true }
196    }
197
198    /// Disables skipping of local/non-sensitive patterns.
199    #[must_use]
200    pub const fn include_local(mut self) -> Self {
201        self.skip_local = false;
202        self
203    }
204
205    /// Checks if content contains PII.
206    #[must_use]
207    pub fn contains_pii(&self, content: &str) -> bool {
208        !self.detect(content).is_empty()
209    }
210
211    /// Returns all detected PII matches.
212    #[must_use]
213    pub fn detect(&self, content: &str) -> Vec<PiiMatch> {
214        let mut found_matches = Vec::new();
215
216        for pattern in pii_patterns() {
217            self.collect_pattern_matches(pattern.name, pattern.regex, content, &mut found_matches);
218        }
219
220        // Sort by position
221        found_matches.sort_by_key(|m| m.start);
222
223        // Remove overlapping matches
224        deduplicate_overlapping(found_matches)
225    }
226
227    /// Collects matches for a single pattern into the result vector.
228    fn collect_pattern_matches(
229        &self,
230        pattern_name: &str,
231        regex: &Regex,
232        content: &str,
233        matches: &mut Vec<PiiMatch>,
234    ) {
235        for m in regex.find_iter(content) {
236            if let Some(pii_match) = self.process_match(pattern_name, &m, content) {
237                matches.push(pii_match);
238            }
239        }
240    }
241
242    /// Processes a match and returns a `PiiMatch` if it should be included.
243    fn process_match(
244        &self,
245        pattern_name: &str,
246        m: &regex::Match<'_>,
247        content: &str,
248    ) -> Option<PiiMatch> {
249        let match_str = m.as_str();
250
251        // Skip local IP addresses if configured
252        if self.skip_local && pattern_name == "IP Address" && is_local_ip(match_str) {
253            return None;
254        }
255
256        // Skip common non-PII ZIP codes (very short, likely not actual addresses)
257        if pattern_name == "ZIP Code"
258            && match_str.len() == 5
259            && !is_zip_in_address_context(content, m.start())
260        {
261            return None;
262        }
263
264        Some(PiiMatch {
265            pii_type: pattern_name.to_string(),
266            start: m.start(),
267            end: m.end(),
268            matched_text: match_str.to_string(),
269        })
270    }
271
272    /// Returns the types of PII detected.
273    #[must_use]
274    pub fn detect_types(&self, content: &str) -> Vec<String> {
275        self.detect(content)
276            .into_iter()
277            .map(|m| m.pii_type)
278            .collect()
279    }
280
281    /// Returns the count of PII detected.
282    #[must_use]
283    pub fn count(&self, content: &str) -> usize {
284        self.detect(content).len()
285    }
286}
287
288/// Removes overlapping matches, keeping the first occurrence.
289fn deduplicate_overlapping(sorted_matches: Vec<PiiMatch>) -> Vec<PiiMatch> {
290    let mut result = Vec::new();
291    let mut last_end = 0;
292
293    for m in sorted_matches {
294        if m.start >= last_end {
295            last_end = m.end;
296            result.push(m);
297        }
298    }
299
300    result
301}
302
/// Checks if an IP address is a local/private address.
///
/// Recognised ranges:
/// - loopback `127.0.0.0/8`
/// - private `10.0.0.0/8`, `172.16.0.0/12`, and `192.168.0.0/16`
/// - the unspecified address `0.0.0.0`
///
/// `ip` is expected to be a dotted-quad IPv4 string; anything else that does
/// not start with one of the prefixes above yields `false`.
fn is_local_ip(ip: &str) -> bool {
    if ip == "0.0.0.0"
        || ip.starts_with("127.")
        || ip.starts_with("10.")
        || ip.starts_with("192.168.")
    {
        return true;
    }

    // 172.16.0.0/12 covers second octets 16 through 31, not only 172.16.x.x.
    ip.strip_prefix("172.")
        .and_then(|rest| rest.split_once('.'))
        .and_then(|(second_octet, _)| second_octet.parse::<u8>().ok())
        .is_some_and(|second_octet| (16..=31).contains(&second_octet))
}
311
/// Checks if a ZIP code appears in an address context.
///
/// Looks at up to 20 bytes of text immediately before `match_start` and returns
/// `true` when that window contains the word "address" or "zip" (in any case)
/// or a comma.
///
/// `match_start` is a byte offset into `content`. The window start is moved
/// forward to the nearest UTF-8 character boundary so that multi-byte text
/// before the match cannot cause a slicing panic. An offset that is out of
/// range or not on a character boundary is treated as having no context.
fn is_zip_in_address_context(content: &str, match_start: usize) -> bool {
    const CONTEXT_BYTES: usize = 20;

    let mut window_start = match_start.saturating_sub(CONTEXT_BYTES);
    // `match_start - 20` may land in the middle of a multi-byte character;
    // shrink the window until it begins on a valid boundary.
    while window_start < match_start && !content.is_char_boundary(window_start) {
        window_start += 1;
    }

    let Some(before) = content.get(window_start..match_start) else {
        return false;
    };

    let before_lower = before.to_lowercase();
    before_lower.contains("address") || before_lower.contains("zip") || before.contains(',')
}
322
/// The default detector is the one returned by [`PiiDetector::new`].
impl Default for PiiDetector {
    /// Delegates to [`PiiDetector::new`].
    fn default() -> Self {
        Self::new()
    }
}
328
// Unit tests for the PII detector. Most tests only assert that the expected
// PII type appears among the matches, because several patterns can fire on
// the same input before overlapping matches are removed.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_email() {
        let detector = PiiDetector::new();
        let content = "Contact me at john.doe@example.com";
        let matches = detector.detect(content);

        // Exact count: no other pattern is expected to fire on this input.
        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].pii_type, "Email Address");
        assert_eq!(matches[0].matched_text, "john.doe@example.com");
    }

    #[test]
    fn test_detect_ssn() {
        let detector = PiiDetector::new();
        let content = "SSN: 123-45-6789";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "SSN"));
    }

    #[test]
    fn test_detect_phone() {
        let detector = PiiDetector::new();
        let content = "Call me at (555) 123-4567";
        let matches = detector.detect(content);

        // Only the type is checked here, not the exact matched text.
        // NOTE(review): the phone pattern starts with `\b`, so the opening
        // parenthesis is probably not part of the matched text — confirm
        // before asserting on `matched_text`.
        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "Phone Number"));
    }

    #[test]
    fn test_detect_credit_card() {
        let detector = PiiDetector::new();
        // Visa test number
        let content = "Card: 4111111111111111";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "Credit Card Number"));
    }

    #[test]
    fn test_detect_ip_address() {
        let detector = PiiDetector::new();
        // Not one of the local prefixes, so it is reported by the default detector.
        let content = "Server IP: 203.0.113.42";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "IP Address"));
    }

    #[test]
    fn test_skip_local_ip() {
        // The default detector has `skip_local` enabled.
        let detector = PiiDetector::new();
        let content = "Localhost: 127.0.0.1";
        let matches = detector.detect(content);

        assert!(matches.iter().all(|m| m.pii_type != "IP Address"));
    }

    #[test]
    fn test_include_local_ip() {
        // `include_local()` turns the local-IP filter off.
        let detector = PiiDetector::new().include_local();
        let content = "Localhost: 127.0.0.1";
        let matches = detector.detect(content);

        assert!(matches.iter().any(|m| m.pii_type == "IP Address"));
    }

    #[test]
    fn test_no_pii() {
        let detector = PiiDetector::new();
        let content = "This is just regular text without PII.";
        assert!(!detector.contains_pii(content));
    }

    #[test]
    fn test_multiple_pii() {
        let detector = PiiDetector::new();
        let content = "Email: test@example.com, Phone: 555-123-4567";
        let matches = detector.detect(content);

        // At least the email and the phone number must be reported.
        assert!(matches.len() >= 2);
    }

    #[test]
    fn test_detect_types() {
        let detector = PiiDetector::new();
        let content = "test@example.com";
        let types = detector.detect_types(content);

        assert!(types.contains(&"Email Address".to_string()));
    }

    // ============================================================================
    // International Tax/ID Pattern Tests
    // ============================================================================

    #[test]
    fn test_detect_uk_nin() {
        let detector = PiiDetector::new();
        // Valid UK NIN format: 2 letters + 6 digits + 1 letter
        let content = "NIN: AB123456C";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(
            matches
                .iter()
                .any(|m| m.pii_type == "UK National Insurance Number")
        );
    }

    #[test]
    fn test_detect_uk_nin_with_spaces() {
        let detector = PiiDetector::new();
        // Same number with a space between every group.
        let content = "National Insurance: AB 12 34 56 C";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(
            matches
                .iter()
                .any(|m| m.pii_type == "UK National Insurance Number")
        );
    }

    #[test]
    fn test_detect_canada_sin() {
        let detector = PiiDetector::new();
        // Canadian SIN: 9 digits, often XXX-XXX-XXX
        let content = "SIN: 123-456-789";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "Canada SIN"));
    }

    #[test]
    fn test_detect_canada_sin_no_dashes() {
        let detector = PiiDetector::new();
        let content = "SIN: 123456789";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "Canada SIN"));
    }

    #[test]
    fn test_detect_eu_vat_german() {
        let detector = PiiDetector::new();
        // German VAT: DE + 9 digits
        let content = "VAT: DE123456789";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "EU VAT Number"));
    }

    #[test]
    fn test_detect_eu_vat_french() {
        let detector = PiiDetector::new();
        // French VAT: FR + 2 chars + 9 digits
        let content = "VAT: FR12123456789";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "EU VAT Number"));
    }

    #[test]
    fn test_detect_eu_vat_dutch() {
        let detector = PiiDetector::new();
        // Dutch VAT: NL + 9 digits + B + 2 digits
        let content = "VAT: NL123456789B01";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "EU VAT Number"));
    }

    #[test]
    fn test_detect_australia_tfn() {
        let detector = PiiDetector::new();
        // Australian TFN: 8-9 digits with context
        // The TFN match starts at the label, so it sorts ahead of the Canada
        // SIN match on the digits and is the one kept after de-duplication.
        let content = "TFN: 123-456-789";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "Australia TFN"));
    }

    #[test]
    fn test_detect_india_aadhaar() {
        let detector = PiiDetector::new();
        // Aadhaar: 12 digits, first digit 2-9
        let content = "Aadhaar: 2345 6789 0123";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "India Aadhaar"));
    }

    #[test]
    fn test_detect_india_pan() {
        let detector = PiiDetector::new();
        // PAN: 5 letters + 4 digits + 1 letter
        let content = "PAN: ABCDE1234F";
        let matches = detector.detect(content);

        assert!(!matches.is_empty());
        assert!(matches.iter().any(|m| m.pii_type == "India PAN"));
    }

    #[test]
    fn test_international_ids_case_insensitive() {
        let detector = PiiDetector::new();

        // EU VAT lowercase
        let content = "vat: de123456789";
        let matches = detector.detect(content);
        assert!(matches.iter().any(|m| m.pii_type == "EU VAT Number"));
    }
}