Skip to main content

subcog/hooks/pre_compact/
analyzer.rs

//! Content analysis for pre-compact hook.
//!
//! This module handles extracting capture candidates from conversation content
//! using keyword-based language detection.
5
6use super::{FINGERPRINT_LENGTH, MIN_COMMON_CHARS_FOR_DUPLICATE};
7use crate::models::Namespace;
8
9/// Candidate for capture.
10#[derive(Debug, Clone)]
11pub struct CaptureCandidate {
12    /// The content to capture.
13    pub content: String,
14    /// Detected namespace for this content.
15    pub namespace: Namespace,
16    /// Confidence score (0.0-1.0).
17    pub confidence: f32,
18}
19
/// Reports whether `text` mentions a decision being made.
///
/// The check is a case-insensitive substring search over a fixed keyword
/// list, so a keyword embedded in a longer word (e.g. "chosen") also counts.
#[must_use]
pub fn contains_decision_language(text: &str) -> bool {
    const DECISION_MARKERS: [&str; 8] = [
        "decided",
        "decision",
        "we'll use",
        "we're using",
        "going to use",
        "chose",
        "selected",
        "approach",
    ];

    let haystack = text.to_lowercase();
    DECISION_MARKERS
        .iter()
        .copied()
        .any(|marker| haystack.contains(marker))
}
33
/// Checks if text contains learning-related language.
///
/// Keyword phrases ("learned", "turns out", ...) are matched as
/// case-insensitive substrings. The abbreviation "TIL" is matched only as a
/// standalone word, so ordinary words that merely end in those letters
/// (such as "until") do not count as learning language.
#[must_use]
pub fn contains_learning_language(text: &str) -> bool {
    const LEARNING_PHRASES: [&str; 7] = [
        "learned",
        "discovered",
        "realized",
        "turns out",
        "found out",
        "gotcha",
        "caveat",
    ];

    let lower = text.to_lowercase();
    if LEARNING_PHRASES
        .iter()
        .any(|phrase| lower.contains(*phrase))
    {
        return true;
    }

    // A raw `contains("til ")` also fires on "until ", so compare whole
    // words instead. Splitting on non-alphanumerics additionally accepts
    // "TIL:" and a trailing "TIL".
    lower
        .split(|c: char| !c.is_alphanumeric())
        .any(|word| word == "til")
}
47
/// Checks if text contains blocker-related language.
///
/// A blocker mention needs both a resolution verb ("fixed", "resolved",
/// "solved") and a problem noun ("issue", "bug", "error", "problem").
/// Matching is case-insensitive.
///
/// Resolution verbs must appear as whole words, so negated or unrelated
/// words such as "unresolved", "unsolved" or "prefixed" do not count.
/// Problem nouns are matched as substrings so that plurals and identifiers
/// like "IoError" are still recognised.
#[must_use]
pub fn contains_blocker_language(text: &str) -> bool {
    const RESOLUTION_VERBS: [&str; 3] = ["fixed", "resolved", "solved"];
    const PROBLEM_NOUNS: [&str; 4] = ["issue", "bug", "error", "problem"];

    let lower = text.to_lowercase();

    // Whole-word comparison: a substring search would treat
    // "unresolved issue" as a resolved blocker.
    let mentions_resolution = lower
        .split(|c: char| !c.is_alphanumeric())
        .any(|word| RESOLUTION_VERBS.contains(&word));

    mentions_resolution && PROBLEM_NOUNS.iter().any(|noun| lower.contains(*noun))
}
58
/// Reports whether `text` talks about a pattern, convention or rule.
///
/// The check is a case-insensitive substring search. Several keywords
/// ("always ", "never ", "must ") include a trailing space and therefore
/// only match when followed by one.
#[must_use]
pub fn contains_pattern_language(text: &str) -> bool {
    const PATTERN_MARKERS: [&str; 7] = [
        "pattern",
        "best practice",
        "convention",
        "always ",
        "never ",
        "should always",
        "must ",
    ];

    let normalized = text.to_lowercase();
    for marker in PATTERN_MARKERS.iter() {
        if normalized.contains(*marker) {
            return true;
        }
    }
    false
}
71
/// Reports whether `text` carries context-style language.
///
/// Context captures record the "why" behind a decision: constraints,
/// requirements and background information. Detection is a
/// case-insensitive substring search over a fixed list of markers; the
/// label-style markers ("context:", "note:", ...) include their colon.
#[must_use]
pub fn contains_context_language(text: &str) -> bool {
    const CONTEXT_MARKERS: [&str; 10] = [
        "because",
        "constraint",
        "requirement",
        "context:",
        "important:",
        "note:",
        "background:",
        "rationale",
        "reason why",
        "due to",
    ];

    let normalized = text.to_lowercase();
    CONTEXT_MARKERS
        .iter()
        .any(|marker| normalized.contains(*marker))
}
90
/// Scores how likely a section is to be worth capturing.
///
/// Starts from a baseline of 0.5 and adds a bonus for each heuristic that
/// applies:
/// - more than 100 bytes of content (+0.1), and again above 200 bytes (+0.1)
/// - at least two `.`/`!` characters, taken as a sign of multiple sentences (+0.1)
/// - a code fence or a run of four spaces, taken as a sign of code (+0.05)
///
/// The result is capped at 0.95.
#[must_use]
pub fn calculate_section_confidence(section: &str) -> f32 {
    let byte_len = section.len();
    let terminators = section
        .chars()
        .filter(|ch| matches!(ch, '.' | '!'))
        .count();
    let looks_like_code = section.contains("```") || section.contains("    ");

    // (applies, bonus) pairs, accumulated in this order on top of the baseline.
    let bonuses: [(bool, f32); 4] = [
        (byte_len > 100, 0.1),
        (byte_len > 200, 0.1),
        (terminators >= 2, 0.1),
        (looks_like_code, 0.05),
    ];

    let score = bonuses
        .iter()
        .filter(|(applies, _)| *applies)
        .fold(0.5_f32, |total, (_, bonus)| total + bonus);

    score.min(0.95)
}
122
123/// Removes duplicate/similar candidates based on content fingerprints.
124///
125/// Keeps highest-confidence candidates when similar content is detected.
126#[must_use]
127pub fn deduplicate_candidates(mut candidates: Vec<CaptureCandidate>) -> Vec<CaptureCandidate> {
128    // Sort by confidence descending
129    candidates.sort_by(|a, b| {
130        b.confidence
131            .partial_cmp(&a.confidence)
132            .unwrap_or(std::cmp::Ordering::Equal)
133    });
134
135    let mut result = Vec::new();
136    let mut seen_prefixes: Vec<String> = Vec::new();
137
138    for candidate in candidates {
139        // Take first N chars as a "fingerprint"
140        let prefix: String = candidate.content.chars().take(FINGERPRINT_LENGTH).collect();
141
142        // Check if we've seen a similar prefix
143        let is_duplicate = seen_prefixes.iter().any(|p| {
144            let common = p
145                .chars()
146                .zip(prefix.chars())
147                .take_while(|(a, b)| a == b)
148                .count();
149            common > MIN_COMMON_CHARS_FOR_DUPLICATE
150        });
151
152        if !is_duplicate {
153            seen_prefixes.push(prefix);
154            result.push(candidate);
155        }
156    }
157
158    result
159}
160
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `detector` accepts every entry of `hits` and rejects every
    /// entry of `misses`, naming the offending input on failure.
    fn assert_detection(detector: fn(&str) -> bool, hits: &[&str], misses: &[&str]) {
        for text in hits {
            assert!(detector(text), "expected a match for {text:?}");
        }
        for text in misses {
            assert!(!detector(text), "expected no match for {text:?}");
        }
    }

    #[test]
    fn test_contains_decision_language() {
        assert_detection(
            contains_decision_language,
            &[
                "We decided to use PostgreSQL",
                "The decision was made",
                "We chose this approach",
            ],
            &["Just some regular text"],
        );
    }

    #[test]
    fn test_contains_learning_language() {
        assert_detection(
            contains_learning_language,
            &[
                "TIL that Rust has great safety",
                "I realized the problem",
                "Turns out it was a bug",
            ],
            &["Regular text here"],
        );
    }

    #[test]
    fn test_contains_blocker_language() {
        assert_detection(
            contains_blocker_language,
            &["Fixed the issue with auth", "Resolved the bug in parser"],
            // A resolution verb without a problem noun is not a blocker.
            &["Just fixed the typo"],
        );
    }

    #[test]
    fn test_contains_pattern_language() {
        assert_detection(
            contains_pattern_language,
            &[
                "This is a common pattern",
                "Best practice is to...",
                "You should always check...",
            ],
            // Use text that truly has no pattern-related words
            &["Hello world, this is regular code"],
        );
    }

    #[test]
    fn test_contains_context_language() {
        assert_detection(
            contains_context_language,
            &[
                "We did this because of performance requirements",
                "Context: the system needs X",
                "The constraint here is memory usage",
                "Important: this must complete fast",
                "Note: this is a workaround",
                "Due to backwards compatibility, we chose this",
            ],
            // Text with no context language
            &["Just some regular implementation code"],
        );
    }

    #[test]
    fn test_calculate_confidence() {
        let short_text = "Short";
        let medium_text =
            "This is a medium length text that contains some words. It has multiple sentences.";
        let long_text = "This is a much longer text that contains many words and sentences. It should have higher confidence. The text goes on and on with more information. Here is even more content to make it longer.";

        let short_conf = calculate_section_confidence(short_text);
        let medium_conf = calculate_section_confidence(medium_text);
        let long_conf = calculate_section_confidence(long_text);

        assert!(
            short_conf < medium_conf,
            "short ({short_conf}) should score below medium ({medium_conf})"
        );
        assert!(
            medium_conf < long_conf,
            "medium ({medium_conf}) should score below long ({long_conf})"
        );
    }

    #[test]
    fn test_deduplicate_candidates() {
        let candidate = |content: &str, namespace: Namespace, confidence: f32| CaptureCandidate {
            content: content.to_string(),
            namespace,
            confidence,
        };

        let candidates = vec![
            candidate(
                "This is a test content that is quite long and should be unique",
                Namespace::Decisions,
                0.8,
            ),
            candidate(
                "This is a test content that is quite long and should match",
                Namespace::Decisions,
                0.7,
            ),
            candidate(
                "Completely different content here with no similarity",
                Namespace::Learnings,
                0.9,
            ),
        ];

        let result = deduplicate_candidates(candidates);

        // Should keep highest confidence of similar ones + the unique one
        assert_eq!(result.len(), 2);

        // Survivors come back in descending confidence order.
        assert!(result[0].content.starts_with("Completely different"));
        assert!(result[1].content.ends_with("should be unique"));

        // The lower-confidence near-duplicate is the one that was dropped.
        assert!(
            result
                .iter()
                .all(|kept| !kept.content.ends_with("should match")),
            "lower-confidence duplicate should have been removed"
        );
    }
}
261}