subcog/hooks/pre_compact/
analyzer.rs1use super::{FINGERPRINT_LENGTH, MIN_COMMON_CHARS_FOR_DUPLICATE};
7use crate::models::Namespace;
8
/// Candidate record consisting of text content, a namespace, and a confidence score.
#[derive(Debug, Clone)]
pub struct CaptureCandidate {
    /// Text of the candidate; `deduplicate_candidates` fingerprints its leading characters.
    pub content: String,
    /// Namespace associated with this candidate.
    pub namespace: Namespace,
    /// Confidence score; `deduplicate_candidates` orders candidates by this value, highest first.
    pub confidence: f32,
}
19
/// Reports whether `text` contains any decision-related marker phrase.
///
/// The check is a case-insensitive substring search: `text` is lowercased once and
/// then scanned for each marker, so a marker also matches inside a longer word.
#[must_use]
pub fn contains_decision_language(text: &str) -> bool {
    const MARKERS: [&str; 8] = [
        "decided",
        "decision",
        "we'll use",
        "we're using",
        "going to use",
        "chose",
        "selected",
        "approach",
    ];

    let haystack = text.to_lowercase();
    MARKERS.iter().any(|marker| haystack.contains(*marker))
}
33
/// Reports whether `text` contains any learning-related marker phrase.
///
/// Matching is case-insensitive. The phrase markers ("learned", "discovered",
/// "realized", "turns out", "found out", "gotcha", "caveat") are plain substring
/// matches. The abbreviation "til" followed by a space only counts when it is not
/// preceded by an alphanumeric character, so common words such as "until" and
/// "still" do not trigger a match.
#[must_use]
pub fn contains_learning_language(text: &str) -> bool {
    const PHRASES: [&str; 7] = [
        "learned",
        "discovered",
        "realized",
        "turns out",
        "found out",
        "gotcha",
        "caveat",
    ];

    let lower = text.to_lowercase();
    if PHRASES.iter().any(|phrase| lower.contains(*phrase)) {
        return true;
    }

    // A bare substring search for "til " would also hit "until " and "still ";
    // require that the match starts the text or follows a non-alphanumeric character.
    lower
        .match_indices("til ")
        .any(|(start, _)| !lower[..start].ends_with(char::is_alphanumeric))
}
47
/// Reports whether `text` describes a resolved problem.
///
/// Both conditions must hold (case-insensitive substring search): at least one
/// resolution word ("fixed", "resolved", "solved") and at least one problem word
/// ("issue", "bug", "error", "problem").
#[must_use]
pub fn contains_blocker_language(text: &str) -> bool {
    const RESOLUTION_WORDS: [&str; 3] = ["fixed", "resolved", "solved"];
    const PROBLEM_WORDS: [&str; 4] = ["issue", "bug", "error", "problem"];

    let haystack = text.to_lowercase();

    // Without a resolution word the problem words are irrelevant.
    if !RESOLUTION_WORDS.iter().any(|word| haystack.contains(*word)) {
        return false;
    }
    PROBLEM_WORDS.iter().any(|word| haystack.contains(*word))
}
58
/// Reports whether `text` contains any pattern/convention marker phrase.
///
/// The check is a case-insensitive substring search. Several markers carry a
/// trailing space ("always ", "never ", "must "), so they only match when the
/// word is followed by a space.
#[must_use]
pub fn contains_pattern_language(text: &str) -> bool {
    const MARKERS: [&str; 7] = [
        "pattern",
        "best practice",
        "convention",
        "always ",
        "never ",
        "should always",
        "must ",
    ];

    let haystack = text.to_lowercase();
    MARKERS.iter().any(|marker| haystack.contains(*marker))
}
71
/// Reports whether `text` contains any context/rationale marker phrase.
///
/// The check is a case-insensitive substring search. The label-style markers
/// ("context:", "important:", "note:", "background:") include the colon, so the
/// bare word alone does not match.
#[must_use]
pub fn contains_context_language(text: &str) -> bool {
    const MARKERS: [&str; 10] = [
        "because",
        "constraint",
        "requirement",
        "context:",
        "important:",
        "note:",
        "background:",
        "rationale",
        "reason why",
        "due to",
    ];

    let haystack = text.to_lowercase();
    MARKERS.iter().any(|marker| haystack.contains(*marker))
}
90
/// Computes a heuristic confidence score for a section of text.
///
/// Starts at `0.5` and adds:
/// - `0.1` when the section is longer than 100 bytes, and another `0.1` when it is
///   longer than 200 bytes (`str::len` counts bytes, not characters);
/// - `0.1` when the section contains at least two `.` / `!` characters in total;
/// - `0.05` when the section looks like it contains code: a Markdown fence
///   (three backticks) or a run of four spaces (indented code).
///
/// The result is capped at `0.95`.
#[must_use]
pub fn calculate_section_confidence(section: &str) -> f32 {
    let mut confidence: f32 = 0.5;

    let byte_len = section.len();
    if byte_len > 100 {
        confidence += 0.1;
    }
    if byte_len > 200 {
        confidence += 0.1;
    }

    // Periods and exclamation marks serve as a rough sentence count.
    let sentence_count = section
        .matches(|c: char| matches!(c, '.' | '!'))
        .count();
    if sentence_count >= 2 {
        confidence += 0.1;
    }

    // Testing for a single space would be true for nearly any multi-word text and
    // make this bonus meaningless; a four-space run is the indentation signal.
    if section.contains("```") || section.contains("    ") {
        confidence += 0.05;
    }

    confidence.min(0.95)
}
122
123#[must_use]
127pub fn deduplicate_candidates(mut candidates: Vec<CaptureCandidate>) -> Vec<CaptureCandidate> {
128 candidates.sort_by(|a, b| {
130 b.confidence
131 .partial_cmp(&a.confidence)
132 .unwrap_or(std::cmp::Ordering::Equal)
133 });
134
135 let mut result = Vec::new();
136 let mut seen_prefixes: Vec<String> = Vec::new();
137
138 for candidate in candidates {
139 let prefix: String = candidate.content.chars().take(FINGERPRINT_LENGTH).collect();
141
142 let is_duplicate = seen_prefixes.iter().any(|p| {
144 let common = p
145 .chars()
146 .zip(prefix.chars())
147 .take_while(|(a, b)| a == b)
148 .count();
149 common > MIN_COMMON_CHARS_FOR_DUPLICATE
150 });
151
152 if !is_duplicate {
153 seen_prefixes.push(prefix);
154 result.push(candidate);
155 }
156 }
157
158 result
159}
160
// Unit tests for the keyword detectors, the confidence heuristic, and deduplication.
#[cfg(test)]
mod tests {
    use super::*;

    // Decision markers match regardless of case; unrelated text does not match.
    #[test]
    fn test_contains_decision_language() {
        assert!(contains_decision_language("We decided to use PostgreSQL"));
        assert!(contains_decision_language("The decision was made"));
        assert!(contains_decision_language("We chose this approach"));
        assert!(!contains_decision_language("Just some regular text"));
    }

    // Learning markers, including the uppercase "TIL" abbreviation.
    #[test]
    fn test_contains_learning_language() {
        assert!(contains_learning_language("TIL that Rust has great safety"));
        assert!(contains_learning_language("I realized the problem"));
        assert!(contains_learning_language("Turns out it was a bug"));
        assert!(!contains_learning_language("Regular text here"));
    }

    // A resolution word alone ("fixed") is not enough; a problem word is also required.
    #[test]
    fn test_contains_blocker_language() {
        assert!(contains_blocker_language("Fixed the issue with auth"));
        assert!(contains_blocker_language("Resolved the bug in parser"));
        assert!(!contains_blocker_language("Just fixed the typo"));
    }

    // Pattern/convention markers.
    #[test]
    fn test_contains_pattern_language() {
        assert!(contains_pattern_language("This is a common pattern"));
        assert!(contains_pattern_language("Best practice is to..."));
        assert!(contains_pattern_language("You should always check..."));
        assert!(!contains_pattern_language(
            "Hello world, this is regular code"
        ));
    }

    // Context/rationale markers, including the colon-terminated label forms.
    #[test]
    fn test_contains_context_language() {
        assert!(contains_context_language(
            "We did this because of performance requirements"
        ));
        assert!(contains_context_language("Context: the system needs X"));
        assert!(contains_context_language(
            "The constraint here is memory usage"
        ));
        assert!(contains_context_language(
            "Important: this must complete fast"
        ));
        assert!(contains_context_language("Note: this is a workaround"));
        assert!(contains_context_language(
            "Due to backwards compatibility, we chose this"
        ));
        assert!(!contains_context_language(
            "Just some regular implementation code"
        ));
    }

    // Confidence must strictly increase from the short to the medium to the long fixture.
    #[test]
    fn test_calculate_confidence() {
        let short_text = "Short";
        let medium_text =
            "This is a medium length text that contains some words. It has multiple sentences.";
        let long_text = "This is a much longer text that contains many words and sentences. It should have higher confidence. The text goes on and on with more information. Here is even more content to make it longer.";

        let short_conf = calculate_section_confidence(short_text);
        let medium_conf = calculate_section_confidence(medium_text);
        let long_conf = calculate_section_confidence(long_text);

        assert!(short_conf < medium_conf);
        assert!(medium_conf < long_conf);
    }

    // The first two candidates share a long common prefix; the third is unrelated.
    // Two results are expected after deduplication.
    #[test]
    fn test_deduplicate_candidates() {
        let candidates = vec![
            CaptureCandidate {
                content: "This is a test content that is quite long and should be unique"
                    .to_string(),
                namespace: Namespace::Decisions,
                confidence: 0.8,
            },
            CaptureCandidate {
                content: "This is a test content that is quite long and should match".to_string(),
                namespace: Namespace::Decisions,
                confidence: 0.7,
            },
            CaptureCandidate {
                content: "Completely different content here with no similarity".to_string(),
                namespace: Namespace::Learnings,
                confidence: 0.9,
            },
        ];

        let result = deduplicate_candidates(candidates);
        assert_eq!(result.len(), 2);
    }
}