Skip to main content

subcog/services/
entity_extraction.rs

1//! Entity extraction service for extracting entities from text using LLM.
2//!
3//! Provides LLM-powered entity extraction with graceful degradation when
4//! LLM is unavailable.
5//!
6//! # Example
7//!
8//! ```rust,ignore
9//! use subcog::services::EntityExtractorService;
10//! use subcog::llm::AnthropicClient;
11//! use subcog::models::Domain;
12//!
13//! let llm = AnthropicClient::new();
14//! let service = EntityExtractorService::new(Box::new(llm), Domain::for_user());
15//!
16//! let result = service.extract("Alice from Acme Corp uses Rust")?;
17//! println!("Extracted {} entities", result.entities.len());
18//! ```
19
20use crate::llm::{LlmProvider, OperationMode, build_system_prompt};
21use crate::models::Domain;
22use crate::models::graph::{Entity, EntityType, Relationship, RelationshipType};
23use crate::{Error, Result};
24use serde::{Deserialize, Serialize};
25use std::collections::HashMap;
26use std::sync::Arc;
27
/// Technology patterns for fallback entity extraction.
///
/// Organized by category for maintainability; the number in each category
/// comment is the count of entries in that group.
///
/// The fallback extractor compares these entries case-sensitively against the
/// input text and reports every hit as a `Technology` entity named exactly as
/// spelled here, so the spelling below is also the entity name callers see.
static TECH_PATTERNS: &[&str] = &[
    // Programming Languages (18)
    "Rust",
    "Python",
    "Java",
    "JavaScript",
    "TypeScript",
    "Go",
    "C++",
    "C#",
    "Ruby",
    "PHP",
    "Swift",
    "Kotlin",
    "Scala",
    "Elixir",
    "Haskell",
    "Clojure",
    "F#",
    "Zig",
    // Databases (12)
    "PostgreSQL",
    "MySQL",
    "SQLite",
    "Redis",
    "MongoDB",
    "Cassandra",
    "DynamoDB",
    "CockroachDB",
    "ClickHouse",
    "Elasticsearch",
    "Neo4j",
    "Firestore",
    // Web Frameworks (14)
    "React",
    "Vue",
    "Angular",
    "Svelte",
    "Next.js",
    "Nuxt",
    "Express",
    "Django",
    "Rails",
    "Laravel",
    "Spring",
    "Flask",
    "FastAPI",
    "Actix",
    // Cloud Providers (9)
    "AWS",
    "Azure",
    "GCP",
    "Cloudflare",
    "Vercel",
    "Netlify",
    "Heroku",
    "DigitalOcean",
    "Linode",
    // Container/Orchestration (8)
    "Docker",
    "Kubernetes",
    "k8s",
    "Podman",
    "Nomad",
    "ECS",
    "EKS",
    "GKE",
    // Infrastructure (6)
    "Terraform",
    "Ansible",
    "Prometheus",
    "Grafana",
    "Datadog",
    "Jaeger",
    // Message Queues (6)
    "Kafka",
    "RabbitMQ",
    "NATS",
    "Pulsar",
    "SQS",
    "Pub/Sub",
    // Build Tools (10)
    "Webpack",
    "Vite",
    "esbuild",
    "Rollup",
    "Cargo",
    "npm",
    "yarn",
    "pnpm",
    "Maven",
    "Gradle",
    // Runtime Environments (4)
    "Node.js",
    "Deno",
    "Bun",
    "WASM",
    // APIs/Protocols (6)
    "REST",
    "GraphQL",
    "gRPC",
    "WebSocket",
    "MQTT",
    "OpenAPI",
];
136
/// Result of entity extraction from text.
///
/// The derived [`Default`] value has no entities, no relationships, no
/// warnings and `used_fallback == false`; it is what `extract` returns for
/// blank input.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ExtractionResult {
    /// Extracted entities.
    pub entities: Vec<ExtractedEntity>,
    /// Extracted relationships (always empty when the pattern fallback was used).
    pub relationships: Vec<ExtractedRelationship>,
    /// Whether extraction used fallback (no LLM).
    ///
    /// `true` when the result came from pattern matching, either because no
    /// provider is configured or because the provider call failed.
    pub used_fallback: bool,
    /// Any warnings during extraction (human-readable, not machine-parseable).
    pub warnings: Vec<String>,
}
149
/// An entity extracted from text.
///
/// Deserialized directly from the LLM's JSON; only `name` and `type` are
/// required keys, every other field has a serde default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedEntity {
    /// Entity name.
    pub name: String,
    /// Entity type as string (maps to [`EntityType`]).
    ///
    /// Serialized under the JSON key `"type"`.
    #[serde(rename = "type")]
    pub entity_type: String,
    /// Confidence score (0.0-1.0).
    ///
    /// Falls back to `default_confidence()` (0.8) when the key is absent.
    /// The range is not enforced during deserialization.
    #[serde(default = "default_confidence")]
    pub confidence: f32,
    /// Alternative names for this entity (empty when the key is absent).
    #[serde(default)]
    pub aliases: Vec<String>,
    /// Brief description if available (`None` when the key is absent).
    #[serde(default)]
    pub description: Option<String>,
}
168
/// Confidence used by serde when a deserialized entity or relationship does
/// not carry an explicit `confidence` value.
const fn default_confidence() -> f32 {
    0.8_f32
}
172
/// A relationship extracted from text.
///
/// Endpoints are referenced by entity *name*, not by ID; they are resolved to
/// graph entities later through a name-keyed map.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedRelationship {
    /// Source entity name.
    pub from: String,
    /// Target entity name.
    pub to: String,
    /// Relationship type as string.
    ///
    /// Serialized under the JSON key `"type"`.
    #[serde(rename = "type")]
    pub relationship_type: String,
    /// Confidence score (0.0-1.0).
    ///
    /// Falls back to `default_confidence()` (0.8) when the key is absent.
    #[serde(default = "default_confidence")]
    pub confidence: f32,
    /// Evidence text supporting this relationship (`None` when the key is absent).
    #[serde(default)]
    pub evidence: Option<String>,
}
190
/// Result of relationship inference between entities.
///
/// The derived [`Default`] value is empty with `used_fallback == false`; it is
/// what `infer_relationships` returns for an empty entity slice.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct InferenceResult {
    /// Inferred relationships.
    pub relationships: Vec<InferredRelationship>,
    /// Whether inference used fallback (no LLM).
    ///
    /// `true` when the result came from the built-in heuristics, either because
    /// no provider is configured or because the provider call failed.
    pub used_fallback: bool,
    /// Any warnings during inference (human-readable).
    pub warnings: Vec<String>,
}
201
/// A relationship inferred between existing entities.
///
/// Same shape as [`ExtractedRelationship`], except that the free-text
/// justification is called `reasoning` instead of `evidence`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InferredRelationship {
    /// Source entity name.
    pub from: String,
    /// Target entity name.
    pub to: String,
    /// Relationship type as string.
    ///
    /// Serialized under the JSON key `"type"`.
    #[serde(rename = "type")]
    pub relationship_type: String,
    /// Confidence score (0.0-1.0).
    ///
    /// Falls back to `default_confidence()` (0.8) when the key is absent.
    #[serde(default = "default_confidence")]
    pub confidence: f32,
    /// Reasoning for this inference (`None` when the key is absent).
    #[serde(default)]
    pub reasoning: Option<String>,
}
219
/// LLM response structure for entity extraction.
///
/// Both keys are optional in the JSON: a missing key deserializes to an empty
/// list, so `{}` is a valid (empty) response.
#[derive(Debug, Clone, Deserialize)]
struct LlmExtractionResponse {
    /// Entities reported by the model, before confidence filtering.
    #[serde(default)]
    entities: Vec<ExtractedEntity>,
    /// Relationships reported by the model, before confidence filtering.
    #[serde(default)]
    relationships: Vec<ExtractedRelationship>,
}
228
/// LLM response structure for relationship inference.
///
/// The `relationships` key is optional; when missing it deserializes to an
/// empty list.
#[derive(Debug, Clone, Deserialize)]
struct LlmInferenceResponse {
    /// Relationships proposed by the model, before confidence filtering.
    #[serde(default)]
    relationships: Vec<InferredRelationship>,
}
235
/// Service for extracting entities from text content.
///
/// Uses an LLM to identify named entities and their relationships,
/// with graceful fallback when LLM is unavailable.
pub struct EntityExtractorService {
    /// LLM provider for extraction.
    ///
    /// `None` puts the service in fallback-only mode: extraction uses pattern
    /// matching and inference uses built-in heuristics.
    llm: Option<Arc<dyn LlmProvider>>,
    /// Default domain for extracted entities.
    domain: Domain,
    /// Minimum confidence threshold.
    ///
    /// Applied to both entities and relationships returned by the LLM; items
    /// scoring below it are dropped. Fallback results are not filtered by it.
    /// All constructors initialise it to 0.5.
    min_confidence: f32,
}
248
249impl EntityExtractorService {
250    /// Creates a new entity extractor with an LLM provider.
251    #[must_use]
252    pub fn new(llm: Box<dyn LlmProvider>, domain: Domain) -> Self {
253        Self {
254            llm: Some(Arc::from(llm)),
255            domain,
256            min_confidence: 0.5,
257        }
258    }
259
260    /// Creates an entity extractor without LLM (fallback mode only).
261    #[must_use]
262    pub const fn without_llm(domain: Domain) -> Self {
263        Self {
264            llm: None,
265            domain,
266            min_confidence: 0.5,
267        }
268    }
269
270    /// Creates an entity extractor with a shared LLM provider.
271    #[must_use]
272    pub const fn with_shared_llm(llm: Arc<dyn LlmProvider>, domain: Domain) -> Self {
273        Self {
274            llm: Some(llm),
275            domain,
276            min_confidence: 0.5,
277        }
278    }
279
    /// Sets the minimum confidence threshold for extracted entities.
    ///
    /// Builder-style: consumes `self` and returns it with the new threshold.
    /// The same threshold is also applied to relationships returned by the LLM.
    /// The value is stored as given; it is not clamped to 0.0-1.0 and NaN is
    /// not rejected.
    #[must_use]
    pub const fn with_min_confidence(mut self, threshold: f32) -> Self {
        self.min_confidence = threshold;
        self
    }
286
287    /// Extracts entities and relationships from text.
288    ///
289    /// # Arguments
290    ///
291    /// * `text` - The text to extract entities from.
292    ///
293    /// # Returns
294    ///
295    /// An [`ExtractionResult`] containing extracted entities and relationships.
296    ///
297    /// # Errors
298    ///
299    /// Returns an error if LLM extraction fails and no fallback is possible.
300    pub fn extract(&self, text: &str) -> Result<ExtractionResult> {
301        if text.trim().is_empty() {
302            return Ok(ExtractionResult::default());
303        }
304
305        match &self.llm {
306            Some(llm) => self.extract_with_llm(llm, text),
307            None => Ok(self.extract_fallback(text)),
308        }
309    }
310
311    /// Extracts entities using LLM.
312    fn extract_with_llm(&self, llm: &Arc<dyn LlmProvider>, text: &str) -> Result<ExtractionResult> {
313        let system = build_system_prompt(OperationMode::EntityExtraction, None);
314        let user = format!("Extract entities and relationships from this text:\n\n{text}");
315
316        let response = match llm.complete_with_system(&system, &user) {
317            Ok(r) => r,
318            Err(e) => {
319                tracing::warn!(error = %e, "LLM extraction failed, using fallback");
320                return Ok(self.extract_fallback(text));
321            },
322        };
323
324        // Parse JSON response
325        let parsed = self.parse_llm_response(&response)?;
326
327        // Filter by confidence threshold
328        let entities: Vec<_> = parsed
329            .entities
330            .into_iter()
331            .filter(|e| e.confidence >= self.min_confidence)
332            .collect();
333
334        let relationships: Vec<_> = parsed
335            .relationships
336            .into_iter()
337            .filter(|r| r.confidence >= self.min_confidence)
338            .collect();
339
340        Ok(ExtractionResult {
341            entities,
342            relationships,
343            used_fallback: false,
344            warnings: Vec::new(),
345        })
346    }
347
348    /// Parses the LLM JSON response.
349    fn parse_llm_response(&self, response: &str) -> Result<LlmExtractionResponse> {
350        // Try to find JSON in the response (it might be wrapped in markdown)
351        let json_str = self.extract_json(response);
352
353        serde_json::from_str(&json_str).map_err(|e| {
354            tracing::warn!(error = %e, response = %response, "Failed to parse LLM response");
355            Error::OperationFailed {
356                operation: "parse_entity_extraction".to_string(),
357                cause: format!("Invalid JSON response: {e}"),
358            }
359        })
360    }
361
362    /// Extracts JSON from a response that may be wrapped in markdown.
363    fn extract_json(&self, response: &str) -> String {
364        let trimmed = response.trim();
365
366        // Try markdown code block first
367        if let Some(json) = self.extract_json_from_markdown(trimmed) {
368            return json;
369        }
370
371        // Try raw JSON object
372        if let Some(json) = self.extract_raw_json(trimmed) {
373            return json;
374        }
375
376        // Return as-is if no JSON found
377        trimmed.to_string()
378    }
379
380    /// Extracts JSON from a markdown code block.
381    fn extract_json_from_markdown(&self, text: &str) -> Option<String> {
382        let start = text.find("```json")?;
383        let end_offset = text[start..]
384            .find("```\n")
385            .or_else(|| text[start..].rfind("```"))?;
386
387        let json_start = start + 7; // len("```json")
388        let json_end = start + end_offset;
389
390        if json_start < json_end {
391            Some(text[json_start..json_end].trim().to_string())
392        } else {
393            None
394        }
395    }
396
397    /// Extracts a raw JSON object from text.
398    fn extract_raw_json(&self, text: &str) -> Option<String> {
399        let start = text.find('{')?;
400        let end = text.rfind('}')?;
401
402        if start < end {
403            Some(text[start..=end].to_string())
404        } else {
405            None
406        }
407    }
408
409    /// Fallback extraction when LLM is unavailable.
410    ///
411    /// Uses simple pattern matching for common entity patterns.
412    fn extract_fallback(&self, text: &str) -> ExtractionResult {
413        let mut entities = Vec::new();
414        let mut warnings = vec!["LLM unavailable, using pattern-based fallback".to_string()];
415
416        for pattern in TECH_PATTERNS {
417            if text.contains(pattern) {
418                entities.push(ExtractedEntity {
419                    name: (*pattern).to_string(),
420                    entity_type: "Technology".to_string(),
421                    confidence: 0.7,
422                    aliases: Vec::new(),
423                    description: None,
424                });
425            }
426        }
427
428        if entities.is_empty() {
429            warnings.push("No entities detected with fallback patterns".to_string());
430        }
431
432        ExtractionResult {
433            entities,
434            relationships: Vec::new(),
435            used_fallback: true,
436            warnings,
437        }
438    }
439
440    /// Converts extracted entities to graph Entity objects.
441    ///
442    /// # Arguments
443    ///
444    /// * `extracted` - The extraction result.
445    ///
446    /// # Returns
447    ///
448    /// A vector of [`Entity`] objects ready for storage.
449    #[must_use]
450    pub fn to_graph_entities(&self, extracted: &ExtractionResult) -> Vec<Entity> {
451        extracted
452            .entities
453            .iter()
454            .map(|e| {
455                let entity_type = parse_entity_type(&e.entity_type);
456                let mut entity = Entity::new(entity_type, &e.name, self.domain.clone());
457                entity.confidence = e.confidence;
458                entity.aliases.clone_from(&e.aliases);
459                if let Some(desc) = &e.description {
460                    entity
461                        .properties
462                        .insert("description".to_string(), desc.clone());
463                }
464                entity
465            })
466            .collect()
467    }
468
469    /// Converts extracted relationships to graph Relationship objects.
470    ///
471    /// Requires a mapping from entity names to entity IDs.
472    ///
473    /// # Arguments
474    ///
475    /// * `extracted` - The extraction result.
476    /// * `entity_map` - Map from entity name to Entity.
477    ///
478    /// # Returns
479    ///
480    /// A vector of [`Relationship`] objects ready for storage.
481    #[must_use]
482    pub fn to_graph_relationships(
483        &self,
484        extracted: &ExtractionResult,
485        entity_map: &std::collections::HashMap<String, Entity>,
486    ) -> Vec<Relationship> {
487        extracted
488            .relationships
489            .iter()
490            .filter_map(|r| {
491                let from_entity = entity_map.get(&r.from)?;
492                let to_entity = entity_map.get(&r.to)?;
493                let rel_type = parse_relationship_type(&r.relationship_type);
494
495                let mut rel =
496                    Relationship::new(from_entity.id.clone(), to_entity.id.clone(), rel_type);
497                rel.confidence = r.confidence;
498                if let Some(evidence) = &r.evidence {
499                    rel.properties
500                        .insert("evidence".to_string(), evidence.clone());
501                }
502                Some(rel)
503            })
504            .collect()
505    }
506
507    /// Infers relationships between existing entities.
508    ///
509    /// Analyzes a set of entities and uses LLM to discover implicit relationships
510    /// that weren't explicitly stated in text.
511    ///
512    /// # Arguments
513    ///
514    /// * `entities` - The entities to analyze for relationships.
515    ///
516    /// # Returns
517    ///
518    /// An [`InferenceResult`] containing inferred relationships.
519    ///
520    /// # Errors
521    ///
522    /// Returns an error if LLM inference fails and no fallback is possible.
523    pub fn infer_relationships(&self, entities: &[Entity]) -> Result<InferenceResult> {
524        if entities.is_empty() {
525            return Ok(InferenceResult::default());
526        }
527
528        match &self.llm {
529            Some(llm) => self.infer_with_llm(llm, entities),
530            None => Ok(self.infer_fallback(entities)),
531        }
532    }
533
534    /// Infers relationships using LLM.
535    fn infer_with_llm(
536        &self,
537        llm: &Arc<dyn LlmProvider>,
538        entities: &[Entity],
539    ) -> Result<InferenceResult> {
540        let system = build_system_prompt(OperationMode::RelationshipInference, None);
541        let user = self.format_entities_for_inference(entities);
542
543        let response = match llm.complete_with_system(&system, &user) {
544            Ok(r) => r,
545            Err(e) => {
546                tracing::warn!(error = %e, "LLM inference failed, using fallback");
547                return Ok(self.infer_fallback(entities));
548            },
549        };
550
551        // Parse JSON response
552        let parsed = self.parse_inference_response(&response)?;
553
554        // Filter by confidence threshold
555        let relationships: Vec<_> = parsed
556            .relationships
557            .into_iter()
558            .filter(|r| r.confidence >= self.min_confidence)
559            .collect();
560
561        Ok(InferenceResult {
562            relationships,
563            used_fallback: false,
564            warnings: Vec::new(),
565        })
566    }
567
568    /// Formats entities for LLM inference.
569    fn format_entities_for_inference(&self, entities: &[Entity]) -> String {
570        use std::fmt::Write;
571
572        let mut output = String::from("Analyze these entities for potential relationships:\n\n");
573
574        for entity in entities {
575            let _ = writeln!(
576                output,
577                "- {} (type: {:?}, id: {})",
578                entity.name, entity.entity_type, entity.id
579            );
580            if !entity.aliases.is_empty() {
581                let _ = writeln!(output, "  Aliases: {}", entity.aliases.join(", "));
582            }
583        }
584
585        output
586    }
587
588    /// Parses the LLM JSON response for inference.
589    fn parse_inference_response(&self, response: &str) -> Result<LlmInferenceResponse> {
590        let json_str = self.extract_json(response);
591
592        serde_json::from_str(&json_str).map_err(|e| {
593            tracing::warn!(error = %e, response = %response, "Failed to parse inference response");
594            Error::OperationFailed {
595                operation: "parse_relationship_inference".to_string(),
596                cause: format!("Invalid JSON response: {e}"),
597            }
598        })
599    }
600
601    /// Fallback inference when LLM is unavailable.
602    ///
603    /// Uses heuristics to infer common relationships based on entity types.
604    fn infer_fallback(&self, entities: &[Entity]) -> InferenceResult {
605        let mut relationships = Vec::new();
606        let warnings = vec!["LLM unavailable, using heuristic-based fallback".to_string()];
607
608        // Build entity lookup by name
609        let entity_map: HashMap<&str, &Entity> =
610            entities.iter().map(|e| (e.name.as_str(), e)).collect();
611
612        // Infer common technology relationships
613        let tech_deps: &[(&str, &str)] = &[
614            ("Rust", "cargo"),
615            ("Python", "pip"),
616            ("Node.js", "npm"),
617            ("Java", "Maven"),
618            ("Ruby", "bundler"),
619            ("Go", "go modules"),
620            ("PostgreSQL", "SQL"),
621            ("MySQL", "SQL"),
622            ("SQLite", "SQL"),
623            ("Docker", "containers"),
624            ("Kubernetes", "Docker"),
625        ];
626
627        for (from, to) in tech_deps {
628            if entity_map.contains_key(*from) && entity_map.contains_key(*to) {
629                relationships.push(InferredRelationship {
630                    from: (*from).to_string(),
631                    to: (*to).to_string(),
632                    relationship_type: "Uses".to_string(),
633                    confidence: 0.7,
634                    reasoning: Some(format!("{from} commonly uses {to}")),
635                });
636            }
637        }
638
639        InferenceResult {
640            relationships,
641            used_fallback: true,
642            warnings,
643        }
644    }
645
646    /// Converts inferred relationships to graph [`Relationship`] objects.
647    ///
648    /// # Arguments
649    ///
650    /// * `inferred` - The inference result.
651    /// * `entity_map` - Map from entity name to Entity.
652    ///
653    /// # Returns
654    ///
655    /// A vector of [`Relationship`] objects ready for storage.
656    #[must_use]
657    pub fn inferred_to_graph_relationships(
658        &self,
659        inferred: &InferenceResult,
660        entity_map: &HashMap<String, Entity>,
661    ) -> Vec<Relationship> {
662        inferred
663            .relationships
664            .iter()
665            .filter_map(|r| {
666                let from_entity = entity_map.get(&r.from)?;
667                let to_entity = entity_map.get(&r.to)?;
668                let rel_type = parse_relationship_type(&r.relationship_type);
669
670                let mut rel =
671                    Relationship::new(from_entity.id.clone(), to_entity.id.clone(), rel_type);
672                rel.confidence = r.confidence;
673                if let Some(reasoning) = &r.reasoning {
674                    rel.properties
675                        .insert("reasoning".to_string(), reasoning.clone());
676                }
677                Some(rel)
678            })
679            .collect()
680    }
681}
682
683/// Parses entity type string to [`EntityType`] enum.
684fn parse_entity_type(s: &str) -> EntityType {
685    match s.to_lowercase().as_str() {
686        "person" => EntityType::Person,
687        "organization" | "org" | "company" | "team" => EntityType::Organization,
688        "technology" | "tech" | "framework" | "tool" | "language" => EntityType::Technology,
689        "file" | "source" | "config" => EntityType::File,
690        // Default to Concept for unknown types (including "concept", "pattern", "principle")
691        _ => EntityType::Concept,
692    }
693}
694
695/// Parses relationship type string to [`RelationshipType`] enum.
696fn parse_relationship_type(s: &str) -> RelationshipType {
697    match s.to_lowercase().as_str() {
698        "worksat" | "works_at" | "employedby" => RelationshipType::WorksAt,
699        "created" | "authored" | "wrote" => RelationshipType::Created,
700        "uses" | "utilizes" | "employs" => RelationshipType::Uses,
701        "implements" | "realizes" => RelationshipType::Implements,
702        "partof" | "part_of" | "belongsto" => RelationshipType::PartOf,
703        "mentionedin" | "mentioned_in" => RelationshipType::MentionedIn,
704        "supersedes" | "replaces" => RelationshipType::Supersedes,
705        "conflictswith" | "conflicts_with" | "contradicts" => RelationshipType::ConflictsWith,
706        _ => RelationshipType::RelatesTo, // Default to general relation
707    }
708}
709
710#[cfg(test)]
711mod tests {
712    use super::*;
713
714    #[test]
715    fn test_extraction_result_default() {
716        let result = ExtractionResult::default();
717        assert!(result.entities.is_empty());
718        assert!(result.relationships.is_empty());
719        assert!(!result.used_fallback);
720    }
721
722    #[test]
723    fn test_parse_entity_type() {
724        assert_eq!(parse_entity_type("Person"), EntityType::Person);
725        assert_eq!(parse_entity_type("PERSON"), EntityType::Person);
726        assert_eq!(parse_entity_type("Organization"), EntityType::Organization);
727        assert_eq!(parse_entity_type("company"), EntityType::Organization);
728        assert_eq!(parse_entity_type("Technology"), EntityType::Technology);
729        assert_eq!(parse_entity_type("framework"), EntityType::Technology);
730        assert_eq!(parse_entity_type("Concept"), EntityType::Concept);
731        assert_eq!(parse_entity_type("File"), EntityType::File);
732        assert_eq!(parse_entity_type("unknown"), EntityType::Concept);
733    }
734
735    #[test]
736    fn test_parse_relationship_type() {
737        assert_eq!(
738            parse_relationship_type("WorksAt"),
739            RelationshipType::WorksAt
740        );
741        assert_eq!(
742            parse_relationship_type("works_at"),
743            RelationshipType::WorksAt
744        );
745        assert_eq!(
746            parse_relationship_type("Created"),
747            RelationshipType::Created
748        );
749        assert_eq!(parse_relationship_type("Uses"), RelationshipType::Uses);
750        assert_eq!(
751            parse_relationship_type("Implements"),
752            RelationshipType::Implements
753        );
754        assert_eq!(parse_relationship_type("PartOf"), RelationshipType::PartOf);
755        assert_eq!(
756            parse_relationship_type("Supersedes"),
757            RelationshipType::Supersedes
758        );
759        assert_eq!(
760            parse_relationship_type("ConflictsWith"),
761            RelationshipType::ConflictsWith
762        );
763        assert_eq!(
764            parse_relationship_type("unknown"),
765            RelationshipType::RelatesTo
766        );
767    }
768
769    #[test]
770    fn test_extract_json_raw() {
771        let service = EntityExtractorService::without_llm(Domain::for_user());
772        let json = r#"{"entities": [], "relationships": []}"#;
773        assert_eq!(service.extract_json(json), json);
774    }
775
776    #[test]
777    fn test_extract_json_from_markdown() {
778        let service = EntityExtractorService::without_llm(Domain::for_user());
779        let response = r#"Here's the extraction:
780
781```json
782{"entities": [{"name": "Alice", "type": "Person"}], "relationships": []}
783```
784
785Done!"#;
786        let extracted = service.extract_json(response);
787        assert!(extracted.contains("Alice"));
788        assert!(extracted.starts_with('{'));
789    }
790
791    #[test]
792    fn test_fallback_extraction() {
793        let service = EntityExtractorService::without_llm(Domain::for_user());
794        let result = service
795            .extract("We use Rust and PostgreSQL for the backend")
796            .unwrap();
797
798        assert!(result.used_fallback);
799        assert!(!result.entities.is_empty());
800
801        let names: Vec<_> = result.entities.iter().map(|e| e.name.as_str()).collect();
802        assert!(names.contains(&"Rust"));
803        assert!(names.contains(&"PostgreSQL"));
804    }
805
806    #[test]
807    fn test_fallback_no_match() {
808        let service = EntityExtractorService::without_llm(Domain::for_user());
809        let result = service.extract("Hello world").unwrap();
810
811        assert!(result.used_fallback);
812        assert!(result.entities.is_empty());
813        assert!(result.warnings.len() >= 2);
814    }
815
816    #[test]
817    fn test_empty_input() {
818        let service = EntityExtractorService::without_llm(Domain::for_user());
819        let result = service.extract("").unwrap();
820
821        assert!(result.entities.is_empty());
822        assert!(!result.used_fallback);
823    }
824
825    #[test]
826    fn test_to_graph_entities() {
827        let service = EntityExtractorService::without_llm(Domain::for_user());
828        let result = ExtractionResult {
829            entities: vec![ExtractedEntity {
830                name: "Alice".to_string(),
831                entity_type: "Person".to_string(),
832                confidence: 0.9,
833                aliases: vec!["A".to_string()],
834                description: Some("A person".to_string()),
835            }],
836            relationships: Vec::new(),
837            used_fallback: false,
838            warnings: Vec::new(),
839        };
840
841        let entities = service.to_graph_entities(&result);
842        assert_eq!(entities.len(), 1);
843        assert_eq!(entities[0].name, "Alice");
844        assert_eq!(entities[0].entity_type, EntityType::Person);
845        assert!((entities[0].confidence - 0.9).abs() < f32::EPSILON);
846    }
847
848    #[test]
849    fn test_min_confidence_threshold() {
850        let service =
851            EntityExtractorService::without_llm(Domain::for_user()).with_min_confidence(0.8);
852        assert!((service.min_confidence - 0.8).abs() < f32::EPSILON);
853    }
854
855    #[test]
856    fn test_inference_result_default() {
857        let result = InferenceResult::default();
858        assert!(result.relationships.is_empty());
859        assert!(!result.used_fallback);
860        assert!(result.warnings.is_empty());
861    }
862
863    #[test]
864    fn test_infer_relationships_empty() {
865        let service = EntityExtractorService::without_llm(Domain::for_user());
866        let result = service.infer_relationships(&[]).unwrap();
867
868        assert!(result.relationships.is_empty());
869        assert!(!result.used_fallback);
870    }
871
872    #[test]
873    fn test_infer_fallback_with_matching_entities() {
874        let service = EntityExtractorService::without_llm(Domain::for_user());
875
876        let entities = vec![
877            Entity::new(EntityType::Technology, "Rust", Domain::for_user()),
878            Entity::new(EntityType::Technology, "cargo", Domain::for_user()),
879        ];
880
881        let result = service.infer_relationships(&entities).unwrap();
882
883        assert!(result.used_fallback);
884        assert_eq!(result.relationships.len(), 1);
885        assert_eq!(result.relationships[0].from, "Rust");
886        assert_eq!(result.relationships[0].to, "cargo");
887        assert_eq!(result.relationships[0].relationship_type, "Uses");
888    }
889
890    #[test]
891    fn test_infer_fallback_no_matching_pairs() {
892        let service = EntityExtractorService::without_llm(Domain::for_user());
893
894        let entities = vec![
895            Entity::new(EntityType::Person, "Alice", Domain::for_user()),
896            Entity::new(EntityType::Organization, "Acme", Domain::for_user()),
897        ];
898
899        let result = service.infer_relationships(&entities).unwrap();
900
901        assert!(result.used_fallback);
902        assert!(result.relationships.is_empty());
903    }
904
905    #[test]
906    fn test_format_entities_for_inference() {
907        let service = EntityExtractorService::without_llm(Domain::for_user());
908
909        let mut entity = Entity::new(EntityType::Technology, "Rust", Domain::for_user());
910        entity.aliases = vec!["rust-lang".to_string()];
911
912        let formatted = service.format_entities_for_inference(&[entity]);
913
914        assert!(formatted.contains("Rust"));
915        assert!(formatted.contains("Technology"));
916        assert!(formatted.contains("rust-lang"));
917    }
918
/// An inferred relationship whose endpoints are both present in the entity
/// map is converted into a graph relationship linking the matching entity ids.
#[test]
fn test_inferred_to_graph_relationships() {
    let extractor = EntityExtractorService::without_llm(Domain::for_user());

    let source = Entity::new(EntityType::Technology, "Rust", Domain::for_user());
    let target = Entity::new(EntityType::Technology, "cargo", Domain::for_user());

    // Name -> entity lookup used to resolve relationship endpoints.
    let lookup: HashMap<String, Entity> = HashMap::from([
        ("Rust".to_string(), source.clone()),
        ("cargo".to_string(), target.clone()),
    ]);

    let edge = InferredRelationship {
        from: "Rust".to_string(),
        to: "cargo".to_string(),
        relationship_type: "Uses".to_string(),
        confidence: 0.8,
        reasoning: Some("Rust uses cargo as package manager".to_string()),
    };
    let inferred = InferenceResult {
        relationships: vec![edge],
        used_fallback: false,
        warnings: Vec::new(),
    };

    let converted = extractor.inferred_to_graph_relationships(&inferred, &lookup);

    assert_eq!(converted.len(), 1);
    let only = &converted[0];
    assert_eq!(only.from_entity, source.id);
    assert_eq!(only.to_entity, target.id);
    assert_eq!(only.relationship_type, RelationshipType::Uses);
    // The reasoning text is expected to be carried over as a property.
    assert!(only.properties.contains_key("reasoning"));
}
950
/// A relationship that points at an entity absent from the entity map must
/// be dropped rather than converted.
#[test]
fn test_inferred_to_graph_missing_entity() {
    let extractor = EntityExtractorService::without_llm(Domain::for_user());

    // Only "Rust" is registered; "cargo" is deliberately left out of the map.
    let mut known = HashMap::new();
    known.insert(
        "Rust".to_string(),
        Entity::new(EntityType::Technology, "Rust", Domain::for_user()),
    );

    let dangling = InferredRelationship {
        from: "Rust".to_string(),
        to: "cargo".to_string(),
        relationship_type: "Uses".to_string(),
        confidence: 0.8,
        reasoning: None,
    };
    let inferred = InferenceResult {
        relationships: vec![dangling],
        used_fallback: false,
        warnings: Vec::new(),
    };

    let converted = extractor.inferred_to_graph_relationships(&inferred, &known);

    // Relationships with an unresolved endpoint are skipped.
    assert!(converted.is_empty());
}
977
/// An extraction result with two entities and one relationship between them
/// converts into a single graph relationship of the matching type.
#[test]
fn test_to_graph_relationships() {
    let extractor = EntityExtractorService::without_llm(Domain::for_user());

    let alice = ExtractedEntity {
        name: "Alice".to_string(),
        entity_type: "Person".to_string(),
        confidence: 0.9,
        aliases: Vec::new(),
        description: None,
    };
    let acme = ExtractedEntity {
        name: "Acme".to_string(),
        entity_type: "Organization".to_string(),
        confidence: 0.85,
        aliases: Vec::new(),
        description: None,
    };
    let employment = ExtractedRelationship {
        from: "Alice".to_string(),
        to: "Acme".to_string(),
        relationship_type: "WorksAt".to_string(),
        confidence: 0.8,
        evidence: None,
    };

    let extraction = ExtractionResult {
        entities: vec![alice, acme],
        relationships: vec![employment],
        used_fallback: false,
        warnings: Vec::new(),
    };

    // Index the converted graph entities by name so endpoints can be resolved.
    let mut by_name: HashMap<String, Entity> = HashMap::new();
    for graph_entity in extractor.to_graph_entities(&extraction) {
        by_name.insert(graph_entity.name.clone(), graph_entity);
    }

    let converted = extractor.to_graph_relationships(&extraction, &by_name);

    assert_eq!(converted.len(), 1);
    assert_eq!(converted[0].relationship_type, RelationshipType::WorksAt);
}
1022
/// Fallback extraction picks up a framework, a language, and a tool that are
/// mentioned in the same sentence.
#[test]
fn test_extraction_with_various_technologies() {
    let extractor = EntityExtractorService::without_llm(Domain::for_user());
    let text = "We built this using React, TypeScript, and Docker containers";

    let extracted = extractor.extract(text).unwrap();

    assert!(extracted.used_fallback);
    // Each mentioned technology must appear among the extracted entity names.
    for expected in ["React", "TypeScript", "Docker"] {
        assert!(extracted.entities.iter().any(|entity| entity.name == expected));
    }
}
1036
/// Fallback extraction recognises database names mentioned in free text.
#[test]
fn test_extraction_with_databases() {
    let extractor = EntityExtractorService::without_llm(Domain::for_user());
    let text = "Our stack uses PostgreSQL for persistence and Redis for caching";

    let extracted = extractor.extract(text).unwrap();

    assert!(extracted.used_fallback);
    // Both databases must appear among the extracted entity names.
    for expected in ["PostgreSQL", "Redis"] {
        assert!(extracted.entities.iter().any(|entity| entity.name == expected));
    }
}
1049
/// An `ExtractedEntity` built with empty optional data keeps those fields
/// empty and preserves its name.
#[test]
fn test_extracted_entity_defaults() {
    let sample = ExtractedEntity {
        name: String::from("Test"),
        entity_type: String::from("Concept"),
        confidence: 0.5,
        aliases: vec![],
        description: None,
    };

    assert_eq!(sample.name, "Test");
    assert!(sample.aliases.is_empty());
    assert!(sample.description.is_none());
}
1064
/// An `InferredRelationship` keeps its endpoints and optional reasoning as
/// constructed.
#[test]
fn test_inferred_relationship_with_reasoning() {
    let explanation = String::from("Rust compiles through LLVM");
    let link = InferredRelationship {
        from: String::from("Rust"),
        to: String::from("LLVM"),
        relationship_type: String::from("Uses"),
        confidence: 0.9,
        reasoning: Some(explanation),
    };

    assert_eq!(link.from, "Rust");
    assert_eq!(link.to, "LLVM");
    assert!(link.reasoning.is_some());
}
1079
/// Graph entities produced by the service carry the domain the service was
/// constructed with.
#[test]
fn test_service_domain() {
    let user_domain = Domain::for_user();
    let service = EntityExtractorService::without_llm(user_domain);

    // "Python" is one of the built-in fallback technology patterns, so the
    // LLM-less path is expected to produce at least one entity for this text.
    let result = service.extract("Using Python for scripting").unwrap();
    let entities = service.to_graph_entities(&result);

    // Fail loudly when nothing was extracted; silently skipping the domain
    // check would let this test pass without verifying anything.
    assert!(
        !entities.is_empty(),
        "expected at least one entity to be extracted from text mentioning Python"
    );

    // Check every entity, not just the first one.
    assert!(
        entities.iter().all(|entity| entity.domain.is_user()),
        "every extracted entity should carry the service's user domain"
    );
}
1093
/// Integration test that actually calls the LLM API.
///
/// Reads `OPENAI_API_KEY` from the environment and performs real requests,
/// so it is ignored by default.
///
/// Run with: `RUST_LOG=debug cargo test test_llm_extraction_integration -- --ignored --nocapture`
///
/// # Panics
///
/// Panics if `OPENAI_API_KEY` is unset or empty, if the simple-content
/// extraction returns an error, or if it falls back to pattern matching.
/// The complex-content case is best-effort and only reports its outcome.
#[test]
#[ignore = "requires OPENAI_API_KEY and makes real API calls"]
fn test_llm_extraction_integration() {
    use crate::llm::{LlmHttpConfig, LlmProvider, OpenAiClient};
    use std::sync::Arc;

    // Initialize logging for debug output; the result is deliberately ignored
    // (e.g. a subscriber may already be installed by another test).
    let _ = tracing_subscriber::fmt()
        .with_max_level(tracing::Level::DEBUG)
        .with_test_writer()
        .try_init();

    // Check for API key
    let api_key = std::env::var("OPENAI_API_KEY").expect("OPENAI_API_KEY must be set");
    assert!(!api_key.is_empty(), "OPENAI_API_KEY cannot be empty");

    // Build client with longer timeout for debugging
    let http_config = LlmHttpConfig {
        timeout_ms: 60_000,
        connect_timeout_ms: 10_000,
    };
    let client = OpenAiClient::new()
        .with_api_key(&api_key)
        .with_model("gpt-5-nano-2025-08-07")
        .with_http_config(http_config);

    let llm: Arc<dyn LlmProvider> = Arc::new(client);

    let service = EntityExtractorService::with_shared_llm(llm, Domain::for_user());

    // Test 1: Simple content (should work)
    println!("\n=== Test 1: Simple content ===");
    let simple_content = "PostgreSQL database with Redis cache";
    match service.extract(simple_content) {
        Ok(r) => {
            println!(
                "Simple content result: used_fallback={}, entities={:?}",
                r.used_fallback, r.entities
            );
            assert!(
                !r.used_fallback,
                "Simple content should use LLM, not fallback"
            );
        },
        Err(e) => {
            println!("Simple content error: {e:?}");
            // An API failure is an ordinary, reachable test failure, so report
            // it with `panic!` rather than `unreachable!`.
            panic!("Simple content extraction failed: {e}");
        },
    }

    // Test 2: Complex code-heavy content (might trigger fallback)
    println!("\n=== Test 2: Complex content ===");
    let complex_content = r#"The EntityExtractorService::extract_with_llm() method at src/services/entity_extraction.rs:312 calls llm.complete_with_system(&system, &user) to process text. If the LLM fails, it falls back to extract_fallback() which uses TECH_PATTERNS regex matching against patterns like r"\b(Rust|Python|Go|Java)\b" defined in the static LAZY_STATIC block."#;
    // Best-effort: the outcome is only reported, never asserted, because
    // falling back (or erroring) on this input is an accepted possibility.
    match service.extract(complex_content) {
        Ok(r) => {
            println!(
                "Complex content result: used_fallback={}, entities={:?}, warnings={:?}",
                r.used_fallback, r.entities, r.warnings
            );
            if r.used_fallback {
                println!("WARNING: Complex content fell back to pattern matching!");
            }
        },
        Err(e) => {
            println!("Complex content error: {e:?}");
        },
    }
}
1167}