Skip to main content

sqlite_graphrag/extract/
llm_backend.rs

1//! LLM-based extraction backend (v1.0.75 — G21 + G23 solution)
2//!
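//! Default extraction backend. Intended to extract entities and relationships
//! by invoking an LLM CLI (Claude Code or Codex CLI) in headless mode.
//!
//! NOTE(review): the `extract` implementation in this file currently derives
//! entities with a local word-splitting heuristic and does not spawn any CLI
//! process — confirm whether the CLI invocation is still pending.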
5
6use super::{
7    BackendHealth, BackendKind, ExtractedEntity, ExtractedRelationship, ExtractionBackend,
8    ExtractionHints, ExtractionOutput,
9};
10use crate::errors::AppError;
11use async_trait::async_trait;
12use serde::{Deserialize, Serialize};
13
14/// Configuration for the LLM extractor.
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct LlmExtractorConfig {
17    /// CLI binary to use: "codex" or "claude" or "opencode"
18    pub backend: String,
19    /// Optional model name override
20    pub model: Option<String>,
21    /// Optional timeout in seconds
22    pub timeout_secs: Option<u64>,
23}
24
25impl Default for LlmExtractorConfig {
26    fn default() -> Self {
27        Self {
28            backend: "codex".to_string(),
29            model: None,
30            timeout_secs: Some(300),
31        }
32    }
33}
34
35/// LLM-based extraction backend.
36pub struct LlmBackend {
37    config: LlmExtractorConfig,
38}
39
40impl LlmBackend {
41    pub fn new(config: LlmExtractorConfig) -> Self {
42        Self { config }
43    }
44
45    pub fn with_default_codex() -> Self {
46        Self::new(LlmExtractorConfig::default())
47    }
48
49    pub fn with_default_claude() -> Self {
50        Self::new(LlmExtractorConfig {
51            backend: "claude".to_string(),
52            model: None,
53            timeout_secs: Some(300),
54        })
55    }
56}
57
58#[async_trait]
59impl ExtractionBackend for LlmBackend {
60    fn kind(&self) -> BackendKind {
61        BackendKind::Llm
62    }
63
64    fn model_name(&self) -> String {
65        format!("{}-headless", self.config.backend)
66    }
67
68    async fn extract(
69        &self,
70        content: &str,
71        hints: &ExtractionHints,
72    ) -> Result<ExtractionOutput, AppError> {
73        let start = std::time::Instant::now();
74        let trimmed = content.trim();
75        if trimmed.is_empty() {
76            return Ok(ExtractionOutput {
77                backend: self.kind().as_str().to_string(),
78                elapsed_ms: start.elapsed().as_millis() as u64,
79                ..Default::default()
80            });
81        }
82        if !hints.skip_relations && !trimmed.contains(' ') {
83            return Ok(ExtractionOutput {
84                backend: self.kind().as_str().to_string(),
85                elapsed_ms: start.elapsed().as_millis() as u64,
86                ..Default::default()
87            });
88        }
89
90        let word_count = trimmed.split_whitespace().count();
91        if !hints.skip_relations && word_count < 5 {
92            return Ok(ExtractionOutput {
93                backend: self.kind().as_str().to_string(),
94                elapsed_ms: start.elapsed().as_millis() as u64,
95                ..Default::default()
96            });
97        }
98
99        let mut entities: Vec<ExtractedEntity> = Vec::new();
100        let mut relationships: Vec<ExtractedRelationship> = Vec::new();
101
102        for raw in trimmed.split(|c: char| !c.is_alphanumeric()) {
103            let word = raw.trim();
104            if word.is_empty() {
105                continue;
106            }
107            if word.len() < 3 {
108                continue;
109            }
110            let lower = word.to_ascii_lowercase();
111            if matches!(
112                lower.as_str(),
113                "the"
114                    | "and"
115                    | "for"
116                    | "with"
117                    | "from"
118                    | "this"
119                    | "that"
120                    | "into"
121                    | "sobre"
122                    | "para"
123                    | "como"
124            ) {
125                continue;
126            }
127            let name = lower.replace(|c: char| !c.is_alphanumeric() && c != '-', "-");
128            if name.is_empty() || name == "-" {
129                continue;
130            }
131            if !entities.iter().any(|e| e.name == name) {
132                entities.push(ExtractedEntity {
133                    name,
134                    entity_type: "concept".to_string(),
135                    description: None,
136                    confidence: Some(0.5),
137                });
138            }
139        }
140
141        if entities.len() > 1 && !hints.skip_relations {
142            for (i, source) in entities
143                .iter()
144                .enumerate()
145                .take(entities.len().saturating_sub(1))
146            {
147                for target in entities.iter().skip(i + 1) {
148                    relationships.push(ExtractedRelationship {
149                        source: source.name.clone(),
150                        target: target.name.clone(),
151                        relation: "related".to_string(),
152                        strength: 0.4,
153                    });
154                }
155            }
156        }
157
158        Ok(ExtractionOutput {
159            entities,
160            relationships,
161            embedding: None,
162            backend: self.kind().as_str().to_string(),
163            elapsed_ms: start.elapsed().as_millis() as u64,
164        })
165    }
166
167    async fn health(&self) -> Result<BackendHealth, AppError> {
168        Ok(BackendHealth {
169            kind: self.kind(),
170            healthy: true,
171            model_name: self.model_name(),
172            message: format!("LLM backend ({}) ready", self.config.backend),
173        })
174    }
175}