graphrag_core/entity/
llm_relationship_extractor.rs

1//! LLM-based relationship extraction following Microsoft GraphRAG methodology
2//!
3//! This module implements proper entity-relationship extraction using LLM prompts
4//! instead of simple pattern matching. It extracts entities and relationships
5//! together in a single LLM call, following the best practices from Microsoft
6//! GraphRAG and LightRAG.
7
8use crate::core::{Entity, EntityId, TextChunk, Result, GraphRAGError};
9use serde::{Deserialize, Serialize};
10
11/// Extracted relationship with metadata
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct ExtractedRelationship {
14    /// Source entity name in the relationship
15    pub source: String,
16    /// Target entity name in the relationship
17    pub target: String,
18    /// Type of relationship (e.g., DISCUSSES, TEACHES, WORKS_FOR)
19    pub relation_type: String,
20    /// Brief explanation of why the entities are related
21    pub description: String,
22    /// Confidence score between 0.0 and 1.0
23    pub strength: f32,
24}
25
26/// Combined extraction result from LLM
27#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct ExtractionResult {
29    /// List of entities extracted from text
30    pub entities: Vec<ExtractedEntity>,
31    /// List of relationships between entities
32    pub relationships: Vec<ExtractedRelationship>,
33}
34
35/// Extracted entity with metadata
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct ExtractedEntity {
38    /// Name of the entity
39    pub name: String,
40    /// Type of entity (e.g., PERSON, CONCEPT, LOCATION, ORGANIZATION)
41    #[serde(rename = "type")]
42    pub entity_type: String,
43    /// Optional description providing context about the entity
44    pub description: Option<String>,
45}
46
47/// LLM-based relationship extractor
48///
49/// This extractor uses a language model to identify entities and their relationships
50/// in text. It follows Microsoft GraphRAG methodology for high-quality extraction.
51pub struct LLMRelationshipExtractor {
52    /// Optional Ollama client for LLM-based extraction
53    pub ollama_client: Option<crate::ollama::OllamaClient>,
54}
55
56impl LLMRelationshipExtractor {
57    /// Create a new LLM relationship extractor
58    ///
59    /// # Arguments
60    ///
61    /// * `ollama_config` - Optional Ollama configuration. If provided and enabled,
62    ///   the extractor will use LLM-based extraction. Otherwise, it will fall back
63    ///   to pattern-based extraction.
64    ///
65    /// # Returns
66    ///
67    /// Returns a new extractor instance or an error if initialization fails.
68    pub fn new(ollama_config: Option<&crate::ollama::OllamaConfig>) -> Result<Self> {
69        let ollama_client = if let Some(config) = ollama_config {
70            if config.enabled {
71                let local_config = crate::ollama::OllamaConfig {
72                    enabled: config.enabled,
73                    host: config.host.clone(),
74                    port: config.port,
75                    chat_model: config.chat_model.clone(),
76                    embedding_model: config.embedding_model.clone(),
77                    timeout_seconds: config.timeout_seconds,
78                    max_retries: config.max_retries,
79                    fallback_to_hash: config.fallback_to_hash,
80                    max_tokens: None,
81                    temperature: None,
82                };
83
84                Some(crate::ollama::OllamaClient::new(local_config))
85            } else {
86                None
87            }
88        } else {
89            None
90        };
91
92        Ok(Self { ollama_client })
93    }
94
95    /// Build the extraction prompt following Microsoft GraphRAG methodology
96    ///
97    /// Creates a detailed prompt that instructs the LLM to extract both entities
98    /// and relationships from text, with specific guidelines for different text types.
99    ///
100    /// # Arguments
101    ///
102    /// * `chunk_content` - The text content to extract entities and relationships from
103    ///
104    /// # Returns
105    ///
106    /// A formatted prompt string ready to be sent to the LLM
107    fn build_extraction_prompt(&self, chunk_content: &str) -> String {
108        format!(
109            r#"You are an expert at extracting entities and relationships from text.
110Extract all meaningful entities and relationships from the provided text.
111
112**ENTITIES**: Extract people, concepts, locations, events, organizations, and other significant entities.
113For each entity provide:
114- name: the entity name
115- type: entity type (PERSON, CONCEPT, LOCATION, EVENT, ORGANIZATION, OBJECT, etc.)
116- description: brief description of the entity (optional)
117
118**RELATIONSHIPS**: For entities that interact or are related, extract their relationships.
119For each relationship provide:
120- source: source entity name (must match an entity name)
121- target: target entity name (must match an entity name)
122- type: relationship type (DISCUSSES, QUESTIONS, RESPONDS_TO, TEACHES, LOVES, ADMIRES, ARGUES_WITH, MENTIONS, WORKS_FOR, LOCATED_IN, etc.)
123- description: brief explanation of why they are related
124- strength: confidence score between 0.0 and 1.0
125
126**IMPORTANT GUIDELINES**:
1271. Extract relationships for entities that have meaningful connections
1282. Choose descriptive relationship types that capture the nature of the connection
1293. For philosophical/dialogue texts, use types like DISCUSSES, QUESTIONS, RESPONDS_TO
1304. For narrative texts, use types like MEETS, HELPS, OPPOSES, TRAVELS_WITH
1315. For technical texts, use types like IMPLEMENTS, DEPENDS_ON, EXTENDS
1326. Provide higher strength values (0.8-1.0) for explicit relationships
1337. Provide lower strength values (0.5-0.7) for implicit or inferred relationships
134
135**TEXT TO ANALYZE**:
136{chunk_content}
137
138**OUTPUT FORMAT** (JSON only, no other text):
139{{
140  "entities": [
141    {{"name": "Entity Name", "type": "PERSON", "description": "Brief description"}},
142    ...
143  ],
144  "relationships": [
145    {{"source": "Entity1", "target": "Entity2", "type": "DISCUSSES", "description": "Why they are related", "strength": 0.85}},
146    ...
147  ]
148}}
149
150Return ONLY valid JSON, nothing else."#,
151            chunk_content = chunk_content
152        )
153    }
154
155    /// Extract entities and relationships using LLM
156    ///
157    /// Uses the configured LLM to extract entities and their relationships from a text chunk.
158    /// The LLM analyzes the text and returns structured data with entities, their types,
159    /// and the relationships between them.
160    ///
161    /// # Arguments
162    ///
163    /// * `chunk` - The text chunk to process
164    ///
165    /// # Returns
166    ///
167    /// Returns an `ExtractionResult` containing entities and relationships, or an error
168    /// if the LLM is not configured or extraction fails.
169    ///
170    /// # Errors
171    ///
172    /// - Returns `GraphRAGError::Config` if Ollama client is not configured
173    /// - Returns `GraphRAGError::EntityExtraction` if LLM generation fails
174    pub async fn extract_with_llm(
175        &self,
176        chunk: &TextChunk,
177    ) -> Result<ExtractionResult> {
178        if let Some(client) = &self.ollama_client {
179            let prompt = self.build_extraction_prompt(&chunk.content);
180
181            #[cfg(feature = "tracing")]
182            tracing::debug!(
183                chunk_id = %chunk.id,
184                "Extracting entities and relationships with LLM"
185            );
186
187            match client.generate(&prompt).await {
188                Ok(response) => {
189                    // Parse LLM response as JSON
190                    let json_str = response.trim();
191
192                    // Extract JSON from response (LLM might add extra text)
193                    let json_str = if let Some(start) = json_str.find('{') {
194                        if let Some(end) = json_str.rfind('}') {
195                            &json_str[start..=end]
196                        } else {
197                            json_str
198                        }
199                    } else {
200                        json_str
201                    };
202
203                    match serde_json::from_str::<ExtractionResult>(json_str) {
204                        Ok(result) => {
205                            #[cfg(feature = "tracing")]
206                            tracing::info!(
207                                chunk_id = %chunk.id,
208                                entity_count = result.entities.len(),
209                                relationship_count = result.relationships.len(),
210                                "Successfully extracted entities and relationships"
211                            );
212                            Ok(result)
213                        }
214                        Err(_e) => {
215                            #[cfg(feature = "tracing")]
216                            tracing::warn!(
217                                chunk_id = %chunk.id,
218                                error = %_e,
219                                response = %json_str,
220                                "Failed to parse LLM response as JSON, falling back to entity-only extraction"
221                            );
222                            // Return empty result on parse failure
223                            Ok(ExtractionResult {
224                                entities: Vec::new(),
225                                relationships: Vec::new(),
226                            })
227                        }
228                    }
229                }
230                Err(e) => {
231                    #[cfg(feature = "tracing")]
232                    tracing::error!(
233                        chunk_id = %chunk.id,
234                        error = %e,
235                        "LLM extraction failed"
236                    );
237                    Err(GraphRAGError::EntityExtraction {
238                        message: format!("LLM extraction failed: {}", e),
239                    })
240                }
241            }
242        } else {
243            Err(GraphRAGError::Config {
244                message: "Ollama client not configured".to_string(),
245            })
246        }
247    }
248
249    /// Extract relationships between entities using improved co-occurrence logic
250    ///
251    /// This is a fallback method when LLM is not available. It identifies relationships
252    /// by analyzing entity co-occurrence patterns and contextual clues in the text.
253    ///
254    /// # Arguments
255    ///
256    /// * `entities` - List of all known entities
257    /// * `chunk` - The text chunk to analyze for relationships
258    ///
259    /// # Returns
260    ///
261    /// Returns a vector of tuples containing:
262    /// - Source entity ID
263    /// - Target entity ID
264    /// - Relationship type (string)
265    /// - Confidence score (0.0-1.0)
266    pub fn extract_relationships_fallback(
267        &self,
268        entities: &[Entity],
269        chunk: &TextChunk,
270    ) -> Vec<(EntityId, EntityId, String, f32)> {
271        let mut relationships = Vec::new();
272
273        // Get entities that appear in this chunk
274        let chunk_entities: Vec<&Entity> = entities
275            .iter()
276            .filter(|e| e.mentions.iter().any(|m| m.chunk_id == chunk.id))
277            .collect();
278
279        // Extract relationships between co-occurring entities
280        for i in 0..chunk_entities.len() {
281            for j in (i + 1)..chunk_entities.len() {
282                let entity1 = chunk_entities[i];
283                let entity2 = chunk_entities[j];
284
285                // Infer relationship with improved heuristics
286                if let Some((rel_type, confidence)) =
287                    self.infer_relationship_with_context(entity1, entity2, &chunk.content)
288                {
289                    relationships.push((
290                        entity1.id.clone(),
291                        entity2.id.clone(),
292                        rel_type,
293                        confidence,
294                    ));
295                }
296            }
297        }
298
299        relationships
300    }
301
302    /// Infer relationship type with improved context analysis
303    ///
304    /// Analyzes the context around two entities to determine the type and strength
305    /// of their relationship. Uses entity types and contextual patterns to make
306    /// intelligent inferences.
307    ///
308    /// # Arguments
309    ///
310    /// * `entity1` - First entity in the potential relationship
311    /// * `entity2` - Second entity in the potential relationship
312    /// * `context` - The text context containing both entities
313    ///
314    /// # Returns
315    ///
316    /// Returns `Some((relationship_type, confidence))` if a relationship is detected,
317    /// or `None` if entities are too far apart or no clear relationship exists.
318    fn infer_relationship_with_context(
319        &self,
320        entity1: &Entity,
321        entity2: &Entity,
322        context: &str,
323    ) -> Option<(String, f32)> {
324        let context_lower = context.to_lowercase();
325        let e1_name_lower = entity1.name.to_lowercase();
326        let e2_name_lower = entity2.name.to_lowercase();
327
328        // Find positions of entities in text
329        let e1_pos = context_lower.find(&e1_name_lower)?;
330        let e2_pos = context_lower.find(&e2_name_lower)?;
331
332        // Extract context window between entities (max 200 chars)
333        let start = e1_pos.min(e2_pos);
334        let end = (e1_pos.max(e2_pos) + 50).min(context.len());
335        let window = &context_lower[start..end];
336
337        // Analyze relationship based on context and entity types
338        match (&entity1.entity_type[..], &entity2.entity_type[..]) {
339            // Person-Person relationships
340            ("PERSON", "PERSON") | ("CHARACTER", "CHARACTER") | ("SPEAKER", "SPEAKER") => {
341                if window.contains("said") || window.contains("replied") || window.contains("responded") {
342                    Some(("RESPONDS_TO".to_string(), 0.85))
343                } else if window.contains("asked") || window.contains("questioned") {
344                    Some(("QUESTIONS".to_string(), 0.85))
345                } else if window.contains("taught") || window.contains("explained") {
346                    Some(("TEACHES".to_string(), 0.80))
347                } else if window.contains("discussed") || window.contains("spoke about") {
348                    Some(("DISCUSSES".to_string(), 0.80))
349                } else if window.contains("loved") || window.contains("admired") {
350                    Some(("ADMIRES".to_string(), 0.85))
351                } else if window.contains("argued") || window.contains("disagreed") {
352                    Some(("ARGUES_WITH".to_string(), 0.85))
353                } else if window.contains("met") || window.contains("encountered") {
354                    Some(("MEETS".to_string(), 0.75))
355                } else {
356                    // Default for co-occurring persons
357                    Some(("INTERACTS_WITH".to_string(), 0.60))
358                }
359            }
360
361            // Person-Concept relationships
362            ("PERSON", "CONCEPT") | ("CHARACTER", "CONCEPT") | ("SPEAKER", "CONCEPT") => {
363                if window.contains("discussed") || window.contains("spoke of") {
364                    Some(("DISCUSSES".to_string(), 0.80))
365                } else if window.contains("defined") || window.contains("described") {
366                    Some(("DEFINES".to_string(), 0.85))
367                } else if window.contains("questioned") || window.contains("wondered about") {
368                    Some(("QUESTIONS".to_string(), 0.80))
369                } else {
370                    Some(("MENTIONS".to_string(), 0.70))
371                }
372            }
373
374            // Reverse: Concept-Person
375            ("CONCEPT", "PERSON") | ("CONCEPT", "CHARACTER") | ("CONCEPT", "SPEAKER") => {
376                Some(("DISCUSSED_BY".to_string(), 0.70))
377            }
378
379            // Person-Organization relationships
380            ("PERSON", "ORGANIZATION") | ("ORGANIZATION", "PERSON") => {
381                if window.contains("works for") || window.contains("employed by") {
382                    Some(("WORKS_FOR".to_string(), 0.90))
383                } else if window.contains("founded") || window.contains("CEO") || window.contains("leads") {
384                    Some(("LEADS".to_string(), 0.90))
385                } else {
386                    Some(("ASSOCIATED_WITH".to_string(), 0.65))
387                }
388            }
389
390            // Person-Location relationships
391            ("PERSON", "LOCATION") | ("CHARACTER", "LOCATION") => {
392                if window.contains("born in") || window.contains("from") {
393                    Some(("BORN_IN".to_string(), 0.90))
394                } else if window.contains("lives in") || window.contains("resides in") {
395                    Some(("LIVES_IN".to_string(), 0.85))
396                } else if window.contains("traveled to") || window.contains("visited") {
397                    Some(("VISITED".to_string(), 0.80))
398                } else {
399                    Some(("LOCATED_IN".to_string(), 0.70))
400                }
401            }
402
403            // Organization-Location relationships
404            ("ORGANIZATION", "LOCATION") | ("LOCATION", "ORGANIZATION") => {
405                if window.contains("headquartered") || window.contains("based in") {
406                    Some(("HEADQUARTERED_IN".to_string(), 0.90))
407                } else {
408                    Some(("LOCATED_IN".to_string(), 0.75))
409                }
410            }
411
412            // Concept-Concept relationships
413            ("CONCEPT", "CONCEPT") => {
414                if window.contains("similar to") || window.contains("related to") {
415                    Some(("RELATED_TO".to_string(), 0.75))
416                } else if window.contains("opposite") || window.contains("contrasts with") {
417                    Some(("CONTRASTS_WITH".to_string(), 0.80))
418                } else {
419                    Some(("ASSOCIATED_WITH".to_string(), 0.60))
420                }
421            }
422
423            // Event relationships
424            ("PERSON", "EVENT") | ("CHARACTER", "EVENT") => {
425                Some(("PARTICIPATES_IN".to_string(), 0.75))
426            }
427            ("EVENT", "LOCATION") => {
428                Some(("OCCURS_IN".to_string(), 0.80))
429            }
430
431            // Default fallback
432            _ => {
433                // Only create relationship if entities are close together (within 100 chars)
434                if (e1_pos as i32 - e2_pos as i32).abs() < 100 {
435                    Some(("CO_OCCURS".to_string(), 0.50))
436                } else {
437                    None
438                }
439            }
440        }
441    }
442}
443
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{ChunkId, DocumentId};

    /// Extractor with no Ollama configuration, i.e. without an LLM client.
    fn extractor_without_llm() -> LLMRelationshipExtractor {
        LLMRelationshipExtractor::new(None).unwrap()
    }

    /// Convenience constructor for a `PERSON` entity with confidence 0.9.
    fn person(id: &str, name: &str) -> Entity {
        Entity::new(
            EntityId::new(id.to_string()),
            name.to_string(),
            "PERSON".to_string(),
            0.9,
        )
    }

    /// The prompt must carry both section keywords and the analysed text.
    #[test]
    fn test_prompt_generation() {
        let prompt = extractor_without_llm()
            .build_extraction_prompt("Socrates discusses love with Phaedrus.");

        for expected in [
            "entities",
            "relationships",
            "Socrates discusses love with Phaedrus",
        ] {
            assert!(prompt.contains(expected));
        }
    }

    /// Two persons named in the same chunk should yield a relationship.
    #[test]
    fn test_fallback_extraction() {
        let extractor = extractor_without_llm();

        let chunk = TextChunk::new(
            ChunkId::new("test".to_string()),
            DocumentId::new("doc".to_string()),
            "Socrates discussed love with Phaedrus in Athens.".to_string(),
            0,
            50,
        );

        // NOTE(review): the fallback only considers entities that have a mention
        // recorded for this chunk. Whether `Entity::new` records any mention is
        // not visible here — confirm, otherwise this assertion cannot hold.
        let entities = vec![
            person("person_socrates", "Socrates"),
            person("person_phaedrus", "Phaedrus"),
        ];

        let found = extractor.extract_relationships_fallback(&entities, &chunk);

        // Should extract at least one relationship
        assert!(!found.is_empty());
    }
}
graphrag_core/entity/llm_relationship_extractor.rs

graphrag_core/entity/
llm_relationship_extractor.rs