1use crate::core::{Entity, EntityId, TextChunk, Result, GraphRAGError};
9use serde::{Deserialize, Serialize};
10
11#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct ExtractedRelationship {
14 pub source: String,
16 pub target: String,
18 pub relation_type: String,
20 pub description: String,
22 pub strength: f32,
24}
25
/// Combined output of a single LLM extraction call over one text chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionResult {
    /// Entities found in the chunk.
    pub entities: Vec<ExtractedEntity>,
    /// Relationships between those entities.
    pub relationships: Vec<ExtractedRelationship>,
}
34
/// One entity extracted from a text chunk by the LLM.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedEntity {
    /// Entity name as it appears in the text.
    pub name: String,
    /// Entity category (PERSON, CONCEPT, LOCATION, ...); the LLM emits it
    /// under the JSON key "type".
    #[serde(rename = "type")]
    pub entity_type: String,
    /// Optional short description of the entity.
    pub description: Option<String>,
}
46
/// Extracts entities and relationships from text chunks, preferring an LLM
/// (via Ollama) and offering a rule-based co-occurrence fallback.
pub struct LLMRelationshipExtractor {
    /// `None` when Ollama is not configured or not enabled; in that case
    /// `extract_with_llm` returns a `Config` error and callers should use
    /// `extract_relationships_fallback` instead.
    pub ollama_client: Option<crate::ollama::OllamaClient>,
}
55
56impl LLMRelationshipExtractor {
57 pub fn new(ollama_config: Option<&crate::ollama::OllamaConfig>) -> Result<Self> {
69 let ollama_client = if let Some(config) = ollama_config {
70 if config.enabled {
71 let local_config = crate::ollama::OllamaConfig {
72 enabled: config.enabled,
73 host: config.host.clone(),
74 port: config.port,
75 chat_model: config.chat_model.clone(),
76 embedding_model: config.embedding_model.clone(),
77 timeout_seconds: config.timeout_seconds,
78 max_retries: config.max_retries,
79 fallback_to_hash: config.fallback_to_hash,
80 max_tokens: None,
81 temperature: None,
82 };
83
84 Some(crate::ollama::OllamaClient::new(local_config))
85 } else {
86 None
87 }
88 } else {
89 None
90 };
91
92 Ok(Self { ollama_client })
93 }
94
95 fn build_extraction_prompt(&self, chunk_content: &str) -> String {
108 format!(
109 r#"You are an expert at extracting entities and relationships from text.
110Extract all meaningful entities and relationships from the provided text.
111
112**ENTITIES**: Extract people, concepts, locations, events, organizations, and other significant entities.
113For each entity provide:
114- name: the entity name
115- type: entity type (PERSON, CONCEPT, LOCATION, EVENT, ORGANIZATION, OBJECT, etc.)
116- description: brief description of the entity (optional)
117
118**RELATIONSHIPS**: For entities that interact or are related, extract their relationships.
119For each relationship provide:
120- source: source entity name (must match an entity name)
121- target: target entity name (must match an entity name)
122- type: relationship type (DISCUSSES, QUESTIONS, RESPONDS_TO, TEACHES, LOVES, ADMIRES, ARGUES_WITH, MENTIONS, WORKS_FOR, LOCATED_IN, etc.)
123- description: brief explanation of why they are related
124- strength: confidence score between 0.0 and 1.0
125
126**IMPORTANT GUIDELINES**:
1271. Extract relationships for entities that have meaningful connections
1282. Choose descriptive relationship types that capture the nature of the connection
1293. For philosophical/dialogue texts, use types like DISCUSSES, QUESTIONS, RESPONDS_TO
1304. For narrative texts, use types like MEETS, HELPS, OPPOSES, TRAVELS_WITH
1315. For technical texts, use types like IMPLEMENTS, DEPENDS_ON, EXTENDS
1326. Provide higher strength values (0.8-1.0) for explicit relationships
1337. Provide lower strength values (0.5-0.7) for implicit or inferred relationships
134
135**TEXT TO ANALYZE**:
136{chunk_content}
137
138**OUTPUT FORMAT** (JSON only, no other text):
139{{
140 "entities": [
141 {{"name": "Entity Name", "type": "PERSON", "description": "Brief description"}},
142 ...
143 ],
144 "relationships": [
145 {{"source": "Entity1", "target": "Entity2", "type": "DISCUSSES", "description": "Why they are related", "strength": 0.85}},
146 ...
147 ]
148}}
149
150Return ONLY valid JSON, nothing else."#,
151 chunk_content = chunk_content
152 )
153 }
154
155 pub async fn extract_with_llm(
175 &self,
176 chunk: &TextChunk,
177 ) -> Result<ExtractionResult> {
178 if let Some(client) = &self.ollama_client {
179 let prompt = self.build_extraction_prompt(&chunk.content);
180
181 #[cfg(feature = "tracing")]
182 tracing::debug!(
183 chunk_id = %chunk.id,
184 "Extracting entities and relationships with LLM"
185 );
186
187 match client.generate(&prompt).await {
188 Ok(response) => {
189 let json_str = response.trim();
191
192 let json_str = if let Some(start) = json_str.find('{') {
194 if let Some(end) = json_str.rfind('}') {
195 &json_str[start..=end]
196 } else {
197 json_str
198 }
199 } else {
200 json_str
201 };
202
203 match serde_json::from_str::<ExtractionResult>(json_str) {
204 Ok(result) => {
205 #[cfg(feature = "tracing")]
206 tracing::info!(
207 chunk_id = %chunk.id,
208 entity_count = result.entities.len(),
209 relationship_count = result.relationships.len(),
210 "Successfully extracted entities and relationships"
211 );
212 Ok(result)
213 }
214 Err(_e) => {
215 #[cfg(feature = "tracing")]
216 tracing::warn!(
217 chunk_id = %chunk.id,
218 error = %_e,
219 response = %json_str,
220 "Failed to parse LLM response as JSON, falling back to entity-only extraction"
221 );
222 Ok(ExtractionResult {
224 entities: Vec::new(),
225 relationships: Vec::new(),
226 })
227 }
228 }
229 }
230 Err(e) => {
231 #[cfg(feature = "tracing")]
232 tracing::error!(
233 chunk_id = %chunk.id,
234 error = %e,
235 "LLM extraction failed"
236 );
237 Err(GraphRAGError::EntityExtraction {
238 message: format!("LLM extraction failed: {}", e),
239 })
240 }
241 }
242 } else {
243 Err(GraphRAGError::Config {
244 message: "Ollama client not configured".to_string(),
245 })
246 }
247 }
248
249 pub fn extract_relationships_fallback(
267 &self,
268 entities: &[Entity],
269 chunk: &TextChunk,
270 ) -> Vec<(EntityId, EntityId, String, f32)> {
271 let mut relationships = Vec::new();
272
273 let chunk_entities: Vec<&Entity> = entities
275 .iter()
276 .filter(|e| e.mentions.iter().any(|m| m.chunk_id == chunk.id))
277 .collect();
278
279 for i in 0..chunk_entities.len() {
281 for j in (i + 1)..chunk_entities.len() {
282 let entity1 = chunk_entities[i];
283 let entity2 = chunk_entities[j];
284
285 if let Some((rel_type, confidence)) =
287 self.infer_relationship_with_context(entity1, entity2, &chunk.content)
288 {
289 relationships.push((
290 entity1.id.clone(),
291 entity2.id.clone(),
292 rel_type,
293 confidence,
294 ));
295 }
296 }
297 }
298
299 relationships
300 }
301
302 fn infer_relationship_with_context(
319 &self,
320 entity1: &Entity,
321 entity2: &Entity,
322 context: &str,
323 ) -> Option<(String, f32)> {
324 let context_lower = context.to_lowercase();
325 let e1_name_lower = entity1.name.to_lowercase();
326 let e2_name_lower = entity2.name.to_lowercase();
327
328 let e1_pos = context_lower.find(&e1_name_lower)?;
330 let e2_pos = context_lower.find(&e2_name_lower)?;
331
332 let start = e1_pos.min(e2_pos);
334 let end = (e1_pos.max(e2_pos) + 50).min(context.len());
335 let window = &context_lower[start..end];
336
337 match (&entity1.entity_type[..], &entity2.entity_type[..]) {
339 ("PERSON", "PERSON") | ("CHARACTER", "CHARACTER") | ("SPEAKER", "SPEAKER") => {
341 if window.contains("said") || window.contains("replied") || window.contains("responded") {
342 Some(("RESPONDS_TO".to_string(), 0.85))
343 } else if window.contains("asked") || window.contains("questioned") {
344 Some(("QUESTIONS".to_string(), 0.85))
345 } else if window.contains("taught") || window.contains("explained") {
346 Some(("TEACHES".to_string(), 0.80))
347 } else if window.contains("discussed") || window.contains("spoke about") {
348 Some(("DISCUSSES".to_string(), 0.80))
349 } else if window.contains("loved") || window.contains("admired") {
350 Some(("ADMIRES".to_string(), 0.85))
351 } else if window.contains("argued") || window.contains("disagreed") {
352 Some(("ARGUES_WITH".to_string(), 0.85))
353 } else if window.contains("met") || window.contains("encountered") {
354 Some(("MEETS".to_string(), 0.75))
355 } else {
356 Some(("INTERACTS_WITH".to_string(), 0.60))
358 }
359 }
360
361 ("PERSON", "CONCEPT") | ("CHARACTER", "CONCEPT") | ("SPEAKER", "CONCEPT") => {
363 if window.contains("discussed") || window.contains("spoke of") {
364 Some(("DISCUSSES".to_string(), 0.80))
365 } else if window.contains("defined") || window.contains("described") {
366 Some(("DEFINES".to_string(), 0.85))
367 } else if window.contains("questioned") || window.contains("wondered about") {
368 Some(("QUESTIONS".to_string(), 0.80))
369 } else {
370 Some(("MENTIONS".to_string(), 0.70))
371 }
372 }
373
374 ("CONCEPT", "PERSON") | ("CONCEPT", "CHARACTER") | ("CONCEPT", "SPEAKER") => {
376 Some(("DISCUSSED_BY".to_string(), 0.70))
377 }
378
379 ("PERSON", "ORGANIZATION") | ("ORGANIZATION", "PERSON") => {
381 if window.contains("works for") || window.contains("employed by") {
382 Some(("WORKS_FOR".to_string(), 0.90))
383 } else if window.contains("founded") || window.contains("CEO") || window.contains("leads") {
384 Some(("LEADS".to_string(), 0.90))
385 } else {
386 Some(("ASSOCIATED_WITH".to_string(), 0.65))
387 }
388 }
389
390 ("PERSON", "LOCATION") | ("CHARACTER", "LOCATION") => {
392 if window.contains("born in") || window.contains("from") {
393 Some(("BORN_IN".to_string(), 0.90))
394 } else if window.contains("lives in") || window.contains("resides in") {
395 Some(("LIVES_IN".to_string(), 0.85))
396 } else if window.contains("traveled to") || window.contains("visited") {
397 Some(("VISITED".to_string(), 0.80))
398 } else {
399 Some(("LOCATED_IN".to_string(), 0.70))
400 }
401 }
402
403 ("ORGANIZATION", "LOCATION") | ("LOCATION", "ORGANIZATION") => {
405 if window.contains("headquartered") || window.contains("based in") {
406 Some(("HEADQUARTERED_IN".to_string(), 0.90))
407 } else {
408 Some(("LOCATED_IN".to_string(), 0.75))
409 }
410 }
411
412 ("CONCEPT", "CONCEPT") => {
414 if window.contains("similar to") || window.contains("related to") {
415 Some(("RELATED_TO".to_string(), 0.75))
416 } else if window.contains("opposite") || window.contains("contrasts with") {
417 Some(("CONTRASTS_WITH".to_string(), 0.80))
418 } else {
419 Some(("ASSOCIATED_WITH".to_string(), 0.60))
420 }
421 }
422
423 ("PERSON", "EVENT") | ("CHARACTER", "EVENT") => {
425 Some(("PARTICIPATES_IN".to_string(), 0.75))
426 }
427 ("EVENT", "LOCATION") => {
428 Some(("OCCURS_IN".to_string(), 0.80))
429 }
430
431 _ => {
433 if (e1_pos as i32 - e2_pos as i32).abs() < 100 {
435 Some(("CO_OCCURS".to_string(), 0.50))
436 } else {
437 None
438 }
439 }
440 }
441 }
442}
443
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::{ChunkId, DocumentId};

    /// The generated prompt must mention both output sections and embed the
    /// chunk text verbatim.
    #[test]
    fn test_prompt_generation() {
        let extractor = LLMRelationshipExtractor::new(None).unwrap();
        let prompt = extractor.build_extraction_prompt("Socrates discusses love with Phaedrus.");

        for needle in [
            "entities",
            "relationships",
            "Socrates discusses love with Phaedrus",
        ] {
            assert!(prompt.contains(needle));
        }
    }

    /// Two PERSON entities mentioned in the same chunk should yield at least
    /// one relationship via the rule-based fallback.
    #[test]
    fn test_fallback_extraction() {
        let extractor = LLMRelationshipExtractor::new(None).unwrap();

        let chunk = TextChunk::new(
            ChunkId::new("test".to_string()),
            DocumentId::new("doc".to_string()),
            "Socrates discussed love with Phaedrus in Athens.".to_string(),
            0,
            50,
        );

        // Both test entities share the same type and confidence; build them
        // through a small helper instead of repeating the constructor call.
        let person = |id: &str, name: &str| {
            Entity::new(
                EntityId::new(id.to_string()),
                name.to_string(),
                "PERSON".to_string(),
                0.9,
            )
        };
        let entities = vec![
            person("person_socrates", "Socrates"),
            person("person_phaedrus", "Phaedrus"),
        ];

        let relationships = extractor.extract_relationships_fallback(&entities, &chunk);

        assert!(!relationships.is_empty());
    }
}