// Source: graphrag_core/entity/prompts.rs

//! Prompt templates for LLM-based entity and relationship extraction.
//!
//! Based on the Microsoft GraphRAG prompts, adapted to request structured JSON output.
4
use serde::{Deserialize, Serialize};
6
/// Entity extraction prompt template (Microsoft GraphRAG style).
///
/// Single-brace tokens are placeholders to be substituted before the prompt
/// is sent to a model:
///
/// - `{entity_types}`: the allowed entity types (appears twice)
/// - `{tuple_delimiter}`: separator used in the tuple formats of steps 1 and 2
/// - `{input_text}`: the document to analyse
///
/// The braces of the JSON example are written doubled (`{{` / `}}`), which
/// distinguishes them from placeholder braces.
// NOTE(review): steps 1-2 describe a delimiter-separated tuple format while
// step 3 asks for JSON output; confirm which one the response parser expects.
pub const ENTITY_EXTRACTION_PROMPT: &str = r#"-Goal-
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.

-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: One of the following types: [{entity_types}]
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)

2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)

3. Return output in JSON format with the following structure:
{{
  "entities": [
    {{
      "name": "entity name",
      "type": "entity type",
      "description": "entity description"
    }}
  ],
  "relationships": [
    {{
      "source": "source entity name",
      "target": "target entity name",
      "description": "relationship description",
      "strength": 0.8
    }}
  ]
}}

-Real Data-
######################
Entity Types: {entity_types}
Text: {input_text}
######################
Output:
"#;
52
/// Gleaning continuation prompt, used for additional extraction rounds.
///
/// Asks the model to re-read the text and return only entities and
/// relationships that were not reported before. Single-brace tokens are
/// placeholders:
///
/// - `{previous_entities}`: listing of the entities found so far
/// - `{previous_relationships}`: listing of the relationships found so far
/// - `{entity_types}`: the allowed entity types
/// - `{input_text}`: the original document
///
/// The braces of the JSON example are written doubled (`{{` / `}}`), which
/// distinguishes them from placeholder braces.
pub const GLEANING_CONTINUATION_PROMPT: &str = r#"-Goal-
You previously extracted entities and relationships from a text document. Review your previous extraction and the original text to identify any additional entities or relationships you may have missed in the first pass.

-Steps-
1. Review the entities you previously identified:
{previous_entities}

2. Review the relationships you previously identified:
{previous_relationships}

3. Carefully review the original text again and identify:
- Any entities you may have missed
- Any relationships between entities you may have overlooked
- Any entities that need better descriptions

4. Return ONLY the NEW entities and relationships you discovered in this pass, using the same JSON format:
{{
  "entities": [
    {{
      "name": "entity name",
      "type": "entity type",
      "description": "entity description"
    }}
  ],
  "relationships": [
    {{
      "source": "source entity name",
      "target": "target entity name",
      "description": "relationship description",
      "strength": 0.8
    }}
  ]
}}

If you found no additional entities or relationships, return empty arrays.

-Real Data-
######################
Entity Types: {entity_types}
Text: {input_text}
######################
Output:
"#;
97
/// Completion-check prompt used to decide whether extraction is complete.
///
/// Instructs the model to answer with only `YES` (extraction is complete) or
/// `NO` (significant items are still missing). Single-brace tokens are
/// placeholders:
///
/// - `{input_text}`: the original document
/// - `{entity_count}` / `{entities_summary}`: number and listing of entities
/// - `{relationship_count}` / `{relationships_summary}`: number and listing
///   of relationships
///
/// The template has no trailing newline: it ends right after the
/// `Answer (YES or NO):` cue.
pub const COMPLETION_CHECK_PROMPT: &str = r#"Based on the text below and the entities/relationships already extracted, are there any significant entities or relationships that have been missed?

Text:
{input_text}

Current Entities ({entity_count}):
{entities_summary}

Current Relationships ({relationship_count}):
{relationships_summary}

Think carefully about:
1. Are all important characters, people, organizations mentioned in the text captured?
2. Are all significant locations, places, settings identified?
3. Are all key events, objects, concepts extracted?
4. Are all meaningful relationships between entities documented?

Respond with ONLY "YES" if the extraction is complete and thorough, or "NO" if there are still significant entities or relationships missing.

Answer (YES or NO):"#;
119
/// JSON schema describing the expected entity extraction output.
///
/// The root object requires two arrays:
///
/// - `entities`: objects with required string fields `name`, `type` and
///   `description`
/// - `relationships`: objects with required string fields `source`, `target`
///   and `description`, plus a required numeric `strength`
pub const ENTITY_EXTRACTION_JSON_SCHEMA: &str = r#"{
  "type": "object",
  "properties": {
    "entities": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "type": {"type": "string"},
          "description": {"type": "string"}
        },
        "required": ["name", "type", "description"]
      }
    },
    "relationships": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "source": {"type": "string"},
          "target": {"type": "string"},
          "description": {"type": "string"},
          "strength": {"type": "number"}
        },
        "required": ["source", "target", "description", "strength"]
      }
    }
  },
  "required": ["entities", "relationships"]
}"#;
152
153/// Structured extraction output from LLM entity and relationship analysis.
154///
155/// This structure contains the results from LLM-based entity extraction,
156/// including both discovered entities and their relationships.
157#[derive(Debug, Clone, Serialize, Deserialize)]
158pub struct ExtractionOutput {
159    /// List of entities extracted from the text
160    pub entities: Vec<EntityData>,
161    /// List of relationships between extracted entities
162    pub relationships: Vec<RelationshipData>,
163}
164
165/// Represents an entity extracted from text with its metadata.
166///
167/// Contains the entity's name, type classification, and a description
168/// of its role or significance in the context.
169#[derive(Debug, Clone, Serialize, Deserialize)]
170pub struct EntityData {
171    /// The name/identifier of the extracted entity
172    pub name: String,
173    /// The type/category of the entity (e.g., "PERSON", "ORGANIZATION", "CONCEPT")
174    #[serde(rename = "type")]
175    pub entity_type: String,
176    /// Description of the entity's role or significance in the context
177    pub description: String,
178}
179
180/// Represents a relationship between two extracted entities.
181///
182/// Defines how entities are connected with a description and strength
183/// indicating the relationship's importance or confidence.
184#[derive(Debug, Clone, Serialize, Deserialize)]
185pub struct RelationshipData {
186    /// The source entity in the relationship
187    pub source: String,
188    /// The target entity in the relationship
189    pub target: String,
190    /// Description of the relationship type and context
191    pub description: String,
192    /// Strength/confidence score of the relationship (0.0-1.0)
193    pub strength: f64,
194}
195
196/// Prompt builder for entity extraction
197pub struct PromptBuilder {
198    entity_types: Vec<String>,
199    tuple_delimiter: String,
200}
201
202impl PromptBuilder {
203    /// Create a new prompt builder
204    pub fn new(entity_types: Vec<String>) -> Self {
205        Self {
206            entity_types,
207            tuple_delimiter: "|".to_string(),
208        }
209    }
210
211    /// Build initial entity extraction prompt
212    pub fn build_extraction_prompt(&self, text: &str) -> String {
213        let entity_types_str = self.entity_types.join(", ");
214
215        ENTITY_EXTRACTION_PROMPT
216            .replace("{entity_types}", &entity_types_str)
217            .replace("{tuple_delimiter}", &self.tuple_delimiter)
218            .replace("{input_text}", text)
219    }
220
221    /// Build gleaning continuation prompt
222    pub fn build_continuation_prompt(
223        &self,
224        text: &str,
225        previous_entities: &[EntityData],
226        previous_relationships: &[RelationshipData],
227    ) -> String {
228        let entity_types_str = self.entity_types.join(", ");
229
230        // Format previous entities for display
231        let entities_summary = previous_entities
232            .iter()
233            .map(|e| format!("- {} ({}): {}", e.name, e.entity_type, e.description))
234            .collect::<Vec<_>>()
235            .join("\n");
236
237        // Format previous relationships for display
238        let relationships_summary = previous_relationships
239            .iter()
240            .map(|r| format!("- {} -> {}: {} (strength: {:.2})", r.source, r.target, r.description, r.strength))
241            .collect::<Vec<_>>()
242            .join("\n");
243
244        GLEANING_CONTINUATION_PROMPT
245            .replace("{entity_types}", &entity_types_str)
246            .replace("{input_text}", text)
247            .replace("{previous_entities}", &entities_summary)
248            .replace("{previous_relationships}", &relationships_summary)
249    }
250
251    /// Build completion check prompt
252    pub fn build_completion_prompt(
253        &self,
254        text: &str,
255        entities: &[EntityData],
256        relationships: &[RelationshipData],
257    ) -> String {
258        // Create concise summary of entities
259        let entities_summary = entities
260            .iter()
261            .take(20)  // Limit to first 20 to keep prompt manageable
262            .map(|e| format!("- {} ({})", e.name, e.entity_type))
263            .collect::<Vec<_>>()
264            .join("\n");
265
266        let entities_summary = if entities.len() > 20 {
267            format!("{}...\n(showing 20 of {} entities)", entities_summary, entities.len())
268        } else {
269            entities_summary
270        };
271
272        // Create concise summary of relationships
273        let relationships_summary = relationships
274            .iter()
275            .take(20)  // Limit to first 20
276            .map(|r| format!("- {} -> {}", r.source, r.target))
277            .collect::<Vec<_>>()
278            .join("\n");
279
280        let relationships_summary = if relationships.len() > 20 {
281            format!("{}...\n(showing 20 of {} relationships)", relationships_summary, relationships.len())
282        } else {
283            relationships_summary
284        };
285
286        COMPLETION_CHECK_PROMPT
287            .replace("{input_text}", text)
288            .replace("{entity_count}", &entities.len().to_string())
289            .replace("{entities_summary}", &entities_summary)
290            .replace("{relationship_count}", &relationships.len().to_string())
291            .replace("{relationships_summary}", &relationships_summary)
292    }
293}
294
/// Unit tests for the prompt builder and the serde data types.
#[cfg(test)]
mod tests {
    use super::*;

    /// One entity fixture shared by the prompt tests.
    fn tom() -> EntityData {
        EntityData {
            name: "Tom".to_string(),
            entity_type: "PERSON".to_string(),
            description: "A young boy".to_string(),
        }
    }

    /// One relationship fixture shared by the prompt tests.
    fn tom_and_huck() -> RelationshipData {
        RelationshipData {
            source: "Tom".to_string(),
            target: "Huck".to_string(),
            description: "friends".to_string(),
            strength: 0.9,
        }
    }

    #[test]
    fn test_build_extraction_prompt() {
        let builder = PromptBuilder::new(vec![
            "PERSON".to_string(),
            "LOCATION".to_string(),
            "ORGANIZATION".to_string(),
        ]);

        let prompt = builder.build_extraction_prompt("Tom and Huck went to the cave.");

        assert!(prompt.contains("PERSON"));
        assert!(prompt.contains("LOCATION"));
        assert!(prompt.contains("ORGANIZATION"));
        assert!(prompt.contains("Entity Types: PERSON, LOCATION, ORGANIZATION"));
        assert!(prompt.contains("Text: Tom and Huck went to the cave."));
        assert!(prompt.contains("(\"entity\"|<entity_name>|<entity_type>|<entity_description>)"));

        // Every placeholder of the template must have been filled in.
        assert!(!prompt.contains("{entity_types}"));
        assert!(!prompt.contains("{tuple_delimiter}"));
        assert!(!prompt.contains("{input_text}"));
    }

    #[test]
    fn test_build_continuation_prompt() {
        let builder = PromptBuilder::new(vec!["PERSON".to_string()]);

        let prompt = builder.build_continuation_prompt(
            "Tom and Huck are best friends.",
            &[tom()],
            &[tom_and_huck()],
        );

        assert!(prompt.contains("- Tom (PERSON): A young boy"));
        assert!(prompt.contains("- Tom -> Huck: friends (strength: 0.90)"));
        assert!(prompt.contains("Text: Tom and Huck are best friends."));
        assert!(!prompt.contains("{previous_entities}"));
        assert!(!prompt.contains("{previous_relationships}"));
    }

    #[test]
    fn test_build_completion_prompt() {
        let builder = PromptBuilder::new(vec!["PERSON".to_string()]);

        let prompt = builder.build_completion_prompt("Test text", &[tom()], &[tom_and_huck()]);

        assert!(prompt.contains("Text:\nTest text"));
        assert!(prompt.contains("Current Entities (1):\n- Tom (PERSON)"));
        assert!(prompt.contains("Current Relationships (1):\n- Tom -> Huck"));
        assert!(prompt.contains("YES or NO"));
    }

    #[test]
    fn test_completion_prompt_truncates_long_lists() {
        let builder = PromptBuilder::new(vec!["PERSON".to_string()]);

        let entities: Vec<EntityData> = (0..25)
            .map(|i| EntityData {
                name: format!("Entity{}", i),
                entity_type: "PERSON".to_string(),
                description: String::new(),
            })
            .collect();

        let prompt = builder.build_completion_prompt("Test text", &entities, &[]);

        assert!(prompt.contains("Current Entities (25):"));
        assert!(prompt.contains("- Entity19 (PERSON)"));
        assert!(!prompt.contains("- Entity20 (PERSON)"));
        assert!(prompt.contains("(showing 20 of 25 entities)"));
        assert!(prompt.contains("Current Relationships (0):"));
    }

    #[test]
    fn test_extraction_output_serialization() {
        let output = ExtractionOutput {
            entities: vec![EntityData {
                name: "Tom Sawyer".to_string(),
                entity_type: "PERSON".to_string(),
                description: "The protagonist".to_string(),
            }],
            relationships: vec![RelationshipData {
                source: "Tom Sawyer".to_string(),
                target: "Huck Finn".to_string(),
                description: "best friends".to_string(),
                strength: 0.95,
            }],
        };

        let json = serde_json::to_string(&output).unwrap();
        assert!(json.contains("Tom Sawyer"));
        // The Rust field `entity_type` must be serialized under the key "type".
        assert!(json.contains("\"type\":\"PERSON\""));
        assert!(!json.contains("entity_type"));

        let deserialized: ExtractionOutput = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.entities.len(), 1);
        assert_eq!(deserialized.relationships.len(), 1);
        assert_eq!(deserialized.entities[0].name, "Tom Sawyer");
        assert_eq!(deserialized.entities[0].entity_type, "PERSON");
        assert_eq!(deserialized.relationships[0].target, "Huck Finn");
        assert_eq!(deserialized.relationships[0].strength, 0.95);
    }
}