graphrag-core 0.2.0

Core portable library for GraphRAG - works on native and WASM
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
//! Prompt templates for LLM-based entity and relationship extraction
//!
//! Based on Microsoft GraphRAG prompts with structured JSON output

use serde::{Deserialize, Serialize};

/// Entity extraction prompt template (Microsoft GraphRAG style).
///
/// The template asks the model to list entities and relationships found in a
/// text and to answer with a JSON object holding an `entities` array and a
/// `relationships` array.
///
/// Placeholders that appear in the template text:
/// - `{entity_types}`: the allowed entity types (used in step 1 and again in
///   the "Real Data" section)
/// - `{tuple_delimiter}`: separator shown in the tuple formats of steps 1 and 2
/// - `{input_text}`: the document text to analyse
///
/// The JSON example in step 3 is written with doubled braces (`{{` / `}}`).
/// NOTE(review): doubled braces look like format-string escapes; confirm that
/// whatever renders this template collapses them to single braces before the
/// prompt is sent to the model.
pub const ENTITY_EXTRACTION_PROMPT: &str = r#"-Goal-
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.

-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: One of the following types: [{entity_types}]
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)

2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)

3. Return output in JSON format with the following structure:
{{
  "entities": [
    {{
      "name": "entity name",
      "type": "entity type",
      "description": "entity description"
    }}
  ],
  "relationships": [
    {{
      "source": "source entity name",
      "target": "target entity name",
      "description": "relationship description",
      "strength": 0.8
    }}
  ]
}}

-Real Data-
######################
Entity Types: {entity_types}
Text: {input_text}
######################
Output:
"#;

/// Gleaning continuation prompt for additional extraction rounds.
///
/// Shows the model what it extracted previously and asks it to return ONLY
/// newly discovered entities and relationships, in the same JSON shape as the
/// initial extraction (empty arrays when nothing new is found).
///
/// Placeholders that appear in the template text:
/// - `{previous_entities}`: listing of the entities found so far
/// - `{previous_relationships}`: listing of the relationships found so far
/// - `{entity_types}`: the allowed entity types
/// - `{input_text}`: the original document text
///
/// The JSON example in step 4 is written with doubled braces (`{{` / `}}`).
/// NOTE(review): doubled braces look like format-string escapes; confirm that
/// whatever renders this template collapses them to single braces before the
/// prompt is sent to the model.
pub const GLEANING_CONTINUATION_PROMPT: &str = r#"-Goal-
You previously extracted entities and relationships from a text document. Review your previous extraction and the original text to identify any additional entities or relationships you may have missed in the first pass.

-Steps-
1. Review the entities you previously identified:
{previous_entities}

2. Review the relationships you previously identified:
{previous_relationships}

3. Carefully review the original text again and identify:
- Any entities you may have missed
- Any relationships between entities you may have overlooked
- Any entities that need better descriptions

4. Return ONLY the NEW entities and relationships you discovered in this pass, using the same JSON format:
{{
  "entities": [
    {{
      "name": "entity name",
      "type": "entity type",
      "description": "entity description"
    }}
  ],
  "relationships": [
    {{
      "source": "source entity name",
      "target": "target entity name",
      "description": "relationship description",
      "strength": 0.8
    }}
  ]
}}

If you found no additional entities or relationships, return empty arrays.

-Real Data-
######################
Entity Types: {entity_types}
Text: {input_text}
######################
Output:
"#;

/// Completion check prompt used to decide whether extraction is complete.
///
/// Asks the model for a bare `YES` (extraction is complete) or `NO`
/// (significant entities or relationships are still missing).
///
/// Placeholders that appear in the template text:
/// - `{input_text}`: the original document text
/// - `{entity_count}` / `{entities_summary}`: number and listing of the
///   entities extracted so far
/// - `{relationship_count}` / `{relationships_summary}`: number and listing of
///   the relationships extracted so far
///
/// Unlike the other templates, this one ends directly after
/// `Answer (YES or NO):` with no trailing newline.
pub const COMPLETION_CHECK_PROMPT: &str = r#"Based on the text below and the entities/relationships already extracted, are there any significant entities or relationships that have been missed?

Text:
{input_text}

Current Entities ({entity_count}):
{entities_summary}

Current Relationships ({relationship_count}):
{relationships_summary}

Think carefully about:
1. Are all important characters, people, organizations mentioned in the text captured?
2. Are all significant locations, places, settings identified?
3. Are all key events, objects, concepts extracted?
4. Are all meaningful relationships between entities documented?

Respond with ONLY "YES" if the extraction is complete and thorough, or "NO" if there are still significant entities or relationships missing.

Answer (YES or NO):"#;

/// JSON schema for the entity extraction output.
///
/// Describes an object with two required array properties:
/// - `entities`: objects with required string fields `name`, `type` and
///   `description`
/// - `relationships`: objects with required string fields `source`, `target`,
///   `description` and a required numeric field `strength`
///
/// The property names match the JSON example embedded in the extraction
/// prompts. This constant is plain JSON text: it contains no placeholders and
/// its braces are not doubled.
pub const ENTITY_EXTRACTION_JSON_SCHEMA: &str = r#"{
  "type": "object",
  "properties": {
    "entities": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "type": {"type": "string"},
          "description": {"type": "string"}
        },
        "required": ["name", "type", "description"]
      }
    },
    "relationships": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "source": {"type": "string"},
          "target": {"type": "string"},
          "description": {"type": "string"},
          "strength": {"type": "number"}
        },
        "required": ["source", "target", "description", "strength"]
      }
    }
  },
  "required": ["entities", "relationships"]
}"#;

/// Result of one LLM extraction pass, in deserialized form.
///
/// Mirrors the top-level JSON object requested by the extraction prompts:
/// an `entities` array alongside a `relationships` array.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct ExtractionOutput {
    /// Entities the model reported for the text.
    pub entities: Vec<EntityData>,
    /// Relationships the model reported between entities.
    pub relationships: Vec<RelationshipData>,
}

/// A single entity reported by the model.
///
/// Carries the entity's name, its type label and a free-text description.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct EntityData {
    /// Name (identifier) of the entity.
    pub name: String,
    /// Type label of the entity (e.g., "PERSON", "ORGANIZATION", "CONCEPT").
    ///
    /// Read from and written to the JSON key `type`.
    #[serde(rename = "type")]
    pub entity_type: String,
    /// Free-text description of the entity.
    ///
    /// Falls back to an empty string when the key is absent on deserialization.
    #[serde(default)]
    pub description: String,
}

/// A directed link between two entities reported by the model.
///
/// Pairs a source and a target entity name with a description of the link
/// and a numeric strength score.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct RelationshipData {
    /// Name of the entity the relationship starts from.
    pub source: String,
    /// Name of the entity the relationship points to.
    pub target: String,
    /// Free-text explanation of how the two entities are related.
    pub description: String,
    /// Strength/confidence score of the relationship (intended range 0.0-1.0;
    /// the range is not validated by this type).
    pub strength: f64,
}

/// Prompt builder for entity extraction.
///
/// Holds the settings that are substituted into the prompt templates of this
/// module. Construct it with [`PromptBuilder::new`].
#[derive(Debug, Clone)]
pub struct PromptBuilder {
    /// Entity type labels; joined with `", "` wherever a template refers to
    /// `{entity_types}`.
    entity_types: Vec<String>,
    /// Separator substituted for `{tuple_delimiter}` in the extraction prompt.
    tuple_delimiter: String,
}

impl PromptBuilder {
    /// Maximum number of entities (and of relationships) listed in the
    /// completion-check prompt; longer lists are cut off with a
    /// "showing N of M" note to keep the prompt manageable.
    const SUMMARY_LIMIT: usize = 20;

    /// Create a new prompt builder for the given entity types.
    ///
    /// The tuple delimiter is set to `"|"`.
    pub fn new(entity_types: Vec<String>) -> Self {
        Self {
            entity_types,
            tuple_delimiter: "|".to_string(),
        }
    }

    /// Build the initial entity extraction prompt for `text`.
    ///
    /// Fills `{entity_types}`, `{tuple_delimiter}` and `{input_text}` in
    /// [`ENTITY_EXTRACTION_PROMPT`].
    pub fn build_extraction_prompt(&self, text: &str) -> String {
        let entity_types = self.entity_types.join(", ");

        Self::render(
            ENTITY_EXTRACTION_PROMPT,
            &[
                ("entity_types", entity_types.as_str()),
                ("tuple_delimiter", self.tuple_delimiter.as_str()),
                ("input_text", text),
            ],
        )
    }

    /// Build the gleaning continuation prompt.
    ///
    /// Lists every previously extracted entity as `- name (type): description`
    /// and every relationship as
    /// `- source -> target: description (strength: 0.00)`, then fills
    /// [`GLEANING_CONTINUATION_PROMPT`] with those listings, the entity types
    /// and `text`.
    pub fn build_continuation_prompt(
        &self,
        text: &str,
        previous_entities: &[EntityData],
        previous_relationships: &[RelationshipData],
    ) -> String {
        let entity_types = self.entity_types.join(", ");

        // Format previous entities for display
        let entities_summary = previous_entities
            .iter()
            .map(|e| format!("- {} ({}): {}", e.name, e.entity_type, e.description))
            .collect::<Vec<_>>()
            .join("\n");

        // Format previous relationships for display
        let relationships_summary = previous_relationships
            .iter()
            .map(|r| {
                format!(
                    "- {} -> {}: {} (strength: {:.2})",
                    r.source, r.target, r.description, r.strength
                )
            })
            .collect::<Vec<_>>()
            .join("\n");

        Self::render(
            GLEANING_CONTINUATION_PROMPT,
            &[
                ("entity_types", entity_types.as_str()),
                ("input_text", text),
                ("previous_entities", entities_summary.as_str()),
                ("previous_relationships", relationships_summary.as_str()),
            ],
        )
    }

    /// Build the completion check prompt.
    ///
    /// The prompt reports the full entity and relationship counts but lists
    /// at most the first 20 of each (`- name (type)` and `- source -> target`).
    pub fn build_completion_prompt(
        &self,
        text: &str,
        entities: &[EntityData],
        relationships: &[RelationshipData],
    ) -> String {
        let entities_summary = Self::capped_summary(entities, "entities", |e: &EntityData| {
            format!("- {} ({})", e.name, e.entity_type)
        });
        let relationships_summary =
            Self::capped_summary(relationships, "relationships", |r: &RelationshipData| {
                format!("- {} -> {}", r.source, r.target)
            });
        let entity_count = entities.len().to_string();
        let relationship_count = relationships.len().to_string();

        Self::render(
            COMPLETION_CHECK_PROMPT,
            &[
                ("input_text", text),
                ("entity_count", entity_count.as_str()),
                ("entities_summary", entities_summary.as_str()),
                ("relationship_count", relationship_count.as_str()),
                ("relationships_summary", relationships_summary.as_str()),
            ],
        )
    }

    /// Render at most [`Self::SUMMARY_LIMIT`] items, one per line.
    ///
    /// When `items` is longer than the limit, `...` and a
    /// `(showing N of M <noun>)` note are appended.
    fn capped_summary<T>(items: &[T], noun: &str, line: impl Fn(&T) -> String) -> String {
        let listed = items
            .iter()
            .take(Self::SUMMARY_LIMIT)
            .map(line)
            .collect::<Vec<_>>()
            .join("\n");

        if items.len() > Self::SUMMARY_LIMIT {
            format!(
                "{}...\n(showing {} of {} {})",
                listed,
                Self::SUMMARY_LIMIT,
                items.len(),
                noun
            )
        } else {
            listed
        }
    }

    /// Fill `template` in a single left-to-right pass.
    ///
    /// - `{name}` is replaced by the value paired with `name` in `vars`.
    /// - `{{` and `}}` are collapsed to `{` and `}`: the templates write the
    ///   braces of their JSON example in this escaped form, and the model
    ///   should see ordinary JSON.
    /// - Any other brace, including a `{name}` that is not in `vars`, is
    ///   copied through unchanged.
    ///
    /// Substituted values are never re-scanned. A chain of `str::replace`
    /// calls would rewrite placeholder-looking text inside an already
    /// inserted value (for example a document that literally contains
    /// `{entity_count}`); copying values verbatim avoids that.
    fn render(template: &str, vars: &[(&str, &str)]) -> String {
        let value_bytes: usize = vars.iter().map(|&(_, value)| value.len()).sum();
        let mut out = String::with_capacity(template.len() + value_bytes);
        let mut rest = template;

        while let Some(pos) = rest.find(|c: char| matches!(c, '{' | '}')) {
            out.push_str(&rest[..pos]);
            let tail = &rest[pos..];

            if let Some(after) = tail.strip_prefix("{{") {
                out.push('{');
                rest = after;
            } else if let Some(after) = tail.strip_prefix("}}") {
                out.push('}');
                rest = after;
            } else if let Some((value, after)) = vars.iter().find_map(|&(name, value)| {
                let after = tail
                    .strip_prefix('{')?
                    .strip_prefix(name)?
                    .strip_prefix('}')?;
                Some((value, after))
            }) {
                out.push_str(value);
                rest = after;
            } else {
                // Lone brace that is neither an escape nor a known
                // placeholder; braces are one byte wide, so splitting at 1
                // stays on a character boundary.
                let (brace, after) = tail.split_at(1);
                out.push_str(brace);
                rest = after;
            }
        }

        out.push_str(rest);
        out
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Entity fixture shared by the prompt tests.
    fn tom() -> EntityData {
        EntityData {
            name: "Tom".to_string(),
            entity_type: "PERSON".to_string(),
            description: "A young boy".to_string(),
        }
    }

    /// Relationship fixture shared by the prompt tests.
    fn tom_and_huck() -> RelationshipData {
        RelationshipData {
            source: "Tom".to_string(),
            target: "Huck".to_string(),
            description: "friends".to_string(),
            strength: 0.9,
        }
    }

    /// Fail if any of the given `{placeholder}` markers survived rendering.
    fn assert_filled(prompt: &str, placeholders: &[&str]) {
        for placeholder in placeholders {
            assert!(
                !prompt.contains(*placeholder),
                "placeholder {} was not substituted",
                placeholder
            );
        }
    }

    #[test]
    fn test_build_extraction_prompt() {
        let builder = PromptBuilder::new(vec![
            "PERSON".to_string(),
            "LOCATION".to_string(),
            "ORGANIZATION".to_string(),
        ]);

        let prompt = builder.build_extraction_prompt("Tom and Huck went to the cave.");

        assert!(prompt.contains("PERSON"));
        assert!(prompt.contains("LOCATION"));
        assert!(prompt.contains("ORGANIZATION"));
        assert!(prompt.contains("Tom and Huck went to the cave."));

        // Types are joined with ", " and the default delimiter is "|".
        assert!(prompt.contains("Entity Types: PERSON, LOCATION, ORGANIZATION"));
        assert!(prompt.contains("[PERSON, LOCATION, ORGANIZATION]"));
        assert!(
            prompt.contains("(\"entity\"|<entity_name>|<entity_type>|<entity_description>)")
        );
        assert_filled(
            &prompt,
            &["{entity_types}", "{tuple_delimiter}", "{input_text}"],
        );
    }

    #[test]
    fn test_build_continuation_prompt() {
        let builder = PromptBuilder::new(vec!["PERSON".to_string()]);

        let previous_entities = vec![tom()];
        let previous_relationships = vec![tom_and_huck()];

        let prompt = builder.build_continuation_prompt(
            "Tom and Huck are best friends.",
            &previous_entities,
            &previous_relationships,
        );

        assert!(prompt.contains("Tom"));
        assert!(prompt.contains("Huck"));
        assert!(prompt.contains("friends"));

        // Exact line formats of the two listings.
        assert!(prompt.contains("- Tom (PERSON): A young boy"));
        assert!(prompt.contains("- Tom -> Huck: friends (strength: 0.90)"));
        assert!(prompt.contains("Entity Types: PERSON"));
        assert!(prompt.contains("Text: Tom and Huck are best friends."));
        assert_filled(
            &prompt,
            &[
                "{entity_types}",
                "{input_text}",
                "{previous_entities}",
                "{previous_relationships}",
            ],
        );
    }

    #[test]
    fn test_build_completion_prompt() {
        let builder = PromptBuilder::new(vec!["PERSON".to_string()]);

        let entities = vec![tom()];
        let relationships = vec![tom_and_huck()];

        let prompt = builder.build_completion_prompt("Test text", &entities, &relationships);

        assert!(prompt.contains("Tom"));
        assert!(prompt.contains("YES or NO"));

        assert!(prompt.contains("Test text"));
        assert!(prompt.contains("Current Entities (1):"));
        assert!(prompt.contains("- Tom (PERSON)"));
        assert!(prompt.contains("Current Relationships (1):"));
        assert!(prompt.contains("- Tom -> Huck"));
        // Short lists must not carry the truncation note.
        assert!(!prompt.contains("(showing"));
        assert_filled(
            &prompt,
            &[
                "{input_text}",
                "{entity_count}",
                "{entities_summary}",
                "{relationship_count}",
                "{relationships_summary}",
            ],
        );
    }

    #[test]
    fn test_completion_prompt_truncates_long_lists() {
        let builder = PromptBuilder::new(vec!["PERSON".to_string()]);

        let entities: Vec<EntityData> = (0..25)
            .map(|i| EntityData {
                name: format!("E{}", i),
                entity_type: "PERSON".to_string(),
                description: String::new(),
            })
            .collect();

        let prompt = builder.build_completion_prompt("Test text", &entities, &[]);

        // The count reflects every entity, the listing only the first 20.
        assert!(prompt.contains("Current Entities (25):"));
        assert!(prompt.contains("- E19 (PERSON)"));
        assert!(!prompt.contains("- E20 (PERSON)"));
        assert!(prompt.contains("(showing 20 of 25 entities)"));

        assert!(prompt.contains("Current Relationships (0):"));
        assert!(!prompt.contains("of 0 relationships"));
    }

    #[test]
    fn test_extraction_output_serialization() {
        let output = ExtractionOutput {
            entities: vec![EntityData {
                name: "Tom Sawyer".to_string(),
                entity_type: "PERSON".to_string(),
                description: "The protagonist".to_string(),
            }],
            relationships: vec![RelationshipData {
                source: "Tom Sawyer".to_string(),
                target: "Huck Finn".to_string(),
                description: "best friends".to_string(),
                strength: 0.95,
            }],
        };

        let json = serde_json::to_string(&output).unwrap();
        assert!(json.contains("Tom Sawyer"));
        assert!(json.contains("PERSON"));
        // `entity_type` is serialized under the key `type`.
        assert!(json.contains("\"type\":\"PERSON\""));
        assert!(!json.contains("entity_type"));

        let deserialized: ExtractionOutput = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.entities.len(), 1);
        assert_eq!(deserialized.relationships.len(), 1);
        assert_eq!(deserialized.entities[0].entity_type, "PERSON");
        assert_eq!(deserialized.relationships[0].target, "Huck Finn");
    }

    #[test]
    fn test_entity_description_defaults_to_empty() {
        let entity: EntityData =
            serde_json::from_str(r#"{"name":"Tom","type":"PERSON"}"#).unwrap();

        assert_eq!(entity.name, "Tom");
        assert_eq!(entity.entity_type, "PERSON");
        assert_eq!(entity.description, "");
    }
}