graphrag_core/core/
mod.rs

//! Core data structures and abstractions for GraphRAG
//!
//! This module contains the fundamental types, traits, and error handling
//! that power the GraphRAG system.
5
6pub mod error;
7pub mod metadata;
8
9// Registry requires async feature (uses storage)
10#[cfg(feature = "async")]
11pub mod registry;
12
13// Traits module requires async feature
14#[cfg(feature = "async")]
15pub mod traits;
16
17#[cfg(test)]
18pub mod test_traits;
19
20// Re-export key items for convenience
21pub use error::{ErrorContext, ErrorSeverity, GraphRAGError, Result};
22pub use metadata::ChunkMetadata;
23
24#[cfg(feature = "async")]
25pub use registry::{RegistryBuilder, ServiceConfig, ServiceContext, ServiceRegistry};
26
27// Traits require async feature
28#[cfg(feature = "async")]
29pub use traits::*;
30
31/// Core trait for text chunking strategies
32///
33/// This trait provides a simple interface for different chunking approaches.
34/// Implementations can range from simple text splitters to sophisticated
35/// AST-based code chunking strategies.
36///
37/// # Examples
38///
39/// ```rust
40/// use graphrag_core::{ChunkingStrategy, TextChunk};
41///
42/// struct SimpleChunker;
43///
44/// impl ChunkingStrategy for SimpleChunker {
45///     fn chunk(&self, text: &str) -> Vec<TextChunk> {
46///         // Simple implementation
47///         vec![]
48///     }
49/// }
50/// ```
51pub trait ChunkingStrategy: Send + Sync {
52    /// Chunk text into pieces following the strategy's logic
53    ///
54    /// # Arguments
55    /// * `text` - The input text to chunk
56    ///
57    /// # Returns
58    /// A vector of TextChunk objects representing the chunks
59    fn chunk(&self, text: &str) -> Vec<TextChunk>;
60}
61
62use indexmap::IndexMap;
63use petgraph::{graph::NodeIndex, Graph};
64use std::collections::HashMap;
65
66// PageRank-related imports are only available when the feature is enabled
67#[cfg(feature = "pagerank")]
68use sprs::CsMat;
69
/// Shorthand for what the adjacency-matrix builder returns: the sparse
/// matrix, the entity-to-index map, and the index-to-entity map, in that order.
/// Compiled only when the `pagerank` feature is enabled.
#[cfg(feature = "pagerank")]
type AdjacencyMatrixResult = (
    CsMat<f64>,
    HashMap<EntityId, usize>,
    HashMap<usize, EntityId>,
);
78
79/// Unique identifier for documents
80#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
81pub struct DocumentId(pub String);
82
83impl DocumentId {
84    /// Creates a new DocumentId from a string
85    pub fn new(id: String) -> Self {
86        Self(id)
87    }
88}
89
90impl std::fmt::Display for DocumentId {
91    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
92        write!(f, "{}", self.0)
93    }
94}
95
96impl From<String> for DocumentId {
97    fn from(s: String) -> Self {
98        Self(s)
99    }
100}
101
102impl From<DocumentId> for String {
103    fn from(id: DocumentId) -> Self {
104        id.0
105    }
106}
107
108/// Unique identifier for entities
109#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
110pub struct EntityId(pub String);
111
112impl EntityId {
113    /// Creates a new EntityId from a string
114    pub fn new(id: String) -> Self {
115        Self(id)
116    }
117}
118
119impl std::fmt::Display for EntityId {
120    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
121        write!(f, "{}", self.0)
122    }
123}
124
125impl From<String> for EntityId {
126    fn from(s: String) -> Self {
127        Self(s)
128    }
129}
130
131impl From<EntityId> for String {
132    fn from(id: EntityId) -> Self {
133        id.0
134    }
135}
136
137/// Unique identifier for text chunks
138#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
139pub struct ChunkId(pub String);
140
141impl ChunkId {
142    /// Creates a new ChunkId from a string
143    pub fn new(id: String) -> Self {
144        Self(id)
145    }
146}
147
148impl std::fmt::Display for ChunkId {
149    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150        write!(f, "{}", self.0)
151    }
152}
153
154impl From<String> for ChunkId {
155    fn from(s: String) -> Self {
156        Self(s)
157    }
158}
159
160impl From<ChunkId> for String {
161    fn from(id: ChunkId) -> Self {
162        id.0
163    }
164}
165
166/// A document in the system
167#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
168pub struct Document {
169    /// Unique identifier for the document
170    pub id: DocumentId,
171    /// Title of the document
172    pub title: String,
173    /// Full text content of the document
174    pub content: String,
175    /// Additional metadata key-value pairs
176    pub metadata: IndexMap<String, String>,
177    /// Text chunks extracted from the document
178    pub chunks: Vec<TextChunk>,
179}
180
181/// A chunk of text from a document
182#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
183pub struct TextChunk {
184    /// Unique identifier for the chunk
185    pub id: ChunkId,
186    /// ID of the parent document
187    pub document_id: DocumentId,
188    /// Text content of the chunk
189    pub content: String,
190    /// Starting character offset in the original document
191    pub start_offset: usize,
192    /// Ending character offset in the original document
193    pub end_offset: usize,
194    /// Optional vector embedding for the chunk
195    pub embedding: Option<Vec<f32>>,
196    /// List of entity IDs mentioned in this chunk
197    pub entities: Vec<EntityId>,
198    /// Semantic metadata for the chunk (chapter, keywords, summary, etc.)
199    pub metadata: ChunkMetadata,
200}
201
202/// An entity extracted from text
203#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
204pub struct Entity {
205    /// Unique identifier for the entity
206    pub id: EntityId,
207    /// Name or label of the entity
208    pub name: String,
209    /// Type or category of the entity (e.g., "person", "organization")
210    pub entity_type: String,
211    /// Confidence score for the entity extraction (0.0-1.0)
212    pub confidence: f32,
213    /// List of locations where this entity is mentioned
214    pub mentions: Vec<EntityMention>,
215    /// Optional vector embedding for the entity
216    pub embedding: Option<Vec<f32>>,
217}
218
219/// A mention of an entity in text
220#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
221pub struct EntityMention {
222    /// ID of the chunk containing this mention
223    pub chunk_id: ChunkId,
224    /// Starting character offset of the mention in the chunk
225    pub start_offset: usize,
226    /// Ending character offset of the mention in the chunk
227    pub end_offset: usize,
228    /// Confidence score for this specific mention (0.0-1.0)
229    pub confidence: f32,
230}
231
232/// Relationship between entities
233#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
234pub struct Relationship {
235    /// Source entity ID for the relationship
236    pub source: EntityId,
237    /// Target entity ID for the relationship
238    pub target: EntityId,
239    /// Type of relationship (e.g., "works_for", "located_in")
240    pub relation_type: String,
241    /// Confidence score for the relationship (0.0-1.0)
242    pub confidence: f32,
243    /// Chunk IDs providing context for this relationship
244    pub context: Vec<ChunkId>,
245}
246
247/// Knowledge graph containing entities and their relationships
248#[derive(Debug)]
249pub struct KnowledgeGraph {
250    graph: Graph<Entity, Relationship>,
251    entity_index: HashMap<EntityId, NodeIndex>,
252    documents: IndexMap<DocumentId, Document>,
253    chunks: IndexMap<ChunkId, TextChunk>,
254}
255
256impl KnowledgeGraph {
257    /// Create a new empty knowledge graph
258    pub fn new() -> Self {
259        Self {
260            graph: Graph::new(),
261            entity_index: HashMap::new(),
262            documents: IndexMap::new(),
263            chunks: IndexMap::new(),
264        }
265    }
266
267    /// Add a document to the knowledge graph
268    pub fn add_document(&mut self, document: Document) -> Result<()> {
269        let document_id = document.id.clone();
270
271        // Add chunks to the index
272        for chunk in &document.chunks {
273            self.chunks.insert(chunk.id.clone(), chunk.clone());
274        }
275
276        // Store the document
277        self.documents.insert(document_id, document);
278
279        Ok(())
280    }
281
282    /// Add an entity to the knowledge graph
283    pub fn add_entity(&mut self, entity: Entity) -> Result<NodeIndex> {
284        let entity_id = entity.id.clone();
285        let node_index = self.graph.add_node(entity);
286        self.entity_index.insert(entity_id, node_index);
287        Ok(node_index)
288    }
289
290    /// Add a relationship between entities
291    pub fn add_relationship(&mut self, relationship: Relationship) -> Result<()> {
292        let source_idx = self.entity_index.get(&relationship.source).ok_or_else(|| {
293            crate::GraphRAGError::GraphConstruction {
294                message: format!("Source entity {} not found", relationship.source),
295            }
296        })?;
297
298        let target_idx = self.entity_index.get(&relationship.target).ok_or_else(|| {
299            crate::GraphRAGError::GraphConstruction {
300                message: format!("Target entity {} not found", relationship.target),
301            }
302        })?;
303
304        self.graph.add_edge(*source_idx, *target_idx, relationship);
305        Ok(())
306    }
307
308    /// Add a chunk to the knowledge graph
309    pub fn add_chunk(&mut self, chunk: TextChunk) -> Result<()> {
310        self.chunks.insert(chunk.id.clone(), chunk);
311        Ok(())
312    }
313
314    /// Get an entity by ID
315    pub fn get_entity(&self, id: &EntityId) -> Option<&Entity> {
316        let node_idx = self.entity_index.get(id)?;
317        self.graph.node_weight(*node_idx)
318    }
319
320    /// Get a document by ID
321    pub fn get_document(&self, id: &DocumentId) -> Option<&Document> {
322        self.documents.get(id)
323    }
324
325    /// Get a chunk by ID
326    pub fn get_chunk(&self, id: &ChunkId) -> Option<&TextChunk> {
327        self.chunks.get(id)
328    }
329
330    /// Get a mutable reference to an entity by ID
331    pub fn get_entity_mut(&mut self, id: &EntityId) -> Option<&mut Entity> {
332        let node_idx = self.entity_index.get(id)?;
333        self.graph.node_weight_mut(*node_idx)
334    }
335
336    /// Get a mutable reference to a chunk by ID
337    pub fn get_chunk_mut(&mut self, id: &ChunkId) -> Option<&mut TextChunk> {
338        self.chunks.get_mut(id)
339    }
340
341    /// Get all entities
342    pub fn entities(&self) -> impl Iterator<Item = &Entity> {
343        self.graph.node_weights()
344    }
345
346    /// Get all entities (mutable)
347    pub fn entities_mut(&mut self) -> impl Iterator<Item = &mut Entity> {
348        self.graph.node_weights_mut()
349    }
350
351    /// Get all documents
352    pub fn documents(&self) -> impl Iterator<Item = &Document> {
353        self.documents.values()
354    }
355
356    /// Get all documents (mutable)
357    pub fn documents_mut(&mut self) -> impl Iterator<Item = &mut Document> {
358        self.documents.values_mut()
359    }
360
361    /// Get all chunks
362    pub fn chunks(&self) -> impl Iterator<Item = &TextChunk> {
363        self.chunks.values()
364    }
365
366    /// Get all chunks (mutable)
367    pub fn chunks_mut(&mut self) -> impl Iterator<Item = &mut TextChunk> {
368        self.chunks.values_mut()
369    }
370
371    /// Get neighbors of an entity
372    pub fn get_neighbors(&self, entity_id: &EntityId) -> Vec<(&Entity, &Relationship)> {
373        use petgraph::visit::EdgeRef;
374
375        if let Some(&node_idx) = self.entity_index.get(entity_id) {
376            self.graph
377                .edges(node_idx)
378                .filter_map(|edge| {
379                    let target_entity = self.graph.node_weight(edge.target())?;
380                    Some((target_entity, edge.weight()))
381                })
382                .collect()
383        } else {
384            Vec::new()
385        }
386    }
387
388    /// Get all relationships in the graph
389    pub fn get_all_relationships(&self) -> Vec<&Relationship> {
390        self.graph.edge_weights().collect()
391    }
392
393    /// Load knowledge graph from JSON file
394    pub fn load_from_json(file_path: &str) -> Result<Self> {
395        use std::fs;
396
397        // Read and parse JSON
398        let json_str = fs::read_to_string(file_path)?;
399        let json_data = json::parse(&json_str)
400            .map_err(|e| GraphRAGError::Config {
401                message: format!("Failed to parse JSON: {}", e),
402            })?;
403
404        let mut kg = KnowledgeGraph::new();
405
406        // Load entities
407        if json_data["entities"].is_array() {
408            for entity_obj in json_data["entities"].members() {
409                let id = EntityId::new(entity_obj["id"].as_str().unwrap_or("").to_string());
410                let name = entity_obj["name"].as_str().unwrap_or("").to_string();
411                let entity_type = entity_obj["type"].as_str().unwrap_or("").to_string();
412                let confidence = entity_obj["confidence"].as_f32().unwrap_or(0.0);
413
414                // Parse mentions
415                let mut mentions = Vec::new();
416                if entity_obj["mentions"].is_array() {
417                    for mention_obj in entity_obj["mentions"].members() {
418                        let mention = EntityMention {
419                            chunk_id: ChunkId::new(mention_obj["chunk_id"].as_str().unwrap_or("").to_string()),
420                            start_offset: mention_obj["start_offset"].as_usize().unwrap_or(0),
421                            end_offset: mention_obj["end_offset"].as_usize().unwrap_or(0),
422                            confidence: mention_obj["confidence"].as_f32().unwrap_or(0.0),
423                        };
424                        mentions.push(mention);
425                    }
426                }
427
428                let entity = Entity {
429                    id,
430                    name,
431                    entity_type,
432                    confidence,
433                    mentions,
434                    embedding: None, // Embeddings not stored in JSON
435                };
436
437                kg.add_entity(entity)?;
438            }
439        }
440
441        // Load relationships
442        if json_data["relationships"].is_array() {
443            for rel_obj in json_data["relationships"].members() {
444                let source = EntityId::new(rel_obj["source_id"].as_str().unwrap_or("").to_string());
445                let target = EntityId::new(rel_obj["target_id"].as_str().unwrap_or("").to_string());
446                let relation_type = rel_obj["relation_type"].as_str().unwrap_or("").to_string();
447                let confidence = rel_obj["confidence"].as_f32().unwrap_or(0.0);
448
449                let mut context = Vec::new();
450                if rel_obj["context_chunks"].is_array() {
451                    for chunk_id in rel_obj["context_chunks"].members() {
452                        if let Some(chunk_id_str) = chunk_id.as_str() {
453                            context.push(ChunkId::new(chunk_id_str.to_string()));
454                        }
455                    }
456                }
457
458                let relationship = Relationship {
459                    source,
460                    target,
461                    relation_type,
462                    confidence,
463                    context,
464                };
465
466                // Ignore errors if entities don't exist
467                let _ = kg.add_relationship(relationship);
468            }
469        }
470
471        // Load chunks with full content
472        if json_data["chunks"].is_array() {
473            for chunk_obj in json_data["chunks"].members() {
474                let id = ChunkId::new(chunk_obj["id"].as_str().unwrap_or("").to_string());
475                let document_id = DocumentId::new(chunk_obj["document_id"].as_str().unwrap_or("").to_string());
476                let start_offset = chunk_obj["start_offset"].as_usize().unwrap_or(0);
477                let end_offset = chunk_obj["end_offset"].as_usize().unwrap_or(0);
478
479                // Get full content
480                let content = chunk_obj["content"].as_str().unwrap_or("").to_string();
481
482                // Load entities list
483                let mut entities = Vec::new();
484                if chunk_obj["entities"].is_array() {
485                    for entity_id in chunk_obj["entities"].members() {
486                        if let Some(entity_id_str) = entity_id.as_str() {
487                            entities.push(EntityId::new(entity_id_str.to_string()));
488                        }
489                    }
490                }
491
492                let chunk = TextChunk {
493                    id,
494                    document_id,
495                    content,
496                    start_offset,
497                    end_offset,
498                    embedding: None, // Embeddings not stored in JSON
499                    entities,
500                    metadata: ChunkMetadata::default(),
501                };
502                kg.add_chunk(chunk)?;
503            }
504        }
505
506        // Load documents with full content
507        if json_data["documents"].is_array() {
508            for doc_obj in json_data["documents"].members() {
509                let id = DocumentId::new(doc_obj["id"].as_str().unwrap_or("").to_string());
510                let title = doc_obj["title"].as_str().unwrap_or("").to_string();
511                let content = doc_obj["content"].as_str().unwrap_or("").to_string();
512
513                // Parse metadata
514                let mut metadata = IndexMap::new();
515                if doc_obj["metadata"].is_object() {
516                    for (key, value) in doc_obj["metadata"].entries() {
517                        metadata.insert(key.to_string(), value.as_str().unwrap_or("").to_string());
518                    }
519                }
520
521                let document = Document {
522                    id,
523                    title,
524                    content,
525                    metadata,
526                    chunks: vec![], // Chunks are stored separately in the graph
527                };
528                kg.add_document(document)?;
529            }
530        }
531
532        Ok(kg)
533    }
534
535    /// Save knowledge graph to JSON file with optimized format for entities and relationships
536    pub fn save_to_json(&self, file_path: &str) -> Result<()> {
537        use std::fs;
538
539        // Create optimized JSON structure based on 2024 best practices
540        let mut json_data = json::JsonValue::new_object();
541
542        // Add metadata
543        json_data["metadata"] = json::object! {
544            "format_version" => "2.0",
545            "created_at" => chrono::Utc::now().to_rfc3339(),
546            "total_entities" => self.entities().count(),
547            "total_relationships" => self.get_all_relationships().len(),
548            "total_chunks" => self.chunks().count(),
549            "total_documents" => self.documents().count()
550        };
551
552        // Add entities with enhanced information
553        let mut entities_array = json::JsonValue::new_array();
554        for entity in self.entities() {
555            let mut entity_obj = json::object! {
556                "id" => entity.id.to_string(),
557                "name" => entity.name.clone(),
558                "type" => entity.entity_type.clone(),
559                "confidence" => entity.confidence,
560                "mentions_count" => entity.mentions.len()
561            };
562
563            // Add mentions with chunk references
564            let mut mentions_array = json::JsonValue::new_array();
565            for mention in &entity.mentions {
566                mentions_array
567                    .push(json::object! {
568                        "chunk_id" => mention.chunk_id.to_string(),
569                        "start_offset" => mention.start_offset,
570                        "end_offset" => mention.end_offset,
571                        "confidence" => mention.confidence
572                    })
573                    .unwrap();
574            }
575            entity_obj["mentions"] = mentions_array;
576
577            // Add embedding if present
578            if let Some(embedding) = &entity.embedding {
579                entity_obj["has_embedding"] = true.into();
580                entity_obj["embedding_dimension"] = embedding.len().into();
581                // Store only first few dimensions for debugging (full embedding too large for JSON)
582                let sample_embedding: Vec<f32> = embedding.iter().take(5).cloned().collect();
583                entity_obj["embedding_sample"] = sample_embedding.into();
584            } else {
585                entity_obj["has_embedding"] = false.into();
586            }
587
588            entities_array.push(entity_obj).unwrap();
589        }
590        json_data["entities"] = entities_array;
591
592        // Add relationships with detailed information
593        let mut relationships_array = json::JsonValue::new_array();
594        for relationship in self.get_all_relationships() {
595            let rel_obj = json::object! {
596                "source_id" => relationship.source.to_string(),
597                "target_id" => relationship.target.to_string(),
598                "relation_type" => relationship.relation_type.clone(),
599                "confidence" => relationship.confidence,
600                "context_chunks" => relationship.context.iter()
601                    .map(|c| c.to_string())
602                    .collect::<Vec<String>>()
603            };
604            relationships_array.push(rel_obj).unwrap();
605        }
606        json_data["relationships"] = relationships_array;
607
608        // Add chunks information with FULL content for persistence
609        let mut chunks_array = json::JsonValue::new_array();
610        for chunk in self.chunks() {
611            let mut chunk_obj = json::object! {
612                "id" => chunk.id.to_string(),
613                "document_id" => chunk.document_id.to_string(),
614                "content" => chunk.content.clone(),  // Full content for persistence
615                "start_offset" => chunk.start_offset,
616                "end_offset" => chunk.end_offset
617            };
618
619            // Add entities list
620            let entities_list: Vec<String> = chunk.entities.iter()
621                .map(|e| e.to_string())
622                .collect();
623            chunk_obj["entities"] = entities_list.into();
624
625            // Add embedding info
626            chunk_obj["has_embedding"] = chunk.embedding.is_some().into();
627            if let Some(embedding) = &chunk.embedding {
628                chunk_obj["embedding_dimension"] = embedding.len().into();
629            }
630
631            chunks_array.push(chunk_obj).unwrap();
632        }
633        json_data["chunks"] = chunks_array;
634
635        // Add documents information with FULL content for persistence
636        let mut documents_array = json::JsonValue::new_array();
637        for document in self.documents() {
638            let mut meta_obj = json::JsonValue::new_object();
639            for (key, value) in &document.metadata {
640                meta_obj[key] = value.clone().into();
641            }
642
643            let doc_obj = json::object! {
644                "id" => document.id.to_string(),
645                "title" => document.title.clone(),
646                "content" => document.content.clone(),  // Full content for persistence
647                "metadata" => meta_obj
648            };
649            documents_array.push(doc_obj).unwrap();
650        }
651        json_data["documents"] = documents_array;
652
653        // Save to file
654        fs::write(file_path, json_data.dump())?;
655        tracing::info!("Knowledge graph saved to {file_path}");
656
657        Ok(())
658    }
659
660    /// Find entities by name (case-insensitive partial match)
661    pub fn find_entities_by_name(&self, name: &str) -> impl Iterator<Item = &Entity> {
662        let name_lower = name.to_lowercase();
663        self.entities()
664            .filter(move |entity| entity.name.to_lowercase().contains(&name_lower))
665    }
666
667    /// Get entity by ID (string version for compatibility)
668    pub fn get_entity_by_id(&self, id: &str) -> Option<&Entity> {
669        let entity_id = EntityId::new(id.to_string());
670        self.get_entity(&entity_id)
671    }
672
673    /// Get entity relationships
674    pub fn get_entity_relationships(&self, entity_id: &str) -> impl Iterator<Item = &Relationship> {
675        let entity_id = EntityId::new(entity_id.to_string());
676        if let Some(&node_idx) = self.entity_index.get(&entity_id) {
677            self.graph
678                .edges(node_idx)
679                .map(|edge| edge.weight())
680                .collect::<Vec<_>>()
681                .into_iter()
682        } else {
683            Vec::new().into_iter()
684        }
685    }
686
687    /// Find relationship path between two entities (simplified BFS)
688    pub fn find_relationship_path(
689        &self,
690        entity1: &str,
691        entity2: &str,
692        _max_depth: usize,
693    ) -> Vec<String> {
694        let entity1_id = EntityId::new(entity1.to_string());
695        let entity2_id = EntityId::new(entity2.to_string());
696
697        let node1 = self.entity_index.get(&entity1_id);
698        let node2 = self.entity_index.get(&entity2_id);
699
700        if let (Some(&start), Some(&end)) = (node1, node2) {
701            // Simple path finding - just check direct connections for now
702            use petgraph::visit::EdgeRef;
703            for edge in self.graph.edges(start) {
704                if edge.target() == end {
705                    return vec![edge.weight().relation_type.clone()];
706                }
707            }
708        }
709
710        Vec::new() // No path found or entities don't exist
711    }
712
/// Creates a `PersonalizedPageRank` from the current graph, using the
/// default `PageRankConfig` and the adjacency matrix built from the
/// stored relationships.
/// Only available when pagerank feature is enabled
#[cfg(feature = "pagerank")]
pub fn build_pagerank_calculator(
    &self,
) -> Result<crate::graph::pagerank::PersonalizedPageRank> {
    use crate::graph::pagerank::{PageRankConfig, PersonalizedPageRank};

    let config = PageRankConfig::default();
    let (matrix, index_of, id_at) = self.build_adjacency_matrix()?;
    let calculator = PersonalizedPageRank::new(config, matrix, index_of, id_at);

    Ok(calculator)
}
729
/// Builds the sparse adjacency matrix used for PageRank, plus both index maps.
///
/// Entities are numbered in iteration order. Each relationship whose source
/// and target both have a number contributes one entry at
/// `(source, target)` holding its confidence as `f64`.
/// Only available when pagerank feature is enabled
#[cfg(feature = "pagerank")]
fn build_adjacency_matrix(&self) -> Result<AdjacencyMatrixResult> {
    let nodes: Vec<EntityId> = self.entities().map(|e| e.id.clone()).collect();
    let dimension = nodes.len();

    let mut node_mapping: HashMap<EntityId, usize> = HashMap::new();
    let mut reverse_mapping: HashMap<usize, EntityId> = HashMap::new();
    for (position, id) in nodes.iter().enumerate() {
        node_mapping.insert(id.clone(), position);
        reverse_mapping.insert(position, id.clone());
    }

    // One (row, column, value) triplet per relationship with known endpoints.
    let triplets: Vec<(usize, usize, f64)> = self
        .get_all_relationships()
        .into_iter()
        .filter_map(|relationship| {
            let row = *node_mapping.get(&relationship.source)?;
            let col = *node_mapping.get(&relationship.target)?;
            Some((row, col, f64::from(relationship.confidence)))
        })
        .collect();

    let matrix = if triplets.is_empty() {
        // No relationships: an all-zero matrix of the right shape.
        sprs::CsMat::zero((dimension, dimension))
    } else {
        // Assemble in triplet form, then convert to CSR.
        let mut triplet_mat = sprs::TriMat::new((dimension, dimension));
        for (row, col, val) in triplets {
            triplet_mat.add_triplet(row, col, val);
        }
        triplet_mat.to_csr()
    };

    Ok((matrix, node_mapping, reverse_mapping))
}
780
781    /// Count entities in the graph
782    pub fn entity_count(&self) -> usize {
783        self.entities().count()
784    }
785
786    /// Count relationships in the graph
787    pub fn relationship_count(&self) -> usize {
788        self.get_all_relationships().len()
789    }
790
791    /// Count documents in the graph
792    pub fn document_count(&self) -> usize {
793        self.documents().count()
794    }
795
796    /// Get all relationships as an iterator
797    pub fn relationships(&self) -> impl Iterator<Item = &Relationship> {
798        self.graph.edge_weights()
799    }
800
801    /// Clear all entities and relationships while preserving documents and chunks
802    ///
803    /// This is useful for rebuilding the graph from scratch without reloading documents.
804    pub fn clear_entities_and_relationships(&mut self) {
805        self.graph.clear();
806        self.entity_index.clear();
807        // Note: documents and chunks are preserved
808    }
809
/// Builds the undirected petgraph graph used for Leiden clustering.
///
/// Each entity becomes a node labelled with the entity's name; each
/// relationship whose endpoints are both known becomes an edge weighted by
/// the relationship's confidence.
/// Only available when leiden feature is enabled
#[cfg(feature = "leiden")]
pub fn to_leiden_graph(&self) -> petgraph::Graph<String, f32, petgraph::Undirected> {
    let mut leiden = Graph::new_undirected();

    // Entity ID -> node in the new graph (distinct from `self.entity_index`).
    let mut node_of = HashMap::new();
    for entity in self.entities() {
        let node = leiden.add_node(entity.name.clone());
        node_of.insert(entity.id.clone(), node);
    }

    for rel in self.get_all_relationships() {
        let endpoints = node_of.get(&rel.source).zip(node_of.get(&rel.target));
        if let Some((&src, &tgt)) = endpoints {
            leiden.add_edge(src, tgt, rel.confidence);
        }
    }

    leiden
}
836
/// Runs Leiden community detection over the entity graph.
/// Only available when leiden feature is enabled
///
/// The graph is first converted with `to_leiden_graph`, then handed to a
/// `LeidenCommunityDetector` built from `config`. The result's
/// `entity_mapping` is filled in from the current entities before returning.
///
/// # Arguments
/// * `config` - Leiden algorithm configuration
///
/// # Returns
/// The `HierarchicalCommunities` produced by the detector, with
/// `entity_mapping` set
///
/// # Errors
/// Propagates any error returned by the detector.
///
/// # Example
/// ```no_run
/// use graphrag_core::{KnowledgeGraph, graph::LeidenConfig};
///
/// let graph = KnowledgeGraph::new();
/// // ... build graph ...
///
/// let config = LeidenConfig {
///     max_cluster_size: 10,
///     resolution: 1.0,
///     ..Default::default()
/// };
///
/// let communities = graph.detect_hierarchical_communities(config).unwrap();
/// ```
#[cfg(feature = "leiden")]
pub fn detect_hierarchical_communities(
    &self,
    config: crate::graph::leiden::LeidenConfig,
) -> Result<crate::graph::leiden::HierarchicalCommunities> {
    use crate::graph::leiden::LeidenCommunityDetector;

    let clustering_input = self.to_leiden_graph();

    let mut result = LeidenCommunityDetector::new(config).detect_communities(&clustering_input)?;

    // Attach per-entity metadata keyed by entity name.
    result.entity_mapping = Some(self.build_entity_mapping());

    Ok(result)
}
880
881    /// Build mapping from entity names to entity metadata
882    /// Used to enrich hierarchical communities with entity information
883    #[cfg(feature = "leiden")]
884    fn build_entity_mapping(&self) -> HashMap<String, crate::graph::leiden::EntityMetadata> {
885        use crate::graph::leiden::EntityMetadata;
886
887        self.entities()
888            .map(|entity| {
889                let metadata = EntityMetadata {
890                    id: entity.id.to_string(),
891                    name: entity.name.clone(),
892                    entity_type: entity.entity_type.clone(),
893                    confidence: entity.confidence,
894                    mention_count: entity.mentions.len(),
895                };
896                (entity.name.clone(), metadata)
897            })
898            .collect()
899    }
900}
901
902impl Default for KnowledgeGraph {
903    fn default() -> Self {
904        Self::new()
905    }
906}
907
908impl Document {
909    /// Create a new document
910    pub fn new(id: DocumentId, title: String, content: String) -> Self {
911        Self {
912            id,
913            title,
914            content,
915            metadata: IndexMap::new(),
916            chunks: Vec::new(),
917        }
918    }
919
920    /// Add metadata to the document
921    pub fn with_metadata(mut self, key: String, value: String) -> Self {
922        self.metadata.insert(key, value);
923        self
924    }
925
926    /// Add chunks to the document
927    pub fn with_chunks(mut self, chunks: Vec<TextChunk>) -> Self {
928        self.chunks = chunks;
929        self
930    }
931}
932
933impl TextChunk {
934    /// Create a new text chunk
935    pub fn new(
936        id: ChunkId,
937        document_id: DocumentId,
938        content: String,
939        start_offset: usize,
940        end_offset: usize,
941    ) -> Self {
942        Self {
943            id,
944            document_id,
945            content,
946            start_offset,
947            end_offset,
948            embedding: None,
949            entities: Vec::new(),
950            metadata: ChunkMetadata::default(),
951        }
952    }
953
954    /// Add an embedding to the chunk
955    pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
956        self.embedding = Some(embedding);
957        self
958    }
959
960    /// Add entities to the chunk
961    pub fn with_entities(mut self, entities: Vec<EntityId>) -> Self {
962        self.entities = entities;
963        self
964    }
965
966    /// Add metadata to the chunk
967    pub fn with_metadata(mut self, metadata: ChunkMetadata) -> Self {
968        self.metadata = metadata;
969        self
970    }
971}
972
973impl Entity {
974    /// Create a new entity
975    pub fn new(id: EntityId, name: String, entity_type: String, confidence: f32) -> Self {
976        Self {
977            id,
978            name,
979            entity_type,
980            confidence,
981            mentions: Vec::new(),
982            embedding: None,
983        }
984    }
985
986    /// Add mentions to the entity
987    pub fn with_mentions(mut self, mentions: Vec<EntityMention>) -> Self {
988        self.mentions = mentions;
989        self
990    }
991
992    /// Add an embedding to the entity
993    pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
994        self.embedding = Some(embedding);
995        self
996    }
997}
graphrag_core/core/mod.rs

graphrag_core/core/
mod.rs