use crate::Result;
use std::fs;
pub mod enhancements;
#[cfg(feature = "json5-support")]
pub mod json5_loader;
mod json_parser;
pub mod loader;
#[cfg(feature = "json5-support")]
pub mod schema_validator;
pub mod setconfig;
pub mod validation;
pub use setconfig::{
AlgorithmicEmbeddingsConfig,
AlgorithmicEntityConfig,
AlgorithmicGraphConfig,
AlgorithmicPipelineConfig,
AlgorithmicRetrievalConfig,
HybridEmbeddingsConfig,
HybridEntityConfig,
HybridGraphConfig,
HybridPipelineConfig,
HybridRetrievalConfig,
HybridWeightsConfig,
ModeConfig,
SemanticEmbeddingsConfig,
SemanticEntityConfig,
SemanticGraphConfig,
SemanticPipelineConfig,
SemanticRetrievalConfig,
SetConfig,
};
pub use validation::{validate_config_file, Validatable, ValidationResult};
/// Top-level configuration aggregate, serializable and deserializable via serde.
///
/// When deserializing, only `approach`, `advanced_features` and
/// `suppress_progress_bars` carry a `#[serde(default)]` attribute, and the
/// `Option` fields may be omitted; every other field has no serde default
/// declared here and must be present in the input. `Config::default()`
/// provides a fully populated value for programmatic construction.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Config {
/// Output directory; `Config::default()` uses `"./output"`.
pub output_dir: String,
/// Top-level chunk size; `Config::default()` uses `default_chunk_size()` (1000).
pub chunk_size: usize,
/// Top-level chunk overlap; `Config::default()` uses `default_chunk_overlap()` (200).
pub chunk_overlap: usize,
/// Optional cap on entities per chunk; `Config::default()` uses `Some(10)`.
pub max_entities_per_chunk: Option<usize>,
/// Optional top-k result count; `Config::default()` uses `Some(default_top_k())`.
pub top_k_results: Option<usize>,
/// Optional similarity threshold; `Config::default()` uses `Some(0.8)`.
pub similarity_threshold: Option<f32>,
/// Approach name; falls back to `default_approach()` (`"semantic"`) when absent.
#[serde(default = "default_approach")]
pub approach: String,
/// Embedding settings.
pub embeddings: EmbeddingConfig,
/// Graph settings.
pub graph: GraphConfig,
/// Text settings (carries its own `chunk_size` / `chunk_overlap`).
pub text: TextConfig,
/// Entity settings.
pub entities: EntityConfig,
/// Retrieval settings.
pub retrieval: RetrievalConfig,
/// Parallelism settings.
pub parallel: ParallelConfig,
/// Ollama settings (type defined in `crate::ollama`).
pub ollama: crate::ollama::OllamaConfig,
/// GLiNER settings.
pub gliner: GlinerConfig,
/// Enhancement settings (type defined in the `enhancements` submodule).
pub enhancements: enhancements::EnhancementsConfig,
/// Auto-save settings.
pub auto_save: AutoSaveConfig,
/// Summarization settings (type defined in `crate::summarization`).
pub summarization: crate::summarization::HierarchicalConfig,
/// Zero-cost approach settings.
pub zero_cost_approach: ZeroCostApproachConfig,
/// Advanced feature settings; `AdvancedFeaturesConfig::default()` when absent.
#[serde(default)]
pub advanced_features: AdvancedFeaturesConfig,
/// Progress-bar suppression flag; `false` when absent.
#[serde(default)]
pub suppress_progress_bars: bool,
}
/// GLiNER settings, stored in `Config::gliner`.
///
/// The `Default` impl yields a disabled configuration with empty paths.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GlinerConfig {
/// Enable flag; `Default` gives `false`.
pub enabled: bool,
/// Model path; `Default` gives an empty string.
pub model_path: String,
/// Tokenizer path; `Default` gives an empty string.
pub tokenizer_path: String,
/// Mode name; `Default` gives `"span"`.
pub mode: String,
/// Entity labels; `Default` gives person / organization / location / concept.
pub entity_labels: Vec<String>,
/// Relation labels; `Default` gives "related to" / "part of" / "causes".
pub relation_labels: Vec<String>,
/// Entity threshold; `Default` gives 0.4.
pub entity_threshold: f32,
/// Relation threshold; `Default` gives 0.5.
pub relation_threshold: f32,
/// GPU flag; `Default` gives `false`.
pub use_gpu: bool,
/// Optional concurrent-chunk limit; `None` when absent from the input.
#[serde(default)]
pub max_concurrent_chunks: Option<usize>,
}
impl Default for GlinerConfig {
    /// Disabled GLiNER configuration: empty model and tokenizer paths,
    /// `"span"` mode, four entity labels, three relation labels, `use_gpu`
    /// off and no concurrent-chunk limit.
    fn default() -> Self {
        let entity_labels: Vec<String> = ["person", "organization", "location", "concept"]
            .iter()
            .map(|label| label.to_string())
            .collect();
        let relation_labels: Vec<String> = ["related to", "part of", "causes"]
            .iter()
            .map(|label| label.to_string())
            .collect();

        Self {
            enabled: false,
            model_path: String::new(),
            tokenizer_path: String::new(),
            mode: String::from("span"),
            entity_labels,
            relation_labels,
            entity_threshold: 0.4,
            relation_threshold: 0.5,
            use_gpu: false,
            max_concurrent_chunks: None,
        }
    }
}
/// Auto-save settings, stored in `Config::auto_save`.
///
/// Every field carries a serde default, so any subset of fields may be
/// omitted when deserializing this struct.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct AutoSaveConfig {
/// Enable flag; `false` when absent.
#[serde(default)]
pub enabled: bool,
/// Optional base directory; `None` when absent.
#[serde(default)]
pub base_dir: Option<String>,
/// Interval in seconds; `default_auto_save_interval()` (300) when absent.
#[serde(default = "default_auto_save_interval")]
pub interval_seconds: u64,
/// Optional workspace name; `None` when absent.
#[serde(default)]
pub workspace_name: Option<String>,
/// Maximum number of versions; `default_max_versions()` (5) when absent.
#[serde(default = "default_max_versions")]
pub max_versions: usize,
}
/// Zero-cost approach settings, stored in `Config::zero_cost_approach`.
///
/// Every field carries a serde default, so each may be omitted individually
/// when deserializing this struct.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ZeroCostApproachConfig {
/// Selected approach name; `default_zero_cost_approach()`
/// (`"pure_algorithmic"`) when absent.
#[serde(default = "default_zero_cost_approach")]
pub approach: String,
/// LazyGraphRAG settings.
#[serde(default)]
pub lazy_graphrag: LazyGraphRAGConfig,
/// E2GraphRAG settings.
#[serde(default)]
pub e2_graphrag: E2GraphRAGConfig,
/// Pure-algorithmic settings.
#[serde(default)]
pub pure_algorithmic: PureAlgorithmicConfig,
/// Hybrid strategy settings.
#[serde(default)]
pub hybrid_strategy: HybridStrategyConfig,
}
/// LazyGraphRAG settings, stored in `ZeroCostApproachConfig::lazy_graphrag`.
///
/// `Default` is derived, so `enabled` defaults to `false` and each nested
/// section uses its own `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct LazyGraphRAGConfig {
/// Enable flag; derived default is `false`.
pub enabled: bool,
/// Concept extraction settings.
pub concept_extraction: ConceptExtractionConfig,
/// Co-occurrence settings.
pub co_occurrence: CoOccurrenceConfig,
/// Indexing settings.
pub indexing: LazyIndexingConfig,
/// Query expansion settings.
pub query_expansion: LazyQueryExpansionConfig,
/// Relevance scoring settings.
pub relevance_scoring: LazyRelevanceScoringConfig,
}
/// Concept extraction settings, stored in `LazyGraphRAGConfig::concept_extraction`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ConceptExtractionConfig {
/// Minimum concept length; default 3.
pub min_concept_length: usize,
/// Maximum words per concept; default 5.
pub max_concept_words: usize,
/// Noun-phrase flag; default `true`.
pub use_noun_phrases: bool,
/// Capitalization flag; default `true`.
pub use_capitalization: bool,
/// Title-case flag; default `true`.
pub use_title_case: bool,
/// TF-IDF scoring flag; default `true`.
pub use_tf_idf_scoring: bool,
/// Minimum term frequency; default 2.
pub min_term_frequency: usize,
/// Maximum concepts per chunk; default 10.
pub max_concepts_per_chunk: usize,
/// Minimum concept score; default 0.1.
pub min_concept_score: f32,
/// Stopword exclusion flag; default `true`.
pub exclude_stopwords: bool,
/// Custom stopword list; default is "the", "and", "or".
pub custom_stopwords: Vec<String>,
}
/// Co-occurrence settings, stored in `LazyGraphRAGConfig::co_occurrence`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CoOccurrenceConfig {
/// Window size; default 50.
pub window_size: usize,
/// Minimum co-occurrence count; default 2.
pub min_co_occurrence: usize,
/// Jaccard threshold; default 0.2.
pub jaccard_threshold: f32,
/// Maximum edges per node; default 25.
pub max_edges_per_node: usize,
}
/// Indexing settings, stored in `LazyGraphRAGConfig::indexing`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyIndexingConfig {
/// Bidirectional index flag; default `true`.
pub use_bidirectional_index: bool,
/// HNSW index flag; default `false`.
pub enable_hnsw_index: bool,
/// Cache size; default 10000.
pub cache_size: usize,
}
/// Query expansion settings, stored in `LazyGraphRAGConfig::query_expansion`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyQueryExpansionConfig {
/// Enable flag; default `true`.
pub enabled: bool,
/// Maximum number of expansions; default 3.
pub max_expansions: usize,
/// Expansion model name; default `"llama3.1:8b"`.
pub expansion_model: String,
/// Expansion temperature; default 0.1.
pub expansion_temperature: f32,
/// Maximum tokens per expansion; default 50.
pub max_tokens_per_expansion: usize,
}
/// Relevance scoring settings, stored in `LazyGraphRAGConfig::relevance_scoring`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyRelevanceScoringConfig {
/// Enable flag; default `true`.
pub enabled: bool,
/// Scoring model name; default `"llama3.1:8b"`.
pub scoring_model: String,
/// Batch size; default 10.
pub batch_size: usize,
/// Temperature; default 0.2.
pub temperature: f32,
/// Maximum tokens per score; default 30.
pub max_tokens_per_score: usize,
}
/// E2GraphRAG settings, stored in `ZeroCostApproachConfig::e2_graphrag`.
///
/// `Default` is derived, so `enabled` defaults to `false` and each nested
/// section uses its own `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct E2GraphRAGConfig {
/// Enable flag; derived default is `false`.
pub enabled: bool,
/// NER extraction settings.
pub ner_extraction: NERExtractionConfig,
/// Keyword extraction settings.
pub keyword_extraction: KeywordExtractionConfig,
/// Graph construction settings.
pub graph_construction: E2GraphConstructionConfig,
/// Indexing settings.
pub indexing: E2IndexingConfig,
}
/// NER extraction settings, stored in `E2GraphRAGConfig::ner_extraction`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct NERExtractionConfig {
/// Entity type names; default is "PERSON", "ORG", "LOCATION".
pub entity_types: Vec<String>,
/// Capitalized-pattern flag; default `true`.
pub use_capitalized_patterns: bool,
/// Title-case-pattern flag; default `true`.
pub use_title_case_patterns: bool,
/// Quoted-pattern flag; default `true`.
pub use_quoted_patterns: bool,
/// Abbreviation flag; default `true`.
pub use_abbreviations: bool,
/// Contextual disambiguation flag; default `true`.
pub use_contextual_disambiguation: bool,
/// Minimum context words; default 5.
pub min_context_words: usize,
/// Minimum confidence; default 0.7.
pub min_confidence: f32,
/// Positional boost flag; default `true`.
pub use_positional_boost: bool,
/// Frequency boost flag; default `true`.
pub use_frequency_boost: bool,
}
impl Default for NERExtractionConfig {
    /// All pattern and boost flags on, entity types PERSON / ORG / LOCATION,
    /// five context words minimum and a 0.7 minimum confidence.
    fn default() -> Self {
        let entity_types: Vec<String> = ["PERSON", "ORG", "LOCATION"]
            .iter()
            .map(|kind| String::from(*kind))
            .collect();

        Self {
            entity_types,
            use_capitalized_patterns: true,
            use_title_case_patterns: true,
            use_quoted_patterns: true,
            use_abbreviations: true,
            use_contextual_disambiguation: true,
            min_context_words: 5,
            min_confidence: 0.7,
            use_positional_boost: true,
            use_frequency_boost: true,
        }
    }
}
/// Keyword extraction settings, stored in `E2GraphRAGConfig::keyword_extraction`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct KeywordExtractionConfig {
/// Algorithm names; default is "tfidf", "yake".
pub algorithms: Vec<String>,
/// Maximum keywords per chunk; default 10.
pub max_keywords_per_chunk: usize,
/// Minimum keyword length; default 3.
pub min_keyword_length: usize,
/// Algorithm combination flag; default `true`.
pub combine_algorithms: bool,
}
impl Default for KeywordExtractionConfig {
    /// Algorithms "tfidf" and "yake" combined, at most 10 keywords per chunk,
    /// minimum keyword length 3.
    fn default() -> Self {
        Self {
            algorithms: vec![String::from("tfidf"), String::from("yake")],
            max_keywords_per_chunk: 10,
            min_keyword_length: 3,
            combine_algorithms: true,
        }
    }
}
/// Graph construction settings, stored in `E2GraphRAGConfig::graph_construction`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct E2GraphConstructionConfig {
/// Relationship type names; default is "CO_OCCURS_WITH", "RELATED_TO".
pub relationship_types: Vec<String>,
/// Minimum relationship score; default 0.5.
pub min_relationship_score: f32,
/// Maximum relationships per entity; default 20.
pub max_relationships_per_entity: usize,
/// Mutual-information flag; default `true`.
pub use_mutual_information: bool,
}
impl Default for E2GraphConstructionConfig {
    /// Relationship types "CO_OCCURS_WITH" and "RELATED_TO", minimum score
    /// 0.5, at most 20 relationships per entity, mutual information on.
    fn default() -> Self {
        Self {
            relationship_types: vec![
                String::from("CO_OCCURS_WITH"),
                String::from("RELATED_TO"),
            ],
            min_relationship_score: 0.5,
            max_relationships_per_entity: 20,
            use_mutual_information: true,
        }
    }
}
/// Indexing settings, stored in `E2GraphRAGConfig::indexing`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct E2IndexingConfig {
/// Batch size; default 32.
pub batch_size: usize,
/// Parallel processing flag; default `true`.
pub enable_parallel_processing: bool,
/// Concept-vector caching flag; default `true`.
pub cache_concept_vectors: bool,
/// Hash-embedding flag; default `false`.
pub use_hash_embeddings: bool,
}
impl Default for E2IndexingConfig {
    /// Batch size 32 with parallel processing and concept-vector caching on;
    /// hash embeddings off.
    fn default() -> Self {
        let batch_size = 32;
        Self {
            batch_size,
            enable_parallel_processing: true,
            cache_concept_vectors: true,
            use_hash_embeddings: false,
        }
    }
}
/// Pure-algorithmic settings, stored in `ZeroCostApproachConfig::pure_algorithmic`.
///
/// The `Default` impl enables this section and uses each nested section's
/// own `Default`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PureAlgorithmicConfig {
/// Enable flag; `Default` gives `true`.
pub enabled: bool,
/// Pattern extraction settings.
pub pattern_extraction: PatternExtractionConfig,
/// Keyword extraction settings.
pub keyword_extraction: PureKeywordExtractionConfig,
/// Relationship discovery settings.
pub relationship_discovery: RelationshipDiscoveryConfig,
/// Search ranking settings.
pub search_ranking: SearchRankingConfig,
}
/// Pattern lists, stored in `PureAlgorithmicConfig::pattern_extraction`.
///
/// The `Default` impl fills each list with one regex-style string.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PatternExtractionConfig {
/// Capitalized patterns; default is `[A-Z][a-z]+`.
pub capitalized_patterns: Vec<String>,
/// Technical patterns; default is `[a-z]+-[a-z]+`.
pub technical_patterns: Vec<String>,
/// Context patterns; default is `\b(the|this)\s+(\w+)`.
pub context_patterns: Vec<String>,
}
/// Keyword extraction settings, stored in `PureAlgorithmicConfig::keyword_extraction`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PureKeywordExtractionConfig {
/// Algorithm name; default `"tf_idf"`.
pub algorithm: String,
/// Maximum keywords; default 20.
pub max_keywords: usize,
/// Minimum word length; default 4.
pub min_word_length: usize,
/// Positional boost flag; default `true`.
pub use_positional_boost: bool,
/// Frequency filter flag; default `true`.
pub use_frequency_filter: bool,
/// Minimum term frequency; default 2.
pub min_term_frequency: usize,
/// Maximum term-frequency ratio; default 0.8.
pub max_term_frequency_ratio: f32,
}
/// Relationship discovery settings, stored in
/// `PureAlgorithmicConfig::relationship_discovery`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct RelationshipDiscoveryConfig {
/// Window size; default 30.
pub window_size: usize,
/// Minimum co-occurrence count; default 2.
pub min_co_occurrence: usize,
/// Mutual-information flag; default `true`.
pub use_mutual_information: bool,
/// Relationship type names; default is "co_occurs_with".
pub relationship_types: Vec<String>,
/// Scoring method name; default `"jaccard_similarity"`.
pub scoring_method: String,
/// Minimum similarity score; default 0.1.
pub min_similarity_score: f32,
}
/// Search ranking settings, stored in `PureAlgorithmicConfig::search_ranking`.
///
/// Groups one sub-configuration per search component.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SearchRankingConfig {
/// Vector search settings; disabled in `Default`.
pub vector_search: VectorSearchConfig,
/// Keyword search settings; enabled in `Default`.
pub keyword_search: KeywordSearchConfig,
/// Graph traversal settings; enabled in `Default`.
pub graph_traversal: GraphTraversalConfig,
/// Hybrid fusion settings; enabled in `Default`.
pub hybrid_fusion: HybridFusionConfig,
}
/// Vector search toggle, stored in `SearchRankingConfig::vector_search`.
///
/// `Default` is derived, so `enabled` defaults to `false`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct VectorSearchConfig {
/// Enable flag; derived default is `false`.
pub enabled: bool,
}
/// Keyword search settings, stored in `SearchRankingConfig::keyword_search`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct KeywordSearchConfig {
/// Enable flag; default `true`.
pub enabled: bool,
/// Algorithm name; default `"bm25"`.
pub algorithm: String,
/// `k1` parameter; default 1.2.
pub k1: f32,
/// `b` parameter; default 0.75.
pub b: f32,
}
/// Graph traversal settings, stored in `SearchRankingConfig::graph_traversal`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GraphTraversalConfig {
/// Enable flag; default `true`.
pub enabled: bool,
/// Algorithm name; default `"pagerank"`.
pub algorithm: String,
/// Damping factor; default 0.85.
pub damping_factor: f32,
/// Maximum iterations; default 20.
pub max_iterations: usize,
/// Personalization flag; default `true`.
pub personalized: bool,
}
/// Hybrid fusion settings, stored in `SearchRankingConfig::hybrid_fusion`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct HybridFusionConfig {
/// Enable flag; `Default` gives `true`.
pub enabled: bool,
/// Per-component fusion weights.
pub weights: FusionWeights,
}
/// Fusion weights, stored in `HybridFusionConfig::weights`.
///
/// The `Default` impl assigns 0.4 / 0.4 / 0.2.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FusionWeights {
/// Keywords weight; default 0.4.
pub keywords: f32,
/// Graph weight; default 0.4.
pub graph: f32,
/// BM25 weight; default 0.2.
pub bm25: f32,
}
/// Hybrid strategy settings, stored in `ZeroCostApproachConfig::hybrid_strategy`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct HybridStrategyConfig {
/// Lazy-algorithmic settings.
pub lazy_algorithmic: LazyAlgorithmicConfig,
/// Progressive (per-level) settings.
pub progressive: ProgressiveConfig,
/// Budget-aware settings.
pub budget_aware: BudgetAwareConfig,
}
/// Lazy-algorithmic settings, stored in `HybridStrategyConfig::lazy_algorithmic`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyAlgorithmicConfig {
/// Indexing approach name; default `"e2_graphrag"`.
pub indexing_approach: String,
/// Query approach name; default `"lazy_graphrag"`.
pub query_approach: String,
/// Cost optimization target; default `"indexing"`.
pub cost_optimization: String,
}
/// Per-level approach names, stored in `HybridStrategyConfig::progressive`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ProgressiveConfig {
/// Level 0 approach; default `"pure_algorithmic"`.
pub level_0: String,
/// Level 1 approach; default `"pure_algorithmic"`.
pub level_1: String,
/// Level 2 approach; default `"e2_graphrag"`.
pub level_2: String,
/// Level 3 approach; default `"lazy_graphrag"`.
pub level_3: String,
/// Level 4 and above approach; default `"lazy_graphrag"`.
pub level_4_plus: String,
}
/// Budget-aware settings, stored in `HybridStrategyConfig::budget_aware`.
///
/// Values quoted below are those set by this type's `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct BudgetAwareConfig {
/// Daily budget in USD; default 1.0.
pub daily_budget_usd: f64,
/// Queries per day; default 1000.
pub queries_per_day: usize,
/// Maximum LLM cost per query; default 0.002.
pub max_llm_cost_per_query: f64,
/// Strategy name; default `"lazy_graphrag"`.
pub strategy: String,
/// Fallback-to-algorithmic flag; default `true`.
pub fallback_to_algorithmic: bool,
}
/// Serde default for `ZeroCostApproachConfig::approach`: `"pure_algorithmic"`.
fn default_zero_cost_approach() -> String {
    String::from("pure_algorithmic")
}
impl Default for ZeroCostApproachConfig {
    /// Selects the `default_zero_cost_approach()` approach and fills every
    /// section with that section's own `Default`.
    fn default() -> Self {
        let approach = default_zero_cost_approach();
        Self {
            approach,
            lazy_graphrag: Default::default(),
            e2_graphrag: Default::default(),
            pure_algorithmic: Default::default(),
            hybrid_strategy: Default::default(),
        }
    }
}
impl Default for ConceptExtractionConfig {
    /// All extraction flags on; concept length at least 3, at most 5 words,
    /// minimum term frequency 2, at most 10 concepts per chunk, minimum score
    /// 0.1, and "the" / "and" / "or" as custom stopwords.
    fn default() -> Self {
        let custom_stopwords: Vec<String> = ["the", "and", "or"]
            .iter()
            .map(|word| String::from(*word))
            .collect();

        Self {
            min_concept_length: 3,
            max_concept_words: 5,
            use_noun_phrases: true,
            use_capitalization: true,
            use_title_case: true,
            use_tf_idf_scoring: true,
            min_term_frequency: 2,
            max_concepts_per_chunk: 10,
            min_concept_score: 0.1,
            exclude_stopwords: true,
            custom_stopwords,
        }
    }
}
impl Default for CoOccurrenceConfig {
    /// Window of 50, minimum co-occurrence 2, Jaccard threshold 0.2 and at
    /// most 25 edges per node.
    fn default() -> Self {
        let (window_size, min_co_occurrence, max_edges_per_node) = (50, 2, 25);
        Self {
            window_size,
            min_co_occurrence,
            jaccard_threshold: 0.2,
            max_edges_per_node,
        }
    }
}
impl Default for LazyIndexingConfig {
    /// Bidirectional index on, HNSW index off, cache size 10000.
    fn default() -> Self {
        let cache_size = 10000;
        Self {
            use_bidirectional_index: true,
            enable_hnsw_index: false,
            cache_size,
        }
    }
}
impl Default for LazyQueryExpansionConfig {
    /// Enabled, up to 3 expansions using model `"llama3.1:8b"` at temperature
    /// 0.1 with at most 50 tokens per expansion.
    fn default() -> Self {
        let expansion_model = String::from("llama3.1:8b");
        Self {
            enabled: true,
            max_expansions: 3,
            expansion_model,
            expansion_temperature: 0.1,
            max_tokens_per_expansion: 50,
        }
    }
}
impl Default for LazyRelevanceScoringConfig {
    /// Enabled, scoring with model `"llama3.1:8b"` in batches of 10 at
    /// temperature 0.2 with at most 30 tokens per score.
    fn default() -> Self {
        let scoring_model = String::from("llama3.1:8b");
        Self {
            enabled: true,
            scoring_model,
            batch_size: 10,
            temperature: 0.2,
            max_tokens_per_score: 30,
        }
    }
}
impl Default for PureAlgorithmicConfig {
fn default() -> Self {
Self {
enabled: true,
pattern_extraction: Default::default(),
keyword_extraction: Default::default(),
relationship_discovery: Default::default(),
search_ranking: Default::default(),
}
}
}
impl Default for PatternExtractionConfig {
    /// One pattern string per list: capitalized, technical (hyphenated
    /// lowercase) and context.
    fn default() -> Self {
        let capitalized = String::from(r"[A-Z][a-z]+");
        let technical = String::from(r"[a-z]+-[a-z]+");
        let context = String::from(r"\b(the|this)\s+(\w+)");

        Self {
            capitalized_patterns: vec![capitalized],
            technical_patterns: vec![technical],
            context_patterns: vec![context],
        }
    }
}
impl Default for PureKeywordExtractionConfig {
    /// Algorithm `"tf_idf"`, up to 20 keywords, minimum word length 4,
    /// positional boost and frequency filter on, minimum term frequency 2 and
    /// maximum term-frequency ratio 0.8.
    fn default() -> Self {
        let algorithm = String::from("tf_idf");
        Self {
            algorithm,
            max_keywords: 20,
            min_word_length: 4,
            use_positional_boost: true,
            use_frequency_filter: true,
            min_term_frequency: 2,
            max_term_frequency_ratio: 0.8,
        }
    }
}
impl Default for RelationshipDiscoveryConfig {
    /// Window of 30, minimum co-occurrence 2, mutual information on, the
    /// single relationship type "co_occurs_with", scoring method
    /// `"jaccard_similarity"` and minimum similarity 0.1.
    fn default() -> Self {
        let relationship_types = vec![String::from("co_occurs_with")];
        let scoring_method = String::from("jaccard_similarity");
        Self {
            window_size: 30,
            min_co_occurrence: 2,
            use_mutual_information: true,
            relationship_types,
            scoring_method,
            min_similarity_score: 0.1,
        }
    }
}
impl Default for SearchRankingConfig {
fn default() -> Self {
Self {
vector_search: VectorSearchConfig { enabled: false },
keyword_search: KeywordSearchConfig {
enabled: true,
algorithm: "bm25".to_string(),
k1: 1.2,
b: 0.75,
},
graph_traversal: GraphTraversalConfig {
enabled: true,
algorithm: "pagerank".to_string(),
damping_factor: 0.85,
max_iterations: 20,
personalized: true,
},
hybrid_fusion: HybridFusionConfig {
enabled: true,
weights: FusionWeights {
keywords: 0.4,
graph: 0.4,
bm25: 0.2,
},
},
}
}
}
impl Default for HybridStrategyConfig {
fn default() -> Self {
Self {
lazy_algorithmic: LazyAlgorithmicConfig {
indexing_approach: "e2_graphrag".to_string(),
query_approach: "lazy_graphrag".to_string(),
cost_optimization: "indexing".to_string(),
},
progressive: ProgressiveConfig {
level_0: "pure_algorithmic".to_string(),
level_1: "pure_algorithmic".to_string(),
level_2: "e2_graphrag".to_string(),
level_3: "lazy_graphrag".to_string(),
level_4_plus: "lazy_graphrag".to_string(),
},
budget_aware: BudgetAwareConfig {
daily_budget_usd: 1.0,
queries_per_day: 1000,
max_llm_cost_per_query: 0.002,
strategy: "lazy_graphrag".to_string(),
fallback_to_algorithmic: true,
},
}
}
}
impl Default for KeywordSearchConfig {
    /// Enabled, algorithm `"bm25"` with `k1` = 1.2 and `b` = 0.75.
    fn default() -> Self {
        let algorithm = String::from("bm25");
        Self {
            enabled: true,
            algorithm,
            k1: 1.2,
            b: 0.75,
        }
    }
}
impl Default for GraphTraversalConfig {
    /// Enabled, algorithm `"pagerank"`, damping factor 0.85, at most 20
    /// iterations, personalized.
    fn default() -> Self {
        let algorithm = String::from("pagerank");
        Self {
            enabled: true,
            algorithm,
            damping_factor: 0.85,
            max_iterations: 20,
            personalized: true,
        }
    }
}
impl Default for HybridFusionConfig {
fn default() -> Self {
Self {
enabled: true,
weights: FusionWeights {
keywords: 0.4,
graph: 0.4,
bm25: 0.2,
},
}
}
}
impl Default for FusionWeights {
    /// Weights: keywords 0.4, graph 0.4, bm25 0.2.
    fn default() -> Self {
        let (keywords, graph, bm25) = (0.4, 0.4, 0.2);
        Self {
            keywords,
            graph,
            bm25,
        }
    }
}
impl Default for LazyAlgorithmicConfig {
    /// Indexing with `"e2_graphrag"`, querying with `"lazy_graphrag"`, cost
    /// optimization set to `"indexing"`.
    fn default() -> Self {
        Self {
            indexing_approach: String::from("e2_graphrag"),
            query_approach: String::from("lazy_graphrag"),
            cost_optimization: String::from("indexing"),
        }
    }
}
impl Default for ProgressiveConfig {
    /// Levels 0-1 use `"pure_algorithmic"`, level 2 uses `"e2_graphrag"`,
    /// level 3 and above use `"lazy_graphrag"`.
    fn default() -> Self {
        let pure = "pure_algorithmic";
        let e2 = "e2_graphrag";
        let lazy = "lazy_graphrag";

        Self {
            level_0: String::from(pure),
            level_1: String::from(pure),
            level_2: String::from(e2),
            level_3: String::from(lazy),
            level_4_plus: String::from(lazy),
        }
    }
}
impl Default for BudgetAwareConfig {
    /// Daily budget 1.0 USD, 1000 queries per day, at most 0.002 LLM cost per
    /// query, strategy `"lazy_graphrag"`, algorithmic fallback on.
    fn default() -> Self {
        let strategy = String::from("lazy_graphrag");
        Self {
            daily_budget_usd: 1.0,
            queries_per_day: 1000,
            max_llm_cost_per_query: 0.002,
            strategy,
            fallback_to_algorithmic: true,
        }
    }
}
/// Embedding settings, stored in `Config::embeddings`.
///
/// No `Default` impl exists in this file for the type; `Config::default()`
/// builds it inline with the values quoted below.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EmbeddingConfig {
/// Embedding dimension; `Config::default()` uses `default_embedding_dim()` (384).
pub dimension: usize,
/// Backend name; `Config::default()` uses `"hash"`, `Config::with_ollama` sets `"ollama"`.
pub backend: String,
/// Optional model name; `None` when absent from the input.
#[serde(default)]
pub model: Option<String>,
/// Hash fallback flag; `Config::default()` uses `true`.
pub fallback_to_hash: bool,
/// Optional API endpoint; `Config::default()` uses `None`.
pub api_endpoint: Option<String>,
/// Optional API key; `Config::default()` uses `None`.
pub api_key: Option<String>,
/// Optional cache directory; `None` when absent from the input.
#[serde(default)]
pub cache_dir: Option<String>,
/// Batch size; `default_batch_size()` (32) when absent from the input.
#[serde(default = "default_batch_size")]
pub batch_size: usize,
}
/// Serde default for `EmbeddingConfig::batch_size`: 32.
fn default_batch_size() -> usize {
    32
}
/// Graph settings, stored in `Config::graph`.
///
/// `max_connections` and `similarity_threshold` have no serde default; the
/// remaining fields fall back to the helpers named in their attributes.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GraphConfig {
/// Maximum connections; `Config::default()` uses `default_max_connections()` (10).
pub max_connections: usize,
/// Similarity threshold; `Config::default()` uses `default_similarity_threshold()` (0.8).
pub similarity_threshold: f32,
/// Relationship extraction flag; `true` when absent.
#[serde(default = "default_true")]
pub extract_relationships: bool,
/// Relationship confidence threshold; 0.5 when absent.
#[serde(default = "default_relationship_confidence")]
pub relationship_confidence_threshold: f32,
/// Traversal parameters; `TraversalConfigParams::default()` when absent.
#[serde(default)]
pub traversal: TraversalConfigParams,
}
/// Traversal parameters, stored in `GraphConfig::traversal`.
///
/// Every field carries a serde default, so any subset may be omitted.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TraversalConfigParams {
/// Maximum depth; `default_max_traversal_depth()` (3) when absent.
#[serde(default = "default_max_traversal_depth")]
pub max_depth: usize,
/// Maximum paths; `default_max_paths()` (10) when absent.
#[serde(default = "default_max_paths")]
pub max_paths: usize,
/// Edge-weight flag; `true` when absent.
#[serde(default = "default_true")]
pub use_edge_weights: bool,
/// Minimum relationship strength; `default_min_relationship_strength()` (0.3) when absent.
#[serde(default = "default_min_relationship_strength")]
pub min_relationship_strength: f32,
}
impl Default for TraversalConfigParams {
    /// Uses the helper values for depth, path count and minimum relationship
    /// strength; edge weights are on.
    fn default() -> Self {
        let max_depth = default_max_traversal_depth();
        let max_paths = default_max_paths();
        let min_relationship_strength = default_min_relationship_strength();

        Self {
            max_depth,
            max_paths,
            use_edge_weights: true,
            min_relationship_strength,
        }
    }
}
/// Text settings, stored in `Config::text`.
///
/// `Config::with_chunk_size` keeps `chunk_size` / `chunk_overlap` here in
/// step with the top-level `Config` fields of the same names.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TextConfig {
/// Chunk size; `Config::default()` uses `default_chunk_size()` (1000).
pub chunk_size: usize,
/// Chunk overlap; `Config::default()` uses `default_chunk_overlap()` (200).
pub chunk_overlap: usize,
/// Language codes; `Config::default()` uses `["en"]`.
pub languages: Vec<String>,
}
/// Entity settings, stored in `Config::entities`.
///
/// `min_confidence` and `entity_types` have no serde default; the remaining
/// fields fall back to `false` or to the helper named in their attribute.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EntityConfig {
/// Minimum confidence; `Config::default()` uses `default_min_confidence()` (0.7).
pub min_confidence: f32,
/// Entity type names; `Config::default()` uses PERSON / ORG / LOCATION.
pub entity_types: Vec<String>,
/// Gleaning flag; `false` when absent.
#[serde(default)]
pub use_gleaning: bool,
/// Maximum gleaning rounds; 3 when absent.
#[serde(default = "default_max_gleaning_rounds")]
pub max_gleaning_rounds: usize,
/// Triple-reflection flag; `false` when absent.
#[serde(default)]
pub enable_triple_reflection: bool,
/// Validation minimum confidence; 0.7 when absent.
#[serde(default = "default_validation_confidence")]
pub validation_min_confidence: f32,
/// Atomic-facts flag; `false` when absent.
#[serde(default)]
pub use_atomic_facts: bool,
/// Maximum fact tokens; 400 when absent.
#[serde(default = "default_max_fact_tokens")]
pub max_fact_tokens: usize,
}
/// Advanced feature settings, stored in `Config::advanced_features`.
///
/// `Default` is derived and every field carries `#[serde(default)]`, so each
/// section falls back to its own `Default` impl when absent.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct AdvancedFeaturesConfig {
/// Symbolic anchoring settings.
#[serde(default)]
pub symbolic_anchoring: SymbolicAnchoringConfig,
/// Dynamic weighting settings.
#[serde(default)]
pub dynamic_weighting: DynamicWeightingConfig,
/// Causal analysis settings.
#[serde(default)]
pub causal_analysis: CausalAnalysisConfig,
/// Hierarchical clustering settings.
#[serde(default)]
pub hierarchical_clustering: HierarchicalClusteringConfig,
/// Weight optimization settings.
#[serde(default)]
pub weight_optimization: WeightOptimizationConfig,
}
/// Symbolic anchoring settings, stored in
/// `AdvancedFeaturesConfig::symbolic_anchoring`.
///
/// Every field carries a serde default helper.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SymbolicAnchoringConfig {
/// Minimum relevance; 0.3 when absent.
#[serde(default = "default_anchor_min_relevance")]
pub min_relevance: f32,
/// Maximum anchors; 5 when absent.
#[serde(default = "default_max_anchors")]
pub max_anchors: usize,
/// Maximum entities per anchor; 10 when absent.
#[serde(default = "default_max_entities_per_anchor")]
pub max_entities_per_anchor: usize,
}
/// Dynamic weighting toggles, stored in
/// `AdvancedFeaturesConfig::dynamic_weighting`.
///
/// Every flag falls back to `default_true()` when absent.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct DynamicWeightingConfig {
/// Semantic boost flag; `true` when absent.
#[serde(default = "default_true")]
pub enable_semantic_boost: bool,
/// Temporal boost flag; `true` when absent.
#[serde(default = "default_true")]
pub enable_temporal_boost: bool,
/// Concept boost flag; `true` when absent.
#[serde(default = "default_true")]
pub enable_concept_boost: bool,
/// Causal boost flag; `true` when absent.
#[serde(default = "default_true")]
pub enable_causal_boost: bool,
}
/// Causal analysis settings, stored in
/// `AdvancedFeaturesConfig::causal_analysis`.
///
/// Every field carries a serde default helper.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CausalAnalysisConfig {
/// Minimum confidence; 0.3 when absent.
#[serde(default = "default_causal_min_confidence")]
pub min_confidence: f32,
/// Minimum causal strength; 0.5 when absent.
#[serde(default = "default_causal_min_strength")]
pub min_causal_strength: f32,
/// Maximum chain depth; 5 when absent.
#[serde(default = "default_max_chain_depth")]
pub max_chain_depth: usize,
/// Temporal consistency requirement flag; `true` when absent.
#[serde(default = "default_true")]
pub require_temporal_consistency: bool,
}
/// Hierarchical clustering settings, stored in
/// `AdvancedFeaturesConfig::hierarchical_clustering`.
///
/// Every field carries a serde default helper.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct HierarchicalClusteringConfig {
/// Number of levels; 3 when absent.
#[serde(default = "default_num_levels")]
pub num_levels: usize,
/// Resolution values; `[1.0, 0.5, 0.2]` when absent.
#[serde(default = "default_resolutions")]
pub resolutions: Vec<f32>,
/// Minimum cluster size; 2 when absent.
#[serde(default = "default_min_cluster_size")]
pub min_cluster_size: usize,
/// Summary generation flag; `true` when absent.
#[serde(default = "default_true")]
pub generate_summaries: bool,
}
/// Weight optimization settings, stored in
/// `AdvancedFeaturesConfig::weight_optimization`.
///
/// Every field carries a serde default.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct WeightOptimizationConfig {
/// Learning rate; 0.1 when absent.
#[serde(default = "default_learning_rate")]
pub learning_rate: f32,
/// Maximum iterations; 20 when absent.
#[serde(default = "default_max_iterations")]
pub max_iterations: usize,
/// Slope window; 3 when absent.
#[serde(default = "default_slope_window")]
pub slope_window: usize,
/// Stagnation threshold; 0.01 when absent.
#[serde(default = "default_stagnation_threshold")]
pub stagnation_threshold: f32,
/// LLM evaluation flag; `true` when absent.
#[serde(default = "default_true")]
pub use_llm_eval: bool,
/// Objective weights; `ObjectiveWeightsConfig::default()` when absent.
#[serde(default)]
pub objective_weights: ObjectiveWeightsConfig,
}
/// Objective weights, stored in `WeightOptimizationConfig::objective_weights`.
///
/// Every field carries a serde default helper (0.4 / 0.4 / 0.2).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ObjectiveWeightsConfig {
/// Relevance weight; 0.4 when absent.
#[serde(default = "default_relevance_weight")]
pub relevance: f32,
/// Faithfulness weight; 0.4 when absent.
#[serde(default = "default_faithfulness_weight")]
pub faithfulness: f32,
/// Conciseness weight; 0.2 when absent.
#[serde(default = "default_conciseness_weight")]
pub conciseness: f32,
}
/// Retrieval settings, stored in `Config::retrieval`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct RetrievalConfig {
/// Top-k count; `Config::default()` uses `default_top_k()` (10).
pub top_k: usize,
/// Search algorithm name; `Config::default()` uses `"cosine"`.
pub search_algorithm: String,
}
/// Parallelism settings, stored in `Config::parallel`.
///
/// Values quoted below are those used by `Config::default()`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ParallelConfig {
/// Thread count; default 0 (presumably "auto-detect" — TODO confirm against the consumer).
pub num_threads: usize,
/// Enable flag; default `true`.
pub enabled: bool,
/// Minimum batch size; default 10.
pub min_batch_size: usize,
/// Chunk batch size; default 100.
pub chunk_batch_size: usize,
/// Parallel embeddings flag; default `true`.
pub parallel_embeddings: bool,
/// Parallel graph operations flag; default `true`.
pub parallel_graph_ops: bool,
/// Parallel vector operations flag; default `true`.
pub parallel_vector_ops: bool,
}
/// Embedding dimension used by `Config::default()`: 384.
fn default_embedding_dim() -> usize {
    384
}

/// Embedding backend name used by `Config::default()`: `"hash"`.
fn default_embedding_backend() -> String {
    String::from("hash")
}
/// Graph connection limit used by `Config::default()`: 10.
fn default_max_connections() -> usize {
    10
}

/// Similarity threshold used by `Config::default()` (top level and graph): 0.8.
fn default_similarity_threshold() -> f32 {
    0.8
}
/// Chunk size used by `Config::default()` (top level and text section): 1000.
fn default_chunk_size() -> usize {
    1000
}

/// Chunk overlap used by `Config::default()` (top level and text section): 200.
fn default_chunk_overlap() -> usize {
    200
}

/// Language list used by `Config::default()`: just `"en"`.
fn default_languages() -> Vec<String> {
    vec![String::from("en")]
}
/// Entity minimum confidence used by `Config::default()`: 0.7.
fn default_min_confidence() -> f32 {
    0.7
}

/// Entity type names used by `Config::default()`: PERSON, ORG, LOCATION.
fn default_entity_types() -> Vec<String> {
    ["PERSON", "ORG", "LOCATION"]
        .iter()
        .map(|kind| kind.to_string())
        .collect()
}
/// Top-k count used by `Config::default()` (top level and retrieval): 10.
fn default_top_k() -> usize {
    10
}

/// Search algorithm name used by `Config::default()`: `"cosine"`.
fn default_search_algorithm() -> String {
    String::from("cosine")
}
/// Thread count used by `Config::default()`: 0.
///
/// NOTE(review): 0 presumably means "let the runtime decide" — confirm
/// against the code that consumes `ParallelConfig::num_threads`.
fn default_num_threads() -> usize {
    0
}

/// Minimum batch size used by `Config::default()`: 10.
fn default_min_batch_size() -> usize {
    10
}

/// Chunk batch size used by `Config::default()`: 100.
fn default_chunk_batch_size() -> usize {
    100
}
/// Serde default helper for boolean fields that should be on when absent.
fn default_true() -> bool {
    true
}
/// Serde default for `GraphConfig::relationship_confidence_threshold`: 0.5.
fn default_relationship_confidence() -> f32 {
    0.5
}
/// Serde default for `EntityConfig::max_gleaning_rounds`: 3.
fn default_max_gleaning_rounds() -> usize {
    3
}

/// Serde default for `EntityConfig::validation_min_confidence`: 0.7.
fn default_validation_confidence() -> f32 {
    0.7
}
/// Serde default for `SymbolicAnchoringConfig::min_relevance`: 0.3.
fn default_anchor_min_relevance() -> f32 {
    0.3
}

/// Serde default for `SymbolicAnchoringConfig::max_anchors`: 5.
fn default_max_anchors() -> usize {
    5
}

/// Serde default for `SymbolicAnchoringConfig::max_entities_per_anchor`: 10.
fn default_max_entities_per_anchor() -> usize {
    10
}
/// Serde default for `CausalAnalysisConfig::min_confidence`: 0.3.
fn default_causal_min_confidence() -> f32 {
    0.3
}

/// Serde default for `CausalAnalysisConfig::min_causal_strength`: 0.5.
fn default_causal_min_strength() -> f32 {
    0.5
}

/// Serde default for `CausalAnalysisConfig::max_chain_depth`: 5.
fn default_max_chain_depth() -> usize {
    5
}
/// Serde default for `HierarchicalClusteringConfig::num_levels`: 3.
fn default_num_levels() -> usize {
    3
}

/// Serde default for `HierarchicalClusteringConfig::resolutions`: 1.0, 0.5, 0.2.
fn default_resolutions() -> Vec<f32> {
    [1.0, 0.5, 0.2].to_vec()
}

/// Serde default for `HierarchicalClusteringConfig::min_cluster_size`: 2.
fn default_min_cluster_size() -> usize {
    2
}
/// Serde default for `WeightOptimizationConfig::learning_rate`: 0.1.
fn default_learning_rate() -> f32 {
    0.1
}

/// Serde default for `WeightOptimizationConfig::max_iterations`: 20.
fn default_max_iterations() -> usize {
    20
}

/// Serde default for `WeightOptimizationConfig::slope_window`: 3.
fn default_slope_window() -> usize {
    3
}

/// Serde default for `WeightOptimizationConfig::stagnation_threshold`: 0.01.
fn default_stagnation_threshold() -> f32 {
    0.01
}
/// Serde default for `ObjectiveWeightsConfig::relevance`: 0.4.
fn default_relevance_weight() -> f32 {
    0.4
}

/// Serde default for `ObjectiveWeightsConfig::faithfulness`: 0.4.
fn default_faithfulness_weight() -> f32 {
    0.4
}

/// Serde default for `ObjectiveWeightsConfig::conciseness`: 0.2.
fn default_conciseness_weight() -> f32 {
    0.2
}
/// Serde default for `EntityConfig::max_fact_tokens`: 400.
fn default_max_fact_tokens() -> usize {
    400
}
/// Serde default for `Config::approach`: `"semantic"`.
fn default_approach() -> String {
    String::from("semantic")
}
/// Serde default for `TraversalConfigParams::max_depth`: 3.
fn default_max_traversal_depth() -> usize {
    3
}

/// Serde default for `TraversalConfigParams::max_paths`: 10.
fn default_max_paths() -> usize {
    10
}

/// Serde default for `TraversalConfigParams::min_relationship_strength`: 0.3.
fn default_min_relationship_strength() -> f32 {
    0.3
}
/// Serde default for `AutoSaveConfig::interval_seconds`: 300.
fn default_auto_save_interval() -> u64 {
    300
}

/// Serde default for `AutoSaveConfig::max_versions`: 5.
fn default_max_versions() -> usize {
    5
}
impl Default for Config {
    /// Fully populated baseline configuration.
    ///
    /// Scalar values come from the `default_*` helpers in this module so the
    /// programmatic default agrees with the serde defaults. Sections that
    /// have their own `Default` impl (`ollama`, `gliner`, `enhancements`,
    /// `auto_save`, `summarization`, `zero_cost_approach`,
    /// `advanced_features`, `graph.traversal`) delegate to it; the sections
    /// without one (`embeddings`, `graph`, `text`, `entities`, `retrieval`,
    /// `parallel`) are spelled out here.
    fn default() -> Self {
        Self {
            output_dir: "./output".to_string(),
            chunk_size: default_chunk_size(),
            chunk_overlap: default_chunk_overlap(),
            max_entities_per_chunk: Some(10),
            top_k_results: Some(default_top_k()),
            similarity_threshold: Some(default_similarity_threshold()),
            approach: default_approach(),
            embeddings: EmbeddingConfig {
                dimension: default_embedding_dim(),
                backend: default_embedding_backend(),
                model: Some("sentence-transformers/all-MiniLM-L6-v2".to_string()),
                fallback_to_hash: true,
                api_endpoint: None,
                api_key: None,
                cache_dir: None,
                batch_size: default_batch_size(),
            },
            graph: GraphConfig {
                max_connections: default_max_connections(),
                similarity_threshold: default_similarity_threshold(),
                extract_relationships: default_true(),
                relationship_confidence_threshold: default_relationship_confidence(),
                traversal: TraversalConfigParams::default(),
            },
            text: TextConfig {
                chunk_size: default_chunk_size(),
                chunk_overlap: default_chunk_overlap(),
                languages: default_languages(),
            },
            entities: EntityConfig {
                min_confidence: default_min_confidence(),
                entity_types: default_entity_types(),
                use_gleaning: false,
                max_gleaning_rounds: default_max_gleaning_rounds(),
                enable_triple_reflection: false,
                validation_min_confidence: default_validation_confidence(),
                use_atomic_facts: false,
                max_fact_tokens: default_max_fact_tokens(),
            },
            retrieval: RetrievalConfig {
                top_k: default_top_k(),
                search_algorithm: default_search_algorithm(),
            },
            parallel: ParallelConfig {
                num_threads: default_num_threads(),
                enabled: true,
                min_batch_size: default_min_batch_size(),
                chunk_batch_size: default_chunk_batch_size(),
                parallel_embeddings: true,
                parallel_graph_ops: true,
                parallel_vector_ops: true,
            },
            ollama: crate::ollama::OllamaConfig::default(),
            gliner: GlinerConfig::default(),
            enhancements: enhancements::EnhancementsConfig::default(),
            // Same values as the literal that used to be repeated here
            // (disabled, no base dir or workspace name, helper-provided
            // interval and version count); delegating keeps one definition.
            auto_save: AutoSaveConfig::default(),
            summarization: crate::summarization::HierarchicalConfig::default(),
            zero_cost_approach: ZeroCostApproachConfig::default(),
            advanced_features: AdvancedFeaturesConfig::default(),
            suppress_progress_bars: false,
        }
    }
}
impl Default for AutoSaveConfig {
    /// Disabled auto-save with no base directory or workspace name, using the
    /// helper-provided interval and version count.
    fn default() -> Self {
        let interval_seconds = default_auto_save_interval();
        let max_versions = default_max_versions();

        Self {
            enabled: false,
            base_dir: None,
            interval_seconds,
            workspace_name: None,
            max_versions,
        }
    }
}
impl Config {
    /// Builds a default configuration rooted at `workspace` with auto-save
    /// enabled.
    ///
    /// `output_dir` is the lossy string form of `workspace`. Auto-save gets
    /// `base_dir` set to the parent directory and `workspace_name` set to the
    /// final path component. When the path has no non-empty parent,
    /// `base_dir` is `"."`; the name is then the final component if there is
    /// one (so `"ws/"` yields `"ws"` rather than `"ws/"`), otherwise the
    /// whole path string (e.g. for `".."` or `"/"`).
    pub fn quick(workspace: impl AsRef<std::path::Path>) -> Self {
        let ws = workspace.as_ref();
        let ws_str = ws.to_string_lossy().into_owned();

        // An empty parent (bare names such as "ws") is treated like "no parent".
        let parent = ws.parent().filter(|p| !p.as_os_str().is_empty());
        let file_name = ws.file_name().map(|f| f.to_string_lossy().into_owned());

        let (base, name) = match (parent, file_name) {
            (Some(p), Some(f)) => (p.to_string_lossy().into_owned(), f),
            // Prefer the final component over the raw string so trailing
            // separators do not leak into the workspace name.
            (None, Some(f)) => (".".to_string(), f),
            (_, None) => (".".to_string(), ws_str.clone()),
        };

        Self {
            output_dir: ws_str,
            auto_save: AutoSaveConfig {
                enabled: true,
                base_dir: Some(base),
                workspace_name: Some(name),
                ..AutoSaveConfig::default()
            },
            ..Self::default()
        }
    }

    /// Enables Ollama and switches the embedding backend to `"ollama"`.
    pub fn with_ollama(mut self) -> Self {
        self.ollama.enabled = true;
        self.embeddings.backend = "ollama".to_string();
        self
    }

    /// Sets the Ollama host and enables Ollama.
    ///
    /// Unlike [`Config::with_ollama`], this does not touch
    /// `embeddings.backend`.
    pub fn with_ollama_host(mut self, host: impl Into<String>) -> Self {
        self.ollama.host = host.into();
        self.ollama.enabled = true;
        self
    }

    /// Sets the chunk size on both the top-level fields and the `text`
    /// section, and sets the overlap in both places to `size / 5` (integer
    /// division).
    pub fn with_chunk_size(mut self, size: usize) -> Self {
        let overlap = size / 5;
        self.chunk_size = size;
        self.chunk_overlap = overlap;
        self.text.chunk_size = size;
        self.text.chunk_overlap = overlap;
        self
    }
}
impl Default for SymbolicAnchoringConfig {
    /// Uses the same helper values that serde applies to absent fields.
    fn default() -> Self {
        let min_relevance = default_anchor_min_relevance();
        let max_anchors = default_max_anchors();
        let max_entities_per_anchor = default_max_entities_per_anchor();

        Self {
            min_relevance,
            max_anchors,
            max_entities_per_anchor,
        }
    }
}
impl Default for DynamicWeightingConfig {
    /// Every boost flag takes the value of `default_true()`, matching the
    /// serde defaults on the struct.
    fn default() -> Self {
        let on = default_true();
        Self {
            enable_semantic_boost: on,
            enable_temporal_boost: on,
            enable_concept_boost: on,
            enable_causal_boost: on,
        }
    }
}
impl Default for CausalAnalysisConfig {
    /// Uses the same helper values that serde applies to absent fields.
    fn default() -> Self {
        let min_confidence = default_causal_min_confidence();
        let min_causal_strength = default_causal_min_strength();
        let max_chain_depth = default_max_chain_depth();
        let require_temporal_consistency = default_true();

        Self {
            min_confidence,
            min_causal_strength,
            max_chain_depth,
            require_temporal_consistency,
        }
    }
}
impl Default for HierarchicalClusteringConfig {
    /// Uses the same helper values that serde applies to absent fields.
    fn default() -> Self {
        let num_levels = default_num_levels();
        let resolutions = default_resolutions();
        let min_cluster_size = default_min_cluster_size();
        let generate_summaries = default_true();

        Self {
            num_levels,
            resolutions,
            min_cluster_size,
            generate_summaries,
        }
    }
}
impl Default for WeightOptimizationConfig {
    /// Uses the same helper values that serde applies to absent fields, with
    /// objective weights from their own `Default` impl.
    fn default() -> Self {
        let learning_rate = default_learning_rate();
        let max_iterations = default_max_iterations();
        let slope_window = default_slope_window();
        let stagnation_threshold = default_stagnation_threshold();
        let use_llm_eval = default_true();

        Self {
            learning_rate,
            max_iterations,
            slope_window,
            stagnation_threshold,
            use_llm_eval,
            objective_weights: Default::default(),
        }
    }
}
impl Default for ObjectiveWeightsConfig {
    /// Uses the same helper values that serde applies to absent fields.
    fn default() -> Self {
        let relevance = default_relevance_weight();
        let faithfulness = default_faithfulness_weight();
        let conciseness = default_conciseness_weight();

        Self {
            relevance,
            faithfulness,
            conciseness,
        }
    }
}
impl Config {
    /// Loads the configuration by layering sources, later ones overriding
    /// earlier ones:
    ///
    /// 1. `Config::default()`
    /// 2. `~/.graphrag/config.toml`, if a home directory is known and the file exists
    /// 3. `graphrag.toml` in the current directory, if it exists
    /// 4. environment variables prefixed with `GRAPHRAG_`
    ///
    /// # Errors
    ///
    /// Returns `GraphRAGError::Config` when the merged data cannot be
    /// extracted into a `Config`.
    ///
    /// NOTE(review): the env layer uses `split("_")`, which nests on every
    /// underscore. Variables aimed at keys that themselves contain `_`
    /// (e.g. `chunk_size`) presumably do not reach those fields — confirm
    /// against figment's `Env::split` documentation before relying on them.
    #[cfg(feature = "hierarchical-config")]
    pub fn load() -> Result<Self> {
        use figment::{
            providers::{Env, Format, Serialized, Toml},
            Figment,
        };

        let defaults = Figment::new().merge(Serialized::defaults(Config::default()));

        let user_file = dirs::home_dir().map(|home| home.join(".graphrag").join("config.toml"));
        let project_file = std::path::PathBuf::from("graphrag.toml");

        // User file first, then project file; only files that exist are merged.
        let layered = user_file
            .into_iter()
            .chain(std::iter::once(project_file))
            .filter(|candidate| candidate.exists())
            .fold(defaults, |acc, candidate| acc.merge(Toml::file(candidate)));

        layered
            .merge(Env::prefixed("GRAPHRAG_").split("_"))
            .extract()
            .map_err(|err| crate::core::GraphRAGError::Config {
                message: format!("Failed to load hierarchical configuration: {}", err),
            })
    }

    /// Without the `hierarchical-config` feature, loading simply returns
    /// `Config::default()`.
    #[cfg(not(feature = "hierarchical-config"))]
    pub fn load() -> Result<Self> {
        Ok(Self::default())
    }

    /// Reads `path` as text and parses it as a TOML `Config`.
    ///
    /// # Errors
    ///
    /// Propagates the read error via `?`, and returns `GraphRAGError::Config`
    /// when the TOML cannot be parsed into a `Config`.
    pub fn from_toml_file<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
        let raw = fs::read_to_string(path.as_ref())?;
        toml::from_str::<Config>(&raw).map_err(|err| crate::core::GraphRAGError::Config {
            message: format!("Failed to parse TOML config: {}", err),
        })
    }
}