use crate::Result;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::Path;
/// Root configuration document, as read by `SetConfig::from_file`.
///
/// Every field carries `#[serde(default)]`, so any section may be omitted from the
/// input: the optional per-approach pipelines become `None` and all other sections
/// fall back to their `Default` implementation.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SetConfig {
/// Approach selector; `mode.approach` is matched in `to_graphrag_config`.
#[serde(default)]
pub mode: ModeConfig,
/// Settings for the semantic pipeline; `None` when the section is absent.
#[serde(default)]
pub semantic: Option<SemanticPipelineConfig>,
/// Settings for the algorithmic pipeline; `None` when the section is absent.
#[serde(default)]
pub algorithmic: Option<AlgorithmicPipelineConfig>,
/// Settings for the hybrid pipeline; `None` when the section is absent.
#[serde(default)]
pub hybrid: Option<HybridPipelineConfig>,
/// General settings (log level, output directory, ...).
#[serde(default)]
pub general: GeneralConfig,
/// Workflow list and per-stage pipeline settings.
#[serde(default)]
pub pipeline: PipelineConfig,
/// Storage backend settings.
#[serde(default)]
pub storage: StorageConfig,
/// Model names and optional LLM parameters.
#[serde(default)]
pub models: ModelsConfig,
/// Batch/worker/memory settings.
#[serde(default)]
pub performance: PerformanceConfig,
/// Ollama connection and model settings.
#[serde(default)]
pub ollama: OllamaSetConfig,
/// GLiNER settings.
#[serde(default)]
pub gliner: GlinerSetConfig,
/// Experimental feature switches.
#[serde(default)]
pub experimental: ExperimentalConfig,
/// Top-level entity-extraction settings (separate from `pipeline.entity_extraction`).
#[serde(default)]
pub entity_extraction: EntityExtractionTopLevelConfig,
/// Auto-save settings.
#[serde(default)]
pub auto_save: AutoSaveSetConfig,
}
/// Auto-save settings. Every field is optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoSaveSetConfig {
/// Defaults to `false` when omitted.
#[serde(default)]
pub enabled: bool,
/// Optional base directory; `None` when omitted.
#[serde(default)]
pub base_dir: Option<String>,
/// Interval in seconds (per the field name); defaults to 300.
#[serde(default = "default_auto_save_interval")]
pub interval_seconds: u64,
/// Optional workspace name; `None` when omitted.
#[serde(default)]
pub workspace_name: Option<String>,
/// Defaults to 5.
#[serde(default = "default_max_auto_save_versions")]
pub max_versions: usize,
}
impl Default for AutoSaveSetConfig {
fn default() -> Self {
Self {
enabled: false,
base_dir: None,
interval_seconds: default_auto_save_interval(),
workspace_name: None,
max_versions: default_max_auto_save_versions(),
}
}
}
/// General settings. Every field is optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeneralConfig {
/// Defaults to `"info"`.
#[serde(default = "default_log_level")]
pub log_level: String,
/// Defaults to `"./output"`.
#[serde(default = "default_output_dir")]
pub output_dir: String,
/// Optional input document path; `None` when omitted.
#[serde(default)]
pub input_document_path: Option<String>,
/// Optional thread cap; `None` when omitted.
#[serde(default)]
pub max_threads: Option<usize>,
/// Defaults to `false`.
#[serde(default)]
pub enable_profiling: bool,
}
/// Pipeline settings: the workflow list plus one sub-section per stage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
/// Defaults to `extract_text`, `extract_entities`, `build_graph`, `detect_communities`.
#[serde(default = "default_workflows")]
pub workflows: Vec<String>,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub parallel_execution: bool,
/// Text-extraction stage settings; falls back to `TextExtractionConfig::default()`.
#[serde(default)]
pub text_extraction: TextExtractionConfig,
/// Entity-extraction stage settings; falls back to `EntityExtractionConfig::default()`.
#[serde(default)]
pub entity_extraction: EntityExtractionConfig,
/// Graph-building stage settings; falls back to `GraphBuildingConfig::default()`.
#[serde(default)]
pub graph_building: GraphBuildingConfig,
/// Community-detection stage settings; falls back to `CommunityDetectionConfig::default()`.
#[serde(default)]
pub community_detection: CommunityDetectionConfig,
}
/// Text-extraction (chunking) settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextExtractionConfig {
/// Defaults to 512. NOTE(review): unit (chars vs tokens) is not visible here — confirm.
#[serde(default = "default_chunk_size")]
pub chunk_size: usize,
/// Defaults to 64.
#[serde(default = "default_chunk_overlap")]
pub chunk_overlap: usize,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub clean_control_chars: bool,
/// Defaults to 50.
#[serde(default = "default_min_chunk_size")]
pub min_chunk_size: usize,
/// Optional cleaning options; `None` when omitted.
#[serde(default)]
pub cleaning: Option<CleaningConfig>,
}
/// Text-cleaning switches. Only `normalize_whitespace` defaults to `true`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CleaningConfig {
/// Defaults to `false`.
#[serde(default)]
pub remove_urls: bool,
/// Defaults to `false`.
#[serde(default)]
pub remove_emails: bool,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub normalize_whitespace: bool,
/// Defaults to `false`.
#[serde(default)]
pub remove_special_chars: bool,
}
/// Entity-extraction stage settings (`pipeline.entity_extraction`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityExtractionConfig {
/// Defaults to `"microsoft/DialoGPT-medium"`.
#[serde(default = "default_ner_model")]
pub model_name: String,
/// Defaults to 0.0 when the field is missing from the input.
#[serde(default = "default_extraction_temperature")]
pub temperature: f32,
/// Defaults to 2048.
#[serde(default = "default_max_tokens")]
pub max_tokens: usize,
/// Optional entity type list; when set, `to_graphrag_config` copies it into the runtime config.
pub entity_types: Option<Vec<String>>,
/// Defaults to 0.8.
#[serde(default = "default_confidence_threshold")]
pub confidence_threshold: f32,
/// Optional custom prompt.
pub custom_prompt: Option<String>,
/// Optional entity filters; `None` when omitted.
#[serde(default)]
pub filters: Option<EntityFiltersConfig>,
}
/// Optional filters for extracted entities.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityFiltersConfig {
/// Defaults to 3.
#[serde(default = "default_min_entity_length")]
pub min_entity_length: usize,
/// Defaults to 100.
#[serde(default = "default_max_entity_length")]
pub max_entity_length: usize,
/// Optional list of allowed entity types.
pub allowed_entity_types: Option<Vec<String>>,
/// Defaults to 0.8.
#[serde(default = "default_confidence_threshold")]
pub confidence_threshold: f32,
/// Optional allow-list patterns. NOTE(review): pattern syntax is not visible here — confirm.
pub allowed_patterns: Option<Vec<String>>,
/// Optional exclusion patterns.
pub excluded_patterns: Option<Vec<String>>,
/// Defaults to `false`.
#[serde(default)]
pub enable_fuzzy_matching: bool,
}
/// Graph-building stage settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GraphBuildingConfig {
/// Defaults to `"cosine_similarity"`.
#[serde(default = "default_relation_scorer")]
pub relation_scorer: String,
/// Defaults to 0.7.
#[serde(default = "default_min_relation_score")]
pub min_relation_score: f32,
/// Defaults to 10.
#[serde(default = "default_max_connections")]
pub max_connections_per_node: usize,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub bidirectional_relations: bool,
}
/// Community-detection stage settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CommunityDetectionConfig {
/// Defaults to `"leiden"`.
#[serde(default = "default_community_algorithm")]
pub algorithm: String,
/// Defaults to 1.0.
#[serde(default = "default_resolution")]
pub resolution: f32,
/// Defaults to 3.
#[serde(default = "default_min_community_size")]
pub min_community_size: usize,
/// Defaults to 0. NOTE(review): 0 presumably means "no upper bound" — confirm with consumers.
#[serde(default)]
pub max_community_size: usize,
}
/// Storage backend settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageConfig {
/// Defaults to `"sqlite"`.
#[serde(default = "default_database_type")]
pub database_type: String,
/// Defaults to `"./graphrag.db"`.
#[serde(default = "default_database_path")]
pub database_path: String,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub enable_wal: bool,
/// Optional PostgreSQL connection settings.
pub postgresql: Option<PostgreSQLConfig>,
/// Optional Neo4j connection settings.
pub neo4j: Option<Neo4jConfig>,
}
/// PostgreSQL connection settings. All fields except `pool_size` are required
/// whenever this section is present.
///
/// Note: `password` is a plain `String`, so the derived `Debug` and `Serialize`
/// implementations emit it verbatim.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PostgreSQLConfig {
pub host: String,
pub port: u16,
pub database: String,
pub username: String,
pub password: String,
/// Defaults to 10.
#[serde(default = "default_pool_size")]
pub pool_size: usize,
}
/// Neo4j connection settings. `uri`, `username` and `password` are required
/// whenever this section is present.
///
/// Note: `password` is a plain `String`, so the derived `Debug` and `Serialize`
/// implementations emit it verbatim.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Neo4jConfig {
pub uri: String,
pub username: String,
pub password: String,
/// Defaults to `false`.
#[serde(default)]
pub encrypted: bool,
}
/// Model selection settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelsConfig {
/// Defaults to `"gpt-4"`.
#[serde(default = "default_primary_llm")]
pub primary_llm: String,
/// Defaults to `"text-embedding-ada-002"`.
#[serde(default = "default_embedding_model")]
pub embedding_model: String,
/// Defaults to 4096.
#[serde(default = "default_max_context")]
pub max_context_length: usize,
/// Optional LLM sampling parameters; `None` when omitted.
#[serde(default)]
pub llm_params: Option<LLMParamsConfig>,
/// Optional local-model settings; `None` when omitted.
#[serde(default)]
pub local: Option<LocalModelsConfig>,
}
/// LLM sampling parameters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMParamsConfig {
/// Defaults to 0.1.
#[serde(default = "default_temperature")]
pub temperature: f32,
/// Defaults to 0.9.
#[serde(default = "default_top_p")]
pub top_p: f32,
/// Defaults to 0.0.
#[serde(default)]
pub frequency_penalty: f32,
/// Defaults to 0.0.
#[serde(default)]
pub presence_penalty: f32,
/// Optional stop sequences.
pub stop_sequences: Option<Vec<String>>,
}
/// Settings for locally served models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LocalModelsConfig {
/// Defaults to `"http://localhost:11434"`.
#[serde(default = "default_ollama_url")]
pub ollama_base_url: String,
/// Defaults to `"llama2:7b"`.
#[serde(default = "default_ollama_model")]
pub model_name: String,
/// Defaults to `"nomic-embed-text"`.
#[serde(default = "default_ollama_embedding")]
pub embedding_model: String,
}
/// Performance tuning settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceConfig {
/// Defaults to `true`.
#[serde(default = "default_true")]
pub batch_processing: bool,
/// Defaults to 100.
#[serde(default = "default_batch_size")]
pub batch_size: usize,
/// Defaults to 4.
#[serde(default = "default_worker_threads")]
pub worker_threads: usize,
/// Memory limit in MB (per the field name); defaults to 1024.
#[serde(default = "default_memory_limit")]
pub memory_limit_mb: usize,
}
/// Ollama connection and model settings.
///
/// Note: the four trailing `Option` fields deserialize to `None` when missing, whereas
/// `OllamaSetConfig::default()` sets `max_tokens` to `Some(800)` and `temperature`
/// to `Some(0.3)`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OllamaSetConfig {
/// Defaults to `true`.
#[serde(default = "default_true")]
pub enabled: bool,
/// Defaults to `"http://localhost"`.
#[serde(default = "default_ollama_host")]
pub host: String,
/// Defaults to 11434.
#[serde(default = "default_ollama_port")]
pub port: u16,
/// Defaults to `"llama3.1:8b"`.
#[serde(default = "default_chat_model")]
pub chat_model: String,
/// Defaults to `"nomic-embed-text"`.
#[serde(default = "default_embedding_model_ollama")]
pub embedding_model: String,
/// Timeout in seconds (per the field name); defaults to 60.
#[serde(default = "default_timeout")]
pub timeout_seconds: u64,
/// Defaults to 3.
#[serde(default = "default_max_retries")]
pub max_retries: u32,
/// Defaults to `false`.
#[serde(default)]
pub fallback_to_hash: bool,
pub max_tokens: Option<u32>,
pub temperature: Option<f32>,
pub keep_alive: Option<String>,
pub num_ctx: Option<u32>,
}
/// GLiNER settings. Every field is optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GlinerSetConfig {
/// Defaults to `false`.
#[serde(default)]
pub enabled: bool,
/// Defaults to the empty string.
#[serde(default)]
pub model_path: String,
/// Defaults to the empty string.
#[serde(default)]
pub tokenizer_path: String,
/// Defaults to `"span"`.
#[serde(default = "default_gliner_mode")]
pub mode: String,
/// Defaults to `person`, `organization`, `location`, `concept`.
#[serde(default = "default_gliner_entity_labels")]
pub entity_labels: Vec<String>,
/// Defaults to `related to`, `part of`, `causes`.
#[serde(default = "default_gliner_relation_labels")]
pub relation_labels: Vec<String>,
/// Defaults to 0.4.
#[serde(default = "default_entity_threshold")]
pub entity_threshold: f32,
/// Defaults to 0.5.
#[serde(default = "default_relation_threshold")]
pub relation_threshold: f32,
/// Defaults to `false`.
#[serde(default)]
pub use_gpu: bool,
/// Optional concurrency cap; `None` when omitted.
#[serde(default)]
pub max_concurrent_chunks: Option<usize>,
}
/// Default GLiNER mode: `"span"`.
fn default_gliner_mode() -> String {
    String::from("span")
}

/// Default GLiNER entity labels.
fn default_gliner_entity_labels() -> Vec<String> {
    ["person", "organization", "location", "concept"]
        .iter()
        .copied()
        .map(String::from)
        .collect()
}

/// Default GLiNER relation labels.
fn default_gliner_relation_labels() -> Vec<String> {
    ["related to", "part of", "causes"]
        .iter()
        .copied()
        .map(String::from)
        .collect()
}

/// Default GLiNER entity threshold: 0.4.
fn default_entity_threshold() -> f32 {
    0.4_f32
}

/// Default GLiNER relation threshold: 0.5.
fn default_relation_threshold() -> f32 {
    0.5_f32
}
impl Default for GlinerSetConfig {
fn default() -> Self {
Self {
enabled: false,
model_path: String::new(),
tokenizer_path: String::new(),
mode: default_gliner_mode(),
entity_labels: default_gliner_entity_labels(),
relation_labels: default_gliner_relation_labels(),
entity_threshold: default_entity_threshold(),
relation_threshold: default_relation_threshold(),
use_gpu: false,
max_concurrent_chunks: None,
}
}
}
/// Experimental feature switches. The derived `Default` leaves every flag `false`
/// and both optional sub-configs `None`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ExperimentalConfig {
#[serde(default)]
pub neural_reranking: bool,
#[serde(default)]
pub federated_learning: bool,
#[serde(default)]
pub real_time_updates: bool,
#[serde(default)]
pub distributed_processing: bool,
#[serde(default)]
pub lazy_graphrag: bool,
#[serde(default)]
pub e2_graphrag: bool,
/// Optional settings for the `lazy_graphrag` feature.
#[serde(default)]
pub lazy_graphrag_config: Option<LazyGraphRAGConfig>,
/// Optional settings for the `e2_graphrag` feature.
#[serde(default)]
pub e2_graphrag_config: Option<E2GraphRAGConfig>,
}
/// Settings for the experimental LazyGraphRAG mode. Every field is optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LazyGraphRAGConfig {
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_concept_extraction: bool,
/// Defaults to 3.
#[serde(default = "default_min_concept_length")]
pub min_concept_length: usize,
/// Defaults to 5.
#[serde(default = "default_max_concept_words")]
pub max_concept_words: usize,
/// Defaults to 1.
#[serde(default = "default_co_occurrence_threshold")]
pub co_occurrence_threshold: usize,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_query_refinement: bool,
/// Defaults to 3.
#[serde(default = "default_max_refinement_iterations")]
pub max_refinement_iterations: usize,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_bidirectional_index: bool,
}
impl Default for LazyGraphRAGConfig {
fn default() -> Self {
Self {
use_concept_extraction: true,
min_concept_length: 3,
max_concept_words: 5,
co_occurrence_threshold: 1,
use_query_refinement: true,
max_refinement_iterations: 3,
use_bidirectional_index: true,
}
}
}
/// Settings for the experimental E2GraphRAG mode. Every field is optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct E2GraphRAGConfig {
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_lightweight_ner: bool,
/// Defaults to `PERSON`, `ORGANIZATION`, `LOCATION`, `CONCEPT`.
#[serde(default = "default_e2_entity_types")]
pub entity_types: Vec<String>,
/// Defaults to 0.6.
#[serde(default = "default_e2_min_confidence")]
pub min_confidence: f32,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_capitalization_detection: bool,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_noun_phrase_extraction: bool,
/// Defaults to 1.
#[serde(default = "default_min_entity_frequency")]
pub min_entity_frequency: usize,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_fast_cooccurrence: bool,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_bidirectional_index: bool,
}
impl Default for E2GraphRAGConfig {
fn default() -> Self {
Self {
use_lightweight_ner: true,
entity_types: default_e2_entity_types(),
min_confidence: 0.6,
use_capitalization_detection: true,
use_noun_phrase_extraction: true,
min_entity_frequency: 1,
use_fast_cooccurrence: true,
use_bidirectional_index: true,
}
}
}
/// Selects the processing approach.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModeConfig {
/// Approach name; defaults to `"semantic"`. `to_graphrag_config` matches on this string.
#[serde(default = "default_approach")]
pub approach: String,
}
impl Default for ModeConfig {
fn default() -> Self {
Self {
approach: default_approach(),
}
}
}
/// Semantic pipeline settings.
///
/// The four sub-sections carry no `#[serde(default)]`, so they must all be present
/// whenever this section appears in the input. Note that a missing `enabled` deserializes
/// to `false`, while `SemanticPipelineConfig::default()` sets it to `true`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticPipelineConfig {
#[serde(default)]
pub enabled: bool,
pub embeddings: SemanticEmbeddingsConfig,
pub entity_extraction: SemanticEntityConfig,
pub retrieval: SemanticRetrievalConfig,
pub graph_construction: SemanticGraphConfig,
}
/// Embedding settings for the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticEmbeddingsConfig {
/// Defaults to `"huggingface"`.
#[serde(default = "default_semantic_embedding_backend")]
pub backend: String,
/// Defaults to `"sentence-transformers/all-MiniLM-L6-v2"`.
#[serde(default = "default_semantic_embedding_model")]
pub model: String,
/// Defaults to 384.
#[serde(default = "default_semantic_embedding_dim")]
pub dimension: usize,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_gpu: bool,
/// Defaults to `"cosine"`.
#[serde(default = "default_similarity_metric")]
pub similarity_metric: String,
/// Defaults to 100 (shared with `PerformanceConfig::batch_size`).
#[serde(default = "default_batch_size")]
pub batch_size: usize,
}
/// Entity-extraction settings for the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticEntityConfig {
/// Defaults to `"llm"`.
#[serde(default = "default_semantic_entity_method")]
pub method: String,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_gleaning: bool,
/// Defaults to 3.
#[serde(default = "default_max_gleaning_rounds")]
pub max_gleaning_rounds: usize,
/// Defaults to `"llama3.1:8b"` (the Ollama chat-model default).
#[serde(default = "default_chat_model")]
pub model: String,
/// Defaults to 0.1.
#[serde(default = "default_semantic_temperature")]
pub temperature: f32,
/// Defaults to 0.7.
#[serde(default = "default_semantic_confidence")]
pub confidence_threshold: f32,
}
/// Retrieval settings for the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticRetrievalConfig {
/// Defaults to `"vector"`.
#[serde(default = "default_semantic_retrieval_strategy")]
pub strategy: String,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_hnsw: bool,
/// Defaults to 200.
#[serde(default = "default_hnsw_ef_construction")]
pub hnsw_ef_construction: usize,
/// Defaults to 16.
#[serde(default = "default_hnsw_m")]
pub hnsw_m: usize,
/// Defaults to 10.
#[serde(default = "default_top_k")]
pub top_k: usize,
/// Defaults to 0.7.
#[serde(default = "default_semantic_similarity_threshold")]
pub similarity_threshold: f32,
}
/// Graph-construction settings for the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticGraphConfig {
/// Defaults to `"embedding_similarity"`.
#[serde(default = "default_semantic_relation_scorer")]
pub relation_scorer: String,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_transformer_embeddings: bool,
/// Defaults to 0.7.
#[serde(default = "default_min_relation_score")]
pub min_relation_score: f32,
}
/// Algorithmic pipeline settings.
///
/// The four sub-sections carry no `#[serde(default)]`, so they must all be present
/// whenever this section appears in the input. The derived `Default` leaves `enabled`
/// as `false` and uses each sub-section's own `Default`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct AlgorithmicPipelineConfig {
#[serde(default)]
pub enabled: bool,
pub embeddings: AlgorithmicEmbeddingsConfig,
pub entity_extraction: AlgorithmicEntityConfig,
pub retrieval: AlgorithmicRetrievalConfig,
pub graph_construction: AlgorithmicGraphConfig,
}
/// Embedding settings for the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicEmbeddingsConfig {
/// Defaults to `"hash"`.
#[serde(default = "default_algorithmic_embedding_backend")]
pub backend: String,
/// Defaults to 128.
#[serde(default = "default_algorithmic_embedding_dim")]
pub dimension: usize,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_tfidf: bool,
/// Defaults to 10000.
#[serde(default = "default_vocabulary_size")]
pub vocabulary_size: usize,
/// Defaults to 2.
#[serde(default = "default_min_term_frequency")]
pub min_term_frequency: usize,
/// Defaults to 0.8.
#[serde(default = "default_max_document_frequency")]
pub max_document_frequency: f32,
}
/// Entity-extraction settings for the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicEntityConfig {
/// Defaults to `"pattern"`.
#[serde(default = "default_algorithmic_entity_method")]
pub method: String,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_ner_rules: bool,
/// Defaults to `false`.
#[serde(default)]
pub use_pos_tagging: bool,
/// Defaults to 3.
#[serde(default = "default_min_entity_length")]
pub min_entity_length: usize,
/// Defaults to 0.75.
#[serde(default = "default_algorithmic_confidence")]
pub confidence_threshold: f32,
/// Optional pattern list. NOTE(review): pattern syntax is not visible here — confirm.
pub patterns: Option<Vec<String>>,
}
/// Retrieval settings for the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicRetrievalConfig {
/// Defaults to `"bm25"`.
#[serde(default = "default_algorithmic_retrieval_strategy")]
pub strategy: String,
/// Defaults to 1.5 (from `default_bm25_k1`).
#[serde(default = "default_bm25_k1")]
pub k1: f32,
/// Defaults to 0.75 (from `default_bm25_b`).
#[serde(default = "default_bm25_b")]
pub b: f32,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_stemming: bool,
/// Defaults to `"english"`.
#[serde(default = "default_language")]
pub language: String,
/// Defaults to 10.
#[serde(default = "default_top_k")]
pub top_k: usize,
}
/// Graph-construction settings for the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicGraphConfig {
/// Defaults to `"jaccard"`.
#[serde(default = "default_algorithmic_relation_scorer")]
pub relation_scorer: String,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_cooccurrence: bool,
/// Defaults to 10.
#[serde(default = "default_cooccurrence_window")]
pub window_size: usize,
/// Defaults to 0.6.
#[serde(default = "default_algorithmic_min_relation_score")]
pub min_relation_score: f32,
}
/// Hybrid pipeline settings.
///
/// The five sub-sections carry no `#[serde(default)]`, so they must all be present
/// whenever this section appears in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridPipelineConfig {
/// Defaults to `false`.
#[serde(default)]
pub enabled: bool,
pub weights: HybridWeightsConfig,
pub embeddings: HybridEmbeddingsConfig,
pub entity_extraction: HybridEntityConfig,
pub retrieval: HybridRetrievalConfig,
pub graph_construction: HybridGraphConfig,
/// Defaults to `"semantic_first"`.
#[serde(default = "default_hybrid_fallback_strategy")]
pub fallback_strategy: String,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub cross_validation: bool,
}
/// Relative weights of the semantic and algorithmic sides in the hybrid pipeline.
/// Nothing in this struct enforces that the two weights sum to 1.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridWeightsConfig {
/// Defaults to 0.6.
#[serde(default = "default_hybrid_semantic_weight")]
pub semantic_weight: f32,
/// Defaults to 0.4.
#[serde(default = "default_hybrid_algorithmic_weight")]
pub algorithmic_weight: f32,
}
/// Embedding settings for the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridEmbeddingsConfig {
/// Defaults to `"huggingface"` (the semantic backend default).
#[serde(default = "default_semantic_embedding_backend")]
pub primary: String,
/// Defaults to `"hash"` (the algorithmic backend default).
#[serde(default = "default_algorithmic_embedding_backend")]
pub fallback: String,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub combine_scores: bool,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub auto_fallback: bool,
}
/// Entity-extraction settings for the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridEntityConfig {
/// Defaults to `true`.
#[serde(default = "default_true")]
pub use_both: bool,
/// Defaults to 0.7.
#[serde(default = "default_hybrid_llm_weight")]
pub llm_weight: f32,
/// Defaults to 0.3.
#[serde(default = "default_hybrid_pattern_weight")]
pub pattern_weight: f32,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub cross_validate: bool,
/// Defaults to 0.15.
#[serde(default = "default_hybrid_confidence_boost")]
pub confidence_boost: f32,
}
/// Retrieval settings for the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridRetrievalConfig {
/// Defaults to `"fusion"`.
#[serde(default = "default_hybrid_retrieval_strategy")]
pub strategy: String,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub combine_vector_bm25: bool,
/// Defaults to 0.6.
#[serde(default = "default_hybrid_vector_weight")]
pub vector_weight: f32,
/// Defaults to 0.4.
#[serde(default = "default_hybrid_bm25_weight")]
pub bm25_weight: f32,
/// Defaults to 60.
#[serde(default = "default_rrf_constant")]
pub rrf_constant: usize,
}
/// Graph-construction settings for the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridGraphConfig {
/// Defaults to `"embedding_similarity"` (the semantic scorer default).
#[serde(default = "default_semantic_relation_scorer")]
pub primary_scorer: String,
/// Defaults to `"jaccard"` (the algorithmic scorer default).
#[serde(default = "default_algorithmic_relation_scorer")]
pub fallback_scorer: String,
/// Defaults to `true`.
#[serde(default = "default_true")]
pub combine_scores: bool,
}
/// Top-level entity-extraction settings (`SetConfig::entity_extraction`).
/// Every field is optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityExtractionTopLevelConfig {
/// Defaults to `true`.
#[serde(default = "default_true")]
pub enabled: bool,
/// Defaults to 0.8; copied into the runtime config by `to_graphrag_config`.
#[serde(default = "default_confidence_threshold")]
pub min_confidence: f32,
/// Defaults to `false`.
#[serde(default)]
pub use_gleaning: bool,
/// Defaults to 3.
#[serde(default = "default_gleaning_rounds")]
pub max_gleaning_rounds: usize,
/// Defaults to 0.1.
#[serde(default = "default_gleaning_improvement")]
pub gleaning_improvement_threshold: f32,
/// Defaults to `false`.
#[serde(default)]
pub semantic_merging: bool,
/// Defaults to 0.85.
#[serde(default = "default_merge_threshold")]
pub merge_similarity_threshold: f32,
/// Defaults to `false`.
#[serde(default)]
pub automatic_linking: bool,
/// Defaults to 0.8.
#[serde(default = "default_confidence_threshold")]
pub linking_confidence_threshold: f32,
}
impl Default for EntityExtractionTopLevelConfig {
fn default() -> Self {
Self {
enabled: true,
min_confidence: default_confidence_threshold(),
use_gleaning: false,
max_gleaning_rounds: default_gleaning_rounds(),
gleaning_improvement_threshold: default_gleaning_improvement(),
semantic_merging: false,
merge_similarity_threshold: default_merge_threshold(),
automatic_linking: false,
linking_confidence_threshold: default_confidence_threshold(),
}
}
}
/// Default log level: `"info"`.
fn default_log_level() -> String {
    String::from("info")
}

/// Default output directory: `"./output"`.
fn default_output_dir() -> String {
    String::from("./output")
}

/// Serde helper for boolean fields that default to `true`.
fn default_true() -> bool {
    true
}

/// Default workflow list, in order.
fn default_workflows() -> Vec<String> {
    [
        "extract_text",
        "extract_entities",
        "build_graph",
        "detect_communities",
    ]
    .iter()
    .copied()
    .map(String::from)
    .collect()
}
/// Default chunk size: 512.
fn default_chunk_size() -> usize {
    512_usize
}

/// Default chunk overlap: 64.
fn default_chunk_overlap() -> usize {
    64_usize
}

/// Default minimum chunk size: 50.
fn default_min_chunk_size() -> usize {
    50_usize
}

/// Default model name for pipeline entity extraction.
fn default_ner_model() -> String {
    String::from("microsoft/DialoGPT-medium")
}

/// Default LLM temperature: 0.1.
fn default_temperature() -> f32 {
    0.1_f32
}

/// Default temperature for pipeline entity extraction: 0.0.
fn default_extraction_temperature() -> f32 {
    0.0_f32
}

/// Default maximum token count: 2048.
fn default_max_tokens() -> usize {
    2048_usize
}

/// Default minimum entity length: 3.
fn default_min_entity_length() -> usize {
    3_usize
}

/// Default maximum entity length: 100.
fn default_max_entity_length() -> usize {
    100_usize
}

/// Default confidence threshold: 0.8.
fn default_confidence_threshold() -> f32 {
    0.8_f32
}
/// Default relation scorer: `"cosine_similarity"`.
fn default_relation_scorer() -> String {
    String::from("cosine_similarity")
}

/// Default minimum relation score: 0.7.
fn default_min_relation_score() -> f32 {
    0.7_f32
}

/// Default maximum connections per node: 10.
fn default_max_connections() -> usize {
    10_usize
}

/// Default community-detection algorithm: `"leiden"`.
fn default_community_algorithm() -> String {
    String::from("leiden")
}

/// Default community-detection resolution: 1.0.
fn default_resolution() -> f32 {
    1.0_f32
}

/// Default minimum community size: 3.
fn default_min_community_size() -> usize {
    3_usize
}
/// Default database type: `"sqlite"`.
fn default_database_type() -> String {
    String::from("sqlite")
}

/// Default database path: `"./graphrag.db"`.
fn default_database_path() -> String {
    String::from("./graphrag.db")
}

/// Default PostgreSQL pool size: 10.
fn default_pool_size() -> usize {
    10_usize
}

/// Default primary LLM: `"gpt-4"`.
fn default_primary_llm() -> String {
    String::from("gpt-4")
}

/// Default embedding model: `"text-embedding-ada-002"`.
fn default_embedding_model() -> String {
    String::from("text-embedding-ada-002")
}

/// Default maximum context length: 4096.
fn default_max_context() -> usize {
    4096_usize
}

/// Default `top_p`: 0.9.
fn default_top_p() -> f32 {
    0.9_f32
}
/// Default Ollama base URL for local models.
fn default_ollama_url() -> String {
    String::from("http://localhost:11434")
}

/// Default local model name: `"llama2:7b"`.
fn default_ollama_model() -> String {
    String::from("llama2:7b")
}

/// Default local embedding model: `"nomic-embed-text"`.
fn default_ollama_embedding() -> String {
    String::from("nomic-embed-text")
}

/// Default batch size: 100.
fn default_batch_size() -> usize {
    100_usize
}

/// Default worker thread count: 4.
fn default_worker_threads() -> usize {
    4_usize
}

/// Default memory limit: 1024.
fn default_memory_limit() -> usize {
    1024_usize
}

/// Default Ollama host: `"http://localhost"`.
fn default_ollama_host() -> String {
    String::from("http://localhost")
}

/// Default Ollama port: 11434.
fn default_ollama_port() -> u16 {
    11434_u16
}

/// Default chat model: `"llama3.1:8b"`.
fn default_chat_model() -> String {
    String::from("llama3.1:8b")
}

/// Default Ollama embedding model: `"nomic-embed-text"`.
fn default_embedding_model_ollama() -> String {
    String::from("nomic-embed-text")
}

/// Default timeout: 60.
fn default_timeout() -> u64 {
    60_u64
}

/// Default retry count: 3.
fn default_max_retries() -> u32 {
    3_u32
}
/// Default maximum gleaning rounds for top-level entity extraction: 3.
fn default_gleaning_rounds() -> usize {
    3_usize
}

/// Default gleaning improvement threshold: 0.1.
fn default_gleaning_improvement() -> f32 {
    0.1_f32
}

/// Default merge similarity threshold: 0.85.
fn default_merge_threshold() -> f32 {
    0.85_f32
}

/// Default approach: `"semantic"`.
fn default_approach() -> String {
    String::from("semantic")
}
/// Default semantic embedding backend: `"huggingface"`.
fn default_semantic_embedding_backend() -> String {
    String::from("huggingface")
}

/// Default semantic embedding model.
fn default_semantic_embedding_model() -> String {
    String::from("sentence-transformers/all-MiniLM-L6-v2")
}

/// Default semantic embedding dimension: 384.
fn default_semantic_embedding_dim() -> usize {
    384_usize
}

/// Default similarity metric: `"cosine"`.
fn default_similarity_metric() -> String {
    String::from("cosine")
}

/// Default semantic entity-extraction method: `"llm"`.
fn default_semantic_entity_method() -> String {
    String::from("llm")
}

/// Default maximum gleaning rounds for the semantic pipeline: 3.
fn default_max_gleaning_rounds() -> usize {
    3_usize
}

/// Default semantic temperature: 0.1.
fn default_semantic_temperature() -> f32 {
    0.1_f32
}

/// Default semantic confidence threshold: 0.7.
fn default_semantic_confidence() -> f32 {
    0.7_f32
}

/// Default semantic retrieval strategy: `"vector"`.
fn default_semantic_retrieval_strategy() -> String {
    String::from("vector")
}

/// Default HNSW `ef_construction`: 200.
fn default_hnsw_ef_construction() -> usize {
    200_usize
}

/// Default HNSW `m`: 16.
fn default_hnsw_m() -> usize {
    16_usize
}

/// Default `top_k`: 10.
fn default_top_k() -> usize {
    10_usize
}

/// Default semantic similarity threshold: 0.7.
fn default_semantic_similarity_threshold() -> f32 {
    0.7_f32
}

/// Default semantic relation scorer: `"embedding_similarity"`.
fn default_semantic_relation_scorer() -> String {
    String::from("embedding_similarity")
}
/// Default algorithmic embedding backend: `"hash"`.
fn default_algorithmic_embedding_backend() -> String {
    String::from("hash")
}

/// Default algorithmic embedding dimension: 128.
fn default_algorithmic_embedding_dim() -> usize {
    128_usize
}

/// Default vocabulary size: 10000.
fn default_vocabulary_size() -> usize {
    10_000_usize
}

/// Default minimum term frequency: 2.
fn default_min_term_frequency() -> usize {
    2_usize
}

/// Default maximum document frequency: 0.8.
fn default_max_document_frequency() -> f32 {
    0.8_f32
}

/// Default algorithmic entity-extraction method: `"pattern"`.
fn default_algorithmic_entity_method() -> String {
    String::from("pattern")
}

/// Default algorithmic confidence threshold: 0.75.
fn default_algorithmic_confidence() -> f32 {
    0.75_f32
}

/// Default algorithmic retrieval strategy: `"bm25"`.
fn default_algorithmic_retrieval_strategy() -> String {
    String::from("bm25")
}

/// Default `k1`: 1.5.
fn default_bm25_k1() -> f32 {
    1.5_f32
}

/// Default `b`: 0.75.
fn default_bm25_b() -> f32 {
    0.75_f32
}

/// Default language: `"english"`.
fn default_language() -> String {
    String::from("english")
}

/// Default algorithmic relation scorer: `"jaccard"`.
fn default_algorithmic_relation_scorer() -> String {
    String::from("jaccard")
}

/// Default co-occurrence window size: 10.
fn default_cooccurrence_window() -> usize {
    10_usize
}

/// Default algorithmic minimum relation score: 0.6.
fn default_algorithmic_min_relation_score() -> f32 {
    0.6_f32
}
/// Default hybrid semantic weight: 0.6.
fn default_hybrid_semantic_weight() -> f32 {
    0.6_f32
}

/// Default hybrid algorithmic weight: 0.4.
fn default_hybrid_algorithmic_weight() -> f32 {
    0.4_f32
}

/// Default hybrid LLM weight: 0.7.
fn default_hybrid_llm_weight() -> f32 {
    0.7_f32
}

/// Default hybrid pattern weight: 0.3.
fn default_hybrid_pattern_weight() -> f32 {
    0.3_f32
}

/// Default hybrid confidence boost: 0.15.
fn default_hybrid_confidence_boost() -> f32 {
    0.15_f32
}

/// Default hybrid retrieval strategy: `"fusion"`.
fn default_hybrid_retrieval_strategy() -> String {
    String::from("fusion")
}

/// Default hybrid vector weight: 0.6.
fn default_hybrid_vector_weight() -> f32 {
    0.6_f32
}

/// Default hybrid BM25 weight: 0.4.
fn default_hybrid_bm25_weight() -> f32 {
    0.4_f32
}

/// Default RRF constant: 60.
fn default_rrf_constant() -> usize {
    60_usize
}

/// Default hybrid fallback strategy: `"semantic_first"`.
fn default_hybrid_fallback_strategy() -> String {
    String::from("semantic_first")
}
/// Default auto-save interval: 300 (five minutes, expressed in seconds).
fn default_auto_save_interval() -> u64 {
    5 * 60
}

/// Default number of auto-save versions: 5.
fn default_max_auto_save_versions() -> usize {
    5_usize
}

/// Default minimum concept length: 3.
fn default_min_concept_length() -> usize {
    3_usize
}

/// Default maximum words per concept: 5.
fn default_max_concept_words() -> usize {
    5_usize
}

/// Default co-occurrence threshold: 1.
fn default_co_occurrence_threshold() -> usize {
    1_usize
}

/// Default maximum refinement iterations: 3.
fn default_max_refinement_iterations() -> usize {
    3_usize
}

/// Default E2GraphRAG entity types.
fn default_e2_entity_types() -> Vec<String> {
    ["PERSON", "ORGANIZATION", "LOCATION", "CONCEPT"]
        .iter()
        .copied()
        .map(String::from)
        .collect()
}

/// Default E2GraphRAG minimum confidence: 0.6.
fn default_e2_min_confidence() -> f32 {
    0.6_f32
}

/// Default minimum entity frequency: 1.
fn default_min_entity_frequency() -> usize {
    1_usize
}
impl Default for GeneralConfig {
fn default() -> Self {
Self {
log_level: default_log_level(),
output_dir: default_output_dir(),
input_document_path: None,
max_threads: None,
enable_profiling: false,
}
}
}
impl Default for PipelineConfig {
fn default() -> Self {
Self {
workflows: default_workflows(),
parallel_execution: default_true(),
text_extraction: TextExtractionConfig::default(),
entity_extraction: EntityExtractionConfig::default(),
graph_building: GraphBuildingConfig::default(),
community_detection: CommunityDetectionConfig::default(),
}
}
}
impl Default for TextExtractionConfig {
fn default() -> Self {
Self {
chunk_size: default_chunk_size(),
chunk_overlap: default_chunk_overlap(),
clean_control_chars: default_true(),
min_chunk_size: default_min_chunk_size(),
cleaning: None,
}
}
}
impl Default for EntityExtractionConfig {
fn default() -> Self {
Self {
model_name: default_ner_model(),
temperature: default_temperature(),
max_tokens: default_max_tokens(),
entity_types: None,
confidence_threshold: default_confidence_threshold(),
custom_prompt: None,
filters: None,
}
}
}
impl Default for GraphBuildingConfig {
fn default() -> Self {
Self {
relation_scorer: default_relation_scorer(),
min_relation_score: default_min_relation_score(),
max_connections_per_node: default_max_connections(),
bidirectional_relations: default_true(),
}
}
}
impl Default for CommunityDetectionConfig {
fn default() -> Self {
Self {
algorithm: default_community_algorithm(),
resolution: default_resolution(),
min_community_size: default_min_community_size(),
max_community_size: 0,
}
}
}
impl Default for StorageConfig {
fn default() -> Self {
Self {
database_type: default_database_type(),
database_path: default_database_path(),
enable_wal: default_true(),
postgresql: None,
neo4j: None,
}
}
}
impl Default for ModelsConfig {
fn default() -> Self {
Self {
primary_llm: default_primary_llm(),
embedding_model: default_embedding_model(),
max_context_length: default_max_context(),
llm_params: None,
local: None,
}
}
}
impl Default for PerformanceConfig {
fn default() -> Self {
Self {
batch_processing: default_true(),
batch_size: default_batch_size(),
worker_threads: default_worker_threads(),
memory_limit_mb: default_memory_limit(),
}
}
}
impl Default for OllamaSetConfig {
fn default() -> Self {
Self {
enabled: default_true(),
host: default_ollama_host(),
port: default_ollama_port(),
chat_model: default_chat_model(),
embedding_model: default_embedding_model_ollama(),
timeout_seconds: default_timeout(),
max_retries: default_max_retries(),
fallback_to_hash: false,
max_tokens: Some(800),
temperature: Some(0.3),
keep_alive: None,
num_ctx: None,
}
}
}
impl Default for SemanticPipelineConfig {
fn default() -> Self {
Self {
enabled: true,
embeddings: SemanticEmbeddingsConfig::default(),
entity_extraction: SemanticEntityConfig::default(),
retrieval: SemanticRetrievalConfig::default(),
graph_construction: SemanticGraphConfig::default(),
}
}
}
impl Default for SemanticEmbeddingsConfig {
fn default() -> Self {
Self {
backend: default_semantic_embedding_backend(),
model: default_semantic_embedding_model(),
dimension: default_semantic_embedding_dim(),
use_gpu: default_true(),
similarity_metric: default_similarity_metric(),
batch_size: default_batch_size(),
}
}
}
impl Default for SemanticEntityConfig {
fn default() -> Self {
Self {
method: default_semantic_entity_method(),
use_gleaning: default_true(),
max_gleaning_rounds: default_max_gleaning_rounds(),
model: default_chat_model(),
temperature: default_semantic_temperature(),
confidence_threshold: default_semantic_confidence(),
}
}
}
impl Default for SemanticRetrievalConfig {
fn default() -> Self {
Self {
strategy: default_semantic_retrieval_strategy(),
use_hnsw: default_true(),
hnsw_ef_construction: default_hnsw_ef_construction(),
hnsw_m: default_hnsw_m(),
top_k: default_top_k(),
similarity_threshold: default_semantic_similarity_threshold(),
}
}
}
impl Default for SemanticGraphConfig {
fn default() -> Self {
Self {
relation_scorer: default_semantic_relation_scorer(),
use_transformer_embeddings: default_true(),
min_relation_score: default_min_relation_score(),
}
}
}
impl Default for AlgorithmicEmbeddingsConfig {
fn default() -> Self {
Self {
backend: default_algorithmic_embedding_backend(),
dimension: default_algorithmic_embedding_dim(),
use_tfidf: default_true(),
vocabulary_size: default_vocabulary_size(),
min_term_frequency: default_min_term_frequency(),
max_document_frequency: default_max_document_frequency(),
}
}
}
impl Default for AlgorithmicEntityConfig {
fn default() -> Self {
Self {
method: default_algorithmic_entity_method(),
use_ner_rules: default_true(),
use_pos_tagging: false,
min_entity_length: default_min_entity_length(),
confidence_threshold: default_algorithmic_confidence(),
patterns: None,
}
}
}
impl Default for AlgorithmicRetrievalConfig {
fn default() -> Self {
Self {
strategy: default_algorithmic_retrieval_strategy(),
k1: default_bm25_k1(),
b: default_bm25_b(),
use_stemming: default_true(),
language: default_language(),
top_k: default_top_k(),
}
}
}
impl Default for AlgorithmicGraphConfig {
fn default() -> Self {
Self {
relation_scorer: default_algorithmic_relation_scorer(),
use_cooccurrence: default_true(),
window_size: default_cooccurrence_window(),
min_relation_score: default_algorithmic_min_relation_score(),
}
}
}
impl Default for HybridPipelineConfig {
fn default() -> Self {
Self {
enabled: false,
weights: HybridWeightsConfig::default(),
embeddings: HybridEmbeddingsConfig::default(),
entity_extraction: HybridEntityConfig::default(),
retrieval: HybridRetrievalConfig::default(),
graph_construction: HybridGraphConfig::default(),
fallback_strategy: default_hybrid_fallback_strategy(),
cross_validation: default_true(),
}
}
}
impl Default for HybridWeightsConfig {
fn default() -> Self {
Self {
semantic_weight: default_hybrid_semantic_weight(),
algorithmic_weight: default_hybrid_algorithmic_weight(),
}
}
}
impl Default for HybridEmbeddingsConfig {
fn default() -> Self {
Self {
primary: default_semantic_embedding_backend(),
fallback: default_algorithmic_embedding_backend(),
combine_scores: default_true(),
auto_fallback: default_true(),
}
}
}
impl Default for HybridEntityConfig {
fn default() -> Self {
Self {
use_both: default_true(),
llm_weight: default_hybrid_llm_weight(),
pattern_weight: default_hybrid_pattern_weight(),
cross_validate: default_true(),
confidence_boost: default_hybrid_confidence_boost(),
}
}
}
impl Default for HybridRetrievalConfig {
fn default() -> Self {
Self {
strategy: default_hybrid_retrieval_strategy(),
combine_vector_bm25: default_true(),
vector_weight: default_hybrid_vector_weight(),
bm25_weight: default_hybrid_bm25_weight(),
rrf_constant: default_rrf_constant(),
}
}
}
impl Default for HybridGraphConfig {
fn default() -> Self {
Self {
primary_scorer: default_semantic_relation_scorer(),
fallback_scorer: default_algorithmic_relation_scorer(),
combine_scores: default_true(),
}
}
}
impl SetConfig {
/// Loads a `SetConfig` from `path`, choosing the parser by file extension.
///
/// Files whose extension is `json5` or `json` (compared case-insensitively) are parsed
/// as JSON5 when the `json5-support` feature is enabled. Any other extension, or no
/// extension at all, is parsed as TOML.
///
/// # Errors
///
/// Returns an error if the file cannot be read, and `GraphRAGError::Config` if
/// parsing fails or if a JSON/JSON5 file is given while the crate was built
/// without the `json5-support` feature.
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
    let path_ref = path.as_ref();
    let content = fs::read_to_string(path_ref)?;
    // Lowercase the extension so that e.g. `config.JSON` is not silently routed to the
    // TOML parser, where it would fail with a misleading TOML parse error.
    let extension = path_ref
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();
    let config: SetConfig = match extension.as_str() {
        #[cfg(feature = "json5-support")]
        "json5" | "json" => {
            json5::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
                message: format!("JSON5 parse error: {e}"),
            })?
        },
        #[cfg(not(feature = "json5-support"))]
        "json5" | "json" => {
            return Err(crate::core::GraphRAGError::Config {
                message: "JSON5 support not enabled. Rebuild with --features json5-support"
                    .to_string(),
            });
        },
        // Everything else, including a missing extension, is treated as TOML.
        _ => toml::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
            message: format!("TOML parse error: {e}"),
        })?,
    };
    Ok(config)
}
pub fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
let toml_string =
toml::to_string_pretty(&self).map_err(|e| crate::core::GraphRAGError::Config {
message: format!("TOML serialize error: {e}"),
})?;
let commented_toml = format!(
"# =============================================================================\n\
# GraphRAG Configuration File\n\
# Complete configuration with extensive parameters for easy customization\n\
# =============================================================================\n\n{toml_string}"
);
fs::write(path, commented_toml)?;
Ok(())
}
pub fn to_graphrag_config(&self) -> crate::Config {
let mut config = crate::Config {
approach: self.mode.approach.clone(),
..Default::default()
};
config.text.chunk_size = self.pipeline.text_extraction.chunk_size;
config.text.chunk_overlap = self.pipeline.text_extraction.chunk_overlap;
config.entities.min_confidence = self.entity_extraction.min_confidence;
if let Some(ref types) = self.pipeline.entity_extraction.entity_types {
config.entities.entity_types = types.clone();
}
match self.mode.approach.as_str() {
"semantic" => {
if let Some(ref semantic) = self.semantic {
config.entities.use_gleaning = semantic.entity_extraction.use_gleaning;
config.entities.max_gleaning_rounds =
semantic.entity_extraction.max_gleaning_rounds;
config.entities.min_confidence =
semantic.entity_extraction.confidence_threshold;
} else {
config.entities.use_gleaning = self.entity_extraction.use_gleaning;
config.entities.max_gleaning_rounds =
self.entity_extraction.max_gleaning_rounds;
config.entities.min_confidence = self.entity_extraction.min_confidence;
}
},
"algorithmic" => {
config.entities.use_gleaning = false;
if let Some(ref algorithmic) = self.algorithmic {
config.entities.min_confidence =
algorithmic.entity_extraction.confidence_threshold;
}
},
"hybrid" => {
config.entities.use_gleaning = true;
if self.hybrid.is_some() {
config.entities.max_gleaning_rounds = 2; }
},
_ => {
config.entities.use_gleaning = self.entity_extraction.use_gleaning;
config.entities.max_gleaning_rounds = self.entity_extraction.max_gleaning_rounds;
},
}
config.graph.similarity_threshold = self.pipeline.graph_building.min_relation_score;
config.graph.max_connections = self.pipeline.graph_building.max_connections_per_node;
config.graph.extract_relationships = true; config.graph.relationship_confidence_threshold = 0.5;
config.retrieval.top_k = 10;
config.embeddings.dimension = 768; config.embeddings.backend = "ollama".to_string();
config.embeddings.fallback_to_hash = self.ollama.fallback_to_hash;
config.parallel.enabled = self.pipeline.parallel_execution;
config.parallel.num_threads = self.performance.worker_threads;
config.ollama = crate::ollama::OllamaConfig {
enabled: self.ollama.enabled,
host: self.ollama.host.clone(),
port: self.ollama.port,
chat_model: self.ollama.chat_model.clone(),
embedding_model: self.ollama.embedding_model.clone(),
timeout_seconds: self.ollama.timeout_seconds,
max_retries: self.ollama.max_retries,
fallback_to_hash: self.ollama.fallback_to_hash,
max_tokens: self.ollama.max_tokens,
temperature: self.ollama.temperature,
enable_caching: true,
keep_alive: self.ollama.keep_alive.clone(),
num_ctx: self.ollama.num_ctx,
};
config.gliner = crate::config::GlinerConfig {
enabled: self.gliner.enabled,
model_path: self.gliner.model_path.clone(),
tokenizer_path: self.gliner.tokenizer_path.clone(),
mode: self.gliner.mode.clone(),
entity_labels: self.gliner.entity_labels.clone(),
relation_labels: self.gliner.relation_labels.clone(),
entity_threshold: self.gliner.entity_threshold,
relation_threshold: self.gliner.relation_threshold,
use_gpu: self.gliner.use_gpu,
max_concurrent_chunks: self.gliner.max_concurrent_chunks,
};
config.auto_save = crate::config::AutoSaveConfig {
enabled: self.auto_save.enabled,
base_dir: self.auto_save.base_dir.clone(),
interval_seconds: self.auto_save.interval_seconds,
workspace_name: self.auto_save.workspace_name.clone(),
max_versions: self.auto_save.max_versions,
};
config
}
}
#[cfg(test)]
mod drift_guard_tests {
    use super::*;

    /// Fails when the GLiNER defaults of the file-level config and of the
    /// runtime config disagree on any of the compared fields.
    #[test]
    fn gliner_setconfig_default_matches_runtime() {
        let file_side = GlinerSetConfig::default();
        let runtime_side = crate::config::GlinerConfig::default();

        assert_eq!(file_side.mode, runtime_side.mode, "gliner.mode drifted");
        assert_eq!(
            file_side.entity_labels, runtime_side.entity_labels,
            "gliner.entity_labels drifted"
        );
        assert_eq!(
            file_side.relation_labels, runtime_side.relation_labels,
            "gliner.relation_labels drifted"
        );
        assert_eq!(
            file_side.entity_threshold, runtime_side.entity_threshold,
            "gliner.entity_threshold drifted"
        );
        assert_eq!(
            file_side.relation_threshold, runtime_side.relation_threshold,
            "gliner.relation_threshold drifted"
        );
        assert_eq!(
            file_side.use_gpu, runtime_side.use_gpu,
            "gliner.use_gpu drifted"
        );
    }

    /// Fails when the auto-save defaults of the file-level config and of the
    /// runtime config disagree on any of the compared fields.
    #[test]
    fn autosave_setconfig_default_matches_runtime() {
        let file_side = AutoSaveSetConfig::default();
        let runtime_side = crate::config::AutoSaveConfig::default();

        assert_eq!(
            file_side.enabled, runtime_side.enabled,
            "auto_save.enabled drifted"
        );
        assert_eq!(
            file_side.interval_seconds, runtime_side.interval_seconds,
            "auto_save.interval_seconds drifted"
        );
        assert_eq!(
            file_side.max_versions, runtime_side.max_versions,
            "auto_save.max_versions drifted"
        );
    }
}