cognee_cognify/
config.rs

1//! Configuration for the cognify pipeline.
2//!
3//! CRITICAL: This is the SINGLE SOURCE OF TRUTH for all pipeline configuration.
4//! NO hardcoded values should exist in pipeline components.
5//! NO environment variables should be read in pipeline components.
6//! ALL configuration flows through this struct.
7
8use std::sync::Arc;
9
10use cognee_chunking::TokenCounterKind;
11use cognee_embedding::engine::EmbeddingEngine;
12use cognee_llm::{Llm, Transcriber};
13use serde::{Deserialize, Serialize};
14use thiserror::Error;
15
16/// Configuration for the cognify pipeline.
17///
18/// Design Principles:
19/// 1. NO hardcoded values in pipeline code - everything flows through config
20/// 2. NO environment variable reading in components (only in config construction if needed)
21/// 3. Sensible defaults matching `cognee` behavior
22/// 4. Builder pattern for easy customization
23///
24/// What is NOT in this config:
25/// - Storage/Database/LLM/Embedding instances (passed as Arc<T> to pipeline constructor)
26/// - Runtime data (data_items, dataset_id, etc. - passed to cognify() method)
27/// - Provider-specific API keys (handled by provider implementations, not pipeline config)
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct CognifyConfig {
30    /// Maximum chunk size in tokens.
31    ///
32    /// The sentinel value `1500` means "auto-calculate at pipeline time" via
33    /// [`CognifyConfig::auto_chunk_size`]. The pipeline in `tasks.rs` replaces the
34    /// sentinel with the computed value before executing — matching Python's
35    /// `get_max_chunk_tokens()` behaviour where `chunk_size=None` at the cognify
36    /// entry point always triggers auto-calculation. The computed value depends on
37    /// the active embedding engine: ≈512 for the local ONNX/BGE default (512-token
38    /// sequence limit) and 8191 for an OpenAI-compatible engine at its default
39    /// `max_completion_tokens` (8191), both clamped by the LLM term (8192).
40    ///
41    /// Pass an explicit value via [`CognifyConfig::with_chunk_size`] to override
42    /// the auto-calculation; any value other than the sentinel is used as-is.
43    pub max_chunk_size: usize,
44
45    /// Overlap between chunks (in tokens).
46    /// Python default: 10 (from ChunkConfig.chunk_overlap)
47    /// Used when chunk_strategy is RECURSIVE or LANGCHAIN
48    pub chunk_overlap: usize,
49
50    /// Chunking strategy.
51    /// Python default: ChunkStrategy.PARAGRAPH
52    /// Options: Paragraph (sentence-aware), Recursive (character-based with overlap)
53    pub chunk_strategy: ChunkStrategy,
54
55    /// Number of chunks to process in a single batch during graph extraction.
56    /// Python default: 100 (cognify parameter)
57    /// Controls memory usage vs parallelism tradeoff
58    pub chunks_per_batch: usize,
59
60    /// Maximum number of parallel tasks for graph extraction within a batch.
61    /// Python default: No explicit limit (uses asyncio.gather)
62    /// Rust: Prevents spawning too many tokio tasks
63    pub max_parallel_extractions: usize,
64
65    /// Custom prompt for entity/relationship extraction.
66    /// Python parameter: custom_prompt (optional)
67    /// If None, uses default prompts from cognee_llm
68    pub custom_extraction_prompt: Option<String>,
69
70    /// Enable text summarization stage.
71    /// Python behavior: Always runs if summarization_model is set
72    /// Default: true (matches Python)
73    pub enable_summarization: bool,
74
75    /// Batch size for summarization (parallel summary generation).
76    /// Python default: No explicit batching (processes all chunks in parallel)
77    /// Rust: Prevents spawning too many tasks
78    pub summarization_batch_size: usize,
79
80    /// Whether to generate and index triplet embeddings.
81    /// Triplets are formatted as "source › relationship › target"
82    /// Python config: CognifyConfig.triplet_embedding (default: False)
83    pub embed_triplets: bool,
84
85    /// Batch size for embedding generation (all types: chunks, entities, summaries, triplets).
86    /// Python default: varies by provider (36 for OpenAI, 100 for others)
87    /// Controls how many texts are embedded in a single API call
88    pub embedding_batch_size: usize,
89
90    /// Vector collection name prefix.
91    /// Python default: Uses type names directly ("Entity", "DocumentChunk", etc.)
92    /// Allows customization for multi-tenant or versioned deployments
93    pub vector_collection_prefix: String,
94
95    /// Enable incremental loading - only process new/changed data.
96    /// When true, tracks processed data IDs to avoid reprocessing.
97    /// Python parameter: incremental_loading (default: True)
98    pub incremental_loading: bool,
99
100    /// Enable pipeline-level caching.
101    /// When true, skips datasets whose latest pipeline run status is `Completed`.
102    /// Requires a database connection to be provided.
103    /// Python parameter: use_pipeline_cache (default: False)
104    pub use_pipeline_cache: bool,
105
106    /// Enable temporal graph construction.
107    /// Python parameter: temporal_cognify (default: False)
108    /// Extracts events and timestamps for temporal reasoning
109    pub temporal_cognify: bool,
110
111    /// Create WebPage/WebSite provenance nodes for URL-sourced documents.
112    ///
113    /// When true, documents whose external metadata was produced by URL
114    /// ingestion create deterministic WebPage and WebSite nodes plus
115    /// `DocumentChunk -> SOURCED_FROM -> WebPage` and
116    /// `WebPage -> PART_OF -> WebSite` edges.
117    pub create_web_page_nodes: bool,
118
119    /// Batch size for data processing in temporal cognify.
120    /// Python parameter: data_per_batch (default: 20)
121    pub data_per_batch: usize,
122
123    /// How to count tokens when chunking text.
124    /// Default is determined at construction time via [`TokenCounterKind::from_env`].
125    pub token_counter_kind: TokenCounterKind,
126
127    /// Optional JSON Schema for custom graph extraction model.
128    ///
129    /// When `Some`, the LLM uses this schema instead of the default
130    /// `KnowledgeGraph` schema for entity/relationship extraction.
131    /// Extracted data is stored as-is in chunk metadata.
132    ///
133    /// Mirrors Python's `graph_model` parameter.
134    #[serde(skip)]
135    pub graph_schema: Option<serde_json::Value>,
136
137    /// Optional JSON schema for the summarization output.
138    ///
139    /// Mirrors Python's `CognifyConfig.summarization_model` (a Pydantic class,
140    /// default `SummarizedContent`). When `Some`, the summarization stage
141    /// requests this schema from the LLM instead of the built-in
142    /// `SummarizedContent` shape. The schema **must** contain a string
143    /// `summary` field — the pipeline reads `summary` to build each
144    /// `TextSummary` (Python parity).
145    ///
146    /// Validated at setter/builder time via `validate_summary_schema`.
147    #[serde(skip)]
148    pub summary_schema: Option<serde_json::Value>,
149
150    /// Pluggable chunker callback.
151    ///
152    /// When `Some`, this function is called instead of the built-in
153    /// paragraph/recursive chunking. The callback receives the text and
154    /// max token count, and returns a list of chunk strings.
155    ///
156    /// Mirrors Python's `chunker` parameter.
157    #[serde(skip)]
158    pub custom_chunker: Option<CustomChunker>,
159
160    /// Optional transcriber for audio/video document processing.
161    ///
162    /// When `Some`, this transcriber is used to convert audio content into
163    /// text before chunking and graph extraction. Only takes effect when
164    /// processing documents classified as audio type.
165    #[serde(skip)]
166    pub transcriber: Option<TranscriberHandle>,
167}
168
/// Newtype that hides a user-supplied chunking callback behind an `Arc`.
///
/// The callback takes the text and a `usize` limit and returns the chunk
/// strings. Wrapping it gives the closure a [`Debug`] impl (a fixed
/// `"CustomChunker(…)"` placeholder) and a cheap [`Clone`] (an `Arc` clone),
/// which lets [`CognifyConfig`] keep deriving both traits.
#[derive(Clone)]
#[allow(clippy::type_complexity)]
pub struct CustomChunker(pub Arc<dyn Fn(&str, usize) -> Vec<String> + Send + Sync>);

impl std::fmt::Debug for CustomChunker {
    /// Writes a constant placeholder; the wrapped closure cannot be inspected.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "CustomChunker(…)")
    }
}
182
183/// Opaque wrapper around a [`Transcriber`] implementation.
184///
185/// Implements [`Debug`] (prints `"TranscriberHandle(…)"`) and [`Clone`] (cheap
186/// `Arc` clone), keeping [`CognifyConfig`] derivable.
187#[derive(Clone)]
188pub struct TranscriberHandle(pub Arc<dyn Transcriber>);
189
190impl std::fmt::Debug for TranscriberHandle {
191    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
192        f.write_str("TranscriberHandle(…)")
193    }
194}
195
196/// Chunking strategy options.
197#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
198pub enum ChunkStrategy {
199    /// Paragraph-based chunking (sentence-aware, no overlap).
200    /// Python: ChunkStrategy.PARAGRAPH
201    /// Default and most reliable for semantic coherence.
202    Paragraph,
203
204    /// Recursive character-based chunking with overlap.
205    /// Python: ChunkStrategy.RECURSIVE (via LangchainChunker)
206    /// Better for preserving context across chunk boundaries.
207    Recursive,
208}
209
210impl Default for CognifyConfig {
211    fn default() -> Self {
212        Self {
213            max_chunk_size: 1500,
214            chunk_overlap: 10,
215            chunk_strategy: ChunkStrategy::Paragraph,
216
217            chunks_per_batch: 100,
218            max_parallel_extractions: 20,
219            custom_extraction_prompt: None,
220
221            enable_summarization: true,
222            summarization_batch_size: 50,
223
224            embed_triplets: false,
225            embedding_batch_size: 100,
226            vector_collection_prefix: String::new(),
227
228            incremental_loading: true,
229
230            use_pipeline_cache: false,
231
232            temporal_cognify: false,
233            create_web_page_nodes: true,
234            data_per_batch: 20,
235
236            token_counter_kind: TokenCounterKind::from_env(),
237
238            graph_schema: None,
239            summary_schema: None,
240            custom_chunker: None,
241            transcriber: None,
242        }
243    }
244}
245
246impl CognifyConfig {
247    /// Set maximum chunk size in tokens.
248    pub fn with_chunk_size(mut self, size: usize) -> Self {
249        self.max_chunk_size = size;
250        self
251    }
252
253    /// Set chunk overlap (for recursive chunking).
254    pub fn with_chunk_overlap(mut self, overlap: usize) -> Self {
255        self.chunk_overlap = overlap;
256        self
257    }
258
259    /// Set chunking strategy.
260    pub fn with_chunk_strategy(mut self, strategy: ChunkStrategy) -> Self {
261        self.chunk_strategy = strategy;
262        self
263    }
264
265    /// Set number of chunks per batch during graph extraction.
266    pub fn with_chunks_per_batch(mut self, batch_size: usize) -> Self {
267        self.chunks_per_batch = batch_size;
268        self
269    }
270
271    /// Set maximum parallel extractions.
272    pub fn with_max_parallel_extractions(mut self, limit: usize) -> Self {
273        self.max_parallel_extractions = limit;
274        self
275    }
276
277    /// Set custom extraction prompt.
278    pub fn with_custom_prompt(mut self, prompt: String) -> Self {
279        self.custom_extraction_prompt = Some(prompt);
280        self
281    }
282
283    /// Enable or disable summarization.
284    pub fn with_summarization(mut self, enable: bool) -> Self {
285        self.enable_summarization = enable;
286        self
287    }
288
289    /// Set summarization batch size.
290    pub fn with_summarization_batch_size(mut self, batch_size: usize) -> Self {
291        self.summarization_batch_size = batch_size;
292        self
293    }
294
295    /// Enable or disable triplet embeddings.
296    pub fn with_triplet_embeddings(mut self, enable: bool) -> Self {
297        self.embed_triplets = enable;
298        self
299    }
300
301    /// Set embedding batch size.
302    pub fn with_embedding_batch_size(mut self, batch_size: usize) -> Self {
303        self.embedding_batch_size = batch_size;
304        self
305    }
306
307    /// Set vector collection prefix.
308    pub fn with_collection_prefix(mut self, prefix: String) -> Self {
309        self.vector_collection_prefix = prefix;
310        self
311    }
312
313    /// Enable or disable incremental loading.
314    pub fn with_incremental_loading(mut self, enable: bool) -> Self {
315        self.incremental_loading = enable;
316        self
317    }
318
319    /// Enable or disable pipeline-level caching.
320    pub fn with_pipeline_cache(mut self, enable: bool) -> Self {
321        self.use_pipeline_cache = enable;
322        self
323    }
324
325    /// Enable or disable temporal cognify.
326    pub fn with_temporal_cognify(mut self, enable: bool) -> Self {
327        self.temporal_cognify = enable;
328        self
329    }
330
331    /// Enable or disable WebPage/WebSite provenance graph construction.
332    pub fn with_web_page_nodes(mut self, enable: bool) -> Self {
333        self.create_web_page_nodes = enable;
334        self
335    }
336
337    /// Set data per batch for temporal processing.
338    pub fn with_data_per_batch(mut self, batch_size: usize) -> Self {
339        self.data_per_batch = batch_size;
340        self
341    }
342
343    /// Set the token counter implementation to use during chunking.
344    pub fn with_token_counter(mut self, kind: TokenCounterKind) -> Self {
345        self.token_counter_kind = kind;
346        self
347    }
348
349    /// Set a custom JSON Schema for graph extraction.
350    pub fn with_graph_schema(mut self, schema: serde_json::Value) -> Self {
351        self.graph_schema = Some(schema);
352        self
353    }
354
355    /// Set a custom JSON schema for summarization output (Python `summarization_model` parity).
356    ///
357    /// The schema must contain a string `summary` field — the pipeline reads
358    /// `summary` to build each `TextSummary`. Returns an error if the schema
359    /// lacks that field so callers catch the misconfiguration early rather than
360    /// mid-pipeline.
361    pub fn with_summary_schema(mut self, schema: serde_json::Value) -> Result<Self, ConfigError> {
362        validate_summary_schema(&schema)?;
363        self.summary_schema = Some(schema);
364        Ok(self)
365    }
366
367    /// Set a custom chunker callback.
368    #[allow(clippy::type_complexity)]
369    pub fn with_custom_chunker(
370        mut self,
371        chunker: Arc<dyn Fn(&str, usize) -> Vec<String> + Send + Sync>,
372    ) -> Self {
373        self.custom_chunker = Some(CustomChunker(chunker));
374        self
375    }
376
377    /// Set a transcriber for audio document processing.
378    pub fn with_transcriber(mut self, transcriber: Arc<dyn Transcriber>) -> Self {
379        self.transcriber = Some(TranscriberHandle(transcriber));
380        self
381    }
382
383    /// Auto-calculate `max_chunk_size`, mirroring Python's `get_max_chunk_tokens()`
384    /// from `cognee/infrastructure/llm/utils.py`:
385    ///
386    /// ```text
387    /// llm_cutoff_point = llm_max_completion_tokens // 2   # Python default: 16384 → 8192
388    /// max_chunk_tokens = min(embedding_engine.max_completion_tokens, llm_cutoff_point)
389    /// ```
390    ///
391    /// Python uses **completion-token** budgets (not context windows):
392    /// - `embedding_engine.max_completion_tokens` — the engine's configured token
393    ///   limit. Python's `EmbeddingConfig` default is **8191**
394    ///   (`embeddings/config.py:81`), passed to the engine by the factory; the
395    ///   engine class's own `__init__` default of 512 is overridden in that path.
396    ///   Rust mirrors this: `EmbeddingConfig.max_completion_tokens` defaults to 8191.
397    /// - `llm_max_completion_tokens` = **16384** (infrastructure/llm/config.py:51).
398    /// - So for an OpenAI-compatible engine: `min(8191, 8192) = 8191`. For the local
399    ///   ONNX/BGE engine, `max_sequence_length()` is the model's 512-token limit, so
400    ///   `min(512, 8192) = 512`. The embedding term is the binding one in both cases.
401    ///
402    /// The Rust `Llm` trait exposes only `max_context_length()` (a context window),
403    /// not a completion-token limit. Rather than divide an unrelated quantity, we use
404    /// Python's LLM completion-token constant (16384) directly. The embedding term
405    /// (`max_sequence_length()` — 512 for BGE, the configured `max_completion_tokens`
406    /// for OpenAI-compatible) is binding in all practical configurations, so the LLM
407    /// argument is currently unused (`_llm`).
408    ///
409    /// Result is at least 1.
410    pub fn auto_chunk_size(embedding_engine: &dyn EmbeddingEngine, _llm: &dyn Llm) -> usize {
411        // Python infrastructure/llm/config.py:51 — default LLM completion-token budget.
412        const PY_LLM_MAX_COMPLETION_TOKENS: usize = 16_384;
413        let llm_cutoff = PY_LLM_MAX_COMPLETION_TOKENS / 2; // == 8192
414        let embed_max = embedding_engine.max_sequence_length();
415        llm_cutoff.min(embed_max).max(1)
416    }
417
418    /// Set max_chunk_size by auto-calculating from embedding and LLM capabilities.
419    ///
420    /// See [`auto_chunk_size`](Self::auto_chunk_size) for the formula used.
421    pub fn with_auto_chunk_size(
422        mut self,
423        embedding_engine: &dyn EmbeddingEngine,
424        llm: &dyn Llm,
425    ) -> Self {
426        self.max_chunk_size = Self::auto_chunk_size(embedding_engine, llm);
427        self
428    }
429
430    /// Validate configuration parameters.
431    ///
432    /// Returns an error if any parameters are invalid.
433    pub fn validate(&self) -> Result<(), ConfigError> {
434        if self.max_chunk_size == 0 {
435            return Err(ConfigError::InvalidParameter(
436                "max_chunk_size must be greater than 0".to_string(),
437            ));
438        }
439
440        if self.chunk_overlap >= self.max_chunk_size {
441            return Err(ConfigError::InvalidParameter(
442                "chunk_overlap must be less than max_chunk_size".to_string(),
443            ));
444        }
445
446        if self.chunks_per_batch == 0 {
447            return Err(ConfigError::InvalidParameter(
448                "chunks_per_batch must be greater than 0".to_string(),
449            ));
450        }
451
452        if self.max_parallel_extractions == 0 {
453            return Err(ConfigError::InvalidParameter(
454                "max_parallel_extractions must be greater than 0".to_string(),
455            ));
456        }
457
458        if self.embedding_batch_size == 0 {
459            return Err(ConfigError::InvalidParameter(
460                "embedding_batch_size must be greater than 0".to_string(),
461            ));
462        }
463
464        if self.summarization_batch_size == 0 {
465            return Err(ConfigError::InvalidParameter(
466                "summarization_batch_size must be greater than 0".to_string(),
467            ));
468        }
469
470        if self.data_per_batch == 0 {
471            return Err(ConfigError::InvalidParameter(
472                "data_per_batch must be greater than 0".to_string(),
473            ));
474        }
475
476        Ok(())
477    }
478}
479
480/// Configuration error types.
481#[derive(Error, Debug)]
482pub enum ConfigError {
483    #[error("Invalid configuration parameter: {0}")]
484    InvalidParameter(String),
485    #[error("Invalid summary schema: {0}")]
486    InvalidSummarySchema(String),
487}
488
489/// Validate that a JSON schema supplied for `summary_schema` has a string
490/// `summary` property, so misconfigurations are caught at builder/setter time
491/// rather than mid-pipeline.
492pub fn validate_summary_schema(schema: &serde_json::Value) -> Result<(), ConfigError> {
493    let obj = schema.as_object().ok_or_else(|| {
494        ConfigError::InvalidSummarySchema("schema must be a JSON object".to_string())
495    })?;
496
497    let props = obj
498        .get("properties")
499        .and_then(|p| p.as_object())
500        .ok_or_else(|| {
501            ConfigError::InvalidSummarySchema("schema must have a 'properties' object".to_string())
502        })?;
503
504    let summary_prop = props.get("summary").ok_or_else(|| {
505        ConfigError::InvalidSummarySchema(
506            "schema 'properties' must include a 'summary' field".to_string(),
507        )
508    })?;
509
510    // Accept either {"type": "string"} or no type constraint at all.
511    if let Some(type_val) = summary_prop.get("type")
512        && type_val.as_str() != Some("string")
513    {
514        return Err(ConfigError::InvalidSummarySchema(
515            "'summary' field must be of type 'string'".to_string(),
516        ));
517    }
518
519    Ok(())
520}
521
#[cfg(test)]
mod tests {
    //! Unit tests for `CognifyConfig`: defaults, builders, validation,
    //! summary-schema checks and the auto chunk-size formula.

    use super::*;
    use async_trait::async_trait;
    use cognee_embedding::error::EmbeddingResult;
    use cognee_llm::types::GenerationOptions;

    // Minimal mock for EmbeddingEngine — only max_sequence_length() matters.
    struct MockEmbedding {
        max_seq: usize,
    }

    #[async_trait]
    impl EmbeddingEngine for MockEmbedding {
        async fn embed(&self, _texts: &[&str]) -> EmbeddingResult<Vec<Vec<f32>>> {
            Ok(vec![])
        }
        fn dimension(&self) -> usize {
            384
        }
        fn batch_size(&self) -> usize {
            32
        }
        fn max_sequence_length(&self) -> usize {
            self.max_seq
        }
    }

    // Minimal mock for Llm. `auto_chunk_size` ignores its LLM argument, so the
    // context length only documents what each test is varying.
    struct MockLlm {
        max_ctx: u32,
    }

    #[async_trait]
    impl Llm for MockLlm {
        async fn generate(
            &self,
            _messages: Vec<cognee_llm::Message>,
            _options: Option<GenerationOptions>,
        ) -> cognee_llm::LlmResult<cognee_llm::GenerationResponse> {
            unimplemented!()
        }
        async fn create_structured_output_with_messages_raw(
            &self,
            _messages: Vec<cognee_llm::Message>,
            _json_schema: &serde_json::Value,
            _options: Option<GenerationOptions>,
        ) -> cognee_llm::LlmResult<serde_json::Value> {
            unimplemented!()
        }
        fn model(&self) -> &str {
            "mock"
        }
        fn max_context_length(&self) -> u32 {
            self.max_ctx
        }
    }

    #[test]
    fn test_default_config() {
        let config = CognifyConfig::default();

        // Chunking defaults
        assert_eq!(config.max_chunk_size, 1500);
        assert_eq!(config.chunk_overlap, 10);
        assert_eq!(config.chunk_strategy, ChunkStrategy::Paragraph);

        // Graph extraction defaults
        assert_eq!(config.chunks_per_batch, 100);
        assert_eq!(config.max_parallel_extractions, 20);
        assert!(config.custom_extraction_prompt.is_none());

        // Summarization defaults
        assert!(config.enable_summarization);
        assert_eq!(config.summarization_batch_size, 50);

        // Embedding defaults
        assert!(!config.embed_triplets);
        assert_eq!(config.embedding_batch_size, 100);
        assert_eq!(config.vector_collection_prefix, "");

        // Incremental defaults
        assert!(config.incremental_loading);

        // Pipeline cache defaults
        assert!(!config.use_pipeline_cache);

        // Advanced defaults
        assert!(!config.temporal_cognify);
        assert!(config.create_web_page_nodes);
        assert_eq!(config.data_per_batch, 20);

        // Optional hooks start unset
        assert!(config.graph_schema.is_none());
        assert!(config.summary_schema.is_none());
        assert!(config.custom_chunker.is_none());
        assert!(config.transcriber.is_none());
    }

    #[test]
    fn test_config_builder_chunking() {
        let config = CognifyConfig::default()
            .with_chunk_size(2000)
            .with_chunk_overlap(50)
            .with_chunk_strategy(ChunkStrategy::Recursive);

        assert_eq!(config.max_chunk_size, 2000);
        assert_eq!(config.chunk_overlap, 50);
        assert_eq!(config.chunk_strategy, ChunkStrategy::Recursive);
    }

    #[test]
    fn test_config_builder_graph_extraction() {
        let config = CognifyConfig::default()
            .with_chunks_per_batch(50)
            .with_max_parallel_extractions(25)
            .with_custom_prompt("Extract entities:".to_string());

        assert_eq!(config.chunks_per_batch, 50);
        assert_eq!(config.max_parallel_extractions, 25);
        assert_eq!(
            config.custom_extraction_prompt,
            Some("Extract entities:".to_string())
        );
    }

    #[test]
    fn test_config_builder_all_features() {
        let config = CognifyConfig::default()
            .with_chunk_size(2000)
            .with_triplet_embeddings(true)
            .with_incremental_loading(false)
            .with_summarization(false)
            .with_temporal_cognify(true);

        assert_eq!(config.max_chunk_size, 2000);
        assert!(config.embed_triplets);
        assert!(!config.incremental_loading);
        assert!(!config.enable_summarization);
        assert!(config.temporal_cognify);
    }

    #[test]
    fn test_config_builder_batching_and_flags() {
        let config = CognifyConfig::default()
            .with_summarization_batch_size(5)
            .with_embedding_batch_size(36)
            .with_collection_prefix("tenant_a_".to_string())
            .with_pipeline_cache(true)
            .with_web_page_nodes(false)
            .with_data_per_batch(7);

        assert_eq!(config.summarization_batch_size, 5);
        assert_eq!(config.embedding_batch_size, 36);
        assert_eq!(config.vector_collection_prefix, "tenant_a_");
        assert!(config.use_pipeline_cache);
        assert!(!config.create_web_page_nodes);
        assert_eq!(config.data_per_batch, 7);
    }

    #[test]
    fn test_config_builder_custom_chunker() {
        let config = CognifyConfig::default()
            .with_custom_chunker(Arc::new(|text: &str, _max: usize| vec![text.to_string()]));

        let chunker = config
            .custom_chunker
            .expect("custom chunker should be stored");
        assert_eq!((chunker.0)("hello", 10), vec!["hello".to_string()]);
        assert_eq!(format!("{chunker:?}"), "CustomChunker(…)");
    }

    #[test]
    fn test_config_validation_success() {
        let config = CognifyConfig::default();
        assert!(config.validate().is_ok());
    }

    #[test]
    fn test_config_validation_zero_chunk_size() {
        let config = CognifyConfig {
            max_chunk_size: 0,
            ..Default::default()
        };
        assert!(matches!(
            config.validate(),
            Err(ConfigError::InvalidParameter(_))
        ));
    }

    #[test]
    fn test_config_validation_overlap_too_large() {
        let config = CognifyConfig {
            max_chunk_size: 100,
            chunk_overlap: 100,
            ..Default::default()
        };
        assert!(matches!(
            config.validate(),
            Err(ConfigError::InvalidParameter(_))
        ));
    }

    #[test]
    fn test_config_validation_overlap_just_below_chunk_size() {
        // The check is `overlap >= size`, so size - 1 is the largest legal overlap.
        let config = CognifyConfig {
            max_chunk_size: 100,
            chunk_overlap: 99,
            ..Default::default()
        };
        assert!(config.validate().is_ok());
    }

    #[test]
    fn test_config_validation_zero_batch_sizes() {
        let config1 = CognifyConfig {
            chunks_per_batch: 0,
            ..Default::default()
        };
        assert!(config1.validate().is_err());

        let config2 = CognifyConfig {
            embedding_batch_size: 0,
            ..Default::default()
        };
        assert!(config2.validate().is_err());

        let config3 = CognifyConfig {
            summarization_batch_size: 0,
            ..Default::default()
        };
        assert!(config3.validate().is_err());

        let config4 = CognifyConfig {
            max_parallel_extractions: 0,
            ..Default::default()
        };
        assert!(config4.validate().is_err());

        let config5 = CognifyConfig {
            data_per_batch: 0,
            ..Default::default()
        };
        assert!(config5.validate().is_err());
    }

    #[test]
    fn test_with_summary_schema_accepts_string_summary() {
        let schema = serde_json::json!({
            "type": "object",
            "properties": { "summary": { "type": "string" } }
        });
        let config = CognifyConfig::default()
            .with_summary_schema(schema.clone())
            .expect("schema with a string summary should be accepted");
        assert_eq!(config.summary_schema, Some(schema));
    }

    #[test]
    fn test_with_summary_schema_rejects_missing_summary() {
        let schema = serde_json::json!({
            "type": "object",
            "properties": { "title": { "type": "string" } }
        });
        assert!(matches!(
            CognifyConfig::default().with_summary_schema(schema),
            Err(ConfigError::InvalidSummarySchema(_))
        ));
    }

    #[test]
    fn test_validate_summary_schema_cases() {
        // No `type` constraint on `summary` is tolerated.
        let untyped = serde_json::json!({ "properties": { "summary": {} } });
        assert!(validate_summary_schema(&untyped).is_ok());

        // Not a JSON object at all.
        let not_object = serde_json::json!(["summary"]);
        assert!(matches!(
            validate_summary_schema(&not_object),
            Err(ConfigError::InvalidSummarySchema(_))
        ));

        // Object without a `properties` object.
        let no_properties = serde_json::json!({ "type": "object" });
        assert!(matches!(
            validate_summary_schema(&no_properties),
            Err(ConfigError::InvalidSummarySchema(_))
        ));

        // `summary` declared with a non-string type.
        let wrong_type = serde_json::json!({
            "properties": { "summary": { "type": "integer" } }
        });
        assert!(matches!(
            validate_summary_schema(&wrong_type),
            Err(ConfigError::InvalidSummarySchema(_))
        ));
    }

    /// Local ONNX/BGE default: the model's 512-token sequence limit binds →
    /// min(512, 8192) = 512. The LLM argument is unused in the new formula
    /// (Python constant 16384/2=8192). For an OpenAI-compatible engine at its
    /// default max_completion_tokens (8191), the result would be min(8191, 8192)
    /// = 8191 — see `test_auto_chunk_size_large_embedding` for the >8192 case.
    #[test]
    fn auto_chunk_size_matches_python_default() {
        // BGE-like embedding: max_seq=512 binds → result=512.
        let embed = MockEmbedding { max_seq: 512 };
        let llm = MockLlm { max_ctx: 4096 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 512);
    }

    #[test]
    fn test_auto_chunk_size_embed_is_smaller() {
        // embed_max=512, LLM cutoff=8192 → result=512 (embedding term dominates).
        let embed = MockEmbedding { max_seq: 512 };
        let llm = MockLlm { max_ctx: 4096 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 512);
    }

    #[test]
    fn test_auto_chunk_size_llm_cutoff_unused() {
        // LLM context window is NOT used for the cutoff. The Python completion-token
        // constant (16384 → 8192) is used instead. Even a tiny context window no
        // longer artificially restricts the chunk size — the embedding term (512)
        // still dominates.
        let embed = MockEmbedding { max_seq: 512 };
        let llm = MockLlm { max_ctx: 256 }; // previously returned 128, now returns 512
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 512);
    }

    #[test]
    fn test_auto_chunk_size_large_embedding() {
        // embed_max=10000 > llm_cutoff (8192) → result=8192 (LLM constant dominates).
        let embed = MockEmbedding { max_seq: 10_000 };
        let llm = MockLlm { max_ctx: 4096 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 8192);
    }

    #[test]
    fn test_auto_chunk_size_embed_below_llm_cutoff() {
        // embed_max=1024 < 8192 → result=1024 (embedding term dominates).
        let embed = MockEmbedding { max_seq: 1024 };
        let llm = MockLlm { max_ctx: 2048 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 1024);
    }

    #[test]
    fn test_auto_chunk_size_openai_default() {
        // embed_max=8191 (OpenAI-compatible default) → min(8191, 8192) = 8191.
        let embed = MockEmbedding { max_seq: 8191 };
        let llm = MockLlm { max_ctx: 4096 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 8191);
    }

    #[test]
    fn test_auto_chunk_size_floor_at_one() {
        // embed_max=0 → min(0, 8192)=0 → clamped to 1.
        let embed = MockEmbedding { max_seq: 0 };
        let llm = MockLlm { max_ctx: 0 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 1);
    }

    #[test]
    fn test_auto_chunk_size_embed_exactly_at_llm_cutoff() {
        // embed_max=8192 == llm_cutoff → result=8192.
        let embed = MockEmbedding { max_seq: 8192 };
        let llm = MockLlm { max_ctx: 4096 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 8192);
    }

    #[test]
    fn test_with_auto_chunk_size_builder() {
        let embed = MockEmbedding { max_seq: 512 };
        let llm = MockLlm { max_ctx: 4096 };
        let config = CognifyConfig::default().with_auto_chunk_size(&embed, &llm);
        assert_eq!(config.max_chunk_size, 512);
        // Other fields should remain at defaults
        assert_eq!(config.chunk_overlap, 10);
        assert_eq!(config.chunks_per_batch, 100);
    }
}
cognee_cognify/config.rs

cognee_cognify/
config.rs