cognee_cognify/config.rs
//! Configuration for the cognify pipeline.
//!
//! CRITICAL: This is the SINGLE SOURCE OF TRUTH for all pipeline configuration.
//! NO hardcoded values should exist in pipeline components.
//! NO environment variables should be read in pipeline components.
//! ALL configuration flows through this struct.
7
8use std::sync::Arc;
9
10use cognee_chunking::TokenCounterKind;
11use cognee_embedding::engine::EmbeddingEngine;
12use cognee_llm::{Llm, Transcriber};
13use serde::{Deserialize, Serialize};
14use thiserror::Error;
15
16/// Configuration for the cognify pipeline.
17///
18/// Design Principles:
19/// 1. NO hardcoded values in pipeline code - everything flows through config
20/// 2. NO environment variable reading in components (only in config construction if needed)
21/// 3. Sensible defaults matching `cognee` behavior
22/// 4. Builder pattern for easy customization
23///
24/// What is NOT in this config:
25/// - Storage/Database/LLM/Embedding instances (passed as Arc<T> to pipeline constructor)
26/// - Runtime data (data_items, dataset_id, etc. - passed to cognify() method)
27/// - Provider-specific API keys (handled by provider implementations, not pipeline config)
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct CognifyConfig {
30 /// Maximum chunk size in tokens.
31 ///
32 /// The sentinel value `1500` means "auto-calculate at pipeline time" via
33 /// [`CognifyConfig::auto_chunk_size`]. The pipeline in `tasks.rs` replaces the
34 /// sentinel with the computed value before executing — matching Python's
35 /// `get_max_chunk_tokens()` behaviour where `chunk_size=None` at the cognify
36 /// entry point always triggers auto-calculation. The computed value depends on
37 /// the active embedding engine: ≈512 for the local ONNX/BGE default (512-token
38 /// sequence limit) and 8191 for an OpenAI-compatible engine at its default
39 /// `max_completion_tokens` (8191), both clamped by the LLM term (8192).
40 ///
41 /// Pass an explicit value via [`CognifyConfig::with_chunk_size`] to override
42 /// the auto-calculation; any value other than the sentinel is used as-is.
43 pub max_chunk_size: usize,
44
45 /// Overlap between chunks (in tokens).
46 /// Python default: 10 (from ChunkConfig.chunk_overlap)
47 /// Used when chunk_strategy is RECURSIVE or LANGCHAIN
48 pub chunk_overlap: usize,
49
50 /// Chunking strategy.
51 /// Python default: ChunkStrategy.PARAGRAPH
52 /// Options: Paragraph (sentence-aware), Recursive (character-based with overlap)
53 pub chunk_strategy: ChunkStrategy,
54
55 /// Number of chunks to process in a single batch during graph extraction.
56 /// Python default: 100 (cognify parameter)
57 /// Controls memory usage vs parallelism tradeoff
58 pub chunks_per_batch: usize,
59
60 /// Maximum number of parallel tasks for graph extraction within a batch.
61 /// Python default: No explicit limit (uses asyncio.gather)
62 /// Rust: Prevents spawning too many tokio tasks
63 pub max_parallel_extractions: usize,
64
65 /// Custom prompt for entity/relationship extraction.
66 /// Python parameter: custom_prompt (optional)
67 /// If None, uses default prompts from cognee_llm
68 pub custom_extraction_prompt: Option<String>,
69
70 /// Enable text summarization stage.
71 /// Python behavior: Always runs if summarization_model is set
72 /// Default: true (matches Python)
73 pub enable_summarization: bool,
74
75 /// Batch size for summarization (parallel summary generation).
76 /// Python default: No explicit batching (processes all chunks in parallel)
77 /// Rust: Prevents spawning too many tasks
78 pub summarization_batch_size: usize,
79
80 /// Whether to generate and index triplet embeddings.
81 /// Triplets are formatted as "source › relationship › target"
82 /// Python config: CognifyConfig.triplet_embedding (default: False)
83 pub embed_triplets: bool,
84
85 /// Batch size for embedding generation (all types: chunks, entities, summaries, triplets).
86 /// Python default: varies by provider (36 for OpenAI, 100 for others)
87 /// Controls how many texts are embedded in a single API call
88 pub embedding_batch_size: usize,
89
90 /// Vector collection name prefix.
91 /// Python default: Uses type names directly ("Entity", "DocumentChunk", etc.)
92 /// Allows customization for multi-tenant or versioned deployments
93 pub vector_collection_prefix: String,
94
95 /// Enable incremental loading - only process new/changed data.
96 /// When true, tracks processed data IDs to avoid reprocessing.
97 /// Python parameter: incremental_loading (default: True)
98 pub incremental_loading: bool,
99
100 /// Enable pipeline-level caching.
101 /// When true, skips datasets whose latest pipeline run status is `Completed`.
102 /// Requires a database connection to be provided.
103 /// Python parameter: use_pipeline_cache (default: False)
104 pub use_pipeline_cache: bool,
105
106 /// Enable temporal graph construction.
107 /// Python parameter: temporal_cognify (default: False)
108 /// Extracts events and timestamps for temporal reasoning
109 pub temporal_cognify: bool,
110
111 /// Create WebPage/WebSite provenance nodes for URL-sourced documents.
112 ///
113 /// When true, documents whose external metadata was produced by URL
114 /// ingestion create deterministic WebPage and WebSite nodes plus
115 /// `DocumentChunk -> SOURCED_FROM -> WebPage` and
116 /// `WebPage -> PART_OF -> WebSite` edges.
117 pub create_web_page_nodes: bool,
118
119 /// Batch size for data processing in temporal cognify.
120 /// Python parameter: data_per_batch (default: 20)
121 pub data_per_batch: usize,
122
123 /// How to count tokens when chunking text.
124 /// Default is determined at construction time via [`TokenCounterKind::from_env`].
125 pub token_counter_kind: TokenCounterKind,
126
127 /// Optional JSON Schema for custom graph extraction model.
128 ///
129 /// When `Some`, the LLM uses this schema instead of the default
130 /// `KnowledgeGraph` schema for entity/relationship extraction.
131 /// Extracted data is stored as-is in chunk metadata.
132 ///
133 /// Mirrors Python's `graph_model` parameter.
134 #[serde(skip)]
135 pub graph_schema: Option<serde_json::Value>,
136
137 /// Optional JSON schema for the summarization output.
138 ///
139 /// Mirrors Python's `CognifyConfig.summarization_model` (a Pydantic class,
140 /// default `SummarizedContent`). When `Some`, the summarization stage
141 /// requests this schema from the LLM instead of the built-in
142 /// `SummarizedContent` shape. The schema **must** contain a string
143 /// `summary` field — the pipeline reads `summary` to build each
144 /// `TextSummary` (Python parity).
145 ///
146 /// Validated at setter/builder time via `validate_summary_schema`.
147 #[serde(skip)]
148 pub summary_schema: Option<serde_json::Value>,
149
150 /// Pluggable chunker callback.
151 ///
152 /// When `Some`, this function is called instead of the built-in
153 /// paragraph/recursive chunking. The callback receives the text and
154 /// max token count, and returns a list of chunk strings.
155 ///
156 /// Mirrors Python's `chunker` parameter.
157 #[serde(skip)]
158 pub custom_chunker: Option<CustomChunker>,
159
160 /// Optional transcriber for audio/video document processing.
161 ///
162 /// When `Some`, this transcriber is used to convert audio content into
163 /// text before chunking and graph extraction. Only takes effect when
164 /// processing documents classified as audio type.
165 #[serde(skip)]
166 pub transcriber: Option<TranscriberHandle>,
167}
168
/// Opaque newtype holding a user-supplied chunking callback.
///
/// The callback takes the text and a `usize` (the max token count, per the
/// `custom_chunker` field documentation) and returns the chunk strings.
///
/// [`Debug`] prints the fixed placeholder `"CustomChunker(…)"` because the
/// closure itself cannot be formatted, and [`Clone`] is a cheap `Arc` clone —
/// together these keep [`CognifyConfig`] derivable.
#[derive(Clone)]
// The full trait-object type is spelled out so callers can see the exact
// callback signature; a type alias would only hide it.
#[allow(clippy::type_complexity)]
pub struct CustomChunker(pub Arc<dyn Fn(&str, usize) -> Vec<String> + Send + Sync>);

impl std::fmt::Debug for CustomChunker {
    /// Writes a constant placeholder; the wrapped closure is never inspected.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "CustomChunker(…)")
    }
}
182
183/// Opaque wrapper around a [`Transcriber`] implementation.
184///
185/// Implements [`Debug`] (prints `"TranscriberHandle(…)"`) and [`Clone`] (cheap
186/// `Arc` clone), keeping [`CognifyConfig`] derivable.
187#[derive(Clone)]
188pub struct TranscriberHandle(pub Arc<dyn Transcriber>);
189
190impl std::fmt::Debug for TranscriberHandle {
191 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
192 f.write_str("TranscriberHandle(…)")
193 }
194}
195
196/// Chunking strategy options.
197#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
198pub enum ChunkStrategy {
199 /// Paragraph-based chunking (sentence-aware, no overlap).
200 /// Python: ChunkStrategy.PARAGRAPH
201 /// Default and most reliable for semantic coherence.
202 Paragraph,
203
204 /// Recursive character-based chunking with overlap.
205 /// Python: ChunkStrategy.RECURSIVE (via LangchainChunker)
206 /// Better for preserving context across chunk boundaries.
207 Recursive,
208}
209
210impl Default for CognifyConfig {
211 fn default() -> Self {
212 Self {
213 max_chunk_size: 1500,
214 chunk_overlap: 10,
215 chunk_strategy: ChunkStrategy::Paragraph,
216
217 chunks_per_batch: 100,
218 max_parallel_extractions: 20,
219 custom_extraction_prompt: None,
220
221 enable_summarization: true,
222 summarization_batch_size: 50,
223
224 embed_triplets: false,
225 embedding_batch_size: 100,
226 vector_collection_prefix: String::new(),
227
228 incremental_loading: true,
229
230 use_pipeline_cache: false,
231
232 temporal_cognify: false,
233 create_web_page_nodes: true,
234 data_per_batch: 20,
235
236 token_counter_kind: TokenCounterKind::from_env(),
237
238 graph_schema: None,
239 summary_schema: None,
240 custom_chunker: None,
241 transcriber: None,
242 }
243 }
244}
245
246impl CognifyConfig {
247 /// Set maximum chunk size in tokens.
248 pub fn with_chunk_size(mut self, size: usize) -> Self {
249 self.max_chunk_size = size;
250 self
251 }
252
253 /// Set chunk overlap (for recursive chunking).
254 pub fn with_chunk_overlap(mut self, overlap: usize) -> Self {
255 self.chunk_overlap = overlap;
256 self
257 }
258
259 /// Set chunking strategy.
260 pub fn with_chunk_strategy(mut self, strategy: ChunkStrategy) -> Self {
261 self.chunk_strategy = strategy;
262 self
263 }
264
265 /// Set number of chunks per batch during graph extraction.
266 pub fn with_chunks_per_batch(mut self, batch_size: usize) -> Self {
267 self.chunks_per_batch = batch_size;
268 self
269 }
270
271 /// Set maximum parallel extractions.
272 pub fn with_max_parallel_extractions(mut self, limit: usize) -> Self {
273 self.max_parallel_extractions = limit;
274 self
275 }
276
277 /// Set custom extraction prompt.
278 pub fn with_custom_prompt(mut self, prompt: String) -> Self {
279 self.custom_extraction_prompt = Some(prompt);
280 self
281 }
282
283 /// Enable or disable summarization.
284 pub fn with_summarization(mut self, enable: bool) -> Self {
285 self.enable_summarization = enable;
286 self
287 }
288
289 /// Set summarization batch size.
290 pub fn with_summarization_batch_size(mut self, batch_size: usize) -> Self {
291 self.summarization_batch_size = batch_size;
292 self
293 }
294
295 /// Enable or disable triplet embeddings.
296 pub fn with_triplet_embeddings(mut self, enable: bool) -> Self {
297 self.embed_triplets = enable;
298 self
299 }
300
301 /// Set embedding batch size.
302 pub fn with_embedding_batch_size(mut self, batch_size: usize) -> Self {
303 self.embedding_batch_size = batch_size;
304 self
305 }
306
307 /// Set vector collection prefix.
308 pub fn with_collection_prefix(mut self, prefix: String) -> Self {
309 self.vector_collection_prefix = prefix;
310 self
311 }
312
313 /// Enable or disable incremental loading.
314 pub fn with_incremental_loading(mut self, enable: bool) -> Self {
315 self.incremental_loading = enable;
316 self
317 }
318
319 /// Enable or disable pipeline-level caching.
320 pub fn with_pipeline_cache(mut self, enable: bool) -> Self {
321 self.use_pipeline_cache = enable;
322 self
323 }
324
325 /// Enable or disable temporal cognify.
326 pub fn with_temporal_cognify(mut self, enable: bool) -> Self {
327 self.temporal_cognify = enable;
328 self
329 }
330
331 /// Enable or disable WebPage/WebSite provenance graph construction.
332 pub fn with_web_page_nodes(mut self, enable: bool) -> Self {
333 self.create_web_page_nodes = enable;
334 self
335 }
336
337 /// Set data per batch for temporal processing.
338 pub fn with_data_per_batch(mut self, batch_size: usize) -> Self {
339 self.data_per_batch = batch_size;
340 self
341 }
342
343 /// Set the token counter implementation to use during chunking.
344 pub fn with_token_counter(mut self, kind: TokenCounterKind) -> Self {
345 self.token_counter_kind = kind;
346 self
347 }
348
349 /// Set a custom JSON Schema for graph extraction.
350 pub fn with_graph_schema(mut self, schema: serde_json::Value) -> Self {
351 self.graph_schema = Some(schema);
352 self
353 }
354
355 /// Set a custom JSON schema for summarization output (Python `summarization_model` parity).
356 ///
357 /// The schema must contain a string `summary` field — the pipeline reads
358 /// `summary` to build each `TextSummary`. Returns an error if the schema
359 /// lacks that field so callers catch the misconfiguration early rather than
360 /// mid-pipeline.
361 pub fn with_summary_schema(mut self, schema: serde_json::Value) -> Result<Self, ConfigError> {
362 validate_summary_schema(&schema)?;
363 self.summary_schema = Some(schema);
364 Ok(self)
365 }
366
367 /// Set a custom chunker callback.
368 #[allow(clippy::type_complexity)]
369 pub fn with_custom_chunker(
370 mut self,
371 chunker: Arc<dyn Fn(&str, usize) -> Vec<String> + Send + Sync>,
372 ) -> Self {
373 self.custom_chunker = Some(CustomChunker(chunker));
374 self
375 }
376
377 /// Set a transcriber for audio document processing.
378 pub fn with_transcriber(mut self, transcriber: Arc<dyn Transcriber>) -> Self {
379 self.transcriber = Some(TranscriberHandle(transcriber));
380 self
381 }
382
383 /// Auto-calculate `max_chunk_size`, mirroring Python's `get_max_chunk_tokens()`
384 /// from `cognee/infrastructure/llm/utils.py`:
385 ///
386 /// ```text
387 /// llm_cutoff_point = llm_max_completion_tokens // 2 # Python default: 16384 → 8192
388 /// max_chunk_tokens = min(embedding_engine.max_completion_tokens, llm_cutoff_point)
389 /// ```
390 ///
391 /// Python uses **completion-token** budgets (not context windows):
392 /// - `embedding_engine.max_completion_tokens` — the engine's configured token
393 /// limit. Python's `EmbeddingConfig` default is **8191**
394 /// (`embeddings/config.py:81`), passed to the engine by the factory; the
395 /// engine class's own `__init__` default of 512 is overridden in that path.
396 /// Rust mirrors this: `EmbeddingConfig.max_completion_tokens` defaults to 8191.
397 /// - `llm_max_completion_tokens` = **16384** (infrastructure/llm/config.py:51).
398 /// - So for an OpenAI-compatible engine: `min(8191, 8192) = 8191`. For the local
399 /// ONNX/BGE engine, `max_sequence_length()` is the model's 512-token limit, so
400 /// `min(512, 8192) = 512`. The embedding term is the binding one in both cases.
401 ///
402 /// The Rust `Llm` trait exposes only `max_context_length()` (a context window),
403 /// not a completion-token limit. Rather than divide an unrelated quantity, we use
404 /// Python's LLM completion-token constant (16384) directly. The embedding term
405 /// (`max_sequence_length()` — 512 for BGE, the configured `max_completion_tokens`
406 /// for OpenAI-compatible) is binding in all practical configurations, so the LLM
407 /// argument is currently unused (`_llm`).
408 ///
409 /// Result is at least 1.
410 pub fn auto_chunk_size(embedding_engine: &dyn EmbeddingEngine, _llm: &dyn Llm) -> usize {
411 // Python infrastructure/llm/config.py:51 — default LLM completion-token budget.
412 const PY_LLM_MAX_COMPLETION_TOKENS: usize = 16_384;
413 let llm_cutoff = PY_LLM_MAX_COMPLETION_TOKENS / 2; // == 8192
414 let embed_max = embedding_engine.max_sequence_length();
415 llm_cutoff.min(embed_max).max(1)
416 }
417
418 /// Set max_chunk_size by auto-calculating from embedding and LLM capabilities.
419 ///
420 /// See [`auto_chunk_size`](Self::auto_chunk_size) for the formula used.
421 pub fn with_auto_chunk_size(
422 mut self,
423 embedding_engine: &dyn EmbeddingEngine,
424 llm: &dyn Llm,
425 ) -> Self {
426 self.max_chunk_size = Self::auto_chunk_size(embedding_engine, llm);
427 self
428 }
429
430 /// Validate configuration parameters.
431 ///
432 /// Returns an error if any parameters are invalid.
433 pub fn validate(&self) -> Result<(), ConfigError> {
434 if self.max_chunk_size == 0 {
435 return Err(ConfigError::InvalidParameter(
436 "max_chunk_size must be greater than 0".to_string(),
437 ));
438 }
439
440 if self.chunk_overlap >= self.max_chunk_size {
441 return Err(ConfigError::InvalidParameter(
442 "chunk_overlap must be less than max_chunk_size".to_string(),
443 ));
444 }
445
446 if self.chunks_per_batch == 0 {
447 return Err(ConfigError::InvalidParameter(
448 "chunks_per_batch must be greater than 0".to_string(),
449 ));
450 }
451
452 if self.max_parallel_extractions == 0 {
453 return Err(ConfigError::InvalidParameter(
454 "max_parallel_extractions must be greater than 0".to_string(),
455 ));
456 }
457
458 if self.embedding_batch_size == 0 {
459 return Err(ConfigError::InvalidParameter(
460 "embedding_batch_size must be greater than 0".to_string(),
461 ));
462 }
463
464 if self.summarization_batch_size == 0 {
465 return Err(ConfigError::InvalidParameter(
466 "summarization_batch_size must be greater than 0".to_string(),
467 ));
468 }
469
470 if self.data_per_batch == 0 {
471 return Err(ConfigError::InvalidParameter(
472 "data_per_batch must be greater than 0".to_string(),
473 ));
474 }
475
476 Ok(())
477 }
478}
479
480/// Configuration error types.
481#[derive(Error, Debug)]
482pub enum ConfigError {
483 #[error("Invalid configuration parameter: {0}")]
484 InvalidParameter(String),
485 #[error("Invalid summary schema: {0}")]
486 InvalidSummarySchema(String),
487}
488
489/// Validate that a JSON schema supplied for `summary_schema` has a string
490/// `summary` property, so misconfigurations are caught at builder/setter time
491/// rather than mid-pipeline.
492pub fn validate_summary_schema(schema: &serde_json::Value) -> Result<(), ConfigError> {
493 let obj = schema.as_object().ok_or_else(|| {
494 ConfigError::InvalidSummarySchema("schema must be a JSON object".to_string())
495 })?;
496
497 let props = obj
498 .get("properties")
499 .and_then(|p| p.as_object())
500 .ok_or_else(|| {
501 ConfigError::InvalidSummarySchema("schema must have a 'properties' object".to_string())
502 })?;
503
504 let summary_prop = props.get("summary").ok_or_else(|| {
505 ConfigError::InvalidSummarySchema(
506 "schema 'properties' must include a 'summary' field".to_string(),
507 )
508 })?;
509
510 // Accept either {"type": "string"} or no type constraint at all.
511 if let Some(type_val) = summary_prop.get("type")
512 && type_val.as_str() != Some("string")
513 {
514 return Err(ConfigError::InvalidSummarySchema(
515 "'summary' field must be of type 'string'".to_string(),
516 ));
517 }
518
519 Ok(())
520}
521
/// Unit tests for [`CognifyConfig`]: defaults, builder setters, validation,
/// summary-schema checks and chunk-size auto-calculation.
#[cfg(test)]
mod tests {
    use super::*;
    use async_trait::async_trait;
    use cognee_embedding::error::EmbeddingResult;
    use cognee_llm::types::GenerationOptions;

    // Minimal mock for EmbeddingEngine — only max_sequence_length() matters.
    struct MockEmbedding {
        max_seq: usize,
    }

    #[async_trait]
    impl EmbeddingEngine for MockEmbedding {
        async fn embed(&self, _texts: &[&str]) -> EmbeddingResult<Vec<Vec<f32>>> {
            Ok(vec![])
        }
        fn dimension(&self) -> usize {
            384
        }
        fn batch_size(&self) -> usize {
            32
        }
        fn max_sequence_length(&self) -> usize {
            self.max_seq
        }
    }

    // Minimal mock for Llm. `auto_chunk_size` ignores its LLM argument, so
    // `max_ctx` only exists to show that varying it does not change results.
    struct MockLlm {
        max_ctx: u32,
    }

    #[async_trait]
    impl Llm for MockLlm {
        async fn generate(
            &self,
            _messages: Vec<cognee_llm::Message>,
            _options: Option<GenerationOptions>,
        ) -> cognee_llm::LlmResult<cognee_llm::GenerationResponse> {
            unimplemented!()
        }
        async fn create_structured_output_with_messages_raw(
            &self,
            _messages: Vec<cognee_llm::Message>,
            _json_schema: &serde_json::Value,
            _options: Option<GenerationOptions>,
        ) -> cognee_llm::LlmResult<serde_json::Value> {
            unimplemented!()
        }
        fn model(&self) -> &str {
            "mock"
        }
        fn max_context_length(&self) -> u32 {
            self.max_ctx
        }
    }

    #[test]
    fn test_default_config() {
        let config = CognifyConfig::default();

        // Chunking defaults
        assert_eq!(config.max_chunk_size, 1500);
        assert_eq!(config.chunk_overlap, 10);
        assert_eq!(config.chunk_strategy, ChunkStrategy::Paragraph);

        // Graph extraction defaults
        assert_eq!(config.chunks_per_batch, 100);
        assert_eq!(config.max_parallel_extractions, 20);
        assert!(config.custom_extraction_prompt.is_none());

        // Summarization defaults
        assert!(config.enable_summarization);
        assert_eq!(config.summarization_batch_size, 50);

        // Embedding defaults
        assert!(!config.embed_triplets);
        assert_eq!(config.embedding_batch_size, 100);
        assert_eq!(config.vector_collection_prefix, "");

        // Incremental defaults
        assert!(config.incremental_loading);

        // Pipeline cache defaults
        assert!(!config.use_pipeline_cache);

        // Advanced defaults
        assert!(!config.temporal_cognify);
        assert!(config.create_web_page_nodes);
        assert_eq!(config.data_per_batch, 20);

        // Optional plug-ins are unset by default
        assert!(config.graph_schema.is_none());
        assert!(config.summary_schema.is_none());
        assert!(config.custom_chunker.is_none());
        assert!(config.transcriber.is_none());
    }

    #[test]
    fn test_config_builder_chunking() {
        let config = CognifyConfig::default()
            .with_chunk_size(2000)
            .with_chunk_overlap(50)
            .with_chunk_strategy(ChunkStrategy::Recursive);

        assert_eq!(config.max_chunk_size, 2000);
        assert_eq!(config.chunk_overlap, 50);
        assert_eq!(config.chunk_strategy, ChunkStrategy::Recursive);
    }

    #[test]
    fn test_config_builder_graph_extraction() {
        let config = CognifyConfig::default()
            .with_chunks_per_batch(50)
            .with_max_parallel_extractions(25)
            .with_custom_prompt("Extract entities:".to_string());

        assert_eq!(config.chunks_per_batch, 50);
        assert_eq!(config.max_parallel_extractions, 25);
        assert_eq!(
            config.custom_extraction_prompt,
            Some("Extract entities:".to_string())
        );
    }

    #[test]
    fn test_config_builder_all_features() {
        let config = CognifyConfig::default()
            .with_chunk_size(2000)
            .with_triplet_embeddings(true)
            .with_incremental_loading(false)
            .with_summarization(false)
            .with_temporal_cognify(true);

        assert_eq!(config.max_chunk_size, 2000);
        assert!(config.embed_triplets);
        assert!(!config.incremental_loading);
        assert!(!config.enable_summarization);
        assert!(config.temporal_cognify);
    }

    /// Covers the setters that the other builder tests do not touch.
    #[test]
    fn test_config_builder_remaining_setters() {
        let graph_schema = serde_json::json!({ "type": "object" });
        let config = CognifyConfig::default()
            .with_summarization_batch_size(7)
            .with_embedding_batch_size(36)
            .with_collection_prefix("tenant_a_".to_string())
            .with_pipeline_cache(true)
            .with_web_page_nodes(false)
            .with_data_per_batch(5)
            .with_graph_schema(graph_schema.clone());

        assert_eq!(config.summarization_batch_size, 7);
        assert_eq!(config.embedding_batch_size, 36);
        assert_eq!(config.vector_collection_prefix, "tenant_a_");
        assert!(config.use_pipeline_cache);
        assert!(!config.create_web_page_nodes);
        assert_eq!(config.data_per_batch, 5);
        assert_eq!(config.graph_schema, Some(graph_schema));
    }

    #[test]
    fn test_config_builder_custom_chunker() {
        let config = CognifyConfig::default().with_custom_chunker(Arc::new(
            |text: &str, _max_tokens: usize| -> Vec<String> { vec![text.to_string()] },
        ));

        let chunker = config
            .custom_chunker
            .expect("custom chunker should be stored");
        assert_eq!((chunker.0)("hello", 8), vec!["hello".to_string()]);
        assert_eq!(format!("{chunker:?}"), "CustomChunker(…)");
    }

    #[test]
    fn test_config_validation_success() {
        let config = CognifyConfig::default();
        assert!(config.validate().is_ok());
    }

    #[test]
    fn test_config_validation_zero_chunk_size() {
        let config = CognifyConfig {
            max_chunk_size: 0,
            ..Default::default()
        };
        assert!(matches!(
            config.validate(),
            Err(ConfigError::InvalidParameter(_))
        ));
    }

    #[test]
    fn test_config_validation_overlap_too_large() {
        let config = CognifyConfig {
            max_chunk_size: 100,
            chunk_overlap: 100,
            ..Default::default()
        };
        assert!(matches!(
            config.validate(),
            Err(ConfigError::InvalidParameter(_))
        ));
    }

    #[test]
    fn test_config_validation_zero_batch_sizes() {
        let config1 = CognifyConfig {
            chunks_per_batch: 0,
            ..Default::default()
        };
        assert!(config1.validate().is_err());

        let config2 = CognifyConfig {
            embedding_batch_size: 0,
            ..Default::default()
        };
        assert!(config2.validate().is_err());

        let config3 = CognifyConfig {
            summarization_batch_size: 0,
            ..Default::default()
        };
        assert!(config3.validate().is_err());

        let config4 = CognifyConfig {
            max_parallel_extractions: 0,
            ..Default::default()
        };
        assert!(matches!(
            config4.validate(),
            Err(ConfigError::InvalidParameter(_))
        ));

        let config5 = CognifyConfig {
            data_per_batch: 0,
            ..Default::default()
        };
        assert!(matches!(
            config5.validate(),
            Err(ConfigError::InvalidParameter(_))
        ));
    }

    #[test]
    fn test_validate_summary_schema_accepts_string_summary() {
        let schema = serde_json::json!({
            "type": "object",
            "properties": { "summary": { "type": "string" } }
        });
        assert!(validate_summary_schema(&schema).is_ok());
    }

    #[test]
    fn test_validate_summary_schema_accepts_untyped_summary() {
        // A `summary` property without a `type` keyword is accepted.
        let schema = serde_json::json!({ "properties": { "summary": {} } });
        assert!(validate_summary_schema(&schema).is_ok());
    }

    #[test]
    fn test_validate_summary_schema_rejects_malformed_schemas() {
        let rejected = [
            // Not a JSON object at all.
            serde_json::json!("not an object"),
            // No `properties` member.
            serde_json::json!({ "type": "object" }),
            // `properties` present but not an object.
            serde_json::json!({ "properties": "not an object" }),
            // `properties` lacks `summary`.
            serde_json::json!({ "properties": { "title": { "type": "string" } } }),
            // `summary` declared with a non-string type.
            serde_json::json!({ "properties": { "summary": { "type": "integer" } } }),
        ];

        for schema in &rejected {
            assert!(
                matches!(
                    validate_summary_schema(schema),
                    Err(ConfigError::InvalidSummarySchema(_))
                ),
                "schema should be rejected: {schema}"
            );
        }
    }

    #[test]
    fn test_with_summary_schema_stores_valid_schema() {
        let schema = serde_json::json!({ "properties": { "summary": { "type": "string" } } });
        let config = CognifyConfig::default()
            .with_summary_schema(schema.clone())
            .expect("schema with a string `summary` must be accepted");
        assert_eq!(config.summary_schema, Some(schema));
    }

    #[test]
    fn test_with_summary_schema_rejects_missing_summary() {
        let result =
            CognifyConfig::default().with_summary_schema(serde_json::json!({ "properties": {} }));
        assert!(matches!(
            result,
            Err(ConfigError::InvalidSummarySchema(_))
        ));
    }

    /// Local ONNX/BGE default: the model's 512-token sequence limit binds →
    /// min(512, 8192) = 512. The LLM argument is unused in the new formula
    /// (Python constant 16384/2=8192). For an OpenAI-compatible engine at its
    /// default max_completion_tokens (8191), the result would be min(8191, 8192)
    /// = 8191 — see `test_auto_chunk_size_large_embedding` for the >8192 case.
    #[test]
    fn auto_chunk_size_matches_python_default() {
        // BGE-like embedding: max_seq=512 binds → result=512.
        let embed = MockEmbedding { max_seq: 512 };
        let llm = MockLlm { max_ctx: 4096 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 512);
    }

    #[test]
    fn test_auto_chunk_size_embed_is_smaller() {
        // embed_max=512, LLM cutoff=8192 → result=512 (embedding term dominates).
        let embed = MockEmbedding { max_seq: 512 };
        let llm = MockLlm { max_ctx: 4096 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 512);
    }

    #[test]
    fn test_auto_chunk_size_llm_cutoff_unused() {
        // LLM context window is NOT used for the cutoff. The Python completion-token
        // constant (16384 → 8192) is used instead. Even a tiny context window no
        // longer artificially restricts the chunk size — the embedding term (512)
        // still dominates.
        let embed = MockEmbedding { max_seq: 512 };
        let llm = MockLlm { max_ctx: 256 }; // previously returned 128, now returns 512
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 512);
    }

    #[test]
    fn test_auto_chunk_size_large_embedding() {
        // embed_max=10000 > llm_cutoff (8192) → result=8192 (LLM constant dominates).
        let embed = MockEmbedding { max_seq: 10_000 };
        let llm = MockLlm { max_ctx: 4096 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 8192);
    }

    #[test]
    fn test_auto_chunk_size_equal_values() {
        // embed_max=1024 < 8192 → result=1024 (embedding term dominates).
        // The "equal values" name presumably dates from the earlier
        // context-window formula, where max_ctx=2048 halved to 1024 and tied
        // with embed_max; the LLM argument no longer influences the result.
        let embed = MockEmbedding { max_seq: 1024 };
        let llm = MockLlm { max_ctx: 2048 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 1024);
    }

    #[test]
    fn test_auto_chunk_size_floor_at_one() {
        // embed_max=0 → min(0, 8192)=0 → clamped to 1.
        let embed = MockEmbedding { max_seq: 0 };
        let llm = MockLlm { max_ctx: 0 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 1);
    }

    #[test]
    fn test_auto_chunk_size_embed_exactly_at_llm_cutoff() {
        // embed_max=8192 == llm_cutoff → result=8192.
        let embed = MockEmbedding { max_seq: 8192 };
        let llm = MockLlm { max_ctx: 4096 };
        assert_eq!(CognifyConfig::auto_chunk_size(&embed, &llm), 8192);
    }

    #[test]
    fn test_with_auto_chunk_size_builder() {
        let embed = MockEmbedding { max_seq: 512 };
        let llm = MockLlm { max_ctx: 4096 };
        let config = CognifyConfig::default().with_auto_chunk_size(&embed, &llm);
        assert_eq!(config.max_chunk_size, 512);
        // Other fields should remain at defaults
        assert_eq!(config.chunk_overlap, 10);
        assert_eq!(config.chunks_per_batch, 100);
    }
}
783}