1use std::path::Path;
11
12use anyhow::{anyhow, Context, Result};
13use regex::Regex;
14use serde::Deserialize;
15
/// Column types accepted for a `promote_metadata` entry; enforced by
/// `PromoteColumn::validate_type`.
const ALLOWED_PROMOTE_TYPES: &[&str] = &[
    "text",
    "text[]",
    "int",
    "bigint",
    "boolean",
    "jsonb",
    "timestamptz",
    "date",
];

/// Regex source for the accepted shapes of a ClickHouse target `engine`
/// string: `MergeTree`, `MergeTree()` or `ReplacingMergeTree(<word>)`, each
/// optionally followed by ` ORDER BY <anything>`.
const CLICKHOUSE_ENGINE_RE: &str = r"^(MergeTree(\(\))?|ReplacingMergeTree\(\w+\))( ORDER BY .+)?$";

/// One promoted-metadata entry: a dot-separated `path` and the column `type_`.
///
/// The fields are public, so only values produced by the `Deserialize` impl
/// are guaranteed to have passed `validate_path` / `validate_type`.
#[derive(Debug, Clone)]
pub struct PromoteColumn {
    pub path: String,
    pub type_: String,
}

impl PromoteColumn {
    /// Returns `path` with every `.` replaced by `__`, lowercased.
    pub fn column_name(&self) -> String {
        self.path.replace('.', "__").to_lowercase()
    }

    /// Returns `true` when `seg` matches `^[A-Za-z_][A-Za-z0-9_]*$`.
    ///
    /// Written as a byte scan so no regex has to be compiled per validation;
    /// every accepted character is ASCII, so scanning bytes is exact.
    fn is_valid_segment(seg: &str) -> bool {
        let mut bytes = seg.bytes();
        matches!(bytes.next(), Some(b) if b.is_ascii_alphabetic() || b == b'_')
            && bytes.all(|b| b.is_ascii_alphanumeric() || b == b'_')
    }

    /// Validates a dot-separated metadata path.
    ///
    /// # Errors
    /// Returns a message when `path` is empty or when any `.`-separated
    /// segment (including an empty one) is not an identifier.
    fn validate_path(path: &str) -> std::result::Result<(), String> {
        if path.is_empty() {
            return Err("path must not be empty".into());
        }
        if path.split('.').all(Self::is_valid_segment) {
            Ok(())
        } else {
            Err(format!(
                "path segments must match ^[A-Za-z_][A-Za-z0-9_]*$ separated by '.', got {path:?}"
            ))
        }
    }

    /// Validates that `t` is one of `ALLOWED_PROMOTE_TYPES`.
    ///
    /// # Errors
    /// Returns a message listing the allowed types otherwise.
    fn validate_type(t: &str) -> std::result::Result<(), String> {
        if !ALLOWED_PROMOTE_TYPES.contains(&t) {
            return Err(format!(
                "promote_metadata type must be one of {ALLOWED_PROMOTE_TYPES:?}, got {t:?}"
            ));
        }
        Ok(())
    }
}
81
82impl<'de> serde::Deserialize<'de> for PromoteColumn {
83 fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
84 #[derive(serde::Deserialize)]
85 struct Raw {
86 path: String,
87 #[serde(rename = "type")]
88 type_: String,
89 }
90 let r = Raw::deserialize(d)?;
91 Self::validate_path(&r.path).map_err(serde::de::Error::custom)?;
92 Self::validate_type(&r.type_).map_err(serde::de::Error::custom)?;
93 Ok(Self {
94 path: r.path,
95 type_: r.type_,
96 })
97 }
98}
99
100#[derive(Debug, Clone, Deserialize)]
102pub struct CellConfig {
103 pub cell_name: String,
104 pub source: SourceConfig,
105 pub chunker: ChunkerConfig,
106 pub embedder: EmbedderConfig,
107 pub target: TargetConfig,
108 #[serde(default)]
109 pub runtime: RuntimeConfig,
110 #[serde(default)]
111 pub framer: FramerConfig,
112 #[serde(default)]
113 pub extractor: ExtractorConfig,
114}
115
116#[derive(Debug, Clone, Deserialize)]
120#[serde(tag = "type", rename_all = "snake_case")]
121pub enum ExtractorConfig {
122 None(NoneExtractorConfig),
123 Composite(CompositeExtractorConfig),
124 RakeKeywords(RakeKeywordsExtractorConfig),
125 LangDetect(LangDetectExtractorConfig),
126 KeybertPhrases(KeybertPhrasesExtractorConfig),
127 SpacyEntities(SpacyEntitiesExtractorConfig),
128}
129
130impl Default for ExtractorConfig {
131 fn default() -> Self {
132 ExtractorConfig::None(NoneExtractorConfig::default())
133 }
134}
135
136#[derive(Debug, Clone, Deserialize, Default)]
137pub struct NoneExtractorConfig {}
138
139#[derive(Debug, Clone, Deserialize)]
140pub struct CompositeExtractorConfig {
141 #[serde(default)]
142 pub extractors: Vec<ExtractorConfig>,
143}
144
145#[derive(Debug, Clone, Deserialize)]
146pub struct RakeKeywordsExtractorConfig {
147 #[serde(default = "default_rake_top_k")]
148 pub top_k: usize,
149 #[serde(default = "default_rake_min_chars")]
150 pub min_chars: usize,
151}
152
153#[derive(Debug, Clone, Deserialize)]
154pub struct LangDetectExtractorConfig {
155 #[serde(default = "default_lang_backend")]
156 pub backend: String,
157}
158
159#[derive(Debug, Clone, Deserialize)]
160pub struct KeybertPhrasesExtractorConfig {
161 #[serde(default = "default_keybert_top_k")]
162 pub top_k: usize,
163 #[serde(default = "default_keybert_model")]
164 pub model_name: String,
165 #[serde(default = "default_keybert_ngram")]
166 pub keyphrase_ngram_range: (usize, usize),
167}
168
169#[derive(Debug, Clone, Deserialize)]
170pub struct SpacyEntitiesExtractorConfig {
171 #[serde(default = "default_spacy_model")]
172 pub model: String,
173 #[serde(default = "default_spacy_whitelist")]
174 pub label_whitelist: Vec<String>,
175}
176
/// Serde default for `RakeKeywordsExtractorConfig::top_k`.
fn default_rake_top_k() -> usize {
    10
}

/// Serde default for `RakeKeywordsExtractorConfig::min_chars`.
fn default_rake_min_chars() -> usize {
    3
}

/// Serde default for `LangDetectExtractorConfig::backend`.
fn default_lang_backend() -> String {
    String::from("langdetect")
}

/// Serde default for `KeybertPhrasesExtractorConfig::top_k`.
fn default_keybert_top_k() -> usize {
    10
}

/// Serde default for `KeybertPhrasesExtractorConfig::model_name`.
fn default_keybert_model() -> String {
    String::from("all-MiniLM-L6-v2")
}

/// Serde default for `KeybertPhrasesExtractorConfig::keyphrase_ngram_range`.
fn default_keybert_ngram() -> (usize, usize) {
    (1, 2)
}

/// Serde default for `SpacyEntitiesExtractorConfig::model`.
fn default_spacy_model() -> String {
    String::from("en_core_web_sm")
}

/// Serde default for `SpacyEntitiesExtractorConfig::label_whitelist`.
fn default_spacy_whitelist() -> Vec<String> {
    ["ORG", "PERSON", "GPE", "DATE", "LAW"]
        .iter()
        .map(|label| label.to_string())
        .collect()
}
204
205#[derive(Debug, Clone, Deserialize)]
206#[serde(tag = "type", rename_all = "snake_case")]
207pub enum FramerConfig {
208 Identity(IdentityFramerConfig),
209 HeadingBoundary(HeadingBoundaryFramerConfig),
210 RegexBoundary(RegexBoundaryFramerConfig),
211 Jsonpath(JsonPathFramerConfig),
212 SessionEpisode(SessionEpisodeFramerConfig),
216}
217
218impl Default for FramerConfig {
219 fn default() -> Self {
220 FramerConfig::Identity(IdentityFramerConfig {})
221 }
222}
223
224#[derive(Debug, Clone, Deserialize, Default)]
225pub struct IdentityFramerConfig {}
226
227#[derive(Debug, Clone, Deserialize)]
228pub struct HeadingBoundaryFramerConfig {
229 #[serde(default = "default_heading_pattern")]
230 pub pattern: String,
231 #[serde(default = "default_true")]
232 pub title_from_heading: bool,
233}
234
235#[derive(Debug, Clone, Deserialize)]
236pub struct RegexBoundaryFramerConfig {
237 pub split_pattern: String,
238 #[serde(default)]
239 pub title_pattern: Option<String>,
240 #[serde(default = "default_true")]
241 pub body_starts_with_match: bool,
242}
243
244#[derive(Debug, Clone, Deserialize)]
245pub struct JsonPathFramerConfig {
246 pub row_path: String,
247 #[serde(default)]
248 pub title_path: Option<String>,
249 #[serde(default = "default_jsonpath_body")]
250 pub body_path: String,
251}
252
253#[derive(Debug, Clone, Deserialize)]
257#[serde(deny_unknown_fields)]
258pub struct SessionEpisodeFramerConfig {
259 #[serde(default = "default_max_gap_seconds")]
262 pub max_gap_seconds: u64,
263 #[serde(default = "default_max_turns")]
265 pub max_turns: u32,
266 #[serde(default = "default_max_words")]
268 pub max_words: u32,
269 #[serde(default = "default_true")]
271 pub boundary_on_tool: bool,
272}
273
/// Serde default for `SessionEpisodeFramerConfig::max_gap_seconds`.
fn default_max_gap_seconds() -> u64 {
    1800
}

/// Serde default for `SessionEpisodeFramerConfig::max_turns`.
fn default_max_turns() -> u32 {
    40
}

/// Serde default for `SessionEpisodeFramerConfig::max_words`.
fn default_max_words() -> u32 {
    1200
}

/// Serde default for `HeadingBoundaryFramerConfig::pattern`.
fn default_heading_pattern() -> String {
    String::from(r"^#+\s")
}

/// Shared serde default for boolean fields that are on unless disabled.
fn default_true() -> bool {
    true
}

/// Serde default for `JsonPathFramerConfig::body_path`.
fn default_jsonpath_body() -> String {
    String::from("$")
}
293
294#[derive(Debug, Clone, Deserialize)]
295#[serde(tag = "type", rename_all = "snake_case")]
296pub enum SourceConfig {
297 Files(FilesSourceConfig),
298 JsonCorpus(JsonCorpusSourceConfig),
299 PgTable(PgTableSourceConfig),
300 MariadbTable(MariadbTableSourceConfig),
301 SqliteTable(SqliteTableSourceConfig),
302 Http(HttpSourceConfig),
303 S3(S3SourceConfig),
304 ClickhouseTable(ClickhouseTableSourceConfig),
305 SessionStaging(SessionStagingSourceConfig),
309 Inline(InlineSourceConfig),
314}
315
316#[derive(Debug, Clone, Deserialize, Default)]
317pub struct InlineSourceConfig {}
318
319#[derive(Debug, Clone, Deserialize)]
320pub struct FilesSourceConfig {
321 pub glob: String,
322 #[serde(default = "default_id_from")]
323 pub id_from: String,
324 #[serde(default = "default_encoding")]
325 pub encoding: String,
326}
327
328#[derive(Debug, Clone, Deserialize)]
329pub struct JsonCorpusSourceConfig {
330 pub path: String,
331 #[serde(default = "default_documents_key")]
332 pub documents_key: String,
333 #[serde(default = "default_id_field")]
334 pub id_field: String,
335 #[serde(default = "default_content_field")]
336 pub content_field: String,
337 #[serde(default = "default_title_field")]
338 pub title_field: Option<String>,
339}
340
341#[derive(Debug, Clone, Deserialize)]
342pub struct PgTableSourceConfig {
343 pub dsn_env: String,
344 #[serde(rename = "schema")]
345 pub schema_name: String,
346 pub table: String,
347 pub id_column: String,
348 pub content_column: String,
349 #[serde(default)]
350 pub title_column: Option<String>,
351 #[serde(default, rename = "where")]
355 pub where_clause: Option<String>,
356 #[serde(default)]
360 pub metadata_columns: Vec<String>,
361 #[serde(default)]
368 pub updated_at_column: Option<String>,
369}
370
371#[derive(Debug, Clone, Deserialize)]
372pub struct MariadbTableSourceConfig {
373 pub dsn_env: String,
374 #[serde(rename = "database")]
375 pub database_name: String,
376 pub table: String,
377 pub id_column: String,
378 pub content_column: String,
379 #[serde(default)]
380 pub title_column: Option<String>,
381 #[serde(default, rename = "where")]
384 pub where_clause: Option<String>,
385 #[serde(default)]
386 pub metadata_columns: Vec<String>,
387}
388
389#[derive(Debug, Clone, Deserialize)]
393pub struct SqliteTableSourceConfig {
394 pub dsn_env: String,
395 #[serde(rename = "database")]
396 pub database_name: String,
397 pub table: String,
398 pub id_column: String,
399 pub content_column: String,
400 #[serde(default)]
401 pub title_column: Option<String>,
402 #[serde(default, rename = "where")]
405 pub where_clause: Option<String>,
406 #[serde(default)]
407 pub metadata_columns: Vec<String>,
408}
409
410#[derive(Debug, Clone, Deserialize)]
412pub struct ClickhouseTableSourceConfig {
413 pub dsn_env: String,
414 #[serde(rename = "database")]
415 pub database_name: String,
416 pub table: String,
417 pub id_column: String,
418 pub content_column: String,
419 #[serde(default)]
420 pub title_column: Option<String>,
421 #[serde(default, rename = "where")]
425 pub where_clause: Option<String>,
426 #[serde(default)]
427 pub metadata_columns: Vec<String>,
428}
429
430#[derive(Debug, Clone, Deserialize)]
431pub struct HttpSourceConfig {
432 #[serde(default)]
433 pub urls: Vec<String>,
434 #[serde(default)]
435 pub sitemap: Option<String>,
436 #[serde(default)]
441 pub crawl_depth: u32,
442 #[serde(default)]
446 pub allow_external: bool,
447 #[serde(default = "default_request_delay_seconds")]
450 pub request_delay_seconds: f64,
451 #[serde(default = "default_respect_robots")]
453 pub respect_robots: bool,
454 #[serde(default = "default_max_pages")]
456 pub max_pages: u64,
457 #[serde(default = "default_user_agent")]
459 pub user_agent: String,
460}
461
/// Serde default for `HttpSourceConfig::request_delay_seconds`.
fn default_request_delay_seconds() -> f64 {
    0.5
}

/// Serde default for `HttpSourceConfig::respect_robots`.
fn default_respect_robots() -> bool {
    true
}

/// Serde default for `HttpSourceConfig::max_pages`.
fn default_max_pages() -> u64 {
    1000
}

/// Serde default for `HttpSourceConfig::user_agent`.
fn default_user_agent() -> String {
    String::from("chunkshop/0.6 (+https://github.com/yonk-labs/chunkshop)")
}
474
475impl Default for HttpSourceConfig {
476 fn default() -> Self {
477 Self {
478 urls: Vec::new(),
479 sitemap: None,
480 crawl_depth: 0,
481 allow_external: false,
482 request_delay_seconds: default_request_delay_seconds(),
483 respect_robots: default_respect_robots(),
484 max_pages: default_max_pages(),
485 user_agent: default_user_agent(),
486 }
487 }
488}
489
490#[derive(Debug, Clone, Deserialize)]
491pub struct S3SourceConfig {
492 pub bucket: String,
493 #[serde(default)]
494 pub prefix: String,
495 #[serde(default)]
499 pub endpoint_url: Option<String>,
500}
501
502#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
507#[serde(rename_all = "lowercase")]
508pub enum MemoryTier {
509 Provisional,
510 Consolidated,
511}
512
513#[derive(Debug, Clone, Deserialize)]
518#[serde(deny_unknown_fields)]
519pub struct MemoryConfig {
520 pub tier: MemoryTier,
521 #[serde(default = "default_memory_supersede")]
525 pub supersede: bool,
526 #[serde(default)]
529 pub namespace: Option<String>,
530}
531
/// Serde default for `MemoryConfig::supersede`.
fn default_memory_supersede() -> bool {
    true
}
535
536#[derive(Debug, Clone, Deserialize)]
544#[serde(deny_unknown_fields)]
545pub struct SessionStagingSourceConfig {
546 #[serde(default)]
549 pub dsn: Option<String>,
550 #[serde(default)]
551 pub dsn_env: Option<String>,
552 #[serde(default = "default_staging_table")]
553 pub staging_table: String,
554 #[serde(default = "default_staging_schema")]
555 pub staging_schema: String,
556 pub mode: SessionStagingMode,
557 #[serde(default)]
561 pub min_age_seconds: u64,
562 #[serde(default)]
564 pub max_sessions: Option<usize>,
565}
566
567#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
568#[serde(rename_all = "lowercase")]
569pub enum SessionStagingMode {
570 Realtime,
571 Consolidate,
572}
573
/// Serde default for `SessionStagingSourceConfig::staging_table`.
fn default_staging_table() -> String {
    String::from("chunkshop_staging")
}

/// Serde default for `SessionStagingSourceConfig::staging_schema`.
fn default_staging_schema() -> String {
    String::from("public")
}

/// Serde default for `FilesSourceConfig::id_from`.
fn default_id_from() -> String {
    String::from("stem")
}

/// Serde default for `FilesSourceConfig::encoding`.
fn default_encoding() -> String {
    String::from("utf-8")
}

/// Serde default for `JsonCorpusSourceConfig::documents_key`.
fn default_documents_key() -> String {
    String::from("documents")
}

/// Serde default for `JsonCorpusSourceConfig::id_field`.
fn default_id_field() -> String {
    String::from("id")
}

/// Serde default for `JsonCorpusSourceConfig::content_field`.
fn default_content_field() -> String {
    String::from("content")
}

/// Serde default for `JsonCorpusSourceConfig::title_field`.
fn default_title_field() -> Option<String> {
    Some(String::from("title"))
}
601
602#[derive(Debug, Clone, Deserialize)]
603#[serde(tag = "type", rename_all = "snake_case")]
604pub enum ChunkerConfig {
605 SentenceAware(SentenceAwareChunkerConfig),
606 Hierarchy(HierarchyChunkerConfig),
607 FixedOverlap(FixedOverlapChunkerConfig),
608 NeighborExpand(NeighborExpandChunkerConfig),
609 Semantic(SemanticChunkerConfig),
610 SummaryEmbed(SummaryEmbedChunkerConfig),
611 HierarchicalSummary(HierarchicalSummaryChunkerConfig),
612 Consolidation(ConsolidationChunkerConfig),
616 #[cfg(feature = "code-aware")]
620 SymbolAware(SymbolAwareChunkerConfig),
621}
622
623impl ChunkerConfig {
624 pub fn effective_max_chars(&self) -> Option<usize> {
631 match self {
632 ChunkerConfig::SentenceAware(c) => Some(c.max_chars),
633 ChunkerConfig::Hierarchy(c) => Some(c.max_chars),
634 ChunkerConfig::FixedOverlap(c) => c.max_chars,
635 ChunkerConfig::Semantic(c) => Some(c.max_chunk_chars),
636 ChunkerConfig::NeighborExpand(c) => {
637 c.max_chars.or_else(|| c.base.effective_max_chars())
638 }
639 ChunkerConfig::SummaryEmbed(c) => c.max_chars.or_else(|| c.base.effective_max_chars()),
640 ChunkerConfig::HierarchicalSummary(c) => {
641 c.max_chars.or_else(|| c.base.effective_max_chars())
642 }
643 ChunkerConfig::Consolidation(c) => c.base.effective_max_chars(),
644 #[cfg(feature = "code-aware")]
645 ChunkerConfig::SymbolAware(_) => None,
646 }
647 }
648
649 pub fn if_oversize(&self) -> Option<&ChunkerConfig> {
652 match self {
653 ChunkerConfig::SentenceAware(c) => c.if_oversize.as_deref(),
654 ChunkerConfig::Hierarchy(c) => c.if_oversize.as_deref(),
655 ChunkerConfig::FixedOverlap(c) => c.if_oversize.as_deref(),
656 ChunkerConfig::Semantic(c) => c.if_oversize.as_deref(),
657 ChunkerConfig::NeighborExpand(c) => c.if_oversize.as_deref(),
658 ChunkerConfig::SummaryEmbed(c) => c.if_oversize.as_deref(),
659 ChunkerConfig::HierarchicalSummary(c) => c.if_oversize.as_deref(),
660 ChunkerConfig::Consolidation(c) => c.if_oversize.as_deref(),
661 #[cfg(feature = "code-aware")]
662 ChunkerConfig::SymbolAware(_) => None,
663 }
664 }
665
666 pub fn type_name(&self) -> &'static str {
668 match self {
669 ChunkerConfig::SentenceAware(_) => "sentence_aware",
670 ChunkerConfig::Hierarchy(_) => "hierarchy",
671 ChunkerConfig::FixedOverlap(_) => "fixed_overlap",
672 ChunkerConfig::NeighborExpand(_) => "neighbor_expand",
673 ChunkerConfig::Semantic(_) => "semantic",
674 ChunkerConfig::SummaryEmbed(_) => "summary_embed",
675 ChunkerConfig::HierarchicalSummary(_) => "hierarchical_summary",
676 ChunkerConfig::Consolidation(_) => "consolidation",
677 #[cfg(feature = "code-aware")]
678 ChunkerConfig::SymbolAware(_) => "symbol_aware",
679 }
680 }
681}
682
683#[derive(Debug, Clone, Deserialize)]
684pub struct SentenceAwareChunkerConfig {
685 #[serde(default = "default_doc_type")]
686 pub doc_type: String,
687 #[serde(default = "default_max_chars")]
688 pub max_chars: usize,
689 #[serde(default = "default_min_chars")]
690 pub min_chars: usize,
691 #[serde(default)]
692 pub if_oversize: Option<Box<ChunkerConfig>>,
693}
694
/// Payload of the `symbol_aware` chunker variant; only compiled in with the
/// `code-aware` feature.
#[cfg(feature = "code-aware")]
#[derive(Clone, Debug, Default, Deserialize)]
pub struct SymbolAwareChunkerConfig {
    /// `None` when omitted.
    #[serde(default)]
    pub project_id: Option<String>,
}
705
706#[derive(Debug, Clone, Deserialize)]
707pub struct HierarchyChunkerConfig {
708 #[serde(default = "default_prefix_heading")]
709 pub prefix_heading: bool,
710 #[serde(default = "default_min_section_chars")]
711 pub min_section_chars: usize,
712 #[serde(default = "default_max_chars")]
713 pub max_chars: usize,
714 #[serde(default)]
715 pub if_oversize: Option<Box<ChunkerConfig>>,
716 #[serde(default, skip_serializing_if = "Option::is_none")]
725 pub heading_pattern: Option<String>,
726}
727
728#[derive(Debug, Clone, Deserialize)]
729pub struct FixedOverlapChunkerConfig {
730 #[serde(default = "default_window_words")]
731 pub window_words: usize,
732 #[serde(default = "default_step_words")]
733 pub step_words: usize,
734 #[serde(default)]
740 pub max_chars: Option<usize>,
741 #[serde(default)]
742 pub if_oversize: Option<Box<ChunkerConfig>>,
743}
744
745#[derive(Debug, Clone, Deserialize)]
746pub struct NeighborExpandChunkerConfig {
747 pub base: Box<ChunkerConfig>,
748 #[serde(default = "default_neighbor_window")]
749 pub window: usize,
750 #[serde(default)]
753 pub max_chars: Option<usize>,
754 #[serde(default)]
755 pub if_oversize: Option<Box<ChunkerConfig>>,
756}
757
758#[derive(Debug, Clone, Deserialize)]
763#[serde(deny_unknown_fields)]
764pub struct ConsolidationChunkerConfig {
765 pub base: Box<ChunkerConfig>,
768 pub consolidator: ConsolidatorConfig,
772 #[serde(default = "default_fact_max_chars")]
774 pub fact_max_chars: usize,
775 #[serde(default)]
776 pub if_oversize: Option<Box<ChunkerConfig>>,
777}
778
/// Serde default for `ConsolidationChunkerConfig::fact_max_chars`.
fn default_fact_max_chars() -> usize {
    1200
}
782
783#[derive(Debug, Clone, Deserialize)]
788#[serde(tag = "mode", rename_all = "snake_case", deny_unknown_fields)]
789pub enum ConsolidatorConfig {
790 Extractive(ExtractiveConsolidatorConfig),
791}
792
793#[derive(Debug, Clone, Deserialize, Default)]
794#[serde(deny_unknown_fields)]
795pub struct ExtractiveConsolidatorConfig {}
796
797#[derive(Debug, Clone, Deserialize)]
798pub struct SemanticChunkerConfig {
799 #[serde(default = "default_boundary_model")]
800 pub boundary_model: String,
801 #[serde(default = "default_breakpoint_percentile")]
802 pub breakpoint_percentile: u32,
803 #[serde(default = "default_min_sents_per_chunk")]
804 pub min_sentences_per_chunk: usize,
805 #[serde(default = "default_max_chunk_chars")]
806 pub max_chunk_chars: usize,
807 #[serde(default = "default_sentence_splitter")]
808 pub sentence_splitter: String,
809 #[serde(default)]
810 pub if_oversize: Option<Box<ChunkerConfig>>,
811}
812
813#[derive(Debug, Clone, Deserialize)]
816#[serde(tag = "mode", rename_all = "snake_case")]
817pub enum SummarizerConfig {
818 External(ExternalSummarizerConfig),
819 Callable(CallableSummarizerConfig),
820 Passthrough(PassthroughSummarizerConfig),
821}
822
823impl SummarizerConfig {
824 pub fn mode_str(&self) -> &'static str {
827 match self {
828 SummarizerConfig::External(_) => "external",
829 SummarizerConfig::Callable(_) => "callable",
830 SummarizerConfig::Passthrough(_) => "passthrough",
831 }
832 }
833}
834
835#[derive(Debug, Clone, Deserialize)]
836pub struct ExternalSummarizerConfig {
837 #[serde(default = "default_external_field")]
838 pub field: String,
839}
840
841#[derive(Debug, Clone, Deserialize)]
842pub struct CallableSummarizerConfig {
843 pub module: String,
844 #[serde(default = "default_callable_function")]
845 pub function: String,
846 #[serde(default)]
847 pub kwargs: serde_json::Map<String, serde_json::Value>,
848}
849
850#[derive(Debug, Clone, Deserialize, Default)]
851pub struct PassthroughSummarizerConfig {}
852
/// Serde default for `ExternalSummarizerConfig::field`.
fn default_external_field() -> String {
    String::from("summary")
}

/// Serde default for `CallableSummarizerConfig::function`.
fn default_callable_function() -> String {
    String::from("summarize")
}
859
860#[derive(Debug, Clone, Deserialize)]
863#[serde(tag = "strategy", rename_all = "snake_case")]
864pub enum GroupingConfig {
865 FixedN(FixedNGroupingConfig),
866 WordBudget(WordBudgetGroupingConfig),
867 SectionAware(SectionAwareGroupingConfig),
868}
869
870impl Default for GroupingConfig {
871 fn default() -> Self {
872 GroupingConfig::FixedN(FixedNGroupingConfig::default())
873 }
874}
875
876#[derive(Debug, Clone, Deserialize)]
877pub struct FixedNGroupingConfig {
878 #[serde(default = "default_fixed_n")]
879 pub n: usize,
880}
881
882impl Default for FixedNGroupingConfig {
883 fn default() -> Self {
884 Self {
885 n: default_fixed_n(),
886 }
887 }
888}
889
890#[derive(Debug, Clone, Deserialize)]
891pub struct WordBudgetGroupingConfig {
892 #[serde(default = "default_word_budget")]
893 pub max_words: usize,
894}
895
896#[derive(Debug, Clone, Deserialize, Default)]
897pub struct SectionAwareGroupingConfig {}
898
/// Default for `FixedNGroupingConfig::n`.
fn default_fixed_n() -> usize {
    5
}

/// Serde default for `WordBudgetGroupingConfig::max_words`.
fn default_word_budget() -> usize {
    2000
}
905
906#[derive(Debug, Clone, Deserialize)]
907pub struct SummaryEmbedChunkerConfig {
908 pub base: Box<ChunkerConfig>,
909 pub summarizer: SummarizerConfig,
910 #[serde(default)]
913 pub max_chars: Option<usize>,
914 #[serde(default)]
915 pub if_oversize: Option<Box<ChunkerConfig>>,
916}
917
918#[derive(Debug, Clone, Deserialize)]
919pub struct HierarchicalSummaryChunkerConfig {
920 pub base: Box<ChunkerConfig>,
921 pub summarizer: SummarizerConfig,
922 #[serde(default)]
923 pub grouping: GroupingConfig,
924 #[serde(default)]
928 pub max_chars: Option<usize>,
929 #[serde(default)]
930 pub if_oversize: Option<Box<ChunkerConfig>>,
931}
932
/// Serde default for `FixedOverlapChunkerConfig::window_words`.
fn default_window_words() -> usize {
    300
}

/// Serde default for `FixedOverlapChunkerConfig::step_words`.
fn default_step_words() -> usize {
    150
}

/// Serde default for `NeighborExpandChunkerConfig::window`.
fn default_neighbor_window() -> usize {
    1
}

/// Serde default for `SemanticChunkerConfig::boundary_model`.
fn default_boundary_model() -> String {
    String::from("sentence-transformers/all-MiniLM-L6-v2-int8")
}

/// Serde default for `SemanticChunkerConfig::breakpoint_percentile`.
fn default_breakpoint_percentile() -> u32 {
    95
}

/// Serde default for `SemanticChunkerConfig::min_sentences_per_chunk`.
fn default_min_sents_per_chunk() -> usize {
    3
}

/// Serde default for `SemanticChunkerConfig::max_chunk_chars`.
fn default_max_chunk_chars() -> usize {
    2000
}

/// Serde default for `SemanticChunkerConfig::sentence_splitter`.
fn default_sentence_splitter() -> String {
    String::from("naive")
}

/// Serde default for `SentenceAwareChunkerConfig::doc_type`.
fn default_doc_type() -> String {
    String::from("prose")
}

/// Serde default for `max_chars` on the sentence-aware and hierarchy chunkers.
fn default_max_chars() -> usize {
    2000
}

/// Serde default for `SentenceAwareChunkerConfig::min_chars`.
fn default_min_chars() -> usize {
    200
}

/// Serde default for `HierarchyChunkerConfig::prefix_heading`.
fn default_prefix_heading() -> bool {
    true
}

/// Serde default for `HierarchyChunkerConfig::min_section_chars`.
fn default_min_section_chars() -> usize {
    100
}
973
974#[derive(Debug, Clone, Deserialize)]
975#[serde(tag = "type", rename_all = "snake_case")]
976pub enum EmbedderConfig {
977 Fastembed(FastembedEmbedderConfig),
978}
979
980#[derive(Debug, Clone, Deserialize)]
981pub struct FastembedEmbedderConfig {
982 pub model_name: String,
983 pub dim: usize,
984 #[serde(default = "default_batch_size")]
985 pub batch_size: usize,
986 #[serde(default)]
987 pub threads: Option<usize>,
988
989 #[serde(default)]
995 pub hf_repo: Option<String>,
996 #[serde(default)]
997 pub onnx_path: Option<String>,
998 #[serde(default = "default_pooling")]
999 pub pooling: String, #[serde(default = "default_additional_files")]
1001 pub additional_files: Vec<String>,
1002}
1003
/// Serde default for `FastembedEmbedderConfig::batch_size`.
fn default_batch_size() -> usize {
    64
}

/// Serde default for `FastembedEmbedderConfig::pooling`.
fn default_pooling() -> String {
    String::from("cls")
}

/// Serde default for `FastembedEmbedderConfig::additional_files`.
fn default_additional_files() -> Vec<String> {
    [
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "config.json",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect()
}
1020
1021impl FastembedEmbedderConfig {
1022 pub fn is_byo(&self) -> bool {
1024 self.hf_repo.is_some() && self.onnx_path.is_some()
1025 }
1026
1027 pub fn validate(&self) -> Result<()> {
1030 if self.hf_repo.is_some() != self.onnx_path.is_some() {
1031 return Err(anyhow!(
1032 "embedder.hf_repo and embedder.onnx_path must be set together \
1033 (BYO mode) or both omitted (registry mode)."
1034 ));
1035 }
1036 if self.hf_repo.is_some() && !matches!(self.pooling.as_str(), "cls" | "mean") {
1037 return Err(anyhow!(
1038 "embedder.pooling must be 'cls' or 'mean' for BYO embedders, got {:?}",
1039 self.pooling
1040 ));
1041 }
1042 Ok(())
1043 }
1044}
1045
1046#[derive(Debug, Clone, Deserialize)]
1047#[serde(tag = "type", rename_all = "snake_case")]
1048pub enum TargetConfig {
1049 Postgres(PostgresTargetConfig),
1050 Mariadb(MariadbTargetConfig),
1051 Sqlite(SqliteTargetConfig),
1052 Clickhouse(ClickhouseTargetConfig),
1053}
1054
1055impl TargetConfig {
1056 fn validate(&self) -> Result<()> {
1059 match self {
1060 TargetConfig::Postgres(t) => t.validate(),
1061 TargetConfig::Mariadb(t) => t.validate(),
1062 TargetConfig::Sqlite(t) => t.validate(),
1063 TargetConfig::Clickhouse(t) => t.validate(),
1064 }
1065 }
1066}
1067
1068#[derive(Debug, Clone, Deserialize, Default)]
1069pub struct DocumentStoreConfig {
1070 #[serde(default)]
1071 pub enabled: bool,
1072}
1073
1074fn validate_no_document_store(documents: &Option<DocumentStoreConfig>) -> Result<()> {
1075 if documents.as_ref().is_some_and(|d| d.enabled) {
1076 return Err(anyhow!(
1077 "target.documents is currently Python/Postgres-only; Rust does not write the companion document table yet"
1078 ));
1079 }
1080 Ok(())
1081}
1082
1083#[derive(Debug, Clone, Deserialize)]
1084pub struct PostgresTargetConfig {
1085 #[serde(default = "default_dsn_env")]
1086 pub dsn_env: String,
1087 #[serde(rename = "database")]
1088 pub database_name: String,
1089 pub table: String,
1090 #[serde(default)]
1094 pub overwrite: bool,
1095 #[serde(default = "default_hnsw")]
1096 pub hnsw: bool,
1097 #[serde(default = "default_vector_metric")]
1099 pub vector_metric: String,
1100 #[serde(default = "default_mode")]
1103 pub mode: String,
1104 #[serde(default)]
1105 pub source_tag: Option<String>,
1106 #[serde(default)]
1107 pub promote_metadata: Vec<PromoteColumn>,
1108 #[serde(default)]
1109 pub force_overwrite: bool,
1110 #[serde(default)]
1116 pub delete_orphans: bool,
1117 #[serde(default)]
1121 pub memory: Option<MemoryConfig>,
1122 #[serde(default)]
1126 pub documents: Option<DocumentStoreConfig>,
1127}
1128
1129impl PostgresTargetConfig {
1130 fn validate(&self) -> Result<()> {
1134 if self.mode == "append" && self.source_tag.is_none() {
1135 return Err(anyhow!(
1136 "target.mode='append' requires target.source_tag to identify this cell"
1137 ));
1138 }
1139 validate_no_document_store(&self.documents)?;
1140 if !matches!(
1141 self.vector_metric.as_str(),
1142 "cosine" | "inner_product" | "l2"
1143 ) {
1144 return Err(anyhow!(
1145 "target.vector_metric must be one of 'cosine', 'inner_product', or 'l2', got {:?}",
1146 self.vector_metric
1147 ));
1148 }
1149 Ok(())
1150 }
1151}
1152
1153#[derive(Debug, Clone, Deserialize)]
1154pub struct MariadbTargetConfig {
1155 #[serde(default = "default_dsn_env")]
1156 pub dsn_env: String,
1157 #[serde(rename = "database")]
1158 pub database_name: String,
1159 pub table: String,
1160 #[serde(default)]
1163 pub overwrite: bool,
1164 #[serde(default = "default_hnsw")]
1165 pub hnsw: bool,
1166 #[serde(default = "default_mode")]
1167 pub mode: String,
1168 #[serde(default)]
1169 pub source_tag: Option<String>,
1170 #[serde(default)]
1171 pub promote_metadata: Vec<PromoteColumn>,
1172 #[serde(default)]
1173 pub force_overwrite: bool,
1174 #[serde(default)]
1175 pub delete_orphans: bool,
1176 #[serde(default)]
1177 pub documents: Option<DocumentStoreConfig>,
1178}
1179
1180impl MariadbTargetConfig {
1181 pub(crate) fn validate(&self) -> Result<()> {
1182 if self.mode == "append" && self.source_tag.is_none() {
1183 return Err(anyhow!(
1184 "target.mode='append' requires target.source_tag to identify this cell"
1185 ));
1186 }
1187 validate_no_document_store(&self.documents)?;
1188 Ok(())
1189 }
1190}
1191
1192#[derive(Debug, Clone, Deserialize)]
1193pub struct ClickhouseTargetConfig {
1194 #[serde(default = "default_dsn_env")]
1195 pub dsn_env: String,
1196 #[serde(rename = "database")]
1197 pub database_name: String,
1198 pub table: String,
1199 #[serde(default = "default_hnsw")]
1200 pub hnsw: bool,
1201 #[serde(default = "default_mode")]
1202 pub mode: String,
1203 #[serde(default)]
1204 pub source_tag: Option<String>,
1205 #[serde(default)]
1206 pub promote_metadata: Vec<PromoteColumn>,
1207 #[serde(default)]
1208 pub force_overwrite: bool,
1209 #[serde(default)]
1213 pub delete_orphans: bool,
1214 #[serde(default)]
1220 pub engine: Option<String>,
1221 #[serde(default)]
1222 pub documents: Option<DocumentStoreConfig>,
1223}
1224
1225impl ClickhouseTargetConfig {
1226 fn validate(&self) -> Result<()> {
1227 if self.mode == "append" && self.source_tag.is_none() {
1228 return Err(anyhow!(
1229 "target.mode='append' requires target.source_tag to identify this cell"
1230 ));
1231 }
1232 if let Some(e) = &self.engine {
1233 let re = Regex::new(CLICKHOUSE_ENGINE_RE).unwrap();
1234 if !re.is_match(e) {
1235 return Err(anyhow!(
1236 "target.engine {e:?} not in allowlist. Accepted shapes: \
1237 'MergeTree', 'MergeTree()', 'ReplacingMergeTree(<col>)', \
1238 each optionally followed by ' ORDER BY <expr>'. Custom engines \
1239 are not supported in v0.4 — file an issue if you need one."
1240 ));
1241 }
1242 }
1243 validate_no_document_store(&self.documents)?;
1244 Ok(())
1245 }
1246}
1247
1248#[derive(Debug, Clone, Deserialize)]
1254pub struct SqliteTargetConfig {
1255 pub dsn_env: String,
1257 #[serde(rename = "database")]
1258 pub database_name: String,
1259 pub table: String,
1260 #[serde(default)]
1262 pub overwrite: bool,
1263 #[serde(default = "default_hnsw")]
1264 pub hnsw: bool,
1265 #[serde(default = "default_mode")]
1267 pub mode: String,
1268 #[serde(default)]
1269 pub source_tag: Option<String>,
1270 #[serde(default)]
1271 pub promote_metadata: Vec<PromoteColumn>,
1272 #[serde(default)]
1273 pub force_overwrite: bool,
1274 #[serde(default)]
1276 pub delete_orphans: bool,
1277 #[serde(default)]
1278 pub documents: Option<DocumentStoreConfig>,
1279}
1280
1281impl SqliteTargetConfig {
1282 pub(crate) fn validate(&self) -> Result<()> {
1283 if self.mode == "append" && self.source_tag.is_none() {
1284 return Err(anyhow!(
1285 "target.mode='append' requires target.source_tag to identify this cell"
1286 ));
1287 }
1288 validate_no_document_store(&self.documents)?;
1289 Ok(())
1290 }
1291}
1292
/// Serde default for `dsn_env` on the Postgres, MariaDB and ClickHouse targets.
fn default_dsn_env() -> String {
    String::from("CHUNKSHOP_DSN")
}

/// Serde default for the targets' `hnsw` flag.
fn default_hnsw() -> bool {
    true
}

/// Serde default for `PostgresTargetConfig::vector_metric`.
fn default_vector_metric() -> String {
    String::from("cosine")
}

/// Serde default for the targets' `mode`.
fn default_mode() -> String {
    String::from("overwrite")
}
1306
1307#[derive(Debug, Clone, Default, Deserialize)]
1308pub struct RuntimeConfig {
1309 #[serde(default)]
1310 pub omp_num_threads: Option<usize>,
1311 #[serde(default)]
1312 pub doc_limit: Option<usize>,
1313 #[serde(default)]
1314 pub log_path: Option<String>,
1315 #[serde(default)]
1316 pub heartbeat_every: Option<usize>,
1317 #[serde(default = "default_log_format")]
1320 pub log_format: String,
1321}
1322
1323fn default_log_format() -> String {
1324 "text".to_string()
1325}
1326
1327fn validate_ident(name: &str, field: &str) -> Result<()> {
1329 let re = Regex::new(r"^[a-z_][a-z0-9_]*$").unwrap();
1330 if !re.is_match(name) {
1331 return Err(anyhow!(
1332 "{field} must match ^[a-z_][a-z0-9_]*$, got {name:?}"
1333 ));
1334 }
1335 Ok(())
1336}
1337
1338fn reject_legacy_forms(yaml: &serde_yaml_ng::Value) -> Result<()> {
1345 let target = yaml.get("target").and_then(|v| v.as_mapping());
1346 let Some(target) = target else {
1347 return Ok(()); };
1349
1350 if let Some(t) = target.get("type").and_then(|v| v.as_str()) {
1351 if t == "pgvector" {
1352 return Err(anyhow!(
1353 "target.type 'pgvector' was renamed to 'postgres' in v0.4.0. Update your YAML."
1354 ));
1355 }
1356 }
1357 if target.get("schema").is_some() {
1358 return Err(anyhow!(
1359 "target.schema was renamed to target.database in v0.4.0. Update your YAML."
1360 ));
1361 }
1362 if let Some(o) = target.get("overwrite") {
1363 if matches!(o.as_bool(), Some(true)) {
1364 return Err(anyhow!(
1365 "target.overwrite: true was replaced by target.mode: 'overwrite' in v0.4.0. \
1366 Update your YAML."
1367 ));
1368 }
1369 }
1370 Ok(())
1371}
1372
1373pub fn load_config(path: &Path) -> Result<CellConfig> {
1374 let text = std::fs::read_to_string(path)
1375 .with_context(|| format!("reading config {}", path.display()))?;
1376
1377 let raw_value: serde_yaml_ng::Value = serde_yaml_ng::from_str(&text)
1380 .with_context(|| format!("parsing YAML at {}", path.display()))?;
1381 reject_legacy_forms(&raw_value)?;
1382
1383 let cfg: CellConfig = serde_yaml_ng::from_str(&text)
1384 .with_context(|| format!("parsing YAML {}", path.display()))?;
1385 match &cfg.target {
1386 TargetConfig::Postgres(t) => {
1387 validate_ident(&t.database_name, "target.database")?;
1388 validate_ident(&t.table, "target.table")?;
1389 if let Some(tag) = &t.source_tag {
1390 validate_ident(tag, "target.source_tag")?;
1391 }
1392 }
1393 TargetConfig::Mariadb(t) => {
1394 validate_ident(&t.database_name, "target.database")?;
1395 validate_ident(&t.table, "target.table")?;
1396 if let Some(tag) = &t.source_tag {
1397 validate_ident(tag, "target.source_tag")?;
1398 }
1399 }
1400 TargetConfig::Sqlite(t) => {
1401 validate_ident(&t.database_name, "target.database")?;
1402 validate_ident(&t.table, "target.table")?;
1403 if let Some(tag) = &t.source_tag {
1404 validate_ident(tag, "target.source_tag")?;
1405 }
1406 }
1407 TargetConfig::Clickhouse(t) => {
1408 validate_ident(&t.database_name, "target.database")?;
1409 validate_ident(&t.table, "target.table")?;
1410 if let Some(tag) = &t.source_tag {
1411 validate_ident(tag, "target.source_tag")?;
1412 }
1413 }
1414 }
1415 if let SourceConfig::PgTable(p) = &cfg.source {
1416 validate_ident(&p.schema_name, "source.schema")?;
1417 validate_ident(&p.table, "source.table")?;
1418 validate_ident(&p.id_column, "source.id_column")?;
1419 validate_ident(&p.content_column, "source.content_column")?;
1420 if let Some(tc) = &p.title_column {
1421 validate_ident(tc, "source.title_column")?;
1422 }
1423 }
1425 if let SourceConfig::MariadbTable(p) = &cfg.source {
1426 validate_ident(&p.database_name, "source.database")?;
1427 validate_ident(&p.table, "source.table")?;
1428 validate_ident(&p.id_column, "source.id_column")?;
1429 validate_ident(&p.content_column, "source.content_column")?;
1430 if let Some(tc) = &p.title_column {
1431 validate_ident(tc, "source.title_column")?;
1432 }
1433 }
1435 if let SourceConfig::SqliteTable(s) = &cfg.source {
1436 validate_ident(&s.database_name, "source.database")?;
1437 validate_ident(&s.table, "source.table")?;
1438 validate_ident(&s.id_column, "source.id_column")?;
1439 validate_ident(&s.content_column, "source.content_column")?;
1440 if let Some(tc) = &s.title_column {
1441 validate_ident(tc, "source.title_column")?;
1442 }
1443 }
1445 if let SourceConfig::ClickhouseTable(p) = &cfg.source {
1446 validate_ident(&p.database_name, "source.database")?;
1447 validate_ident(&p.table, "source.table")?;
1448 validate_ident(&p.id_column, "source.id_column")?;
1449 validate_ident(&p.content_column, "source.content_column")?;
1450 if let Some(tc) = &p.title_column {
1451 validate_ident(tc, "source.title_column")?;
1452 }
1453 for mc in &p.metadata_columns {
1454 validate_ident(mc, "source.metadata_columns")?;
1455 }
1456 }
1458 cfg.target.validate()?;
1459 validate_chunker_config(&cfg.chunker)?;
1460 match &cfg.embedder {
1461 EmbedderConfig::Fastembed(e) => e.validate()?,
1462 }
1463 Ok(cfg)
1464}
1465
1466fn validate_chunker_config(c: &ChunkerConfig) -> Result<()> {
1470 if c.if_oversize().is_some() && c.effective_max_chars().is_none() {
1473 return Err(anyhow!(
1474 "chunker {:?} has `if_oversize` set but no effective `max_chars` ceiling. \
1475 Either set `max_chars` on this chunker (or on its `base` for wrappers), \
1476 or remove `if_oversize`.",
1477 c.type_name()
1478 ));
1479 }
1480 if let Some(nested) = c.if_oversize() {
1482 validate_chunker_config(nested)?;
1483 }
1484 match c {
1485 ChunkerConfig::SentenceAware(_)
1486 | ChunkerConfig::Hierarchy(_)
1487 | ChunkerConfig::FixedOverlap(_)
1488 | ChunkerConfig::Semantic(_) => Ok(()),
1489 ChunkerConfig::NeighborExpand(c) => validate_chunker_config(&c.base),
1490 ChunkerConfig::SummaryEmbed(c) => validate_chunker_config(&c.base),
1491 ChunkerConfig::HierarchicalSummary(c) => {
1492 if matches!(c.grouping, GroupingConfig::SectionAware(_)) {
1495 let base_type_name = c.base.type_name();
1496 if base_type_name != "hierarchy" {
1497 return Err(anyhow!(
1498 "hierarchical_summary with strategy='section_aware' requires \
1499 base.type='hierarchy', got {base_type_name:?}"
1500 ));
1501 }
1502 }
1503 validate_chunker_config(&c.base)
1504 }
1505 ChunkerConfig::Consolidation(c) => validate_chunker_config(&c.base),
1506 #[cfg(feature = "code-aware")]
1507 ChunkerConfig::SymbolAware(_) => Ok(()),
1508 }
1509}
1510
1511#[cfg(test)]
1512mod tests {
1513 use super::*;
1514
1515 fn write_yaml(body: &str) -> std::path::PathBuf {
1516 let path = std::env::temp_dir().join(format!(
1517 "chunkshop-rs-cfg-{}.yaml",
1518 std::time::SystemTime::now()
1519 .duration_since(std::time::UNIX_EPOCH)
1520 .unwrap()
1521 .as_nanos()
1522 ));
1523 std::fs::write(&path, body).unwrap();
1524 path
1525 }
1526
1527 #[test]
1528 fn rejects_append_without_source_tag() {
1529 let yaml = r#"
1530cell_name: t
1531source: { type: files, glob: "x", id_from: stem }
1532chunker: { type: sentence_aware }
1533embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1534target: { type: postgres, dsn_env: D, database: s, table: t, mode: append, hnsw: false }
1535"#;
1536 let path = write_yaml(yaml);
1537 let err = format!("{:#}", load_config(&path).unwrap_err());
1538 assert!(
1539 err.contains("source_tag"),
1540 "expected source_tag mention, got: {err}"
1541 );
1542 }
1543
1544 #[test]
1545 fn rejects_enabled_document_store_until_rust_parity_exists() {
1546 let yaml = r#"
1547cell_name: t
1548source: { type: files, glob: "x", id_from: stem }
1549chunker: { type: sentence_aware }
1550embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1551target:
1552 type: postgres
1553 dsn_env: D
1554 database: s
1555 table: chunks
1556 mode: overwrite
1557 hnsw: false
1558 documents:
1559 enabled: true
1560 table: documents
1561"#;
1562 let path = write_yaml(yaml);
1563 let err = format!("{:#}", load_config(&path).unwrap_err());
1564 assert!(
1565 err.contains("Python/Postgres-only") && err.contains("target.documents"),
1566 "expected document-store parity complaint, got: {err}"
1567 );
1568 }
1569
    /// A `promote_metadata` entry whose `type` is not an allowed value must
    /// fail to load, and the error text must contain `type`.
    #[test]
    fn rejects_invalid_promote_type() {
        let yaml = r#"
cell_name: t
source: { type: files, glob: "x", id_from: stem }
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target:
 type: postgres
 dsn_env: D
 database: s
 table: t
 mode: overwrite
 hnsw: false
 promote_metadata:
 - { path: entities.ORG, type: bogus_type }
"#;
        let path = write_yaml(yaml);
        // `{:#}` renders the whole anyhow context chain on one line.
        let err = format!("{:#}", load_config(&path).unwrap_err());
        assert!(
            err.contains("type"),
            "expected promote_metadata type complaint, got: {err}"
        );
    }
1594
    /// A `promote_metadata` path whose first segment starts with a digit must
    /// fail to load, and the error text must contain `path`.
    #[test]
    fn rejects_invalid_promote_path() {
        let yaml = r#"
cell_name: t
source: { type: files, glob: "x", id_from: stem }
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target:
 type: postgres
 dsn_env: D
 database: s
 table: t
 mode: overwrite
 hnsw: false
 promote_metadata:
 - { path: "0entities.ORG", type: text }
"#;
        let path = write_yaml(yaml);
        // `{:#}` renders the whole anyhow context chain on one line.
        let err = format!("{:#}", load_config(&path).unwrap_err());
        assert!(
            err.contains("path"),
            "expected promote_metadata path complaint, got: {err}"
        );
    }
1619
1620 #[test]
1621 fn promote_column_name_lowercases_and_double_underscores() {
1622 let pc: PromoteColumn =
1623 serde_yaml_ng::from_str("{ path: entities.ORG, type: \"text[]\" }").unwrap();
1624 assert_eq!(pc.column_name(), "entities__org");
1625 }
1626
    /// Two valid `promote_metadata` entries load into a two-element list on the
    /// Postgres target, keeping their order, path and type.
    #[test]
    fn parses_promote_metadata_into_typed_vec() {
        let yaml = r#"
cell_name: t
source: { type: files, glob: "x", id_from: stem }
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target:
 type: postgres
 dsn_env: D
 database: s
 table: t
 mode: overwrite
 hnsw: false
 promote_metadata:
 - { path: heading, type: text }
 - { path: entities.ORG, type: "text[]" }
"#;
        let path = write_yaml(yaml);
        let cfg = load_config(&path).expect("load");
        let TargetConfig::Postgres(t) = &cfg.target else {
            panic!("expected Postgres target");
        };
        assert_eq!(t.promote_metadata.len(), 2);
        assert_eq!(t.promote_metadata[0].path, "heading");
        assert_eq!(t.promote_metadata[0].type_, "text");
        // The second entry is checked through its derived column name.
        assert_eq!(t.promote_metadata[1].column_name(), "entities__org");
    }
1655
1656 #[test]
1657 fn postgres_target_vector_metric_defaults_and_validates() {
1658 let yaml = r#"
1659cell_name: t
1660source: { type: files, glob: "x", id_from: stem }
1661chunker: { type: sentence_aware }
1662embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1663target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
1664"#;
1665 let path = write_yaml(yaml);
1666 let cfg = load_config(&path).expect("load");
1667 let TargetConfig::Postgres(t) = &cfg.target else {
1668 panic!("expected Postgres target");
1669 };
1670 assert_eq!(t.vector_metric, "cosine");
1671
1672 let yaml = r#"
1673cell_name: t
1674source: { type: files, glob: "x", id_from: stem }
1675chunker: { type: sentence_aware }
1676embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1677target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false, vector_metric: l2 }
1678"#;
1679 let path = write_yaml(yaml);
1680 let cfg = load_config(&path).expect("load");
1681 let TargetConfig::Postgres(t) = &cfg.target else {
1682 panic!("expected Postgres target");
1683 };
1684 assert_eq!(t.vector_metric, "l2");
1685
1686 let yaml = r#"
1687cell_name: t
1688source: { type: files, glob: "x", id_from: stem }
1689chunker: { type: sentence_aware }
1690embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1691target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false, vector_metric: manhattan }
1692"#;
1693 let path = write_yaml(yaml);
1694 let err = format!("{:#}", load_config(&path).unwrap_err());
1695 assert!(
1696 err.contains("vector_metric"),
1697 "expected vector_metric error, got: {err}"
1698 );
1699 }
1700
    /// `hierarchical_summary` with `section_aware` grouping over a
    /// `sentence_aware` base must fail to load; the error must mention both
    /// `section_aware` and `hierarchy`.
    #[test]
    fn rejects_section_aware_without_hierarchy_base() {
        let yaml = r#"
cell_name: t
source: { type: files, glob: "x", id_from: stem }
chunker:
 type: hierarchical_summary
 base: { type: sentence_aware }
 summarizer: { mode: passthrough }
 grouping: { strategy: section_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
"#;
        let path = write_yaml(yaml);
        // `{:#}` renders the whole anyhow context chain on one line.
        let err = format!("{:#}", load_config(&path).unwrap_err());
        assert!(
            err.contains("section_aware") && err.contains("hierarchy"),
            "expected section_aware/hierarchy mention, got: {err}"
        );
    }
1721
1722 #[test]
1723 fn parses_if_oversize_on_every_chunker_variant() {
1724 for kind in [
1727 "sentence_aware",
1728 "hierarchy",
1729 "fixed_overlap",
1730 "neighbor_expand",
1731 "semantic",
1732 "summary_embed",
1733 "hierarchical_summary",
1734 ] {
1735 let chunker_yaml = match kind {
1736 "sentence_aware" => "{ type: sentence_aware }".to_string(),
1737 "hierarchy" => "{ type: hierarchy }".to_string(),
1738 "fixed_overlap" => "{ type: fixed_overlap, max_chars: 1500 }".to_string(),
1739 "neighbor_expand" => {
1740 "{ type: neighbor_expand, base: { type: hierarchy } }".to_string()
1741 }
1742 "semantic" => "{ type: semantic }".to_string(),
1743 "summary_embed" => "{ type: summary_embed, base: { type: hierarchy }, summarizer: { mode: passthrough } }".to_string(),
1744 "hierarchical_summary" => "{ type: hierarchical_summary, base: { type: hierarchy }, summarizer: { mode: passthrough } }".to_string(),
1745 _ => unreachable!(),
1746 };
1747 let yaml = format!(
1749 r#"
1750cell_name: t
1751source: {{ type: files, glob: "x", id_from: stem }}
1752chunker:
1753 type: {kind}
1754 {extra}
1755 if_oversize:
1756 type: fixed_overlap
1757 window_words: 100
1758 step_words: 100
1759 max_chars: 500
1760embedder: {{ type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }}
1761target: {{ type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }}
1762"#,
1763 kind = kind,
1764 extra = match kind {
1765 "fixed_overlap" => "max_chars: 1500".to_string(),
1766 "neighbor_expand" => "base: { type: hierarchy }".to_string(),
1767 "summary_embed" =>
1768 "base: { type: hierarchy }\n summarizer: { mode: passthrough }".to_string(),
1769 "hierarchical_summary" =>
1770 "base: { type: hierarchy }\n summarizer: { mode: passthrough }".to_string(),
1771 _ => "".to_string(),
1772 }
1773 );
1774 let _ = chunker_yaml; let path = write_yaml(&yaml);
1776 let cfg = load_config(&path).unwrap_or_else(|e| {
1777 panic!("if_oversize on {kind} failed to parse: {e:#}");
1778 });
1779 assert!(
1780 cfg.chunker.if_oversize().is_some(),
1781 "if_oversize missing for {kind}"
1782 );
1783 }
1784 }
1785
1786 #[test]
1787 fn rejects_if_oversize_without_effective_ceiling() {
1788 let yaml = r#"
1792cell_name: t
1793source: { type: files, glob: "x", id_from: stem }
1794chunker:
1795 type: fixed_overlap
1796 window_words: 200
1797 step_words: 100
1798 if_oversize:
1799 type: fixed_overlap
1800 window_words: 100
1801 step_words: 50
1802 max_chars: 500
1803embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1804target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
1805"#;
1806 let path = write_yaml(yaml);
1807 let err = format!("{:#}", load_config(&path).unwrap_err());
1808 assert!(
1809 err.contains("if_oversize") && err.contains("max_chars"),
1810 "expected if_oversize/max_chars complaint, got: {err}"
1811 );
1812 }
1813
1814 #[test]
1815 fn effective_max_chars_falls_through_to_base() {
1816 let yaml = r#"
1818cell_name: t
1819source: { type: files, glob: "x", id_from: stem }
1820chunker:
1821 type: neighbor_expand
1822 window: 2
1823 base:
1824 type: hierarchy
1825 max_chars: 1234
1826embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1827target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
1828"#;
1829 let path = write_yaml(yaml);
1830 let cfg = load_config(&path).expect("load");
1831 assert_eq!(cfg.chunker.effective_max_chars(), Some(1234));
1832 }
1833
1834 #[test]
1835 fn fixed_overlap_max_chars_is_optional_unset() {
1836 let yaml = r#"
1839cell_name: t
1840source: { type: files, glob: "x", id_from: stem }
1841chunker: { type: fixed_overlap, window_words: 200, step_words: 100 }
1842embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1843target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
1844"#;
1845 let path = write_yaml(yaml);
1846 let cfg = load_config(&path).expect("load");
1847 assert!(cfg.chunker.effective_max_chars().is_none());
1848 }
1849
    /// `hierarchical_summary` with `section_aware` grouping over a `hierarchy`
    /// base must load without error.
    #[test]
    fn accepts_section_aware_with_hierarchy_base() {
        let yaml = r#"
cell_name: t
source: { type: files, glob: "x", id_from: stem }
chunker:
 type: hierarchical_summary
 base: { type: hierarchy }
 summarizer: { mode: passthrough }
 grouping: { strategy: section_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
"#;
        let path = write_yaml(yaml);
        // Only success is asserted; the loaded config is not inspected.
        load_config(&path).expect("should accept section_aware over hierarchy base");
    }
1866
1867 #[test]
1868 fn parses_sqlite_target_config() {
1869 let yaml = r#"
1870cell_name: t
1871source: { type: files, glob: "x", id_from: stem }
1872chunker: { type: sentence_aware }
1873embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1874target: { type: sqlite, dsn_env: SQLITE_PATH, database: ignored, table: chunks, mode: overwrite, hnsw: false }
1875"#;
1876 let path = write_yaml(yaml);
1877 let cfg = load_config(&path).expect("load");
1878 match &cfg.target {
1879 TargetConfig::Sqlite(t) => {
1880 assert_eq!(t.dsn_env, "SQLITE_PATH");
1881 assert_eq!(t.database_name, "ignored");
1882 assert_eq!(t.table, "chunks");
1883 assert_eq!(t.mode, "overwrite");
1884 }
1885 _ => panic!("expected Sqlite target"),
1886 }
1887 }
1888
1889 #[test]
1890 fn rejects_sqlite_append_without_source_tag() {
1891 let yaml = r#"
1892cell_name: t
1893source: { type: files, glob: "x", id_from: stem }
1894chunker: { type: sentence_aware }
1895embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1896target: { type: sqlite, dsn_env: SQLITE_PATH, database: ignored, table: chunks, mode: append, hnsw: false }
1897"#;
1898 let path = write_yaml(yaml);
1899 let err = format!("{:#}", load_config(&path).unwrap_err());
1900 assert!(
1901 err.contains("source_tag"),
1902 "expected source_tag mention, got: {err}"
1903 );
1904 }
1905
    /// A `sqlite_table` source loads into the `SqliteTable` variant with
    /// `dsn_env`, `table` and `id_column` preserved as written.
    #[test]
    fn parses_sqlite_table_source_config() {
        let yaml = r#"
cell_name: t
source:
 type: sqlite_table
 dsn_env: SQLITE_PATH
 database: ignored
 table: docs
 id_column: id
 content_column: body
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target: { type: sqlite, dsn_env: SQLITE_PATH, database: ignored, table: chunks, mode: overwrite, hnsw: false }
"#;
        let path = write_yaml(yaml);
        let cfg = load_config(&path).expect("load");
        match &cfg.source {
            SourceConfig::SqliteTable(s) => {
                assert_eq!(s.dsn_env, "SQLITE_PATH");
                assert_eq!(s.table, "docs");
                assert_eq!(s.id_column, "id");
            }
            _ => panic!("expected SqliteTable source"),
        }
    }
1932
    /// A `clickhouse` target without an `engine` key loads into the
    /// `Clickhouse` variant with `engine` unset.
    #[test]
    fn parses_clickhouse_target() {
        let yaml = r#"
cell_name: t
source: { type: files, glob: "x", id_from: stem }
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target:
 type: clickhouse
 dsn_env: CHUNKSHOP_DSN_CH
 database: my_db
 table: chunks
 mode: overwrite
 hnsw: true
"#;
        let path = write_yaml(yaml);
        let cfg = load_config(&path).expect("load");
        let TargetConfig::Clickhouse(t) = &cfg.target else {
            panic!("expected Clickhouse variant");
        };
        assert_eq!(t.database_name, "my_db");
        assert_eq!(t.table, "chunks");
        assert!(t.engine.is_none());
    }
1957
    /// A `ReplacingMergeTree(...) ORDER BY ...` engine string is accepted and
    /// stored unchanged on the ClickHouse target.
    #[test]
    fn accepts_replacing_merge_tree_engine() {
        let yaml = r#"
cell_name: t
source: { type: files, glob: "x", id_from: stem }
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target:
 type: clickhouse
 dsn_env: D
 database: db
 table: t
 mode: overwrite
 hnsw: false
 engine: "ReplacingMergeTree(created_at) ORDER BY (id)"
"#;
        let path = write_yaml(yaml);
        let cfg = load_config(&path).expect("ReplacingMergeTree should be accepted");
        let TargetConfig::Clickhouse(t) = &cfg.target else {
            unreachable!()
        };
        assert_eq!(
            t.engine.as_deref(),
            Some("ReplacingMergeTree(created_at) ORDER BY (id)")
        );
    }
1984
    /// The engine string `Memory` must be rejected; the error must contain
    /// both `allowlist` and the offending value `Memory`.
    #[test]
    fn rejects_arbitrary_engine_string() {
        let yaml = r#"
cell_name: t
source: { type: files, glob: "x", id_from: stem }
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target:
 type: clickhouse
 dsn_env: D
 database: db
 table: t
 mode: overwrite
 hnsw: false
 engine: "Memory"
"#;
        let path = write_yaml(yaml);
        // `{:#}` renders the whole anyhow context chain on one line.
        let err = format!("{:#}", load_config(&path).unwrap_err());
        assert!(
            err.contains("allowlist") && err.contains("Memory"),
            "got: {err}"
        );
    }
2008
    /// An engine string that appends `; DROP TABLE other` after an otherwise
    /// valid `MergeTree()` must make loading fail.
    #[test]
    fn rejects_engine_with_drop_table_injection() {
        let yaml = r#"
cell_name: t
source: { type: files, glob: "x", id_from: stem }
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target:
 type: clickhouse
 dsn_env: D
 database: db
 table: t
 mode: overwrite
 hnsw: false
 engine: "MergeTree(); DROP TABLE other"
"#;
        let path = write_yaml(yaml);
        // Only failure is asserted; the error text is not inspected.
        assert!(
            load_config(&path).is_err(),
            "engine with embedded DROP must be rejected"
        );
    }
2031
    /// A `session_staging` source with an inline `dsn` loads into the
    /// `SessionStaging` variant; `min_age_seconds`, which the fixture omits,
    /// comes back as `0`.
    #[test]
    fn session_staging_source_deserialises() {
        let yaml = r#"
cell_name: m
source:
 type: session_staging
 dsn: "postgresql://localhost/x"
 staging_table: chunkshop_staging
 staging_schema: public
 mode: realtime
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
"#;
        let path = write_yaml(yaml);
        let cfg = load_config(&path).expect("load_config should succeed");
        match cfg.source {
            SourceConfig::SessionStaging(s) => {
                assert_eq!(s.staging_table, "chunkshop_staging");
                assert_eq!(s.staging_schema, "public");
                assert_eq!(s.mode, SessionStagingMode::Realtime);
                assert_eq!(s.min_age_seconds, 0);
                assert!(s.dsn.is_some());
            }
            other => panic!("expected SessionStaging variant; got {other:?}"),
        }
    }
2061
    /// With only `dsn_env` and `mode` given, a `session_staging` source
    /// defaults `staging_table` to `chunkshop_staging` and `staging_schema` to
    /// `public`.
    ///
    /// NOTE(review): the test name says these defaults mirror a Python
    /// implementation; that counterpart is not visible here — confirm.
    #[test]
    fn session_staging_defaults_match_python() {
        let yaml = r#"
cell_name: m
source:
 type: session_staging
 dsn_env: D
 mode: consolidate
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
"#;
        let cfg = load_config(&write_yaml(yaml)).unwrap();
        if let SourceConfig::SessionStaging(s) = cfg.source {
            assert_eq!(s.staging_table, "chunkshop_staging");
            assert_eq!(s.staging_schema, "public");
            assert_eq!(s.mode, SessionStagingMode::Consolidate);
        } else {
            panic!("not session_staging");
        }
    }
2084
2085 #[test]
2086 fn memory_block_on_postgres_target() {
2087 let yaml = r#"
2088cell_name: m
2089source: { type: files, glob: "x", id_from: stem }
2090chunker: { type: sentence_aware }
2091embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2092target:
2093 type: postgres
2094 dsn_env: D
2095 database: agent_memory
2096 table: memory
2097 mode: create_if_missing
2098 source_tag: ns1
2099 hnsw: false
2100 memory:
2101 tier: consolidated
2102 supersede: true
2103 namespace: ns1
2104"#;
2105 let cfg = load_config(&write_yaml(yaml)).unwrap();
2106 let mem = match cfg.target {
2107 TargetConfig::Postgres(p) => p.memory.expect("memory expected"),
2108 _ => panic!("expected postgres target"),
2109 };
2110 assert_eq!(mem.tier, MemoryTier::Consolidated);
2111 assert!(mem.supersede);
2112 assert_eq!(mem.namespace.as_deref(), Some("ns1"));
2113 }
2114
    /// An unrecognised key inside `target.memory` must make loading fail; the
    /// error must contain `bogus_field` or `unknown`.
    #[test]
    fn memory_block_unknown_field_rejected() {
        let yaml = r#"
cell_name: m
source: { type: files, glob: "x", id_from: stem }
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target:
 type: postgres
 dsn_env: D
 database: agent_memory
 table: memory
 mode: create_if_missing
 source_tag: ns1
 hnsw: false
 memory: { tier: consolidated, supersede: true, namespace: ns1, bogus_field: yes }
"#;
        let err = load_config(&write_yaml(yaml)).expect_err("bogus_field must fail");
        // `{:#}` renders the whole anyhow context chain on one line.
        let msg = format!("{:#}", err);
        assert!(
            msg.contains("bogus_field") || msg.contains("unknown"),
            "expected unknown-field complaint, got: {msg}"
        );
    }
2139
2140 #[test]
2143 fn session_episode_framer_deserialises_with_defaults() {
2144 let yaml = r#"
2145cell_name: m
2146source: { type: files, glob: "x", id_from: stem }
2147framer: { type: session_episode }
2148chunker: { type: sentence_aware }
2149embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2150target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2151"#;
2152 let cfg = load_config(&write_yaml(yaml)).unwrap();
2153 match cfg.framer {
2154 FramerConfig::SessionEpisode(f) => {
2155 assert_eq!(f.max_gap_seconds, 1800); assert_eq!(f.max_turns, 40);
2157 assert_eq!(f.max_words, 1200);
2158 assert!(f.boundary_on_tool);
2159 }
2160 other => panic!("expected SessionEpisode framer; got {other:?}"),
2161 }
2162 }
2163
    /// Explicit `session_episode` framer settings in the YAML replace the
    /// defaults for all four fields.
    #[test]
    fn session_episode_framer_overrides_apply() {
        let yaml = r#"
cell_name: m
source: { type: files, glob: "x", id_from: stem }
framer:
 type: session_episode
 max_gap_seconds: 600
 max_turns: 20
 max_words: 500
 boundary_on_tool: false
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
"#;
        let cfg = load_config(&write_yaml(yaml)).unwrap();
        if let FramerConfig::SessionEpisode(f) = cfg.framer {
            assert_eq!(f.max_gap_seconds, 600);
            assert_eq!(f.max_turns, 20);
            assert_eq!(f.max_words, 500);
            assert!(!f.boundary_on_tool);
        } else {
            panic!("not session_episode");
        }
    }
2189
2190 #[test]
2193 fn consolidation_chunker_deserialises() {
2194 let yaml = r#"
2195cell_name: m
2196source: { type: files, glob: "x", id_from: stem }
2197framer: { type: session_episode }
2198chunker:
2199 type: consolidation
2200 base:
2201 type: sentence_aware
2202 max_chars: 2000
2203 min_chars: 200
2204 consolidator:
2205 mode: extractive
2206 fact_max_chars: 1200
2207embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2208target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2209"#;
2210 let cfg = load_config(&write_yaml(yaml)).unwrap();
2211 match cfg.chunker {
2212 ChunkerConfig::Consolidation(c) => {
2213 assert_eq!(c.fact_max_chars, 1200);
2214 assert!(matches!(*c.base, ChunkerConfig::SentenceAware(_)));
2215 assert!(matches!(c.consolidator, ConsolidatorConfig::Extractive(_)));
2216 }
2217 other => panic!("expected Consolidation; got {other:?}"),
2218 }
2219 }
2220
    /// A `consolidation` chunker that omits `fact_max_chars` loads with the
    /// value `1200` and keeps its `sentence_aware` base.
    #[test]
    fn consolidation_chunker_default_fact_max_chars() {
        let yaml = r#"
cell_name: m
source: { type: files, glob: "x", id_from: stem }
chunker:
 type: consolidation
 base: { type: sentence_aware }
 consolidator: { mode: extractive }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
"#;
        let cfg = load_config(&write_yaml(yaml)).unwrap();
        if let ChunkerConfig::Consolidation(c) = cfg.chunker {
            assert_eq!(c.fact_max_chars, 1200);
            assert!(matches!(*c.base, ChunkerConfig::SentenceAware(_)));
        } else {
            panic!("not consolidation");
        }
    }
2241
    /// An unrecognised key on a `consolidation` chunker must make loading
    /// fail; the error must contain `bogus` or `unknown`.
    #[test]
    fn consolidation_chunker_unknown_field_rejected() {
        let yaml = r#"
cell_name: m
source: { type: files, glob: "x", id_from: stem }
chunker:
 type: consolidation
 base: { type: sentence_aware }
 consolidator: { mode: extractive }
 bogus_consolidation_field: yes
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
"#;
        let err = load_config(&write_yaml(yaml)).expect_err("bogus must fail");
        // `{:#}` renders the whole anyhow context chain on one line.
        let msg = format!("{:#}", err);
        assert!(
            msg.contains("bogus") || msg.contains("unknown"),
            "expected unknown-field complaint, got: {msg}"
        );
    }
2262
    /// A consolidator `mode` of `callable` must make loading fail; the error
    /// must contain `callable`, `variant` or `unknown`.
    #[test]
    fn consolidator_unknown_mode_rejected() {
        let yaml = r#"
cell_name: m
source: { type: files, glob: "x", id_from: stem }
chunker:
 type: consolidation
 base: { type: sentence_aware }
 consolidator: { mode: callable }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
"#;
        let err = load_config(&write_yaml(yaml)).expect_err("callable must fail in rust");
        // `{:#}` renders the whole anyhow context chain on one line.
        let msg = format!("{:#}", err);
        assert!(
            msg.contains("callable") || msg.contains("variant") || msg.contains("unknown"),
            "expected mode-unknown complaint, got: {msg}"
        );
    }
2284
2285 #[test]
2286 fn session_episode_framer_unknown_field_rejected() {
2287 let yaml = r#"
2288cell_name: m
2289source: { type: files, glob: "x", id_from: stem }
2290framer: { type: session_episode, bogus: 1 }
2291chunker: { type: sentence_aware }
2292embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2293target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2294"#;
2295 let err = load_config(&write_yaml(yaml)).expect_err("bogus must fail");
2296 let msg = format!("{:#}", err);
2297 assert!(
2298 msg.contains("bogus") || msg.contains("unknown"),
2299 "expected unknown-field complaint, got: {msg}"
2300 );
2301 }
2302
    /// An unrecognised key on a `session_staging` source must make loading
    /// fail; the error must contain `bogus` or `unknown`.
    #[test]
    fn session_staging_unknown_field_rejected() {
        let yaml = r#"
cell_name: m
source:
 type: session_staging
 dsn_env: D
 mode: realtime
 bogus: 1
chunker: { type: sentence_aware }
embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
"#;
        let err = load_config(&write_yaml(yaml)).expect_err("bogus must fail");
        // `{:#}` renders the whole anyhow context chain on one line.
        let msg = format!("{:#}", err);
        assert!(
            msg.contains("bogus") || msg.contains("unknown"),
            "expected unknown-field complaint, got: {msg}"
        );
    }
2323}