chunkshop/
config.rs

1//! YAML config parsing.
2//!
3//! Accepts the same YAML shape as the Python reference implementation, but
4//! only the minimal subset: files source, sentence_aware chunker, fastembed
5//! embedder, pgvector target. Unknown fields are tolerated at the top level
6//! (e.g. `extractor:`, `framer:`, `runtime:` extras) so Python-authored YAMLs
//! parse without edits — per-section structs list their fields explicitly and
//! rely on serde's default of ignoring unknown keys (structs marked
//! `deny_unknown_fields` are the exception and reject them).
9
10use std::path::Path;
11
12use anyhow::{anyhow, Context, Result};
13use regex::Regex;
14use serde::Deserialize;
15
/// Postgres column types a promoted metadata key may be declared as.
///
/// This is an allowlist: `PromoteColumn::validate_type` rejects every string
/// that is not an exact (case-sensitive) member of this slice.
const ALLOWED_PROMOTE_TYPES: &[&str] = &[
    "text",
    "text[]",
    "int",
    "bigint",
    "boolean",
    "jsonb",
    "timestamptz",
    "date",
];

/// Allowlist regex for `ClickhouseTargetConfig::engine`. Hardening relative to
/// Python (which interpolates the engine string raw — see
/// python/src/chunkshop/config.py:542). Accepts:
///   - `MergeTree` / `MergeTree()`
///   - `ReplacingMergeTree(<single_ident>)` (the `created_at` dedup column)
///   - Any of the above optionally followed by ` ORDER BY <expr>`
///
/// Rejects engines outside this whitelist (Replicated*, Distributed, Memory,
/// engines with embedded SQL, etc.) — those need explicit user request and a
/// separate brief.
///
/// NOTE(review): the ` ORDER BY .+` tail accepts any non-newline text, so
/// characters such as `;` or `--` after `ORDER BY` still pass this pattern.
/// Confirm the caller treats the ORDER BY expression as trusted operator
/// input, or tighten the tail. Also note `\w` is Unicode-aware in the `regex`
/// crate, so the dedup identifier is not restricted to ASCII.
const CLICKHOUSE_ENGINE_RE: &str = r"^(MergeTree(\(\))?|ReplacingMergeTree\(\w+\))( ORDER BY .+)?$";

/// One promoted jsonb path → typed Postgres column. Mirrors Python's
/// `chunkshop.config.PromoteColumn`. The `path` is dot-separated; each segment
/// must match `^[A-Za-z_][A-Za-z0-9_]*$`. The `type_` must be in
/// `ALLOWED_PROMOTE_TYPES` — this is SQL-injection-prevention by allowlist:
/// `_ensure_promote_columns` interpolates the type as a literal into DDL.
///
/// Both fields are public, so the validation below is only guaranteed for
/// values that went through the `Deserialize` impl.
#[derive(Debug, Clone)]
pub struct PromoteColumn {
    /// Dot-separated path into the metadata document (e.g. `author.id`).
    pub path: String,
    /// Column type; one of `ALLOWED_PROMOTE_TYPES`.
    pub type_: String,
}

impl PromoteColumn {
    /// Postgres column identifier — dots → double-underscore, lowercased.
    /// Mirrors Python's `PromoteColumn.column_name`.
    pub fn column_name(&self) -> String {
        self.path.replace('.', "__").to_lowercase()
    }

    /// Returns `true` when `seg` matches `^[A-Za-z_][A-Za-z0-9_]*$`.
    ///
    /// Hand-rolled instead of using a `Regex` so that validating a path does
    /// not compile a pattern on every call. The check is ASCII-only, exactly
    /// like the character classes in the pattern, and an empty segment fails
    /// because there is no first character.
    fn is_valid_segment(seg: &str) -> bool {
        let mut chars = seg.chars();
        matches!(chars.next(), Some(c) if c.is_ascii_alphabetic() || c == '_')
            && chars.all(|c| c.is_ascii_alphanumeric() || c == '_')
    }

    /// Checks that `path` is non-empty and that every `.`-separated segment
    /// is an identifier (`^[A-Za-z_][A-Za-z0-9_]*$`).
    ///
    /// # Errors
    /// Returns a human-readable message when the path is empty or any segment
    /// (including an empty one produced by `a..b`, `.a` or `a.`) is invalid.
    fn validate_path(path: &str) -> std::result::Result<(), String> {
        if path.is_empty() {
            return Err("path must not be empty".into());
        }
        if path.split('.').all(Self::is_valid_segment) {
            Ok(())
        } else {
            Err(format!(
                "path segments must match ^[A-Za-z_][A-Za-z0-9_]*$ separated by '.', got {path:?}"
            ))
        }
    }

    /// Checks that `t` is an exact member of `ALLOWED_PROMOTE_TYPES`.
    ///
    /// # Errors
    /// Returns a message listing the allowed types when `t` is not one of them.
    fn validate_type(t: &str) -> std::result::Result<(), String> {
        if ALLOWED_PROMOTE_TYPES.contains(&t) {
            Ok(())
        } else {
            Err(format!(
                "promote_metadata type must be one of {ALLOWED_PROMOTE_TYPES:?}, got {t:?}"
            ))
        }
    }
}
81
82impl<'de> serde::Deserialize<'de> for PromoteColumn {
83    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
84        #[derive(serde::Deserialize)]
85        struct Raw {
86            path: String,
87            #[serde(rename = "type")]
88            type_: String,
89        }
90        let r = Raw::deserialize(d)?;
91        Self::validate_path(&r.path).map_err(serde::de::Error::custom)?;
92        Self::validate_type(&r.type_).map_err(serde::de::Error::custom)?;
93        Ok(Self {
94            path: r.path,
95            type_: r.type_,
96        })
97    }
98}
99
/// One YAML = one cell. Matches `python/src/chunkshop/config.py::CellConfig`.
///
/// `cell_name`, `source`, `chunker`, `embedder` and `target` are required
/// keys. `runtime`, `framer` and `extractor` fall back to their `Default`
/// impls when the YAML omits them. The struct does not set
/// `deny_unknown_fields`, so extra top-level keys are ignored by serde.
#[derive(Debug, Clone, Deserialize)]
pub struct CellConfig {
    /// Name of this cell (required).
    pub cell_name: String,
    /// Where documents come from (required; tagged on `type`).
    pub source: SourceConfig,
    /// How documents are split into chunks (required; tagged on `type`).
    pub chunker: ChunkerConfig,
    /// Embedder settings (required).
    pub embedder: EmbedderConfig,
    /// Where chunks are written (required).
    pub target: TargetConfig,
    /// Runtime settings; `RuntimeConfig::default()` when omitted.
    #[serde(default)]
    pub runtime: RuntimeConfig,
    /// Framer stage; defaults to the identity framer when omitted.
    #[serde(default)]
    pub framer: FramerConfig,
    /// Extractor stage; defaults to the no-op `none` extractor when omitted.
    #[serde(default)]
    pub extractor: ExtractorConfig,
}
115
/// Discriminated union over extractor types. Mirrors Python's `ExtractorConfig`.
/// Tagged on `type`. Default = `None` (the no-op extractor — equivalent to
/// "no extractor stage" in pre-extractor YAMLs).
///
/// Variant names are mapped to snake_case tags by `rename_all`, so the YAML
/// spelling of each variant is the one given on the variant below.
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ExtractorConfig {
    /// `type: none` — no-op extractor.
    None(NoneExtractorConfig),
    /// `type: composite` — a list of nested extractor configs.
    Composite(CompositeExtractorConfig),
    /// `type: rake_keywords`.
    RakeKeywords(RakeKeywordsExtractorConfig),
    /// `type: lang_detect`.
    LangDetect(LangDetectExtractorConfig),
    /// `type: keybert_phrases`.
    KeybertPhrases(KeybertPhrasesExtractorConfig),
    /// `type: spacy_entities`.
    SpacyEntities(SpacyEntitiesExtractorConfig),
}
129
130impl Default for ExtractorConfig {
131    fn default() -> Self {
132        ExtractorConfig::None(NoneExtractorConfig::default())
133    }
134}
135
/// Payload of `ExtractorConfig::None`: the no-op extractor has no options.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct NoneExtractorConfig {}
138
/// Payload of `ExtractorConfig::Composite`: a list of nested extractors.
#[derive(Debug, Clone, Deserialize)]
pub struct CompositeExtractorConfig {
    /// Child extractor configs; an omitted key yields an empty list.
    #[serde(default)]
    pub extractors: Vec<ExtractorConfig>,
}
144
/// Payload of `ExtractorConfig::RakeKeywords`.
#[derive(Debug, Clone, Deserialize)]
pub struct RakeKeywordsExtractorConfig {
    /// Defaults to 10 (`default_rake_top_k`). Presumably the number of
    /// keywords kept per chunk — confirm against the extractor.
    #[serde(default = "default_rake_top_k")]
    pub top_k: usize,
    /// Defaults to 3 (`default_rake_min_chars`). Presumably a minimum keyword
    /// length — confirm against the extractor.
    #[serde(default = "default_rake_min_chars")]
    pub min_chars: usize,
}
152
/// Payload of `ExtractorConfig::LangDetect`.
#[derive(Debug, Clone, Deserialize)]
pub struct LangDetectExtractorConfig {
    /// Detection backend name; defaults to `"langdetect"`. The set of
    /// accepted values is not constrained by this type.
    #[serde(default = "default_lang_backend")]
    pub backend: String,
}
158
/// Payload of `ExtractorConfig::KeybertPhrases`.
#[derive(Debug, Clone, Deserialize)]
pub struct KeybertPhrasesExtractorConfig {
    /// Defaults to 10 (`default_keybert_top_k`).
    #[serde(default = "default_keybert_top_k")]
    pub top_k: usize,
    /// Model identifier; defaults to `"all-MiniLM-L6-v2"`.
    #[serde(default = "default_keybert_model")]
    pub model_name: String,
    /// Two-element sequence in YAML; defaults to `(1, 2)`. Presumably the
    /// (min, max) n-gram size — confirm against the extractor.
    #[serde(default = "default_keybert_ngram")]
    pub keyphrase_ngram_range: (usize, usize),
}
168
/// Payload of `ExtractorConfig::SpacyEntities`.
#[derive(Debug, Clone, Deserialize)]
pub struct SpacyEntitiesExtractorConfig {
    /// Model name; defaults to `"en_core_web_sm"`.
    #[serde(default = "default_spacy_model")]
    pub model: String,
    /// Entity labels; defaults to `ORG`, `PERSON`, `GPE`, `DATE`, `LAW`.
    /// Presumably only these labels are kept — confirm against the extractor.
    #[serde(default = "default_spacy_whitelist")]
    pub label_whitelist: Vec<String>,
}
176
/// serde default for `RakeKeywordsExtractorConfig::top_k`.
fn default_rake_top_k() -> usize {
    10
}

/// serde default for `RakeKeywordsExtractorConfig::min_chars`.
fn default_rake_min_chars() -> usize {
    3
}

/// serde default for `LangDetectExtractorConfig::backend`.
fn default_lang_backend() -> String {
    String::from("langdetect")
}

/// serde default for `KeybertPhrasesExtractorConfig::top_k`.
fn default_keybert_top_k() -> usize {
    10
}

/// serde default for `KeybertPhrasesExtractorConfig::model_name`.
fn default_keybert_model() -> String {
    String::from("all-MiniLM-L6-v2")
}

/// serde default for `KeybertPhrasesExtractorConfig::keyphrase_ngram_range`.
fn default_keybert_ngram() -> (usize, usize) {
    (1, 2)
}

/// serde default for `SpacyEntitiesExtractorConfig::model`.
fn default_spacy_model() -> String {
    String::from("en_core_web_sm")
}

/// serde default for `SpacyEntitiesExtractorConfig::label_whitelist`.
fn default_spacy_whitelist() -> Vec<String> {
    const LABELS: [&str; 5] = ["ORG", "PERSON", "GPE", "DATE", "LAW"];
    LABELS.iter().map(|label| (*label).to_owned()).collect()
}
204
/// Discriminated union over framer types, tagged on `type`. Variant names
/// are mapped to snake_case tags by `rename_all`.
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum FramerConfig {
    /// `type: identity` — the default when `framer:` is omitted.
    Identity(IdentityFramerConfig),
    /// `type: heading_boundary`.
    HeadingBoundary(HeadingBoundaryFramerConfig),
    /// `type: regex_boundary`.
    RegexBoundary(RegexBoundaryFramerConfig),
    /// `type: jsonpath`.
    Jsonpath(JsonPathFramerConfig),
    /// RM-A: agent-memory episode segmentation (gap-based + role/tool
    /// boundary + max-turns/max-words). Mirror of Python
    /// `chunkshop.framers.session_episode.SessionEpisodeFramer`.
    /// YAML tag: `type: session_episode`.
    SessionEpisode(SessionEpisodeFramerConfig),
}
217
218impl Default for FramerConfig {
219    fn default() -> Self {
220        FramerConfig::Identity(IdentityFramerConfig {})
221    }
222}
223
/// Payload of `FramerConfig::Identity`: the identity framer has no options.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct IdentityFramerConfig {}
226
/// Payload of `FramerConfig::HeadingBoundary`.
#[derive(Debug, Clone, Deserialize)]
pub struct HeadingBoundaryFramerConfig {
    /// Heading regex source; defaults to `^#+\s` (`default_heading_pattern`).
    /// Stored as a string — it is not compiled or validated by this type.
    #[serde(default = "default_heading_pattern")]
    pub pattern: String,
    /// Defaults to `true`. Presumably takes each frame's title from the
    /// matched heading — confirm against the framer.
    #[serde(default = "default_true")]
    pub title_from_heading: bool,
}
234
/// Payload of `FramerConfig::RegexBoundary`.
#[derive(Debug, Clone, Deserialize)]
pub struct RegexBoundaryFramerConfig {
    /// Required split regex source; stored as a string, not compiled here.
    pub split_pattern: String,
    /// Optional title regex source; `None` when omitted.
    #[serde(default)]
    pub title_pattern: Option<String>,
    /// Defaults to `true`. Presumably keeps the matched delimiter at the
    /// start of the following frame — confirm against the framer.
    #[serde(default = "default_true")]
    pub body_starts_with_match: bool,
}
243
/// Payload of `FramerConfig::Jsonpath` (YAML `type: jsonpath`).
#[derive(Debug, Clone, Deserialize)]
pub struct JsonPathFramerConfig {
    /// Required path expression selecting rows.
    pub row_path: String,
    /// Optional path expression for the title; `None` when omitted.
    #[serde(default)]
    pub title_path: Option<String>,
    /// Path expression for the body; defaults to `"$"`
    /// (`default_jsonpath_body`).
    #[serde(default = "default_jsonpath_body")]
    pub body_path: String,
}
252
/// RM-A Task 2: gap- and turn-based session episode segmentation.
/// Mirrors Python `SessionEpisodeFramerConfig`. All numeric fields default
/// to match the Python defaults exactly so the same YAML drives both impls.
///
/// `deny_unknown_fields` is set: a misspelled key in this section is a
/// load-time error rather than being silently ignored.
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SessionEpisodeFramerConfig {
    /// Episode boundary when the gap between consecutive events exceeds
    /// this many seconds. Default 1800 (= 30 min, matches Python).
    #[serde(default = "default_max_gap_seconds")]
    pub max_gap_seconds: u64,
    /// Boundary when an episode reaches this many turns. Default 40.
    #[serde(default = "default_max_turns")]
    pub max_turns: u32,
    /// Boundary when an episode reaches this many words. Default 1200.
    #[serde(default = "default_max_words")]
    pub max_words: u32,
    /// Place a boundary at the role change from non-tool to tool. Default true.
    #[serde(default = "default_true")]
    pub boundary_on_tool: bool,
}
273
/// serde default for `SessionEpisodeFramerConfig::max_gap_seconds` (30 min).
fn default_max_gap_seconds() -> u64 {
    30 * 60
}

/// serde default for `SessionEpisodeFramerConfig::max_turns`.
fn default_max_turns() -> u32 {
    40
}

/// serde default for `SessionEpisodeFramerConfig::max_words`.
fn default_max_words() -> u32 {
    1200
}

/// serde default for `HeadingBoundaryFramerConfig::pattern`.
fn default_heading_pattern() -> String {
    String::from(r"^#+\s")
}

/// Shared serde default for boolean options that are on unless disabled.
fn default_true() -> bool {
    true
}

/// serde default for `JsonPathFramerConfig::body_path`.
fn default_jsonpath_body() -> String {
    String::from("$")
}
293
/// Discriminated union over document sources, tagged on `type`. Variant
/// names are mapped to snake_case tags by `rename_all`.
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum SourceConfig {
    /// `type: files`.
    Files(FilesSourceConfig),
    /// `type: json_corpus`.
    JsonCorpus(JsonCorpusSourceConfig),
    /// `type: pg_table`.
    PgTable(PgTableSourceConfig),
    /// `type: mariadb_table`.
    MariadbTable(MariadbTableSourceConfig),
    /// `type: sqlite_table`.
    SqliteTable(SqliteTableSourceConfig),
    /// `type: http`.
    Http(HttpSourceConfig),
    /// `type: s3`.
    S3(S3SourceConfig),
    /// `type: clickhouse_table`.
    ClickhouseTable(ClickhouseTableSourceConfig),
    /// RM-A: agent-memory staging-table source. Consumed by the two
    /// `memory/*.yaml` cell presets (realtime + consolidate). Mirror of
    /// Python `chunkshop.sources.session_staging.SessionStagingSource`.
    /// YAML tag: `type: session_staging`.
    SessionStaging(SessionStagingSourceConfig),
    /// Library/embedded mode — no automatic iteration. The host application
    /// drives ingestion via `chunkshop::Pipeline::from_yaml(...)` and calls
    /// `pipeline.ingest_text(doc_id, text, metadata)` per document.
    /// `Runner::run_cell` rejects this variant; only `Pipeline` accepts it.
    /// YAML tag: `type: inline`.
    Inline(InlineSourceConfig),
}
315
/// Payload of `SourceConfig::Inline`: the inline source has no options.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct InlineSourceConfig {}
318
/// Payload of `SourceConfig::Files`.
#[derive(Debug, Clone, Deserialize)]
pub struct FilesSourceConfig {
    /// Required glob pattern selecting the input files.
    pub glob: String,
    /// Defaults to `"stem"` (`default_id_from`). Presumably selects how the
    /// document id is derived from the file path — confirm against the source.
    #[serde(default = "default_id_from")]
    pub id_from: String,
    /// Text encoding name; defaults to `"utf-8"` (`default_encoding`).
    #[serde(default = "default_encoding")]
    pub encoding: String,
}
327
/// Payload of `SourceConfig::JsonCorpus`.
#[derive(Debug, Clone, Deserialize)]
pub struct JsonCorpusSourceConfig {
    /// Required path of the corpus file.
    pub path: String,
    /// Defaults to `"documents"` (`default_documents_key`).
    #[serde(default = "default_documents_key")]
    pub documents_key: String,
    /// Defaults to `"id"` (`default_id_field`).
    #[serde(default = "default_id_field")]
    pub id_field: String,
    /// Defaults to `"content"` (`default_content_field`).
    #[serde(default = "default_content_field")]
    pub content_field: String,
    /// Defaults to `Some("title")` when the key is omitted; an explicit
    /// `null` in the YAML deserializes to `None`.
    #[serde(default = "default_title_field")]
    pub title_field: Option<String>,
}
340
/// Payload of `SourceConfig::PgTable`: reads documents from a Postgres table.
#[derive(Debug, Clone, Deserialize)]
pub struct PgTableSourceConfig {
    /// Required. Presumably the NAME of the environment variable holding the
    /// DSN, not the DSN itself — confirm against the source implementation.
    pub dsn_env: String,
    /// Schema name; spelled `schema` in YAML.
    #[serde(rename = "schema")]
    pub schema_name: String,
    /// Required table name.
    pub table: String,
    /// Required column providing the document id.
    pub id_column: String,
    /// Required column providing the document text.
    pub content_column: String,
    /// Optional title column; `None` when omitted.
    #[serde(default)]
    pub title_column: Option<String>,
    /// Trusted operator-supplied SQL fragment appended after `WHERE`. Mirrors
    /// Python's `pg_table.py` which interpolates this verbatim. NOT validated;
    /// don't expose this field to untrusted YAML authors.
    /// Spelled `where` in YAML.
    #[serde(default, rename = "where")]
    pub where_clause: Option<String>,
    /// Extra columns to pull alongside id/content/title and put into each
    /// Document's metadata. Pair with `target.promote_metadata` to surface
    /// specific keys as typed columns in the target table.
    #[serde(default)]
    pub metadata_columns: Vec<String>,
    /// RM-B Task 2 / Python ff01268: optional timestamp column enabling
    /// cursor-based incremental sync. When set, the source implements
    /// `IncrementalSource` with a tuple cursor of shape
    /// `{"after_ts": "<iso ts>", "after_id": "<id>"}`. The tuple ordering
    /// defends against silent row loss when multiple rows commit at the
    /// boundary timestamp.
    #[serde(default)]
    pub updated_at_column: Option<String>,
}
370
/// Payload of `SourceConfig::MariadbTable`: reads documents from a MariaDB
/// table. Same field layout as the other `*_table` sources.
#[derive(Debug, Clone, Deserialize)]
pub struct MariadbTableSourceConfig {
    /// Required. Presumably the NAME of the environment variable holding the
    /// DSN — confirm against the source implementation.
    pub dsn_env: String,
    /// Database name; spelled `database` in YAML.
    #[serde(rename = "database")]
    pub database_name: String,
    /// Required table name.
    pub table: String,
    /// Required column providing the document id.
    pub id_column: String,
    /// Required column providing the document text.
    pub content_column: String,
    /// Optional title column; `None` when omitted.
    #[serde(default)]
    pub title_column: Option<String>,
    /// Trusted operator-supplied SQL fragment appended after `WHERE`. Same
    /// contract as PgTableSourceConfig.where_clause — NOT validated.
    /// Spelled `where` in YAML.
    #[serde(default, rename = "where")]
    pub where_clause: Option<String>,
    /// Extra columns copied into each document's metadata; empty when omitted.
    #[serde(default)]
    pub metadata_columns: Vec<String>,
}
388
/// SQLite source. Mirrors `python/src/chunkshop/sources/sqlite_table.py`.
/// `database` is validated as a non-empty ident at config-load (loose parity
/// with Postgres) but ignored at runtime — SQLite has no schema namespace.
///
/// NOTE(review): that validation is not performed by this struct (the field
/// is a plain required `String`); it presumably lives in a separate
/// validation pass — confirm.
#[derive(Debug, Clone, Deserialize)]
pub struct SqliteTableSourceConfig {
    /// Required. Presumably the NAME of the environment variable holding the
    /// DSN/path — confirm against the source implementation.
    pub dsn_env: String,
    /// Spelled `database` in YAML; required even though unused at runtime.
    #[serde(rename = "database")]
    pub database_name: String,
    /// Required table name.
    pub table: String,
    /// Required column providing the document id.
    pub id_column: String,
    /// Required column providing the document text.
    pub content_column: String,
    /// Optional title column; `None` when omitted.
    #[serde(default)]
    pub title_column: Option<String>,
    /// Trusted operator-supplied SQL fragment appended after `WHERE`. Same
    /// contract as PgTableSourceConfig.where_clause — NOT validated.
    /// Spelled `where` in YAML.
    #[serde(default, rename = "where")]
    pub where_clause: Option<String>,
    /// Extra columns copied into each document's metadata; empty when omitted.
    #[serde(default)]
    pub metadata_columns: Vec<String>,
}
409
/// ClickHouse source. Mirrors `python/src/chunkshop/sources/clickhouse_table.py`.
/// Same field layout as the other `*_table` sources.
#[derive(Debug, Clone, Deserialize)]
pub struct ClickhouseTableSourceConfig {
    /// Required. Presumably the NAME of the environment variable holding the
    /// DSN — confirm against the source implementation.
    pub dsn_env: String,
    /// Database name; spelled `database` in YAML.
    #[serde(rename = "database")]
    pub database_name: String,
    /// Required table name.
    pub table: String,
    /// Required column providing the document id.
    pub id_column: String,
    /// Required column providing the document text.
    pub content_column: String,
    /// Optional title column; `None` when omitted.
    #[serde(default)]
    pub title_column: Option<String>,
    /// Trusted operator-supplied SQL fragment appended after `WHERE`. Mirrors
    /// Python's `clickhouse_table.py` which interpolates this verbatim. NOT
    /// validated; don't expose this field to untrusted YAML authors.
    /// Spelled `where` in YAML.
    #[serde(default, rename = "where")]
    pub where_clause: Option<String>,
    /// Extra columns copied into each document's metadata; empty when omitted.
    #[serde(default)]
    pub metadata_columns: Vec<String>,
}
429
/// Payload of `SourceConfig::Http`: fetches documents from URLs and/or a
/// sitemap, optionally following links. Every field has a serde default, so
/// `type: http` alone deserializes (with no seeds).
#[derive(Debug, Clone, Deserialize)]
pub struct HttpSourceConfig {
    /// Seed URLs; empty when omitted.
    #[serde(default)]
    pub urls: Vec<String>,
    /// Optional sitemap location; `None` when omitted.
    #[serde(default)]
    pub sitemap: Option<String>,
    /// RM-B Task 4 / Python fcbad65: Depth-bounded link crawl. 0 = current
    /// behavior (fetch only listed URLs + sitemap entries). `>=1` follows
    /// that many link-hops from each seed via `<a href>` extraction.
    /// Capped at 5 to match Python's `ge=0, le=5` Field constraint.
    ///
    /// NOTE(review): the cap is not enforced by this type (a plain `u32`
    /// accepts any value); presumably a separate validation step rejects
    /// values above 5 — confirm.
    #[serde(default)]
    pub crawl_depth: u32,
    /// By default the crawler only follows same-host links. Flip to true to
    /// follow off-host links too (still rate-limited, still subject to
    /// `max_pages` and `respect_robots`).
    #[serde(default)]
    pub allow_external: bool,
    /// Minimum delay between outbound requests (per source, not per host).
    /// Default 0.5s matches Python.
    #[serde(default = "default_request_delay_seconds")]
    pub request_delay_seconds: f64,
    /// Enforce robots.txt. One fetch per host, cached. Default true.
    #[serde(default = "default_respect_robots")]
    pub respect_robots: bool,
    /// Hard runaway cap on number of pages fetched per call. Default 1000.
    #[serde(default = "default_max_pages")]
    pub max_pages: u64,
    /// User-Agent header. Default matches Python's `chunkshop/0.6 (+https://…)`.
    #[serde(default = "default_user_agent")]
    pub user_agent: String,
}
461
/// serde default for `HttpSourceConfig::request_delay_seconds` (half a second).
fn default_request_delay_seconds() -> f64 {
    0.5
}

/// serde default for `HttpSourceConfig::respect_robots`.
fn default_respect_robots() -> bool {
    true
}

/// serde default for `HttpSourceConfig::max_pages`.
fn default_max_pages() -> u64 {
    1000
}

/// serde default for `HttpSourceConfig::user_agent`.
fn default_user_agent() -> String {
    String::from("chunkshop/0.6 (+https://github.com/yonk-labs/chunkshop)")
}
474
475impl Default for HttpSourceConfig {
476    fn default() -> Self {
477        Self {
478            urls: Vec::new(),
479            sitemap: None,
480            crawl_depth: 0,
481            allow_external: false,
482            request_delay_seconds: default_request_delay_seconds(),
483            respect_robots: default_respect_robots(),
484            max_pages: default_max_pages(),
485            user_agent: default_user_agent(),
486        }
487    }
488}
489
/// Payload of `SourceConfig::S3`.
#[derive(Debug, Clone, Deserialize)]
pub struct S3SourceConfig {
    /// Required bucket name.
    pub bucket: String,
    /// Key prefix; the empty string when omitted.
    #[serde(default)]
    pub prefix: String,
    /// Optional S3-compatible endpoint URL (minio, R2, custom). When None,
    /// `object_store` resolves the standard AWS S3 endpoint per the
    /// credential's region.
    #[serde(default)]
    pub endpoint_url: Option<String>,
}
501
502// --- RM-A agent-memory configs (mirror Python SP-A) -------------------------
503
/// SP-A memory tier — `provisional` (realtime path, supersede=false) or
/// `consolidated` (lazy path, supersede=true).
///
/// `rename_all = "lowercase"`: the YAML values are exactly `provisional`
/// and `consolidated`.
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum MemoryTier {
    /// YAML value `provisional`.
    Provisional,
    /// YAML value `consolidated`.
    Consolidated,
}
512
/// MemorySink-side block on `TargetConfig::Postgres`. When present, the
/// `load_sink` dispatcher returns a `MemorySink` instead of a plain `PgSink`.
/// Mirror of Python `chunkshop.config.MemoryConfig`. `deny_unknown_fields`
/// per RM-A spec (typos must fail load-time).
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MemoryConfig {
    /// Required memory tier.
    pub tier: MemoryTier,
    /// Whether consolidated rows replace prior provisional rows for the same
    /// session. `true` is the intended setting for the `consolidated` tier;
    /// `false` is normal for `provisional` so realtime cells don't churn the
    /// table.
    ///
    /// The serde default is `true` regardless of `tier`
    /// (`default_memory_supersede`), so a `provisional` cell that wants
    /// `false` must say so explicitly.
    #[serde(default = "default_memory_supersede")]
    pub supersede: bool,
    /// When None, MemorySink falls back to `source_tag` as the namespace —
    /// matches the Python default.
    #[serde(default)]
    pub namespace: Option<String>,
}
531
/// serde default for `MemoryConfig::supersede`; it does not depend on the
/// configured tier.
fn default_memory_supersede() -> bool {
    const SUPERSEDE_BY_DEFAULT: bool = true;
    SUPERSEDE_BY_DEFAULT
}
535
/// SP-A staging-table source. Reads the chunkshop-owned append-only
/// staging table (`schema.table`, default `public.chunkshop_staging`)
/// and yields one `Document` per session. `mode='realtime'` advances
/// the per-session realtime watermark; `mode='consolidate'` uses a
/// session-level eligibility WHERE so a late event after consolidation
/// triggers a full-staging rebuild (SP-A spec O1 — non-negotiable;
/// see Task 5 of the RM-A plan and Python fix `49861dc`).
///
/// `deny_unknown_fields` is set: a misspelled key is a load-time error.
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SessionStagingSourceConfig {
    /// Direct DSN (literal or `${VAR}` interpolated at connect time).
    /// At least one of `dsn` / `dsn_env` must be set.
    ///
    /// NOTE(review): this struct does not enforce that rule (both fields
    /// default to `None`); presumably it is checked elsewhere — confirm.
    #[serde(default)]
    pub dsn: Option<String>,
    /// Alternative to `dsn`; presumably the name of an environment variable
    /// holding the DSN — confirm against the source implementation.
    #[serde(default)]
    pub dsn_env: Option<String>,
    /// Staging table name; defaults to `"chunkshop_staging"`.
    #[serde(default = "default_staging_table")]
    pub staging_table: String,
    /// Staging schema name; defaults to `"public"`.
    #[serde(default = "default_staging_schema")]
    pub staging_schema: String,
    /// Required: `realtime` or `consolidate`.
    pub mode: SessionStagingMode,
    /// Consolidate-mode: only sessions with `max(event_ts|staged_at) < now() -
    /// min_age_seconds` are selected. Lets quiet sessions consolidate while
    /// active sessions stay provisional. Ignored in realtime mode.
    /// Defaults to 0 when omitted.
    #[serde(default)]
    pub min_age_seconds: u64,
    /// Cap on yielded sessions per run (None = no cap). Mirrors Python.
    #[serde(default)]
    pub max_sessions: Option<usize>,
}
566
/// Read mode of the session staging source. `rename_all = "lowercase"`:
/// the YAML values are exactly `realtime` and `consolidate`.
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum SessionStagingMode {
    /// YAML value `realtime`.
    Realtime,
    /// YAML value `consolidate`.
    Consolidate,
}
573
/// serde default for `SessionStagingSourceConfig::staging_table`.
fn default_staging_table() -> String {
    String::from("chunkshop_staging")
}

/// serde default for `SessionStagingSourceConfig::staging_schema`.
fn default_staging_schema() -> String {
    String::from("public")
}

/// serde default for `FilesSourceConfig::id_from`.
fn default_id_from() -> String {
    String::from("stem")
}

/// serde default for `FilesSourceConfig::encoding`.
fn default_encoding() -> String {
    String::from("utf-8")
}

/// serde default for `JsonCorpusSourceConfig::documents_key`.
fn default_documents_key() -> String {
    String::from("documents")
}

/// serde default for `JsonCorpusSourceConfig::id_field`.
fn default_id_field() -> String {
    String::from("id")
}

/// serde default for `JsonCorpusSourceConfig::content_field`.
fn default_content_field() -> String {
    String::from("content")
}

/// serde default for `JsonCorpusSourceConfig::title_field`; a title field
/// is assumed unless the YAML overrides it.
fn default_title_field() -> Option<String> {
    Some(String::from("title"))
}
601
/// Discriminated union over chunker types, tagged on `type`. Variant names
/// are mapped to snake_case tags by `rename_all`; `type_name()` returns the
/// same strings.
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ChunkerConfig {
    /// `type: sentence_aware`.
    SentenceAware(SentenceAwareChunkerConfig),
    /// `type: hierarchy`.
    Hierarchy(HierarchyChunkerConfig),
    /// `type: fixed_overlap`.
    FixedOverlap(FixedOverlapChunkerConfig),
    /// `type: neighbor_expand` — wraps a `base` chunker.
    NeighborExpand(NeighborExpandChunkerConfig),
    /// `type: semantic`.
    Semantic(SemanticChunkerConfig),
    /// `type: summary_embed` — wraps a `base` chunker.
    SummaryEmbed(SummaryEmbedChunkerConfig),
    /// `type: hierarchical_summary` — wraps a `base` chunker.
    HierarchicalSummary(HierarchicalSummaryChunkerConfig),
    /// RM-A: wraps a base chunker to emit `kind=episode` chunks plus
    /// per-triple `kind=fact` chunks from a `Consolidator` callable.
    /// Mirror of Python `chunkshop.chunkers.consolidation.ConsolidationChunker`.
    /// YAML tag: `type: consolidation`.
    Consolidation(ConsolidationChunkerConfig),
    /// RM-C: emits one chunk per extracted code symbol (function / class /
    /// method) via tree-sitter. Gated on the `code-aware` feature; per-grammar
    /// dispatch in the factory is gated on `code-aware-<lang>`.
    /// YAML tag: `type: symbol_aware` (only exists when the feature is on).
    #[cfg(feature = "code-aware")]
    SymbolAware(SymbolAwareChunkerConfig),
}
622
623impl ChunkerConfig {
624    /// Resolve the effective `max_chars` ceiling for this chunker. Wrappers
625    /// (`neighbor_expand`, `summary_embed`, `hierarchical_summary`) fall back
626    /// to `base.effective_max_chars()` when no explicit override is set.
627    /// `fixed_overlap` returns `None` unless the user opted in via `max_chars`.
628    /// Mirrors Python's `ChunkerConfig.effective_max_chars` resolver.
629    /// Brief SC-003.
630    pub fn effective_max_chars(&self) -> Option<usize> {
631        match self {
632            ChunkerConfig::SentenceAware(c) => Some(c.max_chars),
633            ChunkerConfig::Hierarchy(c) => Some(c.max_chars),
634            ChunkerConfig::FixedOverlap(c) => c.max_chars,
635            ChunkerConfig::Semantic(c) => Some(c.max_chunk_chars),
636            ChunkerConfig::NeighborExpand(c) => {
637                c.max_chars.or_else(|| c.base.effective_max_chars())
638            }
639            ChunkerConfig::SummaryEmbed(c) => c.max_chars.or_else(|| c.base.effective_max_chars()),
640            ChunkerConfig::HierarchicalSummary(c) => {
641                c.max_chars.or_else(|| c.base.effective_max_chars())
642            }
643            ChunkerConfig::Consolidation(c) => c.base.effective_max_chars(),
644            #[cfg(feature = "code-aware")]
645            ChunkerConfig::SymbolAware(_) => None,
646        }
647    }
648
649    /// Borrow the optional `if_oversize` fallback chunker config. Returns
650    /// `None` for chunkers that haven't opted in. Brief SC-001.
651    pub fn if_oversize(&self) -> Option<&ChunkerConfig> {
652        match self {
653            ChunkerConfig::SentenceAware(c) => c.if_oversize.as_deref(),
654            ChunkerConfig::Hierarchy(c) => c.if_oversize.as_deref(),
655            ChunkerConfig::FixedOverlap(c) => c.if_oversize.as_deref(),
656            ChunkerConfig::Semantic(c) => c.if_oversize.as_deref(),
657            ChunkerConfig::NeighborExpand(c) => c.if_oversize.as_deref(),
658            ChunkerConfig::SummaryEmbed(c) => c.if_oversize.as_deref(),
659            ChunkerConfig::HierarchicalSummary(c) => c.if_oversize.as_deref(),
660            ChunkerConfig::Consolidation(c) => c.if_oversize.as_deref(),
661            #[cfg(feature = "code-aware")]
662            ChunkerConfig::SymbolAware(_) => None,
663        }
664    }
665
666    /// Stable lower-snake-case discriminator string for logs/error messages.
667    pub fn type_name(&self) -> &'static str {
668        match self {
669            ChunkerConfig::SentenceAware(_) => "sentence_aware",
670            ChunkerConfig::Hierarchy(_) => "hierarchy",
671            ChunkerConfig::FixedOverlap(_) => "fixed_overlap",
672            ChunkerConfig::NeighborExpand(_) => "neighbor_expand",
673            ChunkerConfig::Semantic(_) => "semantic",
674            ChunkerConfig::SummaryEmbed(_) => "summary_embed",
675            ChunkerConfig::HierarchicalSummary(_) => "hierarchical_summary",
676            ChunkerConfig::Consolidation(_) => "consolidation",
677            #[cfg(feature = "code-aware")]
678            ChunkerConfig::SymbolAware(_) => "symbol_aware",
679        }
680    }
681}
682
/// Payload of `ChunkerConfig::SentenceAware`.
#[derive(Debug, Clone, Deserialize)]
pub struct SentenceAwareChunkerConfig {
    /// Defaults via `default_doc_type`.
    #[serde(default = "default_doc_type")]
    pub doc_type: String,
    /// Char ceiling reported by `ChunkerConfig::effective_max_chars`;
    /// defaults via `default_max_chars`.
    #[serde(default = "default_max_chars")]
    pub max_chars: usize,
    /// Defaults via `default_min_chars`.
    #[serde(default = "default_min_chars")]
    pub min_chars: usize,
    /// Optional fallback chunker config, returned by
    /// `ChunkerConfig::if_oversize`; boxed because the type is recursive.
    #[serde(default)]
    pub if_oversize: Option<Box<ChunkerConfig>>,
}
694
/// RM-C: config for `SymbolAwareChunker`. Mirrors Python
/// `chunkshop.chunkers.symbol_aware.SymbolAwareChunkerConfig`.
/// Only compiled with the `code-aware` feature.
#[cfg(feature = "code-aware")]
#[derive(Debug, Clone, Deserialize, Default)]
pub struct SymbolAwareChunkerConfig {
    /// Optional project_id passed to `code_symbol_node_id` for scoping.
    /// Deserializes to `None` when omitted; the `"default"` fallback that
    /// mirrors Python is presumably applied by the consumer of this config,
    /// not here — confirm.
    #[serde(default)]
    pub project_id: Option<String>,
}
705
/// Payload of `ChunkerConfig::Hierarchy`.
#[derive(Debug, Clone, Deserialize)]
pub struct HierarchyChunkerConfig {
    /// Defaults via `default_prefix_heading`.
    #[serde(default = "default_prefix_heading")]
    pub prefix_heading: bool,
    /// Defaults via `default_min_section_chars`.
    #[serde(default = "default_min_section_chars")]
    pub min_section_chars: usize,
    /// Char ceiling reported by `ChunkerConfig::effective_max_chars`;
    /// defaults via `default_max_chars`.
    #[serde(default = "default_max_chars")]
    pub max_chars: usize,
    /// Optional fallback chunker config, returned by
    /// `ChunkerConfig::if_oversize`.
    #[serde(default)]
    pub if_oversize: Option<Box<ChunkerConfig>>,
    /// Optional custom heading regex. When `None` (the default), the chunker
    /// uses the built-in markdown heading pattern (`^#{1,6}\s+.+$`). When
    /// `Some(s)`, `s` is compiled as a `regex::Regex` (eagerly, in
    /// `HierarchyChunker::new`) and used as the section-boundary matcher in
    /// place of the markdown pattern. If the custom regex contains a first
    /// capture group, that group's match is used as the heading text;
    /// otherwise the full match is used. Lets external consumers tune
    /// hierarchy chunking for AsciiDoc, reST, or any custom delimiter shape.
    ///
    /// `skip_serializing_if` has no effect while this struct only derives
    /// `Deserialize`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub heading_pattern: Option<String>,
}
727
/// Payload of `ChunkerConfig::FixedOverlap`: a word-window chunker.
#[derive(Debug, Clone, Deserialize)]
pub struct FixedOverlapChunkerConfig {
    /// Window size in words; defaults via `default_window_words`.
    #[serde(default = "default_window_words")]
    pub window_words: usize,
    /// Step in words; defaults via `default_step_words`.
    #[serde(default = "default_step_words")]
    pub step_words: usize,
    /// Optional post-hoc char ceiling for emitted chunks. Mirrors Python's
    /// `FixedOverlapChunker.max_chars` added in 0.3.2 (Brief SC-002). When
    /// `None`, behavior is unchanged from 0.3.1 (word-only window). When set,
    /// the chunker pairs with `if_oversize` to fall back over chunks that
    /// exceed this ceiling.
    #[serde(default)]
    pub max_chars: Option<usize>,
    /// Optional fallback chunker config, returned by
    /// `ChunkerConfig::if_oversize`.
    #[serde(default)]
    pub if_oversize: Option<Box<ChunkerConfig>>,
}
744
/// Payload of `ChunkerConfig::NeighborExpand`: a wrapper around a `base`
/// chunker.
#[derive(Debug, Clone, Deserialize)]
pub struct NeighborExpandChunkerConfig {
    /// Required wrapped chunker config; boxed because the type is recursive.
    pub base: Box<ChunkerConfig>,
    /// Defaults via `default_neighbor_window`. Presumably the number of
    /// neighboring chunks pulled in on each side — confirm against the
    /// chunker.
    #[serde(default = "default_neighbor_window")]
    pub window: usize,
    /// Explicit char ceiling override. When `None`, the wrapper inherits
    /// `base.effective_max_chars()` (Brief SC-003).
    #[serde(default)]
    pub max_chars: Option<usize>,
    /// Optional fallback chunker config, returned by
    /// `ChunkerConfig::if_oversize`.
    #[serde(default)]
    pub if_oversize: Option<Box<ChunkerConfig>>,
}
757
/// RM-A Task 3: ConsolidationChunker wraps a base chunker; emits one
/// `kind=episode` chunk per base chunk and one `kind=fact` chunk per
/// triple returned by the wired `Consolidator`. Mirror of Python
/// `chunkshop.chunkers.consolidation.ConsolidationChunker`.
///
/// `deny_unknown_fields` is set: a misspelled key is a load-time error.
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ConsolidationChunkerConfig {
    /// Underlying chunker for episode-text segmentation (typically
    /// `sentence_aware` with a memory-sized ceiling).
    pub base: Box<ChunkerConfig>,
    /// Which consolidator wires the SPO-triple extraction. v1 ships
    /// `mode: extractive` (zero-network default); LLM modes wired by the
    /// host application as Rust trait impls.
    pub consolidator: ConsolidatorConfig,
    /// Hard char cap for `kind=fact` chunks. Default 1200 (Python parity).
    /// Not consulted by `ChunkerConfig::effective_max_chars`, which reports
    /// the `base` chunker's ceiling for this variant.
    #[serde(default = "default_fact_max_chars")]
    pub fact_max_chars: usize,
    /// Optional fallback chunker config, returned by
    /// `ChunkerConfig::if_oversize`.
    #[serde(default)]
    pub if_oversize: Option<Box<ChunkerConfig>>,
}
778
/// serde default for `ConsolidationChunkerConfig::fact_max_chars`.
fn default_fact_max_chars() -> usize {
    const FACT_MAX_CHARS: usize = 1200;
    FACT_MAX_CHARS
}
782
/// Consolidator wiring. `mode: extractive` selects the zero-network Rust
/// default (Task 7). Future v1.x: `mode: llm` once a deterministic LLM
/// seam lands. Custom impls live in host code and aren't selected via
/// YAML — they're injected at `build_chunker` time by the consumer.
///
/// Tagged on `mode` (not `type`); `extractive` is the only accepted value.
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "mode", rename_all = "snake_case", deny_unknown_fields)]
pub enum ConsolidatorConfig {
    /// `mode: extractive`.
    Extractive(ExtractiveConsolidatorConfig),
}
792
/// Payload of `ConsolidatorConfig::Extractive`: no options, and
/// `deny_unknown_fields` rejects any extra key.
#[derive(Debug, Clone, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct ExtractiveConsolidatorConfig {}
796
/// Payload of `ChunkerConfig::Semantic`.
#[derive(Debug, Clone, Deserialize)]
pub struct SemanticChunkerConfig {
    /// Defaults via `default_boundary_model`.
    #[serde(default = "default_boundary_model")]
    pub boundary_model: String,
    /// Defaults via `default_breakpoint_percentile`. The type accepts any
    /// `u32`; no 0–100 range check is applied here.
    #[serde(default = "default_breakpoint_percentile")]
    pub breakpoint_percentile: u32,
    /// Defaults via `default_min_sents_per_chunk`.
    #[serde(default = "default_min_sents_per_chunk")]
    pub min_sentences_per_chunk: usize,
    /// Char ceiling reported by `ChunkerConfig::effective_max_chars`;
    /// defaults via `default_max_chunk_chars`.
    #[serde(default = "default_max_chunk_chars")]
    pub max_chunk_chars: usize,
    /// Defaults via `default_sentence_splitter`.
    #[serde(default = "default_sentence_splitter")]
    pub sentence_splitter: String,
    /// Optional fallback chunker config, returned by
    /// `ChunkerConfig::if_oversize`.
    #[serde(default)]
    pub if_oversize: Option<Box<ChunkerConfig>>,
}
812
/// Discriminated union over summarizer modes. Mirrors Python's `SummarizerConfig`.
/// Tagged on `mode` (matches the Python YAML).
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "mode", rename_all = "snake_case")]
pub enum SummarizerConfig {
    /// `mode: external`.
    External(ExternalSummarizerConfig),
    /// `mode: callable`.
    Callable(CallableSummarizerConfig),
    /// `mode: passthrough`.
    Passthrough(PassthroughSummarizerConfig),
}
822
823impl SummarizerConfig {
824    /// One of `"external"`, `"callable"`, `"passthrough"` — the value chunkshop
825    /// stamps into `metadata.summarizer` for traceability.
826    pub fn mode_str(&self) -> &'static str {
827        match self {
828            SummarizerConfig::External(_) => "external",
829            SummarizerConfig::Callable(_) => "callable",
830            SummarizerConfig::Passthrough(_) => "passthrough",
831        }
832    }
833}
834
/// Options for `mode: external`.
#[derive(Debug, Clone, Deserialize)]
pub struct ExternalSummarizerConfig {
    /// Field name; defaults to `summary`.
    /// NOTE(review): presumably the field that carries a pre-computed
    /// summary — confirm against the summarizer implementation.
    #[serde(default = "default_external_field")]
    pub field: String,
}
840
/// Options for `mode: callable`.
#[derive(Debug, Clone, Deserialize)]
pub struct CallableSummarizerConfig {
    /// Module name. Required — there is no serde default.
    pub module: String,
    /// Function name; defaults to `summarize`.
    #[serde(default = "default_callable_function")]
    pub function: String,
    /// Free-form keyword arguments parsed as a JSON object; empty when omitted.
    #[serde(default)]
    pub kwargs: serde_json::Map<String, serde_json::Value>,
}
849
/// Options for `mode: passthrough`. The struct has no fields.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct PassthroughSummarizerConfig {}
852
/// Serde default for `ExternalSummarizerConfig::field`.
fn default_external_field() -> String {
    String::from("summary")
}

/// Serde default for `CallableSummarizerConfig::function`.
fn default_callable_function() -> String {
    String::from("summarize")
}
859
/// Discriminated union over grouping strategies for HierarchicalSummaryChunker.
/// Mirrors Python's `GroupingConfig`. Tagged on `strategy`; variant names are
/// snake_cased, so the accepted tags are `fixed_n`, `word_budget`, and
/// `section_aware`.
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "strategy", rename_all = "snake_case")]
pub enum GroupingConfig {
    /// `strategy: fixed_n`.
    FixedN(FixedNGroupingConfig),
    /// `strategy: word_budget`.
    WordBudget(WordBudgetGroupingConfig),
    /// `strategy: section_aware`. `validate_chunker_config` requires the
    /// hierarchical_summary base chunker to be `hierarchy` for this strategy.
    SectionAware(SectionAwareGroupingConfig),
}
869
870impl Default for GroupingConfig {
871    fn default() -> Self {
872        GroupingConfig::FixedN(FixedNGroupingConfig::default())
873    }
874}
875
/// Options for `strategy: fixed_n`.
#[derive(Debug, Clone, Deserialize)]
pub struct FixedNGroupingConfig {
    /// Group size; defaults to 5.
    #[serde(default = "default_fixed_n")]
    pub n: usize,
}
881
882impl Default for FixedNGroupingConfig {
883    fn default() -> Self {
884        Self {
885            n: default_fixed_n(),
886        }
887    }
888}
889
/// Options for `strategy: word_budget`.
#[derive(Debug, Clone, Deserialize)]
pub struct WordBudgetGroupingConfig {
    /// Word budget; defaults to 2000.
    #[serde(default = "default_word_budget")]
    pub max_words: usize,
}
895
/// Options for `strategy: section_aware`. The struct has no fields.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct SectionAwareGroupingConfig {}
898
/// Serde default for `FixedNGroupingConfig::n`.
fn default_fixed_n() -> usize {
    5
}

/// Serde default for `WordBudgetGroupingConfig::max_words`.
fn default_word_budget() -> usize {
    2_000
}
905
/// Config for the summary-embed wrapper chunker: a `base` chunker plus a
/// summarizer.
#[derive(Debug, Clone, Deserialize)]
pub struct SummaryEmbedChunkerConfig {
    /// Wrapped chunker config. Required; validated recursively by
    /// `validate_chunker_config`.
    pub base: Box<ChunkerConfig>,
    /// Summarizer selection. Required.
    pub summarizer: SummarizerConfig,
    /// Explicit char ceiling override. When `None`, the wrapper inherits
    /// `base.effective_max_chars()` (Brief SC-003).
    #[serde(default)]
    pub max_chars: Option<usize>,
    /// Optional fallback chunker config; absent by default.
    #[serde(default)]
    pub if_oversize: Option<Box<ChunkerConfig>>,
}
917
/// Config for the hierarchical-summary wrapper chunker: a `base` chunker, a
/// summarizer, and a grouping strategy.
#[derive(Debug, Clone, Deserialize)]
pub struct HierarchicalSummaryChunkerConfig {
    /// Wrapped chunker config. Required. Must be `hierarchy` when `grouping`
    /// is `section_aware` (enforced by `validate_chunker_config`).
    pub base: Box<ChunkerConfig>,
    /// Summarizer selection. Required.
    pub summarizer: SummarizerConfig,
    /// Grouping strategy. When omitted, `GroupingConfig::default()` applies
    /// (`fixed_n` with its default `n`).
    #[serde(default)]
    pub grouping: GroupingConfig,
    /// Explicit char ceiling override. When `None`, the wrapper inherits
    /// `base.effective_max_chars()` (Brief SC-003). Only fine rows are
    /// checked; coarse rows are exempt by design (Brief SC-005).
    #[serde(default)]
    pub max_chars: Option<usize>,
    /// Optional fallback chunker config; absent by default.
    #[serde(default)]
    pub if_oversize: Option<Box<ChunkerConfig>>,
}
932
// Default-value helpers for chunker configs. The semantic-chunker helpers
// back `SemanticChunkerConfig`; the others are named after fields declared
// elsewhere in this file.

/// Default for `window_words`: 300.
fn default_window_words() -> usize {
    300
}

/// Default for `step_words`: 150.
fn default_step_words() -> usize {
    150
}

/// Default for `neighbor_window`: 1.
fn default_neighbor_window() -> usize {
    1
}

/// Serde default for `SemanticChunkerConfig::boundary_model`.
fn default_boundary_model() -> String {
    String::from("sentence-transformers/all-MiniLM-L6-v2-int8")
}

/// Serde default for `SemanticChunkerConfig::breakpoint_percentile`.
fn default_breakpoint_percentile() -> u32 {
    95
}

/// Serde default for `SemanticChunkerConfig::min_sentences_per_chunk`.
fn default_min_sents_per_chunk() -> usize {
    3
}

/// Serde default for `SemanticChunkerConfig::max_chunk_chars`.
fn default_max_chunk_chars() -> usize {
    2_000
}

/// Serde default for `SemanticChunkerConfig::sentence_splitter`.
fn default_sentence_splitter() -> String {
    String::from("naive")
}

/// Default for `doc_type`: `prose`.
fn default_doc_type() -> String {
    String::from("prose")
}

/// Default for `max_chars`: 2000.
fn default_max_chars() -> usize {
    2_000
}

/// Default for `min_chars`: 200.
fn default_min_chars() -> usize {
    200
}

/// Default for `prefix_heading`: enabled.
fn default_prefix_heading() -> bool {
    true
}

/// Default for `min_section_chars`: 100.
fn default_min_section_chars() -> usize {
    100
}
973
/// Embedder selection, internally tagged on the `type` key with snake_case
/// variant names. `type: fastembed` is the only variant.
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EmbedderConfig {
    /// `type: fastembed`.
    Fastembed(FastembedEmbedderConfig),
}
979
/// Options for the fastembed embedder. `model_name` and `dim` are required;
/// everything else has a serde default. Call `validate()` after
/// deserialization to enforce the `hf_repo`/`onnx_path` pairing.
#[derive(Debug, Clone, Deserialize)]
pub struct FastembedEmbedderConfig {
    /// Model name. Required.
    pub model_name: String,
    /// Embedding dimension. Required.
    pub dim: usize,
    /// Batch size; defaults to 64.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
    /// Optional thread count; `None` when omitted.
    #[serde(default)]
    pub threads: Option<usize>,

    // YAML-driven HF pointer ("BYO embedder"). When `hf_repo` is set, the
    // Rust dispatch routes through the user-defined ONNX path with these
    // values at runtime — no `user_defined_source` edit, no rebuild
    // required. When NOT set, dispatch falls back to the registry
    // (resolve_model_name + user_defined_source for known names).
    /// BYO mode: HF repo. Must be set together with `onnx_path`.
    #[serde(default)]
    pub hf_repo: Option<String>,
    /// BYO mode: ONNX path. Must be set together with `hf_repo`.
    #[serde(default)]
    pub onnx_path: Option<String>,
    /// Pooling mode; defaults to `cls`. `validate()` checks the value only
    /// when `hf_repo` is set.
    #[serde(default = "default_pooling")]
    pub pooling: String, // "cls" | "mean"
    /// Additional file names; defaults to the tokenizer/config JSON files
    /// returned by `default_additional_files`.
    #[serde(default = "default_additional_files")]
    pub additional_files: Vec<String>,
}
1003
/// Serde default for `FastembedEmbedderConfig::batch_size`.
fn default_batch_size() -> usize {
    64
}

/// Serde default for `FastembedEmbedderConfig::pooling`.
fn default_pooling() -> String {
    String::from("cls")
}

/// Serde default for `FastembedEmbedderConfig::additional_files`: the
/// tokenizer and config JSON files, in this fixed order.
fn default_additional_files() -> Vec<String> {
    [
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "config.json",
    ]
    .iter()
    .map(|&file_name| String::from(file_name))
    .collect()
}
1020
1021impl FastembedEmbedderConfig {
1022    /// True when YAML opted into BYO mode (both hf_repo + onnx_path set).
1023    pub fn is_byo(&self) -> bool {
1024        self.hf_repo.is_some() && self.onnx_path.is_some()
1025    }
1026
1027    /// Validate field pairing: `hf_repo` and `onnx_path` go together. Called
1028    /// post-deserialize alongside `validate_ident` in `load_config`.
1029    pub fn validate(&self) -> Result<()> {
1030        if self.hf_repo.is_some() != self.onnx_path.is_some() {
1031            return Err(anyhow!(
1032                "embedder.hf_repo and embedder.onnx_path must be set together \
1033                 (BYO mode) or both omitted (registry mode)."
1034            ));
1035        }
1036        if self.hf_repo.is_some() && !matches!(self.pooling.as_str(), "cls" | "mean") {
1037            return Err(anyhow!(
1038                "embedder.pooling must be 'cls' or 'mean' for BYO embedders, got {:?}",
1039                self.pooling
1040            ));
1041        }
1042        Ok(())
1043    }
1044}
1045
/// Sink selection, internally tagged on the `type` key with snake_case
/// variant names: `postgres`, `mariadb`, `sqlite`, or `clickhouse`.
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum TargetConfig {
    /// `type: postgres`.
    Postgres(PostgresTargetConfig),
    /// `type: mariadb`.
    Mariadb(MariadbTargetConfig),
    /// `type: sqlite`.
    Sqlite(SqliteTargetConfig),
    /// `type: clickhouse`.
    Clickhouse(ClickhouseTargetConfig),
}
1054
1055impl TargetConfig {
1056    /// Post-deserialize validation that crosses field boundaries. Delegates to
1057    /// the active variant's `validate()`.
1058    fn validate(&self) -> Result<()> {
1059        match self {
1060            TargetConfig::Postgres(t) => t.validate(),
1061            TargetConfig::Mariadb(t) => t.validate(),
1062            TargetConfig::Sqlite(t) => t.validate(),
1063            TargetConfig::Clickhouse(t) => t.validate(),
1064        }
1065    }
1066}
1067
/// The `target.documents` block. Only `enabled` is modelled here; unknown
/// fields are not denied, so other keys in the block are ignored by serde.
/// Every target's `validate()` rejects `enabled: true` through
/// `validate_no_document_store`.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct DocumentStoreConfig {
    /// Whether the document store is requested. Defaults to `false`.
    #[serde(default)]
    pub enabled: bool,
}
1073
1074fn validate_no_document_store(documents: &Option<DocumentStoreConfig>) -> Result<()> {
1075    if documents.as_ref().is_some_and(|d| d.enabled) {
1076        return Err(anyhow!(
1077            "target.documents is currently Python/Postgres-only; Rust does not write the companion document table yet"
1078        ));
1079    }
1080    Ok(())
1081}
1082
/// Postgres target (`target.type: postgres`). `database` and `table` are
/// required; every other field has a serde default.
#[derive(Debug, Clone, Deserialize)]
pub struct PostgresTargetConfig {
    /// Name of the DSN environment variable. Defaults to `CHUNKSHOP_DSN`.
    #[serde(default = "default_dsn_env")]
    pub dsn_env: String,
    /// YAML key `database`. Checked by `validate_ident` in `load_config`.
    #[serde(rename = "database")]
    pub database_name: String,
    /// Target table name. Checked by `validate_ident` in `load_config`.
    pub table: String,
    /// Legacy bool field from 0.3.x — accepted but never preferred. New configs
    /// should use `mode`. Top-level `target.overwrite: true` is still rejected
    /// at config-load by the legacy-form check (Task 13).
    #[serde(default)]
    pub overwrite: bool,
    /// HNSW flag. Defaults to `true`.
    #[serde(default = "default_hnsw")]
    pub hnsw: bool,
    /// Postgres/pgvector semantic-search metric: cosine, inner_product, or l2.
    /// Defaults to `cosine`; any other value is rejected by `validate()`.
    #[serde(default = "default_vector_metric")]
    pub vector_metric: String,
    /// `overwrite` (default), `append`, or `create_if_missing`. All three are
    /// implemented in Rust as of MB-3 (sink full-mode parity).
    #[serde(default = "default_mode")]
    pub mode: String,
    /// Tag identifying this cell. Required when `mode` is `append`; when set
    /// it is checked by `validate_ident` in `load_config`.
    #[serde(default)]
    pub source_tag: Option<String>,
    /// Promoted metadata columns; empty when omitted.
    #[serde(default)]
    pub promote_metadata: Vec<PromoteColumn>,
    /// Defaults to `false`.
    /// NOTE(review): semantics are not visible in this section — confirm
    /// against the sink implementation.
    #[serde(default)]
    pub force_overwrite: bool,
    /// When true, after upserting chunks for a document, delete any rows for
    /// that document with `seq_num >= len(new_chunks)`. Closes the per-doc
    /// shrink gap (last run wrote 12 chunks; this run writes 8 → drop the 4
    /// orphans inside the same write transaction). Default false to preserve
    /// the historical behavior. See `docs/incremental.md`.
    #[serde(default)]
    pub delete_orphans: bool,
    /// RM-A: when present, `load_sink` returns a MemorySink (extends PgSink
    /// with tier/kind stamping + supersede + soft-invalidate). Mirror of
    /// Python `chunkshop.config.TargetConfig.memory`.
    #[serde(default)]
    pub memory: Option<MemoryConfig>,
    /// Python/Postgres-only document table config. Rust rejects enabled
    /// document stores explicitly so cross-language YAMLs cannot silently
    /// ingest chunks without the companion document rows.
    #[serde(default)]
    pub documents: Option<DocumentStoreConfig>,
}
1128
1129impl PostgresTargetConfig {
1130    /// Post-deserialize validation that crosses field boundaries (e.g.
1131    /// mode/source_tag coupling). Identifier safety is enforced separately in
1132    /// `load_config` via `validate_ident`.
1133    fn validate(&self) -> Result<()> {
1134        if self.mode == "append" && self.source_tag.is_none() {
1135            return Err(anyhow!(
1136                "target.mode='append' requires target.source_tag to identify this cell"
1137            ));
1138        }
1139        validate_no_document_store(&self.documents)?;
1140        if !matches!(
1141            self.vector_metric.as_str(),
1142            "cosine" | "inner_product" | "l2"
1143        ) {
1144            return Err(anyhow!(
1145                "target.vector_metric must be one of 'cosine', 'inner_product', or 'l2', got {:?}",
1146                self.vector_metric
1147            ));
1148        }
1149        Ok(())
1150    }
1151}
1152
/// MariaDB target (`target.type: mariadb`). `database` and `table` are
/// required; every other field has a serde default.
#[derive(Debug, Clone, Deserialize)]
pub struct MariadbTargetConfig {
    /// Name of the DSN environment variable. Defaults to `CHUNKSHOP_DSN`.
    #[serde(default = "default_dsn_env")]
    pub dsn_env: String,
    /// YAML key `database`. Checked by `validate_ident` in `load_config`.
    #[serde(rename = "database")]
    pub database_name: String,
    /// Target table name. Checked by `validate_ident` in `load_config`.
    pub table: String,
    /// Legacy bool field from 0.3.x — accepted but never preferred. Same shape
    /// as PostgresTargetConfig.
    #[serde(default)]
    pub overwrite: bool,
    /// HNSW flag. Defaults to `true`.
    #[serde(default = "default_hnsw")]
    pub hnsw: bool,
    /// Write mode. Defaults to `overwrite`; `append` requires `source_tag`.
    #[serde(default = "default_mode")]
    pub mode: String,
    /// Tag identifying this cell. Required when `mode` is `append`.
    #[serde(default)]
    pub source_tag: Option<String>,
    /// Promoted metadata columns; empty when omitted.
    #[serde(default)]
    pub promote_metadata: Vec<PromoteColumn>,
    /// Defaults to `false`.
    /// NOTE(review): semantics are not visible in this section — confirm
    /// against the sink implementation.
    #[serde(default)]
    pub force_overwrite: bool,
    /// Defaults to `false`.
    #[serde(default)]
    pub delete_orphans: bool,
    /// Document-store block; `validate()` rejects it when `enabled` is true.
    #[serde(default)]
    pub documents: Option<DocumentStoreConfig>,
}
1179
1180impl MariadbTargetConfig {
1181    pub(crate) fn validate(&self) -> Result<()> {
1182        if self.mode == "append" && self.source_tag.is_none() {
1183            return Err(anyhow!(
1184                "target.mode='append' requires target.source_tag to identify this cell"
1185            ));
1186        }
1187        validate_no_document_store(&self.documents)?;
1188        Ok(())
1189    }
1190}
1191
/// ClickHouse target (`target.type: clickhouse`). `database` and `table` are
/// required; every other field has a serde default.
#[derive(Debug, Clone, Deserialize)]
pub struct ClickhouseTargetConfig {
    /// Name of the DSN environment variable. Defaults to `CHUNKSHOP_DSN`.
    #[serde(default = "default_dsn_env")]
    pub dsn_env: String,
    /// YAML key `database`. Checked by `validate_ident` in `load_config`.
    #[serde(rename = "database")]
    pub database_name: String,
    /// Target table name. Checked by `validate_ident` in `load_config`.
    pub table: String,
    /// HNSW flag. Defaults to `true`.
    #[serde(default = "default_hnsw")]
    pub hnsw: bool,
    /// Write mode. Defaults to `overwrite`; `append` requires `source_tag`.
    #[serde(default = "default_mode")]
    pub mode: String,
    /// Tag identifying this cell. Required when `mode` is `append`.
    #[serde(default)]
    pub source_tag: Option<String>,
    /// Promoted metadata columns; empty when omitted.
    #[serde(default)]
    pub promote_metadata: Vec<PromoteColumn>,
    /// Defaults to `false`.
    /// NOTE(review): semantics are not visible in this section — confirm
    /// against the sink implementation.
    #[serde(default)]
    pub force_overwrite: bool,
    /// On ClickHouse, `delete_orphans: true` is a NO-OP that emits a single
    /// `tracing::warn!` per process. CH's `ALTER TABLE ... DELETE` is async
    /// and breaks chunkshop's per-document atomic write contract.
    #[serde(default)]
    pub delete_orphans: bool,
    /// Optional engine override. When `None`, the sink emits
    /// `MergeTree() ORDER BY (id)`. To opt into lazy dedup, set
    /// `"ReplacingMergeTree(created_at) ORDER BY (id)"`. Validated against
    /// `CLICKHOUSE_ENGINE_RE` at config-load — a Rust-only hardening relative
    /// to Python which interpolates the field raw.
    #[serde(default)]
    pub engine: Option<String>,
    /// Document-store block; `validate()` rejects it when `enabled` is true.
    #[serde(default)]
    pub documents: Option<DocumentStoreConfig>,
}
1224
1225impl ClickhouseTargetConfig {
1226    fn validate(&self) -> Result<()> {
1227        if self.mode == "append" && self.source_tag.is_none() {
1228            return Err(anyhow!(
1229                "target.mode='append' requires target.source_tag to identify this cell"
1230            ));
1231        }
1232        if let Some(e) = &self.engine {
1233            let re = Regex::new(CLICKHOUSE_ENGINE_RE).unwrap();
1234            if !re.is_match(e) {
1235                return Err(anyhow!(
1236                    "target.engine {e:?} not in allowlist. Accepted shapes: \
1237                     'MergeTree', 'MergeTree()', 'ReplacingMergeTree(<col>)', \
1238                     each optionally followed by ' ORDER BY <expr>'. Custom engines \
1239                     are not supported in v0.4 — file an issue if you need one."
1240                ));
1241            }
1242        }
1243        validate_no_document_store(&self.documents)?;
1244        Ok(())
1245    }
1246}
1247
/// SQLite target. Mirrors Python's `chunkshop.config.SqliteTarget`.
/// `database` is validated as a non-empty ident at config-load (loose parity
/// with Postgres) but ignored at runtime — SQLite has no schema namespace.
/// `target.hnsw=true` is a no-op on SQLite (sqlite-vec is brute-force KNN);
/// the sink emits a one-time process-level warning when set.
#[derive(Debug, Clone, Deserialize)]
pub struct SqliteTargetConfig {
    /// Env var holding the path to the SQLite file (or `:memory:`).
    /// Required — unlike the other targets, this field has no serde default.
    pub dsn_env: String,
    /// YAML key `database`. Checked by `validate_ident` in `load_config`.
    #[serde(rename = "database")]
    pub database_name: String,
    /// Target table name. Checked by `validate_ident` in `load_config`.
    pub table: String,
    /// Legacy bool from 0.3.x — accepted but never preferred. New configs use `mode`.
    #[serde(default)]
    pub overwrite: bool,
    /// HNSW flag. Defaults to `true`.
    #[serde(default = "default_hnsw")]
    pub hnsw: bool,
    /// `overwrite` (default), `append`, or `create_if_missing`.
    #[serde(default = "default_mode")]
    pub mode: String,
    /// Tag identifying this cell. Required when `mode` is `append`.
    #[serde(default)]
    pub source_tag: Option<String>,
    /// Promoted metadata columns; empty when omitted.
    #[serde(default)]
    pub promote_metadata: Vec<PromoteColumn>,
    /// Defaults to `false`.
    /// NOTE(review): semantics are not visible in this section — confirm
    /// against the sink implementation.
    #[serde(default)]
    pub force_overwrite: bool,
    /// Mirror PostgresTargetConfig.delete_orphans. Same per-doc-shrink semantics.
    #[serde(default)]
    pub delete_orphans: bool,
    /// Document-store block; `validate()` rejects it when `enabled` is true.
    #[serde(default)]
    pub documents: Option<DocumentStoreConfig>,
}
1280
1281impl SqliteTargetConfig {
1282    pub(crate) fn validate(&self) -> Result<()> {
1283        if self.mode == "append" && self.source_tag.is_none() {
1284            return Err(anyhow!(
1285                "target.mode='append' requires target.source_tag to identify this cell"
1286            ));
1287        }
1288        validate_no_document_store(&self.documents)?;
1289        Ok(())
1290    }
1291}
1292
/// Serde default for the targets' `dsn_env` field.
fn default_dsn_env() -> String {
    String::from("CHUNKSHOP_DSN")
}

/// Serde default for the targets' `hnsw` field.
fn default_hnsw() -> bool {
    true
}

/// Serde default for `PostgresTargetConfig::vector_metric`.
fn default_vector_metric() -> String {
    String::from("cosine")
}

/// Serde default for the targets' `mode` field.
fn default_mode() -> String {
    String::from("overwrite")
}
1306
1307#[derive(Debug, Clone, Default, Deserialize)]
1308pub struct RuntimeConfig {
1309    #[serde(default)]
1310    pub omp_num_threads: Option<usize>,
1311    #[serde(default)]
1312    pub doc_limit: Option<usize>,
1313    #[serde(default)]
1314    pub log_path: Option<String>,
1315    #[serde(default)]
1316    pub heartbeat_every: Option<usize>,
1317    /// "text" (default) or "json" — controls the CLI's tracing-subscriber
1318    /// formatter. JSON emits one structured event per line for log aggregators.
1319    #[serde(default = "default_log_format")]
1320    pub log_format: String,
1321}
1322
/// Serde default for `RuntimeConfig::log_format`.
fn default_log_format() -> String {
    String::from("text")
}
1326
1327/// Validate identifier against Python's regex: `^[a-z_][a-z0-9_]*$`.
1328fn validate_ident(name: &str, field: &str) -> Result<()> {
1329    let re = Regex::new(r"^[a-z_][a-z0-9_]*$").unwrap();
1330    if !re.is_match(name) {
1331        return Err(anyhow!(
1332            "{field} must match ^[a-z_][a-z0-9_]*$, got {name:?}"
1333        ));
1334    }
1335    Ok(())
1336}
1337
1338/// Pre-deserialize legacy-form rejection (V4-SC-006).
1339///
1340/// Walks the raw YAML for known 0.3.x field/value patterns and emits a
1341/// migration-friendly error when found. Without this pass, serde's default
1342/// errors are cryptic ("unknown variant `pgvector`") or absent (silently
1343/// accepted legacy fields).
1344fn reject_legacy_forms(yaml: &serde_yaml_ng::Value) -> Result<()> {
1345    let target = yaml.get("target").and_then(|v| v.as_mapping());
1346    let Some(target) = target else {
1347        return Ok(()); // No target block; nothing to validate.
1348    };
1349
1350    if let Some(t) = target.get("type").and_then(|v| v.as_str()) {
1351        if t == "pgvector" {
1352            return Err(anyhow!(
1353                "target.type 'pgvector' was renamed to 'postgres' in v0.4.0. Update your YAML."
1354            ));
1355        }
1356    }
1357    if target.get("schema").is_some() {
1358        return Err(anyhow!(
1359            "target.schema was renamed to target.database in v0.4.0. Update your YAML."
1360        ));
1361    }
1362    if let Some(o) = target.get("overwrite") {
1363        if matches!(o.as_bool(), Some(true)) {
1364            return Err(anyhow!(
1365                "target.overwrite: true was replaced by target.mode: 'overwrite' in v0.4.0. \
1366                 Update your YAML."
1367            ));
1368        }
1369    }
1370    Ok(())
1371}
1372
1373pub fn load_config(path: &Path) -> Result<CellConfig> {
1374    let text = std::fs::read_to_string(path)
1375        .with_context(|| format!("reading config {}", path.display()))?;
1376
1377    // V4-SC-006: reject 0.3.x legacy YAML shapes with friendly errors before
1378    // typed deserialization (which would emit cryptic "unknown variant" errors).
1379    let raw_value: serde_yaml_ng::Value = serde_yaml_ng::from_str(&text)
1380        .with_context(|| format!("parsing YAML at {}", path.display()))?;
1381    reject_legacy_forms(&raw_value)?;
1382
1383    let cfg: CellConfig = serde_yaml_ng::from_str(&text)
1384        .with_context(|| format!("parsing YAML {}", path.display()))?;
1385    match &cfg.target {
1386        TargetConfig::Postgres(t) => {
1387            validate_ident(&t.database_name, "target.database")?;
1388            validate_ident(&t.table, "target.table")?;
1389            if let Some(tag) = &t.source_tag {
1390                validate_ident(tag, "target.source_tag")?;
1391            }
1392        }
1393        TargetConfig::Mariadb(t) => {
1394            validate_ident(&t.database_name, "target.database")?;
1395            validate_ident(&t.table, "target.table")?;
1396            if let Some(tag) = &t.source_tag {
1397                validate_ident(tag, "target.source_tag")?;
1398            }
1399        }
1400        TargetConfig::Sqlite(t) => {
1401            validate_ident(&t.database_name, "target.database")?;
1402            validate_ident(&t.table, "target.table")?;
1403            if let Some(tag) = &t.source_tag {
1404                validate_ident(tag, "target.source_tag")?;
1405            }
1406        }
1407        TargetConfig::Clickhouse(t) => {
1408            validate_ident(&t.database_name, "target.database")?;
1409            validate_ident(&t.table, "target.table")?;
1410            if let Some(tag) = &t.source_tag {
1411                validate_ident(tag, "target.source_tag")?;
1412            }
1413        }
1414    }
1415    if let SourceConfig::PgTable(p) = &cfg.source {
1416        validate_ident(&p.schema_name, "source.schema")?;
1417        validate_ident(&p.table, "source.table")?;
1418        validate_ident(&p.id_column, "source.id_column")?;
1419        validate_ident(&p.content_column, "source.content_column")?;
1420        if let Some(tc) = &p.title_column {
1421            validate_ident(tc, "source.title_column")?;
1422        }
1423        // `where_clause` is intentionally NOT validated — see PgTableSourceConfig docstring.
1424    }
1425    if let SourceConfig::MariadbTable(p) = &cfg.source {
1426        validate_ident(&p.database_name, "source.database")?;
1427        validate_ident(&p.table, "source.table")?;
1428        validate_ident(&p.id_column, "source.id_column")?;
1429        validate_ident(&p.content_column, "source.content_column")?;
1430        if let Some(tc) = &p.title_column {
1431            validate_ident(tc, "source.title_column")?;
1432        }
1433        // `where_clause` intentionally NOT validated — same contract as PgTableSourceConfig.
1434    }
1435    if let SourceConfig::SqliteTable(s) = &cfg.source {
1436        validate_ident(&s.database_name, "source.database")?;
1437        validate_ident(&s.table, "source.table")?;
1438        validate_ident(&s.id_column, "source.id_column")?;
1439        validate_ident(&s.content_column, "source.content_column")?;
1440        if let Some(tc) = &s.title_column {
1441            validate_ident(tc, "source.title_column")?;
1442        }
1443        // `where_clause` intentionally NOT validated — same contract as PgTableSourceConfig.
1444    }
1445    if let SourceConfig::ClickhouseTable(p) = &cfg.source {
1446        validate_ident(&p.database_name, "source.database")?;
1447        validate_ident(&p.table, "source.table")?;
1448        validate_ident(&p.id_column, "source.id_column")?;
1449        validate_ident(&p.content_column, "source.content_column")?;
1450        if let Some(tc) = &p.title_column {
1451            validate_ident(tc, "source.title_column")?;
1452        }
1453        for mc in &p.metadata_columns {
1454            validate_ident(mc, "source.metadata_columns")?;
1455        }
1456        // `where_clause` is intentionally NOT validated — see ClickhouseTableSourceConfig docstring.
1457    }
1458    cfg.target.validate()?;
1459    validate_chunker_config(&cfg.chunker)?;
1460    match &cfg.embedder {
1461        EmbedderConfig::Fastembed(e) => e.validate()?,
1462    }
1463    Ok(cfg)
1464}
1465
1466/// Cross-field validation for chunker configs (recursive, walks any
1467/// `Box<ChunkerConfig>` base fields). Mirrors Python's pydantic model
1468/// validators that fire at config-load time.
1469fn validate_chunker_config(c: &ChunkerConfig) -> Result<()> {
1470    // Brief SC-001: `if_oversize` without an effective ceiling is nonsensical
1471    // — there's nothing to compare against. Reject at config-load.
1472    if c.if_oversize().is_some() && c.effective_max_chars().is_none() {
1473        return Err(anyhow!(
1474            "chunker {:?} has `if_oversize` set but no effective `max_chars` ceiling. \
1475             Either set `max_chars` on this chunker (or on its `base` for wrappers), \
1476             or remove `if_oversize`.",
1477            c.type_name()
1478        ));
1479    }
1480    // Recurse into the fallback chunker config so nested chains are validated.
1481    if let Some(nested) = c.if_oversize() {
1482        validate_chunker_config(nested)?;
1483    }
1484    match c {
1485        ChunkerConfig::SentenceAware(_)
1486        | ChunkerConfig::Hierarchy(_)
1487        | ChunkerConfig::FixedOverlap(_)
1488        | ChunkerConfig::Semantic(_) => Ok(()),
1489        ChunkerConfig::NeighborExpand(c) => validate_chunker_config(&c.base),
1490        ChunkerConfig::SummaryEmbed(c) => validate_chunker_config(&c.base),
1491        ChunkerConfig::HierarchicalSummary(c) => {
1492            // Mirror Python's _section_aware_requires_hierarchy_base: when
1493            // grouping is section_aware, the base chunker MUST be hierarchy.
1494            if matches!(c.grouping, GroupingConfig::SectionAware(_)) {
1495                let base_type_name = c.base.type_name();
1496                if base_type_name != "hierarchy" {
1497                    return Err(anyhow!(
1498                        "hierarchical_summary with strategy='section_aware' requires \
1499                         base.type='hierarchy', got {base_type_name:?}"
1500                    ));
1501                }
1502            }
1503            validate_chunker_config(&c.base)
1504        }
1505        ChunkerConfig::Consolidation(c) => validate_chunker_config(&c.base),
1506        #[cfg(feature = "code-aware")]
1507        ChunkerConfig::SymbolAware(_) => Ok(()),
1508    }
1509}
1510
1511#[cfg(test)]
1512mod tests {
1513    use super::*;
1514
1515    fn write_yaml(body: &str) -> std::path::PathBuf {
1516        let path = std::env::temp_dir().join(format!(
1517            "chunkshop-rs-cfg-{}.yaml",
1518            std::time::SystemTime::now()
1519                .duration_since(std::time::UNIX_EPOCH)
1520                .unwrap()
1521                .as_nanos()
1522        ));
1523        std::fs::write(&path, body).unwrap();
1524        path
1525    }
1526
1527    #[test]
1528    fn rejects_append_without_source_tag() {
1529        let yaml = r#"
1530cell_name: t
1531source: { type: files, glob: "x", id_from: stem }
1532chunker: { type: sentence_aware }
1533embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1534target: { type: postgres, dsn_env: D, database: s, table: t, mode: append, hnsw: false }
1535"#;
1536        let path = write_yaml(yaml);
1537        let err = format!("{:#}", load_config(&path).unwrap_err());
1538        assert!(
1539            err.contains("source_tag"),
1540            "expected source_tag mention, got: {err}"
1541        );
1542    }
1543
1544    #[test]
1545    fn rejects_enabled_document_store_until_rust_parity_exists() {
1546        let yaml = r#"
1547cell_name: t
1548source: { type: files, glob: "x", id_from: stem }
1549chunker: { type: sentence_aware }
1550embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1551target:
1552  type: postgres
1553  dsn_env: D
1554  database: s
1555  table: chunks
1556  mode: overwrite
1557  hnsw: false
1558  documents:
1559    enabled: true
1560    table: documents
1561"#;
1562        let path = write_yaml(yaml);
1563        let err = format!("{:#}", load_config(&path).unwrap_err());
1564        assert!(
1565            err.contains("Python/Postgres-only") && err.contains("target.documents"),
1566            "expected document-store parity complaint, got: {err}"
1567        );
1568    }
1569
1570    #[test]
1571    fn rejects_invalid_promote_type() {
1572        let yaml = r#"
1573cell_name: t
1574source: { type: files, glob: "x", id_from: stem }
1575chunker: { type: sentence_aware }
1576embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1577target:
1578  type: postgres
1579  dsn_env: D
1580  database: s
1581  table: t
1582  mode: overwrite
1583  hnsw: false
1584  promote_metadata:
1585    - { path: entities.ORG, type: bogus_type }
1586"#;
1587        let path = write_yaml(yaml);
1588        let err = format!("{:#}", load_config(&path).unwrap_err());
1589        assert!(
1590            err.contains("type"),
1591            "expected promote_metadata type complaint, got: {err}"
1592        );
1593    }
1594
1595    #[test]
1596    fn rejects_invalid_promote_path() {
1597        let yaml = r#"
1598cell_name: t
1599source: { type: files, glob: "x", id_from: stem }
1600chunker: { type: sentence_aware }
1601embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1602target:
1603  type: postgres
1604  dsn_env: D
1605  database: s
1606  table: t
1607  mode: overwrite
1608  hnsw: false
1609  promote_metadata:
1610    - { path: "0entities.ORG", type: text }
1611"#;
1612        let path = write_yaml(yaml);
1613        let err = format!("{:#}", load_config(&path).unwrap_err());
1614        assert!(
1615            err.contains("path"),
1616            "expected promote_metadata path complaint, got: {err}"
1617        );
1618    }
1619
1620    #[test]
1621    fn promote_column_name_lowercases_and_double_underscores() {
1622        let pc: PromoteColumn =
1623            serde_yaml_ng::from_str("{ path: entities.ORG, type: \"text[]\" }").unwrap();
1624        assert_eq!(pc.column_name(), "entities__org");
1625    }
1626
1627    #[test]
1628    fn parses_promote_metadata_into_typed_vec() {
1629        let yaml = r#"
1630cell_name: t
1631source: { type: files, glob: "x", id_from: stem }
1632chunker: { type: sentence_aware }
1633embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1634target:
1635  type: postgres
1636  dsn_env: D
1637  database: s
1638  table: t
1639  mode: overwrite
1640  hnsw: false
1641  promote_metadata:
1642    - { path: heading, type: text }
1643    - { path: entities.ORG, type: "text[]" }
1644"#;
1645        let path = write_yaml(yaml);
1646        let cfg = load_config(&path).expect("load");
1647        let TargetConfig::Postgres(t) = &cfg.target else {
1648            panic!("expected Postgres target");
1649        };
1650        assert_eq!(t.promote_metadata.len(), 2);
1651        assert_eq!(t.promote_metadata[0].path, "heading");
1652        assert_eq!(t.promote_metadata[0].type_, "text");
1653        assert_eq!(t.promote_metadata[1].column_name(), "entities__org");
1654    }
1655
1656    #[test]
1657    fn postgres_target_vector_metric_defaults_and_validates() {
1658        let yaml = r#"
1659cell_name: t
1660source: { type: files, glob: "x", id_from: stem }
1661chunker: { type: sentence_aware }
1662embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1663target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
1664"#;
1665        let path = write_yaml(yaml);
1666        let cfg = load_config(&path).expect("load");
1667        let TargetConfig::Postgres(t) = &cfg.target else {
1668            panic!("expected Postgres target");
1669        };
1670        assert_eq!(t.vector_metric, "cosine");
1671
1672        let yaml = r#"
1673cell_name: t
1674source: { type: files, glob: "x", id_from: stem }
1675chunker: { type: sentence_aware }
1676embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1677target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false, vector_metric: l2 }
1678"#;
1679        let path = write_yaml(yaml);
1680        let cfg = load_config(&path).expect("load");
1681        let TargetConfig::Postgres(t) = &cfg.target else {
1682            panic!("expected Postgres target");
1683        };
1684        assert_eq!(t.vector_metric, "l2");
1685
1686        let yaml = r#"
1687cell_name: t
1688source: { type: files, glob: "x", id_from: stem }
1689chunker: { type: sentence_aware }
1690embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1691target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false, vector_metric: manhattan }
1692"#;
1693        let path = write_yaml(yaml);
1694        let err = format!("{:#}", load_config(&path).unwrap_err());
1695        assert!(
1696            err.contains("vector_metric"),
1697            "expected vector_metric error, got: {err}"
1698        );
1699    }
1700
1701    #[test]
1702    fn rejects_section_aware_without_hierarchy_base() {
1703        let yaml = r#"
1704cell_name: t
1705source: { type: files, glob: "x", id_from: stem }
1706chunker:
1707  type: hierarchical_summary
1708  base: { type: sentence_aware }
1709  summarizer: { mode: passthrough }
1710  grouping: { strategy: section_aware }
1711embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1712target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
1713"#;
1714        let path = write_yaml(yaml);
1715        let err = format!("{:#}", load_config(&path).unwrap_err());
1716        assert!(
1717            err.contains("section_aware") && err.contains("hierarchy"),
1718            "expected section_aware/hierarchy mention, got: {err}"
1719        );
1720    }
1721
1722    #[test]
1723    fn parses_if_oversize_on_every_chunker_variant() {
1724        // Brief SC-001: every chunker variant accepts an optional if_oversize
1725        // pointing at any other chunker config.
1726        for kind in [
1727            "sentence_aware",
1728            "hierarchy",
1729            "fixed_overlap",
1730            "neighbor_expand",
1731            "semantic",
1732            "summary_embed",
1733            "hierarchical_summary",
1734        ] {
1735            let chunker_yaml = match kind {
1736                "sentence_aware" => "{ type: sentence_aware }".to_string(),
1737                "hierarchy" => "{ type: hierarchy }".to_string(),
1738                "fixed_overlap" => "{ type: fixed_overlap, max_chars: 1500 }".to_string(),
1739                "neighbor_expand" => {
1740                    "{ type: neighbor_expand, base: { type: hierarchy } }".to_string()
1741                }
1742                "semantic" => "{ type: semantic }".to_string(),
1743                "summary_embed" => "{ type: summary_embed, base: { type: hierarchy }, summarizer: { mode: passthrough } }".to_string(),
1744                "hierarchical_summary" => "{ type: hierarchical_summary, base: { type: hierarchy }, summarizer: { mode: passthrough } }".to_string(),
1745                _ => unreachable!(),
1746            };
1747            // Inline a minimal cell config and inject if_oversize on the chunker.
1748            let yaml = format!(
1749                r#"
1750cell_name: t
1751source: {{ type: files, glob: "x", id_from: stem }}
1752chunker:
1753  type: {kind}
1754  {extra}
1755  if_oversize:
1756    type: fixed_overlap
1757    window_words: 100
1758    step_words: 100
1759    max_chars: 500
1760embedder: {{ type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }}
1761target: {{ type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }}
1762"#,
1763                kind = kind,
1764                extra = match kind {
1765                    "fixed_overlap" => "max_chars: 1500".to_string(),
1766                    "neighbor_expand" => "base: { type: hierarchy }".to_string(),
1767                    "summary_embed" =>
1768                        "base: { type: hierarchy }\n  summarizer: { mode: passthrough }".to_string(),
1769                    "hierarchical_summary" =>
1770                        "base: { type: hierarchy }\n  summarizer: { mode: passthrough }".to_string(),
1771                    _ => "".to_string(),
1772                }
1773            );
1774            let _ = chunker_yaml; // suppress unused-var lint in this branch
1775            let path = write_yaml(&yaml);
1776            let cfg = load_config(&path).unwrap_or_else(|e| {
1777                panic!("if_oversize on {kind} failed to parse: {e:#}");
1778            });
1779            assert!(
1780                cfg.chunker.if_oversize().is_some(),
1781                "if_oversize missing for {kind}"
1782            );
1783        }
1784    }
1785
1786    #[test]
1787    fn rejects_if_oversize_without_effective_ceiling() {
1788        // Brief SC-001 NEVER: fixed_overlap without max_chars and with
1789        // if_oversize is rejected at config-load — there's nothing to
1790        // compare against.
1791        let yaml = r#"
1792cell_name: t
1793source: { type: files, glob: "x", id_from: stem }
1794chunker:
1795  type: fixed_overlap
1796  window_words: 200
1797  step_words: 100
1798  if_oversize:
1799    type: fixed_overlap
1800    window_words: 100
1801    step_words: 50
1802    max_chars: 500
1803embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1804target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
1805"#;
1806        let path = write_yaml(yaml);
1807        let err = format!("{:#}", load_config(&path).unwrap_err());
1808        assert!(
1809            err.contains("if_oversize") && err.contains("max_chars"),
1810            "expected if_oversize/max_chars complaint, got: {err}"
1811        );
1812    }
1813
1814    #[test]
1815    fn effective_max_chars_falls_through_to_base() {
1816        // Brief SC-003: wrapper without explicit max_chars inherits from base.
1817        let yaml = r#"
1818cell_name: t
1819source: { type: files, glob: "x", id_from: stem }
1820chunker:
1821  type: neighbor_expand
1822  window: 2
1823  base:
1824    type: hierarchy
1825    max_chars: 1234
1826embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1827target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
1828"#;
1829        let path = write_yaml(yaml);
1830        let cfg = load_config(&path).expect("load");
1831        assert_eq!(cfg.chunker.effective_max_chars(), Some(1234));
1832    }
1833
1834    #[test]
1835    fn fixed_overlap_max_chars_is_optional_unset() {
1836        // Brief SC-002: fixed_overlap without max_chars parses and resolves
1837        // to None (legacy word-only behavior).
1838        let yaml = r#"
1839cell_name: t
1840source: { type: files, glob: "x", id_from: stem }
1841chunker: { type: fixed_overlap, window_words: 200, step_words: 100 }
1842embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1843target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
1844"#;
1845        let path = write_yaml(yaml);
1846        let cfg = load_config(&path).expect("load");
1847        assert!(cfg.chunker.effective_max_chars().is_none());
1848    }
1849
1850    #[test]
1851    fn accepts_section_aware_with_hierarchy_base() {
1852        let yaml = r#"
1853cell_name: t
1854source: { type: files, glob: "x", id_from: stem }
1855chunker:
1856  type: hierarchical_summary
1857  base: { type: hierarchy }
1858  summarizer: { mode: passthrough }
1859  grouping: { strategy: section_aware }
1860embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1861target: { type: postgres, dsn_env: D, database: s, table: t, mode: overwrite, hnsw: false }
1862"#;
1863        let path = write_yaml(yaml);
1864        load_config(&path).expect("should accept section_aware over hierarchy base");
1865    }
1866
1867    #[test]
1868    fn parses_sqlite_target_config() {
1869        let yaml = r#"
1870cell_name: t
1871source: { type: files, glob: "x", id_from: stem }
1872chunker: { type: sentence_aware }
1873embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1874target: { type: sqlite, dsn_env: SQLITE_PATH, database: ignored, table: chunks, mode: overwrite, hnsw: false }
1875"#;
1876        let path = write_yaml(yaml);
1877        let cfg = load_config(&path).expect("load");
1878        match &cfg.target {
1879            TargetConfig::Sqlite(t) => {
1880                assert_eq!(t.dsn_env, "SQLITE_PATH");
1881                assert_eq!(t.database_name, "ignored");
1882                assert_eq!(t.table, "chunks");
1883                assert_eq!(t.mode, "overwrite");
1884            }
1885            _ => panic!("expected Sqlite target"),
1886        }
1887    }
1888
1889    #[test]
1890    fn rejects_sqlite_append_without_source_tag() {
1891        let yaml = r#"
1892cell_name: t
1893source: { type: files, glob: "x", id_from: stem }
1894chunker: { type: sentence_aware }
1895embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1896target: { type: sqlite, dsn_env: SQLITE_PATH, database: ignored, table: chunks, mode: append, hnsw: false }
1897"#;
1898        let path = write_yaml(yaml);
1899        let err = format!("{:#}", load_config(&path).unwrap_err());
1900        assert!(
1901            err.contains("source_tag"),
1902            "expected source_tag mention, got: {err}"
1903        );
1904    }
1905
1906    #[test]
1907    fn parses_sqlite_table_source_config() {
1908        let yaml = r#"
1909cell_name: t
1910source:
1911  type: sqlite_table
1912  dsn_env: SQLITE_PATH
1913  database: ignored
1914  table: docs
1915  id_column: id
1916  content_column: body
1917chunker: { type: sentence_aware }
1918embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1919target: { type: sqlite, dsn_env: SQLITE_PATH, database: ignored, table: chunks, mode: overwrite, hnsw: false }
1920"#;
1921        let path = write_yaml(yaml);
1922        let cfg = load_config(&path).expect("load");
1923        match &cfg.source {
1924            SourceConfig::SqliteTable(s) => {
1925                assert_eq!(s.dsn_env, "SQLITE_PATH");
1926                assert_eq!(s.table, "docs");
1927                assert_eq!(s.id_column, "id");
1928            }
1929            _ => panic!("expected SqliteTable source"),
1930        }
1931    }
1932
1933    #[test]
1934    fn parses_clickhouse_target() {
1935        let yaml = r#"
1936cell_name: t
1937source: { type: files, glob: "x", id_from: stem }
1938chunker: { type: sentence_aware }
1939embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1940target:
1941  type: clickhouse
1942  dsn_env: CHUNKSHOP_DSN_CH
1943  database: my_db
1944  table: chunks
1945  mode: overwrite
1946  hnsw: true
1947"#;
1948        let path = write_yaml(yaml);
1949        let cfg = load_config(&path).expect("load");
1950        let TargetConfig::Clickhouse(t) = &cfg.target else {
1951            panic!("expected Clickhouse variant");
1952        };
1953        assert_eq!(t.database_name, "my_db");
1954        assert_eq!(t.table, "chunks");
1955        assert!(t.engine.is_none());
1956    }
1957
1958    #[test]
1959    fn accepts_replacing_merge_tree_engine() {
1960        let yaml = r#"
1961cell_name: t
1962source: { type: files, glob: "x", id_from: stem }
1963chunker: { type: sentence_aware }
1964embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1965target:
1966  type: clickhouse
1967  dsn_env: D
1968  database: db
1969  table: t
1970  mode: overwrite
1971  hnsw: false
1972  engine: "ReplacingMergeTree(created_at) ORDER BY (id)"
1973"#;
1974        let path = write_yaml(yaml);
1975        let cfg = load_config(&path).expect("ReplacingMergeTree should be accepted");
1976        let TargetConfig::Clickhouse(t) = &cfg.target else {
1977            unreachable!()
1978        };
1979        assert_eq!(
1980            t.engine.as_deref(),
1981            Some("ReplacingMergeTree(created_at) ORDER BY (id)")
1982        );
1983    }
1984
1985    #[test]
1986    fn rejects_arbitrary_engine_string() {
1987        let yaml = r#"
1988cell_name: t
1989source: { type: files, glob: "x", id_from: stem }
1990chunker: { type: sentence_aware }
1991embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
1992target:
1993  type: clickhouse
1994  dsn_env: D
1995  database: db
1996  table: t
1997  mode: overwrite
1998  hnsw: false
1999  engine: "Memory"
2000"#;
2001        let path = write_yaml(yaml);
2002        let err = format!("{:#}", load_config(&path).unwrap_err());
2003        assert!(
2004            err.contains("allowlist") && err.contains("Memory"),
2005            "got: {err}"
2006        );
2007    }
2008
2009    #[test]
2010    fn rejects_engine_with_drop_table_injection() {
2011        let yaml = r#"
2012cell_name: t
2013source: { type: files, glob: "x", id_from: stem }
2014chunker: { type: sentence_aware }
2015embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2016target:
2017  type: clickhouse
2018  dsn_env: D
2019  database: db
2020  table: t
2021  mode: overwrite
2022  hnsw: false
2023  engine: "MergeTree(); DROP TABLE other"
2024"#;
2025        let path = write_yaml(yaml);
2026        assert!(
2027            load_config(&path).is_err(),
2028            "engine with embedded DROP must be rejected"
2029        );
2030    }
2031
2032    // --- RM-A Task 1: SessionStagingSourceConfig + MemoryConfig ------------
2033
2034    #[test]
2035    fn session_staging_source_deserialises() {
2036        let yaml = r#"
2037cell_name: m
2038source:
2039  type: session_staging
2040  dsn: "postgresql://localhost/x"
2041  staging_table: chunkshop_staging
2042  staging_schema: public
2043  mode: realtime
2044chunker: { type: sentence_aware }
2045embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2046target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2047"#;
2048        let path = write_yaml(yaml);
2049        let cfg = load_config(&path).expect("load_config should succeed");
2050        match cfg.source {
2051            SourceConfig::SessionStaging(s) => {
2052                assert_eq!(s.staging_table, "chunkshop_staging");
2053                assert_eq!(s.staging_schema, "public");
2054                assert_eq!(s.mode, SessionStagingMode::Realtime);
2055                assert_eq!(s.min_age_seconds, 0);
2056                assert!(s.dsn.is_some());
2057            }
2058            other => panic!("expected SessionStaging variant; got {other:?}"),
2059        }
2060    }
2061
2062    #[test]
2063    fn session_staging_defaults_match_python() {
2064        // Only mode is required; staging_schema/staging_table default.
2065        let yaml = r#"
2066cell_name: m
2067source:
2068  type: session_staging
2069  dsn_env: D
2070  mode: consolidate
2071chunker: { type: sentence_aware }
2072embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2073target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2074"#;
2075        let cfg = load_config(&write_yaml(yaml)).unwrap();
2076        if let SourceConfig::SessionStaging(s) = cfg.source {
2077            assert_eq!(s.staging_table, "chunkshop_staging");
2078            assert_eq!(s.staging_schema, "public");
2079            assert_eq!(s.mode, SessionStagingMode::Consolidate);
2080        } else {
2081            panic!("not session_staging");
2082        }
2083    }
2084
2085    #[test]
2086    fn memory_block_on_postgres_target() {
2087        let yaml = r#"
2088cell_name: m
2089source: { type: files, glob: "x", id_from: stem }
2090chunker: { type: sentence_aware }
2091embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2092target:
2093  type: postgres
2094  dsn_env: D
2095  database: agent_memory
2096  table: memory
2097  mode: create_if_missing
2098  source_tag: ns1
2099  hnsw: false
2100  memory:
2101    tier: consolidated
2102    supersede: true
2103    namespace: ns1
2104"#;
2105        let cfg = load_config(&write_yaml(yaml)).unwrap();
2106        let mem = match cfg.target {
2107            TargetConfig::Postgres(p) => p.memory.expect("memory expected"),
2108            _ => panic!("expected postgres target"),
2109        };
2110        assert_eq!(mem.tier, MemoryTier::Consolidated);
2111        assert!(mem.supersede);
2112        assert_eq!(mem.namespace.as_deref(), Some("ns1"));
2113    }
2114
2115    #[test]
2116    fn memory_block_unknown_field_rejected() {
2117        let yaml = r#"
2118cell_name: m
2119source: { type: files, glob: "x", id_from: stem }
2120chunker: { type: sentence_aware }
2121embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2122target:
2123  type: postgres
2124  dsn_env: D
2125  database: agent_memory
2126  table: memory
2127  mode: create_if_missing
2128  source_tag: ns1
2129  hnsw: false
2130  memory: { tier: consolidated, supersede: true, namespace: ns1, bogus_field: yes }
2131"#;
2132        let err = load_config(&write_yaml(yaml)).expect_err("bogus_field must fail");
2133        let msg = format!("{:#}", err);
2134        assert!(
2135            msg.contains("bogus_field") || msg.contains("unknown"),
2136            "expected unknown-field complaint, got: {msg}"
2137        );
2138    }
2139
2140    // --- RM-A Task 2: SessionEpisodeFramerConfig --------------------------
2141
2142    #[test]
2143    fn session_episode_framer_deserialises_with_defaults() {
2144        let yaml = r#"
2145cell_name: m
2146source: { type: files, glob: "x", id_from: stem }
2147framer: { type: session_episode }
2148chunker: { type: sentence_aware }
2149embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2150target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2151"#;
2152        let cfg = load_config(&write_yaml(yaml)).unwrap();
2153        match cfg.framer {
2154            FramerConfig::SessionEpisode(f) => {
2155                assert_eq!(f.max_gap_seconds, 1800); // python default
2156                assert_eq!(f.max_turns, 40);
2157                assert_eq!(f.max_words, 1200);
2158                assert!(f.boundary_on_tool);
2159            }
2160            other => panic!("expected SessionEpisode framer; got {other:?}"),
2161        }
2162    }
2163
2164    #[test]
2165    fn session_episode_framer_overrides_apply() {
2166        let yaml = r#"
2167cell_name: m
2168source: { type: files, glob: "x", id_from: stem }
2169framer:
2170  type: session_episode
2171  max_gap_seconds: 600
2172  max_turns: 20
2173  max_words: 500
2174  boundary_on_tool: false
2175chunker: { type: sentence_aware }
2176embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2177target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2178"#;
2179        let cfg = load_config(&write_yaml(yaml)).unwrap();
2180        if let FramerConfig::SessionEpisode(f) = cfg.framer {
2181            assert_eq!(f.max_gap_seconds, 600);
2182            assert_eq!(f.max_turns, 20);
2183            assert_eq!(f.max_words, 500);
2184            assert!(!f.boundary_on_tool);
2185        } else {
2186            panic!("not session_episode");
2187        }
2188    }
2189
2190    // --- RM-A Task 3: ConsolidationChunkerConfig --------------------------
2191
2192    #[test]
2193    fn consolidation_chunker_deserialises() {
2194        let yaml = r#"
2195cell_name: m
2196source: { type: files, glob: "x", id_from: stem }
2197framer: { type: session_episode }
2198chunker:
2199  type: consolidation
2200  base:
2201    type: sentence_aware
2202    max_chars: 2000
2203    min_chars: 200
2204  consolidator:
2205    mode: extractive
2206  fact_max_chars: 1200
2207embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2208target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2209"#;
2210        let cfg = load_config(&write_yaml(yaml)).unwrap();
2211        match cfg.chunker {
2212            ChunkerConfig::Consolidation(c) => {
2213                assert_eq!(c.fact_max_chars, 1200);
2214                assert!(matches!(*c.base, ChunkerConfig::SentenceAware(_)));
2215                assert!(matches!(c.consolidator, ConsolidatorConfig::Extractive(_)));
2216            }
2217            other => panic!("expected Consolidation; got {other:?}"),
2218        }
2219    }
2220
2221    #[test]
2222    fn consolidation_chunker_default_fact_max_chars() {
2223        let yaml = r#"
2224cell_name: m
2225source: { type: files, glob: "x", id_from: stem }
2226chunker:
2227  type: consolidation
2228  base: { type: sentence_aware }
2229  consolidator: { mode: extractive }
2230embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2231target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2232"#;
2233        let cfg = load_config(&write_yaml(yaml)).unwrap();
2234        if let ChunkerConfig::Consolidation(c) = cfg.chunker {
2235            assert_eq!(c.fact_max_chars, 1200); // python parity default
2236            assert!(matches!(*c.base, ChunkerConfig::SentenceAware(_)));
2237        } else {
2238            panic!("not consolidation");
2239        }
2240    }
2241
2242    #[test]
2243    fn consolidation_chunker_unknown_field_rejected() {
2244        let yaml = r#"
2245cell_name: m
2246source: { type: files, glob: "x", id_from: stem }
2247chunker:
2248  type: consolidation
2249  base: { type: sentence_aware }
2250  consolidator: { mode: extractive }
2251  bogus_consolidation_field: yes
2252embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2253target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2254"#;
2255        let err = load_config(&write_yaml(yaml)).expect_err("bogus must fail");
2256        let msg = format!("{:#}", err);
2257        assert!(
2258            msg.contains("bogus") || msg.contains("unknown"),
2259            "expected unknown-field complaint, got: {msg}"
2260        );
2261    }
2262
2263    #[test]
2264    fn consolidator_unknown_mode_rejected() {
2265        let yaml = r#"
2266cell_name: m
2267source: { type: files, glob: "x", id_from: stem }
2268chunker:
2269  type: consolidation
2270  base: { type: sentence_aware }
2271  consolidator: { mode: callable }
2272embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2273target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2274"#;
2275        // `mode: callable` is Python-side only; Rust accepts `extractive` (and
2276        // future `llm`) but not `callable`. Reject cleanly.
2277        let err = load_config(&write_yaml(yaml)).expect_err("callable must fail in rust");
2278        let msg = format!("{:#}", err);
2279        assert!(
2280            msg.contains("callable") || msg.contains("variant") || msg.contains("unknown"),
2281            "expected mode-unknown complaint, got: {msg}"
2282        );
2283    }
2284
2285    #[test]
2286    fn session_episode_framer_unknown_field_rejected() {
2287        let yaml = r#"
2288cell_name: m
2289source: { type: files, glob: "x", id_from: stem }
2290framer: { type: session_episode, bogus: 1 }
2291chunker: { type: sentence_aware }
2292embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2293target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2294"#;
2295        let err = load_config(&write_yaml(yaml)).expect_err("bogus must fail");
2296        let msg = format!("{:#}", err);
2297        assert!(
2298            msg.contains("bogus") || msg.contains("unknown"),
2299            "expected unknown-field complaint, got: {msg}"
2300        );
2301    }
2302
2303    #[test]
2304    fn session_staging_unknown_field_rejected() {
2305        let yaml = r#"
2306cell_name: m
2307source:
2308  type: session_staging
2309  dsn_env: D
2310  mode: realtime
2311  bogus: 1
2312chunker: { type: sentence_aware }
2313embedder: { type: fastembed, model_name: BAAI/bge-base-en-v1.5, dim: 768 }
2314target: { type: postgres, dsn_env: D, database: agent_memory, table: memory, mode: create_if_missing, source_tag: ns1, hnsw: false }
2315"#;
2316        let err = load_config(&write_yaml(yaml)).expect_err("bogus must fail");
2317        let msg = format!("{:#}", err);
2318        assert!(
2319            msg.contains("bogus") || msg.contains("unknown"),
2320            "expected unknown-field complaint, got: {msg}"
2321        );
2322    }
2323}
chunkshop/config.rs

chunkshop/
config.rs