// rust_memex/rag/provider.rs
1use anyhow::{Result, anyhow};
2use chrono::{TimeZone, Utc};
3use serde::{Deserialize, Serialize};
4use serde_json::{Value, json};
5use std::future::Future;
6use std::path::Path;
7use std::pin::Pin;
8
9use crate::rag::{
10    OnionSlice, OnionSliceConfig, OuterSynthesis, SliceMode, create_onion_slices_async,
11    create_onion_slices_fast_async,
12};
13use crate::rag::{compute_content_hash, pipeline::Chunk, pipeline::FileContent};
14
/// Boxed, sendable future returned by [`ChunkProvider::chunk`]; borrows the
/// provider, document, and options for the lifetime `'a`.
pub type ChunkProviderFuture<'a> = Pin<Box<dyn Future<Output = Result<Vec<Chunk>>> + Send + 'a>>;
16
/// Options shared by all chunk providers.
#[derive(Debug, Clone)]
pub struct ChunkOpts {
    // Which chunking strategy to run.
    pub chunker: ChunkerKind,
    // Requested slicing mode; providers may coerce it (see `ChunkerKind::slice_mode`).
    pub slice_mode: SliceMode,
    // Outer-layer synthesis settings forwarded to the onion slicer.
    pub outer_synthesis: OuterSynthesis,
    // Window size, in chars, used by the flat chunker (default 512).
    pub flat_window: usize,
    // Overlap, in chars, between consecutive flat chunks (default 128).
    pub flat_overlap: usize,
}
26
27impl ChunkOpts {
28    pub fn new(
29        chunker: ChunkerKind,
30        slice_mode: SliceMode,
31        outer_synthesis: OuterSynthesis,
32    ) -> Self {
33        Self {
34            chunker,
35            slice_mode,
36            outer_synthesis,
37            flat_window: 512,
38            flat_overlap: 128,
39        }
40    }
41}
42
/// Trait abstracting chunk production strategies.
pub trait ChunkProvider: Send + Sync {
    /// Short stable identifier for this provider ("aicx", "onion", "flat").
    fn name(&self) -> &'static str;
    /// Asynchronously splits `doc` into chunks according to `opts`.
    fn chunk<'a>(&'a self, doc: &'a FileContent, opts: &'a ChunkOpts) -> ChunkProviderFuture<'a>;
}
48
/// Chunker strategy exposed through CLI/API routing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[cfg_attr(feature = "cli", derive(clap::ValueEnum))]
#[serde(rename_all = "kebab-case")]
pub enum ChunkerKind {
    // Frame-aware transcript chunker backed by `aicx-parser`.
    Aicx,
    // Layered onion slicing.
    Onion,
    // Fixed-size sliding window with overlap.
    Flat,
}
58
59impl ChunkerKind {
60    pub fn name(self) -> &'static str {
61        match self {
62            Self::Aicx => "aicx",
63            Self::Onion => "onion",
64            Self::Flat => "flat",
65        }
66    }
67
68    pub fn into_provider(self) -> Box<dyn ChunkProvider> {
69        match self {
70            Self::Aicx => Box::new(AicxChunkProvider::default()),
71            Self::Onion => Box::new(OnionChunkProvider),
72            Self::Flat => Box::new(FlatChunkProvider {
73                window: 512,
74                overlap: 128,
75            }),
76        }
77    }
78
79    pub fn slice_mode(self, requested: SliceMode) -> SliceMode {
80        match self {
81            Self::Aicx | Self::Flat => SliceMode::Flat,
82            Self::Onion => match requested {
83                SliceMode::OnionFast => SliceMode::OnionFast,
84                _ => SliceMode::Onion,
85            },
86        }
87    }
88}
89
90impl std::str::FromStr for ChunkerKind {
91    type Err = String;
92
93    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
94        match s.to_ascii_lowercase().as_str() {
95            "aicx" => Ok(Self::Aicx),
96            "onion" => Ok(Self::Onion),
97            "flat" => Ok(Self::Flat),
98            other => Err(format!(
99                "Invalid chunker: '{}'. Use 'aicx', 'onion', or 'flat'",
100                other
101            )),
102        }
103    }
104}
105
106impl std::fmt::Display for ChunkerKind {
107    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
108        f.write_str(self.name())
109    }
110}
111
/// Existing onion-slicer logic. Deprecated for transcript-shaped content.
pub struct OnionChunkProvider;

/// Frame-aware transcript chunker via `aicx-parser`.
#[derive(Default)]
pub struct AicxChunkProvider {
    // Parser configuration forwarded to `aicx_parser::chunker::chunk_entries`.
    config: aicx_parser::ChunkerConfig,
}

/// Simple fixed-size sliding window with configurable overlap.
pub struct FlatChunkProvider {
    // Window size in chars; acts as a floor over `ChunkOpts::flat_window`.
    window: usize,
    // Overlap in chars; clamped below the effective window at chunk time.
    overlap: usize,
}
126
127impl ChunkProvider for OnionChunkProvider {
128    fn name(&self) -> &'static str {
129        "onion"
130    }
131
132    fn chunk<'a>(&'a self, doc: &'a FileContent, opts: &'a ChunkOpts) -> ChunkProviderFuture<'a> {
133        Box::pin(async move {
134            let metadata = base_metadata(doc, opts.chunker, opts.slice_mode);
135            let config = OnionSliceConfig {
136                outer_synthesis: opts.outer_synthesis.clone(),
137                ..OnionSliceConfig::default()
138            };
139            let slices = match opts.slice_mode {
140                SliceMode::OnionFast => {
141                    create_onion_slices_fast_async(&doc.text, &metadata, &config).await
142                }
143                SliceMode::Onion | SliceMode::Flat => {
144                    create_onion_slices_async(&doc.text, &metadata, &config).await
145                }
146            };
147
148            Ok(slices_to_chunks(slices, doc, metadata))
149        })
150    }
151}
152
153impl ChunkProvider for AicxChunkProvider {
154    fn name(&self) -> &'static str {
155        "aicx"
156    }
157
158    fn chunk<'a>(&'a self, doc: &'a FileContent, opts: &'a ChunkOpts) -> ChunkProviderFuture<'a> {
159        Box::pin(async move {
160            let entries = doc_to_timeline_entries(doc)?;
161            let project = project_label(&doc.namespace);
162            let agent = first_agent(&entries).unwrap_or("rust-memex");
163            let aicx_chunks =
164                aicx_parser::chunker::chunk_entries(&entries, &project, agent, &self.config);
165            Ok(aicx_chunks
166                .into_iter()
167                .map(|chunk| memex_chunk_from_aicx(chunk, doc, opts))
168                .collect())
169        })
170    }
171}
172
impl ChunkProvider for FlatChunkProvider {
    fn name(&self) -> &'static str {
        "flat"
    }

    /// Splits `doc.text` into fixed-size sliding windows with overlap.
    fn chunk<'a>(&'a self, doc: &'a FileContent, opts: &'a ChunkOpts) -> ChunkProviderFuture<'a> {
        Box::pin(async move {
            // Effective window: the larger of the requested and configured
            // sizes, never below 1.
            // NOTE(review): self.window acts as a *floor*, not a default —
            // opts.flat_window cannot shrink it; confirm this is intended.
            let window = opts.flat_window.max(1).max(self.window);
            // Effective overlap: the larger of the requested and configured
            // overlaps, each clamped below the window so chunking advances.
            let overlap = opts
                .flat_overlap
                .min(window.saturating_sub(1))
                .max(self.overlap.min(window.saturating_sub(1)));
            // Flat chunking always records SliceMode::Flat in the metadata,
            // regardless of the requested mode.
            let metadata = base_metadata(doc, opts.chunker, SliceMode::Flat);
            Ok(create_flat_chunks(
                &doc.text, doc, metadata, window, overlap,
            ))
        })
    }
}
192
193pub fn detect_default_chunker(source_path: &Path, namespace: &str) -> ChunkerKind {
194    let path_str = source_path.to_string_lossy();
195    if namespace.starts_with("kb:transcripts")
196        || namespace.starts_with("aicx")
197        || namespace.starts_with("klaudiusz-")
198        || path_str.contains("__clean.md")
199        || path_str.contains("/.aicx/store/")
200        || path_str.contains("/transcripts/")
201    {
202        ChunkerKind::Aicx
203    } else {
204        ChunkerKind::Onion
205    }
206}
207
208fn base_metadata(doc: &FileContent, chunker: ChunkerKind, slice_mode: SliceMode) -> Value {
209    let mut metadata = json!({
210        "path": doc.path.to_str(),
211        "content_hash": &doc.content_hash,
212        "source_hash": &doc.content_hash,
213        "chunker": chunker.name(),
214        "slice_mode": match slice_mode {
215            SliceMode::Onion => "onion",
216            SliceMode::OnionFast => "onion-fast",
217            SliceMode::Flat => "flat",
218        },
219    });
220
221    if chunker == ChunkerKind::Aicx
222        && let Value::Object(ref mut map) = metadata
223    {
224        map.insert("format".to_string(), json!("markdown_transcript"));
225        map.insert("type".to_string(), json!("conversation"));
226    }
227
228    metadata
229}
230
231fn slices_to_chunks(
232    slices: Vec<OnionSlice>,
233    content: &FileContent,
234    base_metadata: Value,
235) -> Vec<Chunk> {
236    slices
237        .into_iter()
238        .map(|slice| {
239            let chunk_hash = compute_content_hash(&slice.content);
240            let mut metadata = base_metadata.clone();
241            if let Value::Object(ref mut map) = metadata {
242                map.insert("chunk_hash".to_string(), json!(&chunk_hash));
243                map.insert("layer".to_string(), json!(slice.layer.name()));
244                map.insert("keywords".to_string(), json!(slice.keywords));
245            }
246
247            Chunk {
248                id: slice.id,
249                content: slice.content,
250                source_path: content.path.clone(),
251                namespace: content.namespace.clone(),
252                chunk_hash,
253                source_hash: content.content_hash.clone(),
254                layer: slice.layer.as_u8(),
255                parent_id: slice.parent_id,
256                children_ids: slice.children_ids,
257                keywords: slice.keywords,
258                metadata,
259            }
260        })
261        .collect()
262}
263
264fn create_flat_chunks(
265    text: &str,
266    content: &FileContent,
267    base_metadata: Value,
268    window: usize,
269    overlap: usize,
270) -> Vec<Chunk> {
271    let chunks = split_into_chunks(text, window, overlap);
272    let total_chunks = chunks.len();
273
274    chunks
275        .into_iter()
276        .enumerate()
277        .map(|(idx, chunk_text)| {
278            let chunk_hash = compute_content_hash(&chunk_text);
279            let mut metadata = base_metadata.clone();
280            if let Value::Object(ref mut map) = metadata {
281                map.insert("chunk_index".to_string(), json!(idx));
282                map.insert("total_chunks".to_string(), json!(total_chunks));
283                map.insert("chunk_hash".to_string(), json!(&chunk_hash));
284            }
285
286            let id = format!(
287                "{}_{}_{}",
288                content.path.to_str().unwrap_or("unknown"),
289                content.content_hash.get(..8).unwrap_or(""),
290                idx
291            );
292
293            Chunk {
294                id,
295                content: chunk_text,
296                source_path: content.path.clone(),
297                namespace: content.namespace.clone(),
298                chunk_hash,
299                source_hash: content.content_hash.clone(),
300                layer: 0,
301                parent_id: None,
302                children_ids: vec![],
303                keywords: vec![],
304                metadata,
305            }
306        })
307        .collect()
308}
309
/// Splits `text` into windows of `target_size` chars, each sharing `overlap`
/// trailing chars with the next window. Windows are cut on char boundaries,
/// so multi-byte UTF-8 text is never split mid-character.
///
/// Returns the whole text as a single chunk when it fits in one window.
pub(crate) fn split_into_chunks(text: &str, target_size: usize, overlap: usize) -> Vec<String> {
    // Guard against degenerate parameters that previously looped forever:
    // a zero-sized window never advances, and overlap >= window makes
    // `end - overlap <= start`. Valid inputs are unaffected by the clamps.
    let target_size = target_size.max(1);
    let overlap = overlap.min(target_size - 1);

    // Byte offset of every char boundary, so slicing below is char-safe.
    let mut char_offsets: Vec<usize> = text.char_indices().map(|(byte_idx, _)| byte_idx).collect();
    let len = char_offsets.len();

    if len <= target_size {
        return vec![text.to_string()];
    }

    // Sentinel past-the-end offset so `char_offsets[end]` is valid at end == len.
    char_offsets.push(text.len());

    let mut chunks = Vec::new();
    let mut start = 0;

    while start < len {
        let end = (start + target_size).min(len);
        chunks.push(text[char_offsets[start]..char_offsets[end]].to_string());

        if end >= len {
            break;
        }

        // Step back `overlap` chars so consecutive chunks share context.
        // Always advances: overlap < target_size, so end - overlap > start.
        start = end.saturating_sub(overlap);
    }

    chunks
}
338
/// Converts a markdown transcript into `aicx_parser` timeline entries by
/// splitting on role headings (see `role_heading`).
///
/// All entries share a fixed epoch timestamp (the source carries no timing
/// info) and a session id derived from the content hash. Text appearing
/// before the first heading is prepended to the first entry. If no headings
/// are found, the whole document becomes a single assistant entry.
fn doc_to_timeline_entries(doc: &FileContent) -> Result<Vec<aicx_parser::TimelineEntry>> {
    let timestamp = Utc
        .timestamp_opt(0, 0)
        .single()
        .ok_or_else(|| anyhow!("failed to construct epoch timestamp"))?;
    // Deterministic session id: first 12 chars of the content hash.
    let session_id = compute_content_hash(&doc.text)
        .get(..12)
        .unwrap_or("session")
        .to_string();
    let cwd = doc
        .path
        .parent()
        .and_then(Path::to_str)
        .map(ToString::to_string);
    let preamble = extract_preamble(&doc.text);
    let mut entries = Vec::new();
    // Accumulator for the current role's message body, flushed at each heading.
    let mut current_role: Option<String> = None;
    let mut current_body = String::new();

    for line in doc.text.lines() {
        if let Some(role) = role_heading(line) {
            // A new heading closes out the previous role's accumulated body.
            push_entry(
                &mut entries,
                current_role.take(),
                &mut current_body,
                timestamp,
                &session_id,
                cwd.clone(),
            );
            current_role = Some(role.to_string());
        } else {
            if !current_body.is_empty() {
                current_body.push('\n');
            }
            current_body.push_str(line);
        }
    }

    // Flush the trailing entry after the last line.
    push_entry(
        &mut entries,
        current_role,
        &mut current_body,
        timestamp,
        &session_id,
        cwd,
    );

    // Re-attach the pre-heading preamble to the first entry, unless that
    // entry already starts with a "---" divider (presumably front matter —
    // TODO confirm intent of this guard).
    if let Some(preamble) = preamble
        && let Some(first) = entries.first_mut()
        && !first.message.starts_with("---")
    {
        first.message = format!("{}\n\n{}", preamble.trim_end(), first.message.trim_start());
    }

    // No headings at all: treat the whole document as one assistant reply.
    if entries.is_empty() {
        entries.push(aicx_parser::TimelineEntry {
            timestamp,
            agent: "rust-memex".to_string(),
            session_id,
            role: "assistant".to_string(),
            message: doc.text.clone(),
            frame_kind: Some(aicx_parser::FrameKind::AgentReply),
            branch: None,
            cwd: doc
                .path
                .parent()
                .and_then(Path::to_str)
                .map(ToString::to_string),
        });
    }

    Ok(entries)
}
412
413fn extract_preamble(text: &str) -> Option<String> {
414    let mut lines = Vec::new();
415    for line in text.lines() {
416        if role_heading(line).is_some() {
417            break;
418        }
419        lines.push(line);
420    }
421    let preamble = lines.join("\n");
422    (!preamble.trim().is_empty()).then_some(preamble)
423}
424
425fn push_entry(
426    entries: &mut Vec<aicx_parser::TimelineEntry>,
427    role: Option<String>,
428    body: &mut String,
429    timestamp: chrono::DateTime<Utc>,
430    session_id: &str,
431    cwd: Option<String>,
432) {
433    let Some(role) = role else {
434        body.clear();
435        return;
436    };
437    let message = body.trim().to_string();
438    body.clear();
439    if message.is_empty() {
440        return;
441    }
442    let frame_kind = match role.as_str() {
443        "user" => Some(aicx_parser::FrameKind::UserMsg),
444        "assistant" => Some(aicx_parser::FrameKind::AgentReply),
445        "tool" => Some(aicx_parser::FrameKind::ToolCall),
446        _ => None,
447    };
448    entries.push(aicx_parser::TimelineEntry {
449        timestamp,
450        agent: "rust-memex".to_string(),
451        session_id: session_id.to_string(),
452        role,
453        message,
454        frame_kind,
455        branch: None,
456        cwd,
457    });
458}
459
/// Recognizes a transcript role heading, case-insensitively and ignoring
/// surrounding whitespace. Returns the canonical role name.
fn role_heading(line: &str) -> Option<&'static str> {
    const USER: [&str; 4] = ["## user", "### user", "[user]", "user request:"];
    const ASSISTANT: [&str; 4] = [
        "## assistant",
        "### assistant",
        "[assistant]",
        "assistant response:",
    ];
    const TOOL: [&str; 5] = ["## tool", "### tool", "[tool]", "tool:", "tool result:"];

    let lowered = line.trim().to_ascii_lowercase();
    let key = lowered.as_str();

    if USER.contains(&key) {
        Some("user")
    } else if ASSISTANT.contains(&key) {
        Some("assistant")
    } else if TOOL.contains(&key) {
        Some("tool")
    } else {
        None
    }
}
471
472fn first_agent(entries: &[aicx_parser::TimelineEntry]) -> Option<&str> {
473    entries.first().map(|entry| entry.agent.as_str())
474}
475
/// Sanitizes a namespace into a project label: ASCII alphanumerics and
/// `-`, `_`, `.` pass through; every other char becomes `_`.
fn project_label(namespace: &str) -> String {
    let sanitize = |ch: char| {
        if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' || ch == '.' {
            ch
        } else {
            '_'
        }
    };
    namespace.chars().map(sanitize).collect()
}
488
/// Converts an `aicx_parser::Chunk` into a memex [`Chunk`], carrying the
/// parser's provenance fields through as `aicx_*` metadata keys.
fn memex_chunk_from_aicx(
    chunk: aicx_parser::Chunk,
    content: &FileContent,
    opts: &ChunkOpts,
) -> Chunk {
    let chunk_hash = compute_content_hash(&chunk.text);
    // Namespace chunk ids under a short source-hash prefix so ids stay
    // distinct across documents.
    let id_prefix = content.content_hash.get(..8).unwrap_or("aicx");
    // aicx output is flat, so metadata always records SliceMode::Flat
    // regardless of the requested mode.
    let mut metadata = base_metadata(content, opts.chunker, SliceMode::Flat);
    if let Value::Object(ref mut map) = metadata {
        map.insert("aicx_chunk_id".to_string(), json!(&chunk.id));
        map.insert("aicx_project".to_string(), json!(&chunk.project));
        map.insert("aicx_agent".to_string(), json!(&chunk.agent));
        map.insert("aicx_date".to_string(), json!(&chunk.date));
        map.insert("aicx_session_id".to_string(), json!(&chunk.session_id));
        map.insert("aicx_kind".to_string(), json!(chunk.kind));
        map.insert("aicx_frame_kind".to_string(), json!(chunk.frame_kind));
        map.insert("aicx_msg_start".to_string(), json!(chunk.msg_range.0));
        map.insert("aicx_msg_end".to_string(), json!(chunk.msg_range.1));
        map.insert("token_estimate".to_string(), json!(chunk.token_estimate));
        map.insert("highlights".to_string(), json!(&chunk.highlights));
        map.insert("chunk_hash".to_string(), json!(&chunk_hash));
        // Optional provenance fields are only recorded when present.
        if let Some(cwd) = &chunk.cwd {
            map.insert("aicx_cwd".to_string(), json!(cwd));
        }
        if let Some(run_id) = &chunk.run_id {
            map.insert("run_id".to_string(), json!(run_id));
        }
        if let Some(prompt_id) = &chunk.prompt_id {
            map.insert("prompt_id".to_string(), json!(prompt_id));
        }
        if let Some(model) = &chunk.agent_model {
            map.insert("agent_model".to_string(), json!(model));
        }
    }

    Chunk {
        id: format!("{id_prefix}::{}", chunk.id),
        content: chunk.text,
        source_path: content.path.clone(),
        namespace: content.namespace.clone(),
        chunk_hash,
        source_hash: content.content_hash.clone(),
        // aicx chunks are flat: no layering or parent/child links.
        layer: 0,
        parent_id: None,
        children_ids: vec![],
        keywords: chunk.highlights,
        metadata,
    }
}
538
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // Builds an in-memory FileContent fixture for the given text/namespace/path.
    fn doc(text: &str, namespace: &str, path: &str) -> FileContent {
        FileContent {
            path: PathBuf::from(path),
            text: text.to_string(),
            namespace: namespace.to_string(),
            content_hash: compute_content_hash(text),
        }
    }

    #[test]
    fn default_chunker_routes_transcripts_to_aicx() {
        // Paths under /transcripts/ route to the aicx chunker...
        assert_eq!(
            detect_default_chunker(Path::new("/tmp/transcripts/sample.md"), "kb:any"),
            ChunkerKind::Aicx
        );
        // ...and everything else defaults to the onion slicer.
        assert_eq!(
            detect_default_chunker(Path::new("/tmp/readme.md"), "kb:docs"),
            ChunkerKind::Onion
        );
    }

    #[tokio::test]
    async fn aicx_provider_chunks_markdown_transcript() {
        let doc = doc(
            "## user\nWhat is the meaning of life?\n\n## assistant\nBuild something useful.\n",
            "kb:transcripts-test",
            "/tmp/sample-transcript.md",
        );
        let opts = ChunkOpts::new(
            ChunkerKind::Aicx,
            SliceMode::Flat,
            OuterSynthesis::default(),
        );
        let chunks = AicxChunkProvider::default()
            .chunk(&doc, &opts)
            .await
            .unwrap();

        // The two-turn transcript must yield at least one chunk tagged "aicx".
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].metadata["chunker"], "aicx");
        assert!(chunks[0].content.contains("meaning of life"));
    }

    #[test]
    fn split_into_chunks_preserves_overlap() {
        let chunks = split_into_chunks("abcdefghijklmnopqrstuvwxyz", 10, 3);

        assert!(chunks.len() > 1);
        assert_eq!(chunks[0].len(), 10);
        // Each chunk's tail is repeated as the next chunk's head.
        assert!(chunks[0].ends_with(&chunks[1][..3]));
    }
}
595}