1use anyhow::{Result, anyhow};
2use chrono::{TimeZone, Utc};
3use serde::{Deserialize, Serialize};
4use serde_json::{Value, json};
5use std::future::Future;
6use std::path::Path;
7use std::pin::Pin;
8
9use crate::rag::{
10 OnionSlice, OnionSliceConfig, OuterSynthesis, SliceMode, create_onion_slices_async,
11 create_onion_slices_fast_async,
12};
13use crate::rag::{compute_content_hash, pipeline::Chunk, pipeline::FileContent};
14
/// Boxed future returned by [`ChunkProvider::chunk`]; borrows the document
/// and options for the lifetime of the chunking call and is `Send` so it can
/// run on a multi-threaded executor.
pub type ChunkProviderFuture<'a> = Pin<Box<dyn Future<Output = Result<Vec<Chunk>>> + Send + 'a>>;
16
/// Options controlling how a document is chunked.
#[derive(Debug, Clone)]
pub struct ChunkOpts {
    /// Which chunker implementation to use.
    pub chunker: ChunkerKind,
    /// Requested slicing strategy; individual chunkers may override it
    /// (see `ChunkerKind::slice_mode`).
    pub slice_mode: SliceMode,
    /// Outer-layer synthesis configuration forwarded to the onion slicer.
    pub outer_synthesis: OuterSynthesis,
    /// Window size (in characters) for flat chunking.
    pub flat_window: usize,
    /// Overlap (in characters) between adjacent flat chunks.
    pub flat_overlap: usize,
}
26
27impl ChunkOpts {
28 pub fn new(
29 chunker: ChunkerKind,
30 slice_mode: SliceMode,
31 outer_synthesis: OuterSynthesis,
32 ) -> Self {
33 Self {
34 chunker,
35 slice_mode,
36 outer_synthesis,
37 flat_window: 512,
38 flat_overlap: 128,
39 }
40 }
41}
42
/// A chunking strategy: turns one document into a list of chunks.
///
/// Implementations must be `Send + Sync` so a boxed provider can be shared
/// across async tasks.
pub trait ChunkProvider: Send + Sync {
    /// Short stable identifier for this provider (used in logs/metadata).
    fn name(&self) -> &'static str;
    /// Chunks `doc` according to `opts`; the returned future borrows both
    /// arguments for `'a`.
    fn chunk<'a>(&'a self, doc: &'a FileContent, opts: &'a ChunkOpts) -> ChunkProviderFuture<'a>;
}
48
/// The available chunker implementations.
///
/// Serialized (and, behind the `cli` feature, exposed to clap) in
/// kebab-case: `aicx`, `onion`, `flat`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[cfg_attr(feature = "cli", derive(clap::ValueEnum))]
#[serde(rename_all = "kebab-case")]
pub enum ChunkerKind {
    /// Conversation-aware chunking via the `aicx_parser` crate.
    Aicx,
    /// Layered "onion" semantic slicing.
    Onion,
    /// Fixed-size character windows with overlap.
    Flat,
}
58
59impl ChunkerKind {
60 pub fn name(self) -> &'static str {
61 match self {
62 Self::Aicx => "aicx",
63 Self::Onion => "onion",
64 Self::Flat => "flat",
65 }
66 }
67
68 pub fn into_provider(self) -> Box<dyn ChunkProvider> {
69 match self {
70 Self::Aicx => Box::new(AicxChunkProvider::default()),
71 Self::Onion => Box::new(OnionChunkProvider),
72 Self::Flat => Box::new(FlatChunkProvider {
73 window: 512,
74 overlap: 128,
75 }),
76 }
77 }
78
79 pub fn slice_mode(self, requested: SliceMode) -> SliceMode {
80 match self {
81 Self::Aicx | Self::Flat => SliceMode::Flat,
82 Self::Onion => match requested {
83 SliceMode::OnionFast => SliceMode::OnionFast,
84 _ => SliceMode::Onion,
85 },
86 }
87 }
88}
89
90impl std::str::FromStr for ChunkerKind {
91 type Err = String;
92
93 fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
94 match s.to_ascii_lowercase().as_str() {
95 "aicx" => Ok(Self::Aicx),
96 "onion" => Ok(Self::Onion),
97 "flat" => Ok(Self::Flat),
98 other => Err(format!(
99 "Invalid chunker: '{}'. Use 'aicx', 'onion', or 'flat'",
100 other
101 )),
102 }
103 }
104}
105
106impl std::fmt::Display for ChunkerKind {
107 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
108 f.write_str(self.name())
109 }
110}
111
/// Chunk provider backed by the onion slicing pipeline.
pub struct OnionChunkProvider;
114
/// Chunk provider that parses documents as conversation transcripts and
/// delegates to the `aicx_parser` chunker.
#[derive(Default)]
pub struct AicxChunkProvider {
    // Tuning knobs forwarded verbatim to `aicx_parser::chunker::chunk_entries`.
    config: aicx_parser::ChunkerConfig,
}
120
/// Chunk provider producing fixed-size character windows.
pub struct FlatChunkProvider {
    // Window size in characters; acts as a floor on `ChunkOpts::flat_window`
    // (see the `chunk` implementation).
    window: usize,
    // Overlap in characters between adjacent windows.
    overlap: usize,
}
126
127impl ChunkProvider for OnionChunkProvider {
128 fn name(&self) -> &'static str {
129 "onion"
130 }
131
132 fn chunk<'a>(&'a self, doc: &'a FileContent, opts: &'a ChunkOpts) -> ChunkProviderFuture<'a> {
133 Box::pin(async move {
134 let metadata = base_metadata(doc, opts.chunker, opts.slice_mode);
135 let config = OnionSliceConfig {
136 outer_synthesis: opts.outer_synthesis.clone(),
137 ..OnionSliceConfig::default()
138 };
139 let slices = match opts.slice_mode {
140 SliceMode::OnionFast => {
141 create_onion_slices_fast_async(&doc.text, &metadata, &config).await
142 }
143 SliceMode::Onion | SliceMode::Flat => {
144 create_onion_slices_async(&doc.text, &metadata, &config).await
145 }
146 };
147
148 Ok(slices_to_chunks(slices, doc, metadata))
149 })
150 }
151}
152
153impl ChunkProvider for AicxChunkProvider {
154 fn name(&self) -> &'static str {
155 "aicx"
156 }
157
158 fn chunk<'a>(&'a self, doc: &'a FileContent, opts: &'a ChunkOpts) -> ChunkProviderFuture<'a> {
159 Box::pin(async move {
160 let entries = doc_to_timeline_entries(doc)?;
161 let project = project_label(&doc.namespace);
162 let agent = first_agent(&entries).unwrap_or("rust-memex");
163 let aicx_chunks =
164 aicx_parser::chunker::chunk_entries(&entries, &project, agent, &self.config);
165 Ok(aicx_chunks
166 .into_iter()
167 .map(|chunk| memex_chunk_from_aicx(chunk, doc, opts))
168 .collect())
169 })
170 }
171}
172
173impl ChunkProvider for FlatChunkProvider {
174 fn name(&self) -> &'static str {
175 "flat"
176 }
177
178 fn chunk<'a>(&'a self, doc: &'a FileContent, opts: &'a ChunkOpts) -> ChunkProviderFuture<'a> {
179 Box::pin(async move {
180 let window = opts.flat_window.max(1).max(self.window);
181 let overlap = opts
182 .flat_overlap
183 .min(window.saturating_sub(1))
184 .max(self.overlap.min(window.saturating_sub(1)));
185 let metadata = base_metadata(doc, opts.chunker, SliceMode::Flat);
186 Ok(create_flat_chunks(
187 &doc.text, doc, metadata, window, overlap,
188 ))
189 })
190 }
191}
192
193pub fn detect_default_chunker(source_path: &Path, namespace: &str) -> ChunkerKind {
194 let path_str = source_path.to_string_lossy();
195 if namespace.starts_with("kb:transcripts")
196 || namespace.starts_with("aicx")
197 || namespace.starts_with("klaudiusz-")
198 || path_str.contains("__clean.md")
199 || path_str.contains("/.aicx/store/")
200 || path_str.contains("/transcripts/")
201 {
202 ChunkerKind::Aicx
203 } else {
204 ChunkerKind::Onion
205 }
206}
207
208fn base_metadata(doc: &FileContent, chunker: ChunkerKind, slice_mode: SliceMode) -> Value {
209 let mut metadata = json!({
210 "path": doc.path.to_str(),
211 "content_hash": &doc.content_hash,
212 "source_hash": &doc.content_hash,
213 "chunker": chunker.name(),
214 "slice_mode": match slice_mode {
215 SliceMode::Onion => "onion",
216 SliceMode::OnionFast => "onion-fast",
217 SliceMode::Flat => "flat",
218 },
219 });
220
221 if chunker == ChunkerKind::Aicx
222 && let Value::Object(ref mut map) = metadata
223 {
224 map.insert("format".to_string(), json!("markdown_transcript"));
225 map.insert("type".to_string(), json!("conversation"));
226 }
227
228 metadata
229}
230
231fn slices_to_chunks(
232 slices: Vec<OnionSlice>,
233 content: &FileContent,
234 base_metadata: Value,
235) -> Vec<Chunk> {
236 slices
237 .into_iter()
238 .map(|slice| {
239 let chunk_hash = compute_content_hash(&slice.content);
240 let mut metadata = base_metadata.clone();
241 if let Value::Object(ref mut map) = metadata {
242 map.insert("chunk_hash".to_string(), json!(&chunk_hash));
243 map.insert("layer".to_string(), json!(slice.layer.name()));
244 map.insert("keywords".to_string(), json!(slice.keywords));
245 }
246
247 Chunk {
248 id: slice.id,
249 content: slice.content,
250 source_path: content.path.clone(),
251 namespace: content.namespace.clone(),
252 chunk_hash,
253 source_hash: content.content_hash.clone(),
254 layer: slice.layer.as_u8(),
255 parent_id: slice.parent_id,
256 children_ids: slice.children_ids,
257 keywords: slice.keywords,
258 metadata,
259 }
260 })
261 .collect()
262}
263
264fn create_flat_chunks(
265 text: &str,
266 content: &FileContent,
267 base_metadata: Value,
268 window: usize,
269 overlap: usize,
270) -> Vec<Chunk> {
271 let chunks = split_into_chunks(text, window, overlap);
272 let total_chunks = chunks.len();
273
274 chunks
275 .into_iter()
276 .enumerate()
277 .map(|(idx, chunk_text)| {
278 let chunk_hash = compute_content_hash(&chunk_text);
279 let mut metadata = base_metadata.clone();
280 if let Value::Object(ref mut map) = metadata {
281 map.insert("chunk_index".to_string(), json!(idx));
282 map.insert("total_chunks".to_string(), json!(total_chunks));
283 map.insert("chunk_hash".to_string(), json!(&chunk_hash));
284 }
285
286 let id = format!(
287 "{}_{}_{}",
288 content.path.to_str().unwrap_or("unknown"),
289 content.content_hash.get(..8).unwrap_or(""),
290 idx
291 );
292
293 Chunk {
294 id,
295 content: chunk_text,
296 source_path: content.path.clone(),
297 namespace: content.namespace.clone(),
298 chunk_hash,
299 source_hash: content.content_hash.clone(),
300 layer: 0,
301 parent_id: None,
302 children_ids: vec![],
303 keywords: vec![],
304 metadata,
305 }
306 })
307 .collect()
308}
309
/// Splits `text` into windows of `target_size` characters, with adjacent
/// windows sharing `overlap` characters.
///
/// Offsets are computed per *character* (not per byte), so multi-byte UTF-8
/// text is never sliced mid-codepoint. A text of `target_size` characters or
/// fewer is returned as a single chunk (including the empty string).
///
/// Degenerate parameters are clamped instead of looping forever:
/// `target_size` is raised to at least 1 and `overlap` is capped at
/// `target_size - 1`, guaranteeing the window always advances.
pub(crate) fn split_into_chunks(text: &str, target_size: usize, overlap: usize) -> Vec<String> {
    // Bug fix: previously, target_size == 0 or overlap >= target_size made
    // `start = end.saturating_sub(overlap)` stop advancing, so the loop
    // below never terminated. Clamp both parameters up front.
    let target_size = target_size.max(1);
    let overlap = overlap.min(target_size - 1);

    let mut char_offsets: Vec<usize> = text.char_indices().map(|(byte_idx, _)| byte_idx).collect();
    let len = char_offsets.len();

    if len <= target_size {
        return vec![text.to_string()];
    }

    // Sentinel so `char_offsets[end]` is valid when end == len.
    char_offsets.push(text.len());

    let mut chunks = Vec::new();
    let mut start = 0;

    while start < len {
        let end = (start + target_size).min(len);
        let start_byte = char_offsets[start];
        let end_byte = char_offsets[end];
        chunks.push(text[start_byte..end_byte].to_string());

        if end >= len {
            break;
        }

        // Step back by `overlap` chars so consecutive chunks share context;
        // overlap < target_size ensures strict forward progress.
        start = end.saturating_sub(overlap);
    }

    chunks
}
338
/// Converts a markdown transcript into synthetic `aicx_parser` timeline
/// entries by splitting the text on role headings (`## user`, `[tool]`, ...).
///
/// All entries share a fixed epoch timestamp and a session id derived from
/// the content hash, since the source document carries no real timing data.
/// Text before the first role heading (the "preamble") is prepended to the
/// first entry's message, and a document with no recognisable headings at
/// all becomes a single assistant entry containing the full text.
fn doc_to_timeline_entries(doc: &FileContent) -> Result<Vec<aicx_parser::TimelineEntry>> {
    // Deterministic placeholder timestamp (Unix epoch) — transcripts on
    // disk have no per-message times to recover.
    let timestamp = Utc
        .timestamp_opt(0, 0)
        .single()
        .ok_or_else(|| anyhow!("failed to construct epoch timestamp"))?;
    // Stable session id: first 12 chars of the content hash.
    let session_id = compute_content_hash(&doc.text)
        .get(..12)
        .unwrap_or("session")
        .to_string();
    let cwd = doc
        .path
        .parent()
        .and_then(Path::to_str)
        .map(ToString::to_string);
    let preamble = extract_preamble(&doc.text);
    let mut entries = Vec::new();
    let mut current_role: Option<String> = None;
    let mut current_body = String::new();

    // Accumulate lines into the current message; a role heading flushes the
    // accumulated body as one entry and starts the next.
    for line in doc.text.lines() {
        if let Some(role) = role_heading(line) {
            push_entry(
                &mut entries,
                current_role.take(),
                &mut current_body,
                timestamp,
                &session_id,
                cwd.clone(),
            );
            current_role = Some(role.to_string());
        } else {
            if !current_body.is_empty() {
                current_body.push('\n');
            }
            current_body.push_str(line);
        }
    }

    // Flush the trailing message after the last heading.
    push_entry(
        &mut entries,
        current_role,
        &mut current_body,
        timestamp,
        &session_id,
        cwd,
    );

    // Re-attach any preamble to the first entry — unless that entry starts
    // with "---" (front-matter-looking content is left untouched).
    if let Some(preamble) = preamble
        && let Some(first) = entries.first_mut()
        && !first.message.starts_with("---")
    {
        first.message = format!("{}\n\n{}", preamble.trim_end(), first.message.trim_start());
    }

    // No headings matched: treat the whole document as one assistant turn.
    if entries.is_empty() {
        entries.push(aicx_parser::TimelineEntry {
            timestamp,
            agent: "rust-memex".to_string(),
            session_id,
            role: "assistant".to_string(),
            message: doc.text.clone(),
            frame_kind: Some(aicx_parser::FrameKind::AgentReply),
            branch: None,
            cwd: doc
                .path
                .parent()
                .and_then(Path::to_str)
                .map(ToString::to_string),
        });
    }

    Ok(entries)
}
412
413fn extract_preamble(text: &str) -> Option<String> {
414 let mut lines = Vec::new();
415 for line in text.lines() {
416 if role_heading(line).is_some() {
417 break;
418 }
419 lines.push(line);
420 }
421 let preamble = lines.join("\n");
422 (!preamble.trim().is_empty()).then_some(preamble)
423}
424
425fn push_entry(
426 entries: &mut Vec<aicx_parser::TimelineEntry>,
427 role: Option<String>,
428 body: &mut String,
429 timestamp: chrono::DateTime<Utc>,
430 session_id: &str,
431 cwd: Option<String>,
432) {
433 let Some(role) = role else {
434 body.clear();
435 return;
436 };
437 let message = body.trim().to_string();
438 body.clear();
439 if message.is_empty() {
440 return;
441 }
442 let frame_kind = match role.as_str() {
443 "user" => Some(aicx_parser::FrameKind::UserMsg),
444 "assistant" => Some(aicx_parser::FrameKind::AgentReply),
445 "tool" => Some(aicx_parser::FrameKind::ToolCall),
446 _ => None,
447 };
448 entries.push(aicx_parser::TimelineEntry {
449 timestamp,
450 agent: "rust-memex".to_string(),
451 session_id: session_id.to_string(),
452 role,
453 message,
454 frame_kind,
455 branch: None,
456 cwd,
457 });
458}
459
/// Recognises transcript role headings (case- and surrounding-whitespace-
/// insensitive) and maps them to a canonical role name.
fn role_heading(line: &str) -> Option<&'static str> {
    let needle = line.trim().to_ascii_lowercase();
    let user: &[&str] = &["## user", "### user", "[user]", "user request:"];
    let assistant: &[&str] = &[
        "## assistant",
        "### assistant",
        "[assistant]",
        "assistant response:",
    ];
    let tool: &[&str] = &["## tool", "### tool", "[tool]", "tool:", "tool result:"];

    if user.contains(&needle.as_str()) {
        Some("user")
    } else if assistant.contains(&needle.as_str()) {
        Some("assistant")
    } else if tool.contains(&needle.as_str()) {
        Some("tool")
    } else {
        None
    }
}
471
472fn first_agent(entries: &[aicx_parser::TimelineEntry]) -> Option<&str> {
473 entries.first().map(|entry| entry.agent.as_str())
474}
475
/// Sanitises a namespace into an identifier-safe project label: ASCII
/// alphanumerics, '-', '_' and '.' pass through; everything else becomes '_'.
fn project_label(namespace: &str) -> String {
    let mut label = String::with_capacity(namespace.len());
    for ch in namespace.chars() {
        let keep = ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.');
        label.push(if keep { ch } else { '_' });
    }
    label
}
488
/// Maps one `aicx_parser::Chunk` onto a memex `Chunk`, copying the aicx
/// bookkeeping fields into the metadata object.
///
/// Chunk ids are prefixed with the first 8 chars of the source content hash
/// so re-ingesting changed content produces fresh ids.
fn memex_chunk_from_aicx(
    chunk: aicx_parser::Chunk,
    content: &FileContent,
    opts: &ChunkOpts,
) -> Chunk {
    let chunk_hash = compute_content_hash(&chunk.text);
    let id_prefix = content.content_hash.get(..8).unwrap_or("aicx");
    // aicx chunks are recorded as flat slices regardless of requested mode.
    let mut metadata = base_metadata(content, opts.chunker, SliceMode::Flat);
    if let Value::Object(ref mut map) = metadata {
        map.insert("aicx_chunk_id".to_string(), json!(&chunk.id));
        map.insert("aicx_project".to_string(), json!(&chunk.project));
        map.insert("aicx_agent".to_string(), json!(&chunk.agent));
        map.insert("aicx_date".to_string(), json!(&chunk.date));
        map.insert("aicx_session_id".to_string(), json!(&chunk.session_id));
        map.insert("aicx_kind".to_string(), json!(chunk.kind));
        map.insert("aicx_frame_kind".to_string(), json!(chunk.frame_kind));
        map.insert("aicx_msg_start".to_string(), json!(chunk.msg_range.0));
        map.insert("aicx_msg_end".to_string(), json!(chunk.msg_range.1));
        map.insert("token_estimate".to_string(), json!(chunk.token_estimate));
        map.insert("highlights".to_string(), json!(&chunk.highlights));
        map.insert("chunk_hash".to_string(), json!(&chunk_hash));
        // Optional aicx fields are only recorded when present.
        if let Some(cwd) = &chunk.cwd {
            map.insert("aicx_cwd".to_string(), json!(cwd));
        }
        if let Some(run_id) = &chunk.run_id {
            map.insert("run_id".to_string(), json!(run_id));
        }
        if let Some(prompt_id) = &chunk.prompt_id {
            map.insert("prompt_id".to_string(), json!(prompt_id));
        }
        if let Some(model) = &chunk.agent_model {
            map.insert("agent_model".to_string(), json!(model));
        }
    }

    Chunk {
        id: format!("{id_prefix}::{}", chunk.id),
        content: chunk.text,
        source_path: content.path.clone(),
        namespace: content.namespace.clone(),
        chunk_hash,
        source_hash: content.content_hash.clone(),
        // aicx chunks are flat: no layer hierarchy.
        layer: 0,
        parent_id: None,
        children_ids: vec![],
        // aicx highlights double as retrieval keywords.
        keywords: chunk.highlights,
        metadata,
    }
}
538
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Builds an in-memory `FileContent` fixture with a real content hash.
    fn doc(text: &str, namespace: &str, path: &str) -> FileContent {
        FileContent {
            path: PathBuf::from(path),
            text: text.to_string(),
            namespace: namespace.to_string(),
            content_hash: compute_content_hash(text),
        }
    }

    #[test]
    fn default_chunker_routes_transcripts_to_aicx() {
        // Transcript-looking paths pick the aicx chunker...
        assert_eq!(
            detect_default_chunker(Path::new("/tmp/transcripts/sample.md"), "kb:any"),
            ChunkerKind::Aicx
        );
        // ...everything else defaults to onion.
        assert_eq!(
            detect_default_chunker(Path::new("/tmp/readme.md"), "kb:docs"),
            ChunkerKind::Onion
        );
    }

    #[tokio::test]
    async fn aicx_provider_chunks_markdown_transcript() {
        let doc = doc(
            "## user\nWhat is the meaning of life?\n\n## assistant\nBuild something useful.\n",
            "kb:transcripts-test",
            "/tmp/sample-transcript.md",
        );
        let opts = ChunkOpts::new(
            ChunkerKind::Aicx,
            SliceMode::Flat,
            OuterSynthesis::default(),
        );
        let chunks = AicxChunkProvider::default()
            .chunk(&doc, &opts)
            .await
            .unwrap();

        // The two-turn transcript must yield at least one chunk tagged with
        // the aicx chunker and containing the user question.
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].metadata["chunker"], "aicx");
        assert!(chunks[0].content.contains("meaning of life"));
    }

    #[test]
    fn split_into_chunks_preserves_overlap() {
        let chunks = split_into_chunks("abcdefghijklmnopqrstuvwxyz", 10, 3);

        // Each chunk after the first starts with the final 3 chars of its
        // predecessor.
        assert!(chunks.len() > 1);
        assert_eq!(chunks[0].len(), 10);
        assert!(chunks[0].ends_with(&chunks[1][..3]));
    }
}