Skip to main content

aicx_parser/
chunker.rs

1//! Semantic windowing chunker for RAG indexing.
2//!
3//! Splits timeline entries into overlapping windows of ~1.5k tokens,
4//! suitable for vector embedding and semantic search via memex.
5//!
6//! Vibecrafted with AI Agents by VetCoders (c)2026 VetCoders
7
8use anyhow::Result;
9use serde::{Deserialize, Serialize};
10use std::borrow::Cow;
11use std::collections::{BTreeMap, HashMap, HashSet};
12use std::fs;
13use std::path::{Path, PathBuf};
14
15use crate::timeline::{FrameKind, Kind, TimelineEntry};
16
17// ============================================================================
18// Types
19// ============================================================================
20
/// A single chunk ready for vector indexing.
///
/// Produced by `chunk_entries`; `text` already contains the metadata
/// header (and optional signals block), so it can be embedded as-is.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Unique ID: `{project}_{agent}_{date}_{seq:03}`
    pub id: String,
    /// Project identifier the chunker was invoked with.
    pub project: String,
    /// Agent identifier the chunker was invoked with.
    pub agent: String,
    /// Date string (YYYY-MM-DD)
    pub date: String,
    /// Session ID from first message in chunk
    pub session_id: String,
    /// Working directory from the first message in the chunk window
    pub cwd: Option<String>,
    /// Classified kind for this chunk's content
    pub kind: Kind,
    /// Stable stream/channel classification for the chunk contents.
    pub frame_kind: Option<FrameKind>,
    /// Optional correlation ID for the originating run
    pub run_id: Option<String>,
    /// Optional prompt or task identity for the originating run
    pub prompt_id: Option<String>,
    /// Optional agent model reported by the source frontmatter
    pub agent_model: Option<String>,
    /// Optional run start timestamp reported by the source frontmatter
    pub started_at: Option<String>,
    /// Optional run completion timestamp reported by the source frontmatter
    pub completed_at: Option<String>,
    /// Optional token usage reported by the source frontmatter
    pub token_usage: Option<u64>,
    /// Optional findings count reported by the source frontmatter
    pub findings_count: Option<u32>,
    /// Optional workflow phase reported by the source frontmatter
    pub workflow_phase: Option<String>,
    /// Optional routing mode reported by the source frontmatter
    pub mode: Option<String>,
    /// Optional framework skill code reported by the source frontmatter
    pub skill_code: Option<String>,
    /// Optional steering schema/framework version reported by the source frontmatter
    pub framework_version: Option<String>,
    /// Index range (start, end exclusive) into the full prepared entry
    /// slice passed to `chunk_entries` — not just the day's entries
    /// (indices are assigned by a single `enumerate` over all entries).
    pub msg_range: (usize, usize),
    /// Formatted chunk text with header
    pub text: String,
    /// Estimated token count (~chars/4)
    pub token_estimate: usize,
    /// Decision/plan highlights extracted from the chunk
    pub highlights: Vec<String>,
}
69
/// Structured metadata sidecar persisted alongside each chunk file.
///
/// A field-for-field mirror of [`Chunk`]'s metadata, minus the heavy
/// payload fields (`text`, `msg_range`, `token_estimate`, `highlights`)
/// and plus `intent_entries`. Optional fields that are `None` are
/// omitted from serialized output entirely.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ChunkMetadataSidecar {
    pub id: String,
    pub project: String,
    pub agent: String,
    pub date: String,
    pub session_id: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cwd: Option<String>,
    pub kind: Kind,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub frame_kind: Option<FrameKind>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub run_id: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt_id: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub agent_model: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub started_at: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub completed_at: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub token_usage: Option<u64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub findings_count: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workflow_phase: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub mode: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub skill_code: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub framework_version: Option<String>,
    // Defaults to empty when absent from serialized input, and is
    // skipped on output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub intent_entries: Vec<crate::types::IntentEntry>,
}
108
impl From<&Chunk> for ChunkMetadataSidecar {
    /// Build a sidecar by cloning a chunk's metadata fields.
    ///
    /// `intent_entries` cannot be derived from a `Chunk` and is left
    /// empty here — presumably populated later by the caller; confirm
    /// at the call sites.
    fn from(chunk: &Chunk) -> Self {
        Self {
            id: chunk.id.clone(),
            project: chunk.project.clone(),
            agent: chunk.agent.clone(),
            date: chunk.date.clone(),
            session_id: chunk.session_id.clone(),
            cwd: chunk.cwd.clone(),
            kind: chunk.kind,
            frame_kind: chunk.frame_kind,
            run_id: chunk.run_id.clone(),
            prompt_id: chunk.prompt_id.clone(),
            agent_model: chunk.agent_model.clone(),
            started_at: chunk.started_at.clone(),
            completed_at: chunk.completed_at.clone(),
            token_usage: chunk.token_usage,
            findings_count: chunk.findings_count,
            workflow_phase: chunk.workflow_phase.clone(),
            mode: chunk.mode.clone(),
            skill_code: chunk.skill_code.clone(),
            framework_version: chunk.framework_version.clone(),
            intent_entries: Vec::new(),
        }
    }
}
135
/// Configuration for the chunker.
#[derive(Debug, Clone)]
pub struct ChunkerConfig {
    /// Target tokens per chunk (default: 1500)
    pub target_tokens: usize,
    /// Minimum tokens — don't create tiny chunks unless it's the last window (default: 500)
    ///
    /// NOTE(review): not consulted by the windowing logic in this file
    /// (`chunk_day_entries` only reads `target_tokens`, `max_tokens`,
    /// and `overlap_messages`) — confirm whether it is enforced
    /// elsewhere or is dead configuration.
    pub min_tokens: usize,
    /// Maximum tokens — force split if exceeded (default: 2500)
    pub max_tokens: usize,
    /// Number of messages to overlap between consecutive windows (default: 2)
    pub overlap_messages: usize,
}
148
impl Default for ChunkerConfig {
    /// The defaults documented on each field: 1500 target / 500 min /
    /// 2500 max tokens, with a 2-message window overlap.
    fn default() -> Self {
        Self {
            target_tokens: 1500,
            min_tokens: 500,
            max_tokens: 2500,
            overlap_messages: 2,
        }
    }
}
159
160// ============================================================================
161// Token estimation
162// ============================================================================
163
/// Estimate the token count of `text`.
///
/// Heuristic: one token per four characters, rounding up so any
/// non-empty string estimates to at least one token.
pub fn estimate_tokens(text: &str) -> usize {
    match text.len() {
        0 => 0,
        n => 1 + (n - 1) / 4,
    }
}
171
172// ── Kind heuristics ────────────────────────────────────────────────────────
173
/// Lower-cased substrings whose presence in assistant messages counts
/// toward classifying a chunk as a plan (`classify_kind`).
const PLAN_KEYWORDS: &[&str] = &[
    "implementation plan",
    "plan:",
    "## plan",
    "step 1:",
    "step 2:",
    "step 3:",
    "action items",
    "milestones",
    "roadmap",
    "todo list",
    "acceptance criteria",
    "## steps",
    "## phases",
];

/// Lower-cased substrings whose presence in assistant messages counts
/// toward classifying a chunk as a report (`classify_kind`).
const REPORT_KEYWORDS: &[&str] = &[
    "## findings",
    "## summary",
    "## report",
    "audit report",
    "coverage report",
    "test results",
    "## metrics",
    "## recommendations",
    "## conclusion",
    "status report",
    "incident report",
    "pr review",
    "code review",
];
205
206/// Classify a set of timeline entries into a canonical `Kind`.
207///
208/// Uses a lightweight keyword-scoring approach:
209/// - Scans assistant messages (where classification signal is strongest)
210/// - Scores plan vs report keywords
211/// - Conversations win by default when neither plan nor report signal is strong
212///
213/// The approach is intentionally conservative: ambiguous content falls to
214/// `Conversations` (the most common kind), not `Other`.
215pub fn classify_kind(entries: &[TimelineEntry]) -> Kind {
216    if entries.is_empty() {
217        return Kind::Other;
218    }
219
220    let mut plan_score: u32 = 0;
221    let mut report_score: u32 = 0;
222    let mut has_conversation = false;
223
224    for entry in entries {
225        let lower = entry.message.to_lowercase();
226
227        // Only count strong signals from assistant messages.
228        if entry.role == "assistant" {
229            for kw in PLAN_KEYWORDS {
230                if lower.contains(kw) {
231                    plan_score += 1;
232                }
233            }
234            for kw in REPORT_KEYWORDS {
235                if lower.contains(kw) {
236                    report_score += 1;
237                }
238            }
239        }
240
241        if entry.role == "user" || entry.role == "assistant" {
242            has_conversation = true;
243        }
244    }
245
246    let threshold = 3;
247
248    if plan_score >= threshold && plan_score > report_score {
249        Kind::Plans
250    } else if report_score >= threshold && report_score > plan_score {
251        Kind::Reports
252    } else if has_conversation {
253        Kind::Conversations
254    } else {
255        Kind::Other
256    }
257}
258
/// Detect and strip report frontmatter from the first timeline entry.
///
/// If the first entry's message starts with `---`, it is run through
/// `crate::frontmatter::parse`. When parsing actually consumed a
/// frontmatter block (the returned body differs from the input), the
/// entries are cloned with the first message replaced by the stripped
/// body; otherwise the original slice is returned borrowed (no copy).
fn prepare_entries_for_chunking<'a>(
    entries: &'a [TimelineEntry],
) -> (
    Option<crate::frontmatter::ReportFrontmatter>,
    Cow<'a, [TimelineEntry]>,
) {
    // No entries: nothing to strip.
    let Some(first) = entries.first() else {
        return (None, Cow::Borrowed(entries));
    };

    // Frontmatter blocks are delimited by a leading `---` line.
    if !first.message.trim_start().starts_with("---") {
        return (None, Cow::Borrowed(entries));
    }

    let (frontmatter, body) = crate::frontmatter::parse(&first.message);
    // `parse` returning the input unchanged means there was no real
    // frontmatter block to strip.
    if body == first.message {
        return (None, Cow::Borrowed(entries));
    }

    // Clone only now that we know the first message must be rewritten.
    let mut stripped_entries = entries.to_vec();
    if let Some(stripped_first) = stripped_entries.first_mut() {
        stripped_first.message = body.to_string();
    }

    (frontmatter, Cow::Owned(stripped_entries))
}
285
/// Copy frontmatter-derived metadata onto a chunk.
///
/// `frame_kind` is only filled in when the chunk has not already
/// derived one from its own entries; every other field is overwritten
/// unconditionally (so a `None` in the frontmatter clears any value
/// the chunk previously had).
fn apply_frontmatter(chunk: &mut Chunk, frontmatter: &crate::frontmatter::ReportFrontmatter) {
    if chunk.frame_kind.is_none() {
        chunk.frame_kind = frontmatter.telemetry.frame_kind;
    }
    chunk.run_id = frontmatter.telemetry.run_id.clone();
    chunk.prompt_id = frontmatter.telemetry.prompt_id.clone();
    chunk.agent_model = frontmatter.telemetry.model.clone();
    chunk.started_at = frontmatter.telemetry.started_at.clone();
    chunk.completed_at = frontmatter.telemetry.completed_at.clone();
    chunk.token_usage = frontmatter.telemetry.token_usage;
    chunk.findings_count = frontmatter.telemetry.findings_count;
    chunk.workflow_phase = frontmatter.steering.workflow_phase.clone();
    chunk.mode = frontmatter.steering.mode.clone();
    chunk.skill_code = frontmatter.steering.skill_code.clone();
    chunk.framework_version = frontmatter.steering.framework_version.clone();
}
302
303fn split_day_entries_by_frame_kind<'a>(
304    entries: &'a [(usize, &'a TimelineEntry)],
305) -> Vec<&'a [(usize, &'a TimelineEntry)]> {
306    if entries.is_empty() {
307        return Vec::new();
308    }
309
310    let mut groups = Vec::new();
311    let mut start = 0usize;
312
313    for idx in 1..entries.len() {
314        let previous = entries[idx - 1].1.frame_kind;
315        let current = entries[idx].1.frame_kind;
316        if previous != current {
317            groups.push(&entries[start..idx]);
318            start = idx;
319        }
320    }
321
322    groups.push(&entries[start..]);
323    groups
324}
325
326fn frame_kind_for_window(entries: &[&TimelineEntry]) -> Option<FrameKind> {
327    let first = entries.first().and_then(|entry| entry.frame_kind)?;
328    entries
329        .iter()
330        .all(|entry| entry.frame_kind == Some(first))
331        .then_some(first)
332}
333
334// ============================================================================
335// Chunking logic
336// ============================================================================
337
/// Chunk timeline entries into semantic windows with overlap.
///
/// Pipeline:
/// 1. Strip optional report frontmatter from the first entry
///    (`prepare_entries_for_chunking`).
/// 2. Group entries by calendar date (`BTreeMap` keeps dates sorted).
/// 3. Per day, split into runs of identical `frame_kind` and apply
///    sliding-window chunking to each run, sharing one day-wide
///    sequence counter so chunk IDs stay unique within the day.
/// 4. Copy any parsed frontmatter metadata onto every produced chunk.
///
/// Returns chunks sorted by date and sequence number.
pub fn chunk_entries(
    entries: &[TimelineEntry],
    project: &str,
    agent: &str,
    config: &ChunkerConfig,
) -> Vec<Chunk> {
    if entries.is_empty() {
        return vec![];
    }

    let (frontmatter, prepared_entries) = prepare_entries_for_chunking(entries);
    let prepared_entries = prepared_entries.as_ref();

    // Group entries by date. The stored index is the position in the
    // full `prepared_entries` slice (later exposed as `msg_range`).
    let mut by_date: BTreeMap<String, Vec<(usize, &TimelineEntry)>> = BTreeMap::new();
    for (idx, entry) in prepared_entries.iter().enumerate() {
        let date = entry.timestamp.format("%Y-%m-%d").to_string();
        by_date.entry(date).or_default().push((idx, entry));
    }

    let mut chunks = Vec::new();

    for (date, day_entries) in &by_date {
        let mut day_chunks = Vec::new();
        let mut next_seq = 1usize;
        // Each frame-kind run is chunked separately, but the sequence
        // counter carries across runs within the same day.
        for frame_group in split_day_entries_by_frame_kind(day_entries) {
            let (mut group_chunks, updated_seq) =
                chunk_day_entries(frame_group, project, agent, date, config, next_seq);
            next_seq = updated_seq;
            day_chunks.append(&mut group_chunks);
        }
        if let Some(frontmatter) = frontmatter.as_ref() {
            for chunk in &mut day_chunks {
                apply_frontmatter(chunk, frontmatter);
            }
        }
        chunks.extend(day_chunks);
    }

    chunks
}
383
384/// Apply sliding window chunking to a single day's entries.
385fn chunk_day_entries(
386    entries: &[(usize, &TimelineEntry)],
387    project: &str,
388    agent: &str,
389    date: &str,
390    config: &ChunkerConfig,
391    start_seq: usize,
392) -> (Vec<Chunk>, usize) {
393    if entries.is_empty() {
394        return (vec![], start_seq);
395    }
396
397    let mut chunks = Vec::new();
398    let mut seq = start_seq;
399    let mut start = 0usize;
400
401    while start < entries.len() {
402        // Find window end: accumulate until target_tokens reached
403        let mut end = start;
404        let mut accumulated_tokens = 0usize;
405
406        while end < entries.len() {
407            let msg_tokens = estimate_tokens(&entries[end].1.message);
408            let next_total = accumulated_tokens + msg_tokens + 20; // ~20 tokens for timestamp/role header
409
410            if next_total > config.max_tokens && end > start {
411                break;
412            }
413
414            accumulated_tokens = next_total;
415            end += 1;
416
417            if accumulated_tokens >= config.target_tokens {
418                break;
419            }
420        }
421
422        // Build chunk from entries[start..end]
423        let window: Vec<&TimelineEntry> = entries[start..end].iter().map(|(_, e)| *e).collect();
424        let highlights = extract_highlights(&window);
425        let signals = extract_signals(&window);
426        let frame_kind = frame_kind_for_window(&window);
427        let text = format_chunk_text_inner(
428            &window,
429            project,
430            agent,
431            date,
432            frame_kind,
433            &signals,
434            &highlights,
435        );
436        let token_estimate = estimate_tokens(&text);
437
438        let session_id = window
439            .first()
440            .map(|e| e.session_id.clone())
441            .unwrap_or_default();
442        let cwd = window.first().and_then(|entry| entry.cwd.clone());
443
444        let global_start = entries[start].0;
445        let global_end = entries[end - 1].0 + 1;
446
447        let kind = classify_kind(&window.iter().map(|e| (*e).clone()).collect::<Vec<_>>());
448
449        chunks.push(Chunk {
450            id: format!("{}_{}_{}_{{:03}}", project, agent, date)
451                .replace("{:03}", &format!("{:03}", seq)),
452            project: project.to_string(),
453            agent: agent.to_string(),
454            date: date.to_string(),
455            session_id,
456            cwd,
457            kind,
458            frame_kind,
459            run_id: None,
460            prompt_id: None,
461            agent_model: None,
462            started_at: None,
463            completed_at: None,
464            token_usage: None,
465            findings_count: None,
466            workflow_phase: None,
467            mode: None,
468            skill_code: None,
469            framework_version: None,
470            msg_range: (global_start, global_end),
471            text,
472            token_estimate,
473            highlights,
474        });
475
476        seq += 1;
477
478        // Next window starts at (end - overlap), but always advance at least 1
479        let overlap = config.overlap_messages.min(end - start);
480        let next_start = if end >= entries.len() {
481            entries.len() // done
482        } else if end - overlap > start {
483            end - overlap
484        } else {
485            end // avoid infinite loop
486        };
487
488        start = next_start;
489    }
490
491    (chunks, seq)
492}
493
494/// Format entries into chunk text with metadata header.
495pub fn format_chunk_text(
496    entries: &[&TimelineEntry],
497    project: &str,
498    agent: &str,
499    date: &str,
500) -> String {
501    let highlights = extract_highlights(entries);
502    let signals = extract_signals(entries);
503    format_chunk_text_inner(
504        entries,
505        project,
506        agent,
507        date,
508        frame_kind_for_window(entries),
509        &signals,
510        &highlights,
511    )
512}
513
/// Render a chunk's final text.
///
/// Layout: a one-line metadata header, an optional `[signals]` block,
/// then one `[HH:MM:SS] role: message` line per entry.
fn format_chunk_text_inner(
    entries: &[&TimelineEntry],
    project: &str,
    agent: &str,
    date: &str,
    frame_kind: Option<FrameKind>,
    signals: &ChunkSignals,
    highlights: &[String],
) -> String {
    // frame_kind is included in the header only when the whole window
    // agreed on one (see `frame_kind_for_window`).
    let mut text = if let Some(frame_kind) = frame_kind {
        format!(
            "[project: {} | agent: {} | date: {} | frame_kind: {}]\n\n",
            project, agent, date, frame_kind
        )
    } else {
        format!(
            "[project: {} | agent: {} | date: {}]\n\n",
            project, agent, date
        )
    };

    // Signals block is omitted entirely when there is nothing to report.
    if let Some(block) = format_signals_block(signals, highlights) {
        text.push_str(&block);
        text.push('\n');
    }

    for entry in entries {
        let time = entry.timestamp.format("%H:%M:%S");
        // Truncate very long messages to avoid monster chunks (UTF-8 safe).
        let msg = if entry.message.len() > 4000 {
            truncate_message_bytes(&entry.message, 4000)
        } else {
            entry.message.clone()
        };
        text.push_str(&format!("[{}] {}: {}\n", time, entry.role, msg));
    }

    text
}
553
/// Case-insensitive substrings that make a message highlight-worthy.
const HIGHLIGHT_KEYWORDS: &[&str] = &[
    "decision:",
    "plan:",
    "architecture",
    "breaking",
    "todo:",
    "fixme:",
];

/// Markers matched with original casing (no lowercasing), so only
/// deliberate all-caps tags trigger. "WAŻNE" is Polish for "IMPORTANT".
const HIGHLIGHT_KEYWORDS_CASE_SENSITIVE: &[&str] = &["WAŻNE", "KEY"];
564
565fn extract_highlights(entries: &[&TimelineEntry]) -> Vec<String> {
566    let mut highlights = Vec::new();
567    for entry in entries {
568        if highlights.len() >= 3 {
569            break;
570        }
571        if !is_highlight_message(&entry.message) {
572            continue;
573        }
574
575        if let Some(line) = entry.message.lines().map(str::trim).find(|l| !l.is_empty())
576            && highlights.last().map(String::as_str) != Some(line)
577        {
578            highlights.push(line.to_string());
579        }
580    }
581    highlights
582}
583
584fn is_highlight_message(message: &str) -> bool {
585    let lower = message.to_lowercase();
586    HIGHLIGHT_KEYWORDS.iter().any(|kw| lower.contains(kw))
587        || HIGHLIGHT_KEYWORDS_CASE_SENSITIVE
588            .iter()
589            .any(|kw| message.contains(kw))
590}
591
592// ============================================================================
593// Signals (intent + checklists)
594// ============================================================================
595
/// Signal lines mined from a chunk window, rendered into the
/// `[signals]` header block by `format_signals_block`.
#[derive(Debug, Clone, Default)]
struct ChunkSignals {
    // Checklist items still open (`- [ ] …`).
    todo_open: Vec<String>,
    // Checklist items completed (`- [x] …`).
    todo_done: Vec<String>,
    // Blocks containing an "ultrathink" marker.
    ultrathink: Vec<String>,
    // Blocks starting with an insight tag.
    insights: Vec<String>,
    // Plan-mode transition / plan-approval lines.
    plan_mode: Vec<String>,
    // User lines expressing intent (see `INTENT_KEYWORDS`).
    intents: Vec<String>,
    // Lines reporting results/completion (see `RESULT_KEYWORDS`).
    results: Vec<String>,
    // Skill-activation tag blocks.
    skills: Vec<String>,
    // Decision tag blocks.
    decisions: Vec<String>,
    // Outcome/validation tag blocks.
    outcomes: Vec<String>,
}
609
// Per-chunk caps for each signal category, keeping the header compact.
const MAX_TODO_ITEMS: usize = 8;
const MAX_ULTRATHINK_BLOCKS: usize = 4;
const MAX_INSIGHT_BLOCKS: usize = 6;
const MAX_PLAN_MODE_EVENTS: usize = 8;
const MAX_INTENT_LINES: usize = 6;
const MAX_RESULT_LINES: usize = 6;
// Maximum lines merged into one block by `extract_tag_blocks`.
const MAX_TAG_BLOCK_LINES: usize = 4;
617
/// Substrings (matched case-insensitively by `is_intent_line`) that
/// mark a user message line as expressing an intent or proposal.
///
/// Polish keywords appear in both ASCII-folded and diacritic forms so
/// matching works regardless of how the user typed them. ("ustalmy"
/// has no diacritic variant; the previous duplicate entry was removed.)
pub const INTENT_KEYWORDS: &[&str] = &[
    // Polish
    "mam pomysl",
    "mam pomysł",
    "mam taki pomysl",
    "mam taki pomysł",
    "pomysl",
    "pomysł",
    "proponuje",
    "proponuję",
    "zrobmy",
    "zróbmy",
    "ustalmy",
    "chce",
    "chcę",
    "chcialbym",
    "chciałbym",
    "potrzebuje",
    "potrzebuję",
    "następny krok",
    "nastepny krok",
    "kolejny krok",
    // English
    "i want",
    "i'd like",
    "let's",
    "next step",
];
647
/// Substrings (matched case-insensitively by `is_result_line`) that
/// mark a line as reporting a result or completion. Mixed
/// English/Polish; Polish terms are listed in both ASCII-folded and
/// diacritic forms.
const RESULT_KEYWORDS: &[&str] = &[
    "smoke test",
    "passed",
    "all checks passed",
    "0 failed",
    "completed",
    "done",
    "zrobione",
    "dowiezione",
    "gotowe",
    "dziala",
    "działa",
];
661
/// Mine every signal category from the chunk window.
///
/// Each category runs its own scan over the entries; caps come from
/// the MAX_* constants above.
///
/// NOTE(review): skills/decisions/outcomes use a literal cap of 4
/// instead of a named MAX_* constant like the other categories —
/// possibly intentional, but inconsistent with the rest.
fn extract_signals(entries: &[&TimelineEntry]) -> ChunkSignals {
    let (todo_open, todo_done) = extract_checklist_items(entries);
    let ultrathink = extract_tag_blocks(entries, is_ultrathink_tag, MAX_ULTRATHINK_BLOCKS);
    let insights = extract_tag_blocks(entries, is_insight_tag, MAX_INSIGHT_BLOCKS);
    let plan_mode = extract_tag_blocks(entries, is_plan_mode_tag, MAX_PLAN_MODE_EVENTS);
    let intents = extract_intent_lines(entries);
    let results = extract_result_lines(entries);
    let skills = extract_tag_blocks(entries, is_skill_tag, 4);
    let decisions = extract_tag_blocks(entries, is_decision_tag, 4);
    let outcomes = extract_tag_blocks(entries, is_outcome_tag, 4);

    ChunkSignals {
        todo_open,
        todo_done,
        ultrathink,
        insights,
        plan_mode,
        intents,
        results,
        skills,
        decisions,
        outcomes,
    }
}
686
687fn extract_checklist_items(entries: &[&TimelineEntry]) -> (Vec<String>, Vec<String>) {
688    #[derive(Debug, Clone, Copy)]
689    enum TaskState {
690        Open,
691        Done,
692    }
693
694    let mut state_by_key: HashMap<String, TaskState> = HashMap::new();
695    let mut display_by_key: HashMap<String, String> = HashMap::new();
696    let mut order: Vec<String> = Vec::new();
697
698    for entry in entries {
699        for line in entry.message.lines() {
700            if let Some((is_done, task)) = parse_checklist_task(line) {
701                let key = normalize_key(&task);
702                if !state_by_key.contains_key(&key) {
703                    order.push(key.clone());
704                    display_by_key.insert(key.clone(), task);
705                    state_by_key.insert(key.clone(), TaskState::Open);
706                }
707
708                // Once a task is marked done anywhere, keep it done.
709                if is_done {
710                    state_by_key.insert(key, TaskState::Done);
711                }
712            }
713        }
714    }
715
716    let mut open = Vec::new();
717    let mut done = Vec::new();
718    for key in order {
719        let Some(task) = display_by_key.get(&key) else {
720            continue;
721        };
722        match state_by_key.get(&key) {
723            Some(TaskState::Done) => done.push(task.clone()),
724            Some(TaskState::Open) => open.push(task.clone()),
725            None => {}
726        }
727    }
728
729    (open, done)
730}
731
/// Parse a Markdown checklist line like `- [x] task` or `* [ ] task`.
///
/// Accepts `-`, `*`, or `+` bullets; the checkbox state must be `x` or
/// `X` (done), or a single space (open). Returns `(is_done, task)`
/// with surrounding whitespace trimmed, or `None` for anything else —
/// including an empty task text.
pub fn parse_checklist_task(line: &str) -> Option<(bool, String)> {
    let trimmed = line.trim_start();

    // Strip exactly one bullet character.
    let after_bullet = trimmed
        .strip_prefix('-')
        .or_else(|| trimmed.strip_prefix('*'))
        .or_else(|| trimmed.strip_prefix('+'))?;

    // Optional whitespace, then the checkbox's opening bracket.
    let checkbox = after_bullet.trim_start().strip_prefix('[')?;

    // Exactly one state character, which must be x/X or a space.
    let (is_done, after_state) = if let Some(rest) = checkbox
        .strip_prefix('x')
        .or_else(|| checkbox.strip_prefix('X'))
    {
        (true, rest)
    } else if let Some(rest) = checkbox.strip_prefix(' ') {
        (false, rest)
    } else {
        return None;
    };

    // Closing bracket, then the task text itself.
    let task = after_state.strip_prefix(']')?.trim();
    (!task.is_empty()).then(|| (is_done, task.to_string()))
}
756
757fn extract_intent_lines(entries: &[&TimelineEntry]) -> Vec<String> {
758    let mut out = Vec::new();
759    let mut seen = HashSet::new();
760
761    for entry in entries {
762        if entry.role.to_lowercase() != "user" {
763            continue;
764        }
765        for line in entry.message.lines().map(str::trim) {
766            if line.is_empty() {
767                continue;
768            }
769            if !is_intent_line(line) {
770                continue;
771            }
772
773            let key = normalize_key(line);
774            if !seen.insert(key) {
775                continue;
776            }
777
778            out.push(truncate_signal_line(line));
779            if out.len() >= MAX_INTENT_LINES {
780                return out;
781            }
782        }
783    }
784
785    out
786}
787
788pub(crate) fn is_intent_line(line: &str) -> bool {
789    let lower = line.to_lowercase();
790    INTENT_KEYWORDS.iter().any(|kw| lower.contains(kw))
791}
792
793fn extract_result_lines(entries: &[&TimelineEntry]) -> Vec<String> {
794    let mut out = Vec::new();
795    let mut seen = HashSet::new();
796
797    for entry in entries {
798        for line in entry.message.lines().map(str::trim) {
799            if line.is_empty() {
800                continue;
801            }
802            if !is_result_line(line) {
803                continue;
804            }
805            let key = normalize_key(line);
806            if !seen.insert(key) {
807                continue;
808            }
809            out.push(truncate_signal_line(line));
810            if out.len() >= MAX_RESULT_LINES {
811                return out;
812            }
813        }
814    }
815
816    out
817}
818
819pub fn is_result_line(line: &str) -> bool {
820    let lower = line.to_lowercase();
821    RESULT_KEYWORDS.iter().any(|kw| lower.contains(kw))
822}
823
/// Canonicalize a line for deduplication: collapse all whitespace runs
/// into single spaces, trim the ends, and lowercase the result.
pub fn normalize_key(s: &str) -> String {
    let mut normalized = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized.to_lowercase()
}
830
831pub fn truncate_signal_line(line: &str) -> String {
832    const MAX_BYTES: usize = 240;
833    if line.len() <= MAX_BYTES {
834        return line.to_string();
835    }
836    truncate_message_bytes(line, MAX_BYTES)
837}
838
/// Case-insensitive match for an "ultrathink" marker anywhere in the line.
fn is_ultrathink_tag(line: &str) -> bool {
    let folded = line.to_lowercase();
    folded.contains("ultrathink")
}
842
/// Match common insight "tag" forms: a leading "Insight", or the
/// decorated variants "★ Insight", "Insight ─", "Insight -"
/// (case-insensitive).
fn is_insight_tag(line: &str) -> bool {
    let lower = line.to_lowercase();
    if lower.starts_with("insight") {
        return true;
    }
    ["★ insight", "insight ─", "insight -"]
        .iter()
        .any(|marker| lower.contains(marker))
}
851
/// Match Plan Mode session transitions and explicit plan
/// accept/approval actions (case-insensitive).
fn is_plan_mode_tag(line: &str) -> bool {
    const MARKERS: [&str; 5] = [
        "plan mode",
        "accept plan",
        "user accepted the plan",
        "approve and bypass permissions",
        "bypass permissions",
    ];
    let lower = line.to_lowercase();
    MARKERS.iter().any(|marker| lower.contains(marker))
}
861
/// Match a skill-activation marker or a known VetCoders skill name
/// (case-insensitive).
fn is_skill_tag(line: &str) -> bool {
    const MARKERS: [&str; 5] = [
        "[skill_enter]",
        "vetcoders-partner",
        "vetcoders-spawn",
        "vetcoders-ownership",
        "vetcoders-workflow",
    ];
    let lower = line.to_lowercase();
    MARKERS.iter().any(|marker| lower.contains(marker))
}
870
/// Match a decision marker: a `[decision]` tag anywhere, or a line
/// starting with `decision:` (case-insensitive).
pub fn is_decision_tag(line: &str) -> bool {
    let lower = line.to_lowercase();
    lower.starts_with("decision:") || lower.contains("[decision]")
}
875
/// Match an outcome marker: a `[skill_outcome]` tag anywhere, or a
/// line starting with `outcome:` / `validation:` (case-insensitive).
pub fn is_outcome_tag(line: &str) -> bool {
    let lower = line.to_lowercase();
    if lower.contains("[skill_outcome]") {
        return true;
    }
    ["outcome:", "validation:"]
        .iter()
        .any(|prefix| lower.starts_with(prefix))
}
882
/// Collect short "tag blocks" from all messages.
///
/// A block starts at any trimmed line matching `is_tag` and extends
/// over the following non-empty lines until a blank line, another tag
/// line, or `MAX_TAG_BLOCK_LINES` total lines. Blocks are joined with
/// spaces, deduplicated by `normalize_key`, length-capped via
/// `truncate_signal_line`, and limited to `max_blocks` overall.
fn extract_tag_blocks(
    entries: &[&TimelineEntry],
    is_tag: fn(&str) -> bool,
    max_blocks: usize,
) -> Vec<String> {
    let mut out = Vec::new();
    let mut seen = HashSet::new();

    for entry in entries {
        // Collected so the lookahead below can index past position i.
        let lines: Vec<&str> = entry.message.lines().collect();
        for (i, raw) in lines.iter().enumerate() {
            let line = raw.trim();
            if line.is_empty() || !is_tag(line) {
                continue;
            }

            // Gather the tag line plus its continuation lines.
            let mut block = Vec::new();
            block.push(line);

            for raw_next in lines.iter().skip(i + 1) {
                let next = raw_next.trim();
                if next.is_empty() {
                    break;
                }
                // A new tag line starts its own block (it will also be
                // visited by the outer loop).
                if is_tag(next) {
                    break;
                }
                block.push(next);
                if block.len() >= MAX_TAG_BLOCK_LINES {
                    break;
                }
            }

            let joined = block.join(" ");
            // Dedupe whitespace- and case-insensitively.
            let key = normalize_key(&joined);
            if !seen.insert(key) {
                continue;
            }

            out.push(truncate_signal_line(&joined));
            if out.len() >= max_blocks {
                return out;
            }
        }
    }

    out
}
931
932fn format_signals_block(signals: &ChunkSignals, highlights: &[String]) -> Option<String> {
933    let has_any = !signals.todo_open.is_empty()
934        || !signals.todo_done.is_empty()
935        || !signals.ultrathink.is_empty()
936        || !signals.insights.is_empty()
937        || !signals.plan_mode.is_empty()
938        || !signals.intents.is_empty()
939        || !signals.results.is_empty()
940        || !signals.skills.is_empty()
941        || !signals.decisions.is_empty()
942        || !signals.outcomes.is_empty()
943        || !highlights.is_empty();
944    if !has_any {
945        return None;
946    }
947
948    let mut out = String::new();
949    out.push_str("[signals]\n");
950
951    if !signals.skills.is_empty() {
952        out.push_str("=== SKILL ENTER ===\n");
953        for line in &signals.skills {
954            out.push_str(&format!("{}\n", line));
955        }
956        out.push_str("===================\n");
957    }
958
959    if !signals.todo_open.is_empty() || !signals.todo_done.is_empty() {
960        if !signals.todo_open.is_empty() {
961            out.push_str(&format!(
962                "RED LIGHT: checklist detected (open: {}, done: {})\n",
963                signals.todo_open.len(),
964                signals.todo_done.len()
965            ));
966        } else {
967            out.push_str(&format!(
968                "Checklist detected (open: 0, done: {})\n",
969                signals.todo_done.len()
970            ));
971        }
972
973        for task in signals.todo_open.iter().take(MAX_TODO_ITEMS) {
974            out.push_str(&format!("- [ ] {}\n", task));
975        }
976        if signals.todo_open.len() > MAX_TODO_ITEMS {
977            out.push_str(&format!(
978                "... (+{} more open)\n",
979                signals.todo_open.len() - MAX_TODO_ITEMS
980            ));
981        }
982
983        for task in signals.todo_done.iter().take(MAX_TODO_ITEMS) {
984            out.push_str(&format!("- [x] {}\n", task));
985        }
986        if signals.todo_done.len() > MAX_TODO_ITEMS {
987            out.push_str(&format!(
988                "... (+{} more done)\n",
989                signals.todo_done.len() - MAX_TODO_ITEMS
990            ));
991        }
992    }
993
994    if !signals.ultrathink.is_empty() {
995        out.push_str("Ultrathink:\n");
996        for line in &signals.ultrathink {
997            out.push_str(&format!("- {}\n", line));
998        }
999    }
1000
1001    if !signals.insights.is_empty() {
1002        out.push_str("Insight:\n");
1003        for line in &signals.insights {
1004            out.push_str(&format!("- {}\n", line));
1005        }
1006    }
1007
1008    if !signals.plan_mode.is_empty() {
1009        out.push_str("Plan mode:\n");
1010        for line in &signals.plan_mode {
1011            out.push_str(&format!("- {}\n", line));
1012        }
1013    }
1014
1015    if !signals.intents.is_empty() {
1016        out.push_str("Intent:\n");
1017        for line in &signals.intents {
1018            out.push_str(&format!("- {}\n", line));
1019        }
1020    }
1021
1022    if !signals.decisions.is_empty() {
1023        out.push_str("Decision:\n");
1024        for line in &signals.decisions {
1025            out.push_str(&format!("- {}\n", line));
1026        }
1027    }
1028
1029    if !signals.results.is_empty() {
1030        out.push_str("Results:\n");
1031        for line in &signals.results {
1032            out.push_str(&format!("- {}\n", line));
1033        }
1034    }
1035
1036    if !signals.outcomes.is_empty() {
1037        out.push_str("Outcome:\n");
1038        for line in &signals.outcomes {
1039            out.push_str(&format!("- {}\n", line));
1040        }
1041    }
1042
1043    if !highlights.is_empty() {
1044        out.push_str("Notes:\n");
1045        for line in highlights {
1046            out.push_str(&format!("- {}\n", truncate_signal_line(line)));
1047        }
1048    }
1049
1050    out.push_str("[/signals]\n");
1051    Some(out)
1052}
1053
/// Cut `message` down to at most `max_bytes` bytes, backing up to the nearest
/// UTF-8 character boundary so the slice never panics, then append a
/// `...[truncated]` marker.
///
/// The marker is appended unconditionally, even when the message already fits
/// within `max_bytes`.
fn truncate_message_bytes(message: &str, max_bytes: usize) -> String {
    let limit = max_bytes.min(message.len());
    // Scan backwards for a valid char boundary; index 0 always qualifies,
    // and a multi-byte char forces at most 3 steps back.
    let cutoff = (0..=limit)
        .rev()
        .find(|&idx| message.is_char_boundary(idx))
        .unwrap_or(0);

    let mut truncated = String::with_capacity(cutoff + 14);
    truncated.push_str(&message[..cutoff]);
    truncated.push_str("...[truncated]");
    truncated
}
1064
1065// ============================================================================
1066// File output
1067// ============================================================================
1068
1069/// Write chunks as individual .txt files to a directory.
1070///
1071/// Each file is named `{chunk.id}.txt`. Returns paths of written files.
1072pub fn write_chunks_to_dir(chunks: &[Chunk], dir: &Path) -> Result<Vec<PathBuf>> {
1073    fs::create_dir_all(dir)?;
1074
1075    let mut paths = Vec::new();
1076
1077    for chunk in chunks {
1078        let filename = format!("{}.txt", chunk.id);
1079        let path = dir.join(&filename);
1080        fs::write(&path, &chunk.text)?;
1081        let sidecar_path = dir.join(format!("{}.meta.json", chunk.id));
1082        let sidecar = ChunkMetadataSidecar::from(chunk);
1083        fs::write(&sidecar_path, serde_json::to_vec_pretty(&sidecar)?)?;
1084        paths.push(path);
1085    }
1086
1087    Ok(paths)
1088}
1089
1090/// Summary of chunking results.
1091pub fn chunk_summary(chunks: &[Chunk]) -> String {
1092    if chunks.is_empty() {
1093        return "No chunks generated.".to_string();
1094    }
1095
1096    let total_tokens: usize = chunks.iter().map(|c| c.token_estimate).sum();
1097    let avg_tokens = total_tokens / chunks.len();
1098    let dates: Vec<&str> = chunks
1099        .iter()
1100        .map(|c| c.date.as_str())
1101        .collect::<std::collections::HashSet<_>>()
1102        .into_iter()
1103        .collect();
1104
1105    format!(
1106        "{} chunks, {} total tokens (avg {}), {} days",
1107        chunks.len(),
1108        total_tokens,
1109        avg_tokens,
1110        dates.len(),
1111    )
1112}
1113
1114// ============================================================================
1115// Tests
1116// ============================================================================
1117
#[cfg(test)]
mod tests {
    //! Unit tests for the chunker: token estimation, window splitting and
    //! overlap, chunk text formatting (including UTF-8-safe truncation),
    //! frontmatter telemetry extraction, signal-block rendering, and file
    //! output with the `.meta.json` sidecar round-trip.

    use super::*;
    use chrono::{TimeZone, Utc};

    /// Build a minimal `TimelineEntry` stamped 2026-01-22 at `hour:min`,
    /// with fixed agent/session and no branch/cwd/frame metadata.
    fn make_entry(hour: u32, min: u32, role: &str, msg: &str) -> TimelineEntry {
        TimelineEntry {
            timestamp: Utc.with_ymd_and_hms(2026, 1, 22, hour, min, 0).unwrap(),
            agent: "claude".to_string(),
            session_id: "sess-1".to_string(),
            role: role.to_string(),
            message: msg.to_string(),
            frame_kind: None,
            branch: None,
            cwd: None,
        }
    }

    /// Tokens are estimated as ceil(chars / 4).
    #[test]
    fn test_estimate_tokens() {
        assert_eq!(estimate_tokens(""), 0);
        assert_eq!(estimate_tokens("hi"), 1); // 2 chars → ceil(2/4) = 1
        assert_eq!(estimate_tokens("hello world"), 3); // 11 chars → ceil(11/4) = 3
        assert_eq!(estimate_tokens("1234"), 1); // exactly 4 chars = 1 token
        assert_eq!(estimate_tokens("12345"), 2); // 5 chars → 2 tokens
    }

    /// No entries → no chunks.
    #[test]
    fn test_chunk_entries_empty() {
        let config = ChunkerConfig::default();
        let chunks = chunk_entries(&[], "proj", "claude", &config);
        assert!(chunks.is_empty());
    }

    /// A single short message yields exactly one chunk carrying the
    /// project/agent/date metadata and the message text.
    #[test]
    fn test_chunk_entries_single_message() {
        let entries = vec![make_entry(14, 0, "user", "short message")];
        let config = ChunkerConfig::default();
        let chunks = chunk_entries(&entries, "proj", "claude", &config);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].project, "proj");
        assert_eq!(chunks[0].agent, "claude");
        assert_eq!(chunks[0].date, "2026-01-22");
        assert!(chunks[0].text.contains("short message"));
    }

    /// Enough content to exceed the target budget splits into multiple
    /// chunks with sequential 1-based `_NNN` IDs.
    #[test]
    fn test_chunk_entries_basic() {
        // Create 10 entries with ~200 chars each → ~500 tokens total
        // With target=150 tokens, should get multiple chunks
        let entries: Vec<TimelineEntry> = (0..10)
            .map(|i| make_entry(14, i as u32, "user", &"x".repeat(200)))
            .collect();

        let config = ChunkerConfig {
            target_tokens: 150,
            min_tokens: 50,
            max_tokens: 300,
            overlap_messages: 2,
        };

        let chunks = chunk_entries(&entries, "proj", "claude", &config);
        assert!(
            chunks.len() > 1,
            "Expected multiple chunks, got {}",
            chunks.len()
        );

        // Verify sequential IDs
        for (i, chunk) in chunks.iter().enumerate() {
            assert!(chunk.id.contains(&format!("{:03}", i + 1)));
        }
    }

    /// An oversized single message cannot be split by the window logic,
    /// but its text is byte-truncated during formatting.
    #[test]
    fn test_chunk_entries_respects_max_tokens() {
        // One very long message
        let entries = vec![make_entry(14, 0, "user", &"x".repeat(20000))];
        let config = ChunkerConfig {
            target_tokens: 1500,
            min_tokens: 500,
            max_tokens: 2500,
            overlap_messages: 2,
        };

        let chunks = chunk_entries(&entries, "proj", "claude", &config);
        // Single long message can't be split within chunker (it's per-message)
        // but format_chunk_text truncates at 4000 bytes
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text.contains("[truncated]"));
    }

    /// Entries from different calendar days never share a chunk.
    #[test]
    fn test_chunk_entries_groups_by_date() {
        let entries = vec![
            TimelineEntry {
                timestamp: Utc.with_ymd_and_hms(2026, 1, 20, 10, 0, 0).unwrap(),
                agent: "claude".to_string(),
                session_id: "s1".to_string(),
                role: "user".to_string(),
                message: "day one".to_string(),
                frame_kind: None,
                branch: None,
                cwd: None,
            },
            TimelineEntry {
                timestamp: Utc.with_ymd_and_hms(2026, 1, 21, 10, 0, 0).unwrap(),
                agent: "claude".to_string(),
                session_id: "s2".to_string(),
                role: "user".to_string(),
                message: "day two".to_string(),
                frame_kind: None,
                branch: None,
                cwd: None,
            },
        ];

        let config = ChunkerConfig::default();
        let chunks = chunk_entries(&entries, "proj", "claude", &config);

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].date, "2026-01-20");
        assert_eq!(chunks[1].date, "2026-01-21");
    }

    /// Chunk text starts with a metadata header line and renders each
    /// message as `[HH:MM:SS] role: message`.
    #[test]
    fn test_format_chunk_text() {
        let entries = [
            make_entry(14, 30, "user", "hello"),
            make_entry(14, 31, "assistant", "hi there"),
        ];
        let refs: Vec<&TimelineEntry> = entries.iter().collect();

        let text = format_chunk_text(&refs, "TestProj", "claude", "2026-01-22");

        assert!(text.starts_with("[project: TestProj | agent: claude | date: 2026-01-22]"));
        assert!(text.contains("[14:30:00] user: hello"));
        assert!(text.contains("[14:31:00] assistant: hi there"));
    }

    /// Truncation backs off to a char boundary instead of slicing through
    /// a multi-byte character (which would panic).
    #[test]
    fn test_format_chunk_text_truncates_utf8_safely() {
        let mut msg = "a".repeat(3999);
        msg.push('é'); // 2-byte char forces non-boundary at 4000
        let entries = [make_entry(14, 30, "user", &msg)];
        let refs: Vec<&TimelineEntry> = entries.iter().collect();

        let text = format_chunk_text(&refs, "TestProj", "claude", "2026-01-22");

        assert!(text.contains("[truncated]"));
        assert!(!text.contains('é'));
    }

    /// A YAML frontmatter block is parsed into the chunk's telemetry
    /// fields and stripped from the chunk text.
    #[test]
    fn test_chunk_entries_extracts_frontmatter_telemetry() {
        let entries = vec![make_entry(
            14,
            30,
            "assistant",
            "---\nrun_id: mrbl-001\nprompt_id: api-redesign_20260327\nmodel: gpt-5.4\nstarted_at: 2026-03-27T10:00:00Z\ncompleted_at: 2026-03-27T10:01:00Z\ntoken_usage: 1234\nfindings_count: 4\nframe_kind: agent_reply\nphase: implement\nmode: session-first\nskill_code: vc-workflow\nframework_version: 2026-03\n---\n## Report\nContent here",
        )];

        let chunks = chunk_entries(&entries, "proj", "claude", &ChunkerConfig::default());
        assert_eq!(chunks.len(), 1);

        let chunk = &chunks[0];
        assert_eq!(chunk.run_id.as_deref(), Some("mrbl-001"));
        assert_eq!(chunk.prompt_id.as_deref(), Some("api-redesign_20260327"));
        assert_eq!(chunk.agent_model.as_deref(), Some("gpt-5.4"));
        assert_eq!(chunk.started_at.as_deref(), Some("2026-03-27T10:00:00Z"));
        assert_eq!(chunk.completed_at.as_deref(), Some("2026-03-27T10:01:00Z"));
        assert_eq!(chunk.token_usage, Some(1234));
        assert_eq!(chunk.findings_count, Some(4));
        assert_eq!(chunk.frame_kind, Some(FrameKind::AgentReply));
        assert_eq!(chunk.workflow_phase.as_deref(), Some("implement"));
        assert_eq!(chunk.mode.as_deref(), Some("session-first"));
        assert_eq!(chunk.skill_code.as_deref(), Some("vc-workflow"));
        assert_eq!(chunk.framework_version.as_deref(), Some("2026-03"));
        assert!(chunk.text.contains("## Report"));
        assert!(!chunk.text.contains("run_id: mrbl-001"));
        assert!(!chunk.text.contains("phase: implement"));
    }

    /// Malformed frontmatter is still stripped from the text, but yields
    /// no telemetry fields; the body after the block survives.
    #[test]
    fn test_chunk_entries_strip_malformed_frontmatter_without_metadata() {
        let entries = vec![make_entry(
            14,
            30,
            "assistant",
            "---\nrun_id: [nope\nmode: session-first\n---\n## Report\nBody survives",
        )];

        let chunks = chunk_entries(&entries, "proj", "claude", &ChunkerConfig::default());
        assert_eq!(chunks.len(), 1);

        let chunk = &chunks[0];
        assert_eq!(chunk.run_id, None);
        assert_eq!(chunk.mode, None);
        assert!(chunk.text.contains("## Report"));
        assert!(chunk.text.contains("Body survives"));
        assert!(!chunk.text.contains("mode: session-first"));
    }

    /// Writes `.txt` bodies and `.meta.json` sidecars, and checks that a
    /// sidecar round-trips through serde — including a legacy payload
    /// missing the newer optional fields.
    #[test]
    fn test_write_chunks_to_dir() {
        let tmp = std::env::temp_dir().join("ai-ctx-chunker-test");
        let _ = fs::remove_dir_all(&tmp);

        let chunks = vec![
            Chunk {
                id: "proj_claude_2026-01-22_001".to_string(),
                project: "proj".to_string(),
                agent: "claude".to_string(),
                date: "2026-01-22".to_string(),
                session_id: "s1".to_string(),
                cwd: Some("/Users/tester/workspaces/proj".to_string()),
                kind: Kind::Conversations,
                frame_kind: Some(FrameKind::UserMsg),
                run_id: None,
                prompt_id: None,
                agent_model: None,
                started_at: None,
                completed_at: None,
                token_usage: None,
                findings_count: None,
                workflow_phase: Some("implement".to_string()),
                mode: Some("session-first".to_string()),
                skill_code: Some("vc-workflow".to_string()),
                framework_version: Some("2026-03".to_string()),
                msg_range: (0, 5),
                text: "chunk one content".to_string(),
                token_estimate: 4,
                highlights: vec![],
            },
            Chunk {
                id: "proj_claude_2026-01-22_002".to_string(),
                project: "proj".to_string(),
                agent: "claude".to_string(),
                date: "2026-01-22".to_string(),
                session_id: "s1".to_string(),
                cwd: None,
                kind: Kind::Conversations,
                frame_kind: None,
                run_id: None,
                prompt_id: None,
                agent_model: None,
                started_at: None,
                completed_at: None,
                token_usage: None,
                findings_count: None,
                workflow_phase: None,
                mode: None,
                skill_code: None,
                framework_version: None,
                msg_range: (3, 8),
                text: "chunk two content".to_string(),
                token_estimate: 4,
                highlights: vec![],
            },
        ];

        let paths = write_chunks_to_dir(&chunks, &tmp).unwrap();
        assert_eq!(paths.len(), 2);
        assert!(paths[0].exists());
        assert!(paths[1].exists());

        let content = fs::read_to_string(&paths[0]).unwrap();
        assert_eq!(content, "chunk one content");

        let sidecar = fs::read_to_string(tmp.join("proj_claude_2026-01-22_001.meta.json")).unwrap();
        let metadata: ChunkMetadataSidecar = serde_json::from_str(&sidecar).unwrap();
        assert_eq!(metadata.project, "proj");
        assert_eq!(metadata.agent, "claude");
        assert_eq!(metadata.date, "2026-01-22");
        assert_eq!(
            metadata.cwd.as_deref(),
            Some("/Users/tester/workspaces/proj")
        );
        assert_eq!(metadata.kind, Kind::Conversations);
        assert_eq!(metadata.frame_kind, Some(FrameKind::UserMsg));
        assert_eq!(metadata.workflow_phase.as_deref(), Some("implement"));
        assert_eq!(metadata.mode.as_deref(), Some("session-first"));
        assert_eq!(metadata.skill_code.as_deref(), Some("vc-workflow"));
        assert_eq!(metadata.framework_version.as_deref(), Some("2026-03"));

        // Older sidecars lack the newer optional fields; they must still
        // deserialize with those fields defaulting to None.
        let legacy: ChunkMetadataSidecar = serde_json::from_value(serde_json::json!({
            "id": "legacy",
            "project": "proj",
            "agent": "claude",
            "date": "2026-01-22",
            "session_id": "s1",
            "kind": "conversations",
        }))
        .unwrap();
        assert_eq!(legacy.cwd, None);
        assert_eq!(legacy.frame_kind, None);
        assert_eq!(legacy.workflow_phase, None);
        assert_eq!(legacy.mode, None);
        assert_eq!(legacy.skill_code, None);
        assert_eq!(legacy.framework_version, None);

        let _ = fs::remove_dir_all(&tmp);
    }

    /// Consecutive chunks share trailing/leading messages per
    /// `overlap_messages`.
    #[test]
    fn test_overlap_messages() {
        // 8 entries with short messages (~22 tokens each incl. header)
        // target=80 → ~4 messages per window, overlap=2 → windows share 2 messages
        let entries: Vec<TimelineEntry> = (0..8)
            .map(|i| make_entry(14, i as u32, "user", &format!("msg_{}", i)))
            .collect();

        let config = ChunkerConfig {
            target_tokens: 80,
            min_tokens: 20,
            max_tokens: 200,
            overlap_messages: 2,
        };

        let chunks = chunk_entries(&entries, "p", "c", &config);

        // With overlap=2, consecutive chunks should share messages
        if chunks.len() >= 2 {
            // Verify ranges overlap (overlap=2 means last 2 msgs of chunk N start chunk N+1)
            let (_, end1) = chunks[0].msg_range;
            let (start2, _) = chunks[1].msg_range;
            assert!(
                start2 < end1,
                "Expected overlap: chunk1 ends at {}, chunk2 starts at {}",
                end1,
                start2
            );
        }
    }

    /// ID follows the documented `{project}_{agent}_{date}_{seq:03}` shape.
    #[test]
    fn test_chunk_id_format() {
        let entries = vec![make_entry(10, 0, "user", "test")];
        let config = ChunkerConfig::default();
        let chunks = chunk_entries(&entries, "MyProject", "gemini", &config);

        assert_eq!(chunks[0].id, "MyProject_gemini_2026-01-22_001");
    }

    /// Summary string reports chunk count, summed tokens, and distinct days.
    #[test]
    fn test_chunk_summary() {
        let chunks = vec![
            Chunk {
                id: "a".to_string(),
                project: "p".to_string(),
                agent: "c".to_string(),
                date: "2026-01-20".to_string(),
                session_id: "s".to_string(),
                cwd: None,
                kind: Kind::Conversations,
                frame_kind: None,
                run_id: None,
                prompt_id: None,
                agent_model: None,
                started_at: None,
                completed_at: None,
                token_usage: None,
                findings_count: None,
                workflow_phase: None,
                mode: None,
                skill_code: None,
                framework_version: None,
                msg_range: (0, 5),
                text: "x".repeat(100),
                token_estimate: 25,
                highlights: vec![],
            },
            Chunk {
                id: "b".to_string(),
                project: "p".to_string(),
                agent: "c".to_string(),
                date: "2026-01-21".to_string(),
                session_id: "s".to_string(),
                cwd: None,
                kind: Kind::Conversations,
                frame_kind: None,
                run_id: None,
                prompt_id: None,
                agent_model: None,
                started_at: None,
                completed_at: None,
                token_usage: None,
                findings_count: None,
                workflow_phase: None,
                mode: None,
                skill_code: None,
                framework_version: None,
                msg_range: (5, 10),
                text: "y".repeat(200),
                token_estimate: 50,
                highlights: vec![],
            },
        ];

        let summary = chunk_summary(&chunks);
        assert!(summary.contains("2 chunks"));
        assert!(summary.contains("75 total tokens"));
        assert!(summary.contains("2 days"));
    }

    /// Only keyword-bearing lines (Decision/TODO/KEY) become highlights;
    /// ordinary chatter is dropped.
    #[test]
    fn test_extract_highlights_filters_keywords() {
        let entries = [
            make_entry(10, 0, "user", "Decision: lock chunking heuristics"),
            make_entry(10, 1, "assistant", "Just chatting"),
            make_entry(10, 2, "user", "TODO: add summarization notes"),
            make_entry(10, 3, "user", "KEY architectural choice"),
        ];
        let refs: Vec<&TimelineEntry> = entries.iter().collect();

        let highlights = extract_highlights(&refs);
        assert_eq!(
            highlights,
            vec![
                "Decision: lock chunking heuristics",
                "TODO: add summarization notes",
                "KEY architectural choice"
            ]
        );
    }

    /// A message containing checklist/plan/insight/intent markers produces
    /// a populated `[signals]` block in the formatted chunk text.
    #[test]
    fn test_format_chunk_text_includes_signals_for_checklist_and_intent() {
        let entries = [make_entry(
            14,
            30,
            "user",
            "No i tutaj mam taki pomysł, żeby to zrobić\nPlan mode: enabled\nUser accepted the plan\nUltrathink:\n- [ ] pierwsza rzecz\n- [x] druga rzecz\n\n★ Insight ─ to działa",
        )];
        let refs: Vec<&TimelineEntry> = entries.iter().collect();

        let text = format_chunk_text(&refs, "TestProj", "claude", "2026-01-22");

        assert!(text.contains("[signals]"));
        assert!(text.contains("RED LIGHT: checklist detected (open: 1, done: 1)"));
        assert!(text.contains("- [ ] pierwsza rzecz"));
        assert!(text.contains("- [x] druga rzecz"));
        assert!(text.contains("Ultrathink:"));
        assert!(text.contains("- Ultrathink:"));
        assert!(text.contains("Insight:"));
        assert!(text.contains("- ★ Insight ─ to działa"));
        assert!(text.contains("Plan mode:"));
        assert!(text.contains("- Plan mode: enabled"));
        assert!(text.contains("- User accepted the plan"));
        assert!(text.contains("Intent:"));
        assert!(text.contains("No i tutaj mam taki pomysł, żeby to zrobić"));
        assert!(text.contains("[/signals]"));
    }
}