Skip to main content

nvs_core/chunker/
annotate.rs

1use super::TokenCounter;
2
/// Structural category assigned to a single line of text.
///
/// Variant order is significant for `as` casts (`Normal` is `0`,
/// `CodeBlock` is `5`), so new variants should be appended at the end.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineType {
    /// Ordinary text that matches none of the other categories.
    Normal,
    /// A `#`-prefixed heading of level 1 or 2.
    MajorHeading,
    /// A `#`-prefixed heading of level 3 or deeper.
    MinorHeading,
    /// A bullet entry introduced by `-`, `*` or `+`.
    ListItem,
    /// A line that is empty once surrounding whitespace is removed.
    Blank,
    /// A backtick code-fence delimiter line.
    CodeBlock,
}
12
13#[derive(Clone, Debug)]
14pub struct AnnotatedLine {
15    pub text: String,
16    pub line_type: LineType,
17    pub tokens: usize,
18    pub page: i32,
19    pub heading_level: i32,
20}
21
22fn detect_line_type(line: &str) -> (LineType, i32) {
23    let s = line.trim();
24    if s.is_empty() {
25        return (LineType::Blank, 0);
26    }
27    // markdown-style headings
28    if let Some(stripped) = s.strip_prefix('#') {
29        let mut level = 1;
30        let mut rest = stripped;
31        while let Some(r) = rest.strip_prefix('#') {
32            level += 1;
33            rest = r;
34        }
35        if level <= 2 {
36            return (LineType::MajorHeading, level as i32);
37        }
38        return (LineType::MinorHeading, level as i32);
39    }
40    // list items
41    if s.starts_with('-') || s.starts_with('*') || s.starts_with('+') {
42        return (LineType::ListItem, 0);
43    }
44    if s.chars().all(|c| c == '`') {
45        return (LineType::CodeBlock, 0);
46    }
47    (LineType::Normal, 0)
48}
49
50pub fn annotate_lines(pages: &[(String, i32)], tokenizer: &dyn TokenCounter) -> Vec<AnnotatedLine> {
51    let mut out = Vec::new();
52    for (text, page) in pages {
53        for line in text.split('\n') {
54            let (lt, lvl) = detect_line_type(line);
55            let tokens = tokenizer.count_tokens(line);
56            out.push(AnnotatedLine {
57                text: line.to_string(),
58                line_type: lt,
59                tokens,
60                page: *page,
61                heading_level: lvl,
62            });
63        }
64    }
65    out
66}