Skip to main content

nvs_core/chunker/
mod.rs

1// Shared chunker: line annotation -> semantic grouping -> packing -> overlap -> merge/split
2// This module was moved from nvs-pdf to nvs-core to be reused by HTML and PDF pipelines.
3
4use std::time::Instant;
5
6mod annotate;
7mod group;
8mod merge;
9mod overlap;
10mod pack;
11mod split;
12
13pub use annotate::{annotate_lines, AnnotatedLine, LineType};
14pub use group::{group_semantic_units, SemanticUnit};
15pub use merge::{final_merge, merge_small_chunks};
16pub use overlap::add_overlap;
17pub use pack::pack_initial_chunks;
18pub use split::split_oversized;
19
/// Abstraction over token counting, so the tokenizer implementation can be
/// swapped later without touching the chunking stages.
///
/// The chunker only ever uses this through `&dyn TokenCounter`.
pub trait TokenCounter {
    /// Returns the number of tokens this counter assigns to `text`.
    fn count_tokens(&self, text: &str) -> usize;
}
24
25// Implement TokenCounter for the GreedyTokenizer from the tokenmonster crate
26impl TokenCounter for tokenmonster::GreedyTokenizer {
27    fn count_tokens(&self, text: &str) -> usize {
28        self.count_tokens(text)
29    }
30}
31
32// Allow passing a global Lazy<GreedyTokenizer> directly (used by CLI bins)
33impl TokenCounter for once_cell::sync::Lazy<tokenmonster::GreedyTokenizer> {
34    fn count_tokens(&self, text: &str) -> usize {
35        tokenmonster::GreedyTokenizer::count_tokens(&*self, text)
36    }
37}
38
/// One chunk produced by the chunking pipeline.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Text content of the chunk.
    pub text: String,
    /// Token count recorded for `text`.
    // NOTE(review): presumably measured with the `TokenCounter` handed to the
    // pipeline — confirm in the stage modules.
    pub token_count: usize,
    /// First page covered by the chunk.
    // NOTE(review): presumably the page index supplied with the input pages — confirm.
    pub start_page: i32,
    /// Last page covered by the chunk.
    // NOTE(review): inclusive/exclusive semantics are not visible here — confirm.
    pub end_page: i32,
    /// Whether the chunk contains a major heading.
    // NOTE(review): what counts as "major" is decided in the stage modules — confirm.
    pub has_major_heading: bool,
    /// Smallest heading level found in the chunk.
    // NOTE(review): the value used when the chunk has no heading is not visible here.
    pub min_heading_level: i32,
}
48
/// Size limits that steer the chunking pipeline.
#[derive(Debug, Clone)]
pub struct ChunkOptions {
    /// Upper token bound, passed to the packing, merge, split and final-merge stages.
    pub max_tokens: usize,
    /// Lower token bound, passed to the merge and final-merge stages.
    pub min_tokens: usize,
    /// Token budget passed to the overlap stage.
    pub overlap_tokens: usize,
}

impl Default for ChunkOptions {
    /// Defaults: `max_tokens = 256`, `min_tokens = 150`, `overlap_tokens = 50`.
    fn default() -> Self {
        ChunkOptions {
            overlap_tokens: 50,
            min_tokens: 150,
            max_tokens: 256,
        }
    }
}
65
66/// Chunk a list of (text, page_index) into semantic chunks.
67///
68/// Example
69/// ```
70/// use nvs_core::chunker::{chunk_pages, ChunkOptions};
71/// let pages = vec![
72///     ("# Title\nIntro paragraph.".to_string(), 0),
73///     ("## Section\nSome content here.".to_string(), 1),
74/// ];
75/// let tok = tokenmonster::GreedyTokenizer::from_cl100k_bin();
76/// let opts = ChunkOptions { max_tokens: 64, min_tokens: 8, overlap_tokens: 4 };
77/// let chunks = chunk_pages(&pages, &tok, &opts);
78/// assert!(!chunks.is_empty());
79/// assert!(chunks[0].token_count > 0);
80/// ```
81pub fn chunk_pages(
82    pages: &[(String, i32)],
83    tokenizer: &dyn TokenCounter,
84    opts: &ChunkOptions,
85) -> Vec<Chunk> {
86    if pages.is_empty() {
87        return Vec::new();
88    }
89    // Pre-filter: strip repeated headers/footers that appear on most pages.
90    let pages = strip_repeated_headers_footers(pages);
91    let annotated = annotate_lines(&pages, tokenizer);
92    let semantic_units = group_semantic_units(&annotated);
93    let mut tmp = pack_initial_chunks(&semantic_units, opts.max_tokens);
94    add_overlap(&mut tmp, opts.overlap_tokens, tokenizer);
95    tmp = merge_small_chunks(tmp, opts.min_tokens, opts.max_tokens);
96    tmp = split_oversized(tmp, opts.max_tokens, tokenizer);
97    let chunks = final_merge(tmp, opts.min_tokens, opts.max_tokens);
98    chunks
99}
100
// Heuristic: remove the header/footer line that repeats across a large
// fraction of pages (e.g., journal running heads and footers). This improves
// RAG quality by reducing noise and duplication.
//
// For every page the first and last non-empty lines (trimmed) are the header
// and footer candidates. A candidate is removed from the pages it appears on
// when it:
//   * is longer than 3 bytes,
//   * is not mostly numeric (see `is_mostly_numeric`), and
//   * is the header (resp. footer) of at least 60% of the pages.
//
// With fewer than two pages nothing can repeat, so the input is returned
// unchanged; otherwise a single-page document would lose its own first and
// last lines.
fn strip_repeated_headers_footers(pages: &[(String, i32)]) -> Vec<(String, i32)> {
    use std::collections::HashMap;

    // First and last non-empty line of `text`, trimmed. Both are the same line
    // when the page has exactly one non-empty line, and `None` when it has none.
    fn edge_lines(text: &str) -> (Option<&str>, Option<&str>) {
        let mut non_empty = text
            .split('\n')
            .map(str::trim)
            .filter(|line| !line.is_empty());
        let first = non_empty.next();
        let last = non_empty.next_back().or(first);
        (first, last)
    }

    // The candidate that reaches `threshold` and is not mostly numeric. Because
    // the threshold is at least 60% of the pages, at most one line can qualify.
    fn dominant_line<'a>(
        counts: &std::collections::HashMap<&'a str, usize>,
        threshold: usize,
    ) -> Option<&'a str> {
        counts
            .iter()
            .filter(|&(line, &count)| count >= threshold && !is_mostly_numeric(line))
            .max_by_key(|&(_, &count)| count)
            .map(|(&line, _)| line)
    }

    if pages.len() < 2 {
        return pages.to_vec();
    }

    let edges: Vec<(Option<&str>, Option<&str>)> =
        pages.iter().map(|(text, _)| edge_lines(text)).collect();

    let mut head_counts: HashMap<&str, usize> = HashMap::new();
    let mut foot_counts: HashMap<&str, usize> = HashMap::new();
    for &(head, foot) in &edges {
        // Very short lines (<= 3 bytes) are never treated as headers/footers.
        if let Some(line) = head.filter(|line| line.len() > 3) {
            *head_counts.entry(line).or_insert(0) += 1;
        }
        if let Some(line) = foot.filter(|line| line.len() > 3) {
            *foot_counts.entry(line).or_insert(0) += 1;
        }
    }

    // ceil(60% of pages), in integer arithmetic: the f32 product rounds up for
    // some page counts (25 pages gave a threshold of 16 instead of 15).
    let threshold = (pages.len() * 3 + 4) / 5;
    let common_head = dominant_line(&head_counts, threshold);
    let common_foot = dominant_line(&foot_counts, threshold);

    pages
        .iter()
        .zip(&edges)
        .map(|((text, page), &(head, foot))| {
            let mut lines: Vec<&str> = text.split('\n').collect();
            if let (Some(common), Some(own)) = (common_head, head) {
                if own == common {
                    // The header is the first line whose trimmed form matches.
                    if let Some(idx) = lines.iter().position(|line| line.trim() == own) {
                        lines.remove(idx);
                    }
                }
            }
            if let (Some(common), Some(own)) = (common_foot, foot) {
                if own == common {
                    // The footer is the last line whose trimmed form matches.
                    if let Some(idx) = lines.iter().rposition(|line| line.trim() == own) {
                        lines.remove(idx);
                    }
                }
            }
            (lines.join("\n"), *page)
        })
        .collect()
}

// True when the trimmed string is empty or at least half of its characters
// are ASCII digits.
fn is_mostly_numeric(s: &str) -> bool {
    let trimmed = s.trim();
    if trimmed.is_empty() {
        return true;
    }
    let total = trimmed.chars().count();
    let digits = trimmed.chars().filter(char::is_ascii_digit).count();
    digits * 2 >= total // >= 50% digits
}
159
/// Wall-clock timings, in whole milliseconds, reported by
/// [`chunk_pages_with_stats`]. All fields default to zero.
#[derive(Debug, Default, Clone, Copy)]
pub struct ChunkerStats {
    /// Time spent in `annotate_lines`.
    pub annotate_ms: u128,
    /// Time spent in `group_semantic_units`.
    pub group_ms: u128,
    /// Time spent in `pack_initial_chunks`.
    pub pack_ms: u128,
    /// Time spent in `add_overlap`.
    pub overlap_ms: u128,
    /// Time spent in `merge_small_chunks`.
    pub merge_ms: u128,
    /// Time spent in `split_oversized`.
    pub split_ms: u128,
    /// Time spent in `final_merge`.
    pub final_ms: u128,
    /// Time for the whole run, measured from before the first stage until the
    /// stats are assembled.
    pub total_ms: u128,
}
171
172/// Like [`chunk_pages`], but also returns timing breakdowns of each stage.
173///
174/// Example
175/// ```
176/// use nvs_core::chunker::{chunk_pages_with_stats, ChunkOptions};
177/// let pages = vec![("# H\nBody".to_string(), 0)];
178/// let tok = tokenmonster::GreedyTokenizer::from_cl100k_bin();
179/// let opts = ChunkOptions { max_tokens: 64, min_tokens: 8, overlap_tokens: 0 };
180/// let (chunks, stats) = chunk_pages_with_stats(&pages, &tok, &opts);
181/// assert_eq!(chunks.len() > 0, true);
182/// assert!(stats.total_ms >= 0);
183/// ```
184pub fn chunk_pages_with_stats(
185    pages: &[(String, i32)],
186    tokenizer: &dyn TokenCounter,
187    opts: &ChunkOptions,
188) -> (Vec<Chunk>, ChunkerStats) {
189    let t0 = Instant::now();
190    let ta = Instant::now();
191    let annotated = annotate_lines(pages, tokenizer);
192    let ta_ms = ta.elapsed().as_millis();
193
194    let tg = Instant::now();
195    let semantic_units = group_semantic_units(&annotated);
196    let tg_ms = tg.elapsed().as_millis();
197
198    let tp = Instant::now();
199    let mut tmp = pack_initial_chunks(&semantic_units, opts.max_tokens);
200    let tp_ms = tp.elapsed().as_millis();
201
202    let to = Instant::now();
203    add_overlap(&mut tmp, opts.overlap_tokens, tokenizer);
204    let to_ms = to.elapsed().as_millis();
205
206    let tm = Instant::now();
207    tmp = merge_small_chunks(tmp, opts.min_tokens, opts.max_tokens);
208    let tm_ms = tm.elapsed().as_millis();
209
210    let ts = Instant::now();
211    tmp = split_oversized(tmp, opts.max_tokens, tokenizer);
212    let ts_ms = ts.elapsed().as_millis();
213
214    let tf = Instant::now();
215    let chunks = final_merge(tmp, opts.min_tokens, opts.max_tokens);
216    let tf_ms = tf.elapsed().as_millis();
217
218    let stats = ChunkerStats {
219        annotate_ms: ta_ms,
220        group_ms: tg_ms,
221        pack_ms: tp_ms,
222        overlap_ms: to_ms,
223        merge_ms: tm_ms,
224        split_ms: ts_ms,
225        final_ms: tf_ms,
226        total_ms: t0.elapsed().as_millis(),
227    };
228    (chunks, stats)
229}
230
231pub mod json;