1use std::time::Instant;
5
6mod annotate;
7mod group;
8mod merge;
9mod overlap;
10mod pack;
11mod split;
12
13pub use annotate::{annotate_lines, AnnotatedLine, LineType};
14pub use group::{group_semantic_units, SemanticUnit};
15pub use merge::{final_merge, merge_small_chunks};
16pub use overlap::add_overlap;
17pub use pack::pack_initial_chunks;
18pub use split::split_oversized;
19
/// Abstraction over anything that can report how many tokens a piece of text
/// occupies.
///
/// The chunking entry points in this module take a `&dyn TokenCounter`, so any
/// tokenizer can be plugged in by implementing this trait.
pub trait TokenCounter {
    /// Returns the number of tokens in `text`.
    fn count_tokens(&self, text: &str) -> usize;
}
24
25impl TokenCounter for tokenmonster::GreedyTokenizer {
27 fn count_tokens(&self, text: &str) -> usize {
28 self.count_tokens(text)
29 }
30}
31
32impl TokenCounter for once_cell::sync::Lazy<tokenmonster::GreedyTokenizer> {
34 fn count_tokens(&self, text: &str) -> usize {
35 tokenmonster::GreedyTokenizer::count_tokens(&*self, text)
36 }
37}
38
/// One chunk of text, as returned by [`chunk_pages`] and
/// [`chunk_pages_with_stats`].
///
/// The fields are filled in by the pipeline stages in the sibling modules,
/// which are not visible from this file; the notes below marked "presumably"
/// are inferred from the field names and should be confirmed there.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Text content of the chunk.
    pub text: String,
    /// Presumably the number of tokens in `text` as measured by the
    /// [`TokenCounter`] — TODO confirm.
    pub token_count: usize,
    /// Presumably the page number on which the chunk begins.
    pub start_page: i32,
    /// Presumably the page number on which the chunk ends.
    pub end_page: i32,
    /// Presumably set when the chunk contains a major heading.
    pub has_major_heading: bool,
    /// Presumably the smallest heading level found in the chunk.
    pub min_heading_level: i32,
}
48
/// Size parameters for the chunking pipeline.
///
/// All values are token counts and are handed to the individual pipeline
/// stages by [`chunk_pages`] / [`chunk_pages_with_stats`].
#[derive(Debug, Clone)]
pub struct ChunkOptions {
    /// Token budget passed to the packing, merging and splitting stages.
    pub max_tokens: usize,
    /// Token count passed to the two merge stages (`merge_small_chunks` and
    /// `final_merge`).
    pub min_tokens: usize,
    /// Token count passed to `add_overlap`.
    pub overlap_tokens: usize,
}
55
56impl Default for ChunkOptions {
57 fn default() -> Self {
58 Self {
59 max_tokens: 256,
60 min_tokens: 150,
61 overlap_tokens: 50,
62 }
63 }
64}
65
66pub fn chunk_pages(
82 pages: &[(String, i32)],
83 tokenizer: &dyn TokenCounter,
84 opts: &ChunkOptions,
85) -> Vec<Chunk> {
86 if pages.is_empty() {
87 return Vec::new();
88 }
89 let pages = strip_repeated_headers_footers(pages);
91 let annotated = annotate_lines(&pages, tokenizer);
92 let semantic_units = group_semantic_units(&annotated);
93 let mut tmp = pack_initial_chunks(&semantic_units, opts.max_tokens);
94 add_overlap(&mut tmp, opts.overlap_tokens, tokenizer);
95 tmp = merge_small_chunks(tmp, opts.min_tokens, opts.max_tokens);
96 tmp = split_oversized(tmp, opts.max_tokens, tokenizer);
97 let chunks = final_merge(tmp, opts.min_tokens, opts.max_tokens);
98 chunks
99}
100
101fn strip_repeated_headers_footers(pages: &[(String, i32)]) -> Vec<(String, i32)> {
105 use std::collections::HashMap;
106 let mut first_map: HashMap<String, usize> = HashMap::new();
107 let mut last_map: HashMap<String, usize> = HashMap::new();
108 let mut heads: Vec<Option<String>> = Vec::with_capacity(pages.len());
109 let mut foots: Vec<Option<String>> = Vec::with_capacity(pages.len());
110 for (txt, _p) in pages {
111 let mut lines: Vec<&str> = txt.split('\n').collect();
112 let first = lines.iter().find(|l| !l.trim().is_empty()).map(|s| s.trim());
114 let last = lines.iter().rev().find(|l| !l.trim().is_empty()).map(|s| s.trim());
116 heads.push(first.map(|s| s.to_string()));
117 foots.push(last.map(|s| s.to_string()));
118 if let Some(h) = &heads.last().unwrap() {
119 if h.len() > 3 { *first_map.entry(h.clone()).or_insert(0) += 1; }
120 }
121 if let Some(f) = &foots.last().unwrap() {
122 if f.len() > 3 { *last_map.entry(f.clone()).or_insert(0) += 1; }
123 }
124 }
125 let n = pages.len().max(1);
126 let threshold = (n as f32 * 0.6).ceil() as usize; let common_head: Option<String> = first_map
128 .into_iter()
129 .filter(|(s, c)| *c >= threshold && !is_mostly_numeric(s))
130 .max_by_key(|(_, c)| *c)
131 .map(|(s, _)| s);
132 let common_foot: Option<String> = last_map
133 .into_iter()
134 .filter(|(s, c)| *c >= threshold && !is_mostly_numeric(s))
135 .max_by_key(|(_, c)| *c)
136 .map(|(s, _)| s);
137
138 let mut out = Vec::with_capacity(pages.len());
139 for ((txt, p), h) in pages.iter().zip(heads.into_iter()) {
140 let f = foots.remove(0);
141 let mut lines: Vec<&str> = txt.split('\n').collect();
142 if let (Some(ch), Some(hs)) = (&common_head, &h) {
143 if hs == ch { if let Some(idx) = lines.iter().position(|l| l.trim() == hs) { lines.remove(idx); } }
144 }
145 if let (Some(cf), Some(fs)) = (&common_foot, &f) {
146 if fs == cf { if let Some(idx) = lines.iter().rposition(|l| l.trim() == fs) { lines.remove(idx); } }
147 }
148 out.push((lines.join("\n"), *p));
149 }
150 out
151}
152
/// Returns `true` when at least half of the characters of `s` (after trimming
/// surrounding whitespace) are ASCII digits.
///
/// A string that is empty after trimming is also reported as numeric.
fn is_mostly_numeric(s: &str) -> bool {
    // Count digits and total characters in a single pass.
    let (digit_count, char_count) = s
        .trim()
        .chars()
        .fold((0usize, 0usize), |(digits, total), ch| {
            (digits + usize::from(ch.is_ascii_digit()), total + 1)
        });

    char_count == 0 || digit_count * 2 >= char_count
}
159
/// Per-stage timings, in milliseconds, reported by
/// [`chunk_pages_with_stats`].
#[derive(Debug, Default, Clone, Copy)]
pub struct ChunkerStats {
    /// Time spent in `annotate_lines`.
    pub annotate_ms: u128,
    /// Time spent in `group_semantic_units`.
    pub group_ms: u128,
    /// Time spent in `pack_initial_chunks`.
    pub pack_ms: u128,
    /// Time spent in `add_overlap`.
    pub overlap_ms: u128,
    /// Time spent in `merge_small_chunks`.
    pub merge_ms: u128,
    /// Time spent in `split_oversized`.
    pub split_ms: u128,
    /// Time spent in `final_merge`.
    pub final_ms: u128,
    /// Wall-clock time of the whole run.
    pub total_ms: u128,
}
171
172pub fn chunk_pages_with_stats(
185 pages: &[(String, i32)],
186 tokenizer: &dyn TokenCounter,
187 opts: &ChunkOptions,
188) -> (Vec<Chunk>, ChunkerStats) {
189 let t0 = Instant::now();
190 let ta = Instant::now();
191 let annotated = annotate_lines(pages, tokenizer);
192 let ta_ms = ta.elapsed().as_millis();
193
194 let tg = Instant::now();
195 let semantic_units = group_semantic_units(&annotated);
196 let tg_ms = tg.elapsed().as_millis();
197
198 let tp = Instant::now();
199 let mut tmp = pack_initial_chunks(&semantic_units, opts.max_tokens);
200 let tp_ms = tp.elapsed().as_millis();
201
202 let to = Instant::now();
203 add_overlap(&mut tmp, opts.overlap_tokens, tokenizer);
204 let to_ms = to.elapsed().as_millis();
205
206 let tm = Instant::now();
207 tmp = merge_small_chunks(tmp, opts.min_tokens, opts.max_tokens);
208 let tm_ms = tm.elapsed().as_millis();
209
210 let ts = Instant::now();
211 tmp = split_oversized(tmp, opts.max_tokens, tokenizer);
212 let ts_ms = ts.elapsed().as_millis();
213
214 let tf = Instant::now();
215 let chunks = final_merge(tmp, opts.min_tokens, opts.max_tokens);
216 let tf_ms = tf.elapsed().as_millis();
217
218 let stats = ChunkerStats {
219 annotate_ms: ta_ms,
220 group_ms: tg_ms,
221 pack_ms: tp_ms,
222 overlap_ms: to_ms,
223 merge_ms: tm_ms,
224 split_ms: ts_ms,
225 final_ms: tf_ms,
226 total_ms: t0.elapsed().as_millis(),
227 };
228 (chunks, stats)
229}
230
231pub mod json;