// nvs_core/chunker/group.rs
use super::{AnnotatedLine, LineType};
use std::collections::BTreeSet;
3
4#[derive(Clone, Debug)]
5pub struct SemanticUnit {
6 pub lines: Vec<AnnotatedLine>,
7 pub total_tokens: usize,
8 pub pages: BTreeSet<i32>,
9 pub has_major_heading: bool,
10 pub min_heading_level: i32,
11}
12
13impl Default for SemanticUnit {
14 fn default() -> Self {
15 Self {
16 lines: Vec::new(),
17 total_tokens: 0,
18 pages: BTreeSet::new(),
19 has_major_heading: false,
20 min_heading_level: i32::MAX,
21 }
22 }
23}
24
25impl SemanticUnit {
26 pub fn add(&mut self, l: AnnotatedLine) {
27 if matches!(l.line_type, LineType::MajorHeading) {
28 self.has_major_heading = true;
29 self.min_heading_level = self.min_heading_level.min(l.heading_level);
30 }
31 self.total_tokens += l.tokens;
32 self.pages.insert(l.page);
33 self.lines.push(l);
34 }
35 pub fn text(&self) -> String {
36 let mut s = String::new();
37 for l in &self.lines {
38 s.push_str(&l.text);
39 s.push('\n');
40 }
41 s
42 }
43}
44
45pub fn group_semantic_units(lines: &[AnnotatedLine]) -> Vec<SemanticUnit> {
46 let mut out: Vec<SemanticUnit> = Vec::new();
47 let mut cur = SemanticUnit::default();
48 for (i, l) in lines.iter().cloned().enumerate() {
49 let mut break_here = false;
50 match l.line_type {
51 LineType::MajorHeading | LineType::MinorHeading => break_here = !cur.lines.is_empty(),
52 LineType::Blank => {
53 if let Some(nxt) = lines.get(i + 1) {
54 if matches!(
55 nxt.line_type,
56 LineType::MajorHeading | LineType::MinorHeading
57 ) {
58 break_here = !cur.lines.is_empty();
59 }
60 }
61 }
62 _ => {}
63 }
64 if break_here {
65 out.push(cur);
66 cur = SemanticUnit::default();
67 }
68 if !(l.line_type == LineType::Blank && cur.lines.is_empty()) {
69 cur.add(l);
70 }
71 }
72 if !cur.lines.is_empty() {
73 out.push(cur);
74 }
75 out
76}