Skip to main content

nvs_core/chunker/
group.rs

1use super::{AnnotatedLine, LineType};
2use std::collections::BTreeSet;
3
4#[derive(Clone, Debug)]
5pub struct SemanticUnit {
6    pub lines: Vec<AnnotatedLine>,
7    pub total_tokens: usize,
8    pub pages: BTreeSet<i32>,
9    pub has_major_heading: bool,
10    pub min_heading_level: i32,
11}
12
13impl Default for SemanticUnit {
14    fn default() -> Self {
15        Self {
16            lines: Vec::new(),
17            total_tokens: 0,
18            pages: BTreeSet::new(),
19            has_major_heading: false,
20            min_heading_level: i32::MAX,
21        }
22    }
23}
24
25impl SemanticUnit {
26    pub fn add(&mut self, l: AnnotatedLine) {
27        if matches!(l.line_type, LineType::MajorHeading) {
28            self.has_major_heading = true;
29            self.min_heading_level = self.min_heading_level.min(l.heading_level);
30        }
31        self.total_tokens += l.tokens;
32        self.pages.insert(l.page);
33        self.lines.push(l);
34    }
35    pub fn text(&self) -> String {
36        let mut s = String::new();
37        for l in &self.lines {
38            s.push_str(&l.text);
39            s.push('\n');
40        }
41        s
42    }
43}
44
45pub fn group_semantic_units(lines: &[AnnotatedLine]) -> Vec<SemanticUnit> {
46    let mut out: Vec<SemanticUnit> = Vec::new();
47    let mut cur = SemanticUnit::default();
48    for (i, l) in lines.iter().cloned().enumerate() {
49        let mut break_here = false;
50        match l.line_type {
51            LineType::MajorHeading | LineType::MinorHeading => break_here = !cur.lines.is_empty(),
52            LineType::Blank => {
53                if let Some(nxt) = lines.get(i + 1) {
54                    if matches!(
55                        nxt.line_type,
56                        LineType::MajorHeading | LineType::MinorHeading
57                    ) {
58                        break_here = !cur.lines.is_empty();
59                    }
60                }
61            }
62            _ => {}
63        }
64        if break_here {
65            out.push(cur);
66            cur = SemanticUnit::default();
67        }
68        if !(l.line_type == LineType::Blank && cur.lines.is_empty()) {
69            cur.add(l);
70        }
71    }
72    if !cur.lines.is_empty() {
73        out.push(cur);
74    }
75    out
76}