Skip to main content

pdfmuse_core/output/
chunk.rs

1//! RAG chunking of the IR.
2//!
3//! Emits one [`Chunk`] per block, each carrying the block's page, bbox, and the
4//! running heading path (the stack of enclosing headings). Tables are never
5//! split — a table becomes exactly one chunk with a flattened text body.
6
7use crate::ir::{BBox, Block, Document, Table};
8use serde::Serialize;
9
10/// A retrieval unit: a block's text plus the context needed to cite it.
11#[derive(Serialize, Clone, Debug)]
12pub struct Chunk {
13    pub text: String,
14    pub page: u32,
15    pub bbox: BBox,
16    /// The stack of enclosing headings, outermost first.
17    pub heading_path: Vec<String>,
18}
19
20/// Split `doc` into chunks (one per non-empty block), tracking heading context.
21pub fn chunk(doc: &Document) -> Vec<Chunk> {
22    let mut chunks = Vec::new();
23    // `heading_path[i]` is the current heading at level `i + 1`.
24    let mut heading_path: Vec<String> = Vec::new();
25
26    for page in &doc.pages {
27        for block in &page.blocks {
28            match block {
29                Block::Paragraph(p) => {
30                    if let Some(level) = p.heading_level.filter(|&n| n > 0) {
31                        // A heading sets the path at its depth and drops deeper
32                        // levels; missing intermediate levels are padded blank.
33                        let depth = level as usize;
34                        heading_path.truncate(depth.saturating_sub(1));
35                        heading_path.resize(depth - 1, String::new());
36                        heading_path.push(p.text.clone());
37                    }
38                    if p.text.trim().is_empty() {
39                        continue;
40                    }
41                    chunks.push(Chunk {
42                        text: p.text.clone(),
43                        page: page.index,
44                        bbox: p.bbox,
45                        heading_path: heading_path.clone(),
46                    });
47                }
48                Block::Table(t) => {
49                    let text = flatten_table(t);
50                    if text.trim().is_empty() {
51                        continue;
52                    }
53                    chunks.push(Chunk {
54                        text,
55                        page: page.index,
56                        bbox: t.bbox,
57                        heading_path: heading_path.clone(),
58                    });
59                }
60                // Images carry no text to embed.
61                Block::Image(_) => {}
62            }
63        }
64    }
65    chunks
66}
67
68/// Flatten a table into a single readable string: cells joined by " | " per row,
69/// rows joined by newlines.
70fn flatten_table(table: &Table) -> String {
71    table
72        .rows
73        .iter()
74        .map(|row| {
75            row.iter()
76                .map(|c| c.text.as_str())
77                .collect::<Vec<_>>()
78                .join(" | ")
79        })
80        .collect::<Vec<_>>()
81        .join("\n")
82}
83
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::{Cell, Paragraph, Table, TableSource};

    /// Unit-square bounding box shared by every fixture.
    fn unit_bbox() -> BBox {
        BBox { x0: 0.0, y0: 0.0, x1: 1.0, y1: 1.0 }
    }

    /// Wrap `blocks` in a document made of a single page with the given index.
    fn single_page_doc(index: u32, blocks: Vec<Block>) -> Document {
        let page = crate::ir::Page {
            index,
            blocks,
            ..Default::default()
        };
        Document {
            pages: vec![page],
            ..Default::default()
        }
    }

    /// Build a 1x1-span table cell holding `text`.
    fn cell(text: &str) -> Cell {
        Cell {
            text: text.into(),
            bbox: unit_bbox(),
            row_span: 1,
            col_span: 1,
        }
    }

    /// A body paragraph following a level-1 heading inherits that heading in
    /// its path and reports the page index of its page.
    #[test]
    fn heading_path_tracks_nesting() {
        let title = Paragraph {
            bbox: unit_bbox(),
            text: "Title".into(),
            heading_level: Some(1),
            role: None,
        };
        let body = Paragraph {
            bbox: unit_bbox(),
            text: "Body".into(),
            heading_level: None,
            role: None,
        };
        let doc = single_page_doc(3, vec![Block::Paragraph(title), Block::Paragraph(body)]);

        let chunks = chunk(&doc);

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[1].heading_path, vec![String::from("Title")]);
        assert_eq!(chunks[1].page, 3);
    }

    /// A 2x2 table is emitted as a single chunk whose text joins cells with " | ".
    #[test]
    fn table_is_one_chunk() {
        let table = Table {
            bbox: unit_bbox(),
            rows: vec![vec![cell("a"), cell("b")], vec![cell("c"), cell("d")]],
            source: TableSource::Ruled,
        };
        let doc = single_page_doc(0, vec![Block::Table(table)]);

        let chunks = chunk(&doc);

        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text.contains("a | b"));
    }
}