pdfmuse_core/output/
chunk.rs1use crate::ir::{BBox, Block, Document, Table};
8use serde::Serialize;
9
/// One piece of extracted document content, as produced by [`chunk`].
///
/// A chunk corresponds to a single source block (a paragraph or a flattened
/// table) and carries enough context to locate it in the document again.
#[derive(Serialize, Clone, Debug)]
pub struct Chunk {
    /// Text of the block: the paragraph text, or the flattened table text.
    pub text: String,
    /// `index` of the page the block was found on.
    pub page: u32,
    /// Bounding box of the source block.
    pub bbox: BBox,
    /// Headings in effect for this chunk, outermost first. For a heading
    /// paragraph this already includes the heading's own text as last entry.
    pub heading_path: Vec<String>,
}
19
20pub fn chunk(doc: &Document) -> Vec<Chunk> {
22 let mut chunks = Vec::new();
23 let mut heading_path: Vec<String> = Vec::new();
25
26 for page in &doc.pages {
27 for block in &page.blocks {
28 match block {
29 Block::Paragraph(p) => {
30 if let Some(level) = p.heading_level.filter(|&n| n > 0) {
31 let depth = level as usize;
34 heading_path.truncate(depth.saturating_sub(1));
35 heading_path.resize(depth - 1, String::new());
36 heading_path.push(p.text.clone());
37 }
38 if p.text.trim().is_empty() {
39 continue;
40 }
41 chunks.push(Chunk {
42 text: p.text.clone(),
43 page: page.index,
44 bbox: p.bbox,
45 heading_path: heading_path.clone(),
46 });
47 }
48 Block::Table(t) => {
49 let text = flatten_table(t);
50 if text.trim().is_empty() {
51 continue;
52 }
53 chunks.push(Chunk {
54 text,
55 page: page.index,
56 bbox: t.bbox,
57 heading_path: heading_path.clone(),
58 });
59 }
60 Block::Image(_) => {}
62 }
63 }
64 }
65 chunks
66}
67
68fn flatten_table(table: &Table) -> String {
71 table
72 .rows
73 .iter()
74 .map(|row| {
75 row.iter()
76 .map(|c| c.text.as_str())
77 .collect::<Vec<_>>()
78 .join(" | ")
79 })
80 .collect::<Vec<_>>()
81 .join("\n")
82}
83
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::{Cell, Paragraph, Table, TableSource};

    /// Unit-square bounding box shared by every fixture block.
    fn bbox() -> BBox {
        BBox { x0: 0.0, y0: 0.0, x1: 1.0, y1: 1.0 }
    }

    /// Wraps `blocks` in a document consisting of a single page with the
    /// given page index; all other fields take their defaults.
    fn single_page_doc(index: u32, blocks: Vec<Block>) -> Document {
        let page = crate::ir::Page {
            index,
            blocks,
            ..Default::default()
        };
        Document {
            pages: vec![page],
            ..Default::default()
        }
    }

    /// Builds a table cell holding `text` that spans one row and one column.
    fn cell(text: &str) -> Cell {
        Cell {
            text: text.into(),
            bbox: bbox(),
            row_span: 1,
            col_span: 1,
        }
    }

    /// A body paragraph that follows a level-1 heading must report that
    /// heading in its path and keep the page index of its page.
    #[test]
    fn heading_path_tracks_nesting() {
        let paragraph = |text: &str, heading_level| {
            Block::Paragraph(Paragraph {
                bbox: bbox(),
                text: text.into(),
                heading_level,
            })
        };
        let doc = single_page_doc(
            3,
            vec![paragraph("Title", Some(1)), paragraph("Body", None)],
        );

        let chunks = chunk(&doc);

        assert_eq!(chunks.len(), 2);
        let body = &chunks[1];
        assert_eq!(body.heading_path, vec!["Title".to_string()]);
        assert_eq!(body.page, 3);
    }

    /// A whole table collapses into exactly one chunk whose text joins the
    /// cells of a row with `" | "`.
    #[test]
    fn table_is_one_chunk() {
        let table = Table {
            bbox: bbox(),
            rows: vec![vec![cell("a"), cell("b")], vec![cell("c"), cell("d")]],
            source: TableSource::Ruled,
        };
        let doc = single_page_doc(0, vec![Block::Table(table)]);

        let chunks = chunk(&doc);

        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text.contains("a | b"));
    }
}