use crate::{Document, Result, TextFragment};
/// Structural role assigned to a chunk of page text.
///
/// Within this file the variants are produced by `classify_by_ratio`, which
/// compares a line's average font size against the estimated baseline size.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChunkType {
/// A heading. The payload is the heading level; `Document::extract_as_markdown`
/// renders it as that many `#` characters. The in-file classifier emits
/// levels 1 (largest size ratio) through 4.
Heading(u8),
/// Any text that was not classified as a heading.
Paragraph,
}
/// A run of consecutive text lines that share the same [`ChunkType`].
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct TextChunk {
/// Text of the chunk. Lines merged into the same chunk are joined with a
/// single space.
pub text: String,
/// Bounding box stored as `[min_x, min_y, width, height]`, covering every
/// fragment that contributed to the chunk.
pub bbox: [f32; 4],
/// Classification shared by all lines in the chunk.
pub chunk_type: ChunkType,
/// Averaged font size of the chunk; it is re-averaged each time another
/// line is merged in.
pub avg_font_size: f32,
}
impl Document {
    /// Extracts the visible text of `page` as a list of classified chunks.
    ///
    /// The page's text runs are put into reading order, invisible runs are
    /// dropped, the remainder is grouped into lines, each line is classified
    /// by the ratio of its average font size to the estimated baseline size,
    /// and consecutive lines of the same type are merged.
    ///
    /// Returns an empty vector when the page has no visible text.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `extract_text_runs`.
    pub fn extract_text_chunks(&self, page: u32) -> Result<Vec<TextChunk>> {
        let mut runs = self.extract_text_runs(page)?;
        crate::sort_by_reading_order(&mut runs);

        let visible: Vec<_> = runs.into_iter().filter(|run| !run.invisible).collect();
        if visible.is_empty() {
            return Ok(Vec::new());
        }

        let lines = group_into_lines(&visible);
        let baseline = estimate_baseline_font_size(&lines);

        let mut classified = Vec::with_capacity(lines.len());
        for line in lines {
            let size_total: f32 = line.iter().map(|f| f.font_size).sum();
            let line_size = size_total / line.len() as f32;
            let kind = classify_by_ratio(line_size / baseline);
            classified.push((line, line_size, kind));
        }

        Ok(merge_consecutive_chunks(classified))
    }

    /// Renders the text of `page` as Markdown.
    ///
    /// Heading chunks are prefixed with one `#` per heading level followed by
    /// a space; every chunk is followed by a blank line. Trailing newlines
    /// and spaces are stripped from the final output.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Document::extract_text_chunks`].
    pub fn extract_as_markdown(&self, page: u32) -> Result<String> {
        let mut markdown = String::new();

        for chunk in self.extract_text_chunks(page)? {
            // Only headings get a prefix; the text and the blank-line
            // separator are emitted the same way for every chunk type.
            match chunk.chunk_type {
                ChunkType::Heading(level) => {
                    let hashes = "#".repeat(level as usize);
                    markdown.push_str(&hashes);
                    markdown.push(' ');
                }
                ChunkType::Paragraph => {}
            }
            markdown.push_str(&chunk.text);
            markdown.push_str("\n\n");
        }

        // Drop the trailing separator (and any trailing spaces) in place.
        let kept_len = markdown.trim_end_matches(['\n', ' ']).len();
        markdown.truncate(kept_len);
        Ok(markdown)
    }
}
fn group_into_lines(fragments: &[TextFragment]) -> Vec<Vec<TextFragment>> {
let mut lines: Vec<Vec<TextFragment>> = Vec::new();
for frag in fragments {
let font_size = if frag.font_size > 0.0 && frag.font_size.is_finite() {
frag.font_size
} else {
1.0 };
let tol = font_size * 0.5;
let mut placed = false;
for line in &mut lines {
if let Some(first) = line.first() && (frag.y - first.y).abs() <= tol {
line.push(frag.clone());
placed = true;
break;
}
}
if !placed {
lines.push(vec![frag.clone()]);
}
}
lines
}
/// Estimates the baseline font size from the first ten lines.
///
/// Each line contributes the mean font size of its fragments; means that are
/// non-positive or non-finite are ignored. The result is the smallest
/// remaining mean, never less than `1.0`. Returns `12.0` when no line yields
/// a usable mean.
fn estimate_baseline_font_size(lines: &[Vec<TextFragment>]) -> f32 {
    let mut smallest: Option<f32> = None;

    for line in lines.iter().take(10) {
        let size_total: f32 = line.iter().map(|f| f.font_size).sum();
        let mean = size_total / line.len() as f32;
        // An empty line yields NaN here and is skipped by this check too.
        if !(mean > 0.0 && mean.is_finite()) {
            continue;
        }
        smallest = Some(match smallest {
            Some(current) => current.min(mean),
            None => mean,
        });
    }

    match smallest {
        Some(size) => size.max(1.0),
        None => 12.0,
    }
}
/// Maps a font-size ratio (line size divided by baseline size) to a chunk type.
///
/// Ratios of at least 1.8, 1.5, 1.3 and 1.15 become headings of level 1, 2, 3
/// and 4 respectively. Anything smaller, and any non-finite ratio, is a
/// paragraph.
fn classify_by_ratio(ratio: f32) -> ChunkType {
    // Ordered from the largest threshold down so the first match wins.
    const HEADING_THRESHOLDS: [(f32, u8); 4] = [(1.8, 1), (1.5, 2), (1.3, 3), (1.15, 4)];

    HEADING_THRESHOLDS
        .iter()
        .find(|&&(min_ratio, _)| ratio.is_finite() && ratio >= min_ratio)
        .map_or(ChunkType::Paragraph, |&(_, level)| ChunkType::Heading(level))
}
/// Collapses classified lines into chunks, merging each line into the
/// previous chunk when both have the same [`ChunkType`].
///
/// Each input tuple is `(fragments of the line, average font size of the
/// line, classification)`. For every line:
///
/// * the text is the concatenation of its fragments' text;
/// * the bounding box is `[min_x, min_y, width, height]` over its fragments.
///
/// When a line is merged into the previous chunk, its text is appended after
/// a single space, the bounding box grows to cover both, and the chunk's
/// `avg_font_size` becomes the average of the two sizes weighted by their
/// whitespace-separated word counts. If both word counts are zero the
/// existing size is kept.
fn merge_consecutive_chunks(
    classified: Vec<(Vec<TextFragment>, f32, ChunkType)>,
) -> Vec<TextChunk> {
    let mut result: Vec<TextChunk> = Vec::new();

    for (line, avg_font_size, chunk_type) in classified {
        let text: String = line.iter().map(|f| f.text.as_str()).collect();

        let min_x = line.iter().map(|f| f.x).fold(f32::INFINITY, f32::min);
        let min_y = line.iter().map(|f| f.y).fold(f32::INFINITY, f32::min);
        let max_x = line
            .iter()
            .map(|f| f.x + f.width)
            .fold(f32::NEG_INFINITY, f32::max);
        let max_y = line
            .iter()
            .map(|f| f.y + f.height)
            .fold(f32::NEG_INFINITY, f32::max);

        if let Some(last) = result
            .last_mut()
            .filter(|prev| prev.chunk_type == chunk_type)
        {
            // Count the chunk's existing words BEFORE appending the new line.
            // Counting afterwards would include the new words in both
            // weights and skew the weighted average.
            let old_count = last.text.split_whitespace().count() as f32;
            let new_count = text.split_whitespace().count() as f32;
            let total = old_count + new_count;
            if total > 0.0 {
                last.avg_font_size =
                    (last.avg_font_size * old_count + avg_font_size * new_count) / total;
            }

            last.text.push(' ');
            last.text.push_str(&text);

            // Grow the bounding box to the union of the old box and this line.
            let [x1, y1, w1, h1] = last.bbox;
            let new_min_x = min_x.min(x1);
            let new_min_y = min_y.min(y1);
            let new_max_x = max_x.max(x1 + w1);
            let new_max_y = max_y.max(y1 + h1);
            last.bbox = [
                new_min_x,
                new_min_y,
                (new_max_x - new_min_x).max(0.0),
                (new_max_y - new_min_y).max(0.0),
            ];
            continue;
        }

        result.push(TextChunk {
            text,
            bbox: [min_x, min_y, (max_x - min_x).max(0.0), (max_y - min_y).max(0.0)],
            chunk_type,
            avg_font_size,
        });
    }

    result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks each heading threshold at its boundary, plus values that fall
    /// through to a paragraph.
    #[test]
    fn classify_by_ratio_headings() {
        let cases: [(f32, ChunkType); 7] = [
            (1.9, ChunkType::Heading(1)),
            (1.8, ChunkType::Heading(1)),
            (1.5, ChunkType::Heading(2)),
            (1.3, ChunkType::Heading(3)),
            (1.15, ChunkType::Heading(4)),
            (1.0, ChunkType::Paragraph),
            (0.8, ChunkType::Paragraph),
        ];
        for (ratio, expected) in &cases {
            assert_eq!(classify_by_ratio(*ratio), *expected);
        }
    }

    /// With no lines to inspect, the estimator returns its 12.0 default.
    #[test]
    fn baseline_font_size_from_empty() {
        let no_lines: &[Vec<TextFragment>] = &[];
        assert_eq!(estimate_baseline_font_size(no_lines), 12.0);
    }
}