use crate::error::Result;
use crate::layout::text_block::{Color, TextChar, TextSpan};
use office_oxide::format::DocumentFormat;
use office_oxide::ir::{
ColumnLayout, DocumentIR, Element, Heading, Image, ImageFormat, ImagePositioning,
InlineContent, Metadata, PageSetup, Paragraph, Section, SectionBreakType, TextSpan as IrSpan,
};
use std::collections::HashMap;
/// Multiplier that converts a length in PDF points into twips (20 twips per point).
const PT_TO_TWIPS: f32 = 20.0;
/// Margin, in points, written to all four sides of every generated page setup.
const DEFAULT_MARGIN_PT: f32 = 36.0;
/// Tunable thresholds for the PDF → IR conversion.
#[derive(Debug, Clone)]
pub struct PdfToIrOptions {
/// Minimum ratio of a block's average font size to the page's median font
/// size for the block to become a heading of level 1, 2 and 3 respectively.
/// Checked in that order; the first threshold that is met wins.
pub heading_ratios: [f32; 3],
/// A vertical gap between two lines larger than this factor times the
/// lower line's largest font size starts a new paragraph.
pub paragraph_gap_factor: f32,
}
impl Default for PdfToIrOptions {
/// Returns the default thresholds: heading ratios of 1.75 / 1.35 / 1.15
/// for levels 1 / 2 / 3 and a paragraph gap factor of 1.2.
fn default() -> Self {
Self {
heading_ratios: [1.75, 1.35, 1.15],
paragraph_gap_factor: 1.2,
}
}
}
/// Converts every page of `doc` into one IR [`Section`] and bundles the
/// sections, together with metadata read from the PDF, into a [`DocumentIR`].
///
/// The conversion runs in two passes:
///
/// 1. For each page, extract the text spans and drop the ones that should not
///    reach the output: spans sitting on rotated characters (only on pages
///    whose characters are predominantly horizontal), spans tagged as page
///    artifacts, and small-font spans in the top/bottom edge strips.
/// 2. With the document-wide colour histogram available, turn each page's
///    surviving spans, images and horizontal rules into a section.
///
/// # Errors
///
/// Propagates failures from `page_count`, `get_page_media_box` and
/// `extract_spans`. Character, image, path and font-face extraction are
/// best-effort: their failures only disable the feature that depends on them.
pub fn pdf_to_ir(
    doc: &crate::document::PdfDocument,
    format: DocumentFormat,
    options: &PdfToIrOptions,
) -> Result<DocumentIR> {
    let page_count = doc.page_count()?;
    let mut all_spans: Vec<Vec<TextSpan>> = Vec::with_capacity(page_count);
    // Page sizes are measured once here and reused by the second pass instead
    // of asking the document for the media box again.
    let mut page_sizes: Vec<(f32, f32)> = Vec::with_capacity(page_count);

    for page_idx in 0..page_count {
        let (x1, y1, x2, y2) = doc.get_page_media_box(page_idx)?;
        let page_w = (x2 - x1).abs();
        let page_h = (y2 - y1).abs();

        let mut spans = doc.extract_spans(page_idx)?;
        if let Ok(chars) = doc.extract_chars(page_idx) {
            // A page counts as "horizontal" when at least 75% of its characters
            // are rotated by less than 5 degrees (trivially true when empty).
            let horizontal = chars
                .iter()
                .filter(|c| c.rotation_degrees.abs() < 5.0)
                .count();
            let chars_horizontal_dominant = horizontal * 4 >= chars.len() * 3;
            spans.retain(|s| !span_overlaps_rotated_chars(s, &chars, chars_horizontal_dominant));
        }
        spans.retain(|s| !is_page_artifact(s) && !is_geometric_footer(s, page_h));

        all_spans.push(spans);
        page_sizes.push((page_w, page_h));
    }

    let color_counts = build_color_histogram(&all_spans);
    let face_lookups = doc.page_font_face_lookups().unwrap_or_default();

    let mut sections: Vec<Section> = Vec::with_capacity(page_count);
    for (page_idx, (spans, &(page_w, page_h))) in all_spans.iter().zip(&page_sizes).enumerate() {
        let images = extract_page_images(doc, page_idx, page_h);
        let face_lookup = face_lookups.get(page_idx).cloned().unwrap_or_default();
        let rules = extract_horizontal_rules(doc, page_idx, page_w, page_h);
        sections.push(page_to_section(
            spans,
            page_w,
            page_h,
            SectionBreakType::NextPage,
            &images,
            &rules,
            options,
            &color_counts,
            &face_lookup,
        ));
    }

    let mut metadata = Metadata {
        format,
        ..Default::default()
    };
    populate_metadata_from_pdf_info(doc, &mut metadata);
    Ok(DocumentIR { metadata, sections })
}
/// A wide, thin horizontal path found on a page, reduced to its vertical position.
#[derive(Debug, Clone)]
struct HorizontalRule {
/// Vertical centre of the path's bounding box (`bbox.y + bbox.height / 2`).
y_pdf: f32,
}
/// Collects the page's vector paths that look like horizontal separator rules.
///
/// A path qualifies when it spans at least 30% of the page width and is
/// either a thin rectangle (height in `(0, 2]`) or a straight line whose
/// bounding box is at most 1 unit tall. Returns an empty list when path
/// extraction fails. `_page_h_pt` is currently unused.
fn extract_horizontal_rules(
    doc: &crate::document::PdfDocument,
    page_idx: usize,
    page_w_pt: f32,
    _page_h_pt: f32,
) -> Vec<HorizontalRule> {
    let Ok(paths) = doc.extract_paths(page_idx) else {
        return Vec::new();
    };
    let min_width = page_w_pt * 0.3;

    paths
        .into_iter()
        .filter_map(|path| {
            let (width, height) = (path.bbox.width, path.bbox.height);
            let is_thin_rect = width >= min_width && height <= 2.0 && height > 0.0;
            let is_flat_line = path.is_straight_line() && width >= min_width && height <= 1.0;
            (is_thin_rect || is_flat_line).then(|| HorizontalRule {
                // The rule is positioned at the vertical centre of its bounding box.
                y_pdf: path.bbox.y + height * 0.5,
            })
        })
        .collect()
}
/// Returns `true` for a small-font span (below 8 units) whose `bbox.y` lies
/// in the bottom or top 5% strip of the page.
///
/// Always `false` when the page height is not positive.
fn is_geometric_footer(span: &TextSpan, page_h_pt: f32) -> bool {
    if page_h_pt <= 0.0 {
        return false;
    }
    let strip = page_h_pt * 0.05;
    let y = span.bbox.y;
    let near_page_edge = y < strip || y > page_h_pt - strip;
    span.font_size < 8.0 && near_page_edge
}
/// Returns `true` when the span is tagged as a pagination, page or
/// background artifact and should therefore be left out of the body text.
///
/// Every pagination subtype (watermarks included) is covered by the
/// `Pagination(_)` arm, so no per-subtype check is needed.
fn is_page_artifact(span: &TextSpan) -> bool {
    use crate::extractors::text::ArtifactType;
    matches!(
        span.artifact_type,
        Some(ArtifactType::Pagination(_) | ArtifactType::Page | ArtifactType::Background)
    )
}
/// Decides whether `span` sits on rotated text.
///
/// The span's bbox origin is matched to the character with the nearest
/// origin (first one wins on ties). The span is reported as rotated when that
/// character is within 5 units and rotated by at least 5 degrees. Always
/// `false` when the page is not predominantly horizontal or has no characters.
pub(crate) fn span_overlaps_rotated_chars(
    span: &TextSpan,
    chars: &[TextChar],
    chars_horizontal_dominant: bool,
) -> bool {
    const MAX_ORIGIN_DIST: f32 = 5.0;

    if !chars_horizontal_dominant || chars.is_empty() {
        return false;
    }

    let (span_x, span_y) = (span.bbox.x, span.bbox.y);
    // Strict `<` keeps the earliest character on equal distances and never
    // selects a character whose distance is NaN or infinite.
    let (nearest, nearest_d2) = chars.iter().fold(
        (None::<&TextChar>, f32::INFINITY),
        |(best, best_d2), ch| {
            let dx = ch.origin_x - span_x;
            let dy = ch.origin_y - span_y;
            let d2 = dx * dx + dy * dy;
            if d2 < best_d2 {
                (Some(ch), d2)
            } else {
                (best, best_d2)
            }
        },
    );

    if nearest_d2 > MAX_ORIGIN_DIST * MAX_ORIGIN_DIST {
        return false;
    }
    nearest.is_some_and(|ch| ch.rotation_degrees.abs() >= 5.0)
}
/// Fills the still-empty fields of `metadata` from the PDF's `Info` dictionary.
///
/// Fields that already hold a value are left alone. The function returns
/// silently when the trailer has no `Info` reference or the referenced object
/// cannot be loaded. Keywords are split on commas, trimmed, and empty entries
/// are discarded.
fn populate_metadata_from_pdf_info(doc: &crate::document::PdfDocument, metadata: &mut Metadata) {
    let trailer = doc.trailer();
    let Some(info_ref) = trailer
        .as_dict()
        .and_then(|dict| dict.get("Info"))
        .and_then(|obj| obj.as_reference())
    else {
        return;
    };
    let Ok(info_obj) = doc.load_object(info_ref) else {
        return;
    };
    let info = crate::editor::DocumentInfo::from_object(&info_obj);

    if metadata.title.is_none() {
        metadata.title.clone_from(&info.title);
    }
    if metadata.author.is_none() {
        metadata.author.clone_from(&info.author);
    }
    if metadata.subject.is_none() {
        metadata.subject.clone_from(&info.subject);
    }
    if metadata.keywords.is_empty() {
        if let Some(raw) = &info.keywords {
            metadata.keywords = raw
                .split(',')
                .map(str::trim)
                .filter(|kw| !kw.is_empty())
                .map(str::to_string)
                .collect();
        }
    }
    if metadata.created.is_none() {
        metadata.created.clone_from(&info.creation_date);
    }
    if metadata.modified.is_none() {
        metadata.modified.clone_from(&info.mod_date);
    }
}
/// An extracted image together with its placement on the page, in EMU.
struct PositionedImage {
/// The IR image payload (PNG bytes plus display size).
image: Image,
/// Horizontal offset of the image's left edge, in EMU.
x_emu: i64,
/// Vertical offset of the image's top edge measured from the top of the
/// page (`page height - bbox.y - bbox.height`), in EMU.
y_emu: i64,
/// Display width, in EMU.
cx_emu: i64,
/// Display height, in EMU.
cy_emu: i64,
}
/// Wraps a positioned image into an IR element with floating placement.
///
/// The image payload is cloned; its positioning is replaced by a floating
/// placement carrying the stored offsets and size (negative sizes clamp to
/// zero), default anchors and wrap mode, and overlap allowed.
fn positioned_image_to_element(pi: &PositionedImage) -> Element {
    use office_oxide::ir::{FloatAnchor, FloatingImage, TextWrap};

    let placement = FloatingImage {
        x_emu: pi.x_emu,
        y_emu: pi.y_emu,
        width_emu: pi.cx_emu.max(0) as u64,
        height_emu: pi.cy_emu.max(0) as u64,
        h_anchor: FloatAnchor::default(),
        v_anchor: FloatAnchor::default(),
        text_wrap: TextWrap::default(),
        allow_overlap: true,
    };

    let mut floating = pi.image.clone();
    floating.positioning = ImagePositioning::Floating(placement);
    Element::Image(floating)
}
/// Extracts the page's images as PNG payloads with their page placement.
///
/// Images without a bounding box, or whose PNG encoding fails or is empty,
/// are skipped; an extraction failure yields an empty list. With the
/// `rendering` feature enabled, regions returned by the form-XObject /
/// inline-image rasteriser are appended as additional images.
fn extract_page_images(
    doc: &crate::document::PdfDocument,
    page_idx: usize,
    page_h_pt: f32,
) -> Vec<PositionedImage> {
    const EMU_PER_PT: f64 = 12_700.0;

    /// Converts a bottom-left-origin rectangle in points into
    /// `(x, y_from_top, width, height)` in EMU. Width and height are at
    /// least one point; offsets are clamped to be non-negative.
    fn rect_to_emu(x: f32, y: f32, w: f32, h: f32, page_h_pt: f32) -> (i64, i64, u64, u64) {
        let w_emu = ((w as f64).max(1.0) * EMU_PER_PT) as u64;
        let h_emu = ((h as f64).max(1.0) * EMU_PER_PT) as u64;
        let x_emu = (x.max(0.0) as f64 * EMU_PER_PT) as i64;
        let y_top_pt = (page_h_pt - y - h).max(0.0);
        let y_emu = (y_top_pt as f64 * EMU_PER_PT) as i64;
        (x_emu, y_emu, w_emu, h_emu)
    }

    let Ok(raw) = doc.extract_images(page_idx) else {
        return Vec::new();
    };

    let mut out = Vec::with_capacity(raw.len());
    for img in raw {
        let Some(&bbox) = img.bbox() else {
            continue;
        };
        let Ok(png) = img.to_png_bytes() else {
            continue;
        };
        if png.is_empty() {
            continue;
        }

        let (x_emu, y_emu, w_emu, h_emu) =
            rect_to_emu(bbox.x, bbox.y, bbox.width, bbox.height, page_h_pt);
        out.push(PositionedImage {
            image: Image {
                data: Some(png),
                format: Some(ImageFormat::Png),
                display_width_emu: Some(w_emu),
                display_height_emu: Some(h_emu),
                pixel_width: Some(img.width()),
                pixel_height: Some(img.height()),
                positioning: ImagePositioning::Inline,
                ..Default::default()
            },
            x_emu,
            y_emu,
            cx_emu: w_emu as i64,
            cy_emu: h_emu as i64,
        });
    }

    #[cfg(feature = "rendering")]
    {
        // Rectangles (x, y, w, h in points, bottom-left origin) of the images
        // collected so far, handed to the rasteriser alongside the page.
        let existing_rects: Vec<(f32, f32, f32, f32)> = out
            .iter()
            .map(|pi| {
                let x = pi.x_emu as f32 / EMU_PER_PT as f32;
                let y_top = pi.y_emu as f32 / EMU_PER_PT as f32;
                let w = pi.cx_emu as f32 / EMU_PER_PT as f32;
                let h = pi.cy_emu as f32 / EMU_PER_PT as f32;
                (x, (page_h_pt - y_top - h).max(0.0), w, h)
            })
            .collect();
        let regions = crate::converters::form_xobject_finder::rasterize_form_and_inline_regions(
            doc,
            page_idx,
            page_h_pt,
            &existing_rects,
        );
        for ((x_pdf, y_pdf, w, h), png) in regions {
            let (x_emu, y_emu, w_emu, h_emu) = rect_to_emu(x_pdf, y_pdf, w, h, page_h_pt);
            out.push(PositionedImage {
                image: Image {
                    data: Some(png),
                    format: Some(ImageFormat::Png),
                    display_width_emu: Some(w_emu),
                    display_height_emu: Some(h_emu),
                    positioning: ImagePositioning::Inline,
                    ..Default::default()
                },
                x_emu,
                y_emu,
                cx_emu: w_emu as i64,
                cy_emu: h_emu as i64,
            });
        }
    }

    out
}
/// Top edge of a line: the largest `bbox.y + bbox.height` among its spans.
fn line_top_y(line: &[TextSpan]) -> f32 {
    line.iter()
        .map(|s| s.bbox.y + s.bbox.height)
        .fold(f32::MIN, f32::max)
}

/// Bottom edge of a line: the smallest `bbox.y` among its spans.
fn line_bottom_y(line: &[TextSpan]) -> f32 {
    line.iter().map(|s| s.bbox.y).fold(f32::MAX, f32::min)
}

/// Mean font size over every span of `lines`, or `None` when there are no spans.
fn block_mean_font_size(lines: &[Vec<TextSpan>]) -> Option<f32> {
    let mut sum = 0.0_f32;
    let mut count = 0_u32;
    for span in lines.iter().flatten() {
        sum += span.font_size;
        count += 1;
    }
    (count > 0).then(|| sum / count as f32)
}

/// Consumes the pending rules (sorted top-down) that concern the gap above a
/// block: a rule strictly between the previous block's bottom and this
/// block's top becomes a thematic break; a rule at or above the previous
/// block's bottom is stale and is dropped; anything else stops the scan.
fn emit_rules_above<I>(
    rules: &mut std::iter::Peekable<I>,
    prev_min_y: Option<f32>,
    top_y: f32,
    elements: &mut Vec<Element>,
) where
    I: Iterator<Item = f32>,
{
    while let Some(&rule_y) = rules.peek() {
        let after_prev = prev_min_y.is_none_or(|prev| rule_y < prev);
        if after_prev && rule_y > top_y {
            elements.push(Element::ThematicBreak);
            rules.next();
        } else if rule_y >= prev_min_y.unwrap_or(f32::INFINITY) {
            rules.next();
        } else {
            break;
        }
    }
}

/// Builds the empty spacer paragraph that reproduces an unusually large
/// vertical gap between the previous block and the one starting at `top_y`.
///
/// A gap counts as unusual when it exceeds 1.5 nominal line heights (1.2 ×
/// the larger of the two average font sizes). The spacer carries the gap
/// minus one line height, capped at 600 pt; nothing is produced for an
/// excess of half a point or less, or when there is no previous block.
fn gap_spacer(
    prev_min_y: Option<f32>,
    prev_avg_pt: Option<f32>,
    top_y: f32,
    avg_pt: f32,
) -> Option<Element> {
    let (prev_y, prev_avg) = prev_min_y.zip(prev_avg_pt)?;
    let gap_pt = prev_y - top_y;
    let line_h_pt = prev_avg.max(avg_pt) * 1.2;
    let excess_pt = if gap_pt > line_h_pt * 1.5 {
        (gap_pt - line_h_pt).clamp(0.0, 600.0)
    } else {
        0.0
    };
    (excess_pt > 0.5).then(|| {
        Element::Paragraph(Paragraph {
            space_before_twips: Some((excess_pt * PT_TO_TWIPS) as u32),
            ..Default::default()
        })
    })
}

/// Builds the IR section for one page.
///
/// The section's elements are, in order: every image as a floating element,
/// then the page's text blocks from top to bottom. Between blocks a thematic
/// break is inserted for each horizontal rule lying in the gap, and an empty
/// spacer paragraph is inserted for unusually large gaps. A centred paragraph
/// of several short lines (each narrower than 75% of the page) is emitted as
/// one block per line so the line breaks survive; the spacer is emitted once
/// per block, so such a paragraph no longer gets a doubled spacer in front of
/// its first line.
///
/// Page size and a fixed margin are recorded in the page setup. A page without
/// text yields a section containing only its images, with no column layout.
fn page_to_section(
    spans: &[TextSpan],
    page_w_pt: f32,
    page_h_pt: f32,
    break_type: SectionBreakType,
    images: &[PositionedImage],
    rules: &[HorizontalRule],
    options: &PdfToIrOptions,
    color_counts: &HashMap<[u8; 3], u32>,
    face_lookup: &HashMap<String, String>,
) -> Section {
    use office_oxide::ir::ParagraphAlignment;

    let margin_twips = (DEFAULT_MARGIN_PT * PT_TO_TWIPS) as u32;
    let page_setup = PageSetup {
        width_twips: (page_w_pt * PT_TO_TWIPS) as u32,
        height_twips: (page_h_pt * PT_TO_TWIPS) as u32,
        margin_top_twips: margin_twips,
        margin_bottom_twips: margin_twips,
        margin_left_twips: margin_twips,
        margin_right_twips: margin_twips,
        landscape: page_w_pt > page_h_pt,
        ..Default::default()
    };

    if spans.is_empty() {
        return Section {
            elements: images.iter().map(positioned_image_to_element).collect(),
            page_setup: Some(page_setup),
            break_type,
            ..Default::default()
        };
    }

    let median_pt = median_font_size(spans);
    let (para_lines, all_lines) =
        group_into_paragraphs_with_lines(spans, options.paragraph_gap_factor);
    let columns = detect_columns(&all_lines, page_w_pt);

    let mut elements: Vec<Element> = Vec::with_capacity(para_lines.len() + images.len());
    elements.extend(images.iter().map(positioned_image_to_element));

    // Rules are visited from the top of the page downwards (descending y).
    let mut rules_top_down: Vec<f32> = rules.iter().map(|r| r.y_pdf).collect();
    rules_top_down.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
    let mut rules_iter = rules_top_down.into_iter().peekable();

    // Bottom edge and average font size of the block emitted last.
    let mut prev_min_y: Option<f32> = None;
    let mut prev_avg_pt: Option<f32> = None;

    let line_is_short = |line: &Vec<TextSpan>| {
        if line.is_empty() {
            return true;
        }
        let left = line.iter().map(|s| s.bbox.x).fold(f32::MAX, f32::min);
        let right = line
            .iter()
            .map(|s| s.bbox.x + s.bbox.width)
            .fold(f32::MIN, f32::max);
        (right - left).max(0.0) < page_w_pt * 0.75
    };

    for lines in &para_lines {
        if lines.is_empty() {
            continue;
        }

        let is_centered = matches!(
            detect_paragraph_alignment(lines, page_w_pt),
            Some(ParagraphAlignment::Center)
        );
        let split_per_line = is_centered && lines.len() > 1 && lines.iter().all(line_is_short);

        // Either the whole paragraph is one block, or every line is its own block.
        let block_len = if split_per_line { 1 } else { lines.len() };
        for block in lines.chunks(block_len) {
            if split_per_line && block.iter().all(Vec::is_empty) {
                continue;
            }

            let top_y = block.first().map_or(0.0, |line| line_top_y(line));
            let bottom_y = block.last().map_or(0.0, |line| line_bottom_y(line));
            let avg_pt = block_mean_font_size(block).unwrap_or(median_pt);

            emit_rules_above(&mut rules_iter, prev_min_y, top_y, &mut elements);
            if let Some(spacer) = gap_spacer(prev_min_y, prev_avg_pt, top_y, avg_pt) {
                elements.push(spacer);
            }
            elements.push(lines_to_element(
                block,
                median_pt,
                options,
                color_counts,
                page_w_pt,
                face_lookup,
            ));

            prev_min_y = Some(bottom_y);
            prev_avg_pt = Some(avg_pt);
        }
    }

    Section {
        elements,
        page_setup: Some(page_setup),
        break_type,
        columns,
        ..Default::default()
    }
}
/// Flattens the lines of a paragraph into one span sequence, repairing the
/// text at every line break.
///
/// At each break the last span collected so far is adjusted against the first
/// span of the next line:
/// * if it ends (ignoring trailing spaces/tabs) in a letter followed by `-`
///   and the next line starts with a lowercase letter, the hyphen is removed
///   (any trailing spaces/tabs are kept);
/// * otherwise a single space is appended when neither side already supplies
///   whitespace (an empty text counts as whitespace).
fn merge_lines_into_spans(lines: &[Vec<TextSpan>]) -> Vec<TextSpan> {
    /// Applies the line-break repair described above to `tail`.
    fn bridge_line_break(tail: &mut String, head: &str) {
        let body = tail.trim_end_matches([' ', '\t']);
        let body_len = body.len();
        let hyphenated = body.ends_with('-')
            && body.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic());
        let continues_lowercase = head
            .trim_start()
            .chars()
            .next()
            .is_some_and(|c| c.is_lowercase());

        if hyphenated && continues_lowercase {
            // `-` is a single byte sitting right before the trailing whitespace.
            tail.remove(body_len - 1);
            return;
        }

        let tail_ends_ws = tail.chars().last().is_none_or(|c| c.is_whitespace());
        let head_starts_ws = head.chars().next().is_none_or(|c| c.is_whitespace());
        if !tail_ends_ws && !head_starts_ws {
            tail.push(' ');
        }
    }

    let mut merged: Vec<TextSpan> = Vec::new();
    for line in lines {
        // Nothing has been collected before the first non-empty line, so the
        // repair naturally only runs at real line breaks.
        if let (Some(tail), Some(head)) = (merged.last_mut(), line.first()) {
            bridge_line_break(&mut tail.text, &head.text);
        }
        merged.extend(line.iter().cloned());
    }
    merged
}
/// Turns the lines of one text block into a heading or a paragraph.
///
/// The block's average span font size is divided by the page median
/// (floored at 1.0); the first of `opts.heading_ratios` that the ratio
/// reaches selects heading level 1, 2 or 3. Blocks below all three
/// thresholds become paragraphs. The detected alignment is applied either way.
fn lines_to_element(
    lines: &[Vec<TextSpan>],
    median_pt: f32,
    opts: &PdfToIrOptions,
    color_counts: &HashMap<[u8; 3], u32>,
    page_w_pt: f32,
    face_lookup: &HashMap<String, String>,
) -> Element {
    let group = merge_lines_into_spans(lines);
    let avg_pt = if group.is_empty() {
        median_pt
    } else {
        group.iter().map(|s| s.font_size).sum::<f32>() / group.len() as f32
    };
    let ratio = avg_pt / median_pt.max(1.0);
    let alignment = detect_paragraph_alignment(lines, page_w_pt);
    let content = spans_to_inline(&group, color_counts, face_lookup);

    let [h1_min, h2_min, h3_min] = opts.heading_ratios;
    let level = if ratio >= h1_min {
        1
    } else if ratio >= h2_min {
        2
    } else if ratio >= h3_min {
        3
    } else {
        return Element::Paragraph(Paragraph {
            content,
            alignment,
            ..Default::default()
        });
    };

    Element::Heading(Heading {
        level,
        content,
        alignment,
        ..Default::default()
    })
}
/// Infers centre or right alignment from the horizontal extent of each line.
///
/// * Centre: every non-empty line has its midpoint within 8% of the page
///   width from the page centre and at least 10% of the page width free on
///   both sides.
/// * Right: every non-empty line starts more than 25% of the page width from
///   the left edge and leaves at most 10% free on the right.
///
/// Centre takes precedence. Returns `None` otherwise, and for an empty
/// `lines` slice or a non-positive page width.
fn detect_paragraph_alignment(
    lines: &[Vec<TextSpan>],
    page_w_pt: f32,
) -> Option<office_oxide::ir::ParagraphAlignment> {
    use office_oxide::ir::ParagraphAlignment;

    if lines.is_empty() || page_w_pt <= 0.0 {
        return None;
    }

    let page_mid = page_w_pt * 0.5;
    let centre_tol = page_w_pt * 0.08;
    let min_side_margin = page_w_pt * 0.10;

    let (all_centered, all_right) = lines
        .iter()
        .filter(|line| !line.is_empty())
        .fold((true, true), |(centered, right_aligned), line| {
            let left = line.iter().map(|s| s.bbox.x).fold(f32::MAX, f32::min);
            let right = line
                .iter()
                .map(|s| s.bbox.x + s.bbox.width)
                .fold(f32::MIN, f32::max);
            let left_margin = left.max(0.0);
            let right_margin = (page_w_pt - right).max(0.0);
            let centre = (left + right) * 0.5;

            let breaks_centre = (centre - page_mid).abs() > centre_tol
                || left_margin < min_side_margin
                || right_margin < min_side_margin;
            let breaks_right =
                left_margin <= page_w_pt * 0.25 || right_margin > page_w_pt * 0.10;

            (centered && !breaks_centre, right_aligned && !breaks_right)
        });

    if all_centered {
        Some(ParagraphAlignment::Center)
    } else if all_right {
        Some(ParagraphAlignment::Right)
    } else {
        None
    }
}
/// Groups spans into lines and lines into paragraphs.
///
/// Spans are ordered by descending vertical centre; a span joins the current
/// line while its centre is within 0.8 × the larger font size of the line's
/// most recently added span, and each finished line is ordered left to right.
/// A new paragraph starts when the gap between consecutive lines exceeds the
/// lower line's largest font size × `gap_factor`, or when their average font
/// sizes differ by more than 2.
///
/// Returns `(paragraphs, lines)`: each paragraph is a list of lines, and
/// `lines` is the same set of lines without paragraph grouping.
fn group_into_paragraphs_with_lines(
    spans: &[TextSpan],
    gap_factor: f32,
) -> (Vec<Vec<Vec<TextSpan>>>, Vec<Vec<TextSpan>>) {
    /// Vertical centre of a span's bounding box.
    fn center_y(span: &TextSpan) -> f32 {
        span.bbox.y + span.bbox.height * 0.5
    }

    if spans.is_empty() {
        return (Vec::new(), Vec::new());
    }

    let mut top_down: Vec<&TextSpan> = spans.iter().collect();
    top_down.sort_by(|a, b| {
        center_y(b)
            .partial_cmp(&center_y(a))
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    let mut lines: Vec<Vec<&TextSpan>> = Vec::new();
    for span in top_down {
        let on_same_line = |line: &&mut Vec<&TextSpan>| {
            line.last().is_some_and(|last| {
                let reach = last.font_size.max(span.font_size) * 0.8;
                (center_y(last) - center_y(span)).abs() < reach
            })
        };
        if let Some(line) = lines.last_mut().filter(on_same_line) {
            line.push(span);
        } else {
            lines.push(vec![span]);
        }
    }
    for line in &mut lines {
        line.sort_by(|a, b| {
            a.bbox
                .x
                .partial_cmp(&b.bbox.x)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
    }

    let owned_lines: Vec<Vec<TextSpan>> = lines
        .iter()
        .map(|line| line.iter().map(|&s| s.clone()).collect())
        .collect();

    let mut paragraphs: Vec<Vec<Vec<TextSpan>>> = Vec::new();
    let mut current: Vec<Vec<TextSpan>> = Vec::new();
    let mut previous: Option<&Vec<TextSpan>> = None;
    for line in &owned_lines {
        if let Some(prev) = previous {
            let prev_bottom = prev.iter().map(|s| s.bbox.y).fold(f32::MAX, f32::min);
            let line_top = line
                .iter()
                .map(|s| s.bbox.y + s.bbox.height)
                .fold(f32::MIN, f32::max);
            let tallest = line.iter().map(|s| s.font_size).fold(0.0_f32, f32::max);
            let gap = prev_bottom - line_top;

            let prev_avg = prev.iter().map(|s| s.font_size).sum::<f32>() / prev.len() as f32;
            let line_avg = line.iter().map(|s| s.font_size).sum::<f32>() / line.len() as f32;
            let size_jump = (line_avg - prev_avg).abs() > 2.0;

            if (gap > tallest * gap_factor || size_jump) && !current.is_empty() {
                paragraphs.push(std::mem::take(&mut current));
            }
        }
        current.push(line.clone());
        previous = Some(line);
    }
    if !current.is_empty() {
        paragraphs.push(current);
    }

    (paragraphs, owned_lines)
}
/// Detects a two-column text layout from the horizontal positions of lines.
///
/// Returns `None` unless all of the following hold:
/// * the page has at least 8 lines;
/// * at least 25% of the lines contain a span starting in the right half;
/// * the most common left edge (5 pt bins) of each column accounts for at
///   least 25% of that column's samples;
/// * the gutter between the columns is at least 36 pt and the columns start
///   more than 72 pt apart.
///
/// The first column's right edge is the 90th percentile of the right edges of
/// left-half spans on two-column lines (lines with no span entirely in the
/// left half contribute no sample); with fewer than 5 samples it falls back
/// to the minimum gutter.
fn detect_columns(all_lines: &[Vec<TextSpan>], page_w_pt: f32) -> Option<ColumnLayout> {
    const MIN_GUTTER_PT: f32 = 36.0;
    const BIN_PT: f32 = 5.0;

    if all_lines.len() < 8 {
        return None;
    }
    let mid = page_w_pt * 0.5;

    let mut col1_lefts: Vec<f32> = Vec::new();
    let mut col2_lefts: Vec<f32> = Vec::new();
    let mut col1_rights: Vec<f32> = Vec::new();
    for line in all_lines.iter().filter(|line| !line.is_empty()) {
        col1_lefts.push(line.iter().map(|s| s.bbox.x).fold(f32::MAX, f32::min));

        // Leftmost span start in the right half of the page, if any.
        let col2_left = line
            .iter()
            .map(|s| s.bbox.x)
            .filter(|&x| x >= mid)
            .reduce(f32::min);
        let Some(col2_left) = col2_left else {
            continue;
        };
        col2_lefts.push(col2_left);

        // Rightmost edge among spans lying entirely in the left half. `reduce`
        // yields `None` when there is no such span, so no sentinel value can
        // leak into the percentile below.
        let col1_right = line
            .iter()
            .map(|s| s.bbox.x + s.bbox.width)
            .filter(|&r| r <= mid)
            .reduce(f32::max);
        if let Some(r) = col1_right.filter(|r| r.is_finite()) {
            col1_rights.push(r);
        }
    }

    let total_lines = all_lines.len() as f32;
    if (col2_lefts.len() as f32) < total_lines * 0.25 {
        return None;
    }

    // Most frequent value after snapping to BIN_PT-wide bins, with its count.
    // Ties are resolved towards the smaller coordinate so the result does not
    // depend on the hash map's iteration order.
    let mode = |xs: &[f32]| -> Option<(f32, usize)> {
        let mut bins: HashMap<i32, usize> = HashMap::new();
        for &x in xs {
            *bins.entry((x / BIN_PT).round() as i32).or_insert(0) += 1;
        }
        bins.into_iter()
            .max_by_key(|&(bin, n)| (n, std::cmp::Reverse(bin)))
            .map(|(bin, n)| (bin as f32 * BIN_PT, n))
    };
    let (col1_left, col1_n) = mode(&col1_lefts)?;
    let (col2_left, col2_n) = mode(&col2_lefts)?;
    if (col1_n as f32) < (col1_lefts.len() as f32) * 0.25 {
        return None;
    }
    if (col2_n as f32) < (col2_lefts.len() as f32) * 0.25 {
        return None;
    }

    let col1_right = if col1_rights.len() >= 5 {
        col1_rights.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let p90 = ((col1_rights.len() as f32 * 0.9) as usize).min(col1_rights.len() - 1);
        col1_rights[p90]
    } else {
        col2_left - MIN_GUTTER_PT
    };

    let gutter = col2_left - col1_right;
    if gutter < MIN_GUTTER_PT {
        return None;
    }
    if col2_left <= col1_left + MIN_GUTTER_PT * 2.0 {
        return None;
    }

    // Both columns are assumed equally wide: the distance between the two
    // left edges is one column plus the gutter.
    let total_block_w = ((col2_left - col1_left) * 2.0).max(1.0);
    let col_w = ((total_block_w - gutter) * 0.5).max(1.0);
    let col_w_twips = (col_w * PT_TO_TWIPS) as u32;
    Some(ColumnLayout {
        count: 2,
        space_twips: Some((gutter * PT_TO_TWIPS) as u32),
        separator: false,
        column_widths_twips: vec![col_w_twips, col_w_twips],
    })
}
/// Counts, across all pages, how many spans use each non-black text colour.
///
/// Colours are quantised to 8 bits per channel; pure black is not counted.
fn build_color_histogram(all_spans: &[Vec<TextSpan>]) -> HashMap<[u8; 3], u32> {
    let mut histogram: HashMap<[u8; 3], u32> = HashMap::new();
    for span in all_spans.iter().flatten() {
        let rgb = [span.color.r, span.color.g, span.color.b]
            .map(|channel| (channel * 255.0).round() as u8);
        if rgb == [0, 0, 0] {
            continue;
        }
        *histogram.entry(rgb).or_insert(0) += 1;
    }
    histogram
}
/// Converts each layout span into an inline IR text run, preserving order.
fn spans_to_inline(
    spans: &[TextSpan],
    color_counts: &HashMap<[u8; 3], u32>,
    face_lookup: &HashMap<String, String>,
) -> Vec<InlineContent> {
    let mut inline = Vec::with_capacity(spans.len());
    for span in spans {
        let run = span_to_ir(span, color_counts, face_lookup);
        inline.push(InlineContent::Text(run));
    }
    inline
}
/// Maps one layout span onto an IR text span.
///
/// The font name is first translated through `face_lookup` (falling back to
/// the span's own name) and then filtered by `real_font_name`. The font size
/// and character spacing are stored in half-points; the colour goes through
/// `color_opt`.
fn span_to_ir(
    span: &TextSpan,
    color_counts: &HashMap<[u8; 3], u32>,
    face_lookup: &HashMap<String, String>,
) -> IrSpan {
    let face = face_lookup
        .get(&span.font_name)
        .map_or(span.font_name.as_str(), String::as_str);
    let half_points = (span.font_size * 2.0).round() as u32;

    IrSpan {
        text: span.text.clone(),
        bold: span.font_weight.is_bold(),
        italic: span.is_italic,
        font_name: real_font_name(face),
        font_size_half_pt: Some(half_points),
        color: color_opt(&span.color, color_counts),
        char_spacing_half_pt: char_spacing_opt(span.char_spacing),
        ..Default::default()
    }
}
/// Returns the trimmed font name, or `None` when it carries no usable face.
///
/// Rejected are blank names and short resource-style identifiers: at most
/// four bytes, made of one or two ASCII letters followed only by one or more
/// ASCII digits (for example `F1` or `TT12`).
fn real_font_name(raw: &str) -> Option<String> {
    let name = raw.trim();
    if name.is_empty() {
        return None;
    }

    if name.len() <= 4 {
        // Split at the first byte that is not an ASCII letter; everything
        // before it is ASCII, so the index is always a char boundary.
        let letters_end = name
            .bytes()
            .position(|b| !b.is_ascii_alphabetic())
            .unwrap_or(name.len());
        let (letters, rest) = name.split_at(letters_end);
        let is_resource_id = (1..=2).contains(&letters.len())
            && !rest.is_empty()
            && rest.bytes().all(|b| b.is_ascii_digit());
        if is_resource_id {
            return None;
        }
    }

    Some(name.to_string())
}
/// Chooses the explicit text colour to store for a span, if any.
///
/// The colour is quantised to 8 bits per channel. `None` is returned for
/// pure black, for pure blue, and for the canonical greys (`0x80`, `0xC0`),
/// pure red and pure green when `counts` records fewer than three spans of
/// that colour. Every other colour is returned as `[r, g, b]`.
fn color_opt(c: &Color, counts: &HashMap<[u8; 3], u32>) -> Option<[u8; 3]> {
    let rgb = [c.r, c.g, c.b].map(|channel| (channel * 255.0).round() as u8);
    match rgb {
        [0, 0, 0] | [0, 0, 255] => None,
        [0x80, 0x80, 0x80] | [0xC0, 0xC0, 0xC0] | [0xFF, 0, 0] | [0, 0xFF, 0]
            if counts.get(&rgb).copied().unwrap_or(0) < 3 =>
        {
            None
        }
        _ => Some(rgb),
    }
}
/// Converts character spacing in points to rounded half-points.
///
/// Spacing whose magnitude is not greater than 0.005 pt is treated as absent.
fn char_spacing_opt(spacing_pt: f32) -> Option<i32> {
    let is_significant = spacing_pt.abs() > 0.005;
    is_significant.then(|| (spacing_pt * 2.0).round() as i32)
}
/// Median font size of `spans`, or 12.0 when there are none.
///
/// For an even number of spans the two middle sizes are averaged.
fn median_font_size(spans: &[TextSpan]) -> f32 {
    if spans.is_empty() {
        return 12.0;
    }

    let mut sizes: Vec<f32> = spans.iter().map(|s| s.font_size).collect();
    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    let upper = sizes.len() / 2;
    if sizes.len().is_multiple_of(2) {
        (sizes[upper - 1] + sizes[upper]) / 2.0
    } else {
        sizes[upper]
    }
}