use fleischwolf_core::{Node, PictureImage, Table};
use crate::layout::Region;
use crate::pdfium_backend::{PdfPage, TextCell};
/// Area of the axis-aligned box with left `l`, top `t`, right `r`, bottom `b`.
///
/// An inverted extent (right < left or bottom < top) contributes zero, so the
/// result is never negative.
fn area(l: f32, t: f32, r: f32, b: f32) -> f32 {
    let width = f32::max(r - l, 0.0);
    let height = f32::max(b - t, 0.0);
    width * height
}
/// Area of the overlap between region `a` and the box `(l, t, r, b)`.
///
/// Returns 0.0 when the two boxes do not overlap.
fn inter(a: &Region, l: f32, t: f32, r: f32, b: f32) -> f32 {
    let (left, top) = (a.l.max(l), a.t.max(t));
    let (right, bottom) = (a.r.min(r), a.b.min(b));
    area(left, top, right, bottom)
}
/// Removes overlapping detections, preferring higher-scored regions.
///
/// Regions are visited in descending score order. A region is discarded when
/// an already-kept region covers more than 70% of its area, or when the two
/// have an intersection-over-union above 0.5. Nested `code` boxes are then
/// collapsed by `dedup_nested_code`.
pub fn resolve(mut regions: Vec<Region>) -> Vec<Region> {
    regions.sort_by(|x, y| y.score.total_cmp(&x.score));

    let mut survivors: Vec<Region> = Vec::new();
    for candidate in regions {
        // Areas are floored at 1.0 so degenerate boxes cannot divide by zero.
        let cand_area = area(candidate.l, candidate.t, candidate.r, candidate.b).max(1.0);

        let mut suppressed = false;
        for winner in &survivors {
            let shared = inter(&candidate, winner.l, winner.t, winner.r, winner.b);
            let winner_area = area(winner.l, winner.t, winner.r, winner.b).max(1.0);
            let union = cand_area + winner_area - shared;
            if shared / cand_area > 0.7 || shared / union > 0.5 {
                suppressed = true;
                break;
            }
        }

        if !suppressed {
            survivors.push(candidate);
        }
    }

    dedup_nested_code(&mut survivors);
    survivors
}
/// Returns `true` when `t` (ignoring surrounding whitespace and ASCII case)
/// is one of the known programming/markup language names.
///
/// Only single tokens of at most 12 characters qualify; anything containing
/// inner whitespace is rejected.
fn is_code_language(t: &str) -> bool {
    const LANGS: &[&str] = &[
        "xml", "html", "xhtml", "json", "jsonc", "yaml", "yml", "toml", "ini",
        "c#", "csharp", "f#", "fsharp", "vb", "c", "c++", "cpp", "java",
        "kotlin", "scala", "go", "golang", "rust", "swift",
        "javascript", "js", "typescript", "ts", "jsx", "tsx",
        "python", "py", "ruby", "rb", "php", "perl", "lua", "r", "dart",
        "bash", "sh", "shell", "powershell", "zsh", "batch", "cmd",
        "sql", "tsql", "plsql", "graphql", "dockerfile", "makefile",
        "css", "scss", "sass", "less", "markdown", "md", "tex", "latex",
        "diff", "proto", "razor", "cshtml", "xaml", "aspx", "http",
    ];

    let token = t.trim();
    let plausible = !token.is_empty()
        && token.chars().count() <= 12
        && !token.chars().any(char::is_whitespace);

    // Every table entry is lowercase ASCII, so an ASCII-case-insensitive
    // comparison is the same as lowercasing the token first.
    plausible && LANGS.iter().any(|lang| lang.eq_ignore_ascii_case(token))
}
fn code_language_labels(regions: &[Region], cells: &[TextCell]) -> Vec<bool> {
let mut drop = vec![false; regions.len()];
for (i, r) in regions.iter().enumerate() {
if matches!(r.label, "code" | "picture" | "table") {
continue;
}
if !is_code_language(®ion_text(r, cells)) {
continue;
}
let line_h = (r.b - r.t).abs().max(1.0);
let window = (line_h * 4.0).max(28.0);
let labels_code = regions.iter().enumerate().any(|(j, c)| {
if j == i || c.label != "code" {
return false;
}
let gap = c.t - r.b; let h_overlap = (r.r.min(c.r) - r.l.max(c.l)).max(0.0);
gap > -line_h * 3.0 && gap < window && h_overlap > 0.0
});
if labels_code {
drop[i] = true;
}
}
drop
}
/// Drops `code` regions that are mostly contained in a larger `code` region.
///
/// A code region is removed when another, not-yet-removed code region has a
/// strictly larger area and overlaps more than 70% of it. Regions with other
/// labels are never touched.
fn dedup_nested_code(kept: &mut Vec<Region>) {
    let total = kept.len();
    let mut nested = vec![false; total];

    for inner in 0..total {
        if kept[inner].label != "code" {
            continue;
        }
        let small = &kept[inner];
        let small_area = area(small.l, small.t, small.r, small.b).max(1.0);

        let swallowed = (0..total).any(|outer| {
            if outer == inner || nested[outer] || kept[outer].label != "code" {
                return false;
            }
            let big = &kept[outer];
            let big_area = area(big.l, big.t, big.r, big.b).max(1.0);
            let shared = inter(small, big.l, big.t, big.r, big.b);
            big_area > small_area && shared / small_area > 0.7
        });
        nested[inner] = swallowed;
    }

    // `retain` visits elements in their original order, so a running index
    // lines up with the flags computed above.
    let mut idx = 0;
    kept.retain(|_| {
        let keep = !nested[idx];
        idx += 1;
        keep
    });
}
/// Appends synthetic `text` regions for text cells that no region claims.
///
/// A non-blank cell is an orphan when no existing region overlaps more than
/// 20% of its area. Orphans are sorted top-to-bottom then left-to-right, and
/// neighbouring orphans on the same line are merged into one box. The new
/// regions carry a score of 0.0.
pub fn add_orphan_regions(regions: &mut Vec<Region>, cells: &[TextCell]) {
    let mut orphans: Vec<&TextCell> = Vec::new();
    for cell in cells {
        if cell.text.trim().is_empty() {
            continue;
        }
        let cell_area = area(cell.l, cell.t, cell.r, cell.b).max(1.0);
        let claimed = regions
            .iter()
            .any(|r| inter(r, cell.l, cell.t, cell.r, cell.b) / cell_area > 0.2);
        if !claimed {
            orphans.push(cell);
        }
    }
    if orphans.is_empty() {
        return;
    }

    orphans.sort_by(|a, b| a.t.total_cmp(&b.t).then_with(|| a.l.total_cmp(&b.l)));

    let mut merged: Vec<Region> = Vec::new();
    for cell in orphans {
        let height = (cell.b - cell.t).abs().max(1.0);

        // Extend the previous box when this cell starts on the same line and
        // within one line-height of that box horizontally.
        let joins_previous = merged.last().is_some_and(|last| {
            let same_line = (last.t - cell.t).abs() < height * 0.5;
            let touching = cell.l <= last.r + height && cell.l >= last.l - height;
            same_line && touching
        });

        if joins_previous {
            if let Some(last) = merged.last_mut() {
                last.l = last.l.min(cell.l);
                last.r = last.r.max(cell.r);
                last.t = last.t.min(cell.t);
                last.b = last.b.max(cell.b);
            }
        } else {
            merged.push(Region {
                label: "text",
                score: 0.0,
                l: cell.l,
                t: cell.t,
                r: cell.r,
                b: cell.b,
            });
        }
    }

    regions.extend(merged);
}
/// Removes low-confidence `picture` regions that contain no text on a page
/// that otherwise has textual content.
///
/// Nothing is removed when the page has no non-blank cells or fewer than two
/// non-picture regions with text. A picture is always kept when its score is
/// at least 0.5 or it covers at least a quarter of the page; otherwise it is
/// kept only if some non-blank cell lies more than half inside it.
pub fn drop_false_pictures(
    regions: &mut Vec<Region>,
    cells: &[TextCell],
    page_w: f32,
    page_h: f32,
) {
    let page_has_text = cells.iter().any(|c| !c.text.trim().is_empty());
    if !page_has_text {
        return;
    }

    let mut content_regions = 0usize;
    for r in regions.iter() {
        if r.label != "picture" && !region_text(r, cells).trim().is_empty() {
            content_regions += 1;
        }
    }
    if content_regions < 2 {
        return;
    }

    let page_area = (page_w * page_h).max(1.0);
    regions.retain(|r| {
        if r.label != "picture" {
            return true;
        }
        let confident = r.score >= 0.5;
        let dominant = area(r.l, r.t, r.r, r.b) / page_area >= 0.25;
        if confident || dominant {
            return true;
        }
        cells.iter().any(|c| {
            let cell_area = area(c.l, c.t, c.r, c.b).max(1.0);
            !c.text.trim().is_empty() && inter(r, c.l, c.t, c.r, c.b) / cell_area > 0.5
        })
    });
}
/// Heuristic: is `region` a bare page number?
///
/// True when the region is less than 30 units tall, sits in the top 12% or
/// bottom 12% of the page, and its text consists solely of ASCII digits.
fn is_page_number(region: &Region, cells: &[TextCell], page_h: f32) -> bool {
    let short = (region.b - region.t).abs() < 30.0;
    let in_margin = region.t < page_h * 0.12 || region.b > page_h * 0.88;
    if !(short && in_margin) {
        return false;
    }

    let text = region_text(region, cells);
    let digits = text.trim();
    !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit())
}
/// Returns `true` for layout labels that are never emitted as document nodes.
fn is_skipped(label: &str) -> bool {
    const SKIPPED: [&str; 7] = [
        "page_header",
        "page_footer",
        "checkbox_selected",
        "checkbox_unselected",
        "form",
        "key_value_region",
        "document_index",
    ];
    SKIPPED.contains(&label)
}
fn order_regions<T>(items: &mut [T], page_w: f32, reg: impl Fn(&T) -> &Region) {
let cx = page_w / 2.0;
let band = page_w * 0.08;
let crossing = items
.iter()
.filter(|t| {
let r = reg(t);
r.l < cx - band && r.r > cx + band
})
.count();
let two_col = !items.is_empty()
&& (crossing as f32) / (items.len() as f32) < 0.25
&& items.iter().any(|t| reg(t).r <= cx)
&& items.iter().any(|t| reg(t).l >= cx);
if two_col {
let full_band = page_w * 0.2;
let is_full = |r: &Region| r.l < cx - full_band && r.r > cx + full_band;
let full_tops: Vec<f32> = items
.iter()
.map(®)
.filter(|r| is_full(r))
.map(|r| r.t)
.collect();
let key = |r: &Region| -> (usize, u8) {
let bnd = full_tops.iter().filter(|&&ft| ft < r.t - 1.0).count();
let col = if is_full(r) {
3
} else if (r.l + r.r) / 2.0 >= cx {
2
} else {
1
};
(bnd, col)
};
items.sort_by(|a, b| {
let (a, b) = (reg(a), reg(b));
key(a)
.cmp(&key(b))
.then(a.t.total_cmp(&b.t))
.then(a.l.total_cmp(&b.l))
});
} else {
items.sort_by(|a, b| {
let (a, b) = (reg(a), reg(b));
a.t.total_cmp(&b.t).then(a.l.total_cmp(&b.l))
});
}
}
/// Splits a leading ordered-list marker such as `"12. "` off `s`.
///
/// Returns the parsed number and the remaining text (leading whitespace
/// removed), or `None` when `s` does not start with ASCII digits followed
/// immediately by a `.` or when the digits do not fit in a `u64`.
fn parse_ordered_marker(s: &str) -> Option<(u64, String)> {
    let split = s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len());
    let (digits, tail) = s.split_at(split);
    if digits.is_empty() {
        return None;
    }
    let body = tail.strip_prefix('.')?;
    let number = digits.parse::<u64>().ok()?;
    Some((number, body.trim_start().to_owned()))
}
/// Escapes text for inclusion in Markdown output.
///
/// Underscores are backslash-escaped so they are not read as emphasis, and
/// `&`, `<`, `>` are replaced by their HTML entities so the text cannot be
/// interpreted as inline HTML or an entity reference.
fn md_escape(text: &str) -> String {
    // `&` must be handled before `<` / `>`; otherwise the ampersands
    // introduced by those entities would be escaped a second time.
    text.replace('_', "\\_")
        .replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
}
/// Normalizes extracted text: removes hyphenation markers, maps typographic
/// punctuation to ASCII, and normalizes whitespace.
///
/// * U+0002 and U+00AD (soft hyphen) are removed, together with a single
///   following space when present.
/// * Curly single quotes become `'`; curly double quotes become `"`, or `'`
///   when the text contains Hangul syllables.
/// * En dash, em dash and the minus sign become `-`; the fraction slash
///   becomes `/`; the ellipsis becomes `...`.
/// * Whitespace handling depends on `use_dp_lines()`: when it is set,
///   newlines/tabs become spaces and the ends are trimmed; otherwise all
///   whitespace runs collapse to a single space.
///
/// The result is finally passed through `fix_arabic_lam_alef`.
fn clean_text(text: &str) -> String {
    let has_hangul = text.chars().any(|c| matches!(c, '\u{AC00}'..='\u{D7A3}'));
    let double_quote = if has_hangul { "'" } else { "\"" };

    // Marker-plus-space must be removed before the bare markers so the
    // trailing space disappears along with the marker.
    let unhyphenated = text
        .replace("\u{2} ", "")
        .replace("\u{ad} ", "")
        .replace(['\u{2}', '\u{ad}'], "");

    let ascii_punct = unhyphenated
        .replace(['\u{2018}', '\u{2019}'], "'")
        .replace(['\u{201c}', '\u{201d}'], double_quote)
        .replace(['\u{2013}', '\u{2014}', '\u{2212}'], "-")
        .replace('\u{2044}', "/")
        .replace('\u{2026}', "...");

    let spaced = if !crate::pdfium_backend::use_dp_lines() {
        ascii_punct.split_whitespace().collect::<Vec<_>>().join(" ")
    } else {
        ascii_punct
            .replace(['\n', '\r', '\t'], " ")
            .trim()
            .to_string()
    };

    fix_arabic_lam_alef(&spaced)
}
/// Repairs two artefacts in text containing Arabic letters.
///
/// 1. An alef variant (U+0622, U+0623, U+0625) immediately followed by lam
///    (U+0644) is swapped to lam-then-alef, but only when the preceding
///    character is an Arabic letter other than lam.
/// 2. A space is inserted wherever an Arabic letter directly touches an
///    ASCII letter.
///
/// Text without any Arabic letter (U+0620..=U+064A) is returned unchanged.
fn fix_arabic_lam_alef(s: &str) -> String {
    const LAM: char = '\u{0644}';

    fn is_arabic_letter(c: char) -> bool {
        matches!(c, '\u{0620}'..='\u{064A}')
    }
    fn is_alef_variant(c: char) -> bool {
        matches!(c, '\u{0622}' | '\u{0623}' | '\u{0625}')
    }

    if !s.chars().any(is_arabic_letter) {
        return s.to_owned();
    }

    // Pass 1: un-reverse lam-alef pairs.
    let src: Vec<char> = s.chars().collect();
    let mut swapped: Vec<char> = Vec::with_capacity(src.len());
    let mut pos = 0;
    while let Some(&ch) = src.get(pos) {
        let reversed_pair = pos > 0
            && is_alef_variant(ch)
            && src.get(pos + 1) == Some(&LAM)
            && is_arabic_letter(src[pos - 1])
            && src[pos - 1] != LAM;
        if reversed_pair {
            swapped.extend([LAM, ch]);
            pos += 2;
        } else {
            swapped.push(ch);
            pos += 1;
        }
    }

    // Pass 2: separate Arabic and ASCII letters that touch.
    let mut out = String::with_capacity(s.len());
    let mut prev: Option<char> = None;
    for &ch in &swapped {
        if let Some(p) = prev {
            let script_boundary = (is_arabic_letter(p) && ch.is_ascii_alphabetic())
                || (p.is_ascii_alphabetic() && is_arabic_letter(ch));
            if script_boundary {
                out.push(' ');
            }
        }
        out.push(ch);
        prev = Some(ch);
    }
    out
}
/// Builds `(anchor text, uri)` pairs for the links on `page`.
///
/// The anchor text of a link is the cleaned, space-joined text of every cell
/// whose centre lies inside the link rectangle, read line by line and left
/// to right. Word-level cells are used when the page has them, otherwise the
/// regular cells. Links without anchor text, and a link identical to the
/// previously emitted pair, are skipped.
pub(crate) fn resolve_link_anchors(page: &PdfPage) -> Vec<(String, String)> {
    let words = if !page.word_cells.is_empty() {
        &page.word_cells
    } else {
        &page.cells
    };

    let mut anchors: Vec<(String, String)> = Vec::new();
    for link in &page.links {
        let mut hits: Vec<&TextCell> = Vec::new();
        for cell in words.iter() {
            let mid_x = (cell.l + cell.r) / 2.0;
            let mid_y = (cell.t + cell.b) / 2.0;
            if (link.l..=link.r).contains(&mid_x) && (link.t..=link.b).contains(&mid_y) {
                hits.push(cell);
            }
        }

        // Tallest cell height (at least 1.0) is the bucket size used to
        // group cells into text lines.
        let band = hits
            .iter()
            .map(|c| (c.b - c.t).abs())
            .fold(0.0f32, f32::max)
            .max(1.0);
        hits.sort_by_key(|c| ((c.t / band).round() as i64, (c.l * 10.0) as i64));

        let pieces: Vec<&str> = hits
            .iter()
            .map(|c| c.text.trim())
            .filter(|t| !t.is_empty())
            .collect();
        let anchor = clean_text(&pieces.join(" "));
        if anchor.is_empty() {
            continue;
        }

        let repeats_previous =
            matches!(anchors.last(), Some((a, u)) if *a == anchor && *u == link.uri);
        if !repeats_previous {
            anchors.push((anchor, link.uri.clone()));
        }
    }
    anchors
}
/// Extracts the text of `region` from the cells lying mostly inside it.
///
/// Cells with more than half of their area inside the region are ordered by
/// line (bucketed by the tallest cell height) and then horizontally; the
/// horizontal order is reversed when Arabic characters outnumber ASCII
/// letters. Pieces are joined with a space unless they sit on the same line
/// with a gap of at most a quarter of the line height. When
/// `use_dp_lines()` is set, pieces are always space-separated, except that a
/// trailing hyphen/dash after a letter is dropped and the word re-joined
/// when the continuation looks like the rest of the word. The joined text is
/// passed through `clean_text`.
fn region_text(region: &Region, cells: &[TextCell]) -> String {
    let mut members: Vec<&TextCell> = Vec::new();
    for cell in cells {
        let cell_area = area(cell.l, cell.t, cell.r, cell.b).max(1.0);
        if inter(region, cell.l, cell.t, cell.r, cell.b) / cell_area > 0.5 {
            members.push(cell);
        }
    }

    let band = members
        .iter()
        .map(|c| (c.b - c.t).abs())
        .fold(0.0f32, f32::max)
        .max(1.0);
    let line_of = |c: &TextCell| (c.t / band).round() as i64;

    // Decide reading direction from the dominant script.
    let (mut arabic, mut latin) = (0usize, 0usize);
    for ch in members.iter().flat_map(|c| c.text.chars()) {
        if ('\u{0600}'..='\u{06FF}').contains(&ch) {
            arabic += 1;
        }
        if ch.is_ascii_alphabetic() {
            latin += 1;
        }
    }
    let rtl = arabic > latin;

    members.sort_by_key(|c| {
        let x = (c.l * 10.0) as i64;
        (line_of(c), if rtl { -x } else { x })
    });

    let dp = crate::pdfium_backend::use_dp_lines();
    let mut joined = String::new();
    let mut prev: Option<&TextCell> = None;
    for &cell in &members {
        let piece = cell.text.trim();
        if piece.is_empty() {
            continue;
        }
        if let Some(p) = prev {
            let same_band = line_of(p) == line_of(cell);
            let h = (cell.b - cell.t).abs().max((p.b - p.t).abs()).max(1.0);
            let gap = if rtl { p.l - cell.r } else { cell.l - p.r };

            // Last and second-to-last characters accumulated so far.
            let mut tail = joined.chars().rev();
            let last = tail.next();
            let before = tail.next();
            let ends_dash = matches!(
                last,
                Some('-' | '\u{2010}' | '\u{2013}' | '\u{2014}')
            );
            let next = piece.chars().next();

            let dehyph = dp
                && ends_dash
                && before.is_some_and(|c| c.is_alphabetic())
                && next.is_some_and(|n| {
                    n.is_lowercase()
                        || (n.is_uppercase() && before.is_some_and(|b| b.is_lowercase()))
                });

            if dehyph {
                // Drop the line-break hyphen and glue the halves together.
                joined.pop();
            } else if dp || !same_band || gap > h * 0.25 {
                joined.push(' ');
            }
        }
        joined.push_str(piece);
        prev = Some(cell);
    }

    clean_text(&joined)
}
/// Removes the single space that precedes `.`, `,`, `;`, `)` and `(`.
///
/// The substitutions are applied one after another in that order, each over
/// the whole string.
fn tighten_code_punct(s: &str) -> String {
    const RULES: [(&str, &str); 5] = [
        (" .", "."),
        (" ,", ","),
        (" ;", ";"),
        (" )", ")"),
        (" (", "("),
    ];
    RULES
        .iter()
        .fold(s.to_owned(), |acc, (from, to)| acc.replace(from, to))
}
/// Extracts the text of a code `region`, preserving line breaks and
/// approximate indentation.
///
/// Non-blank cells with more than half of their area inside the region are
/// grouped into lines (bucketed by the tallest cell height) and ordered left
/// to right. Each line is indented by the distance of its first cell from
/// the leftmost cell, measured in average character widths. Cell text is
/// cleaned with `clean_text` and `tighten_code_punct`; cells on the same line
/// are joined with a single space. Returns an empty string when no cell
/// qualifies.
fn code_region_text(region: &Region, cells: &[TextCell]) -> String {
    let mut members: Vec<&TextCell> = cells
        .iter()
        .filter(|c| {
            if c.text.trim().is_empty() {
                return false;
            }
            let cell_area = area(c.l, c.t, c.r, c.b).max(1.0);
            inter(region, c.l, c.t, c.r, c.b) / cell_area > 0.5
        })
        .collect();
    if members.is_empty() {
        return String::new();
    }

    let band = members
        .iter()
        .map(|c| (c.b - c.t).abs())
        .fold(0.0f32, f32::max)
        .max(1.0);
    let line_of = |c: &TextCell| (c.t / band).round() as i64;
    members.sort_by_key(|c| (line_of(c), (c.l * 10.0) as i64));

    // Average glyph width across all cells; every member has non-blank text,
    // so each contributes at least one character.
    let (total_w, total_chars) = members.iter().fold((0.0f32, 0usize), |(w, n), c| {
        (
            w + (c.r - c.l).max(0.0),
            n + c.text.trim().chars().count(),
        )
    });
    let char_w = if total_chars > 0 {
        (total_w / total_chars as f32).max(1.0)
    } else {
        1.0
    };
    let base_l = members.iter().map(|c| c.l).fold(f32::INFINITY, f32::min);

    let mut lines: Vec<String> = Vec::new();
    let mut current_line: Option<i64> = None;
    for cell in &members {
        let text = tighten_code_punct(&clean_text(cell.text.trim()));
        let line = line_of(cell);

        if current_line == Some(line) {
            // Same visual line as the previous cell: append to it.
            if let Some(open) = lines.last_mut() {
                open.push(' ');
                open.push_str(&text);
            }
            continue;
        }

        let indent = ((cell.l - base_l) / char_w).round().max(0.0) as usize;
        lines.push(format!("{}{}", " ".repeat(indent), text));
        current_line = Some(line);
    }
    lines.join("\n")
}
/// Rebuilds a rows-by-columns text grid for a table `region` from cell
/// geometry alone.
///
/// Cells with more than half of their area inside the region are grouped
/// into rows by vertical centre, and column starts are derived by clustering
/// the cells' left edges with a tolerance of 1.5x the median cell height
/// (median floored at 4.0). Every returned row has the same number of
/// columns; several cells landing in the same slot are joined with a space.
/// Returns an empty grid when no cell qualifies.
fn reconstruct_table(region: &Region, cells: &[TextCell]) -> Vec<Vec<String>> {
    let mut members: Vec<&TextCell> = cells
        .iter()
        .filter(|c| {
            let cell_area = area(c.l, c.t, c.r, c.b).max(1.0);
            inter(region, c.l, c.t, c.r, c.b) / cell_area > 0.5
        })
        .collect();
    if members.is_empty() {
        return Vec::new();
    }
    members.sort_by(|a, b| a.t.total_cmp(&b.t));

    // Group into rows. A row's reference centre is that of its first cell.
    let mut rows: Vec<(f32, Vec<&TextCell>)> = Vec::new();
    for &cell in &members {
        let mid_y = (cell.t + cell.b) / 2.0;
        let line_h = (cell.b - cell.t).abs().max(1.0);
        let joins_last = rows
            .last()
            .is_some_and(|(row_y, _)| (mid_y - *row_y).abs() < line_h * 0.7);
        if joins_last {
            if let Some((_, row)) = rows.last_mut() {
                row.push(cell);
            }
        } else {
            rows.push((mid_y, vec![cell]));
        }
    }

    // Column tolerance from the median cell height.
    let mut heights: Vec<f32> = members.iter().map(|c| (c.b - c.t).abs()).collect();
    heights.sort_by(f32::total_cmp);
    let tol = heights[heights.len() / 2].max(4.0) * 1.5;

    // A new column starts whenever a left edge is more than `tol` to the
    // right of the previous column start.
    let mut lefts: Vec<f32> = members.iter().map(|c| c.l).collect();
    lefts.sort_by(f32::total_cmp);
    let mut col_starts: Vec<f32> = Vec::new();
    for left in lefts {
        let starts_new = match col_starts.last() {
            None => true,
            Some(&last) => left - last > tol,
        };
        if starts_new {
            col_starts.push(left);
        }
    }

    let ncols = col_starts.len().max(1);
    // Rightmost column whose start is at or left of `l` (with half a
    // tolerance of slack).
    let col_of = |l: f32| -> usize {
        col_starts
            .iter()
            .rposition(|&start| l + tol * 0.5 >= start)
            .unwrap_or(0)
            .min(ncols - 1)
    };

    let mut grid = Vec::with_capacity(rows.len());
    for (_, mut row) in rows {
        row.sort_by(|a, b| a.l.total_cmp(&b.l));
        let mut cols = vec![String::new(); ncols];
        for cell in row {
            let slot = &mut cols[col_of(cell.l)];
            let text = cell.text.trim().replace(['\u{2}', '\u{ad}'], "");
            if slot.is_empty() {
                *slot = text;
            } else {
                slot.push(' ');
                slot.push_str(&text);
            }
        }
        grid.push(cols);
    }
    grid
}
/// Crops `region` out of the page image and encodes it as PNG.
///
/// Region coordinates are multiplied by `page.scale` to get pixel
/// coordinates and the crop is clamped to the image bounds. Returns `None`
/// when the region starts outside the image, the crop is empty, or PNG
/// encoding fails.
fn crop_region(page: &PdfPage, region: &Region) -> Option<PictureImage> {
    let scale = page.scale;
    let img_w = page.image.width();
    let img_h = page.image.height();

    let x = (region.l * scale).max(0.0) as u32;
    let y = (region.t * scale).max(0.0) as u32;
    if x >= img_w || y >= img_h {
        return None;
    }

    // `img_w - x` / `img_h - y` cannot underflow thanks to the check above.
    let w = u32::min(((region.r - region.l) * scale) as u32, img_w - x);
    let h = u32::min(((region.b - region.t) * scale) as u32, img_h - y);
    if w == 0 || h == 0 {
        return None;
    }

    let cropped = image::imageops::crop_imm(&page.image, x, y, w, h).to_image();
    let mut png = std::io::Cursor::new(Vec::new());
    cropped.write_to(&mut png, image::ImageFormat::Png).ok()?;

    Some(PictureImage {
        mimetype: "image/png".into(),
        width: w,
        height: h,
        data: png.into_inner(),
    })
}
/// Pairs each `picture` region with the nearest `caption` region below it.
///
/// The result has one slot per region; the slot of a picture holds the index
/// of its caption, every other slot is `None`. A caption qualifies when it
/// overlaps the picture horizontally and its top lies between one caption
/// line-height above and three line-heights below the picture's bottom.
/// Each caption is assigned to at most one picture; the closest candidate
/// wins and ties go to the earlier region.
fn pair_captions(regions: &[Region]) -> Vec<Option<usize>> {
    let mut pairs = vec![None; regions.len()];
    let mut taken = vec![false; regions.len()];

    for (pi, picture) in regions.iter().enumerate() {
        if picture.label != "picture" {
            continue;
        }

        let nearest = regions
            .iter()
            .enumerate()
            .filter(|&(ci, c)| c.label == "caption" && !taken[ci])
            .filter_map(|(ci, c)| {
                let line_h = (c.b - c.t).abs().max(1.0);
                let gap = c.t - picture.b;
                let h_overlap = (picture.r.min(c.r) - picture.l.max(c.l)).max(0.0);
                let close = gap > -line_h && gap < line_h * 3.0 && h_overlap > 0.0;
                close.then_some((ci, gap.abs()))
            })
            // `min_by` returns the first of several equal minima.
            .min_by(|a, b| a.1.total_cmp(&b.1));

        if let Some((ci, _)) = nearest {
            pairs[pi] = Some(ci);
            taken[ci] = true;
        }
    }
    pairs
}
/// Pairs each `code` region with the nearest `caption` region above it.
///
/// The result has one slot per region; the slot of a code block holds the
/// index of its caption, every other slot is `None`. A caption qualifies
/// when it overlaps the code block horizontally and its bottom lies between
/// three caption line-heights above and one line-height below the code
/// block's top. Each caption is assigned to at most one code block; the
/// closest candidate wins and ties go to the earlier region.
fn pair_code_captions(regions: &[Region]) -> Vec<Option<usize>> {
    let mut pairs = vec![None; regions.len()];
    let mut taken = vec![false; regions.len()];

    for (pi, code) in regions.iter().enumerate() {
        if code.label != "code" {
            continue;
        }

        let nearest = regions
            .iter()
            .enumerate()
            .filter(|&(ci, c)| c.label == "caption" && !taken[ci])
            .filter_map(|(ci, c)| {
                let line_h = (c.b - c.t).abs().max(1.0);
                let gap = code.t - c.b;
                let h_overlap = (code.r.min(c.r) - code.l.max(c.l)).max(0.0);
                let close = gap > -line_h && gap < line_h * 3.0 && h_overlap > 0.0;
                close.then_some((ci, gap.abs()))
            })
            // `min_by` returns the first of several equal minima.
            .min_by(|a, b| a.1.total_cmp(&b.1));

        if let Some((ci, _)) = nearest {
            pairs[pi] = Some(ci);
            taken[ci] = true;
        }
    }
    pairs
}
pub fn assemble_page(
page: &PdfPage,
regions: Vec<Region>,
table_rows: &[Option<Vec<Vec<String>>>],
) -> (Vec<Node>, Vec<(String, String)>) {
let mut nodes: Vec<Node> = Vec::new();
let links = resolve_link_anchors(page);
let mut items: Vec<(Region, Option<Vec<Vec<String>>>)> = regions
.into_iter()
.enumerate()
.map(|(i, r)| (r, table_rows.get(i).cloned().flatten()))
.collect();
order_regions(&mut items, page.width, |it| &it.0);
let page_h = page.height;
items.sort_by_key(|(r, _)| !is_page_number(r, &page.cells, page_h));
let table_rows: Vec<Option<Vec<Vec<String>>>> = items.iter().map(|(_, t)| t.clone()).collect();
let regions: Vec<Region> = items.into_iter().map(|(r, _)| r).collect();
let caption_for = pair_captions(®ions);
let code_caption_for = pair_code_captions(®ions);
let mut consumed = vec![false; regions.len()];
for ci in caption_for.iter().flatten() {
consumed[*ci] = true;
}
for ci in code_caption_for.iter().flatten() {
consumed[*ci] = true;
}
for (i, is_label) in code_language_labels(®ions, &page.cells)
.into_iter()
.enumerate()
{
if is_label {
consumed[i] = true;
}
}
for (i, region) in regions.iter().enumerate() {
if is_skipped(region.label) || consumed[i] {
continue;
}
if region.label == "picture" {
let caption = caption_for[i]
.map(|ci| region_text(®ions[ci], &page.cells))
.filter(|t| !t.is_empty());
nodes.push(Node::Picture {
caption,
image: crate::timing::timed("crop_region", || crop_region(page, region)),
});
continue;
}
let text = region_text(region, &page.cells);
if text.is_empty() {
continue;
}
match region.label {
"title" | "section_header" => nodes.push(Node::Heading {
level: 2,
text: md_escape(&text),
}),
"list_item" => {
let stripped = text
.trim_start_matches(['•', '◦', '▪', '·', '*', '-'])
.trim_start()
.to_string();
if let Some((number, rest)) = parse_ordered_marker(&stripped) {
nodes.push(Node::ListItem {
ordered: true,
number,
first_in_list: false,
text: md_escape(&rest),
level: 0,
});
} else {
nodes.push(Node::ListItem {
ordered: false,
number: 0,
first_in_list: false,
text: md_escape(&stripped),
level: 0,
});
}
}
"table" => {
let rows = table_rows[i].clone().unwrap_or_else(|| {
let rows = reconstruct_table(region, &page.cells);
if rows.iter().any(|r| r.len() > 1) {
rows
} else {
vec![vec![text.clone()]]
}
});
nodes.push(Node::Table(Table { rows }));
}
"formula" => nodes.push(Node::Paragraph {
text: "<!-- formula-not-decoded -->".into(),
}),
"code" => {
let code = code_region_text(region, &page.code_cells);
let code = if code.is_empty() {
tighten_code_punct(&text)
} else {
code
};
nodes.push(Node::Code {
language: None,
text: code,
});
if let Some(ci) = code_caption_for[i] {
let cap = region_text(®ions[ci], &page.cells);
if !cap.is_empty() {
nodes.push(Node::Paragraph { text: cap });
}
}
}
_ => nodes.push(Node::Paragraph {
text: md_escape(&text),
}),
}
}
(nodes, links)
}
/// Heuristic: does `text` start like a figure or table caption?
///
/// True when, after leading whitespace, the text begins with `Fig` or
/// `Table` and an ASCII digit occurs within its first 14 characters.
fn looks_like_caption(text: &str) -> bool {
    let lead = text.trim_start();
    let cutoff = lead
        .char_indices()
        .nth(14)
        .map_or(lead.len(), |(idx, _)| idx);
    let head = &lead[..cutoff];

    let captioned = head.starts_with("Fig") || head.starts_with("Table");
    captioned && head.chars().any(|c| c.is_ascii_digit())
}
/// Returns `true` when a paragraph appears to stop mid-sentence: its last
/// non-whitespace character is a letter or a hyphen/dash.
fn paragraph_is_open(text: &str) -> bool {
    match text.trim_end().chars().last() {
        Some('-' | '\u{2010}' | '\u{2013}' | '\u{2014}') => true,
        Some(c) => c.is_alphabetic(),
        None => false,
    }
}
/// Joins paragraphs that were split mid-sentence.
///
/// For every paragraph that is not a caption and ends "open" (see
/// `paragraph_is_open`), any pictures and caption-like paragraphs directly
/// after it are skipped; if the next node is a paragraph starting with a
/// lowercase letter, it is appended to the open paragraph (separated by one
/// space) and removed from the list. The merged paragraph is examined again,
/// so chains of continuations collapse into one paragraph.
pub(crate) fn merge_continuations(nodes: &mut Vec<Node>) {
    // Pictures and captions may sit between the two halves of a paragraph.
    let is_float = |n: &Node| match n {
        Node::Picture { .. } => true,
        Node::Paragraph { text } => looks_like_caption(text),
        _ => false,
    };

    let mut head = 0;
    while head + 1 < nodes.len() {
        let open = match &nodes[head] {
            Node::Paragraph { text } => !looks_like_caption(text) && paragraph_is_open(text),
            _ => false,
        };
        if !open {
            head += 1;
            continue;
        }

        let mut target = head + 1;
        while nodes.get(target).is_some_and(|n| is_float(n)) {
            target += 1;
        }

        let continuation = match nodes.get(target) {
            Some(Node::Paragraph { text })
                if text
                    .trim_start()
                    .chars()
                    .next()
                    .is_some_and(char::is_lowercase) =>
            {
                Some(text.trim_start().to_string())
            }
            _ => None,
        };

        match continuation {
            Some(rest) => {
                if let Node::Paragraph { text } = &mut nodes[head] {
                    let merged = format!("{} {}", text.trim_end(), rest);
                    *text = merged;
                }
                nodes.remove(target);
                // `head` is left unchanged so the merged paragraph is
                // re-checked for a further continuation.
            }
            None => head += 1,
        }
    }
}
/// Index from which trailing nodes must be held back because a later batch
/// might still continue them.
///
/// Scanning from the end, pictures and caption-like paragraphs are skipped.
/// If the first other node found is an open paragraph, its index is
/// returned; otherwise (or when there is no such node) the result is
/// `nodes.len()`, meaning nothing has to be held.
fn hold_start(nodes: &[Node]) -> usize {
    let last_solid = nodes.iter().rposition(|n| match n {
        Node::Picture { .. } => false,
        Node::Paragraph { text } => !looks_like_caption(text),
        _ => true,
    });

    match last_solid {
        Some(k) if matches!(&nodes[k], Node::Paragraph { text } if paragraph_is_open(text)) => k,
        _ => nodes.len(),
    }
}
/// Incremental counterpart of `merge_continuations` for nodes that arrive in
/// batches.
///
/// Nodes that might still be continued by a later batch are buffered; the
/// rest are released as soon as they are final.
pub(crate) struct StreamAssembler {
    /// Nodes held back until the next `push` or `finish`.
    pending: Vec<Node>,
}

impl StreamAssembler {
    /// Creates an assembler with nothing buffered.
    pub(crate) fn new() -> Self {
        StreamAssembler { pending: vec![] }
    }

    /// Adds a batch of nodes and returns those that are now final.
    ///
    /// The batch is appended to the buffer, continuations are merged, and
    /// everything before the hold point (`hold_start`) is handed out. The
    /// remainder stays buffered.
    pub(crate) fn push(&mut self, nodes: Vec<Node>) -> Vec<Node> {
        self.pending.extend(nodes);
        merge_continuations(&mut self.pending);
        let cut = hold_start(&self.pending);
        self.pending.drain(..cut).collect()
    }

    /// Consumes the assembler and returns whatever is still buffered.
    pub(crate) fn finish(self) -> Vec<Node> {
        let StreamAssembler { pending } = self;
        pending
    }
}
#[cfg(test)]
mod tests {
    use super::clean_text;
    use super::{code_region_text, merge_continuations, StreamAssembler};
    use crate::layout::Region;
    use crate::pdfium_backend::TextCell;
    use fleischwolf_core::Node;

    /// Builds a text cell with the given text and box.
    fn cell(text: &str, l: f32, t: f32, r: f32, b: f32) -> TextCell {
        TextCell {
            text: text.into(),
            l,
            t,
            r,
            b,
        }
    }

    /// Builds a layout region with the given label, score and box.
    fn region(label: &'static str, score: f32, l: f32, t: f32, r: f32, b: f32) -> Region {
        Region {
            label,
            score,
            l,
            t,
            r,
            b,
        }
    }

    #[test]
    fn resolve_collapses_nested_code_keeping_the_larger_box() {
        let tight = region("code", 0.95, 78.0, 292.0, 300.0, 330.0);
        let wide = region("code", 0.66, 63.0, 260.0, 320.0, 346.0);
        let kept = super::resolve(vec![tight, wide]);
        assert_eq!(kept.len(), 1, "nested code boxes must collapse to one");
        assert!(
            kept[0].l == 63.0 && kept[0].b == 346.0,
            "the larger containing box is kept"
        );
    }

    #[test]
    fn resolve_keeps_distinct_and_differently_typed_regions() {
        // A small text box inside a large table is not suppressed.
        let text = region("text", 0.95, 90.0, 210.0, 200.0, 230.0);
        let table = region("table", 0.60, 80.0, 200.0, 400.0, 500.0);
        assert_eq!(super::resolve(vec![text, table]).len(), 2);

        // Two code boxes that do not overlap both survive.
        let code_a = region("code", 0.9, 78.0, 100.0, 300.0, 140.0);
        let code_b = region("code", 0.9, 78.0, 300.0, 300.0, 360.0);
        assert_eq!(super::resolve(vec![code_a, code_b]).len(), 2);
    }

    #[test]
    fn code_language_label_above_code_is_detected() {
        let label = region("section_header", 0.9, 76.0, 540.0, 96.0, 549.0);
        let code = region("code", 0.7, 77.0, 552.0, 290.0, 640.0);
        let heading = region("section_header", 0.9, 76.0, 500.0, 260.0, 512.0);
        let cells = vec![
            cell("XML", 78.0, 541.0, 94.0, 548.0),
            cell("Overview", 78.0, 501.0, 250.0, 511.0),
        ];
        let drop = super::code_language_labels(&[label, code, heading], &cells);
        assert_eq!(drop, vec![true, false, false], "only the label is consumed");

        // Without a code region nearby the label is left alone.
        let label2 = region("section_header", 0.9, 76.0, 540.0, 96.0, 549.0);
        let only = vec![cell("XML", 78.0, 541.0, 94.0, 548.0)];
        assert_eq!(super::code_language_labels(&[label2], &only), vec![false]);

        // A label lying inside the code box is consumed as well.
        let inside_lbl = region("text", 0.9, 76.0, 540.0, 96.0, 549.0);
        let wide_code = region("code", 0.7, 63.0, 531.0, 320.0, 654.0);
        let cells2 = vec![cell("XML", 78.0, 541.0, 94.0, 548.0)];
        assert_eq!(
            super::code_language_labels(&[inside_lbl, wide_code], &cells2),
            vec![true, false]
        );

        assert!(super::is_code_language("XML") && super::is_code_language("c#"));
        assert!(!super::is_code_language("Configure") && !super::is_code_language("XML schema"));
    }

    #[test]
    fn code_region_text_keeps_lines_and_indentation() {
        let region = Region {
            label: "code",
            score: 1.0,
            l: 0.0,
            t: -5.0,
            r: 100.0,
            b: 40.0,
        };
        let cells = vec![
            cell("struct P {", 10.0, 0.0, 70.0, 10.0),
            cell("int X;", 22.0, 12.0, 58.0, 22.0),
            cell("}", 10.0, 24.0, 16.0, 34.0),
        ];
        assert_eq!(code_region_text(&region, &cells), "struct P {\n int X;\n}");
    }

    #[test]
    fn code_region_text_tightens_punctuation_without_eating_indentation() {
        let region = Region {
            label: "code",
            score: 1.0,
            l: 0.0,
            t: -5.0,
            r: 100.0,
            b: 40.0,
        };
        let cells = vec![
            cell("builder", 10.0, 0.0, 52.0, 10.0),
            cell(".Foo (x)", 22.0, 12.0, 70.0, 22.0),
        ];
        assert_eq!(code_region_text(&region, &cells), "builder\n .Foo(x)");
    }

    #[test]
    fn code_region_text_orders_out_of_order_cells_and_ignores_blank_lines() {
        let region = Region {
            label: "code",
            score: 1.0,
            l: 0.0,
            t: -5.0,
            r: 100.0,
            b: 60.0,
        };
        let cells = vec![
            cell("b();", 10.0, 24.0, 34.0, 34.0),
            cell(" ", 10.0, 12.0, 20.0, 22.0),
            cell("a();", 10.0, 0.0, 34.0, 10.0),
        ];
        assert_eq!(code_region_text(&region, &cells), "a();\nb();");
        assert_eq!(code_region_text(&region, &[]), "");
    }

    /// Shorthand for a paragraph node.
    fn para(text: &str) -> Node {
        Node::Paragraph { text: text.into() }
    }

    /// Feeds `nodes` to a `StreamAssembler` in batches ending at `splits`
    /// and checks the output equals a one-shot `merge_continuations`.
    fn assert_stream_eq(nodes: &[Node], splits: &[usize]) {
        let mut want = nodes.to_vec();
        merge_continuations(&mut want);

        let mut asm = StreamAssembler::new();
        let mut got = Vec::new();
        let mut start = 0;
        for &end in splits {
            got.extend(asm.push(nodes[start..end].to_vec()));
            start = end;
        }
        got.extend(asm.push(nodes[start..].to_vec()));
        got.extend(asm.finish());
        assert_eq!(got, want, "stream assembly diverged (splits={splits:?})");
    }

    #[test]
    fn stream_assembler_matches_merge_continuations() {
        // A continuation that crosses a batch boundary.
        let cross = [para("the definition of"), para("lists in scope")];
        assert_stream_eq(&cross, &[1]);
        assert_stream_eq(&cross, &[]);

        // A picture and its caption sit between the two halves.
        let wrap = [
            para("the wing type that is"),
            Node::Picture {
                caption: None,
                image: None,
            },
            para("Fig. 1. a diagram"),
            para("the most common kind"),
        ];
        for splits in [&[][..], &[1][..], &[2][..], &[3][..], &[1, 3][..]] {
            assert_stream_eq(&wrap, splits);
        }

        // A heading between two paragraphs.
        let blocked = [
            para("ends mid word and"),
            Node::Heading {
                level: 2,
                text: "New Section".into(),
            },
            para("more body here"),
        ];
        for splits in [&[][..], &[1][..], &[2][..]] {
            assert_stream_eq(&blocked, splits);
        }

        // Three paragraphs pushed one at a time.
        let chain = [
            para("alpha beta"),
            para("gamma delta"),
            para("epsilon zeta"),
        ];
        assert_stream_eq(&chain, &[1, 2]);
    }

    #[test]
    fn clean_text_dehyphenates_and_normalizes_typography() {
        assert_eq!(clean_text("com\u{2} pact"), "compact");
        assert_eq!(clean_text("end-to\u{2} end deep"), "end-toend deep");
        assert_eq!(clean_text("word\u{2}"), "word");
        assert_eq!(
            clean_text("Graph\u{2019}s \u{201c}x\u{201d}"),
            "Graph's \"x\""
        );
        assert_eq!(clean_text("a\u{2026}"), "a...");
        assert_eq!(clean_text("a b\nc"), "a b c");
    }

    #[test]
    fn lam_alef_only_swaps_a_genuinely_reversed_ligature() {
        // beh + alef-hamza + lam: the alef/lam pair is swapped.
        assert_eq!(
            clean_text("\u{0628}\u{0623}\u{0644}"),
            "\u{0628}\u{0644}\u{0623}"
        );
        // The alef is preceded by lam: left untouched.
        assert_eq!(
            clean_text("\u{0627}\u{0644}\u{0622}\u{0644}\u{064a}"),
            "\u{0627}\u{0644}\u{0622}\u{0644}\u{064a}"
        );
    }
}