Skip to main content

cli_justify/pdf_hybrid/
page_stream.rs

1use super::engine::justify_pdf_hybrid;
2use super::structure::{
3  looks_like_git_log_graph_line, looks_like_table_or_figure_caption,
4  parse_list_marker,
5};
6use crate::text_utils::char_len;
7
// Maximum number of trailing lines of the previous page that the seam
// checks in this file (`prior_is_sibling_list_item`, `prior_is_caption`)
// walk back through before giving up.
//
// Same lookback window the per-page engine uses when reasoning about
// sibling list / caption continuity, applied here to peek across the
// page boundary in `smooth_pdf_page_seams`.
// NOTE(review): `smooth_pdf_page_seams` is not defined in this file —
// confirm the reference is still current.
const MAX_SEAM_LOOKBACK_LINES: usize = 12;
12
/// Justified output for a single PDF page, augmented with the raw text of
/// any partial paragraphs that may continue onto a neighbouring page.
///
/// * `lines` — the page's justified output lines.
/// * `head_partial` — the first paragraph of the page when it looks like the
///   continuation of text from the previous page, otherwise `None`.
/// * `tail_partial` — the last paragraph of the page when it looks like it
///   carries on onto the next page, otherwise `None`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfPageJustified {
  pub lines: Vec<String>,
  pub head_partial: Option<PartialParagraph>,
  pub tail_partial: Option<PartialParagraph>,
}

/// A paragraph (or paragraph fragment) at a page boundary together with the
/// number of justified output lines it occupies when laid out alone. The
/// `line_count` lets a consumer splice a re-justified seam back into the
/// page's `lines` by replacing exactly the partial's worth of lines.
///
/// * `raw_text` — the unjustified text of the fragment.
/// * `line_count` — number of output lines the fragment produces when it is
///   justified on its own (edge blanks excluded).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PartialParagraph {
  pub raw_text: String,
  pub line_count: usize,
}
29
30pub fn justify_pdf_page(raw_text: &str, col: usize) -> PdfPageJustified {
31  // Strip leading and trailing blanks so adjacent pages don't compound
32  // their boundary blanks at concatenation time. A page's leading blank
33  // is always an artifact of pdf_oxide's paragraph-break detector firing
34  // before the first content row; a trailing blank is always the empty
35  // element produced by `text.split('\n')` on a raw page that ends with
36  // `\n`. Neither carries meaning across the page boundary. With them
37  // stripped, the seam between pages is inserted by `flat_lines` (1
38  // blank for an ordinary paragraph break, 0 for sibling list / caption
39  // continuity) and both `rendered_line_count` and `flat_lines` agree
40  // on the per-page contribution.
41  let mut lines = justify_pdf_hybrid(raw_text, col);
42  trim_edge_blanks(&mut lines);
43  let (head_raw, tail_raw) = detect_partial_paragraphs(raw_text);
44
45  let head_partial = head_raw.map(|raw| {
46    let mut head_lines = justify_pdf_hybrid(&raw, col);
47    trim_edge_blanks(&mut head_lines);
48    PartialParagraph { raw_text: raw, line_count: head_lines.len() }
49  });
50  let tail_partial = tail_raw.map(|raw| {
51    let mut tail_lines = justify_pdf_hybrid(&raw, col);
52    trim_edge_blanks(&mut tail_lines);
53    PartialParagraph { raw_text: raw, line_count: tail_lines.len() }
54  });
55
56  PdfPageJustified { lines, head_partial, tail_partial }
57}
58
/// Remove every empty string from the front and the back of `lines`,
/// leaving interior blanks untouched.
///
/// Only strings that are exactly empty count as blank; a whitespace-only
/// line is kept. If every line is empty the vector ends up empty.
fn trim_edge_blanks(lines: &mut Vec<String>) {
  // Cut the trailing run first: keep everything up to and including the
  // last non-empty line (or nothing when there is none).
  let kept_end = lines
    .iter()
    .rposition(|line| !line.is_empty())
    .map_or(0, |idx| idx + 1);
  lines.truncate(kept_end);

  // Drop the leading run in a single shift instead of one `remove(0)` per
  // blank, which would move the whole tail every time.
  let leading = lines.iter().take_while(|line| line.is_empty()).count();
  lines.drain(..leading);
}
67
68/// Re-justify a seam paragraph formed by joining the trailing partial of one
69/// page with the leading partial of the next. The joined text is fed through
70/// the standard PDF justifier so soft hyphens, mid-word breaks and similar
71/// cross-line repairs happen as if the paragraph had never been split.
72pub fn justify_pdf_seam(
73  prev_tail_raw: &str,
74  next_head_raw: &str,
75  col: usize,
76) -> Vec<String> {
77  let prev = prev_tail_raw.trim_end_matches(['\n', ' ', '\t']);
78  let next = next_head_raw.trim_start_matches(['\n', ' ', '\t']);
79  let mut lines = if prev.is_empty() {
80    justify_pdf_hybrid(next, col)
81  } else if next.is_empty() {
82    justify_pdf_hybrid(prev, col)
83  } else {
84    let joined = format!("{prev}\n{next}");
85    justify_pdf_hybrid(&joined, col)
86  };
87  trim_edge_blanks(&mut lines);
88  lines
89}
90
91/// Number of blank lines to insert between two adjacent PDF page outputs
92/// in the streaming reader.
93///
94/// Returns:
95///   * `0` when the two pages should read as one continuous block — a bulleted
96///     / numbered list whose sibling items span the page break, or a caption
97///     list (`Plate N …`, `Figure 3.4 …`, `Table 2 …`) whose entries straddle a
98///     page boundary.
99///   * `1` otherwise, as the normal paragraph separator.
100///
101/// Both `this_lines` and `next_lines` are the per-page `standalone_lines`
102/// produced by `justify_pdf_page`, with edge blanks already stripped.
103/// `flat_lines` calls this to decide the separator; `rendered_line_count`
104/// calls it to keep the per-page count in sync with what `flat_lines`
105/// produces, so cursor positioning and "jump to page" stay correct.
106pub fn inter_page_blank_count(
107  this_lines: &[String],
108  next_lines: &[String],
109) -> usize {
110  let Some(first_next) = next_lines.iter().find(|l| !l.is_empty()) else {
111    return 1;
112  };
113
114  if let Some((next_indent, next_marker, _)) = parse_list_marker(first_next)
115    && prior_is_sibling_list_item(this_lines, &next_indent, &next_marker)
116  {
117    return 0;
118  }
119  if looks_like_table_or_figure_caption(first_next.trim())
120    && prior_is_caption(this_lines)
121  {
122    return 0;
123  }
124  if looks_like_git_log_graph_line(first_next.trim())
125    && this_lines
126      .iter()
127      .rev()
128      .find(|l| !l.is_empty())
129      .is_some_and(|l| looks_like_git_log_graph_line(l.trim()))
130  {
131    return 0;
132  }
133
134  1
135}
136
137fn prior_is_sibling_list_item(
138  this_lines: &[String],
139  indent: &str,
140  marker: &str,
141) -> bool {
142  let continuation_indent_width = char_len(indent) + char_len(marker);
143  let scan_floor = this_lines.len().saturating_sub(MAX_SEAM_LOOKBACK_LINES);
144  for idx in (scan_floor..this_lines.len()).rev() {
145    let line = &this_lines[idx];
146    if line.is_empty() {
147      return false;
148    }
149    if line_starts_sibling_list_item(line, indent, marker) {
150      return true;
151    }
152    let leading_ws = line.chars().take_while(|ch| *ch == ' ').count();
153    if leading_ws < continuation_indent_width {
154      return false;
155    }
156  }
157  false
158}
159
160fn prior_is_caption(this_lines: &[String]) -> bool {
161  let scan_floor = this_lines.len().saturating_sub(MAX_SEAM_LOOKBACK_LINES);
162  for idx in (scan_floor..this_lines.len()).rev() {
163    let line = &this_lines[idx];
164    if line.is_empty() {
165      return false;
166    }
167    if looks_like_table_or_figure_caption(line.trim()) {
168      return true;
169    }
170  }
171  false
172}
173
/// Check whether `line` opens a list item that is a sibling of the item
/// described by `indent` and `marker`.
///
/// A line qualifies when, after the exact `indent` prefix, it either
///   * starts with `marker` verbatim (e.g. the same bullet glyph), or
///   * starts with one or more ASCII digits followed by the marker's closing
///     punctuation and a space — this only applies when the marker (ignoring
///     trailing whitespace) ends in `.` or `)`, so that `1. ` is accepted as
///     a sibling of `2. `.
fn line_starts_sibling_list_item(
  line: &str,
  indent: &str,
  marker: &str,
) -> bool {
  let Some(rest) = line.strip_prefix(indent) else {
    return false;
  };
  if rest.starts_with(marker) {
    return true;
  }

  // Only numbered markers get the "any number, same punctuation" rule.
  let punct = match marker.trim_end().chars().last() {
    Some(p @ ('.' | ')')) => p,
    _ => return false,
  };

  let after_number = rest.trim_start_matches(|ch: char| ch.is_ascii_digit());
  if after_number.len() == rest.len() {
    // No leading digits at all.
    return false;
  }
  let mut tail = after_number.chars();
  tail.next() == Some(punct) && tail.next() == Some(' ')
}
204
205fn detect_partial_paragraphs(
206  raw_text: &str,
207) -> (Option<String>, Option<String>) {
208  let paragraphs = split_paragraphs(raw_text);
209  if paragraphs.is_empty() {
210    return (None, None);
211  }
212
213  let head =
214    if looks_like_continuation(paragraphs.first().copied().unwrap_or("")) {
215      Some(paragraphs.first().copied().unwrap_or("").to_string())
216    } else {
217      None
218    };
219  let tail = if paragraphs.len() == 1 {
220    None
221  } else if looks_incomplete(paragraphs.last().copied().unwrap_or("")) {
222    Some(paragraphs.last().copied().unwrap_or("").to_string())
223  } else {
224    None
225  };
226
227  (head, tail)
228}
229
/// Split `text` into paragraphs separated by one or more blank lines.
///
/// A line is blank when it is empty or whitespace-only. Each returned slice
/// borrows from `text`, starts at the beginning of the paragraph's first
/// line (leading indentation is kept) and has its trailing whitespace,
/// including line terminators, removed. Interior newlines are preserved.
/// Text without any non-blank line yields an empty vector.
fn split_paragraphs(text: &str) -> Vec<&str> {
  let mut paragraphs = Vec::new();
  // Byte offset where the paragraph currently being collected began.
  let mut start: Option<usize> = None;
  // Byte offset of the line being examined; always on a line boundary, so
  // slicing `text` with it is char-boundary safe.
  let mut offset = 0usize;

  for line in text.split_inclusive('\n') {
    if line.trim().is_empty() {
      // A blank line closes the open paragraph, if any.
      if let Some(begin) = start.take() {
        paragraphs.push(text[begin..offset].trim_end());
      }
    } else if start.is_none() {
      start = Some(offset);
    }
    offset += line.len();
  }

  // The final paragraph has no blank line after it to close it.
  if let Some(begin) = start {
    paragraphs.push(text[begin..].trim_end());
  }
  paragraphs
}
257
/// Heuristic: does `paragraph` read like the continuation of a sentence
/// that started earlier (e.g. on the previous page)?
///
/// True when, ignoring leading whitespace, the paragraph either
///   * begins with a lowercase character (ASCII or Unicode), or
///   * begins with one of the small connective words `and`, `but`, `or`,
///     `so` in any ASCII letter case, with any trailing non-alphabetic
///     characters (such as a comma) ignored.
///
/// How the paragraph ends is not taken into account. An empty or
/// whitespace-only paragraph is never a continuation.
fn looks_like_continuation(paragraph: &str) -> bool {
  const CONNECTIVES: [&str; 4] = ["and", "but", "or", "so"];

  let body = paragraph.trim_start();
  match body.chars().next() {
    None => false,
    // A lowercase opening is a strong sign of a mid-sentence start.
    Some(initial) if initial.is_lowercase() => true,
    Some(_) => {
      let lead = body
        .split_whitespace()
        .next()
        .unwrap_or("")
        .trim_end_matches(|ch: char| !ch.is_alphabetic());
      CONNECTIVES.iter().any(|word| lead.eq_ignore_ascii_case(word))
    }
  }
}
281
/// Heuristic: does `paragraph` appear to stop mid-sentence, so that it
/// probably carries on elsewhere?
///
/// Trailing whitespace is ignored. The paragraph is considered complete
/// (returns `false`) when it is empty, when its last character is closing
/// punctuation (`.`, `!`, `?`, `:`, `;`, `]`, `)`, `}` or `"`), or when it
/// has four words or fewer — such a short fragment is more likely a heading.
/// Anything else is reported as incomplete.
fn looks_incomplete(paragraph: &str) -> bool {
  const CLEAN_ENDINGS: [char; 9] =
    ['.', '!', '?', ':', ';', ']', ')', '}', '"'];
  const MAX_HEADING_WORDS: usize = 4;

  let body = paragraph.trim_end();
  match body.chars().next_back() {
    None => false,
    // The paragraph closes on punctuation, so the seam is clean.
    Some(last) if CLEAN_ENDINGS.contains(&last) => false,
    Some(_) => body.split_whitespace().count() > MAX_HEADING_WORDS,
  }
}
302
#[cfg(test)]
mod tests {
  use super::*;

  /// Builds an owned page of output lines from string literals.
  fn page(rows: &[&str]) -> Vec<String> {
    rows.iter().copied().map(String::from).collect()
  }

  #[test]
  fn detects_tail_partial_when_paragraph_lacks_terminator() {
    let (head, tail) = detect_partial_paragraphs(
      "First paragraph ends cleanly.\n\nThis longer paragraph carries over without any punctuation at the end",
    );
    assert!(
      head.is_none(),
      "first paragraph starts uppercase, not a continuation"
    );
    let tail = tail.expect("trailing partial should be detected");
    assert!(tail.contains("without any punctuation"));
  }

  #[test]
  fn detects_head_partial_when_first_paragraph_starts_lowercase() {
    let (head, _) = detect_partial_paragraphs(
      "continuation of the prior page's sentence finishing here.\n\nA new paragraph begins.",
    );
    let head = head.expect("leading partial should be detected");
    assert!(head.starts_with("continuation"));
  }

  #[test]
  fn ignores_short_trailing_heading() {
    let (_, tail) =
      detect_partial_paragraphs("Some body text ends here.\n\nSummary");
    assert!(tail.is_none(), "short final fragment is treated as heading");
  }

  #[test]
  fn justify_pdf_page_reports_line_counts() {
    let justified = justify_pdf_page(
      "first body paragraph stays on page.\n\ntext that continues forward without a period at the end",
      30,
    );
    assert!(justified.tail_partial.is_some());
    let tail_rows = justified.tail_partial.unwrap().line_count;
    // The tail occupies at least one line and never more than the page.
    assert!(tail_rows >= 1);
    assert!(tail_rows <= justified.lines.len());
  }

  #[test]
  fn justify_pdf_seam_merges_into_one_paragraph() {
    let merged = justify_pdf_seam(
      "the quick brown fox jumps over",
      "the lazy dog and goes home.",
      80,
    );
    let joined = merged.join(" ");
    assert!(
      joined.contains("over the lazy dog"),
      "seam should join into one paragraph: {merged:?}"
    );
  }

  #[test]
  fn justify_pdf_page_strips_leading_and_trailing_blanks() {
    // The raw page below opens with a blank line and ends with `\n`, the
    // two shapes that produce edge blanks in per-page output (see the
    // note in `justify_pdf_page`). Neither blank should survive into the
    // page's lines — they are concatenation artifacts only.
    let justified =
      justify_pdf_page("\n• First bullet on this page.\n• Second bullet.\n", 80);
    assert!(
      matches!(justified.lines.first(), Some(row) if !row.is_empty()),
      "leading blank should be stripped, got: {:?}",
      justified.lines
    );
    assert!(
      matches!(justified.lines.last(), Some(row) if !row.is_empty()),
      "trailing blank should be stripped, got: {:?}",
      justified.lines
    );
  }

  #[test]
  fn inter_page_blank_count_drops_blanks_between_sibling_bullets() {
    // Each page carries one bullet of the same logical list, so the
    // boundary should read as one continuous block (0 blanks).
    let this = page(&["• Chapter 7, Transparency."]);
    let next = page(&["• Chapter 8, Interactive Features."]);
    assert_eq!(inter_page_blank_count(&this, &next), 0);
  }

  #[test]
  fn inter_page_blank_count_drops_blanks_between_sibling_bullets_with_continuation() {
    // The previous page ends on the wrapped continuation of a bullet
    // rather than on the bullet header; the sibling relationship must
    // still be found by walking back through the continuation lines.
    let this = page(&[
      "• Chapter 7, Transparency, discusses the operation",
      "  of the transparent imaging model.",
    ]);
    let next = page(&["• Chapter 8, Interactive Features."]);
    assert_eq!(inter_page_blank_count(&this, &next), 0);
  }

  #[test]
  fn inter_page_blank_count_drops_blanks_between_captions() {
    let this = page(&["Plate 14 Radial shading effect (page 313)"]);
    let next = page(&["Plate 15 Coons patch mesh (page 321)"]);
    assert_eq!(inter_page_blank_count(&this, &next), 0);
  }

  #[test]
  fn inter_page_blank_count_drops_blanks_between_captions_via_wrap_tail() {
    // The previous page ends on the wrap tail of a caption, not on the
    // caption header; the header (`Plate 17 …`) is found by walking back.
    let this = page(&[
      "Plate 17 Isolated and knockout groups (Sections 7.3.4, page",
      "539 and 7.3.5, page 540)",
    ]);
    let next = page(&["Plate 18 RGB blend modes (page 520)"]);
    assert_eq!(inter_page_blank_count(&this, &next), 0);
  }

  #[test]
  fn inter_page_blank_count_keeps_one_blank_between_unrelated_paragraphs() {
    let this = page(&["End of one prose paragraph on the prior page."]);
    let next = page(&["Start of a new prose paragraph on the next page."]);
    assert_eq!(inter_page_blank_count(&this, &next), 1);
  }

  #[test]
  fn inter_page_blank_count_keeps_one_blank_when_list_ends_and_prose_starts() {
    let this = page(&["• Final list item on prior page."]);
    let next = page(&["A fresh prose paragraph on the next page."]);
    assert_eq!(inter_page_blank_count(&this, &next), 1);
  }

  #[test]
  fn inter_page_blank_count_drops_blanks_between_git_graph_rows() {
    let this = page(&[
      "  * 2d3acf9 Ignore errors from SIGCHLD on trap",
      "  * | 30e367c Timeout code and tests",
    ]);
    let next = page(&["  * | 5a09431 Add timeout protection to grit"]);
    assert_eq!(inter_page_blank_count(&this, &next), 0);
  }
}