1use super::engine::justify_pdf_hybrid;
2use super::structure::{
3 looks_like_git_log_graph_line, looks_like_table_or_figure_caption,
4 parse_list_marker,
5};
6use crate::text_utils::char_len;
7
/// Upper bound on how many trailing lines of the previous page are scanned
/// backwards when deciding whether the next page continues the same list or
/// caption run (see `prior_is_sibling_list_item` and `prior_is_caption`).
const MAX_SEAM_LOOKBACK_LINES: usize = 12;
12
/// Result of justifying a single PDF page with [`justify_pdf_page`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfPageJustified {
    /// Justified output lines for the page, with leading and trailing blank
    /// lines removed.
    pub lines: Vec<String>,
    /// First paragraph of the page when it looks like the continuation of a
    /// paragraph started earlier (it begins with a lowercase letter or with
    /// one of the conjunctions "and", "but", "or", "so").
    pub head_partial: Option<PartialParagraph>,
    /// Last paragraph of the page when the page holds more than one paragraph
    /// and the last one looks unfinished (no closing punctuation and more
    /// than four words).
    pub tail_partial: Option<PartialParagraph>,
}

/// A paragraph at the edge of a page that may continue across the page break.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PartialParagraph {
    /// Raw (unjustified) text of the paragraph as found on the page.
    pub raw_text: String,
    /// Number of output lines the paragraph occupies when it is justified on
    /// its own at the page's column width.
    pub line_count: usize,
}
29
30pub fn justify_pdf_page(raw_text: &str, col: usize) -> PdfPageJustified {
31 let mut lines = justify_pdf_hybrid(raw_text, col);
42 trim_edge_blanks(&mut lines);
43 let (head_raw, tail_raw) = detect_partial_paragraphs(raw_text);
44
45 let head_partial = head_raw.map(|raw| {
46 let mut head_lines = justify_pdf_hybrid(&raw, col);
47 trim_edge_blanks(&mut head_lines);
48 PartialParagraph { raw_text: raw, line_count: head_lines.len() }
49 });
50 let tail_partial = tail_raw.map(|raw| {
51 let mut tail_lines = justify_pdf_hybrid(&raw, col);
52 trim_edge_blanks(&mut tail_lines);
53 PartialParagraph { raw_text: raw, line_count: tail_lines.len() }
54 });
55
56 PdfPageJustified { lines, head_partial, tail_partial }
57}
58
/// Removes every empty string from the start and the end of `lines`.
///
/// Empty strings between non-empty lines are kept. A vector holding only
/// empty strings ends up empty.
fn trim_edge_blanks(lines: &mut Vec<String>) {
    // Cut the trailing blanks first: keep everything up to and including the
    // last non-empty line (or nothing when there is none).
    let kept_len = lines
        .iter()
        .rposition(|line| !line.is_empty())
        .map_or(0, |last| last + 1);
    lines.truncate(kept_len);

    // Drop the leading blanks with a single shift instead of repeated
    // `remove(0)` calls, each of which would move the whole tail.
    let first_kept = lines
        .iter()
        .position(|line| !line.is_empty())
        .unwrap_or(lines.len());
    lines.drain(..first_kept);
}
67
68pub fn justify_pdf_seam(
73 prev_tail_raw: &str,
74 next_head_raw: &str,
75 col: usize,
76) -> Vec<String> {
77 let prev = prev_tail_raw.trim_end_matches(['\n', ' ', '\t']);
78 let next = next_head_raw.trim_start_matches(['\n', ' ', '\t']);
79 let mut lines = if prev.is_empty() {
80 justify_pdf_hybrid(next, col)
81 } else if next.is_empty() {
82 justify_pdf_hybrid(prev, col)
83 } else {
84 let joined = format!("{prev}\n{next}");
85 justify_pdf_hybrid(&joined, col)
86 };
87 trim_edge_blanks(&mut lines);
88 lines
89}
90
91pub fn inter_page_blank_count(
107 this_lines: &[String],
108 next_lines: &[String],
109) -> usize {
110 let Some(first_next) = next_lines.iter().find(|l| !l.is_empty()) else {
111 return 1;
112 };
113
114 if let Some((next_indent, next_marker, _)) = parse_list_marker(first_next)
115 && prior_is_sibling_list_item(this_lines, &next_indent, &next_marker)
116 {
117 return 0;
118 }
119 if looks_like_table_or_figure_caption(first_next.trim())
120 && prior_is_caption(this_lines)
121 {
122 return 0;
123 }
124 if looks_like_git_log_graph_line(first_next.trim())
125 && this_lines
126 .iter()
127 .rev()
128 .find(|l| !l.is_empty())
129 .is_some_and(|l| looks_like_git_log_graph_line(l.trim()))
130 {
131 return 0;
132 }
133
134 1
135}
136
137fn prior_is_sibling_list_item(
138 this_lines: &[String],
139 indent: &str,
140 marker: &str,
141) -> bool {
142 let continuation_indent_width = char_len(indent) + char_len(marker);
143 let scan_floor = this_lines.len().saturating_sub(MAX_SEAM_LOOKBACK_LINES);
144 for idx in (scan_floor..this_lines.len()).rev() {
145 let line = &this_lines[idx];
146 if line.is_empty() {
147 return false;
148 }
149 if line_starts_sibling_list_item(line, indent, marker) {
150 return true;
151 }
152 let leading_ws = line.chars().take_while(|ch| *ch == ' ').count();
153 if leading_ws < continuation_indent_width {
154 return false;
155 }
156 }
157 false
158}
159
160fn prior_is_caption(this_lines: &[String]) -> bool {
161 let scan_floor = this_lines.len().saturating_sub(MAX_SEAM_LOOKBACK_LINES);
162 for idx in (scan_floor..this_lines.len()).rev() {
163 let line = &this_lines[idx];
164 if line.is_empty() {
165 return false;
166 }
167 if looks_like_table_or_figure_caption(line.trim()) {
168 return true;
169 }
170 }
171 false
172}
173
/// Reports whether `line` opens a list item at the same level as an item
/// introduced by `indent` followed by `marker`.
///
/// A line qualifies when it starts with exactly `indent` + `marker`, or,
/// if the marker (ignoring trailing whitespace) ends in `.` or `)`,
/// when `indent` is followed by one or more ASCII digits, that same
/// punctuation character, and a space (e.g. `2. ` after a `1. ` marker).
fn line_starts_sibling_list_item(
    line: &str,
    indent: &str,
    marker: &str,
) -> bool {
    let Some(body) = line.strip_prefix(indent) else {
        return false;
    };
    if body.starts_with(marker) {
        return true;
    }

    // Only numbered markers ("1." / "1)") can match with a different number.
    let punct = match marker.trim_end().chars().next_back() {
        Some(ch @ ('.' | ')')) => ch,
        _ => return false,
    };

    let after_number = body.trim_start_matches(|ch: char| ch.is_ascii_digit());
    if after_number.len() == body.len() {
        // No leading digits at all.
        return false;
    }
    after_number
        .strip_prefix(punct)
        .is_some_and(|tail| tail.starts_with(' '))
}
204
205fn detect_partial_paragraphs(
206 raw_text: &str,
207) -> (Option<String>, Option<String>) {
208 let paragraphs = split_paragraphs(raw_text);
209 if paragraphs.is_empty() {
210 return (None, None);
211 }
212
213 let head =
214 if looks_like_continuation(paragraphs.first().copied().unwrap_or("")) {
215 Some(paragraphs.first().copied().unwrap_or("").to_string())
216 } else {
217 None
218 };
219 let tail = if paragraphs.len() == 1 {
220 None
221 } else if looks_incomplete(paragraphs.last().copied().unwrap_or("")) {
222 Some(paragraphs.last().copied().unwrap_or("").to_string())
223 } else {
224 None
225 };
226
227 (head, tail)
228}
229
/// Splits `text` into paragraphs separated by one or more blank lines.
///
/// A line counts as blank when it is empty after trimming whitespace. Each
/// returned slice borrows from `text`, starts at the beginning of the
/// paragraph's first line (leading indentation is kept), and has trailing
/// whitespace, including the final line break, removed. Text with no
/// non-blank line yields an empty vector.
fn split_paragraphs(text: &str) -> Vec<&str> {
    let mut paragraphs = Vec::new();
    // Byte offset where the paragraph currently being collected begins.
    let mut start: Option<usize> = None;
    // Byte offset of the line about to be examined.
    let mut offset = 0usize;

    for line in text.split_inclusive('\n') {
        if line.trim().is_empty() {
            // A blank line closes the open paragraph, if there is one.
            if let Some(begin) = start.take() {
                paragraphs.push(text[begin..offset].trim_end());
            }
        } else if start.is_none() {
            start = Some(offset);
        }
        offset += line.len();
    }

    // The text may end without a blank line after the last paragraph.
    if let Some(begin) = start {
        paragraphs.push(text[begin..].trim_end());
    }
    paragraphs
}
257
/// Heuristic: does `paragraph` read like the continuation of earlier text?
///
/// True when, after skipping leading whitespace, the paragraph starts with a
/// lowercase character, or when its first word, with trailing
/// non-alphabetic characters removed, is "and", "but", "or" or "so" in any
/// ASCII letter case. Empty or whitespace-only input is never a
/// continuation.
fn looks_like_continuation(paragraph: &str) -> bool {
    const LEADING_CONJUNCTIONS: [&str; 4] = ["and", "but", "or", "so"];

    let body = paragraph.trim_start();
    match body.chars().next() {
        None => false,
        Some(first) if first.is_lowercase() => true,
        Some(_) => {
            // Strip punctuation glued to the word, e.g. "And," -> "And".
            let opener = body
                .split_whitespace()
                .next()
                .unwrap_or("")
                .trim_end_matches(|ch: char| !ch.is_alphabetic());
            LEADING_CONJUNCTIONS
                .iter()
                .any(|word| opener.eq_ignore_ascii_case(word))
        }
    }
}
281
/// Heuristic: does `paragraph` look like it was cut off before its end?
///
/// Trailing whitespace is ignored. A paragraph is considered incomplete
/// when its last character is not one of the closing characters
/// `. ! ? : ; ] ) } "` and it contains more than four whitespace-separated
/// words (shorter fragments are not treated as cut-off prose). Empty input
/// is never incomplete.
fn looks_incomplete(paragraph: &str) -> bool {
    const TERMINATORS: [char; 9] = ['.', '!', '?', ':', ';', ']', ')', '}', '"'];

    let body = paragraph.trim_end();
    let Some(last) = body.chars().next_back() else {
        return false;
    };
    !TERMINATORS.contains(&last) && body.split_whitespace().count() > 4
}
302
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds owned page lines from string literals.
    fn page(lines: &[&str]) -> Vec<String> {
        lines.iter().map(|line| (*line).to_owned()).collect()
    }

    // --- partial paragraph detection -------------------------------------

    #[test]
    fn detects_tail_partial_when_paragraph_lacks_terminator() {
        let text = "First paragraph ends cleanly.\n\nThis longer paragraph carries over without any punctuation at the end";
        let (head, tail) = detect_partial_paragraphs(text);
        assert!(
            head.is_none(),
            "first paragraph starts uppercase, not a continuation"
        );
        let tail = tail.expect("trailing partial should be detected");
        assert!(tail.contains("without any punctuation"));
    }

    #[test]
    fn detects_head_partial_when_first_paragraph_starts_lowercase() {
        let text = "continuation of the prior page's sentence finishing here.\n\nA new paragraph begins.";
        let (head, _) = detect_partial_paragraphs(text);
        let head = head.expect("leading partial should be detected");
        assert!(head.starts_with("continuation"));
    }

    #[test]
    fn ignores_short_trailing_heading() {
        let text = "Some body text ends here.\n\nSummary";
        let (_, tail) = detect_partial_paragraphs(text);
        assert!(tail.is_none(), "short final fragment is treated as heading");
    }

    // --- page and seam justification -------------------------------------

    #[test]
    fn justify_pdf_page_reports_line_counts() {
        let raw = "first body paragraph stays on page.\n\ntext that continues forward without a period at the end";
        let p = justify_pdf_page(raw, 30);
        assert!(p.tail_partial.is_some());
        let tail = p.tail_partial.unwrap();
        assert!(tail.line_count >= 1);
        assert!(tail.line_count <= p.lines.len());
    }

    #[test]
    fn justify_pdf_seam_merges_into_one_paragraph() {
        let merged = justify_pdf_seam(
            "the quick brown fox jumps over",
            "the lazy dog and goes home.",
            80,
        );
        let joined = merged.join(" ");
        assert!(
            joined.contains("over the lazy dog"),
            "seam should join into one paragraph: {merged:?}"
        );
    }

    #[test]
    fn justify_pdf_page_strips_leading_and_trailing_blanks() {
        let raw = "\n• First bullet on this page.\n• Second bullet.\n";
        let p = justify_pdf_page(raw, 80);
        assert!(
            p.lines.first().is_some_and(|l| !l.is_empty()),
            "leading blank should be stripped, got: {:?}",
            p.lines
        );
        assert!(
            p.lines.last().is_some_and(|l| !l.is_empty()),
            "trailing blank should be stripped, got: {:?}",
            p.lines
        );
    }

    // --- blank lines between pages ---------------------------------------

    #[test]
    fn inter_page_blank_count_drops_blanks_between_sibling_bullets() {
        let this = page(&["• Chapter 7, Transparency."]);
        let next = page(&["• Chapter 8, Interactive Features."]);
        assert_eq!(inter_page_blank_count(&this, &next), 0);
    }

    #[test]
    fn inter_page_blank_count_drops_blanks_between_sibling_bullets_with_continuation()
    {
        let this = page(&[
            "• Chapter 7, Transparency, discusses the operation",
            " of the transparent imaging model.",
        ]);
        let next = page(&["• Chapter 8, Interactive Features."]);
        assert_eq!(inter_page_blank_count(&this, &next), 0);
    }

    #[test]
    fn inter_page_blank_count_drops_blanks_between_captions() {
        let this = page(&["Plate 14 Radial shading effect (page 313)"]);
        let next = page(&["Plate 15 Coons patch mesh (page 321)"]);
        assert_eq!(inter_page_blank_count(&this, &next), 0);
    }

    #[test]
    fn inter_page_blank_count_drops_blanks_between_captions_via_wrap_tail() {
        let this = page(&[
            "Plate 17 Isolated and knockout groups (Sections 7.3.4, page",
            "539 and 7.3.5, page 540)",
        ]);
        let next = page(&["Plate 18 RGB blend modes (page 520)"]);
        assert_eq!(inter_page_blank_count(&this, &next), 0);
    }

    #[test]
    fn inter_page_blank_count_keeps_one_blank_between_unrelated_paragraphs() {
        let this = page(&["End of one prose paragraph on the prior page."]);
        let next = page(&["Start of a new prose paragraph on the next page."]);
        assert_eq!(inter_page_blank_count(&this, &next), 1);
    }

    #[test]
    fn inter_page_blank_count_keeps_one_blank_when_list_ends_and_prose_starts() {
        let this = page(&["• Final list item on prior page."]);
        let next = page(&["A fresh prose paragraph on the next page."]);
        assert_eq!(inter_page_blank_count(&this, &next), 1);
    }

    #[test]
    fn inter_page_blank_count_drops_blanks_between_git_graph_rows() {
        let this = page(&[
            " * 2d3acf9 Ignore errors from SIGCHLD on trap",
            " * | 30e367c Timeout code and tests",
        ]);
        let next = page(&[" * | 5a09431 Add timeout protection to grit"]);
        assert_eq!(inter_page_blank_count(&this, &next), 0);
    }
}