Skip to main content

spdf_core/
lib.rs

1//! The spdf orchestrator. Equivalent to the `LiteParse` class in
2//! `liteparse/src/core/parser.ts`.
3
4#![warn(clippy::all)]
5
6use std::path::PathBuf;
7use std::sync::Arc;
8
9use rayon::prelude::*;
10use spdf_convert::{ConversionResult, convert_path_to_pdf};
11use spdf_ocr::{HttpOcrEngine, OcrEngine, OcrOptions, OcrResult};
12use spdf_output::{format_text, to_json};
13use spdf_pdf::{ExtractOptions, PageData, PdfDocumentHandle, PdfEngine, PdfiumEngine};
14use spdf_processing::bbox::build_bounding_boxes;
15use spdf_processing::text_utils::clean_ocr_table_artifacts;
16use spdf_projection::{PageInput, project_pages_to_grid};
17use spdf_types::{
18    Language, ParseConfig, ParseInput, ParseResult, ParsedPage, ScreenshotResult, SpdfError,
19    SpdfResult, TextItem,
20};
21use tracing::{debug, info, warn};
22
23pub use spdf_types::OutputFormat;
24
/// High-level document parser.
///
/// Bundles the parse configuration with the PDF engine and an optional OCR
/// engine. Create one with [`SpdfParser::new`] or through
/// [`SpdfParser::builder`].
pub struct SpdfParser {
    /// Settings consulted by every operation (DPI, page limits, OCR, ...).
    config: ParseConfig,
    /// Shared engine used to load documents and to extract and render pages.
    pdf_engine: Arc<PdfiumEngine>,
    /// OCR backend; `None` when OCR is disabled or no engine could be built.
    ocr_engine: Option<Arc<dyn OcrEngine>>,
}
31
32impl SpdfParser {
33    /// Build a parser with explicit config. Use [`Self::builder`] for the
34    /// defaults-plus-overrides pattern that mirrors `new LiteParse({ ... })`.
35    pub fn new(config: ParseConfig) -> Self {
36        let ocr_engine = build_ocr_engine(&config);
37        Self {
38            config,
39            pdf_engine: Arc::new(PdfiumEngine::new()),
40            ocr_engine,
41        }
42    }
43
44    /// Inject a custom OCR engine (e.g. for tests or a Tesseract build).
45    pub fn with_ocr_engine(mut self, engine: Arc<dyn OcrEngine>) -> Self {
46        self.ocr_engine = Some(engine);
47        self
48    }
49
50    /// Start from the shared defaults (equivalent to `DEFAULT_CONFIG`).
51    pub fn builder() -> ParseConfigBuilder {
52        ParseConfigBuilder::default()
53    }
54
55    pub fn config(&self) -> &ParseConfig {
56        &self.config
57    }
58
59    /// Parse a document to the caller-selected output.
60    pub fn parse(&self, input: impl Into<ParseInput>) -> SpdfResult<ParseResult> {
61        self.parse_inner(input.into())
62    }
63
64    fn parse_inner(&self, input: ParseInput) -> SpdfResult<ParseResult> {
65        let deadline = self
66            .config
67            .timeout_secs
68            .map(|s| std::time::Instant::now() + std::time::Duration::from_secs(s));
69        let check_deadline = |stage: &str| -> SpdfResult<()> {
70            if let Some(d) = deadline {
71                if std::time::Instant::now() >= d {
72                    return Err(SpdfError::InvalidInput(format!(
73                        "spdf: timeout exceeded during {stage}"
74                    )));
75                }
76            }
77            Ok(())
78        };
79        // Reject oversized in-memory blobs before touching pdfium.
80        if let (ParseInput::Bytes(b), Some(cap)) = (&input, self.config.max_input_bytes) {
81            if b.len() as u64 > cap {
82                return Err(SpdfError::InvalidInput(format!(
83                    "spdf: input {} bytes exceeds max_input_bytes {cap}",
84                    b.len()
85                )));
86            }
87        }
88        let materialised = self.materialise(input)?;
89        let bytes = match materialised {
90            Materialised::Pdf { bytes, .. } => bytes,
91            Materialised::PlainText(content) => return Ok(plain_text_result(content)),
92        };
93        check_deadline("load")?;
94
95        let doc = self
96            .pdf_engine
97            .load_bytes(&bytes, self.config.password.as_deref())?;
98        let total_pages = doc.num_pages().min(self.config.max_pages);
99        info!(pages = total_pages, "spdf: parsing");
100
101        let page_numbers = select_pages(total_pages, self.config.target_pages.as_deref())?;
102        debug!(selected = page_numbers.len(), "spdf: page set selected");
103
104        let opts = ExtractOptions {
105            extract_images: self.config.ocr_enabled,
106        };
107
108        let pdf_engine = Arc::clone(&self.pdf_engine);
109        let mut page_datas: Vec<PageData> = page_numbers
110            .par_iter()
111            .map(|&page_num| pdf_engine.extract_page(&doc, page_num, opts))
112            .collect::<SpdfResult<Vec<_>>>()?;
113        check_deadline("extract")?;
114
115        // Phase 6: Selective OCR. Run on pages with sparse text or embedded
116        // images, then append non-overlapping OCR items to `text_items` so the
117        // downstream projection treats them uniformly.
118        if self.config.ocr_enabled {
119            if let Some(ocr) = self.ocr_engine.as_ref() {
120                self.run_ocr(&doc, &mut page_datas, ocr.as_ref())?;
121            } else {
122                warn_no_ocr_engine();
123            }
124        }
125        check_deadline("ocr")?;
126
127        let pages: Vec<PageInput> = page_datas
128            .into_iter()
129            .map(|p| PageInput {
130                page_num: p.page_num,
131                width: p.width,
132                height: p.height,
133                text_items: p.text_items,
134            })
135            .collect();
136
137        let mut processed: Vec<ParsedPage> = project_pages_to_grid(pages, &self.config);
138
139        if self.config.precise_bounding_box {
140            for page in processed.iter_mut() {
141                page.bounding_boxes = Some(build_bounding_boxes(&page.text_items));
142            }
143        }
144
145        let full_text = processed
146            .iter()
147            .map(|p| p.text.as_str())
148            .collect::<Vec<_>>()
149            .join("\n\n");
150
151        let mut result = ParseResult {
152            pages: processed,
153            text: full_text,
154            json: None,
155        };
156
157        if matches!(self.config.output_format, OutputFormat::Json) {
158            result.json = Some(to_json(&result));
159        }
160
161        self.pdf_engine.close(doc)?;
162        Ok(result)
163    }
164
165    /// Render each candidate page and append OCR text items that don't overlap
166    /// existing PDF text. Mirrors `runOCR`/`processPageOcr` in
167    /// `liteparse/src/core/parser.ts`.
168    ///
169    /// Pages are rendered and OCR'd on a rayon pool sized by
170    /// `config.num_workers`, matching liteparse's `Scheduler` concurrency.
171    /// The rendering step serialises internally on PDFium's global mutex, but
172    /// the heavy OCR step runs fully parallel because the Tesseract engine
173    /// uses a `thread_local!` cache to keep one warmed instance per worker.
174    fn run_ocr(
175        &self,
176        doc: &<PdfiumEngine as PdfEngine>::Doc,
177        pages: &mut [PageData],
178        ocr: &dyn OcrEngine,
179    ) -> SpdfResult<()> {
180        let languages: Vec<String> = match &self.config.ocr_language {
181            Language::Single(s) => vec![s.clone()],
182            Language::Multiple(v) => v.clone(),
183        };
184        let options = OcrOptions {
185            languages,
186            correct_rotation: true,
187            dpi: Some(self.config.dpi),
188        };
189        // PDF spec constant: 72 points per inch. OCR coordinates come back in
190        // image pixels at the render DPI.
191        let scale_factor = 72.0 / self.config.dpi as f64;
192
193        // Phase 1: figure out which pages actually need OCR, and render them.
194        // We collect `(page_idx, png_bytes)` so phase 2 can run OCR in
195        // parallel without borrowing `pages` mutably.
196        let mut todo: Vec<(usize, u32)> = Vec::new();
197        for (idx, page) in pages.iter().enumerate() {
198            let text_length: usize = page.text_items.iter().map(|t| t.str.len()).sum();
199            let needs_full_ocr = text_length < 100 || !page.images.is_empty();
200            if needs_full_ocr {
201                todo.push((idx, page.page_num));
202            }
203        }
204        if todo.is_empty() {
205            return Ok(());
206        }
207
208        // Phase 2: render + OCR in parallel. `pdf_engine.render_page_png` is
209        // `&self` and internally serialises on PDFium's global mutex; that's
210        // fine because OCR dominates wall-clock time by orders of magnitude.
211        let num_workers = self.config.num_workers.max(1);
212        let pool = rayon::ThreadPoolBuilder::new()
213            .num_threads(num_workers)
214            .thread_name(|i| format!("spdf-ocr-{i}"))
215            .build()
216            .map_err(|e| SpdfError::Ocr(format!("ocr thread pool: {e}")))?;
217
218        let engine = self.pdf_engine.clone();
219        let dpi = self.config.dpi;
220        let results: Vec<(usize, Vec<OcrResult>)> = pool.install(|| {
221            todo.par_iter()
222                .map(|&(idx, page_num)| {
223                    let image = match engine.render_page_png(doc, page_num, dpi) {
224                        Ok(b) => b,
225                        Err(e) => {
226                            warn!(page = page_num, error = %e, "spdf: render for OCR failed");
227                            return (idx, Vec::new());
228                        }
229                    };
230                    match ocr.recognize(&image, &options) {
231                        Ok(r) => (idx, r),
232                        Err(e) => {
233                            warn!(page = page_num, error = %e, "spdf: OCR failed");
234                            (idx, Vec::new())
235                        }
236                    }
237                })
238                .collect()
239        });
240
241        // Phase 3: merge OCR words into each page's text items, dropping
242        // low-confidence hits and any that overlap existing PDF text. The
243        // `> 0.3` confidence cut-off matches liteparse exactly.
244        for (idx, ocr_results) in results {
245            let page = &mut pages[idx];
246            // Snapshot the pre-OCR text items so adjacent OCR words don't
247            // shadow each other via the overlap filter — tight kerning
248            // routinely puts neighbouring word bboxes within the 2-point
249            // overlap tolerance, which would otherwise drop every other
250            // word. The overlap check exists to avoid re-emitting text we
251            // already got from the PDF text layer, not to dedupe OCR
252            // against itself.
253            let existing_len = page.text_items.len();
254            let mut appended = 0usize;
255            for r in ocr_results {
256                if r.confidence <= 0.3 {
257                    continue;
258                }
259                let [x1, y1, x2, y2] = r.bbox;
260                let px = x1 * scale_factor;
261                let py = y1 * scale_factor;
262                let pw = (x2 - x1) * scale_factor;
263                let ph = (y2 - y1) * scale_factor;
264                if pw <= 0.0 || ph <= 0.0 {
265                    continue;
266                }
267                if overlaps_existing_text(&page.text_items[..existing_len], px, py, pw, ph) {
268                    continue;
269                }
270                let cleaned = clean_ocr_table_artifacts(&r.text);
271                let cleaned = strip_ocr_pipe_artifacts(&cleaned);
272                if cleaned.is_empty() || is_ocr_punctuation_noise(&cleaned) {
273                    continue;
274                }
275                let mut item = TextItem::new(cleaned, px, py, pw, ph);
276                item.font_name = Some("OCR".into());
277                item.font_size = Some(ph);
278                item.confidence = Some((r.confidence * 1000.0).round() / 1000.0);
279                page.text_items.push(item);
280                appended += 1;
281            }
282            debug!(page = page.page_num, appended, "spdf: OCR merged");
283        }
284        Ok(())
285    }
286
287    /// Stream one parsed page at a time without materialising the full
288    /// document in memory. Yields `(page_index, ParsedPage)` pairs in the
289    /// same order as `parse`. Errors abort the iterator.
290    pub fn stream<I: Into<ParseInput>>(
291        &self,
292        input: I,
293    ) -> SpdfResult<Box<dyn Iterator<Item = SpdfResult<ParsedPage>> + '_>> {
294        let bytes = match self.materialise(input.into())? {
295            Materialised::Pdf { bytes, .. } => bytes,
296            Materialised::PlainText(content) => {
297                let page = plain_text_result(content).pages.remove(0);
298                return Ok(Box::new(std::iter::once(Ok(page))));
299            }
300        };
301        let doc = self
302            .pdf_engine
303            .load_bytes(&bytes, self.config.password.as_deref())?;
304        let total = doc.num_pages().min(self.config.max_pages);
305        let page_numbers = select_pages(total, self.config.target_pages.as_deref())?;
306        let opts = ExtractOptions {
307            extract_images: self.config.ocr_enabled,
308        };
309        let engine = Arc::clone(&self.pdf_engine);
310        let precise_bbox = self.config.precise_bounding_box;
311        let debug_on = self.config.debug.as_ref().is_some_and(|d| d.enabled);
312        let cfg = self.config.clone();
313        let iter = page_numbers.into_iter().map(move |page_num| {
314            let pd = engine.extract_page(&doc, page_num, opts)?;
315            let pages = spdf_projection::project_pages_to_grid(
316                vec![spdf_projection::PageInput {
317                    page_num: pd.page_num,
318                    width: pd.width,
319                    height: pd.height,
320                    text_items: pd.text_items,
321                }],
322                &cfg,
323            );
324            let mut page = pages.into_iter().next().unwrap();
325            if precise_bbox {
326                page.bounding_boxes = Some(spdf_processing::bbox::build_bounding_boxes(
327                    &page.text_items,
328                ));
329            }
330            if debug_on {
331                debug!(page = page.page_num, "spdf: streamed");
332            }
333            Ok(page)
334        });
335        Ok(Box::new(iter))
336    }
337
338    /// Render specific (or all) pages to PNG buffers.
339    pub fn screenshot(
340        &self,
341        input: impl Into<ParseInput>,
342        page_numbers: Option<Vec<u32>>,
343    ) -> SpdfResult<Vec<ScreenshotResult>> {
344        let (bytes, _temp) = match self.materialise(input.into())? {
345            Materialised::Pdf { bytes, tempdir } => (bytes, tempdir),
346            Materialised::PlainText(_) => {
347                return Err(SpdfError::UnsupportedFormat(
348                    "cannot screenshot plain-text input".into(),
349                ));
350            }
351        };
352        let doc = self
353            .pdf_engine
354            .load_bytes(&bytes, self.config.password.as_deref())?;
355        let total = doc.num_pages();
356        let targets = page_numbers.unwrap_or_else(|| (1..=total).collect());
357
358        let mut out = Vec::with_capacity(targets.len());
359        for page_num in targets {
360            let png = self
361                .pdf_engine
362                .render_page_png(&doc, page_num, self.config.dpi)?;
363            // Width/height decoded lazily by the caller; 0 signals "unknown".
364            out.push(ScreenshotResult {
365                page_num,
366                width: 0,
367                height: 0,
368                image_buffer: png,
369                image_path: None,
370            });
371        }
372        self.pdf_engine.close(doc)?;
373        Ok(out)
374    }
375
376    /// Convenience formatter respecting the configured output format.
377    pub fn format(&self, result: &ParseResult) -> String {
378        match self.config.output_format {
379            OutputFormat::Text => format_text(result),
380            OutputFormat::Json => {
381                let json = result.json.clone().unwrap_or_else(|| to_json(result));
382                serde_json::to_string_pretty(&json).unwrap_or_default()
383            }
384        }
385    }
386
387    /// Load bytes for the configured input.
388    fn materialise(&self, input: ParseInput) -> SpdfResult<Materialised> {
389        match input {
390            ParseInput::Bytes(b) => Ok(Materialised::Pdf {
391                bytes: b,
392                tempdir: None,
393            }),
394            ParseInput::Path(p) => {
395                match convert_path_to_pdf(&p, self.config.password.as_deref())? {
396                    ConversionResult::Pdf {
397                        pdf_path, _tempdir, ..
398                    } => Ok(Materialised::Pdf {
399                        bytes: std::fs::read(pdf_path)?,
400                        tempdir: _tempdir,
401                    }),
402                    ConversionResult::PlainText { content } => Ok(Materialised::PlainText(content)),
403                }
404            }
405        }
406    }
407}
408
/// Internal representation of the parse input once loaded.
enum Materialised {
    /// PDF data held fully in memory.
    Pdf {
        /// Raw PDF bytes handed to the PDF engine.
        bytes: Vec<u8>,
        /// Temp-directory handle produced by path conversion, if any; `None`
        /// for in-memory input.
        // Never read directly (hence the `dead_code` allowance); it only
        // travels alongside `bytes`.
        #[allow(dead_code)]
        tempdir: Option<tempfile::TempDir>,
    },
    /// Input that converted to plain text rather than to a PDF.
    PlainText(String),
}
418
419/// Build a `ParseResult` from plain-text input (markdown, txt, log, ...).
420/// Mirrors the short-circuit path in liteparse so callers get one parsed
421/// "page" with the file contents as-is.
422fn plain_text_result(content: String) -> ParseResult {
423    let page = ParsedPage {
424        page_num: 1,
425        width: 0.0,
426        height: 0.0,
427        text: content.clone(),
428        text_items: vec![TextItem::new(&content, 0.0, 0.0, 0.0, 0.0)],
429        bounding_boxes: None,
430    };
431    let mut result = ParseResult {
432        pages: vec![page],
433        text: content,
434        json: None,
435    };
436    result.json = Some(to_json(&result));
437    result
438}
439
440/// Select which page numbers to process. Mirrors liteparse's range-list parser.
441fn select_pages(total_pages: u32, target: Option<&str>) -> SpdfResult<Vec<u32>> {
442    let Some(spec) = target else {
443        return Ok((1..=total_pages).collect());
444    };
445    let mut out = Vec::new();
446    for chunk in spec.split(',').map(str::trim).filter(|s| !s.is_empty()) {
447        if let Some((lo, hi)) = chunk.split_once('-') {
448            let lo: u32 = lo
449                .trim()
450                .parse()
451                .map_err(|_| SpdfError::InvalidConfig(format!("bad range: {chunk}")))?;
452            let hi: u32 = hi
453                .trim()
454                .parse()
455                .map_err(|_| SpdfError::InvalidConfig(format!("bad range: {chunk}")))?;
456            for p in lo..=hi {
457                if p >= 1 && p <= total_pages {
458                    out.push(p);
459                }
460            }
461        } else {
462            let p: u32 = chunk
463                .parse()
464                .map_err(|_| SpdfError::InvalidConfig(format!("bad page: {chunk}")))?;
465            if p >= 1 && p <= total_pages {
466                out.push(p);
467            }
468        }
469    }
470    out.sort_unstable();
471    out.dedup();
472    Ok(out)
473}
474
475/// Fluent builder equivalent to TS `new LiteParse(partial)`.
476#[derive(Debug, Default)]
477pub struct ParseConfigBuilder {
478    config: ParseConfig,
479}
480
481impl ParseConfigBuilder {
482    pub fn ocr_enabled(mut self, on: bool) -> Self {
483        self.config.ocr_enabled = on;
484        self
485    }
486    pub fn ocr_server_url(mut self, url: impl Into<String>) -> Self {
487        self.config.ocr_server_url = Some(url.into());
488        self
489    }
490    pub fn dpi(mut self, dpi: u32) -> Self {
491        self.config.dpi = dpi;
492        self
493    }
494    pub fn output_format(mut self, fmt: OutputFormat) -> Self {
495        self.config.output_format = fmt;
496        self
497    }
498    pub fn max_pages(mut self, max: u32) -> Self {
499        self.config.max_pages = max;
500        self
501    }
502    pub fn target_pages(mut self, spec: impl Into<String>) -> Self {
503        self.config.target_pages = Some(spec.into());
504        self
505    }
506    pub fn num_workers(mut self, n: usize) -> Self {
507        self.config.num_workers = n;
508        self
509    }
510    pub fn password(mut self, pw: impl Into<String>) -> Self {
511        self.config.password = Some(pw.into());
512        self
513    }
514    pub fn precise_bounding_box(mut self, on: bool) -> Self {
515        self.config.precise_bounding_box = on;
516        self
517    }
518    /// Fail `parse()` if wall-clock work exceeds this many seconds.
519    pub fn timeout_secs(mut self, secs: u64) -> Self {
520        self.config.timeout_secs = Some(secs);
521        self
522    }
523    /// Reject `ParseInput::Bytes` payloads larger than this many bytes.
524    pub fn max_input_bytes(mut self, bytes: u64) -> Self {
525        self.config.max_input_bytes = Some(bytes);
526        self
527    }
528    pub fn config(self) -> ParseConfig {
529        self.config
530    }
531    pub fn build(self) -> SpdfParser {
532        SpdfParser::new(self.config)
533    }
534}
535
/// Compute where the screenshot of `page_num` would be stored inside
/// `output_dir`: a file named `page-<page_num>.png`.
///
/// Stub kept so callers can see the intended `PathBuf`-returning API for the
/// screenshot persistence that Phase 8 will finish; nothing is written here.
pub fn default_screenshot_path(output_dir: &std::path::Path, page_num: u32) -> PathBuf {
    let file_name = format!("page-{page_num}.png");
    let mut path = output_dir.to_path_buf();
    path.push(file_name);
    path
}
541
542/// Build the default OCR engine from config. HTTP if a URL is configured,
543/// Tesseract if the feature is enabled, otherwise `None`.
544fn build_ocr_engine(config: &ParseConfig) -> Option<Arc<dyn OcrEngine>> {
545    if !config.ocr_enabled {
546        return None;
547    }
548    if let Some(url) = config.ocr_server_url.as_deref() {
549        return Some(Arc::new(HttpOcrEngine::new(url)));
550    }
551    #[cfg(feature = "tesseract")]
552    {
553        return Some(Arc::new(spdf_ocr::TesseractEngine::new(
554            config.tessdata_path.clone(),
555        )));
556    }
557    #[cfg(not(feature = "tesseract"))]
558    {
559        let _ = config;
560        None
561    }
562}
563
564/// Emit a one-shot warning when OCR is requested but no engine is available,
565/// with concrete remediation steps.
566fn warn_no_ocr_engine() {
567    use std::sync::Once;
568    static ONCE: Once = Once::new();
569    ONCE.call_once(|| {
570        let tesseract_built = cfg!(feature = "tesseract");
571        let msg = if tesseract_built {
572            "spdf: OCR requested but no engine configured. This build supports \
573             Tesseract; install libtesseract + language data (e.g. \
574             `apt install tesseract-ocr tesseract-ocr-eng`) or pass \
575             --ocr-server-url to use an HTTP OCR server. Any rasterized text \
576             in the PDF will be missing from the output."
577        } else {
578            "spdf: OCR requested but no engine configured. Either pass \
579             --ocr-server-url <URL> to use an HTTP OCR server, or rebuild \
580             spdf with the `tesseract` feature (`cargo build --release \
581             -p spdf-cli --features tesseract`, requires libtesseract and \
582             libleptonica). Rasterized text will be missing from the output."
583        };
584        warn!("{msg}");
585    });
586}
587
588/// True when an OCR bbox overlaps any existing text item (with a 2-point
589/// tolerance), matching liteparse's `overlapsExistingText`.
590fn overlaps_existing_text(items: &[TextItem], x: f64, y: f64, w: f64, h: f64) -> bool {
591    const TOL: f64 = 2.0;
592    let right = x + w;
593    let bottom = y + h;
594    for it in items {
595        let iw = if it.width > 0.0 { it.width } else { it.w };
596        let ih = if it.height > 0.0 { it.height } else { it.h };
597        let ir = it.x + iw;
598        let ib = it.y + ih;
599        let overlap_x = x < ir + TOL && right > it.x - TOL;
600        let overlap_y = y < ib + TOL && bottom > it.y - TOL;
601        if overlap_x && overlap_y {
602            return true;
603        }
604    }
605    false
606}
607
/// Decide whether an OCR token is noise: blank, or made up solely of
/// non-alphanumeric characters.
///
/// Tesseract frequently hallucinates such tokens at the edges of rasterized
/// text (trailing `|`, stray `.`, orphan brackets, etc.). Real sentence
/// punctuation comes *attached* to a word, so any token containing at least
/// one alphanumeric character is kept.
fn is_ocr_punctuation_noise(text: &str) -> bool {
    text.trim().chars().all(|c| !c.is_alphanumeric())
}
620
/// Remove pipe characters from both ends of an OCR token, along with any
/// whitespace around them (e.g. `"words.|"` → `"words."`).
///
/// Tesseract hallucinates these from vertical strokes at the edges of
/// rasterized text. Only pipes are removed — other punctuation is legitimate,
/// and pipes inside the token are left alone.
fn strip_ocr_pipe_artifacts(text: &str) -> String {
    let outer = text.trim();
    let unpiped = outer.trim_start_matches('|').trim_end_matches('|');
    unpiped.trim().to_owned()
}
627
#[cfg(test)]
mod tests {
    use super::*;

    /// Without a spec, every page of the document is selected.
    #[test]
    fn select_pages_defaults_to_all() {
        let pages = select_pages(3, None).unwrap();
        assert_eq!(pages, [1, 2, 3]);
    }

    /// Ranges and single pages can be mixed in one spec.
    #[test]
    fn select_pages_parses_mixed_spec() {
        let pages = select_pages(20, Some("1-3,5,10-11")).unwrap();
        assert_eq!(pages, [1, 2, 3, 5, 10, 11]);
    }

    /// A non-numeric range bound is reported as an invalid configuration.
    #[test]
    fn select_pages_rejects_bad_spec() {
        let SpdfError::InvalidConfig(msg) = select_pages(10, Some("1-abc")).unwrap_err() else {
            panic!("expected InvalidConfig");
        };
        assert!(msg.contains("bad range"));
    }

    /// Identical, distant and nested boxes against one existing item.
    #[test]
    fn overlap_detects_collision_with_existing_text() {
        let items = vec![TextItem::new("hi", 10.0, 20.0, 40.0, 12.0)];
        let hits = |x: f64, y: f64, w: f64, h: f64| overlaps_existing_text(&items, x, y, w, h);
        // Same bbox -> overlaps
        assert!(hits(10.0, 20.0, 40.0, 12.0));
        // Far away -> no overlap
        assert!(!hits(200.0, 200.0, 40.0, 12.0));
        // Within tolerance -> overlaps
        assert!(hits(11.0, 21.0, 1.0, 1.0));
    }
}