text_extract_search/
text_extract_search.rs

1// PDFium-rs -- Modern Rust interface to PDFium, the PDF library from Google
2//
3// Copyright (c) 2025 Martin van der Werff <github (at) newinnovations.nl>
4//
5// This file is part of PDFium-rs.
6//
7// PDFium-rs is free software: you can redistribute it and/or modify it under the terms of
8// the GNU General Public License as published by the Free Software Foundation, either version 3
9// of the License, or (at your option) any later version.
10//
11// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
12// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
13// FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
14// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
15// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
16// BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
17// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
18// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
20use pdfium::*;
21
22/// Extracts and prints text from all pages of a PDF document
23pub fn example_extract_text() -> PdfiumResult<()> {
24    // Load the PDF document from the specified file path
25    // The second parameter (None) indicates no password is required
26    let document = PdfiumDocument::new_from_path("resources/chapter1.pdf", None)?;
27
28    // Iterate through all pages in the document
29    // enumerate() provides both the index and the page object
30    for (index, page) in document.pages().enumerate() {
31        // Extract the full text content from the current page
32        // The ?. operators handle potential errors at each step:
33        // - page? ensures the page loaded successfully
34        // - .text()? extracts text objects from the page
35        // - .full() gets the complete text content as a string
36        let text = page?.text()?.full();
37
38        // Print formatted output for each page
39        println!("Page {}", index + 1); // Pages are 1-indexed for user display
40        println!("------");
41        println!("{text}");
42        println!() // Empty line for separation between pages
43    }
44
45    // Expected output:
46    //
47    // Page 1
48    // ------
49    //
50    // Page 2
51    // ------
52    // Ruskin
53    // House.
54    // 156. Charing
55    // Cross Road.
56    // London
57    // George Allen.
58    //
59    // Page 3
60    // ------
61    //
62    // Page 4
63    // ------
64    // I
65    // Chapter I.
66    // T is a truth universally acknowledged, that a single man in possession of a good
67    // fortune must be in want of a wife.
68    // However little known the feelings or views of such a man may be on his first
69    // entering a neighbourhood, this truth is so well fixed in the minds of the surrounding
70    // families, that he is considered as the rightful property of some one or other of their
71    // daughters.
72    // “My dear Mr. Bennet,” said his lady to him one day, “have you heard that
73    // Netherfield Park is let at last?”
74    // ...
75
76    Ok(())
77}
78
79/// Demonstrates text search functionality within a PDF document
80pub fn example_search() -> PdfiumResult<()> {
81    // Load the PDF document to search within
82    let document = PdfiumDocument::new_from_path("resources/groningen.pdf", None)?;
83
84    // Get the first page (index 0) for searching
85    let page = document.page(0)?;
86
87    // Extract text objects from the page for searching
88    let text = page.text()?;
89
90    // Search for "amsterdam" with case-insensitive matching
91    // PdfiumSearchFlags::empty() means no special search flags (case-insensitive by default)
92    // The last parameter (0) is the starting position for the search
93    let search = text.find("amsterdam", PdfiumSearchFlags::empty(), 0);
94    println!("Found amsterdam {} times", search.count());
95
96    // Search for "groningen" with case-insensitive matching
97    let search = text.find("groningen", PdfiumSearchFlags::empty(), 0);
98    println!(
99        "Found groningen {} times (case insensitive)",
100        search.count()
101    );
102
103    // Search for "Groningen" with case-sensitive matching
104    // MATCH_CASE flag enforces exact case matching
105    let search = text.find("Groningen", PdfiumSearchFlags::MATCH_CASE, 0);
106    println!("Found Groningen {} times (case sensitive)", search.count());
107
108    // Perform another case-insensitive search to iterate through results
109    let search = text.find("groningen", PdfiumSearchFlags::empty(), 0);
110
111    // Iterate through each search result to extract detailed information
112    for result in search {
113        // Extract the text fragment at the found position
114        // result.index() gives the character position where the match starts
115        // result.count() gives the length of the matched text
116        let fragment = text.extract(result.index(), result.count());
117        println!(
118            "Found groningen (case insensitive) at {}, fragment = '{fragment}'",
119            result.index()
120        );
121    }
122
123    // Expected output:
124    //
125    // Found amsterdam 0 times
126    // Found groningen 5 times (case insensitive)
127    // Found Groningen 5 times (case sensitive)
128    // Found groningen (case insensitive) at 14, fragment = 'Groningen'
129    // Found groningen (case insensitive) at 232, fragment = 'Groningen'
130    // Found groningen (case insensitive) at 475, fragment = 'Groningen'
131    // Found groningen (case insensitive) at 920, fragment = 'Groningen'
132    // Found groningen (case insensitive) at 1050, fragment = 'Groningen'
133
134    Ok(())
135}
136
137/// Main function that demonstrates both text extraction and search functionality
138fn main() -> PdfiumResult<()> {
139    example_extract_text()?;
140    example_search()?;
141    Ok(())
142}