text_extract_search/text_extract_search.rs
1// PDFium-rs -- Modern Rust interface to PDFium, the PDF library from Google
2//
3// Copyright (c) 2025 Martin van der Werff <github (at) newinnovations.nl>
4//
5// This file is part of PDFium-rs.
6//
7// PDFium-rs is free software: you can redistribute it and/or modify it under the terms of
8// the GNU General Public License as published by the Free Software Foundation, either version 3
9// of the License, or (at your option) any later version.
10//
11// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
12// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
13// FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
14// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
15// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
16// BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
17// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
18// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
20use pdfium::*;
21
22/// Extracts and prints text from all pages of a PDF document
23pub fn example_extract_text() -> PdfiumResult<()> {
24 // Load the PDF document from the specified file path
25 // The second parameter (None) indicates no password is required
26 let document = PdfiumDocument::new_from_path("resources/chapter1.pdf", None)?;
27
28 // Iterate through all pages in the document
29 // enumerate() provides both the index and the page object
30 for (index, page) in document.pages().enumerate() {
31 // Extract the full text content from the current page
32 // The ?. operators handle potential errors at each step:
33 // - page? ensures the page loaded successfully
34 // - .text()? extracts text objects from the page
35 // - .full() gets the complete text content as a string
36 let text = page?.text()?.full();
37
38 // Print formatted output for each page
39 println!("Page {}", index + 1); // Pages are 1-indexed for user display
40 println!("------");
41 println!("{text}");
42 println!() // Empty line for separation between pages
43 }
44
45 // Expected output:
46 //
47 // Page 1
48 // ------
49 //
50 // Page 2
51 // ------
52 // Ruskin
53 // House.
54 // 156. Charing
55 // Cross Road.
56 // London
57 // George Allen.
58 //
59 // Page 3
60 // ------
61 //
62 // Page 4
63 // ------
64 // I
65 // Chapter I.
66 // T is a truth universally acknowledged, that a single man in possession of a good
67 // fortune must be in want of a wife.
68 // However little known the feelings or views of such a man may be on his first
69 // entering a neighbourhood, this truth is so well fixed in the minds of the surrounding
70 // families, that he is considered as the rightful property of some one or other of their
71 // daughters.
72 // “My dear Mr. Bennet,” said his lady to him one day, “have you heard that
73 // Netherfield Park is let at last?”
74 // ...
75
76 Ok(())
77}
78
79/// Demonstrates text search functionality within a PDF document
80pub fn example_search() -> PdfiumResult<()> {
81 // Load the PDF document to search within
82 let document = PdfiumDocument::new_from_path("resources/groningen.pdf", None)?;
83
84 // Get the first page (index 0) for searching
85 let page = document.page(0)?;
86
87 // Extract text objects from the page for searching
88 let text = page.text()?;
89
90 // Search for "amsterdam" with case-insensitive matching
91 // PdfiumSearchFlags::empty() means no special search flags (case-insensitive by default)
92 // The last parameter (0) is the starting position for the search
93 let search = text.find("amsterdam", PdfiumSearchFlags::empty(), 0);
94 println!("Found amsterdam {} times", search.count());
95
96 // Search for "groningen" with case-insensitive matching
97 let search = text.find("groningen", PdfiumSearchFlags::empty(), 0);
98 println!(
99 "Found groningen {} times (case insensitive)",
100 search.count()
101 );
102
103 // Search for "Groningen" with case-sensitive matching
104 // MATCH_CASE flag enforces exact case matching
105 let search = text.find("Groningen", PdfiumSearchFlags::MATCH_CASE, 0);
106 println!("Found Groningen {} times (case sensitive)", search.count());
107
108 // Perform another case-insensitive search to iterate through results
109 let search = text.find("groningen", PdfiumSearchFlags::empty(), 0);
110
111 // Iterate through each search result to extract detailed information
112 for result in search {
113 // Extract the text fragment at the found position
114 // result.index() gives the character position where the match starts
115 // result.count() gives the length of the matched text
116 let fragment = text.extract(result.index(), result.count());
117 println!(
118 "Found groningen (case insensitive) at {}, fragment = '{fragment}'",
119 result.index()
120 );
121 }
122
123 // Expected output:
124 //
125 // Found amsterdam 0 times
126 // Found groningen 5 times (case insensitive)
127 // Found Groningen 5 times (case sensitive)
128 // Found groningen (case insensitive) at 14, fragment = 'Groningen'
129 // Found groningen (case insensitive) at 232, fragment = 'Groningen'
130 // Found groningen (case insensitive) at 475, fragment = 'Groningen'
131 // Found groningen (case insensitive) at 920, fragment = 'Groningen'
132 // Found groningen (case insensitive) at 1050, fragment = 'Groningen'
133
134 Ok(())
135}
136
137/// Main function that demonstrates both text extraction and search functionality
138fn main() -> PdfiumResult<()> {
139 example_extract_text()?;
140 example_search()?;
141 Ok(())
142}