tabula/lib.rs
1//!
2//! # Rust bindings for tabulapdf/tabula-java
3//!
4//! ## Prerequisites
5//! In order to use tabula-rs, you will need a tabula-java bytecode archive (jar).
//! You can build it yourself by cloning <ssh://git@github.com/tabulapdf/tabula-java.git> and then invoking [maven](https://software.opensuse.org/package/maven) to build it.
7//! ```sh
8//! git clone git@github.com:tabulapdf/tabula-java.git && cd tabula-java
9//! git apply path/to/tabula-rs/0001-add-ffi-constructor-to-CommandLineApp.patch
10//! mvn compile assembly:single
11//! ```
12//! the built archive should then be target/tabula-$TABULA_VER-jar-with-dependencies.jar.
13//!
14//! Additionally, make sure `$JAVA_HOME/lib/server/libjvm.so` is reachable through `LD_LIBRARY_PATH` or explicitly set it as `LD_PRELOAD`.
15//!
16//! This can look like this:
17//! ```sh
18//! export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$JAVA_HOME/lib/server/
19//! ```
20//!
21//! ## Using tabula-rs
//! ### Initializing the JVM & accessing JNI
23//! In order to make use of tabula-java, you'll need to start [jni::JavaVM] with the built archive added to its classpath.
//! You could either do this manually, or call [TabulaVM::new()] with the (space escaped) path to the archive as parameter.
25//!
26//! Using [TabulaVM] you can now access the Java native interface by calling [TabulaVM::attach()].
27//! ```
28//! # use tabula::TabulaVM;
29//! let vm = TabulaVM::new("../tabula-java/target/tabula-1.0.6-SNAPSHOT-jar-with-dependencies.jar", false).unwrap();
30//! let env = vm.attach().unwrap();
31//! ```
32//!
33//! ### Instantiating Tabula class
//! With access to the JNI you can instantiate the [Tabula] class by calling [TabulaEnv::configure_tabula()].
35//! ```
36//! # use tabula::{ExtractionMethod, OutputFormat, TabulaVM};
37//! # let vm = TabulaVM::new("../tabula-java/target/tabula-1.0.6-SNAPSHOT-jar-with-dependencies.jar", false).unwrap();
38//! # let env = vm.attach().unwrap();
39//! let t = env.configure_tabula(None, None, OutputFormat::Csv, true, ExtractionMethod::Basic, false, None).unwrap();
40//! ```
41//!
42//! ### Parsing the document
//! [Tabula] provides [Tabula::parse_document()], which parses the document located at the given path and returns a [std::fs::File] located in memory.
44//! ```
45//! # use tabula::{ExtractionMethod, OutputFormat, TabulaVM};
46//! # let vm = TabulaVM::new("../tabula-java/target/tabula-1.0.6-SNAPSHOT-jar-with-dependencies.jar", false).unwrap();
47//! # let env = vm.attach().unwrap();
48//! # let t = env.configure_tabula(None, None, OutputFormat::Csv, true, ExtractionMethod::Basic, false, None).unwrap();
49//! let file = t.parse_document(&std::path::Path::new("./test_data/spanning_cells.pdf"), "test_spanning_cells").unwrap();
50//! ```
51//!
//! ## Relevant links
53//! - tabula-rs forge: <https://github.com/sp1ritCS/tabula-rs>
54//! - tabula-java project: <https://github.com/tabulapdf/tabula-java/>
55
56
57mod tmp_file;
58mod objects;
59use objects::{IntoJObject, Pair};
60pub use objects::{RELATIVE_AREA_CALCULATION_MODE, ABSOLUTE_AREA_CALCULATION_MODE, Rectangle, OutputFormat, ExtractionMethod};
61
62use anyhow::Result;
63use jni::{AttachGuard, InitArgsBuilder, JNIEnv, JNIVersion, JavaVM, objects::{JObject, JValue}, errors::Error as JError};
64pub use jni;
65use tmp_file::TempFile; // reexport
66
67use std::result::Result as StdResult;
68use std::ops::Deref;
69use std::path::Path;
70
/// Result type for operations whose failure is reported as a JNI error (`jni::errors::Error`).
pub type JResult<T> = StdResult<T, JError>;
73
///
/// # Java VM capable of using Tabula
///
/// Thin wrapper around a [jni::JavaVM]. Create one with [TabulaVM::new()]; inside this crate it can
/// also be built directly from an existing [jni::JavaVM], since that is its only (private) field.
///
pub struct TabulaVM(JavaVM);
80impl <'env> TabulaVM {
81 ///
82 /// Create a new Java VM capable of using Tabula
83 ///
84 /// - `libpath`: Escaped path to `tabula-java.jar`
85 /// - `debug`: runs jvm with `-Xcheck:jni`
86 ///
87 pub fn new(libpath: &str, debug: bool) -> Result<Self> {
88 let opt = format!("-Djava.class.path={}", libpath);
89 let mut jvm_args = InitArgsBuilder::new()
90 .version(JNIVersion::V8)
91 .option(&opt);
92
93 if debug {
94 jvm_args = jvm_args.option("-Xcheck:jni");
95 }
96
97 let jvm_args = jvm_args.build()?;
98
99 Ok(Self(JavaVM::new(jvm_args)?))
100 }
101
102 /// Get Java native interface capable of instantiating Tabula
103 pub fn attach(&'env self) -> Result<TabulaEnv<'env>> {
104 Ok(TabulaEnv(self.0.attach_current_thread()?))
105 }
106}
107
108
///
/// # Java native interface capable of instantiating Tabula class
///
/// Wraps the [jni::AttachGuard] received by calling [TabulaVM::attach()] and dereferences to
/// [jni::JNIEnv], so JNI methods can be called on it directly.
///
pub struct TabulaEnv<'env>(AttachGuard<'env>);
115
116impl <'env> TabulaEnv<'env> {
117 fn get_pages_jarray(&self, pages: &[i32]) -> JResult<*mut jni::sys::_jobject> {
118 let null = JObject::null();
119 let array = self.new_object_array(pages.len() as i32, "java/lang/Integer", null)?;
120 for (i, pg) in pages.iter().enumerate() {
121 self.set_object_array_element(array, i as i32, pg.get_jobject(self)?)?;
122 }
123 Ok(array)
124 }
125
126 fn get_page_areas_jarray(&self, page_areas: &[(i32, Rectangle)]) -> JResult<*mut jni::sys::_jobject> {
127 let null = JObject::null();
128 let array = self.new_object_array(page_areas.len() as i32, "technology/tabula/Pair", null)?;
129 for (i, (mode, rect)) in page_areas.iter().enumerate() {
130 let pga = Pair::new(*mode, *rect);
131 self.set_object_array_element(array, i as i32, pga.get_jobject(self)?)?;
132 }
133 Ok(array)
134 }
135
136 ///
137 /// # Instantiate Tabula class
138 ///
139 /// - `page_areas`: Portion of the page to analyze. If mode is [Relative](crate::RELATIVE_AREA_CALCULATION_MODE) the [Rectangle](crate::Rectangle) will be taken as % of actual height or width of the page.
140 /// - `pages`: Nullable slice (if None then all pages) to be parsed
141 /// - `output_format`: [crate::OutputFormat]
142 /// - `guess`: Guess the portion of the page to analyze per page.
143 /// - `method`: [crate::ExtractionMethod]
144 /// - `use_returns`: Use embedded line returns in cells. (Only in spreadsheet mode.)
145 /// - `password`: Password to decrypt document. None in case of no password.
146 ///
147 #[allow(clippy::too_many_arguments)]
148 pub fn configure_tabula(&self,
149 page_areas: Option<&[(i32, Rectangle)]>,
150 pages: Option<&[i32]>,
151 output_format: OutputFormat,
152 guess: bool,
153 method: ExtractionMethod,
154 use_returns: bool,
155 password: Option<&str>
156 ) -> JResult<Tabula> {
157 let areas = if let Some(page_areas) = page_areas {
158 JValue::from(self.get_page_areas_jarray(page_areas)?)
159 } else {
160 JValue::from(JObject::null())
161 };
162 let pages = if let Some(pages) = pages {
163 JValue::from(self.get_pages_jarray(pages)?)
164 } else {
165 JValue::from(JObject::null())
166 };
167 let password = password
168 .and_then(|pw| self.new_string(pw).ok())
169 .map(JValue::from)
170 .unwrap_or(JValue::from(JObject::null()));
171 let tabula = self.new_object("technology/tabula/CommandLineApp", "([Ltechnology/tabula/Pair;[Ljava/lang/Integer;Ltechnology/tabula/CommandLineApp$OutputFormat;ZLtechnology/tabula/CommandLineApp$ExtractionMethod;ZLjava/lang/String;)V", &[
172 areas,
173 pages,
174 JValue::from(output_format.get_jobject(self)?),
175 JValue::from(guess),
176 JValue::from(method.get_jobject(self)?),
177 JValue::from(use_returns),
178 password
179 ])?;
180
181 Ok(Tabula {
182 env: self,
183 inner: tabula
184 })
185 }
186}
187
188impl <'env> Deref for TabulaEnv<'env> {
189 type Target = JNIEnv<'env>;
190
191 fn deref(&self) -> &Self::Target {
192 &self.0
193 }
194}
195
///
/// # Tabula class
///
/// Handle to a Java `technology.tabula.CommandLineApp` instance,
/// received by calling [TabulaEnv::configure_tabula()]
///
pub struct Tabula<'env> {
    // JNI environment through which all calls on `inner` are made.
    env: &'env TabulaEnv<'env>,
    // The Java-side object created by `configure_tabula`.
    inner: JObject<'env>
}
205
206impl Tabula<'_> {
207 ///
208 /// # Parse document located at `path`.
209 ///
210 /// `descriptor_name` refers to the filename passed to [memfd_create()](https://git.kernel.org/pub/scm/docs/man-pages/man-pages.git/tree/man2/memfd_create.2)
211 ///
212 pub fn parse_document(&self, path: &Path, descriptor_name: &str) -> Result<std::fs::File> {
213 let output = tmp_file::new(descriptor_name)?;
214
215 let output_path = output.get_path();
216
217 self.parse_document_into(path, &output_path)?;
218
219 let file = output.into_file();
220 Ok(file)
221 }
222
223 ///
224 /// # Parse document located at `path`, writing the output into the file at `output`.
225 ///
226 pub fn parse_document_into(&self, path: &Path, output: &Path) -> Result<()> {
227 let file = path.get_jobject(self.env)?;
228 let outfile = output.get_jobject(self.env)?;
229
230 self.env.call_method(*self.deref(), "extractFileInto", "(Ljava/io/File;Ljava/io/File;)V", &[
231 JValue::Object(file),
232 JValue::Object(outfile)
233 ])?;
234
235 Ok(())
236 }
237}
238
239impl <'env> Deref for Tabula<'env> {
240 type Target = JObject<'env>;
241
242 fn deref(&self) -> &Self::Target {
243 &self.inner
244 }
245}
246
247#[cfg(test)]
248mod tests;