tabula/
lib.rs

1//!
2//! # Rust bindings for tabulapdf/tabula-java
3//! 
4//! ## Prerequisites
5//! In order to use tabula-rs, you will need a tabula-java bytecode archive (jar).
//! You can build it yourself by cloning <ssh://git@github.com/tabulapdf/tabula-java.git> and then invoking [maven](https://software.opensuse.org/package/maven) to build it.
7//! ```sh
8//! git clone git@github.com:tabulapdf/tabula-java.git && cd tabula-java
9//! git apply path/to/tabula-rs/0001-add-ffi-constructor-to-CommandLineApp.patch
10//! mvn compile assembly:single
11//! ```
//! The built archive should then be `target/tabula-$TABULA_VER-jar-with-dependencies.jar`.
13//!
14//! Additionally, make sure `$JAVA_HOME/lib/server/libjvm.so` is reachable through `LD_LIBRARY_PATH` or explicitly set it as `LD_PRELOAD`.
15//!
16//! This can look like this:
17//! ```sh
18//! export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$JAVA_HOME/lib/server/
19//! ```
20//!
21//! ## Using tabula-rs
//! ### Initializing JVM & accessing JNI
23//! In order to make use of tabula-java, you'll need to start [jni::JavaVM] with the built archive added to its classpath.
//! You could either do this manually, or call [TabulaVM::new()] with the (space escaped) path to the archive as parameter.
25//! 
26//! Using [TabulaVM] you can now access the Java native interface by calling [TabulaVM::attach()].
27//! ```
28//! # use tabula::TabulaVM;
29//! let vm = TabulaVM::new("../tabula-java/target/tabula-1.0.6-SNAPSHOT-jar-with-dependencies.jar", false).unwrap();
30//! let env = vm.attach().unwrap();
31//! ```
32//!
33//! ### Instantiating Tabula class
//! With access to the JNI you can instantiate the [Tabula] class by calling [TabulaEnv::configure_tabula()].
35//! ```
36//! # use tabula::{ExtractionMethod, OutputFormat, TabulaVM};
37//! # let vm = TabulaVM::new("../tabula-java/target/tabula-1.0.6-SNAPSHOT-jar-with-dependencies.jar", false).unwrap();
38//! # let env = vm.attach().unwrap();
39//! let t = env.configure_tabula(None, None, OutputFormat::Csv, true, ExtractionMethod::Basic, false, None).unwrap();
40//! ```
41//!
42//! ### Parsing the document
//! [Tabula] provides [Tabula::parse_document()], which parses the document located at the given path and returns a [std::fs::File] located in memory.
44//! ```
45//! # use tabula::{ExtractionMethod, OutputFormat, TabulaVM};
46//! # let vm = TabulaVM::new("../tabula-java/target/tabula-1.0.6-SNAPSHOT-jar-with-dependencies.jar", false).unwrap();
47//! # let env = vm.attach().unwrap();
48//! # let t = env.configure_tabula(None, None, OutputFormat::Csv, true, ExtractionMethod::Basic, false, None).unwrap();
49//! let file = t.parse_document(&std::path::Path::new("./test_data/spanning_cells.pdf"), "test_spanning_cells").unwrap();
50//! ```  
51//! 
//! ## Relevant links
53//! - tabula-rs forge: <https://github.com/sp1ritCS/tabula-rs>
54//! - tabula-java project: <https://github.com/tabulapdf/tabula-java/>
55
56
57mod tmp_file;
58mod objects;
59use objects::{IntoJObject, Pair};
60pub use objects::{RELATIVE_AREA_CALCULATION_MODE, ABSOLUTE_AREA_CALCULATION_MODE, Rectangle, OutputFormat, ExtractionMethod};
61
62use anyhow::Result;
63use jni::{AttachGuard, InitArgsBuilder, JNIEnv, JNIVersion, JavaVM, objects::{JObject, JValue}, errors::Error as JError};
64pub use jni;
65use tmp_file::TempFile; // reexport
66
67use std::result::Result as StdResult;
68use std::ops::Deref;
69use std::path::Path;
70
71/// Result returned from JNI
72pub type JResult<T> = StdResult<T, JError>;
73
74///
75/// # Java VM capable of using Tabula
76///
77/// Can be created using [TabulaVM::new()] or by putting a [jni::JavaVM] as it's first inner parameter 
78///
79pub struct TabulaVM(JavaVM);
80impl <'env> TabulaVM {
81	/// 
82	/// Create a new Java VM capable of using Tabula
83	///
84	/// - `libpath`: Escaped path to `tabula-java.jar`
85	/// - `debug`: runs jvm with `-Xcheck:jni`
86	///
87	pub fn new(libpath: &str, debug: bool) -> Result<Self> {
88		let opt = format!("-Djava.class.path={}", libpath);
89		let mut jvm_args = InitArgsBuilder::new()
90			.version(JNIVersion::V8)
91			.option(&opt);
92
93		if debug {
94			jvm_args = jvm_args.option("-Xcheck:jni");
95		}
96		
97		let jvm_args = jvm_args.build()?;
98
99		Ok(Self(JavaVM::new(jvm_args)?))
100	}
101	
102	/// Get Java native interface capable of instantiating Tabula
103	pub fn attach(&'env self) -> Result<TabulaEnv<'env>> {
104		Ok(TabulaEnv(self.0.attach_current_thread()?))
105	}
106}
107
108
109///
110/// # Java native interface capable of instantiating Tabula class
111///
112/// received by calling [TabulaVM::attach()]
113///
114pub struct TabulaEnv<'env>(AttachGuard<'env>);
115
116impl <'env> TabulaEnv<'env> {
117	fn get_pages_jarray(&self, pages: &[i32]) -> JResult<*mut jni::sys::_jobject> {
118		let null = JObject::null();
119		let array = self.new_object_array(pages.len() as i32, "java/lang/Integer", null)?;
120		for (i, pg) in pages.iter().enumerate() {
121			self.set_object_array_element(array, i as i32, pg.get_jobject(self)?)?;
122		}
123		Ok(array)
124	}
125	
126	fn get_page_areas_jarray(&self, page_areas: &[(i32, Rectangle)]) -> JResult<*mut jni::sys::_jobject> {
127		let null = JObject::null();
128		let array = self.new_object_array(page_areas.len() as i32, "technology/tabula/Pair", null)?;
129		for (i, (mode, rect)) in page_areas.iter().enumerate() {
130			let pga = Pair::new(*mode, *rect);
131			self.set_object_array_element(array, i as i32, pga.get_jobject(self)?)?;
132		}
133		Ok(array)
134	}
135	
136	///
137	/// # Instantiate Tabula class 
138	///
139	/// - `page_areas`: Portion of the page to analyze. If mode is [Relative](crate::RELATIVE_AREA_CALCULATION_MODE) the [Rectangle](crate::Rectangle) will be taken as % of actual height or width of the page.
140	/// - `pages`: Nullable slice (if None then all pages) to be parsed
141	/// - `output_format`: [crate::OutputFormat]
142	/// - `guess`: Guess the portion of the page to analyze per page.
143	/// - `method`: [crate::ExtractionMethod]
144	/// - `use_returns`: Use embedded line returns in cells. (Only in spreadsheet mode.)
145	/// - `password`: Password to decrypt document. None in case of no password.
146	///
147	#[allow(clippy::too_many_arguments)]
148	pub fn configure_tabula(&self,
149		page_areas: Option<&[(i32, Rectangle)]>,
150		pages: Option<&[i32]>,
151		output_format: OutputFormat,
152		guess: bool,
153		method: ExtractionMethod,
154		use_returns: bool,
155		password: Option<&str>
156	) -> JResult<Tabula> {
157		let areas = if let Some(page_areas) = page_areas {
158			JValue::from(self.get_page_areas_jarray(page_areas)?)
159		} else {
160			JValue::from(JObject::null())
161		};
162		let pages = if let Some(pages) = pages {
163			JValue::from(self.get_pages_jarray(pages)?)
164		} else {
165			JValue::from(JObject::null())
166		};
167		let password = password
168			.and_then(|pw| self.new_string(pw).ok())
169			.map(JValue::from)
170			.unwrap_or(JValue::from(JObject::null()));
171		let tabula = self.new_object("technology/tabula/CommandLineApp", "([Ltechnology/tabula/Pair;[Ljava/lang/Integer;Ltechnology/tabula/CommandLineApp$OutputFormat;ZLtechnology/tabula/CommandLineApp$ExtractionMethod;ZLjava/lang/String;)V", &[
172			areas,
173			pages,
174			JValue::from(output_format.get_jobject(self)?),
175			JValue::from(guess),
176			JValue::from(method.get_jobject(self)?),
177			JValue::from(use_returns),
178			password
179		])?;
180
181		Ok(Tabula {
182			env: self,
183			inner: tabula
184		})
185	}
186}
187
188impl <'env> Deref for TabulaEnv<'env> {
189	type Target = JNIEnv<'env>;
190
191	fn deref(&self) -> &Self::Target {
192		&self.0
193	}
194}
195
196///
197/// # Tabula class
198///
199/// received by calling [TabulaEnv::configure_tabula()]
200///
201pub struct Tabula<'env> {
202	env: &'env TabulaEnv<'env>,
203	inner: JObject<'env>
204}
205
206impl Tabula<'_> {
207	///
208	/// # Parse document located at `path`.
209	///
210	/// `descriptor_name` refers to the filename passed to [memfd_create()](https://git.kernel.org/pub/scm/docs/man-pages/man-pages.git/tree/man2/memfd_create.2)
211	///
212	pub fn parse_document(&self, path: &Path, descriptor_name: &str) -> Result<std::fs::File> {
213		let output = tmp_file::new(descriptor_name)?;
214
215		let output_path = output.get_path();
216
217		self.parse_document_into(path, &output_path)?;
218		
219		let file = output.into_file();
220		Ok(file)
221	}
222
223	///
224	/// # Parse document located at `path`, writing the output into the file at `output`.
225	///
226	pub fn parse_document_into(&self, path: &Path, output: &Path) -> Result<()> {
227		let file = path.get_jobject(self.env)?;
228		let outfile = output.get_jobject(self.env)?;
229
230		self.env.call_method(*self.deref(), "extractFileInto", "(Ljava/io/File;Ljava/io/File;)V", &[
231			JValue::Object(file),
232			JValue::Object(outfile)
233		])?;
234
235		Ok(())
236	}
237}
238
239impl <'env> Deref for Tabula<'env> {
240	type Target = JObject<'env>;
241
242	fn deref(&self) -> &Self::Target {
243		&self.inner
244	}
245}
246
247#[cfg(test)]
248mod tests;