processors_rs/pdf/tesseract/
output_data.rs1use input::{Args, Image};
2use parse_line_util::{parse_next, FromLine};
3
4use super::*;
5use core::fmt;
6
7#[derive(Debug, PartialEq)]
8pub struct DataOutput {
9 pub output: String,
10 pub data: Vec<Data>,
11}
12
13impl fmt::Display for DataOutput {
14 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
15 write!(f, "{}", self.output)
16 }
17}
18
/// One parsed row of Tesseract's TSV output.
///
/// The fields follow the TSV header columns, in order:
/// `level page_num block_num par_num line_num word_num left top width height conf text`.
///
/// NOTE(review): the exact meaning and units of each column are defined by
/// Tesseract, not by this file — confirm against its documentation.
#[derive(Debug, PartialEq)]
pub struct Data {
    /// Value of the `level` column.
    pub level: i32,
    /// Value of the `page_num` column.
    pub page_num: i32,
    /// Value of the `block_num` column.
    pub block_num: i32,
    /// Value of the `par_num` column.
    pub par_num: i32,
    /// Value of the `line_num` column.
    pub line_num: i32,
    /// Value of the `word_num` column.
    pub word_num: i32,
    /// Value of the `left` column.
    pub left: i32,
    /// Value of the `top` column.
    pub top: i32,
    /// Value of the `width` column.
    pub width: i32,
    /// Value of the `height` column.
    pub height: i32,
    /// Value of the `conf` column.
    pub conf: f32,
    /// Value of the `text` column; empty when the row carries no text.
    pub text: String,
}
34
35impl fmt::Display for Data {
36 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37 write!(
38 f,
39 "{} {} {} {} {} {} {} {} {} {} {} {}",
40 self.level,
41 self.page_num,
42 self.block_num,
43 self.par_num,
44 self.line_num,
45 self.word_num,
46 self.left,
47 self.top,
48 self.width,
49 self.height,
50 self.conf,
51 self.text,
52 )
53 }
54}
55
56impl FromLine for Data {
57 fn from_line(line: &str) -> Option<Self> {
58 let mut x = line.split_whitespace();
59 Some(Data {
60 level: parse_next(&mut x)?,
61 page_num: parse_next(&mut x)?,
62 block_num: parse_next(&mut x)?,
63 par_num: parse_next(&mut x)?,
64 line_num: parse_next(&mut x)?,
65 word_num: parse_next(&mut x)?,
66 left: parse_next(&mut x)?,
67 top: parse_next(&mut x)?,
68 width: parse_next(&mut x)?,
69 height: parse_next(&mut x)?,
70 conf: parse_next(&mut x)?,
71 text: x.next().unwrap_or("").to_string(),
72 })
73 }
74}
75
76pub fn image_to_data(image: &Image, args: &Args) -> error::TessResult<DataOutput> {
77 let mut command = command::create_tesseract_command(image, args)?;
78 command.arg("tsv");
79
80 let output = command::run_tesseract_command(&mut command)?;
81
82 let data = string_to_data(&output)?;
83
84 Ok(DataOutput { output, data })
85}
86
87fn string_to_data(output: &str) -> error::TessResult<Vec<Data>> {
88 output.lines().skip(1).map(Data::parse).collect::<_>()
89}
90
#[cfg(test)]
mod tests {
    use super::{string_to_data, Data};
    use crate::pdf::tesseract::error::TessError;

    /// A header line followed by one well-formed row yields that row.
    #[test]
    fn test_string_to_data() {
        let tsv = "level page_num block_num par_num line_num word_num left top width height conf text
        5 1 1 1 1 1 65 41 46 20 96.063751 The";
        let expected = Data {
            level: 5,
            page_num: 1,
            block_num: 1,
            par_num: 1,
            line_num: 1,
            word_num: 1,
            left: 65,
            top: 41,
            width: 46,
            height: 20,
            conf: 96.063_75,
            text: String::from("The"),
        };

        let rows = string_to_data(tsv).unwrap();

        assert_eq!(rows.first(), Some(&expected));
    }

    /// A row that cannot be parsed surfaces as a `ParseError` naming the line.
    #[test]
    fn test_string_to_data_parse_error() {
        let tsv = "level page_num block_num par_num line_num word_num left top width height conf text\n\
                   Test";
        let expected = Err(TessError::ParseError("invalid line 'Test'".into()));

        assert_eq!(string_to_data(tsv), expected);
    }
}
129}