processors_rs/pdf/tesseract/
output_data.rs

1use input::{Args, Image};
2use parse_line_util::{parse_next, FromLine};
3
4use super::*;
5use core::fmt;
6
/// Result of a TSV run: the raw text plus the rows parsed from it.
///
/// Built by `image_to_data`, which stores the string returned by the
/// Tesseract command in `output` and the rows produced by
/// `string_to_data` in `data`.
#[derive(Debug, PartialEq)]
pub struct DataOutput {
    /// Raw command output, kept unmodified.
    pub output: String,
    /// Rows parsed from `output`; the first line of `output` is not included.
    pub data: Vec<Data>,
}
12
13impl fmt::Display for DataOutput {
14    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
15        write!(f, "{}", self.output)
16    }
17}
18
/// One parsed row of the TSV output.
///
/// The field names and their order mirror the TSV header columns
/// (`level`, `page_num`, `block_num`, `par_num`, `line_num`, `word_num`,
/// `left`, `top`, `width`, `height`, `conf`, `text`), which is also the
/// order in which `from_line` reads them.
///
/// NOTE(review): the meaning of each column is defined by Tesseract, not by
/// this file — confirm against the Tesseract documentation.
#[derive(Debug, PartialEq)]
pub struct Data {
    pub level: i32,
    pub page_num: i32,
    pub block_num: i32,
    pub par_num: i32,
    pub line_num: i32,
    pub word_num: i32,
    pub left: i32,
    pub top: i32,
    pub width: i32,
    pub height: i32,
    /// The only floating-point column.
    pub conf: f32,
    /// Empty when the row has no twelfth token (see `from_line`).
    pub text: String,
}
34
35impl fmt::Display for Data {
36    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37        write!(
38            f,
39            "{} {} {} {} {} {} {} {} {} {} {} {}",
40            self.level,
41            self.page_num,
42            self.block_num,
43            self.par_num,
44            self.line_num,
45            self.word_num,
46            self.left,
47            self.top,
48            self.width,
49            self.height,
50            self.conf,
51            self.text,
52        )
53    }
54}
55
56impl FromLine for Data {
57    fn from_line(line: &str) -> Option<Self> {
58        let mut x = line.split_whitespace();
59        Some(Data {
60            level: parse_next(&mut x)?,
61            page_num: parse_next(&mut x)?,
62            block_num: parse_next(&mut x)?,
63            par_num: parse_next(&mut x)?,
64            line_num: parse_next(&mut x)?,
65            word_num: parse_next(&mut x)?,
66            left: parse_next(&mut x)?,
67            top: parse_next(&mut x)?,
68            width: parse_next(&mut x)?,
69            height: parse_next(&mut x)?,
70            conf: parse_next(&mut x)?,
71            text: x.next().unwrap_or("").to_string(),
72        })
73    }
74}
75
76pub fn image_to_data(image: &Image, args: &Args) -> error::TessResult<DataOutput> {
77    let mut command = command::create_tesseract_command(image, args)?;
78    command.arg("tsv");
79
80    let output = command::run_tesseract_command(&mut command)?;
81
82    let data = string_to_data(&output)?;
83
84    Ok(DataOutput { output, data })
85}
86
87fn string_to_data(output: &str) -> error::TessResult<Vec<Data>> {
88    output.lines().skip(1).map(Data::parse).collect::<_>()
89}
90
#[cfg(test)]
mod tests {
    use crate::pdf::tesseract::output_data::{string_to_data, Data};

    /// A header line followed by one well-formed row parses into the matching `Data`.
    #[test]
    fn test_string_to_data() {
        let expected = Data {
            level: 5,
            page_num: 1,
            block_num: 1,
            par_num: 1,
            line_num: 1,
            word_num: 1,
            left: 65,
            top: 41,
            width: 46,
            height: 20,
            conf: 96.063_75,
            text: String::from("The"),
        };

        let result = string_to_data("level   page_num        block_num       par_num line_num        word_num        left    top     width   height  conf    text
        5       1       1       1       1       1       65      41      46      20      96.063751       The");
        let rows = result.unwrap();

        assert_eq!(*rows.first().unwrap(), expected);
    }

    /// A row that cannot be parsed is reported as a `ParseError` naming the line.
    #[test]
    fn test_string_to_data_parse_error() {
        let expected = Err(crate::pdf::tesseract::error::TessError::ParseError(
            "invalid line 'Test'".into(),
        ));

        let result = string_to_data("level   page_num        block_num       par_num line_num        word_num        left    top     width   height  conf    text\n\
        Test");

        assert_eq!(result, expected);
    }
}