docbox_processing/office/
convert_server.rs

1use super::{ConvertToPdf, PdfConvertError};
2use bytes::Bytes;
3use office_convert_client::{
4    OfficeConvertClient, OfficeConvertLoadBalancer, OfficeConverter, RequestError,
5};
6use reqwest::Client;
7use serde::{Deserialize, Serialize};
8use thiserror::Error;
9
/// Mime type essences that are accepted for conversion to PDF.
///
/// [`is_known_pdf_convertable`] compares `Mime::essence_str()` against these
/// entries using exact string equality. Image types are listed here but are
/// rejected separately by [`is_known_pdf_convertable`].
pub const CONVERTABLE_FORMATS: &[&str] = &[
    // .dotm
    "application/vnd.ms-word.template.macroenabled.12",
    // .xlsb
    "application/vnd.ms-excel.sheet.binary.macroenabled.12",
    // .xlsm
    "application/vnd.ms-excel.sheet.macroenabled.12",
    // .xltm
    "application/vnd.ms-excel.template.macroenabled.12",
    // .ods
    "application/vnd.oasis.opendocument.spreadsheet",
    "text/html",
    "application/msword",
    "application/vnd.oasis.opendocument.text-flat-xml",
    "application/rtf",
    "application/vnd.sun.xml.writer",
    "application/vnd.wordperfect",
    "application/vnd.ms-works",
    "application/x-mswrite",
    "application/clarisworks",
    "application/macwriteii",
    "application/x-abiword",
    "application/x-t602",
    "application/vnd.lotus-wordpro",
    "application/x-hwp",
    "application/vnd.sun.xml.writer.template",
    "application/pdf",
    "application/vnd.oasis.opendocument.text",
    "application/vnd.oasis.opendocument.text-template",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
    // NOTE(review): this is not a registered media type (slideshows belong to
    // `presentationml`); kept so existing matches are unaffected
    "application/vnd.openxmlformats-officedocument.wordprocessingml.slideshow",
    // .ppsx
    "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.oasis.opendocument.presentation",
    "application/x-fictionbook+xml",
    "application/x-aportisdoc",
    "application/prs.plucker",
    "application/x-iwork-pages-sffpages",
    "application/vnd.palm",
    "application/epub+zip",
    "application/x-pocket-word",
    "application/vnd.oasis.opendocument.spreadsheet-flat-xml",
    "application/vnd.lotus-1-2-3",
    "application/vnd.ms-excel",
    "text/spreadsheet",
    "application/vnd.sun.xml.calc",
    "application/vnd.sun.xml.calc.template",
    "application/x-gnumeric",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    // NOTE(review): mixed-case spelling of the `.xlsm` entry above; presumably
    // never matched if the mime essence is lowercased when parsed — confirm
    // before removing
    "application/vnd.ms-excel.sheet.macroEnabled.12",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
    "application/x-iwork-numbers-sffnumbers",
    "application/mathml+xml",
    "application/vnd.sun.xml.math",
    "application/vnd.oasis.opendocument.formula",
    "application/vnd.sun.xml.base",
    "image/jpeg",
    "image/png",
    "image/svg+xml",
    "image/webp",
    "application/docbook+xml",
    "application/xhtml+xml",
];
75
76#[derive(Debug, Clone, Serialize, Deserialize)]
77pub struct OfficeConvertServerConfig {
78    pub addresses: Vec<String>,
79    pub use_proxy: bool,
80}
81
82impl OfficeConvertServerConfig {
83    pub fn from_env() -> OfficeConvertServerConfig {
84        let addresses =
85            std::env::var("CONVERT_SERVER_ADDRESS").unwrap_or("http://127.0.0.1:8081".to_string());
86        let addresses = addresses
87            .split(',')
88            .map(|value| value.to_string())
89            .collect();
90
91        // By default the office convert server will ignore the system proxy
92        // since we don't want file conversion to take an extra network hop since
93        // it shouldn't be leaving the private network
94        //
95        // CONVERT_SERVER_USE_PROXY allows this behavior to be disabled
96        let use_proxy = match std::env::var("CONVERT_SERVER_USE_PROXY") {
97            Ok(value) => match value.parse::<bool>() {
98                Ok(value) => value,
99                Err(error) => {
100                    tracing::error!(
101                        ?error,
102                        "invalid CONVERT_SERVER_USE_PROXY environment variable, defaulting to false"
103                    );
104                    false
105                }
106            },
107            Err(_) => false,
108        };
109
110        OfficeConvertServerConfig {
111            addresses,
112            use_proxy,
113        }
114    }
115}
116
117/// Variant of [ConvertToPdf] that uses LibreOffice through a
118/// office-converter server for the conversion
119#[derive(Clone)]
120pub struct OfficeConverterServer {
121    client: OfficeConverter,
122}
123
124#[derive(Debug, Error)]
125pub enum OfficeConvertServerError {
126    #[error("failed to build http client")]
127    BuildHttpClient(reqwest::Error),
128    #[error("no office convert server addresses provided")]
129    NoAddresses,
130}
131
132impl OfficeConverterServer {
133    pub fn new(client: OfficeConverter) -> Self {
134        Self { client }
135    }
136
137    pub fn from_config(
138        config: OfficeConvertServerConfig,
139    ) -> Result<Self, OfficeConvertServerError> {
140        Self::from_addresses(
141            config.addresses.iter().map(|value| value.as_str()),
142            config.use_proxy,
143        )
144    }
145
146    pub fn from_addresses<'a, I>(
147        addresses: I,
148        use_proxy: bool,
149    ) -> Result<Self, OfficeConvertServerError>
150    where
151        I: IntoIterator<Item = &'a str>,
152    {
153        let mut convert_clients: Vec<OfficeConvertClient> = Vec::new();
154        let mut http_client = Client::builder();
155
156        if !use_proxy {
157            http_client = http_client.no_proxy();
158        }
159
160        let http_client = http_client
161            .build()
162            .map_err(OfficeConvertServerError::BuildHttpClient)?;
163
164        for convert_server_address in addresses {
165            tracing::debug!(address = ?convert_server_address, "added convert server");
166
167            let convert_client =
168                OfficeConvertClient::from_client(convert_server_address, http_client.clone());
169
170            convert_clients.push(convert_client);
171        }
172
173        if convert_clients.is_empty() {
174            return Err(OfficeConvertServerError::NoAddresses);
175        }
176
177        // Create a convert load balancer
178        let load_balancer = OfficeConvertLoadBalancer::new(convert_clients);
179        Ok(Self::new(OfficeConverter::from_load_balancer(
180            load_balancer,
181        )))
182    }
183}
184
185impl ConvertToPdf for OfficeConverterServer {
186    async fn convert_to_pdf(&self, file_bytes: Bytes) -> Result<Bytes, PdfConvertError> {
187        self.client
188            .convert(file_bytes)
189            .await
190            .map_err(|err| match err {
191                // File was encrypted
192                RequestError::ErrorResponse { reason, .. } if reason == "file is encrypted" => {
193                    PdfConvertError::EncryptedDocument
194                }
195                // File was corrupted or unreadable
196                RequestError::ErrorResponse { reason, .. } if reason == "file is corrupted" => {
197                    PdfConvertError::MalformedDocument
198                }
199                // Other unknown error
200                err => PdfConvertError::ConversionFailed(err),
201            })
202    }
203
204    fn is_convertable(&self, mime: &mime::Mime) -> bool {
205        is_known_pdf_convertable(mime)
206    }
207}
208
209/// Checks if the provided mime is included in the known convertable mime types
210pub fn is_known_pdf_convertable(mime: &mime::Mime) -> bool {
211    // We don't want to send images through the office converter
212    mime.type_() != mime::IMAGE &&
213    // Must be in the convertable formats list
214    CONVERTABLE_FORMATS.contains(&mime.essence_str())
215}