docbox_processing/office/
convert_server.rs

1//! # Convert Server
2//!
//! Backend that performs office file conversion through a persistent file
//! conversion server (<https://github.com/jacobtread/office-convert-server>)
5//!
6//! ## Environment Variables
7//!
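//! * `DOCBOX_CONVERT_SERVER_ADDRESS` - Comma separated list of server addresses
//!   (falls back to `CONVERT_SERVER_ADDRESS`, then to `http://127.0.0.1:8081`)
//! * `DOCBOX_CONVERT_SERVER_USE_PROXY` - Whether to use the system proxy when talking to the server
//!   (falls back to `CONVERT_SERVER_USE_PROXY`, then to `false`)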
10
11use crate::office::libreoffice::is_known_libreoffice_pdf_convertable;
12
13use super::{ConvertToPdf, PdfConvertError};
14use bytes::Bytes;
15use office_convert_client::{
16    OfficeConvertClient, OfficeConvertLoadBalancer, OfficeConverter, RequestError,
17};
18use reqwest::Client;
19use serde::{Deserialize, Serialize};
20use thiserror::Error;
21
22#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct OfficeConvertServerConfig {
24    pub addresses: Vec<String>,
25    pub use_proxy: bool,
26}
27
28impl OfficeConvertServerConfig {
29    pub fn from_env() -> OfficeConvertServerConfig {
30        let addresses = std::env::var("DOCBOX_CONVERT_SERVER_ADDRESS")
31            .or(std::env::var("CONVERT_SERVER_ADDRESS"))
32            .unwrap_or("http://127.0.0.1:8081".to_string());
33        let addresses = addresses
34            .split(',')
35            .map(|value| value.to_string())
36            .collect();
37
38        // By default the office convert server will ignore the system proxy
39        // since we don't want file conversion to take an extra network hop since
40        // it shouldn't be leaving the private network
41        //
42        // CONVERT_SERVER_USE_PROXY allows this behavior to be disabled
43        let use_proxy = match std::env::var("DOCBOX_CONVERT_SERVER_USE_PROXY")
44            .or(std::env::var("CONVERT_SERVER_USE_PROXY"))
45        {
46            Ok(value) => match value.parse::<bool>() {
47                Ok(value) => value,
48                Err(error) => {
49                    tracing::error!(
50                        ?error,
51                        "invalid CONVERT_SERVER_USE_PROXY environment variable, defaulting to false"
52                    );
53                    false
54                }
55            },
56            Err(_) => false,
57        };
58
59        OfficeConvertServerConfig {
60            addresses,
61            use_proxy,
62        }
63    }
64}
65
66/// Variant of [ConvertToPdf] that uses LibreOffice through a
67/// office-converter server for the conversion
68#[derive(Clone)]
69pub struct OfficeConverterServer {
70    client: OfficeConverter,
71}
72
73#[derive(Debug, Error)]
74pub enum OfficeConvertServerError {
75    #[error("failed to build http client")]
76    BuildHttpClient(reqwest::Error),
77    #[error("no office convert server addresses provided")]
78    NoAddresses,
79}
80
81impl OfficeConverterServer {
82    pub fn new(client: OfficeConverter) -> Self {
83        Self { client }
84    }
85
86    pub fn from_config(
87        config: OfficeConvertServerConfig,
88    ) -> Result<Self, OfficeConvertServerError> {
89        Self::from_addresses(
90            config.addresses.iter().map(|value| value.as_str()),
91            config.use_proxy,
92        )
93    }
94
95    pub fn from_addresses<'a, I>(
96        addresses: I,
97        use_proxy: bool,
98    ) -> Result<Self, OfficeConvertServerError>
99    where
100        I: IntoIterator<Item = &'a str>,
101    {
102        let mut convert_clients: Vec<OfficeConvertClient> = Vec::new();
103        let mut http_client = Client::builder();
104
105        if !use_proxy {
106            http_client = http_client.no_proxy();
107        }
108
109        let http_client = http_client
110            .build()
111            .map_err(OfficeConvertServerError::BuildHttpClient)?;
112
113        for convert_server_address in addresses {
114            tracing::debug!(address = ?convert_server_address, "added convert server");
115
116            let convert_client =
117                OfficeConvertClient::from_client(convert_server_address, http_client.clone());
118
119            convert_clients.push(convert_client);
120        }
121
122        if convert_clients.is_empty() {
123            return Err(OfficeConvertServerError::NoAddresses);
124        }
125
126        // Create a convert load balancer
127        let load_balancer = OfficeConvertLoadBalancer::new(convert_clients);
128        Ok(Self::new(OfficeConverter::from_load_balancer(
129            load_balancer,
130        )))
131    }
132}
133
134impl ConvertToPdf for OfficeConverterServer {
135    async fn convert_to_pdf(&self, file_bytes: Bytes) -> Result<Bytes, PdfConvertError> {
136        self.client
137            .convert(file_bytes)
138            .await
139            .map_err(|err| match err {
140                // File was encrypted
141                RequestError::ErrorResponse { reason, .. } if reason == "file is encrypted" => {
142                    PdfConvertError::EncryptedDocument
143                }
144                // File was corrupted or unreadable
145                RequestError::ErrorResponse { reason, .. } if reason == "file is corrupted" => {
146                    PdfConvertError::MalformedDocument
147                }
148                // Other unknown error
149                err => PdfConvertError::ConversionFailed(err),
150            })
151    }
152
153    fn is_convertable(&self, mime: &mime::Mime) -> bool {
154        is_known_libreoffice_pdf_convertable(mime)
155    }
156}