Skip to main content

docbox_processing/office/
convert_lambda.rs

1//! # Convert Lambda
2//!
//! Office file conversion backend that delegates the conversion to the Lambda based
//! file conversion server <https://github.com/jacobtread/office-convert-lambda>
5//!
6//! ## Environment Variables
7//!
8//! * `DOCBOX_CONVERT_LAMBDA_TMP_BUCKET` - S3 bucket to store the temporary input and output files from conversion
9//! * `DOCBOX_CONVERT_LAMBDA_FUNCTION_NAME` - The name or ARN of the Lambda function, version, or alias.
10//! * `DOCBOX_CONVERT_LAMBDA_QUALIFIER` - Optionally specify a version or alias to invoke a published version of the function.
11//! * `DOCBOX_CONVERT_LAMBDA_TENANT_ID` - Optional identifier of the tenant in a multi-tenant Lambda function.
//! * `DOCBOX_CONVERT_LAMBDA_RETRY_ATTEMPTS` - Maximum number of times to retry on unexpected failures (defaults to 3)
//! * `DOCBOX_CONVERT_LAMBDA_RETRY_WAIT` - Delay in whole seconds to wait between each retry attempt (defaults to 1)
14
15use std::{str::FromStr, time::Duration};
16
17use crate::office::libreoffice::is_known_libreoffice_pdf_convertable;
18
19use super::{ConvertToPdf, PdfConvertError};
20use aws_config::SdkConfig;
21use bytes::Bytes;
22use docbox_database::sqlx::types::Uuid;
23use docbox_storage::{
24    StorageLayer, StorageLayerError, StorageLayerFactory, StorageLayerOptions, UploadFileOptions,
25};
26use office_convert_lambda_client::{ConvertError, OfficeConvertLambda, OfficeConvertLambdaOptions};
27use serde::{Deserialize, Serialize};
28use thiserror::Error;
29
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct OfficeConvertLambdaConfig {
32    /// The name or ARN of the Lambda function, version, or alias.
33    pub function_name: String,
34    /// Specify a version or alias to invoke a published version of the function.
35    pub qualifier: Option<String>,
36    /// The identifier of the tenant in a multi-tenant Lambda function.
37    pub tenant_id: Option<String>,
38    /// Number of retry attempts to perform
39    pub retry_attempts: usize,
40    /// Time to wait between retry attempts
41    pub retry_wait: Duration,
42    /// Temporary bucket to use for lambda input and output files
43    pub tmp_bucket: String,
44}
45
46#[derive(Debug, Error)]
47pub enum OfficeConvertLambdaConfigError {
48    #[error("missing DOCBOX_CONVERT_LAMBDA_TMP_BUCKET environment variable")]
49    MissingTempBucket,
50    #[error("missing DOCBOX_CONVERT_LAMBDA_FUNCTION_NAME environment variable")]
51    MissingFunctionName,
52    #[error("DOCBOX_CONVERT_LAMBDA_RETRY_ATTEMPTS must be a number")]
53    InvalidRetryAttempts(<usize as FromStr>::Err),
54    #[error("DOCBOX_CONVERT_LAMBDA_RETRY_WAIT must be a number in seconds: {0}")]
55    InvalidRetryWait(<u64 as FromStr>::Err),
56}
57
58impl OfficeConvertLambdaConfig {
59    pub fn from_env() -> Result<OfficeConvertLambdaConfig, OfficeConvertLambdaConfigError> {
60        let tmp_bucket = std::env::var("DOCBOX_CONVERT_LAMBDA_TMP_BUCKET")
61            .map_err(|_| OfficeConvertLambdaConfigError::MissingTempBucket)?;
62
63        let function_name = std::env::var("DOCBOX_CONVERT_LAMBDA_FUNCTION_NAME")
64            .map_err(|_| OfficeConvertLambdaConfigError::MissingFunctionName)?;
65
66        let qualifier = std::env::var("DOCBOX_CONVERT_LAMBDA_QUALIFIER").ok();
67        let tenant_id = std::env::var("DOCBOX_CONVERT_LAMBDA_TENANT_ID").ok();
68
69        let retry_attempts = match std::env::var("DOCBOX_CONVERT_LAMBDA_RETRY_ATTEMPTS") {
70            Ok(retry_attempts) => retry_attempts
71                .parse::<usize>()
72                .map_err(OfficeConvertLambdaConfigError::InvalidRetryAttempts)?,
73            Err(_) => 3,
74        };
75
76        let retry_wait = match std::env::var("DOCBOX_CONVERT_LAMBDA_RETRY_WAIT") {
77            Ok(retry_wait) => retry_wait
78                .parse::<u64>()
79                .map_err(OfficeConvertLambdaConfigError::InvalidRetryWait)
80                .map(Duration::from_secs)?,
81            Err(_) => Duration::from_secs(1),
82        };
83
84        Ok(OfficeConvertLambdaConfig {
85            tmp_bucket,
86            function_name,
87            qualifier,
88            tenant_id,
89            retry_attempts,
90            retry_wait,
91        })
92    }
93}
94
95/// Variant of [ConvertToPdf] that uses LibreOffice through a
96/// office-converter server for the conversion
97#[derive(Clone)]
98pub struct OfficeConverterLambda {
99    client: OfficeConvertLambda,
100    storage: StorageLayer,
101}
102
103#[derive(Debug, Error)]
104pub enum OfficeConvertLambdaError {
105    /// Error on the storage layer
106    #[error(transparent)]
107    Storage(#[from] StorageLayerError),
108
109    /// Error when converting
110    #[error(transparent)]
111    Convert(#[from] Box<ConvertError>),
112}
113
114impl OfficeConverterLambda {
115    pub fn new(client: OfficeConvertLambda, storage: StorageLayer) -> Self {
116        Self { client, storage }
117    }
118
119    pub fn from_config(
120        aws_config: &SdkConfig,
121        storage: &StorageLayerFactory,
122        config: OfficeConvertLambdaConfig,
123    ) -> Result<Self, OfficeConvertLambdaError> {
124        let client = aws_sdk_lambda::Client::new(aws_config);
125        let storage = storage.create_layer(StorageLayerOptions {
126            bucket_name: config.tmp_bucket,
127        });
128
129        Ok(Self {
130            client: OfficeConvertLambda::new(
131                client,
132                OfficeConvertLambdaOptions {
133                    function_name: config.function_name,
134                    qualifier: config.qualifier,
135                    tenant_id: config.tenant_id,
136                    retry_attempts: config.retry_attempts,
137                    retry_wait: config.retry_wait,
138                },
139            ),
140            storage,
141        })
142    }
143}
144
145impl ConvertToPdf for OfficeConverterLambda {
146    async fn convert_to_pdf(&self, file_bytes: Bytes) -> Result<Bytes, PdfConvertError> {
147        let bucket_name = self.storage.bucket_name();
148        let input_key = Uuid::new_v4().simple().to_string();
149        let output_key = Uuid::new_v4().simple().to_string();
150
151        tracing::debug!("uploading file for conversion");
152
153        // Upload the file to S3
154        self.storage
155            .upload_file(
156                &input_key,
157                file_bytes,
158                UploadFileOptions {
159                    content_type: "application/octet-stream".to_string(),
160                    ..Default::default()
161                },
162            )
163            .await
164            .map_err(OfficeConvertLambdaError::Storage)?;
165
166        tracing::debug!("calling conversion lambda");
167
168        let result = self
169            .client
170            .convert(office_convert_lambda_client::ConvertRequest {
171                source_bucket: bucket_name.clone(),
172                source_key: input_key.clone(),
173                dest_bucket: bucket_name.clone(),
174                dest_key: output_key.clone(),
175            })
176            .await;
177
178        tracing::debug!("conversion complete");
179
180        // Delete the input file after completion
181        self.storage
182            .delete_file(&input_key)
183            .await
184            .map_err(OfficeConvertLambdaError::Storage)?;
185
186        match result {
187            Ok(_) => {
188                tracing::debug!("reading converted file");
189
190                // Read the output file back
191                let output_bytes = self
192                    .storage
193                    .get_file(&output_key)
194                    .await
195                    .map_err(OfficeConvertLambdaError::Storage)?
196                    .collect_bytes()
197                    .await
198                    .map_err(OfficeConvertLambdaError::Storage)?;
199
200                // Delete the output file after download
201                self.storage
202                    .delete_file(&output_key)
203                    .await
204                    .map_err(OfficeConvertLambdaError::Storage)?;
205
206                Ok(output_bytes)
207            }
208            Err(error) => {
209                tracing::error!(?error, "failed to convert file");
210                Err(match error {
211                    ConvertError::Lambda(err) if err.reason.as_str() == "FILE_LIKELY_ENCRYPTED" => {
212                        PdfConvertError::EncryptedDocument
213                    }
214                    ConvertError::Lambda(err) if err.reason.as_str() == "FILE_LIKELY_CORRUPTED" => {
215                        PdfConvertError::MalformedDocument
216                    }
217                    err => PdfConvertError::ConversionFailedLambda(
218                        OfficeConvertLambdaError::Convert(Box::new(err)),
219                    ),
220                })
221            }
222        }
223    }
224
225    fn is_convertable(&self, mime: &mime::Mime) -> bool {
226        is_known_libreoffice_pdf_convertable(mime)
227    }
228}