Skip to main content

docbox_processing/office/
convert_lambda.rs

//! # Convert Lambda
//!
//! Office file conversion backend that uses the Lambda based file conversion
//! server <https://github.com/jacobtread/office-convert-lambda>
//!
//! ## Environment Variables
//!
//! * `DOCBOX_CONVERT_LAMBDA_TMP_BUCKET` - S3 bucket to store the temporary input and output files from conversion
//! * `DOCBOX_CONVERT_LAMBDA_FUNCTION_NAME` - The name or ARN of the Lambda function, version, or alias.
//! * `DOCBOX_CONVERT_LAMBDA_QUALIFIER` - Optionally specify a version or alias to invoke a published version of the function.
//! * `DOCBOX_CONVERT_LAMBDA_TENANT_ID` - Optional identifier of the tenant in a multi-tenant Lambda function.
//! * `DOCBOX_CONVERT_LAMBDA_RETRY_ATTEMPTS` - Maximum number of times to retry on unexpected failures (defaults to `3`)
//! * `DOCBOX_CONVERT_LAMBDA_RETRY_WAIT` - Delay in seconds to wait between each retry attempt (defaults to `1`)
14
15use std::{str::FromStr, time::Duration};
16
17use crate::office::libreoffice::is_known_libreoffice_pdf_convertable;
18
19use super::{ConvertToPdf, PdfConvertError};
20use aws_config::SdkConfig;
21use bytes::Bytes;
22use docbox_database::sqlx::types::Uuid;
23use docbox_storage::{StorageLayer, StorageLayerError, StorageLayerFactory, StorageLayerOptions};
24use office_convert_lambda_client::{ConvertError, OfficeConvertLambda, OfficeConvertLambdaOptions};
25use serde::{Deserialize, Serialize};
26use thiserror::Error;
27
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct OfficeConvertLambdaConfig {
30    /// The name or ARN of the Lambda function, version, or alias.
31    pub function_name: String,
32    /// Specify a version or alias to invoke a published version of the function.
33    pub qualifier: Option<String>,
34    /// The identifier of the tenant in a multi-tenant Lambda function.
35    pub tenant_id: Option<String>,
36    /// Number of retry attempts to perform
37    pub retry_attempts: usize,
38    /// Time to wait between retry attempts
39    pub retry_wait: Duration,
40    /// Temporary bucket to use for lambda input and output files
41    pub tmp_bucket: String,
42}
43
44#[derive(Debug, Error)]
45pub enum OfficeConvertLambdaConfigError {
46    #[error("missing DOCBOX_CONVERT_LAMBDA_TMP_BUCKET environment variable")]
47    MissingTempBucket,
48    #[error("missing DOCBOX_CONVERT_LAMBDA_FUNCTION_NAME environment variable")]
49    MissingFunctionName,
50    #[error("DOCBOX_CONVERT_LAMBDA_RETRY_ATTEMPTS must be a number")]
51    InvalidRetryAttempts(<usize as FromStr>::Err),
52    #[error("DOCBOX_CONVERT_LAMBDA_RETRY_WAIT must be a number in seconds: {0}")]
53    InvalidRetryWait(<u64 as FromStr>::Err),
54}
55
56impl OfficeConvertLambdaConfig {
57    pub fn from_env() -> Result<OfficeConvertLambdaConfig, OfficeConvertLambdaConfigError> {
58        let tmp_bucket = std::env::var("DOCBOX_CONVERT_LAMBDA_TMP_BUCKET")
59            .map_err(|_| OfficeConvertLambdaConfigError::MissingTempBucket)?;
60
61        let function_name = std::env::var("DOCBOX_CONVERT_LAMBDA_FUNCTION_NAME")
62            .map_err(|_| OfficeConvertLambdaConfigError::MissingFunctionName)?;
63
64        let qualifier = std::env::var("DOCBOX_CONVERT_LAMBDA_QUALIFIER").ok();
65        let tenant_id = std::env::var("DOCBOX_CONVERT_LAMBDA_TENANT_ID").ok();
66
67        let retry_attempts = match std::env::var("DOCBOX_CONVERT_LAMBDA_RETRY_ATTEMPTS") {
68            Ok(retry_attempts) => retry_attempts
69                .parse::<usize>()
70                .map_err(OfficeConvertLambdaConfigError::InvalidRetryAttempts)?,
71            Err(_) => 3,
72        };
73
74        let retry_wait = match std::env::var("DOCBOX_CONVERT_LAMBDA_RETRY_WAIT") {
75            Ok(retry_wait) => retry_wait
76                .parse::<u64>()
77                .map_err(OfficeConvertLambdaConfigError::InvalidRetryWait)
78                .map(Duration::from_secs)?,
79            Err(_) => Duration::from_secs(1),
80        };
81
82        Ok(OfficeConvertLambdaConfig {
83            tmp_bucket,
84            function_name,
85            qualifier,
86            tenant_id,
87            retry_attempts,
88            retry_wait,
89        })
90    }
91}
92
93/// Variant of [ConvertToPdf] that uses LibreOffice through a
94/// office-converter server for the conversion
95#[derive(Clone)]
96pub struct OfficeConverterLambda {
97    client: OfficeConvertLambda,
98    storage: StorageLayer,
99}
100
101#[derive(Debug, Error)]
102pub enum OfficeConvertLambdaError {
103    /// Error on the storage layer
104    #[error(transparent)]
105    Storage(#[from] StorageLayerError),
106
107    /// Error when converting
108    #[error(transparent)]
109    Convert(#[from] Box<ConvertError>),
110}
111
112impl OfficeConverterLambda {
113    pub fn new(client: OfficeConvertLambda, storage: StorageLayer) -> Self {
114        Self { client, storage }
115    }
116
117    pub fn from_config(
118        aws_config: &SdkConfig,
119        storage: &StorageLayerFactory,
120        config: OfficeConvertLambdaConfig,
121    ) -> Result<Self, OfficeConvertLambdaError> {
122        let client = aws_sdk_lambda::Client::new(aws_config);
123        let storage = storage.create_layer(StorageLayerOptions {
124            bucket_name: config.tmp_bucket,
125        });
126
127        Ok(Self {
128            client: OfficeConvertLambda::new(
129                client,
130                OfficeConvertLambdaOptions {
131                    function_name: config.function_name,
132                    qualifier: config.qualifier,
133                    tenant_id: config.tenant_id,
134                    retry_attempts: config.retry_attempts,
135                    retry_wait: config.retry_wait,
136                },
137            ),
138            storage,
139        })
140    }
141}
142
143impl ConvertToPdf for OfficeConverterLambda {
144    async fn convert_to_pdf(&self, file_bytes: Bytes) -> Result<Bytes, PdfConvertError> {
145        let bucket_name = self.storage.bucket_name();
146        let input_key = Uuid::new_v4().simple().to_string();
147        let output_key = Uuid::new_v4().simple().to_string();
148
149        tracing::debug!("uploading file for conversion");
150
151        // Upload the file to S3
152        self.storage
153            .upload_file(
154                &input_key,
155                "application/octet-stream".to_string(),
156                file_bytes,
157            )
158            .await
159            .map_err(OfficeConvertLambdaError::Storage)?;
160
161        tracing::debug!("calling conversion lambda");
162
163        let result = self
164            .client
165            .convert(office_convert_lambda_client::ConvertRequest {
166                source_bucket: bucket_name.clone(),
167                source_key: input_key.clone(),
168                dest_bucket: bucket_name.clone(),
169                dest_key: output_key.clone(),
170            })
171            .await;
172
173        tracing::debug!("conversion complete");
174
175        // Delete the input file after completion
176        self.storage
177            .delete_file(&input_key)
178            .await
179            .map_err(OfficeConvertLambdaError::Storage)?;
180
181        match result {
182            Ok(_) => {
183                tracing::debug!("reading converted file");
184
185                // Read the output file back
186                let output_bytes = self
187                    .storage
188                    .get_file(&output_key)
189                    .await
190                    .map_err(OfficeConvertLambdaError::Storage)?
191                    .collect_bytes()
192                    .await
193                    .map_err(OfficeConvertLambdaError::Storage)?;
194
195                // Delete the output file after download
196                self.storage
197                    .delete_file(&output_key)
198                    .await
199                    .map_err(OfficeConvertLambdaError::Storage)?;
200
201                Ok(output_bytes)
202            }
203            Err(error) => {
204                tracing::error!(?error, "failed to convert file");
205                Err(match error {
206                    ConvertError::Lambda(err) if err.reason.as_str() == "FILE_LIKELY_ENCRYPTED" => {
207                        PdfConvertError::EncryptedDocument
208                    }
209                    ConvertError::Lambda(err) if err.reason.as_str() == "FILE_LIKELY_CORRUPTED" => {
210                        PdfConvertError::MalformedDocument
211                    }
212                    err => PdfConvertError::ConversionFailedLambda(
213                        OfficeConvertLambdaError::Convert(Box::new(err)),
214                    ),
215                })
216            }
217        }
218    }
219
220    fn is_convertable(&self, mime: &mime::Mime) -> bool {
221        is_known_libreoffice_pdf_convertable(mime)
222    }
223}