docbox_processing/office/
convert_lambda.rs1use std::{str::FromStr, time::Duration};
16
17use crate::office::libreoffice::is_known_libreoffice_pdf_convertable;
18
19use super::{ConvertToPdf, PdfConvertError};
20use aws_config::SdkConfig;
21use bytes::Bytes;
22use docbox_database::sqlx::types::Uuid;
23use docbox_storage::{
24 StorageLayer, StorageLayerError, StorageLayerFactory, StorageLayerOptions, UploadFileOptions,
25};
26use office_convert_lambda_client::{ConvertError, OfficeConvertLambda, OfficeConvertLambdaOptions};
27use serde::{Deserialize, Serialize};
28use thiserror::Error;
29
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct OfficeConvertLambdaConfig {
32 pub function_name: String,
34 pub qualifier: Option<String>,
36 pub tenant_id: Option<String>,
38 pub retry_attempts: usize,
40 pub retry_wait: Duration,
42 pub tmp_bucket: String,
44}
45
46#[derive(Debug, Error)]
47pub enum OfficeConvertLambdaConfigError {
48 #[error("missing DOCBOX_CONVERT_LAMBDA_TMP_BUCKET environment variable")]
49 MissingTempBucket,
50 #[error("missing DOCBOX_CONVERT_LAMBDA_FUNCTION_NAME environment variable")]
51 MissingFunctionName,
52 #[error("DOCBOX_CONVERT_LAMBDA_RETRY_ATTEMPTS must be a number")]
53 InvalidRetryAttempts(<usize as FromStr>::Err),
54 #[error("DOCBOX_CONVERT_LAMBDA_RETRY_WAIT must be a number in seconds: {0}")]
55 InvalidRetryWait(<u64 as FromStr>::Err),
56}
57
58impl OfficeConvertLambdaConfig {
59 pub fn from_env() -> Result<OfficeConvertLambdaConfig, OfficeConvertLambdaConfigError> {
60 let tmp_bucket = std::env::var("DOCBOX_CONVERT_LAMBDA_TMP_BUCKET")
61 .map_err(|_| OfficeConvertLambdaConfigError::MissingTempBucket)?;
62
63 let function_name = std::env::var("DOCBOX_CONVERT_LAMBDA_FUNCTION_NAME")
64 .map_err(|_| OfficeConvertLambdaConfigError::MissingFunctionName)?;
65
66 let qualifier = std::env::var("DOCBOX_CONVERT_LAMBDA_QUALIFIER").ok();
67 let tenant_id = std::env::var("DOCBOX_CONVERT_LAMBDA_TENANT_ID").ok();
68
69 let retry_attempts = match std::env::var("DOCBOX_CONVERT_LAMBDA_RETRY_ATTEMPTS") {
70 Ok(retry_attempts) => retry_attempts
71 .parse::<usize>()
72 .map_err(OfficeConvertLambdaConfigError::InvalidRetryAttempts)?,
73 Err(_) => 3,
74 };
75
76 let retry_wait = match std::env::var("DOCBOX_CONVERT_LAMBDA_RETRY_WAIT") {
77 Ok(retry_wait) => retry_wait
78 .parse::<u64>()
79 .map_err(OfficeConvertLambdaConfigError::InvalidRetryWait)
80 .map(Duration::from_secs)?,
81 Err(_) => Duration::from_secs(1),
82 };
83
84 Ok(OfficeConvertLambdaConfig {
85 tmp_bucket,
86 function_name,
87 qualifier,
88 tenant_id,
89 retry_attempts,
90 retry_wait,
91 })
92 }
93}
94
95#[derive(Clone)]
98pub struct OfficeConverterLambda {
99 client: OfficeConvertLambda,
100 storage: StorageLayer,
101}
102
103#[derive(Debug, Error)]
104pub enum OfficeConvertLambdaError {
105 #[error(transparent)]
107 Storage(#[from] StorageLayerError),
108
109 #[error(transparent)]
111 Convert(#[from] Box<ConvertError>),
112}
113
114impl OfficeConverterLambda {
115 pub fn new(client: OfficeConvertLambda, storage: StorageLayer) -> Self {
116 Self { client, storage }
117 }
118
119 pub fn from_config(
120 aws_config: &SdkConfig,
121 storage: &StorageLayerFactory,
122 config: OfficeConvertLambdaConfig,
123 ) -> Result<Self, OfficeConvertLambdaError> {
124 let client = aws_sdk_lambda::Client::new(aws_config);
125 let storage = storage.create_layer(StorageLayerOptions {
126 bucket_name: config.tmp_bucket,
127 });
128
129 Ok(Self {
130 client: OfficeConvertLambda::new(
131 client,
132 OfficeConvertLambdaOptions {
133 function_name: config.function_name,
134 qualifier: config.qualifier,
135 tenant_id: config.tenant_id,
136 retry_attempts: config.retry_attempts,
137 retry_wait: config.retry_wait,
138 },
139 ),
140 storage,
141 })
142 }
143}
144
145impl ConvertToPdf for OfficeConverterLambda {
146 async fn convert_to_pdf(&self, file_bytes: Bytes) -> Result<Bytes, PdfConvertError> {
147 let bucket_name = self.storage.bucket_name();
148 let input_key = Uuid::new_v4().simple().to_string();
149 let output_key = Uuid::new_v4().simple().to_string();
150
151 tracing::debug!("uploading file for conversion");
152
153 self.storage
155 .upload_file(
156 &input_key,
157 file_bytes,
158 UploadFileOptions {
159 content_type: "application/octet-stream".to_string(),
160 ..Default::default()
161 },
162 )
163 .await
164 .map_err(OfficeConvertLambdaError::Storage)?;
165
166 tracing::debug!("calling conversion lambda");
167
168 let result = self
169 .client
170 .convert(office_convert_lambda_client::ConvertRequest {
171 source_bucket: bucket_name.clone(),
172 source_key: input_key.clone(),
173 dest_bucket: bucket_name.clone(),
174 dest_key: output_key.clone(),
175 })
176 .await;
177
178 tracing::debug!("conversion complete");
179
180 self.storage
182 .delete_file(&input_key)
183 .await
184 .map_err(OfficeConvertLambdaError::Storage)?;
185
186 match result {
187 Ok(_) => {
188 tracing::debug!("reading converted file");
189
190 let output_bytes = self
192 .storage
193 .get_file(&output_key)
194 .await
195 .map_err(OfficeConvertLambdaError::Storage)?
196 .collect_bytes()
197 .await
198 .map_err(OfficeConvertLambdaError::Storage)?;
199
200 self.storage
202 .delete_file(&output_key)
203 .await
204 .map_err(OfficeConvertLambdaError::Storage)?;
205
206 Ok(output_bytes)
207 }
208 Err(error) => {
209 tracing::error!(?error, "failed to convert file");
210 Err(match error {
211 ConvertError::Lambda(err) if err.reason.as_str() == "FILE_LIKELY_ENCRYPTED" => {
212 PdfConvertError::EncryptedDocument
213 }
214 ConvertError::Lambda(err) if err.reason.as_str() == "FILE_LIKELY_CORRUPTED" => {
215 PdfConvertError::MalformedDocument
216 }
217 err => PdfConvertError::ConversionFailedLambda(
218 OfficeConvertLambdaError::Convert(Box::new(err)),
219 ),
220 })
221 }
222 }
223 }
224
225 fn is_convertable(&self, mime: &mime::Mime) -> bool {
226 is_known_libreoffice_pdf_convertable(mime)
227 }
228}