docbox_processing/office/
convert_lambda.rs1use std::{str::FromStr, time::Duration};
16
17use crate::office::libreoffice::is_known_libreoffice_pdf_convertable;
18
19use super::{ConvertToPdf, PdfConvertError};
20use aws_config::SdkConfig;
21use bytes::Bytes;
22use docbox_database::sqlx::types::Uuid;
23use docbox_storage::{StorageLayer, StorageLayerError, StorageLayerFactory, StorageLayerOptions};
24use office_convert_lambda_client::{ConvertError, OfficeConvertLambda, OfficeConvertLambdaOptions};
25use serde::{Deserialize, Serialize};
26use thiserror::Error;
27
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct OfficeConvertLambdaConfig {
30 pub function_name: String,
32 pub qualifier: Option<String>,
34 pub tenant_id: Option<String>,
36 pub retry_attempts: usize,
38 pub retry_wait: Duration,
40 pub tmp_bucket: String,
42}
43
44#[derive(Debug, Error)]
45pub enum OfficeConvertLambdaConfigError {
46 #[error("missing DOCBOX_CONVERT_LAMBDA_TMP_BUCKET environment variable")]
47 MissingTempBucket,
48 #[error("missing DOCBOX_CONVERT_LAMBDA_FUNCTION_NAME environment variable")]
49 MissingFunctionName,
50 #[error("DOCBOX_CONVERT_LAMBDA_RETRY_ATTEMPTS must be a number")]
51 InvalidRetryAttempts(<usize as FromStr>::Err),
52 #[error("DOCBOX_CONVERT_LAMBDA_RETRY_WAIT must be a number in seconds: {0}")]
53 InvalidRetryWait(<u64 as FromStr>::Err),
54}
55
56impl OfficeConvertLambdaConfig {
57 pub fn from_env() -> Result<OfficeConvertLambdaConfig, OfficeConvertLambdaConfigError> {
58 let tmp_bucket = std::env::var("DOCBOX_CONVERT_LAMBDA_TMP_BUCKET")
59 .map_err(|_| OfficeConvertLambdaConfigError::MissingTempBucket)?;
60
61 let function_name = std::env::var("DOCBOX_CONVERT_LAMBDA_FUNCTION_NAME")
62 .map_err(|_| OfficeConvertLambdaConfigError::MissingFunctionName)?;
63
64 let qualifier = std::env::var("DOCBOX_CONVERT_LAMBDA_QUALIFIER").ok();
65 let tenant_id = std::env::var("DOCBOX_CONVERT_LAMBDA_TENANT_ID").ok();
66
67 let retry_attempts = match std::env::var("DOCBOX_CONVERT_LAMBDA_RETRY_ATTEMPTS") {
68 Ok(retry_attempts) => retry_attempts
69 .parse::<usize>()
70 .map_err(OfficeConvertLambdaConfigError::InvalidRetryAttempts)?,
71 Err(_) => 3,
72 };
73
74 let retry_wait = match std::env::var("DOCBOX_CONVERT_LAMBDA_RETRY_WAIT") {
75 Ok(retry_wait) => retry_wait
76 .parse::<u64>()
77 .map_err(OfficeConvertLambdaConfigError::InvalidRetryWait)
78 .map(Duration::from_secs)?,
79 Err(_) => Duration::from_secs(1),
80 };
81
82 Ok(OfficeConvertLambdaConfig {
83 tmp_bucket,
84 function_name,
85 qualifier,
86 tenant_id,
87 retry_attempts,
88 retry_wait,
89 })
90 }
91}
92
93#[derive(Clone)]
96pub struct OfficeConverterLambda {
97 client: OfficeConvertLambda,
98 storage: StorageLayer,
99}
100
101#[derive(Debug, Error)]
102pub enum OfficeConvertLambdaError {
103 #[error(transparent)]
105 Storage(#[from] StorageLayerError),
106
107 #[error(transparent)]
109 Convert(#[from] Box<ConvertError>),
110}
111
112impl OfficeConverterLambda {
113 pub fn new(client: OfficeConvertLambda, storage: StorageLayer) -> Self {
114 Self { client, storage }
115 }
116
117 pub fn from_config(
118 aws_config: &SdkConfig,
119 storage: &StorageLayerFactory,
120 config: OfficeConvertLambdaConfig,
121 ) -> Result<Self, OfficeConvertLambdaError> {
122 let client = aws_sdk_lambda::Client::new(aws_config);
123 let storage = storage.create_layer(StorageLayerOptions {
124 bucket_name: config.tmp_bucket,
125 });
126
127 Ok(Self {
128 client: OfficeConvertLambda::new(
129 client,
130 OfficeConvertLambdaOptions {
131 function_name: config.function_name,
132 qualifier: config.qualifier,
133 tenant_id: config.tenant_id,
134 retry_attempts: config.retry_attempts,
135 retry_wait: config.retry_wait,
136 },
137 ),
138 storage,
139 })
140 }
141}
142
143impl ConvertToPdf for OfficeConverterLambda {
144 async fn convert_to_pdf(&self, file_bytes: Bytes) -> Result<Bytes, PdfConvertError> {
145 let bucket_name = self.storage.bucket_name();
146 let input_key = Uuid::new_v4().simple().to_string();
147 let output_key = Uuid::new_v4().simple().to_string();
148
149 tracing::debug!("uploading file for conversion");
150
151 self.storage
153 .upload_file(
154 &input_key,
155 "application/octet-stream".to_string(),
156 file_bytes,
157 )
158 .await
159 .map_err(OfficeConvertLambdaError::Storage)?;
160
161 tracing::debug!("calling conversion lambda");
162
163 let result = self
164 .client
165 .convert(office_convert_lambda_client::ConvertRequest {
166 source_bucket: bucket_name.clone(),
167 source_key: input_key.clone(),
168 dest_bucket: bucket_name.clone(),
169 dest_key: output_key.clone(),
170 })
171 .await;
172
173 tracing::debug!("conversion complete");
174
175 self.storage
177 .delete_file(&input_key)
178 .await
179 .map_err(OfficeConvertLambdaError::Storage)?;
180
181 match result {
182 Ok(_) => {
183 tracing::debug!("reading converted file");
184
185 let output_bytes = self
187 .storage
188 .get_file(&output_key)
189 .await
190 .map_err(OfficeConvertLambdaError::Storage)?
191 .collect_bytes()
192 .await
193 .map_err(OfficeConvertLambdaError::Storage)?;
194
195 self.storage
197 .delete_file(&output_key)
198 .await
199 .map_err(OfficeConvertLambdaError::Storage)?;
200
201 Ok(output_bytes)
202 }
203 Err(error) => {
204 tracing::error!(?error, "failed to convert file");
205 Err(match error {
206 ConvertError::Lambda(err) if err.reason.as_str() == "FILE_LIKELY_ENCRYPTED" => {
207 PdfConvertError::EncryptedDocument
208 }
209 ConvertError::Lambda(err) if err.reason.as_str() == "FILE_LIKELY_CORRUPTED" => {
210 PdfConvertError::MalformedDocument
211 }
212 err => PdfConvertError::ConversionFailedLambda(
213 OfficeConvertLambdaError::Convert(Box::new(err)),
214 ),
215 })
216 }
217 }
218 }
219
220 fn is_convertable(&self, mime: &mime::Mime) -> bool {
221 is_known_libreoffice_pdf_convertable(mime)
222 }
223}