parser_web/routes/
parse.rs

1//! Routes for parsing documents.
2
3use crate::errors::ApiError;
4use actix_multipart::Multipart;
5use actix_web::{body::BoxBody, post, HttpRequest, HttpResponse, Responder};
6use futures_util::TryStreamExt;
7use parser_core::{parse, ParserError};
8use rayon::prelude::*;
9use serde::{Deserialize, Serialize};
10
11/// Response type for parsed texts
12#[derive(Serialize, Deserialize)]
13struct ParseResponse {
14    /// Parsed text from the documents
15    texts: Vec<String>,
16}
17
18impl Responder for ParseResponse {
19    type Body = BoxBody;
20
21    fn respond_to(self, _req: &HttpRequest) -> HttpResponse<Self::Body> {
22        HttpResponse::Ok().json(self)
23    }
24}
25
26/// Parses various document formats into plain text.
27#[post("/parse")]
28async fn parse_file(mut payload: Multipart) -> Result<ParseResponse, ApiError> {
29    let mut files = Vec::new();
30
31    // Process each field in the multipart payload
32    while let Some(mut field) = payload.try_next().await? {
33        // Buffer to store the file data
34        let mut buffer = Vec::new();
35
36        // Stream chunks directly into buffer
37        while let Some(chunk) = field.try_next().await? {
38            buffer.extend_from_slice(&chunk);
39        }
40
41        // Only add non-empty files
42        if !buffer.is_empty() {
43            files.push(buffer);
44        }
45    }
46
47    if files.is_empty() {
48        return Err(ApiError::BadRequest("No files provided".to_string()));
49    }
50
51    // Process files in parallel
52    let parsed_text = files
53        .par_iter()
54        .map(|data| parse(data))
55        .collect::<Result<Vec<String>, ParserError>>();
56
57    Ok(ParseResponse {
58        texts: parsed_text?,
59    })
60}