parser_core/
parsers.rs

1//! Parsing module for various file formats.
2//!
3//! This module serves as the central entry point for all parsing functions,
4//! providing a unified interface for different file formats like PDF, CSV, etc.
5//! Each specific parser is implemented in its own submodule.
6
7mod docx;
8mod image;
9mod pdf;
10mod pptx;
11mod text;
12mod xlsx;
13
14use self::{
15    docx::parse_docx, image::parse_image, pdf::parse_pdf, pptx::parse_pptx, text::parse_text,
16    xlsx::parse_xlsx,
17};
18
19use crate::{
20    constants::{APPLICATION_DOCX, APPLICATION_PDF, APPLICATION_PPTX, APPLICATION_XLSX},
21    errors::ParserError,
22};
23use infer::Infer;
24use lazy_static::lazy_static;
25use mime::{Mime, IMAGE, TEXT, TEXT_PLAIN};
26use std::str;
27
// Shared `Infer` instance used by `determine_mime_type` for signature-based
// detection. `lazy_static!` initializes it on first access, so the matcher is
// built once instead of on every call.
lazy_static! {
    static ref INFER: Infer = Infer::new();
}
32
33/// Parses the given data into plain text.
34///
35/// This function is the main entry point for the parser library. It automatically
36/// detects the file type from the provided byte data and delegates the parsing
37/// to the appropriate specialized parser.
38///
39/// # Arguments
40///
41/// * `data` - A byte slice containing the file data to be parsed
42///
43/// # Returns
44///
45/// * `Ok(String)` - The extracted text content from the file
46/// * `Err(ParserError)` - If the file type is unsupported, unrecognized, or an error occurs during parsing
47///
48/// # Examples
49///
50/// ```
51/// # use parser_core::parse;
52/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
53/// # let data = Vec::new(); // In a real example, this would be file data
54/// // Attempt to parse the data
55/// match parse(&data) {
56///     Ok(text) => println!("Parsed text: {}", text),
57///     Err(err) => println!("Failed to parse: {}", err),
58/// }
59/// # Ok(())
60/// # }
61/// ```
62///
63/// # Text file example
64///
65/// ```
66/// use parser_core::parse;
67///
68/// // Create a simple text file content
69/// let text_data = b"Hello, world! This is a sample text file.";
70///
71/// // Parse the text data
72/// let result = parse(text_data).expect("Failed to parse text data");
73///
74/// // Verify the result
75/// assert_eq!(result, "Hello, world! This is a sample text file.");
76/// ```
77pub fn parse(data: &[u8]) -> Result<String, ParserError> {
78    match determine_mime_type(data) {
79        Some(mime) if mime == APPLICATION_PDF => parse_pdf(data),
80        Some(mime) if mime == APPLICATION_DOCX => parse_docx(data),
81        Some(mime) if mime == APPLICATION_XLSX => parse_xlsx(data),
82        Some(mime) if mime == APPLICATION_PPTX => parse_pptx(data),
83        Some(mime) if mime.type_() == TEXT => parse_text(data),
84        Some(mime) if mime.type_() == IMAGE => parse_image(data),
85        Some(mime) => Err(ParserError::InvalidFormat(format!(
86            "Unsupported file type: {}",
87            mime
88        ))),
89        None => Err(ParserError::InvalidFormat(
90            "Could not determine file type.".to_string(),
91        )),
92    }
93}
94
95/// Determines the MIME type of data from its binary content.
96///
97/// This function uses file signatures (magic bytes) to detect the type of the data
98/// and as a fallback, checks if the data is valid UTF-8 text.
99///
100/// # Arguments
101///
102/// * `data` - A byte slice containing the file data to be analyzed
103///
104/// # Returns
105///
106/// * `Some(Mime)` - The detected MIME type of the data
107/// * `None` - If the data type could not be determined
108///
109/// # Implementation Details
110///
111/// - First tries to identify the file type based on its binary signature
112/// - As a fallback, checks if the content is valid UTF-8 text
113/// - Uses a static infer instance to improve performance
114fn determine_mime_type(data: &[u8]) -> Option<Mime> {
115    // Use the static infer instance
116    // Try to detect using file signatures
117    if let Some(kind) = INFER.get(data) {
118        if let Ok(mime) = kind.mime_type().parse() {
119            return Some(mime);
120        }
121    }
122
123    // Finally, check if it could be plain text (if it's UTF-8 decodable)
124    if str::from_utf8(data).is_ok() {
125        return Some(TEXT_PLAIN);
126    }
127
128    None
129}
130
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `parse` routes signature-less UTF-8 input to the text
    /// parser and returns its content. Format-specific parsing is covered by
    /// the tests of each parser submodule.
    #[test]
    fn parse_success() {
        let text_data = b"Hello, world! This is a sample text file.";

        let result = parse(text_data).expect("plain UTF-8 text should parse");

        assert_eq!(result, "Hello, world! This is a sample text file.");
    }

    /// Reads the named test fixture and asserts the MIME type detected for it.
    ///
    /// * `filename` - Name of the fixture passed to `parser_test_utils::read_test_file`
    /// * `expected_type` - The expected full MIME type, or only its top-level
    ///   type when `check_category` is `true`
    /// * `check_category` - Compare only the top-level type (e.g. `text`,
    ///   `image`) instead of the full MIME type
    fn assert_mime_type_from_data(filename: &str, expected_type: &str, check_category: bool) {
        // Read the file to get its content
        let data = parser_test_utils::read_test_file(filename);

        // Name the fixture in every failure so a broken file is easy to spot.
        let mime = determine_mime_type(&data)
            .unwrap_or_else(|| panic!("no MIME type detected for {}", filename));

        if check_category {
            assert_eq!(
                mime.type_(),
                expected_type,
                "unexpected MIME category for {}",
                filename
            );
        } else {
            assert_eq!(
                mime, expected_type,
                "unexpected MIME type for {}",
                filename
            );
        }
    }

    #[test]
    fn determine_mime_success() {
        // Documents (PDF and Office formats), matched on the exact MIME type
        assert_mime_type_from_data("test_pdf_1.pdf", APPLICATION_PDF, false);
        assert_mime_type_from_data("test_docx_1.docx", APPLICATION_DOCX, false);
        assert_mime_type_from_data("test_xlsx_1.xlsx", APPLICATION_XLSX, false);
        assert_mime_type_from_data("test_pptx_1.pptx", APPLICATION_PPTX, false);

        // Text files, matched on the top-level type only
        assert_mime_type_from_data("test_txt_1.txt", TEXT.into(), true);
        assert_mime_type_from_data("test_csv_1.csv", TEXT.into(), true);
        assert_mime_type_from_data("test_json_1.json", TEXT.into(), true);

        // Images, matched on the top-level type only
        assert_mime_type_from_data("test_png_1.png", IMAGE.into(), true);
        assert_mime_type_from_data("test_jpg_1.jpg", IMAGE.into(), true);
        assert_mime_type_from_data("test_webp_1.webp", IMAGE.into(), true);
    }
}