parser_core/parsers.rs
1//! Parsing module for various file formats.
2//!
3//! This module serves as the central entry point for all parsing functions,
4//! providing a unified interface for different file formats like PDF, CSV, etc.
5//! Each specific parser is implemented in its own submodule.
6
7mod docx;
8mod image;
9mod pdf;
10mod pptx;
11mod text;
12mod xlsx;
13
14use self::{
15 docx::parse_docx, image::parse_image, pdf::parse_pdf, pptx::parse_pptx, text::parse_text,
16 xlsx::parse_xlsx,
17};
18
19use crate::{
20 constants::{APPLICATION_DOCX, APPLICATION_PDF, APPLICATION_PPTX, APPLICATION_XLSX},
21 errors::ParserError,
22};
23use infer::Infer;
24use lazy_static::lazy_static;
25use mime::{Mime, IMAGE, TEXT, TEXT_PLAIN};
26use std::str;
27
// Shared `Infer` instance, initialised lazily on first use, so the detector is
// not rebuilt on every call to `determine_mime_type`.
lazy_static! {
    static ref INFER: Infer = Infer::new();
}
32
33/// Parses the given data into plain text.
34///
35/// This function is the main entry point for the parser library. It automatically
36/// detects the file type from the provided byte data and delegates the parsing
37/// to the appropriate specialized parser.
38///
39/// # Arguments
40///
41/// * `data` - A byte slice containing the file data to be parsed
42///
43/// # Returns
44///
45/// * `Ok(String)` - The extracted text content from the file
46/// * `Err(ParserError)` - If the file type is unsupported, unrecognized, or an error occurs during parsing
47///
48/// # Examples
49///
50/// ```
51/// # use parser_core::parse;
52/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
53/// # let data = Vec::new(); // In a real example, this would be file data
54/// // Attempt to parse the data
55/// match parse(&data) {
56/// Ok(text) => println!("Parsed text: {}", text),
57/// Err(err) => println!("Failed to parse: {}", err),
58/// }
59/// # Ok(())
60/// # }
61/// ```
62///
63/// # Text file example
64///
65/// ```
66/// use parser_core::parse;
67///
68/// // Create a simple text file content
69/// let text_data = b"Hello, world! This is a sample text file.";
70///
71/// // Parse the text data
72/// let result = parse(text_data).expect("Failed to parse text data");
73///
74/// // Verify the result
75/// assert_eq!(result, "Hello, world! This is a sample text file.");
76/// ```
77pub fn parse(data: &[u8]) -> Result<String, ParserError> {
78 match determine_mime_type(data) {
79 Some(mime) if mime == APPLICATION_PDF => parse_pdf(data),
80 Some(mime) if mime == APPLICATION_DOCX => parse_docx(data),
81 Some(mime) if mime == APPLICATION_XLSX => parse_xlsx(data),
82 Some(mime) if mime == APPLICATION_PPTX => parse_pptx(data),
83 Some(mime) if mime.type_() == TEXT => parse_text(data),
84 Some(mime) if mime.type_() == IMAGE => parse_image(data),
85 Some(mime) => Err(ParserError::InvalidFormat(format!(
86 "Unsupported file type: {}",
87 mime
88 ))),
89 None => Err(ParserError::InvalidFormat(
90 "Could not determine file type.".to_string(),
91 )),
92 }
93}
94
95/// Determines the MIME type of data from its binary content.
96///
97/// This function uses file signatures (magic bytes) to detect the type of the data
98/// and as a fallback, checks if the data is valid UTF-8 text.
99///
100/// # Arguments
101///
102/// * `data` - A byte slice containing the file data to be analyzed
103///
104/// # Returns
105///
106/// * `Some(Mime)` - The detected MIME type of the data
107/// * `None` - If the data type could not be determined
108///
109/// # Implementation Details
110///
111/// - First tries to identify the file type based on its binary signature
112/// - As a fallback, checks if the content is valid UTF-8 text
113/// - Uses a static infer instance to improve performance
114fn determine_mime_type(data: &[u8]) -> Option<Mime> {
115 // Use the static infer instance
116 // Try to detect using file signatures
117 if let Some(kind) = INFER.get(data) {
118 if let Ok(mime) = kind.mime_type().parse() {
119 return Some(mime);
120 }
121 }
122
123 // Finally, check if it could be plain text (if it's UTF-8 decodable)
124 if str::from_utf8(data).is_ok() {
125 return Some(TEXT_PLAIN);
126 }
127
128 None
129}
130
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test for the `parse` entry point using in-memory plain text.
    ///
    /// Format-specific behaviour is covered by each parser's own tests; this
    /// only checks that detection and dispatch work end to end.
    #[test]
    fn parse_success() {
        let input = b"Hello, world! This is a sample text file.";

        let text = parse(input).expect("plain text input should parse");

        assert_eq!(text, "Hello, world! This is a sample text file.");
    }

    /// Reads the fixture `filename`, detects its MIME type and compares it
    /// with `expected_type`.
    ///
    /// When `check_category` is `true` only the top-level type (e.g. `text`,
    /// `image`) is compared; otherwise the full MIME type must match.
    fn assert_mime_type_from_data(filename: &str, expected_type: &str, check_category: bool) {
        // Read the file to get its content
        let data = parser_test_utils::read_test_file(filename);

        // Name the fixture in every failure so a broken case is easy to find.
        let detected = determine_mime_type(&data)
            .unwrap_or_else(|| panic!("no MIME type detected for {}", filename));

        if check_category {
            assert_eq!(
                detected.type_(),
                expected_type,
                "unexpected MIME category for {}",
                filename
            );
        } else {
            assert_eq!(
                detected, expected_type,
                "unexpected MIME type for {}",
                filename
            );
        }
    }

    #[test]
    fn determine_mime_success() {
        // Office documents: the full MIME type must match.
        let documents: [(&str, &str); 4] = [
            ("test_pdf_1.pdf", APPLICATION_PDF),
            ("test_docx_1.docx", APPLICATION_DOCX),
            ("test_xlsx_1.xlsx", APPLICATION_XLSX),
            ("test_pptx_1.pptx", APPLICATION_PPTX),
        ];
        for &(filename, expected) in documents.iter() {
            assert_mime_type_from_data(filename, expected, false);
        }

        // Text files: only the top-level `text` category is checked.
        let text_files: [&str; 3] = ["test_txt_1.txt", "test_csv_1.csv", "test_json_1.json"];
        for &filename in text_files.iter() {
            assert_mime_type_from_data(filename, TEXT.into(), true);
        }

        // Images: only the top-level `image` category is checked.
        let image_files: [&str; 3] = ["test_png_1.png", "test_jpg_1.jpg", "test_webp_1.webp"];
        for &filename in image_files.iter() {
            assert_mime_type_from_data(filename, IMAGE.into(), true);
        }
    }
}