use crate::core::{Content, Error, ExtractionResult, Metadata, MetadataValue, Result};
use crate::parsers::Parser;
use crate::utils::security::{validate_file_size, FileSizeLimits};
/// Stateless parser for Microsoft Word DOC files (`application/msword`).
///
/// All behavior lives in the `Parser` implementation for this type; the
/// struct itself carries no data.
pub struct DocParser;
impl Parser for DocParser {
fn name(&self) -> &str {
"DocParser"
}
fn supported_types(&self) -> &[&str] {
&[
"application/msword",
"application/doc",
]
}
fn parse(&self, data: &[u8], mime_type: &str) -> Result<ExtractionResult> {
validate_file_size(data, FileSizeLimits::DOC, "DOC")?;
if !is_ole2_file(data) {
return Err(Error::ParseError(
"Invalid DOC file: missing OLE2 header".to_string(),
));
}
let content_text = extract_text(data)?;
let metadata = extract_metadata(data)?;
Ok(ExtractionResult {
mime_type: mime_type.to_string(),
content: Content::Text(content_text),
metadata,
detection_confidence: 0.90,
})
}
}
/// Reports whether `data` begins with the 8-byte OLE2 signature checked by
/// this parser.
///
/// Inputs shorter than eight bytes are never considered OLE2 files.
fn is_ole2_file(data: &[u8]) -> bool {
    const OLE2_SIGNATURE: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];
    data.starts_with(&OLE2_SIGNATURE)
}
fn extract_text(data: &[u8]) -> Result<String> {
let text = extract_readable_text(data);
if text.trim().is_empty() {
return Err(Error::ParseError(
"Unable to extract text from DOC file. Full DOC parsing requires complex OLE2 structure analysis.".to_string(),
));
}
Ok(text)
}
fn extract_readable_text(data: &[u8]) -> String {
let mut text = String::new();
let mut current_word = Vec::new();
for &byte in data {
if (byte >= 32 && byte <= 126) || byte == b'\n' || byte == b'\r' || byte == b'\t' {
current_word.push(byte);
} else if byte == 0 && !current_word.is_empty() {
if current_word.len() >= 3 {
if let Ok(s) = String::from_utf8(current_word.clone()) {
if is_likely_text(&s) {
text.push_str(&s);
text.push(' ');
}
}
}
current_word.clear();
} else if !current_word.is_empty() {
if current_word.len() >= 3 {
if let Ok(s) = String::from_utf8(current_word.clone()) {
if is_likely_text(&s) {
text.push_str(&s);
text.push(' ');
}
}
}
current_word.clear();
}
}
if current_word.len() >= 3 {
if let Ok(s) = String::from_utf8(current_word) {
if is_likely_text(&s) {
text.push_str(&s);
}
}
}
text.split_whitespace()
.collect::<Vec<_>>()
.join(" ")
}
/// Heuristic check that `s` looks like natural-language text.
///
/// Strings shorter than three bytes are rejected outright; otherwise at
/// least half of the characters must be alphabetic.
fn is_likely_text(s: &str) -> bool {
    if s.len() < 3 {
        return false;
    }

    // Count letters and total characters in a single pass.
    let (letters, total) = s.chars().fold((0usize, 0usize), |(letters, total), ch| {
        (letters + usize::from(ch.is_alphabetic()), total + 1)
    });

    letters as f32 / total as f32 >= 0.5
}
/// Builds the metadata map for a DOC file.
///
/// The map always contains a `parser_note` entry describing the limited level
/// of DOC support. Any properties found by `extract_ole2_metadata` are added
/// on top; if that lookup fails, its error is deliberately ignored and only
/// the note is returned.
fn extract_metadata(data: &[u8]) -> Result<Metadata> {
    let note = "Basic DOC support - full parsing requires OLE2 structure analysis";

    let mut metadata = Metadata::new();
    metadata.insert("parser_note".to_string(), MetadataValue::Text(note.to_string()));

    // Best effort: a failed property scan contributes no entries.
    for (key, value) in extract_ole2_metadata(data).unwrap_or_default() {
        metadata.insert(key, value);
    }

    Ok(metadata)
}
/// Scans `data` for the `Title`, `Author` and `Subject` markers and returns
/// the values found, keyed as `title`, `author` and `subject`, in that order.
///
/// A value is kept only when it is non-empty and shorter than 200 bytes.
/// Properties that cannot be located are omitted; this function itself
/// always returns `Ok`.
fn extract_ole2_metadata(data: &[u8]) -> Result<Vec<(String, MetadataValue)>> {
    /// (output key, byte marker searched for in the file)
    const PROPERTIES: [(&str, &[u8]); 3] = [
        ("title", b"Title"),
        ("author", b"Author"),
        ("subject", b"Subject"),
    ];

    let found = PROPERTIES
        .iter()
        .filter_map(|&(key, marker)| {
            extract_property_string(data, marker)
                .filter(|value| !value.is_empty() && value.len() < 200)
                .map(|value| (key.to_string(), MetadataValue::Text(value)))
        })
        .collect();

    Ok(found)
}
/// Looks for `property_name` in `data` and returns the first plausible text
/// value that follows an occurrence of it.
///
/// For each occurrence of the name (in order), up to 100 start offsets after
/// the name are tried. At each offset `extract_null_terminated_string` reads
/// at most 100 bytes, and the first result accepted by `is_likely_text` is
/// returned.
///
/// Every position in `data` is searched, including occurrences near the end
/// of the buffer; the look-ahead is simply clamped to the bytes that remain.
/// (The previous loop bound skipped the final `property_name.len() + 100`
/// bytes, so short buffers were never searched at all.)
///
/// Returns `None` when the name does not occur or no occurrence is followed
/// by an acceptable value.
fn extract_property_string(data: &[u8], property_name: &[u8]) -> Option<String> {
    /// Number of start offsets after the name that are probed for a value.
    const SEARCH_WINDOW: usize = 100;
    /// Maximum number of bytes read for a single candidate value.
    const MAX_VALUE_LEN: usize = 100;

    // If the name is longer than the data it cannot occur.
    let last_start = data.len().checked_sub(property_name.len())?;

    for name_start in 0..=last_start {
        if !data[name_start..].starts_with(property_name) {
            continue;
        }

        let after_name = &data[name_start + property_name.len()..];
        let probes = after_name.len().min(SEARCH_WINDOW);

        // `is_likely_text` already rejects anything shorter than three bytes,
        // so no separate minimum-length check is needed here.
        let value = (0..probes)
            .filter_map(|offset| {
                extract_null_terminated_string(&after_name[offset..], MAX_VALUE_LEN)
            })
            .find(|candidate| is_likely_text(candidate));

        if value.is_some() {
            return value;
        }
    }

    None
}
/// Reads the first run of printable ASCII (bytes 32..=126) from `data`.
///
/// Only the first `max_len` bytes are inspected, and inspection stops at the
/// first NUL byte. Non-printable bytes before the run are skipped; the first
/// non-printable byte after the run has started ends it.
///
/// Returns `None` when the run is shorter than two bytes.
fn extract_null_terminated_string(data: &[u8], max_len: usize) -> Option<String> {
    let is_printable = |byte: u8| (32..=126).contains(&byte);

    let run: Vec<u8> = data
        .iter()
        .copied()
        .take(max_len)
        .take_while(|&byte| byte != 0) // a NUL always terminates the scan
        .skip_while(|&byte| !is_printable(byte)) // leading junk is ignored
        .take_while(|&byte| is_printable(byte)) // the run itself
        .collect();

    if run.len() < 2 {
        return None;
    }
    String::from_utf8(run).ok()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Both declared MIME types must be reported by the parser.
    #[test]
    fn test_supported_types() {
        let types = DocParser.supported_types();
        for expected in ["application/msword", "application/doc"].iter() {
            assert!(types.contains(expected));
        }
    }

    /// The parser identifies itself by its fixed name.
    #[test]
    fn test_parser_name() {
        assert_eq!(DocParser.name(), "DocParser");
    }

    /// The OLE2 signature is accepted; an all-zero header is not.
    #[test]
    fn test_ole2_detection() {
        let signature = [0xD0u8, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];
        assert!(is_ole2_file(&signature));

        let zeros = [0u8; 8];
        assert!(!is_ole2_file(&zeros));
    }

    /// Mostly-alphabetic strings pass; digits, punctuation and very short
    /// strings do not.
    #[test]
    fn test_is_likely_text() {
        for accepted in ["Hello World", "Document"].iter() {
            assert!(is_likely_text(accepted));
        }
        for rejected in ["123", "!!!", "ab"].iter() {
            assert!(!is_likely_text(rejected));
        }
    }
}