use std::io::Read;
/// Extracts plain text from `data`, picking an extractor from the MIME type.
///
/// Any parameters after the first `;` in `mime` (for example `; charset=utf-8`)
/// are ignored and the remaining media type is compared case-insensitively.
/// When the media type is not one of the known types, the extension of
/// `filename` (if any) is tried as a fallback.
///
/// # Errors
///
/// Returns the extractor's error message when a recognised document cannot be
/// processed, or `unsupported document type: <mime>` when neither the media
/// type nor the file name identifies a supported format.
pub fn extract_text(data: &[u8], mime: &str, filename: Option<&str>) -> Result<String, String> {
    let base_mime = mime.split(';').next().unwrap_or(mime).trim();
    // Media type names are case-insensitive, so `Application/PDF` must select
    // the same extractor as `application/pdf`. The caller's spelling is kept
    // for the error message below.
    let normalized = base_mime.to_ascii_lowercase();
    match normalized.as_str() {
        "application/pdf" => extract_pdf(data),
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
            extract_docx(data)
        }
        "application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
            extract_pptx(data)
        }
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => extract_xlsx(data),
        // Legacy binary Office formats: scrape printable strings only.
        "application/msword" | "application/vnd.ms-powerpoint" | "application/vnd.ms-excel" => {
            extract_binary_strings(data)
        }
        "text/plain"
        | "text/csv"
        | "text/tab-separated-values"
        | "text/markdown"
        | "text/html"
        | "text/xml"
        | "text/x-python"
        | "text/x-java"
        | "text/x-c"
        | "text/x-c++"
        | "text/x-rust"
        | "text/x-go"
        | "text/x-ruby"
        | "text/x-shellscript"
        | "text/javascript"
        | "text/css"
        | "text/x-toml"
        | "text/x-yaml"
        | "text/x-log" => extract_utf8(data),
        "application/json" | "application/xml" | "application/x-yaml" | "application/yaml"
        | "application/toml" | "application/x-sh" => extract_utf8(data),
        "application/rtf" | "text/rtf" => extract_rtf(data),
        // Unknown or generic media type: let the file extension decide.
        _ => try_extract_by_extension(data, filename)
            .ok_or_else(|| format!("unsupported document type: {base_mime}")),
    }
}
/// Extracts the text of an in-memory PDF and trims surrounding whitespace.
///
/// # Errors
///
/// Returns `PDF extraction failed: <cause>` when `pdf_extract` reports an error.
fn extract_pdf(data: &[u8]) -> Result<String, String> {
    match pdf_extract::extract_text_from_mem(data) {
        Ok(raw) => Ok(raw.trim().to_string()),
        Err(err) => Err(format!("PDF extraction failed: {err}")),
    }
}
/// Extracts text from a DOCX archive by reading its `word/document.xml` part.
fn extract_docx(data: &[u8]) -> Result<String, String> {
    const MAIN_DOCUMENT_PART: &str = "word/document.xml";
    extract_office_xml(data, MAIN_DOCUMENT_PART)
}
/// Extracts the text of every slide in a PPTX archive.
///
/// Slides are the archive entries named `ppt/slides/slide*.xml`. They are
/// emitted in slide-number order and separated by a `---` line. Entries that
/// cannot be opened or read as UTF-8 are skipped.
///
/// # Errors
///
/// Returns an error when `data` is not a readable ZIP archive or when no slide
/// yields any text.
fn extract_pptx(data: &[u8]) -> Result<String, String> {
    /// Slide number encoded in a part name such as `ppt/slides/slide12.xml`.
    fn slide_number(name: &str) -> Option<u32> {
        name.strip_prefix("ppt/slides/slide")?
            .strip_suffix(".xml")?
            .parse()
            .ok()
    }

    let cursor = std::io::Cursor::new(data);
    let mut archive =
        zip::ZipArchive::new(cursor).map_err(|e| format!("invalid PPTX archive: {e}"))?;

    let mut slide_names: Vec<String> = Vec::new();
    for i in 0..archive.len() {
        if let Ok(file) = archive.by_index(i) {
            let name = file.name();
            if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
                slide_names.push(name.to_string());
            }
        }
    }
    // A plain string sort would put `slide10.xml` before `slide2.xml`, so order
    // by the numeric suffix. Names without a parsable number go last, by name.
    slide_names.sort_by_cached_key(|name| (slide_number(name).unwrap_or(u32::MAX), name.clone()));

    let mut all_text = Vec::new();
    for name in &slide_names {
        if let Ok(mut file) = archive.by_name(name) {
            let mut xml = String::new();
            // Best effort: an unreadable slide is skipped rather than failing the document.
            if file.read_to_string(&mut xml).is_ok() {
                let text = strip_xml_tags(&xml);
                if !text.is_empty() {
                    all_text.push(text);
                }
            }
        }
    }
    if all_text.is_empty() {
        return Err("no text found in PPTX slides".to_string());
    }
    Ok(all_text.join("\n\n---\n\n"))
}
/// Extracts cell text from every worksheet of an XLSX archive.
///
/// The shared string table (`xl/sharedStrings.xml`) is loaded first, if
/// present. Worksheets are the entries named `xl/worksheets/sheet*.xml`. They
/// are rendered in sheet-number order and separated by a blank line. If no
/// worksheet yields text but shared strings exist, the shared strings are
/// returned one per line instead.
///
/// # Errors
///
/// Returns an error when `data` is not a readable ZIP archive, when the shared
/// string table exists but cannot be read, or when no text is found at all.
fn extract_xlsx(data: &[u8]) -> Result<String, String> {
    /// Sheet number encoded in a part name such as `xl/worksheets/sheet12.xml`.
    fn sheet_number(name: &str) -> Option<u32> {
        name.strip_prefix("xl/worksheets/sheet")?
            .strip_suffix(".xml")?
            .parse()
            .ok()
    }

    let cursor = std::io::Cursor::new(data);
    let mut archive =
        zip::ZipArchive::new(cursor).map_err(|e| format!("invalid XLSX archive: {e}"))?;

    let shared_strings = if let Ok(mut file) = archive.by_name("xl/sharedStrings.xml") {
        let mut xml = String::new();
        file.read_to_string(&mut xml)
            .map_err(|e| format!("failed to read shared strings: {e}"))?;
        parse_xlsx_shared_strings(&xml)
    } else {
        // Workbooks without string cells have no shared string part.
        Vec::new()
    };

    let mut sheet_names: Vec<String> = Vec::new();
    for i in 0..archive.len() {
        if let Ok(file) = archive.by_index(i) {
            let name = file.name();
            if name.starts_with("xl/worksheets/sheet") && name.ends_with(".xml") {
                sheet_names.push(name.to_string());
            }
        }
    }
    // A plain string sort would put `sheet10.xml` before `sheet2.xml`, so order
    // by the numeric suffix. Names without a parsable number go last, by name.
    sheet_names.sort_by_cached_key(|name| (sheet_number(name).unwrap_or(u32::MAX), name.clone()));

    let mut all_text = Vec::new();
    for name in &sheet_names {
        if let Ok(mut file) = archive.by_name(name) {
            let mut xml = String::new();
            // Best effort: an unreadable sheet is skipped rather than failing the workbook.
            if file.read_to_string(&mut xml).is_ok() {
                let text = parse_xlsx_sheet(&xml, &shared_strings);
                if !text.is_empty() {
                    all_text.push(text);
                }
            }
        }
    }
    if all_text.is_empty() && !shared_strings.is_empty() {
        return Ok(shared_strings.join("\n"));
    }
    if all_text.is_empty() {
        return Err("no text found in XLSX".to_string());
    }
    Ok(all_text.join("\n\n"))
}
/// Reads one XML part out of an Office Open XML (ZIP) archive and returns its
/// text with the markup stripped.
///
/// `content_path` is the name of the archive entry to read.
///
/// # Errors
///
/// Returns an error when the archive is invalid, the entry is missing or
/// unreadable, or the entry contains no text.
fn extract_office_xml(data: &[u8], content_path: &str) -> Result<String, String> {
    let mut archive = match zip::ZipArchive::new(std::io::Cursor::new(data)) {
        Ok(opened) => opened,
        Err(err) => return Err(format!("invalid Office XML archive: {err}")),
    };

    let mut xml = String::new();
    match archive.by_name(content_path) {
        Ok(mut part) => {
            if let Err(err) = part.read_to_string(&mut xml) {
                return Err(format!("failed to read content: {err}"));
            }
        }
        Err(err) => return Err(format!("content file not found in archive: {err}")),
    }

    match strip_xml_tags(&xml) {
        stripped if stripped.is_empty() => Err("no text content found".to_string()),
        stripped => Ok(stripped),
    }
}
/// Decodes `data` as UTF-8 text.
///
/// Invalid byte sequences are replaced with U+FFFD rather than rejected, so
/// this never returns `Err`.
fn extract_utf8(data: &[u8]) -> Result<String, String> {
    // `from_utf8_lossy` borrows valid input and only substitutes on invalid
    // sequences, so both cases are covered by one call.
    let decoded = String::from_utf8_lossy(data);
    Ok(decoded.into_owned())
}
/// Extracts the visible text of an RTF document with a lightweight scanner.
///
/// Group nesting is tracked via `{` / `}` and only text at depth 0 or 1 is
/// kept, which drops whatever sits in nested groups. Control words are
/// discarded except `\par` / `\line` (newline) and `\tab` (tab). The escaped
/// symbols `\\`, `\{` and `\}` produce the literal character, and `\'hh`
/// produces the character whose code point is the hex byte `hh`.
///
/// # Errors
///
/// Returns `no text found in RTF` when nothing but whitespace remains.
fn extract_rtf(data: &[u8]) -> Result<String, String> {
    let text = String::from_utf8_lossy(data);
    let mut result = String::new();
    let mut depth = 0i32;
    let mut chars = text.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            '{' => depth += 1,
            '}' => depth = (depth - 1).max(0),
            '\\' => match chars.peek().copied() {
                // Escaped literal. It must not be mistaken for a group
                // delimiter or for the start of another control word.
                Some(symbol @ ('\\' | '{' | '}')) => {
                    chars.next();
                    if depth <= 1 {
                        result.push(symbol);
                    }
                }
                // `\'hh`: one byte given as two hex digits.
                Some('\'') => {
                    chars.next();
                    let mut code = 0u32;
                    let mut digits = 0;
                    while digits < 2 {
                        match chars.peek().and_then(|c| c.to_digit(16)) {
                            Some(digit) => {
                                chars.next();
                                code = code * 16 + digit;
                                digits += 1;
                            }
                            None => break,
                        }
                    }
                    // NOTE(review): the byte is mapped straight to the code
                    // point of the same value (Latin-1); the document's
                    // declared code page is not consulted.
                    if digits == 2 && depth <= 1 {
                        if let Some(decoded) = char::from_u32(code) {
                            result.push(decoded);
                        }
                    }
                }
                _ => {
                    // Control word: letters, then an optional numeric
                    // parameter, then at most one delimiting space.
                    let mut word = String::new();
                    while let Some(&next) = chars.peek() {
                        if next.is_ascii_alphabetic() {
                            chars.next();
                            word.push(next);
                        } else {
                            break;
                        }
                    }
                    while let Some(&next) = chars.peek() {
                        if next.is_ascii_digit() || next == '-' {
                            chars.next();
                        } else {
                            break;
                        }
                    }
                    if let Some(&' ') = chars.peek() {
                        chars.next();
                    }
                    match word.as_str() {
                        "par" | "line" => result.push('\n'),
                        "tab" => result.push('\t'),
                        _ => {}
                    }
                }
            },
            _ => {
                if depth <= 1 {
                    result.push(ch);
                }
            }
        }
    }
    let trimmed = result.trim().to_string();
    if trimmed.is_empty() {
        return Err("no text found in RTF".to_string());
    }
    Ok(trimmed)
}
/// Scrapes printable ASCII strings out of an arbitrary binary blob.
///
/// A run is a maximal sequence of bytes in `0x20..0x7F`. Runs of at least four
/// bytes are kept and joined with single spaces.
///
/// # Errors
///
/// Returns `no readable text in binary document` when no run is long enough.
fn extract_binary_strings(data: &[u8]) -> Result<String, String> {
    const MIN_RUN_LEN: usize = 4;
    let is_printable = |byte: &u8| (0x20..0x7F).contains(byte);

    let runs: Vec<String> = data
        .split(|byte| !is_printable(byte))
        .filter(|run| run.len() >= MIN_RUN_LEN)
        .map(|run| run.iter().map(|&byte| char::from(byte)).collect::<String>())
        .collect();

    if runs.is_empty() {
        Err("no readable text in binary document".to_string())
    } else {
        Ok(runs.join(" "))
    }
}
/// Removes XML markup from `xml` and returns the remaining character data.
///
/// Everything between `<` and `>` is dropped, runs of whitespace collapse to a
/// single space, and a space is inserted where a tag closes so that text from
/// adjacent elements does not run together. Entity and character references
/// in the surviving text are then decoded and the result is trimmed.
fn strip_xml_tags(xml: &str) -> String {
    let mut result = String::with_capacity(xml.len() / 2);
    let mut in_tag = false;
    // Starts as `true` so that no leading separator is ever emitted.
    let mut last_was_space = true;
    for ch in xml.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => {
                in_tag = false;
                if !last_was_space && !result.is_empty() {
                    result.push(' ');
                    last_was_space = true;
                }
            }
            _ if in_tag => {}
            _ if ch.is_whitespace() => {
                if !last_was_space {
                    result.push(' ');
                    last_was_space = true;
                }
            }
            _ => {
                result.push(ch);
                last_was_space = false;
            }
        }
    }
    decode_xml_entities(&result).trim().to_string()
}

/// Decodes XML references in `text` in a single left-to-right pass.
///
/// Recognised are the five predefined entities (`amp`, `lt`, `gt`, `quot`,
/// `apos`) and numeric character references (`#NNN`, `#xHHH`). Decoded output
/// is never rescanned, so an escaped ampersand followed by `lt;` stays literal
/// text instead of turning into `<`. Anything unrecognised is copied through.
fn decode_xml_entities(text: &str) -> String {
    // The longest reference worth recognising is `#x10FFFF` (8 bytes), so the
    // terminating `;` must appear within this many bytes after the ampersand.
    // Bounding the search also keeps the pass linear on ampersand-heavy input.
    const MAX_REFERENCE_LEN: usize = 8;

    /// Maps the text between the ampersand and `;` to the character it denotes.
    fn resolve(name: &str) -> Option<char> {
        match name {
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            _ => {
                let digits = name.strip_prefix('#')?;
                let code = match digits.strip_prefix('x') {
                    Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                    None => digits.parse::<u32>().ok()?,
                };
                char::from_u32(code)
            }
        }
    }

    let mut out = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        // `;` is ASCII, so its byte offset is always a valid `str` boundary.
        let decoded = after
            .bytes()
            .take(MAX_REFERENCE_LEN + 1)
            .position(|byte| byte == b';')
            .and_then(|end| Some((resolve(&after[..end])?, end)));
        match decoded {
            Some((ch, end)) => {
                out.push(ch);
                rest = &after[end + 1..];
            }
            None => {
                // Not a reference we know: keep the ampersand as written.
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}
/// Parses the shared string table of a workbook (`xl/sharedStrings.xml`).
///
/// Returns exactly one entry per `<si>` item, in document order, so that a
/// cell's shared-string index can be used directly as an index into the
/// result. The text of all `<t>` runs inside an item is concatenated (rich
/// text is stored as several runs), phonetic runs (`<rPh>`) are ignored, and
/// an item without text yields an empty string. A `<t>` element outside any
/// `<si>` still produces its own entry.
///
/// Character data is returned as written in the XML; entity references are
/// not decoded here.
fn parse_xlsx_shared_strings(xml: &str) -> Vec<String> {
    let mut strings = Vec::new();
    let mut item = String::new();
    let mut tag_buf = String::new();
    let mut in_tag = false;
    let mut in_item = false;
    let mut in_phonetic = false;
    let mut in_text = false;
    for ch in xml.chars() {
        match ch {
            '<' => {
                in_tag = true;
                tag_buf.clear();
            }
            // Only a `>` that closes a tag is markup. A bare `>` in character
            // data falls through to the text arm below.
            '>' if in_tag => {
                in_tag = false;
                let tag = tag_buf.trim();
                let self_closing = tag.ends_with('/');
                let name = tag
                    .trim_end_matches('/')
                    .split_whitespace()
                    .next()
                    .unwrap_or("");
                match name {
                    "si" => {
                        item.clear();
                        in_text = false;
                        in_phonetic = false;
                        if self_closing {
                            // `<si/>` still occupies an index.
                            strings.push(String::new());
                        } else {
                            in_item = true;
                        }
                    }
                    "/si" => {
                        in_text = false;
                        in_phonetic = false;
                        if in_item {
                            in_item = false;
                            strings.push(std::mem::take(&mut item));
                        }
                    }
                    "rPh" => in_phonetic = !self_closing,
                    "/rPh" => in_phonetic = false,
                    "t" => {
                        if !in_item {
                            item.clear();
                        }
                        // `<t/>` has no content, so do not start collecting.
                        in_text = !self_closing && !in_phonetic;
                    }
                    "/t" => {
                        if in_text && !in_item {
                            strings.push(std::mem::take(&mut item));
                        }
                        in_text = false;
                    }
                    _ => {}
                }
            }
            _ if in_tag => tag_buf.push(ch),
            _ if in_text => item.push(ch),
            _ => {}
        }
    }
    strings
}
/// Renders the cell values of one worksheet XML part as tab-separated lines.
///
/// Each `<row>` that contains at least one `<v>` value becomes one line. A
/// cell whose `t` attribute is `s` is looked up in `shared_strings` (an
/// unparsable or out-of-range index gives an empty field); every other value
/// is emitted verbatim.
fn parse_xlsx_sheet(xml: &str, shared_strings: &[String]) -> String {
    /// True when `tag` is the opening tag `name`, with or without attributes.
    fn opens(tag: &str, name: &str) -> bool {
        tag.strip_prefix(name)
            .map_or(false, |rest| rest.is_empty() || rest.starts_with(' '))
    }

    /// Value following the first `t="` in `tag`, or "" if absent or unterminated.
    fn type_attribute(tag: &str) -> &str {
        tag.split_once("t=\"")
            .and_then(|(_, rest)| rest.split_once('"'))
            .map_or("", |(value, _)| value)
    }

    let mut table: Vec<Vec<String>> = Vec::new();
    let mut cells: Vec<String> = Vec::new();
    let mut markup = String::new();
    let mut value = String::new();
    let mut kind = String::new();
    let mut inside_markup = false;
    let mut inside_row = false;
    let mut inside_value = false;

    for ch in xml.chars() {
        if ch == '<' {
            inside_markup = true;
            markup.clear();
            continue;
        }
        if ch != '>' {
            // Ordinary character: belongs either to the tag being read or to
            // the cell value being collected; anything else is ignored.
            if inside_markup {
                markup.push(ch);
            } else if inside_value {
                value.push(ch);
            }
            continue;
        }

        // `>`: evaluate whatever tag text is currently buffered.
        inside_markup = false;
        let tag = markup.trim();
        if opens(tag, "row") {
            inside_row = true;
            cells.clear();
        } else if tag == "/row" {
            inside_row = false;
            if !cells.is_empty() {
                table.push(std::mem::take(&mut cells));
            }
        } else if inside_row && opens(tag, "c") {
            kind.clear();
            kind.push_str(type_attribute(tag));
        } else if opens(tag, "v") {
            inside_value = true;
            value.clear();
        } else if tag == "/v" {
            inside_value = false;
            let cell = match kind.as_str() {
                "s" => match value.trim().parse::<usize>() {
                    Ok(index) => shared_strings.get(index).cloned().unwrap_or_default(),
                    Err(_) => String::new(),
                },
                _ => value.clone(),
            };
            cells.push(cell);
        } else if tag == "/c" {
            kind.clear();
        }
    }

    let lines: Vec<String> = table.iter().map(|row| row.join("\t")).collect();
    lines.join("\n")
}
/// Fallback extractor selection based on the file name.
///
/// The text after the last `.` of `filename` (or the whole name when it has
/// no dot) is lower-cased and matched against the known extensions. Returns
/// `None` when there is no file name, the extension is unknown, or the chosen
/// extractor fails.
fn try_extract_by_extension(data: &[u8], filename: Option<&str>) -> Option<String> {
    let name = filename?;
    let ext = name
        .rsplit_once('.')
        .map_or(name, |(_, suffix)| suffix)
        .to_lowercase();

    let extracted = match ext.as_str() {
        "pdf" => extract_pdf(data),
        "docx" => extract_docx(data),
        "pptx" => extract_pptx(data),
        "xlsx" => extract_xlsx(data),
        "doc" | "ppt" | "xls" => extract_binary_strings(data),
        "rtf" => extract_rtf(data),
        "txt" | "csv" | "tsv" | "json" | "xml" | "yaml" | "yml" | "toml" | "md" | "markdown"
        | "py" | "js" | "ts" | "rs" | "go" | "java" | "c" | "cpp" | "h" | "hpp" | "rb" | "sh"
        | "bash" | "zsh" | "fish" | "css" | "html" | "htm" | "sql" | "log" | "ini" | "cfg"
        | "conf" | "env" | "gitignore" | "dockerfile" => extract_utf8(data),
        _ => return None,
    };
    // Extraction errors are deliberately collapsed into "no result" here.
    extracted.ok()
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn strip_xml_basic() {
        let xml = "<root><p>Hello</p><p>World</p></root>";
        assert_eq!(strip_xml_tags(xml), "Hello World");
    }

    #[test]
    fn strip_xml_entities() {
        // The input must contain real entity references for this test to
        // exercise decoding. The ampersands are spelled `\u{26}` so that
        // tooling which rewrites entity references in source text cannot
        // silently turn the fixture into plain characters.
        let xml = "<t>A \u{26}amp; B \u{26}lt; C</t>";
        assert_eq!(strip_xml_tags(xml), "A & B < C");
    }

    #[test]
    fn extract_utf8_valid() {
        assert_eq!(extract_utf8(b"hello").unwrap(), "hello");
    }

    #[test]
    fn extract_utf8_lossy() {
        let data = b"hello \xff world";
        let result = extract_utf8(data).unwrap();
        assert!(result.contains("hello"));
        assert!(result.contains("world"));
    }

    #[test]
    fn extract_by_extension_txt() {
        let result = try_extract_by_extension(b"content", Some("notes.txt"));
        assert_eq!(result, Some("content".to_string()));
    }

    #[test]
    fn extract_by_extension_unknown() {
        let result = try_extract_by_extension(b"data", Some("file.xyz"));
        assert!(result.is_none());
    }

    #[test]
    fn extract_by_extension_no_filename() {
        let result = try_extract_by_extension(b"data", None);
        assert!(result.is_none());
    }

    #[test]
    fn rtf_basic_extraction() {
        let rtf = br"{\rtf1\ansi Hello World\par Second line}";
        let result = extract_rtf(rtf).unwrap();
        assert!(result.contains("Hello World"));
        assert!(result.contains("Second line"));
    }

    #[test]
    fn xlsx_shared_strings_parsing() {
        let xml = r#"<sst><si><t>Name</t></si><si><t>Age</t></si></sst>"#;
        let strings = parse_xlsx_shared_strings(xml);
        assert_eq!(strings, vec!["Name", "Age"]);
    }
}