use std::{io::Error, path::{Path}};
mod extractors;
use extractors::html::html_extract;
use extractors::office::{extract_docx,extract_odt,extract_ods,extract_xlx,extract_odp,extract_pptx};
use tree_magic;
/// Extracts the textual content of the file at `filepath`.
///
/// `extension` selects the extractor. It may be a short file extension
/// (e.g. `"pdf"`) or a MIME type (e.g. `"application/pdf"`). When it is
/// `None`, the type is detected from the file by `guess_file_type`.
///
/// Types with no matching extractor (including the `"unknown"` marker
/// produced by `guess_file_type`) yield a single-space string, not an error.
///
/// # Errors
///
/// Propagates any error returned by the selected extractor.
pub fn extract(filepath: &str, extension: Option<&str>) -> Result<String, Error> {
    let ext = match extension {
        Some(explicit) => explicit.to_string(),
        None => guess_file_type(filepath),
    };

    let content = match ext.as_str() {
        "text/plain" | "txt" => extractors::generic::generic(filepath)?,
        "html" | "htm" | "text/html" => html_extract(filepath)?,
        "pdf" | "application/pdf" | "application/x-pdf" => {
            extractors::pdf::pdf_process(filepath)?
        }
        // The OOXML wordprocessing MIME type is what content sniffing reports
        // for .docx files; without it, auto-detected .docx input was silently
        // treated as unsupported. "application/msword" is kept for callers
        // that already pass it explicitly.
        "docx"
        | "application/msword"
        | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
            extract_docx(filepath)?
        }
        "odp" | "application/vnd.oasis.opendocument.presentation" => extract_odp(filepath)?,
        "ods" | "application/vnd.oasis.opendocument.spreadsheet" => extract_ods(filepath)?,
        "odt" | "application/vnd.oasis.opendocument.text" => extract_odt(filepath)?,
        "pptx"
        | "application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
            extract_pptx(filepath)?
        }
        "xlsx" | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => {
            extract_xlx(filepath)?
        }
        // Covers "unknown" as well as any type without a dedicated extractor.
        _ => " ".to_string(),
    };

    Ok(content)
}
/// Detects the MIME type of the file at `filepath` using `tree_magic`.
///
/// Returns the literal string `"unknown"` when detection produces an empty
/// string; otherwise returns the detected MIME type unchanged.
fn guess_file_type(filepath: &str) -> String {
    let detected = tree_magic::from_filepath(Path::new(filepath));
    if detected.is_empty() {
        String::from("unknown")
    } else {
        detected
    }
}
/// Smoke test: runs type detection on the sample DOCX file and prints the
/// result (visible with `cargo test -- --nocapture`).
#[test]
fn test_guess_file_type() {
    let detected = guess_file_type("samples/sample.docx");
    println!("{}", detected);
}
/// Smoke test: runs extraction with automatic type detection on this crate's
/// own source file and prints the extracted text. Panics if extraction fails.
#[test]
fn test_extract() {
    let text = extract("src/lib.rs", None).unwrap();
    println!("{}", text);
}