textract 0.1.0 - Docs.rs

use std::{io::Error, path::{Path}};
mod extractors;
use extractors::html::html_extract;
use extractors::office::{extract_docx,extract_odt,extract_ods,extract_xlx,extract_odp,extract_pptx};
use tree_magic;



/// Extracts the textual content of the file at `filepath`.
///
/// `extension` selects the extractor. It may be either a short file
/// extension (e.g. `"pdf"`, `"docx"`) or a MIME type string (e.g.
/// `"application/pdf"`). When it is `None`, the type is guessed from the file
/// itself via `guess_file_type`.
///
/// Returns the string produced by the matching extractor. For any type that
/// has no matching extractor (including the `"unknown"` result of a failed
/// guess) a single-space string `" "` is returned rather than an error.
///
/// # Errors
///
/// Propagates any `std::io::Error` returned by the selected extractor.
pub fn extract(filepath: &str, extension: Option<&str>) -> Result<String, Error> {
    // Prefer the caller-supplied type; fall back to content-based detection.
    let ext = match extension {
        Some(explicit) => explicit.to_string(),
        None => guess_file_type(filepath),
    };

    let content = match ext.as_str() {
        "text/plain" | "txt" => extractors::generic::generic(filepath)?,
        "html" | "htm" | "text/html" => html_extract(filepath)?,
        "pdf" | "application/pdf" | "application/x-pdf" => extractors::pdf::pdf_process(filepath)?,
        // The OOXML wordprocessing MIME type is the registered type for
        // `.docx`; "application/msword" is kept so existing callers that pass
        // it continue to be routed to the docx extractor.
        "docx"
        | "application/msword"
        | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
            extract_docx(filepath)?
        }
        "odp" | "application/vnd.oasis.opendocument.presentation" => extract_odp(filepath)?,
        "ods" | "application/vnd.oasis.opendocument.spreadsheet" => extract_ods(filepath)?,
        "odt" | "application/vnd.oasis.opendocument.text" => extract_odt(filepath)?,
        "pptx" | "application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
            extract_pptx(filepath)?
        }
        "xlsx" | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => {
            extract_xlx(filepath)?
        }
        // Unsupported or undetected ("unknown") types yield a single space.
        _ => " ".to_string(),
    };

    Ok(content)
}


/// Guesses the type of the file at `filepath` using `tree_magic`.
///
/// Returns the string reported by `tree_magic::from_filepath`, or the literal
/// `"unknown"` when that string is empty, so the result is never empty.
fn guess_file_type(filepath: &str) -> String {
    match tree_magic::from_filepath(Path::new(filepath)) {
        // An empty detection result is mapped to an explicit marker value.
        detected if detected.is_empty() => String::from("unknown"),
        detected => detected,
    }
}
/// Checks that type detection on the sample docx file yields a usable,
/// non-empty type string (either a detected type or the "unknown" marker).
#[test]
fn test_guess_file_type() {
    let extension = guess_file_type("samples/sample.docx");
    println!("{}", extension);
    // `guess_file_type` substitutes "unknown" for an empty detection result,
    // so an empty string here would indicate a regression.
    assert!(!extension.is_empty());
}
/// Smoke test: runs `extract` on this crate's own source file with type
/// detection enabled and prints whatever text comes back. Panics if
/// extraction returns an error.
#[test]
fn test_extract() {
    let extracted = extract("src/lib.rs", None).unwrap();
    println!("{}", extracted);
}