use minidom::{
Element
};
use zip::read::ZipArchive;
use std::ffi::{
OsString,
OsStr
};
use std::path::{
PathBuf
};
use std::io::{
Result,
Read,
};
use std::fs::{
read_to_string,
File,
ReadDir,
read_dir,
DirEntry,
Metadata,
};
use std::collections::VecDeque;
/// Result alias for document-text extraction; the only error is `UnsupportedDocumentError`.
type DocResult<T> = std::result::Result<T, UnsupportedDocumentError>;
/// Error returned when a document's type is unsupported or its text cannot be extracted.
#[derive(Debug, Clone)]
pub struct UnsupportedDocumentError;
/// Document kinds this module knows how to extract text from.
pub enum DocType{
Pdf,
Docx,
Txt,
/// Any extension other than pdf/docx/txt.
Unsupported
}
/// A file discovered on disk, plus any extracted text and embeddings.
#[derive(Debug,Clone)]
pub struct Document{
/// File name including its extension (e.g. "report.pdf").
pub name:String,
/// Filesystem metadata captured when the directory was scanned.
pub metadata:Metadata,
/// Full path to the file on disk.
pub path:PathBuf,
/// Extracted text chunks; `None` until set via `set_document_data`.
pub data:Option<Vec<String>>,
/// One vector per text chunk — presumably filled by an encoder elsewhere; `None` until computed.
pub encodings:Option<Vec<Vec<f32>>>
}
impl Document {
    /// Whether the file's extension marks it as a type we can extract text from.
    ///
    /// Names without any `.` are reported as unsupported; previously
    /// `rsplit_once('.').unwrap()` panicked on such names.
    pub fn is_supported(&self) -> bool {
        const SUPPORTED_TYPES: [&str; 3] = ["pdf", "docx", "txt"];
        self.name
            .rsplit_once('.')
            .map_or(false, |(_, ext)| SUPPORTED_TYPES.contains(&ext))
    }

    /// The file extension (text after the last `.`).
    ///
    /// # Errors
    /// Returns an `InvalidInput` I/O error when the name has no extension
    /// (previously this panicked).
    pub fn get_extension(&self) -> Result<String> {
        match self.name.rsplit_once('.') {
            Some((_, ext)) => Ok(ext.to_string()),
            None => Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "file name has no extension",
            )),
        }
    }

    /// Classify the document by its extension; names with no `.` are
    /// `DocType::Unsupported` (previously this panicked).
    pub fn get_doc_type(&self) -> DocType {
        match self.name.rsplit_once('.') {
            Some((_, "pdf")) => DocType::Pdf,
            Some((_, "docx")) => DocType::Docx,
            Some((_, "txt")) => DocType::Txt,
            _ => DocType::Unsupported,
        }
    }

    /// The document's path rendered as a (possibly lossy) `String`.
    pub fn get_document_path_as_string(&self) -> Result<String> {
        // No clone of the PathBuf needed; to_string_lossy borrows.
        Ok(self.path.to_string_lossy().to_string())
    }

    /// A clone of the document's file name.
    pub fn get_document_name_as_string(&self) -> Result<String> {
        Ok(self.name.clone())
    }

    /// Store extracted text chunks on the document.
    pub fn set_document_data(&mut self, data: Vec<String>) {
        self.data = Some(data);
    }

    /// Borrow the stored text chunks (`None` if never set).
    pub fn get_document_data(&self) -> &Option<Vec<String>> {
        &self.data
    }

    /// Extract the document's full text by dispatching on its type.
    ///
    /// # Errors
    /// `UnsupportedDocumentError` when the type is unsupported or the
    /// type-specific extractor fails.
    pub fn get_document_data_as_string(&self) -> DocResult<String> {
        let path = self.path.to_string_lossy().to_string();
        // The three original match arms were identical except for the extractor
        // called; collapse them into a single dispatch + ok_or.
        let text = match self.get_doc_type() {
            DocType::Pdf => get_text_from_pdf(path),
            DocType::Docx => get_text_from_docx(path),
            DocType::Txt => get_text_from_txt(path),
            DocType::Unsupported => None,
        };
        text.ok_or(UnsupportedDocumentError)
    }
}
/// Scan `path` for regular files with supported extensions and wrap each in a
/// `Document` (no text loaded yet).
///
/// Returns `None` when the directory itself cannot be read (previously this
/// panicked); unreadable individual entries are logged and skipped.
pub fn get_file_list(path: &PathBuf) -> Option<Vec<Document>> {
    let entries: ReadDir = match read_dir(path) {
        Ok(rd) => rd,
        Err(err) => {
            println!("Error reading directory: {:?}", err);
            return None;
        }
    };
    let mut file_list: Vec<Document> = Vec::new();
    for entry_result in entries {
        let entry: DirEntry = match entry_result {
            Ok(e) => e,
            Err(err) => {
                println!("Error reading file: {:?}", err);
                continue;
            }
        };
        // Previously a failed metadata call panicked; now skip the entry.
        let file_metadata: Metadata = match entry.metadata() {
            Ok(md) => md,
            Err(err) => {
                println!("Error reading metadata: {:?}", err);
                continue;
            }
        };
        let file_name: OsString = entry.file_name();
        let file_path: PathBuf = entry.path();
        if file_path.is_file() && is_supported(&file_name) {
            // `to_string_lossy` covers both the valid-UTF-8 and the lossy case
            // that the original handled with two identical push branches.
            file_list.push(Document {
                name: file_name.to_string_lossy().to_string(),
                metadata: file_metadata,
                path: file_path,
                data: None,
                encodings: None,
            });
        }
    }
    Some(file_list)
}
/// Whether `file_name` has one of the supported extensions (pdf/docx/txt).
///
/// Names without any `.` return `false`; previously
/// `rsplit_once('.').unwrap()` panicked on such names.
pub fn is_supported(file_name: &OsStr) -> bool {
    const SUPPORTED_TYPES: [&str; 3] = ["pdf", "docx", "txt"];
    file_name
        .to_string_lossy()
        .rsplit_once('.')
        .map_or(false, |(_, ext)| SUPPORTED_TYPES.contains(&ext))
}
/// Load a document's text and split it into chunks of `chunk_size` for
/// downstream encoding.
///
/// Returns `None` when the path cannot be rendered, the type is unsupported,
/// or chunking fails; an extractor returning no text yields a single
/// single-space chunk.
pub fn get_file_text(doc: &Document, chunk_size: usize) -> Option<Vec<String>> {
    let file_path = match doc.get_document_path_as_string() {
        Ok(p) => p,
        Err(_) => return None,
    };
    // Dispatch to the type-specific extractor.
    let file_text = match doc.get_doc_type() {
        DocType::Pdf => get_text_from_pdf(file_path),
        DocType::Docx => get_text_from_docx(file_path),
        DocType::Txt => get_text_from_txt(file_path),
        DocType::Unsupported => return None,
    };
    match file_text {
        Some(text) => match split_text_into_chunks(text, chunk_size) {
            Ok(chunks) => Some(chunks),
            Err(err) => {
                println!("Error splitting text into chunks!: {:?}", err);
                None
            }
        },
        // Keep the original placeholder: one single-space chunk.
        None => Some(vec![String::from(" ")]),
    }
}
/// Split `text` into consecutive chunks of at most `chunk_size` characters.
///
/// Chunks on `char` boundaries: the previous byte-wise `chunks` could split a
/// multibyte UTF-8 sequence mid-codepoint, and `from_utf8_lossy` then replaced
/// the fragments with U+FFFD, corrupting non-ASCII text. For pure-ASCII input
/// the output is unchanged.
///
/// # Panics
/// Panics if `chunk_size` is 0 (same as the original byte-wise version).
pub fn split_text_into_chunks(text: String, chunk_size: usize) -> Result<Vec<String>> {
    let chars: Vec<char> = text.chars().collect();
    let text_vector = chars
        .chunks(chunk_size)
        .map(|chunk| chunk.iter().collect::<String>())
        .collect();
    Ok(text_vector)
}
/// Read an entire plain-text file, returning `None` on any I/O error.
pub fn get_text_from_txt(file_path: String) -> Option<String> {
    // Same behavior as the original match: Ok -> Some(text), Err -> None.
    read_to_string(file_path).ok()
}
/// Extract the visible text from a .docx file.
///
/// A .docx is a zip archive; the body text lives in `word/document.xml`.
/// Returns `None` when the file cannot be opened, is not a valid archive,
/// lacks the document part, fails to read, or contains malformed XML —
/// previously a failed read was silently ignored and malformed XML caused a
/// panic via `.unwrap()`.
pub fn get_text_from_docx(file_path: String) -> Option<String> {
    let file = File::open(file_path).ok()?;
    let mut zip_reader = ZipArchive::new(file).ok()?;
    let mut xml_string = String::new();
    zip_reader
        .by_name("word/document.xml")
        .ok()?
        .read_to_string(&mut xml_string)
        .ok()?;
    let root: Element = xml_string.parse().ok()?;
    // Breadth-first walk of the XML tree, collecting the text of every "t"
    // node (WordprocessingML text runs), one line per run.
    let mut result = String::new();
    let mut node_que: VecDeque<&Element> = VecDeque::new();
    node_que.push_back(&root);
    while let Some(node) = node_que.pop_front() {
        if node.name() == "t" {
            result.push_str(&node.text());
            result.push('\n');
        }
        node_que.extend(node.children());
    }
    // Keep the original placeholder for documents with no text runs.
    if result.is_empty() {
        result.push(' ');
    }
    Some(result)
}
/// Extract the text of a PDF file, returning `None` on any failure.
///
/// `pdf_extract` can panic on malformed input, so the whole extraction runs
/// inside `catch_unwind`; a caught panic is logged and mapped to `None`, the
/// same as an ordinary read/parse error.
pub fn get_text_from_pdf(file_path: String) -> Option<String> {
    let extraction = std::panic::catch_unwind(move || {
        let bytes = match std::fs::read(file_path) {
            Ok(fs_bytes) => fs_bytes,
            Err(err) => {
                println!("Error reading file: {:?}", err);
                return None;
            }
        };
        // Ok -> Some(text), Err -> None, exactly as the original match did.
        pdf_extract::extract_text_from_mem(&bytes).ok()
    });
    match extraction {
        Ok(output) => output,
        Err(err) => {
            println!("Error reading pdf: {:?}", err);
            None
        }
    }
}