use crate::models::model::encode_text_with_model_from_path;
#[cfg(not(feature="no-ml"))]
use crate::models::{
model::encode_text,
summarize_model::generate_summary,
};
use clap::builder::OsStr;
use rayon::{prelude::*, vec};
use hora::index::hnsw_idx::HNSWIndex;
use hora::core::ann_index::{
ANNIndex,
SerializableIndex
};
use hora::core::metrics::Metric;
use toml::{map::Map, Value};
use serde::{Serialize,Deserialize};
use crate::documents::document::{
Document,
get_file_text,
get_file_list, is_supported,
UnsupportedDocumentError, DocType, get_text_from_docx, get_text_from_pdf, get_text_from_txt
};
use crate::database::vectordb::{
create_index,
load_index,
save_index,
};
use crate::database::sql_database::{
create_sqlite_db,
load_sqlite_db,
insert_data_into_sql_db,
sql_search_by_id
};
use std::fs::create_dir;
use std::path::PathBuf;
use std::io::ErrorKind;
use std::fmt;
use std::process::exit;
/// Crate-local result alias: every fallible knowledge-base operation fails
/// with the unit `KnowledgeBaseError`.
/// Gated like the error type itself so the `no-ml` build does not reference
/// a type that was compiled out.
#[cfg(not(feature = "no-ml"))]
type Result<T> = std::result::Result<T, KnowledgeBaseError>;

/// Unit error type for knowledge-base operations (index building, document
/// loading, summarization). Carries no payload; callers only learn that the
/// operation failed.
#[cfg(not(feature = "no-ml"))]
#[derive(Debug, Clone)]
pub struct KnowledgeBaseError;

// Gate the impl the same way as the struct: without this attribute a
// `no-ml` build fails to compile because the type does not exist.
#[cfg(not(feature = "no-ml"))]
impl fmt::Display for KnowledgeBaseError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // NOTE(review): this message is SQL-specific, but the error is also
        // returned for filesystem and summarization failures — consider a
        // more general message (kept unchanged here to preserve output).
        write!(f, "invalid sql transaction or connection")
    }
}
/// A single semantic-search hit returned by `Cephalon::search`.
#[cfg(not(feature="no-ml"))]
#[derive(Debug)]
pub struct Matches{
    /// Name of the document the matching line came from.
    pub document_name:String,
    /// The matching line of text, as stored in the SQLite database.
    pub line:String,
}
/// Per-project configuration, serialized to `.cephalon/cephalon.toml`
/// by `Util::new` and read back by `Util::load`.
#[cfg(not(feature="no-ml"))]
#[derive(Debug,Serialize,Deserialize)]
pub struct Cephalon{
    // Root directory of the indexed project.
    path:PathBuf,
    // True when a user-supplied local embedding model should be used
    // instead of the bundled one.
    local_model:bool,
    // Filesystem path to the local model; populated only when
    // `local_model` is true (see `Util::new`).
    local_model_path:Option<String>
}
#[cfg(not(feature="no-ml"))]
impl Cephalon{
    /// Extract text from every document in `doc_list` in parallel and attach
    /// it to each `Document` via `set_document_data`. Documents whose text
    /// cannot be read are reported on stdout and left without data.
    fn get_text_from_all_docs(&self, doc_list:&mut Vec<Document>){
        doc_list.par_iter_mut().for_each(|doc: &mut Document|{
            println!("Now Processing {:?} ...", doc.get_document_name_as_string());
            // 256 is the chunk size handed to the text extractor.
            match get_file_text(doc, 256){
                Some(doc_text)=>doc.set_document_data(doc_text),
                None=>println!("Error reading document {:?}", doc.get_document_name_as_string())
            }
            println!("\r Finished Processing file {:?}", doc.get_document_name_as_string());
        });
    }

    /// Scan `path` for supported documents, extract their text, and build the
    /// vector index plus SQLite store under `<path>/.cephalon`.
    ///
    /// # Panics
    /// Panics if the directory listing fails, or if `local_model` is set
    /// without a `local_model_path`.
    pub fn search_and_build_index(self, path:&PathBuf){
        let mut project_path: PathBuf = path.clone();
        project_path.push(".cephalon");
        let mut doc_list: Vec<Document> = match get_file_list(path){
            Some(f_list)=>f_list,
            None=>panic!("Unable to get a list of files!")
        };
        self.get_text_from_all_docs(&mut doc_list);
        // Resolve the model path once instead of duplicating the call in two
        // branches; a non-local build passes an empty path, as before.
        let model_path = if self.local_model{
            self.local_model_path
                .expect("local_model is set but local_model_path is missing")
        }else{
            String::new()
        };
        let _ = Document::build_semantic_search(&mut doc_list, project_path, self.local_model, model_path);
    }

    /// Encode `query`, find the `count` nearest chunks in the HNSW index under
    /// `<path>/.cephalon`, and resolve the hit ids to (document, line) pairs
    /// through the SQLite database.
    ///
    /// Returns `None` when encoding fails, when a local model is configured
    /// without a path, or when the id lookup fails.
    pub fn search(self, path:PathBuf, query:String, count:usize)->Option<Vec<Matches>>{
        let mut project_path = path;
        project_path.push(".cephalon");
        // Encode the query with whichever backend this project was built with.
        // (Returns None — rather than panicking — if the local path is absent.)
        let encodings = if self.local_model{
            encode_text_with_model_from_path(self.local_model_path.as_ref()?, &vec![query])?
        }else{
            encode_text(&vec![query])?
        };
        // Load the index from disk exactly once; the original reloaded it
        // inside the loop for every encoding.
        let index: HNSWIndex<f32, usize> = load_index(project_path.clone());
        let mut results: Vec<usize> = vec![];
        for encoding in encodings{
            if let Some(mut embedding) = encoding.1{
                results.append(&mut index.search(&mut embedding, count));
            }
        }
        // Map vector ids back to (document name, line text) rows.
        let search_output = sql_search_by_id(project_path, results)?;
        Some(
            search_output
                .into_iter()
                .map(|output| Matches{ document_name: output.0, line: output.1 })
                .collect()
        )
    }
}
/// Project lifecycle operations for `Cephalon`.
#[cfg(not(feature="no-ml"))]
pub trait Util{
    /// Create a new project rooted at `path`; `model_path` is only
    /// meaningful when `local` is true.
    fn new(path:PathBuf, local:bool, model_path:String)->Self;
    /// Load an existing project from `<path>/.cephalon/cephalon.toml`.
    fn load(path:PathBuf)->Self;
}
#[cfg(not(feature="no-ml"))]
impl Util for Cephalon{
    /// Create a new Cephalon project at `path`: makes the `.cephalon` folder
    /// (reusing it if it already exists), creates an empty 384-dimension HNSW
    /// index and the SQLite database, then persists the configuration to
    /// `.cephalon/cephalon.toml`.
    ///
    /// # Panics
    /// Panics if the project folder cannot be created for any reason other
    /// than already existing, if the database connection cannot be closed, or
    /// if the TOML configuration cannot be serialized or written.
    fn new(path:PathBuf, local:bool, model_path:String)->Cephalon{
        let mut project_path: PathBuf = path.clone();
        project_path.push(".cephalon");
        match create_dir(&project_path){
            Ok(_)=>println!("Created project folder"),
            Err(err)=>{
                // An existing folder just means we are re-opening a project.
                if err.kind() == ErrorKind::AlreadyExists{
                    println!("Loading Cephalon from previous project")
                }else{
                    panic!("Error creating cephalon project: {:?}", err)
                }
            }
        }
        // 384 matches the embedding width used by build_semantic_search.
        let _index: HNSWIndex<f32, usize> = create_index(project_path.clone(), 384);
        let conn = create_sqlite_db(project_path.clone());
        match conn.close(){
            Ok(_)=>println!("Successfully created database"),
            Err(err)=>panic!("Error close database connection: {:?}", err)
        }
        // Only remember the model path when a local model was requested.
        let local_model_path = if local{ Some(model_path) }else{ None };
        let cephy = Cephalon{ path, local_model: local, local_model_path };
        let cephy_toml = toml::to_string(&cephy).expect("Could not encode TOML value");
        project_path.push("cephalon.toml");
        std::fs::write(project_path, cephy_toml).expect("Error writing to .toml file");
        cephy
    }

    /// Load an existing Cephalon project by deserializing
    /// `<path>/.cephalon/cephalon.toml`.
    ///
    /// Exits the process with status 1 if the file cannot be read or parsed
    /// (preserves the original CLI-style error handling).
    fn load(path:PathBuf)->Cephalon{
        let mut project_path: PathBuf = path;
        project_path.push(".cephalon");
        project_path.push("cephalon.toml");
        let data = match std::fs::read_to_string(project_path){
            Ok(d)=>d,
            Err(err)=>{
                println!("Error reading cephalon.toml file: {:?}", err);
                exit(1);
            }
        };
        match toml::from_str(&data){
            Ok(c)=>c,
            Err(err)=>{
                println!("Error Generating Cephalon object from cephalon.toml file: {:?}", err);
                exit(1)
            }
        }
    }
}
/// Embedding, loading, and summarization operations for `Document`.
#[cfg(not(feature="no-ml"))]
pub trait DocumentEncoder{
    /// Build the HNSW index and SQLite line store for all documents under `project_path`.
    fn build_semantic_search(doc_list:&mut Vec<Document>, project_path:PathBuf, local:bool, model_path:String)->Result<()>;
    /// Encode this document's text; `model_path` is used when `local` is true.
    fn encode_text_via_model(&self, model:&str, local:bool, model_path:&String)->Option<Vec<(String,Option<Vec<f32>>)>>;
    /// Construct a `Document` from a path to an existing regular file.
    fn load(file_path:String)->Result<Document>;
    /// Produce a summary of the document's text via the summarization model.
    fn summarize(&self)->Result<Vec<String>>;
}
#[cfg(not(feature="no-ml"))]
impl DocumentEncoder for Document{
    /// Encode every document in `doc_list`, insert each chunk's embedding into
    /// a fresh 384-dimension HNSW index and its text into the SQLite store
    /// (both under `project_path`), then persist the index to disk.
    ///
    /// Per-document failures are reported on stdout and skipped; the function
    /// itself always returns `Ok(())`.
    fn build_semantic_search(doc_list:&mut Vec<Document>, project_path:PathBuf, local:bool, model_path:String)->Result<()>{
        let mut index: HNSWIndex<f32, usize> = create_index(project_path.clone(), 384);
        // Ids are global across documents and shared between the vector index
        // and the SQL rows so a search hit can be resolved back to its text.
        let mut id: usize = 0;
        for doc in doc_list{
            let vector_embeddings = match doc.encode_text_via_model("all-MiniLM-L6-v2", local, &model_path){
                Some(embeddings)=>embeddings,
                None=>{
                    println!("Error generating embeddings for: {:?}", doc.get_document_name_as_string().unwrap());
                    continue
                }
            };
            // Preserves the original guard: skip documents with no parsed text.
            // (The dead `sentences` binding from the original was removed — it
            // was assigned but never read.)
            if doc.get_document_data().is_none(){
                println!("No Text found for file:{}", doc.get_document_name_as_string().unwrap());
                continue
            }
            for (line_text, maybe_embedding) in vector_embeddings{
                // The id advances even when an embedding is missing, matching
                // the original numbering scheme.
                id += 1;
                let embedding = match maybe_embedding{
                    Some(embedding)=>embedding,
                    None=>continue
                };
                if let Err(err) = index.add(&embedding, id){
                    println!("Error: {}, on id:{}", err, id);
                }
                if let Err(err) = insert_data_into_sql_db(project_path.clone(), &doc.get_document_name_as_string().unwrap(), &line_text, id){
                    println!("Error inserting line:{} due to error:{:?}", line_text, err);
                }
            }
        }
        save_index(&mut index, project_path);
        Ok(())
    }

    /// Encode this document's parsed text into per-chunk embeddings, using the
    /// model at `model_path` when `local` is true and the default backend
    /// otherwise. The `_model` name argument is currently unused.
    ///
    /// Returns `None` when the document has no parsed data or encoding fails.
    fn encode_text_via_model(&self, _model:&str, local:bool, model_path:&String)->Option<Vec<(String,Option<Vec<f32>>)>>{
        let sentences: Vec<String> = match self.get_document_data(){
            Some(data)=>data.to_vec(),
            None=>{
                println!("Document has no parsed data");
                return None
            }
        };
        let encoded = if local{
            encode_text_with_model_from_path(model_path, &sentences)
        }else{
            encode_text(&sentences)
        };
        if encoded.is_none(){
            println!("Unable to generate Embeddings for document:{:?}", self.get_document_name_as_string());
        }
        encoded
    }

    /// Construct a `Document` from `file_path`.
    ///
    /// # Errors
    /// Returns `KnowledgeBaseError` when the metadata cannot be read, when the
    /// path is not a regular file, or when it has no final file-name component.
    fn load(file_path:String)->Result<Document>{
        let file_metadata = std::fs::metadata(&file_path).map_err(|_| KnowledgeBaseError)?;
        if !file_metadata.is_file(){
            return Err(KnowledgeBaseError);
        }
        let file_name = std::path::Path::new(&file_path)
            .file_name()
            .ok_or(KnowledgeBaseError)?
            .to_string_lossy()
            .to_string();
        Ok(Document{
            name: file_name,
            path: PathBuf::from(file_path),
            metadata: file_metadata,
            data: None,
            encodings: None
        })
    }

    /// Summarize the document's full text with the summarization model.
    ///
    /// # Errors
    /// Returns `KnowledgeBaseError` when the document text cannot be collected
    /// or the summary model fails.
    fn summarize(&self)->Result<Vec<String>> {
        let doc_text = self.get_document_data_as_string().map_err(|_| KnowledgeBaseError)?;
        generate_summary(doc_text).map_err(|_| KnowledgeBaseError)
    }
}