use std::*;
extern crate unicode_segmentation;
use std::collections::HashMap;
use unicode_segmentation::UnicodeSegmentation;
/// Segments `_sentence` into words.
///
/// * `dict_path` - path of the word dictionary. When empty, no dictionary
///   matching is done and the sentence is returned as one entry per grapheme
///   (stop words are not applied in that case).
/// * `stopwords_path` - path of a stop-word list. When empty, no stop-word
///   filtering is done; otherwise every word found in that list is dropped.
/// * `method` - `"bimm"` or `""` for bidirectional matching, `"fmm"` for
///   forward matching, `"bmm"` for backward matching. Any other value yields
///   an empty result.
pub fn get_segmentation(
    _sentence: &str,
    dict_path: &str,
    stopwords_path: &str,
    method: &str,
) -> Vec<String> {
    // Without a dictionary there is nothing to match against.
    if dict_path.is_empty() {
        return get_word_list(_sentence)
            .into_iter()
            .map(String::from)
            .collect();
    }

    let common_words = load_dictionary(dict_path);
    let filter_stop_words = !stopwords_path.is_empty();
    let stop_words: Vec<String> = if filter_stop_words {
        load_dictionary(stopwords_path)
    } else {
        Vec::new()
    };

    let sentence: Vec<&str> = _sentence.graphemes(true).collect();
    let list_result: Vec<String> = match method {
        "bimm" | "" => bimm(sentence, common_words),
        "fmm" => fmm(sentence, common_words),
        "bmm" => bmm(sentence, common_words),
        _ => Vec::new(),
    };

    if !filter_stop_words {
        return list_result;
    }
    list_result
        .into_iter()
        .filter(|word| !stop_words.contains(word))
        .collect()
}
/// Segments `sentence` with bidirectional maximum matching (BiMM).
///
/// Runs both [`fmm`] and [`bmm`] over the same tokens. If the two results are
/// equal, that segmentation is returned. Otherwise each result earns one point
/// for every criterion on which it is strictly better than the other:
///
/// * fewer words that are not entries of `words_dict`,
/// * fewer words overall,
/// * fewer single-character words.
///
/// The result with more points is returned; a tie goes to the backward result.
pub fn bimm(sentence: Vec<&str>, words_dict: Vec<String>) -> Vec<String> {
    let forward = fmm(sentence.clone(), words_dict.clone());
    let backward = bmm(sentence, words_dict.clone());
    if forward == backward {
        return backward;
    }

    // Hash the dictionary once so the out-of-vocabulary checks are O(1) each.
    let known: std::collections::HashSet<&str> =
        words_dict.iter().map(String::as_str).collect();
    let oov_count =
        |words: &[String]| words.iter().filter(|w| !known.contains(w.as_str())).count();
    // Count characters rather than bytes: a one-character non-ASCII word
    // (e.g. a CJK character) is several bytes long in UTF-8.
    let single_count =
        |words: &[String]| words.iter().filter(|w| w.chars().count() == 1).count();

    // (forward cost, backward cost) per criterion; lower is better.
    let criteria = [
        (oov_count(&forward), oov_count(&backward)),
        (forward.len(), backward.len()),
        (single_count(&forward), single_count(&backward)),
    ];

    let mut score_fmm = 0;
    let mut score_bmm = 0;
    for &(fmm_cost, bmm_cost) in criteria.iter() {
        match fmm_cost.cmp(&bmm_cost) {
            std::cmp::Ordering::Less => score_fmm += 1,
            std::cmp::Ordering::Greater => score_bmm += 1,
            std::cmp::Ordering::Equal => {}
        }
    }

    // Points reward the better segmentation, so the higher score wins.
    if score_fmm > score_bmm {
        forward
    } else {
        backward
    }
}
/// Splits `str` into its graphemes, as produced by `graphemes(true)`, and
/// returns them in order as slices borrowed from the input.
pub fn get_word_list(str: &str) -> Vec<&str> {
    str.graphemes(true).collect()
}
/// Segments `sentence` with backward maximum matching (BMM).
///
/// Starting at the last token, the longest run of up to four consecutive
/// tokens ending at the current position whose concatenation is an entry of
/// `dict` is emitted as one word, and the scan resumes just before it. When no
/// such run is in `dict`, the single token at that position is emitted as-is.
///
/// Returns the words in sentence order; an empty `sentence` yields an empty
/// vector.
pub fn bmm(sentence: Vec<&str>, dict: Vec<String>) -> Vec<String> {
    // Longest candidate, counted in tokens.
    const WINDOW_SIZE: usize = 4;

    // Hash the dictionary once so each probe is O(1) instead of a linear scan.
    let lookup: std::collections::HashSet<&str> = dict.iter().map(String::as_str).collect();

    let mut list_words: Vec<String> = Vec::new();
    // Exclusive end of the part of the sentence that is still unsegmented.
    let mut end = sentence.len();
    while end > 0 {
        let longest = WINDOW_SIZE.min(end);
        // Widths stop at 1: a zero-width candidate is the empty string, and
        // matching it (e.g. a blank dictionary line) would never move `end`,
        // looping forever.
        let matched = (1..=longest).rev().find_map(|width| {
            let candidate = sentence[end - width..end].concat();
            if lookup.contains(candidate.as_str()) {
                Some((width, candidate))
            } else {
                None
            }
        });
        match matched {
            Some((width, word)) => {
                list_words.push(word);
                end -= width;
            }
            None => {
                list_words.push(sentence[end - 1].to_string());
                end -= 1;
            }
        }
    }

    // Words were collected back to front.
    list_words.reverse();
    list_words
}
/// Segments `sentence` with forward maximum matching (FMM).
///
/// Starting at the first token, the longest run of up to four consecutive
/// tokens whose concatenation is an entry of `dict` is emitted as one word,
/// and the scan resumes right after it. When no run starting at the current
/// position is in `dict`, the single token at that position is emitted as-is.
///
/// Returns the words in sentence order; an empty `sentence` yields an empty
/// vector.
pub fn fmm(sentence: Vec<&str>, dict: Vec<String>) -> Vec<String> {
    // Longest candidate, counted in tokens.
    const WINDOW_SIZE: usize = 4;

    // Hash the dictionary once so each probe is O(1) instead of a linear scan.
    let lookup: std::collections::HashSet<&str> = dict.iter().map(String::as_str).collect();

    let token_len = sentence.len();
    let mut list_words: Vec<String> = Vec::new();
    let mut start = 0;
    while start < token_len {
        let longest = WINDOW_SIZE.min(token_len - start);
        // Widths stop at 1: a zero-width candidate is the empty string, and
        // matching it (e.g. a blank dictionary line) would never advance
        // `start`, looping forever.
        let matched = (1..=longest).rev().find_map(|width| {
            let candidate = sentence[start..start + width].concat();
            if lookup.contains(candidate.as_str()) {
                Some((width, candidate))
            } else {
                None
            }
        });
        match matched {
            Some((width, word)) => {
                list_words.push(word);
                start += width;
            }
            None => {
                list_words.push(sentence[start].to_string());
                start += 1;
            }
        }
    }
    list_words
}
use std::io::{self, BufRead};
use std::fs::{File, read_dir};
use std::io::prelude::*;
use std::path::Path;
/// Opens `filename` and returns an iterator over its lines, read through a
/// buffered reader.
///
/// # Errors
///
/// Returns the error from `File::open` when the file cannot be opened. Each
/// item of the returned iterator is itself an `io::Result`.
pub fn _read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where
    P: AsRef<Path>,
{
    File::open(filename).map(|handle| io::BufReader::new(handle).lines())
}
/// Loads a word list from `filepath`, one entry per line.
///
/// Each line is stored with surrounding whitespace trimmed; blank lines are
/// kept as empty entries. If the file cannot be opened, an empty list is
/// returned. Lines that fail to decode as UTF-8 are skipped, and any other
/// read error ends loading with the entries collected so far.
pub fn load_dictionary(filepath: &str) -> Vec<String> {
    let mut strings = Vec::new();
    if let Ok(lines) = _read_lines(filepath) {
        for line in lines {
            match line {
                // `lines()` already strips the line terminator.
                Ok(text) => strings.push(text.trim().to_string()),
                // A badly encoded line is consumed by the reader, so it is
                // safe to skip it and carry on with the next one.
                Err(err) if err.kind() == io::ErrorKind::InvalidData => {}
                // Other errors (e.g. the path is a directory) can repeat on
                // every read; stop instead of spinning forever.
                Err(_) => break,
            }
        }
    }
    strings
}