use crate::result_output_html::TextMaybeBold;
use gcollections::ops::*;
use interval::interval_set::*;
use lazy_static::lazy_static;
use regex::Regex;
pub fn extract_clean_word_ngrams(words: &Vec<String>, n: usize) -> Vec<String> {
let mut output = Vec::new();
if n > words.len() {
return Vec::new();
}
for i in 0..=(words.len() - n) {
let mut ngram = Vec::new();
for j in 0..n {
ngram.push(words[i + j].to_string());
}
output.push(ngram.join(" "));
}
output
}
pub fn clean_text(text: &str) -> Vec<String> {
lazy_static! {
static ref REMOVE_NONALPHA: Regex = Regex::new(r"[^A-Za-z0-9 ]")
.expect("Regex to remove non-alphanumeric characters could not be compiled properly!");
static ref REMOVE_SPACES: Regex = Regex::new(r"\s+")
.expect("Regex to remove redundant spaces could not be compiled properly!");
}
let mut new_text: String = text.replace('\n', " ").to_lowercase();
new_text = REMOVE_NONALPHA.replace_all(&new_text, " ").to_string();
REMOVE_SPACES
.replace_all(&new_text, " ")
.trim()
.split_whitespace()
.map(String::from)
.collect()
}
pub fn get_boldtext_segments_from_intervals(
words: &Vec<String>,
text_intervals: &IntervalSet<usize>,
) -> Vec<TextMaybeBold> {
let mut text_segments: Vec<TextMaybeBold> = Vec::new();
let mut cur_words: Vec<String> = Vec::new();
let mut contains_previously = false;
for (i, item) in words.iter().enumerate() {
let word = item.clone();
if text_intervals.contains(&i) {
if contains_previously {
cur_words.push(word);
} else {
if !cur_words.is_empty() {
text_segments.push(TextMaybeBold {
text: cur_words.join(" "),
is_bold: false,
});
}
cur_words = Vec::new();
cur_words.push(word);
}
contains_previously = true;
} else {
if contains_previously {
if !cur_words.is_empty() {
text_segments.push(TextMaybeBold {
text: cur_words.join(" "),
is_bold: true,
});
}
cur_words = Vec::new();
cur_words.push(word);
} else {
cur_words.push(word);
}
contains_previously = false;
}
}
if !cur_words.is_empty() {
if contains_previously {
text_segments.push(TextMaybeBold {
text: cur_words.join(" "),
is_bold: true,
});
} else {
text_segments.push(TextMaybeBold {
text: cur_words.join(" "),
is_bold: false,
});
}
}
text_segments
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ngram() {
assert_eq!(
extract_clean_word_ngrams(
&vec!["mary".to_string(), "had".to_string(), "a".to_string()],
2
),
vec!["mary had", "had a"]
);
assert_eq!(
extract_clean_word_ngrams(&clean_text(" ||| mary\n @@@@ .... had a\n\n\n"), 2),
vec!["mary had", "had a"]
);
}
#[test]
fn test_clean() {
assert_eq!(
clean_text(" a b c \n 2 3 @ 4\n224acb@\n"),
vec!["a", "b", "c", "2", "3", "4", "224acb"]
)
}
#[test]
fn test_intervals_firstwords_bold() {
let words = vec!["a", "b", "c", "d", "e"]
.iter()
.map(|x| x.to_string())
.collect();
let intervals = vec![(0, 1)].to_interval_set();
assert_eq!(
get_boldtext_segments_from_intervals(&words, &intervals),
vec![
TextMaybeBold {
text: "a b".to_string(),
is_bold: true
},
TextMaybeBold {
text: "c d e".to_string(),
is_bold: false
}
]
);
}
#[test]
fn test_intervals_lastwords_bold() {
let words = vec!["a", "b", "c", "d", "e"]
.iter()
.map(|x| x.to_string())
.collect();
let intervals = vec![(2, 4)].to_interval_set();
assert_eq!(
get_boldtext_segments_from_intervals(&words, &intervals),
vec![
TextMaybeBold {
text: "a b".to_string(),
is_bold: false
},
TextMaybeBold {
text: "c d e".to_string(),
is_bold: true
}
]
);
}
#[test]
fn test_intervals_no_bold() {
let words = vec!["a", "b", "c", "d", "e"]
.iter()
.map(|x| x.to_string())
.collect();
let intervals = vec![].to_interval_set();
assert_eq!(
get_boldtext_segments_from_intervals(&words, &intervals),
vec![TextMaybeBold {
text: "a b c d e".to_string(),
is_bold: false
},]
);
}
#[test]
fn test_intervals_all_bold() {
let words = vec!["a", "b", "c", "d", "e"]
.iter()
.map(|x| x.to_string())
.collect();
let intervals = vec![(0, 4)].to_interval_set();
assert_eq!(
get_boldtext_segments_from_intervals(&words, &intervals),
vec![TextMaybeBold {
text: "a b c d e".to_string(),
is_bold: true
},]
);
}
#[test]
fn test_intervals_single_letters_bold() {
let words = vec!["a", "b", "c", "d", "e"]
.iter()
.map(|x| x.to_string())
.collect();
let intervals = vec![(0, 0), (2, 2)].to_interval_set();
assert_eq!(
get_boldtext_segments_from_intervals(&words, &intervals),
vec![
TextMaybeBold {
text: "a".to_string(),
is_bold: true
},
TextMaybeBold {
text: "b".to_string(),
is_bold: false
},
TextMaybeBold {
text: "c".to_string(),
is_bold: true
},
TextMaybeBold {
text: "d e".to_string(),
is_bold: false
},
]
);
}
}