use crate::utils::bow::Bow;
use super::fs::*;
use super::json::*;
use anyhow::{anyhow, bail, Context, Result};
use std::collections::{HashMap, HashSet};
use std::hash::Hash;
use std::io::BufRead;
use std::io::BufReader;
use tracing::warn;
use regex::bytes::Regex;
/// Counts or detects occurrences of a byte-oriented regular expression in text.
///
/// Built via [`Matcher::words_matcher`], [`Matcher::empty_matcher`], or
/// [`Matcher::keywords_matcher`].
#[derive(Debug)]
pub struct Matcher {
    // `None` means "match nothing": every query method reports zero matches
    // (see `count_matches_in_text` / `has_matches_in_text`).
    regex: Option<Regex>,
}
impl Matcher {
pub fn words_matcher() -> Self {
Matcher {
regex: Some(Regex::new(r"\b\w+\b").unwrap()),
}
}
pub fn empty_matcher() -> Self {
Matcher { regex: None }
}
pub fn keywords_matcher<I, T>(
keywords: I,
case_sensitive: bool,
whole_words: bool,
) -> Result<Self>
where
I: IntoIterator<Item = T>,
T: ToString,
{
let joined_keywords = keywords
.into_iter()
.filter_map(|s| Some(s.to_string()).filter(|s| !s.is_empty()))
.collect::<Vec<String>>()
.join("|");
if !joined_keywords.is_empty() {
let new_pattern: String = if whole_words {
format!(r"\b(?:{joined_keywords})\b")
} else {
joined_keywords
};
let new_pattern_with_sensitivity: String = if case_sensitive {
new_pattern
} else {
format!("(?i){new_pattern}")
};
Ok(Self {
regex: Some(Regex::new(&new_pattern_with_sensitivity)?),
})
} else {
Ok(Self::words_matcher())
}
}
pub fn keywords_matchers<T>(
local_keywords: &HashMap<T, HashSet<String>>,
global_keywords: &HashSet<String>,
case_sensitive: bool,
whole_words: bool,
) -> Result<HashMap<T, Matcher>>
where
T: Eq + Hash + Clone,
{
let mut res = HashMap::<T, Matcher>::new();
for (ext, kw) in local_keywords {
let joined_keywords = Self::keywords_matcher(
kw.iter().chain(global_keywords.iter()).cloned(),
case_sensitive,
whole_words,
)?;
res.insert(ext.clone(), joined_keywords);
}
Ok(res)
}
pub fn count_matches_in_text(&self, text: &[u8]) -> usize {
self.regex
.as_ref()
.map(|r| r.find_iter(text).count())
.unwrap_or(0)
}
pub fn has_matches_in_text(&self, text: &[u8]) -> bool {
self.regex
.as_ref()
.map(|r| r.is_match(text))
.unwrap_or(false)
}
pub fn count_matches_in_file(&self, path: &str) -> Result<usize> {
let mut count: usize = 0;
for l in BufReader::new(open_file(path, FileMode::Read)?).lines() {
let line = l.with_context(|| format!("Could not read lines from {path}"))?;
count += self.count_matches_in_text(line.as_bytes());
}
Ok(count)
}
pub fn bag_of_words(&self, text: &[u8]) -> Bow {
let mut bow: Bow = Bow::new();
if let Some(re) = &self.regex {
bow.add_all(re.find_iter(text).map(|w| w.as_bytes()));
}
bow
}
}
/// Counts the lines in `text`, with `BufRead::lines` semantics: the empty
/// input has zero lines and a trailing newline does not start a new line.
///
/// Unlike the `text.lines().count()` formulation, this allocates no `String`
/// per line: it counts `b'\n'` separators plus one unterminated final line.
pub fn count_text_lines(text: &[u8]) -> usize {
    let newlines = text.iter().filter(|&&b| b == b'\n').count();
    if text.is_empty() || text.ends_with(b"\n") {
        newlines
    } else {
        // The last line has no trailing newline but still counts.
        newlines + 1
    }
}
/// Keyword matchers aggregated from one or more JSON keyword files.
pub struct KeywordFiles {
    /// Paths of the keyword files loaded so far, in load order.
    pub paths: Vec<String>,
    /// Per-language matchers; `add_file` keeps each vector aligned with
    /// `paths` (one matcher per loaded file, padded with empty matchers for
    /// files that do not mention the language).
    pub matchers: HashMap<String, Vec<Matcher>>,
    /// Maps a file extension to the language name that claimed it.
    pub extensions_to_language: HashMap<String, String>,
}
impl Default for KeywordFiles {
fn default() -> Self {
KeywordFiles::new()
}
}
impl KeywordFiles {
pub fn new() -> KeywordFiles {
KeywordFiles {
paths: Vec::new(),
matchers: HashMap::new(),
extensions_to_language: HashMap::new(),
}
}
pub fn len(&self) -> usize {
self.paths.len()
}
pub fn is_empty(&self) -> bool {
self.paths.is_empty()
}
pub fn add_files(self, paths: &[&str], warning: bool) -> Result<KeywordFiles> {
if paths.is_empty() {
Ok(self)
} else {
self.add_file(paths[0], warning)?
.add_files(&paths[1..], warning)
}
}
pub fn add_file(self, path: &str, warning: bool) -> Result<KeywordFiles> {
let mut updated_paths: Vec<String> = self.paths.clone();
updated_paths.push(path.to_string());
let json = open_json_from_path(path)?;
let categories = json_to_map(&json);
let mut local_kw = HashMap::<String, HashSet<String>>::new();
let mut extensions_to_language = self.extensions_to_language.clone();
let cat1 = "languages";
let languages = categories
.get(cat1)
.with_context(|| format!("Keyword file {path} does not contain a {cat1} field"))?;
for l in languages.members() {
let (name, extensions, keywords) = if l.is_string() {
(
l.as_str()
.with_context(|| "Language name is not a string")?,
HashSet::new(),
HashSet::new(),
)
} else {
let language = json_to_map(l);
let name: &str = language
.get("name")
.with_context(|| {
format!("Keyword file {path} contains a language with no name")
})?
.as_str()
.with_context(|| anyhow!("Language name is not a string"))?;
let extensions: HashSet<String> = match language.get("extensions") {
Some(ext) => json_to_set(ext),
None => {
if warning {
warn!("Language {} in {} has no extensions field", name, path);
}
HashSet::new()
}
};
let keywords: HashSet<String> = language
.get("keywords")
.map(|json| json_to_set(json))
.unwrap_or_default();
(name, extensions, keywords)
};
for ext in extensions {
match extensions_to_language.get(&ext) {
Some(value) if value != name => {
bail!(
"Extension {} is associated with both {} and {} when loading {}",
&ext,
value,
name,
updated_paths.join(", ")
);
}
None => {
extensions_to_language.insert(ext.clone(), name.to_string());
}
_ => (),
}
extensions_to_language.insert(ext, name.to_string());
}
local_kw.insert(name.to_string(), keywords.clone());
}
let cat2 = "keywords";
let global_kw = categories
.get(cat2)
.map(|json| json_to_set(json))
.unwrap_or_default();
let file_matchers = Matcher::keywords_matchers(&local_kw, &global_kw, false, true)?;
let mut updated_matchers = self.matchers;
for (lang, entry) in updated_matchers.iter_mut() {
if !file_matchers.contains_key(lang) {
entry.push(Matcher::empty_matcher());
}
}
for (lang, matcher) in file_matchers {
match updated_matchers.get_mut(&lang) {
None => {
let mut empty_matchers = Vec::new();
for _ in 0..self.paths.len() {
empty_matchers.push(Matcher::empty_matcher());
}
empty_matchers.push(matcher);
updated_matchers.insert(lang.to_string(), empty_matchers);
}
Some(entry) => entry.push(matcher),
}
}
Ok(KeywordFiles {
paths: updated_paths,
matchers: updated_matchers,
extensions_to_language,
})
}
pub fn count_matches_in_file(&self, lang: &str, path: &str) -> Result<Vec<usize>> {
match self.matchers.get(lang) {
Some(m) => m.iter().map(|m| m.count_matches_in_file(path)).collect(),
None => Ok(vec![0, self.paths.len()]),
}
}
pub fn count_matches_in_text(&self, lang: &str, text: &[u8]) -> Vec<usize> {
match self.matchers.get(lang) {
Some(m) => m.iter().map(|m| m.count_matches_in_text(text)).collect(),
None => vec![0; self.paths.len()],
}
}
pub fn has_matches_in_text(&self, lang: &str, text: &[u8]) -> bool {
match self.matchers.get(lang) {
Some(v) => v.iter().any(|m| m.has_matches_in_text(text)),
None => false,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn count_matches_test() -> Result<()> {
        let text = b"Parole, parole, parole, paroleParole parole_parole parole_Parole";
        // (keyword, case_sensitive, whole_words, expected match count)
        let cases: [(&str, bool, bool, usize); 8] = [
            ("parole", false, true, 3),
            ("parole", false, false, 9),
            ("parole", true, true, 2),
            ("parole", true, false, 6),
            ("Parole", false, true, 3),
            ("Parole", false, false, 9),
            ("Parole", true, true, 1),
            ("Parole", true, false, 3),
        ];
        for (keyword, case_sensitive, whole_words, expected) in cases {
            let matcher = Matcher::keywords_matcher([keyword], case_sensitive, whole_words)?;
            assert_eq!(
                matcher.count_matches_in_text(text),
                expected,
                "keyword={keyword} case_sensitive={case_sensitive} whole_words={whole_words}"
            );
        }
        Ok(())
    }

    #[test]
    fn count_words_test() -> Result<()> {
        let matcher = Matcher::words_matcher();
        let cases: [(&[u8], usize); 5] = [
            (b"", 0),
            (b"word", 1),
            (b" word word word ", 3),
            (b"word\nword\nword", 3),
            (b"<word>", 1),
        ];
        for (text, expected) in cases {
            assert_eq!(matcher.count_matches_in_text(text), expected);
        }
        Ok(())
    }

    #[test]
    fn count_text_lines_test() -> Result<()> {
        assert_eq!(count_text_lines(b""), 0);
        assert_eq!(count_text_lines(b"word"), 1);
        assert_eq!(count_text_lines(b"word\nword\nword"), 3);
        Ok(())
    }

    #[test]
    fn keywords_patterns_test() -> Result<()> {
        let local_keywords: HashMap<usize, HashSet<String>> = HashMap::from([
            (3, HashSet::from(["word1".to_string(), "word2".to_string()])),
            (6, HashSet::from(["word3".to_string(), "word4".to_string()])),
        ]);
        let global_keywords: HashSet<String> =
            HashSet::from(["word5".to_string(), "word6".to_string()]);
        let patterns = Matcher::keywords_matchers(&local_keywords, &global_keywords, false, true)?;
        assert_eq!(patterns.len(), 2);
        // Each language sees its two local keywords plus the two globals.
        let text = b"word1 word2 word3 word4 word5 word6";
        for key in [3usize, 6] {
            let matcher = patterns
                .get(&key)
                .with_context(|| format!("Pattern for key {key} not found"))?;
            assert_eq!(matcher.count_matches_in_text(text), 4);
        }
        Ok(())
    }
}