use crate::use_m::*;
use ahash::AHashMap;
use lazy_static::lazy_static;
use python_comm_macros::auto_func_name;
use serde::{Deserialize, Serialize};
use std::{collections::HashMap, mem::take, sync::Mutex};
/// One node of the keyword trie (the searcher's automaton).
#[derive(Serialize, Deserialize, Clone)]
struct KeywordNode {
    // Letters on the path from the root to this node; emptied by
    // `TextSearcher::create_blues` (via `mem::take`) once fallback links exist.
    letters: Vec<char>,
    // Number of letters on that path; kept even after `letters` is emptied.
    length: usize,
    // Replacement/display name; assigned only when a keyword ends at this node.
    name: String,
    // True when this node terminates a registered keyword ("blue" node).
    is_blue: bool,
}
impl KeywordNode {
    /// Owned copy of this node's display name.
    fn name(&self) -> String {
        self.name.clone()
    }

    /// Build a fresh, non-terminal node for the given path of letters.
    fn new(letters: Vec<char>) -> Self {
        Self {
            length: letters.len(),
            letters,
            name: String::new(),
            is_blue: false,
        }
    }

    /// Compact rendering used only by the unit tests.
    #[cfg(test)]
    fn to_string(&self) -> String {
        format!("{:?}/{}, {}, {}", self.letters, self.length, self.name, self.is_blue)
    }
}
#[cfg(test)]
mod keyword_node_test {
    use super::*;

    // A fresh node records its letters and length, with an empty name and
    // is_blue == false.
    #[test]
    fn test_new() {
        let node = KeywordNode::new("abc".chars().collect::<Vec<char>>());
        assert_eq!(node.to_string(), "[\'a\', \'b\', \'c\']/3, , false");
    }

    // Round-trip through serde_json: exact JSON text, then an equal value back.
    #[test]
    fn test_serde() {
        let node = KeywordNode::new("abc".chars().collect());
        let text = serde_json::to_string(&node).unwrap();
        assert_eq!(
            text,
            "{\"letters\":[\"a\",\"b\",\"c\"],\"length\":3,\"name\":\"\",\"is_blue\":false}"
        );
        let node: KeywordNode = serde_json::from_str(&text).unwrap();
        assert_eq!(node.to_string(), "[\'a\', \'b\', \'c\']/3, , false");
    }
}
/// Multi-keyword matcher: a trie with fallback links (Aho-Corasick style).
pub struct TextSearcher {
    // nodes[id - 1] is the node with id `id`; nodes[0] is the root (id 1).
    nodes: Vec<KeywordNode>,
    // "Black" goto transitions: (node id, letter) -> next node id.
    blacks: AHashMap<(usize, char), usize>,
    // "Blue" fallback links: node id -> node to retry WITHOUT consuming the letter.
    blues: AHashMap<usize, usize>,
}
impl TextSearcher {
/// Insert `keyword` into the trie.  The terminal node reports `name` when
/// given, otherwise the keyword itself.  Call `create_blues` after all
/// keywords have been added.
pub fn add_keyword(&mut self, keyword: String, name: Option<String>) {
    let mut current = 1; // start at the root node (id 1)
    let mut prefix = Vec::new();
    for letter in keyword.chars() {
        prefix.push(letter);
        current = match self.blacks.get(&(current, letter)) {
            // Transition already exists: just follow it.
            Some(&next) => next,
            // Otherwise create the node for this prefix and link it in.
            None => {
                self.nodes.push(KeywordNode::new(prefix.clone()));
                let created = self.nodes.len();
                self.blacks.insert((current, letter), created);
                created
            }
        };
    }
    // Mark the final node as a keyword terminal and attach its name.
    let node = &mut self.nodes[current - 1];
    node.is_blue = true;
    node.name = name.unwrap_or(keyword);
}
/// Build the fallback ("blue") links: each node links to the node reached
/// by the longest proper suffix of its letters that exists in the trie.
/// Each node's `letters` buffer is consumed in the process.
pub fn create_blues(&mut self) {
    for node_id in 1..=self.nodes.len() {
        // Take the letters out of the node; they are not needed afterwards.
        let letters = take(&mut self.nodes[node_id - 1].letters);
        // Try suffixes from longest to shortest; keep the first that exists.
        let target = (1..letters.len())
            .map(|start| self.get_node_by_keyword(&letters[start..]))
            .find(|&id| id != 0);
        if let Some(target_node_id) = target {
            self.blues.insert(node_id, target_node_id);
        }
    }
}
/// Walk the trie along `keyword` and return the id of the node reached,
/// or 0 when the path does not exist in the trie.
fn get_node_by_keyword(&self, keyword: &[char]) -> usize {
    let mut node_id = 1; // root
    for &letter in keyword {
        match self.blacks.get(&(node_id, letter)) {
            Some(&next_node_id) => node_id = next_node_id,
            // Missing transition: the keyword is not a path in the trie.
            None => return 0,
        }
    }
    node_id
}
/// Deserialize a `TextSearcher` previously produced by `save`.
///
/// serde_json errors are wrapped into `MoreError` (tagged with the current
/// function name via `auto_func_name` / the project's `m!`/`.m` helpers).
#[auto_func_name]
pub fn load(text: String) -> Result<Self, MoreError> {
    Ok(serde_json::from_str::<TextSearcherForSerde>(&text)
        .m(m!(__func__))?
        .to())
}
/// Scan `text` and report every keyword occurrence as `(name, start, end)`,
/// where `start`/`end` are 0-based char offsets (`end` exclusive), in order
/// of match end position.  Overlapping matches are all reported.
pub fn match_(&self, text: &str) -> Vec<(String, usize, usize)> {
    let mut found = Vec::new();
    let mut state = 1; // automaton node id; 1 is the root
    for (index, letter) in text.chars().enumerate() {
        let end = index + 1; // 1-based position of `letter`
        // Follow fallback links until this letter is consumed, reporting
        // every keyword-terminal node visited on the way.
        loop {
            let (next_state, consumed) = self.move_front(state, letter);
            state = next_state;
            let node = &self.nodes[state - 1];
            if node.is_blue {
                // A fallback step reports a match that ended one letter earlier.
                let stop = if consumed { end } else { end - 1 };
                found.push((node.name(), stop - node.length, stop));
            }
            if consumed {
                break;
            }
        }
    }
    found
}
pub fn match_line(&self, text: &str) -> Vec<(String, usize, usize)> {
let mut names = Vec::new();
let mut name = String::new();
let mut found = (false, 0, 0);
let mut node_id = 1;
let mut posy = 0;
for letter in text.chars() {
if letter == '\r' || letter == '\n' {
if found.0 {
names.push((name, found.1, found.2));
}
name = String::new();
found = (false, 0, 0);
node_id = 1;
posy = 0;
continue;
} else {
name.push(letter);
posy += 1;
}
loop {
let (next_node_id, used) = self.move_front(node_id, letter);
node_id = next_node_id;
let node = &self.nodes[node_id - 1];
if node.is_blue {
if used {
found = (true, posy - node.length, posy);
} else {
found = (true, posy - node.length - 1, posy - 1);
}
}
if used {
break;
}
}
}
if found.0 {
names.push((name, found.1, found.2));
}
names
}
/// Advance the automaton from `node_id` on `letter`.
///
/// When the letter is NOT consumed, the caller must feed the same letter
/// again starting from the returned node (a fallback step).
fn move_front(
    &self,
    node_id: usize,
    letter: char,
) -> (
    usize, // next node id
    bool,  // whether `letter` was consumed
) {
    if let Some(&next_node_id) = self.blacks.get(&(node_id, letter)) {
        // Direct (black) transition: consume the letter.
        (next_node_id, true)
    } else if let Some(&fallback_id) = self.blues.get(&node_id) {
        // Fallback (blue) link: retry the same letter from the fallback node.
        (fallback_id, false)
    } else {
        // No transition anywhere: at the root the letter is simply discarded
        // (consumed); elsewhere restart from the root and retry the letter.
        (1, node_id == 1)
    }
}
/// Create an empty searcher containing only the root node (id 1).
pub fn new() -> Self {
    let root = KeywordNode::new(Vec::new());
    Self {
        nodes: vec![root],
        blacks: AHashMap::default(),
        blues: AHashMap::default(),
    }
}
/// Serialize the automaton to a JSON string (inverse of `load`).
#[auto_func_name]
pub fn save(&self) -> Result<String, MoreError> {
    serde_json::to_string(&TextSearcherForSerde::from(self)).m(m!(__func__))
}
/// Rewrite `text`, replacing each matched keyword span with its name;
/// unmatched characters are copied through unchanged.
///
/// Matches are committed lazily: a later-reported match starting at the
/// SAME char position supersedes the pending one (so e.g. "ab" wins over
/// "a" at position 0), while a match at a new position flushes the pending
/// match into the output first.
pub fn subst(&self, text: &str) -> String {
    // result.0 = output built so far;
    // result.1 = first char index of `text` not yet emitted into result.0.
    let mut result: (String, usize) = (String::new(), 0);
    // Pending match: (replacement name, start, end) in char positions.
    // Starts as a harmless ("", 0, 0) sentinel.
    let mut last_found: (String, usize, usize) = (String::new(), 0, 0);
    let mut node_id = 1; // automaton node id; 1 is the root
    let mut posy = 0; // 1-based char position of the current letter
    let letters = text.chars().collect::<Vec<char>>();
    for letter in &letters {
        posy += 1;
        // Follow fallback links until this letter is consumed.
        loop {
            let (next_node_id, used) = self.move_front(node_id, *letter);
            node_id = next_node_id;
            let node = &self.nodes[node_id - 1];
            if node.is_blue {
                let found = if used {
                    (node.name(), posy - node.length, posy)
                } else {
                    // Fallback step: the match ended at the previous letter.
                    (node.name(), posy - node.length - 1, posy - 1)
                };
                if found.1 != last_found.1 {
                    // New start position: commit the pending match, unless an
                    // earlier commit already covers its start.
                    if last_found.1 >= result.1 {
                        // Copy the untouched chars before the pending match...
                        for i in result.1..last_found.1 {
                            result.0.push(letters[i]);
                        }
                        // ...then emit the replacement and skip the span.
                        result.0 += &last_found.0;
                        result.1 = last_found.2;
                    }
                }
                last_found = found;
            }
            if used {
                break;
            }
        }
    }
    // Commit the final pending match (the initial sentinel commits as an
    // empty insertion, which is harmless).
    if last_found.2 >= last_found.1 && last_found.1 >= result.1 {
        for i in result.1..last_found.1 {
            result.0.push(letters[i]);
        }
        result.0 += &last_found.0;
        result.1 = last_found.2;
    }
    // Copy any trailing unmatched characters.
    for letter in &letters[result.1..] {
        result.0.push(*letter);
    }
    result.0
}
}
#[cfg(test)]
mod text_searcher_test {
    use super::*;

    // One keyword: root plus one node per letter, chained by black transitions.
    #[test]
    fn test_add_keyword1() {
        let mut ts = TextSearcher::new();
        ts.add_keyword("ab".to_string(), None);
        assert_eq!(ts.nodes.len(), 3);
        assert_eq!(ts.nodes[1].to_string(), "[\'a\']/1, , false");
        assert_eq!(ts.nodes[2].to_string(), "[\'a\', \'b\']/2, ab, true");
        let mut blacks = ts
            .blacks
            .iter()
            .map(|(k, v)| (*k, *v))
            .collect::<Vec<((usize, char), usize)>>();
        blacks.sort();
        assert_eq!(blacks, [((1, 'a'), 2), ((2, 'b'), 3)]);
    }

    // Multiple keywords sharing prefixes reuse existing trie nodes.
    #[test]
    fn test_add_keyword2() {
        let mut ts = TextSearcher::new();
        for keyword in &["a", "ab", "bab", "bc", "bca", "c", "caa"] {
            ts.add_keyword(keyword.to_string(), None);
        }
        assert_eq!(ts.nodes.len(), 11);
        assert_eq!(ts.nodes[6].to_string(), "[\'b\', \'c\']/2, bc, true");
        let mut blacks = ts
            .blacks
            .iter()
            .map(|(k, v)| (*k, *v))
            .collect::<Vec<((usize, char), usize)>>();
        blacks.sort();
        assert_eq!(
            blacks,
            [
                ((1, 'a'), 2),
                ((1, 'b'), 4),
                ((1, 'c'), 9),
                ((2, 'b'), 3),
                ((4, 'a'), 5),
                ((4, 'c'), 7),
                ((5, 'b'), 6),
                ((7, 'a'), 8),
                ((9, 'a'), 10),
                ((10, 'a'), 11)
            ]
        );
    }

    // Fallback (blue) links point to the node of the longest proper suffix
    // present in the trie.
    #[test]
    fn test_create_blues() {
        let mut ts = TextSearcher::new();
        for keyword in &["a", "ab", "bab", "bc", "bca", "c", "caa"] {
            ts.add_keyword(keyword.to_string(), None);
        }
        ts.create_blues();
        let mut blues = ts.blues.iter().map(|(k, v)| (*k, *v)).collect::<Vec<(usize, usize)>>();
        blues.sort();
        assert_eq!(blues, [(3, 4), (5, 2), (6, 3), (7, 9), (8, 10), (10, 2), (11, 2)]);
    }

    // Walking the trie by keyword returns the terminal node id; 0 when absent.
    #[test]
    fn test_get_node_by_keyword() {
        let mut ts = TextSearcher::new();
        let mut ids = Vec::new();
        for keyword in &["a", "ab", "bab", "bc", "bca", "c", "caa"] {
            ts.add_keyword(keyword.to_string(), None);
            ids.push(ts.nodes.len());
        }
        let mut i = 0;
        for keyword in &["a", "ab", "bab", "bc", "bca", "c", "caa"] {
            println!("{}", keyword);
            assert_eq!(ts.get_node_by_keyword(&keyword.chars().collect::<Vec<char>>()), ids[i]);
            i += 1;
        }
        assert_eq!(ts.get_node_by_keyword(&"ac".chars().collect::<Vec<char>>()), 0);
        assert_eq!(ts.get_node_by_keyword(&"xy".chars().collect::<Vec<char>>()), 0);
    }

    // Overlapping matches are all reported, ordered by match end position.
    #[test]
    fn test_match1() {
        let mut ts = TextSearcher::new();
        for keyword in &["a", "ab", "bab", "bc", "bca", "c", "caa"] {
            ts.add_keyword(keyword.to_string(), None);
        }
        ts.create_blues();
        assert_eq!(
            ts.match_("abccab"),
            [
                ("a".to_string(), 0, 1),
                ("ab".to_string(), 0, 2),
                ("bc".to_string(), 1, 3),
                ("c".to_string(), 2, 3),
                ("c".to_string(), 3, 4),
                ("a".to_string(), 4, 5),
                ("ab".to_string(), 4, 6)
            ]
        );
    }

    // Offsets are counted in chars, so multi-byte CJK characters are one unit.
    #[test]
    fn test_match2() {
        let mut ts = TextSearcher::new();
        for keyword in &["北京", "欢迎", "你"] {
            ts.add_keyword(keyword.to_string(), None);
        }
        ts.create_blues();
        assert_eq!(
            ts.match_("北京欢迎你"),
            [
                ("北京".to_string(), 0, 2),
                ("欢迎".to_string(), 2, 4),
                ("你".to_string(), 4, 5),
            ]
        );
    }

    // When a display name is supplied to add_keyword, it is reported instead
    // of the keyword itself.
    #[test]
    fn test_match3() {
        let mut ts = TextSearcher::new();
        for keyword in &["bcdef", "defghi", "hijk"] {
            ts.add_keyword(keyword.to_string(), Some(format!("x{}y", keyword)));
        }
        ts.create_blues();
        assert_eq!(
            ts.match_("abcdefghijklmn"),
            [
                ("xbcdefy".to_string(), 1, 6),
                ("xdefghiy".to_string(), 3, 9),
                ("xhijky".to_string(), 7, 11)
            ]
        );
    }

    // match_line reports (full line text, last match span) per matching line;
    // lines without a match are skipped.
    #[test]
    fn test_match_line() {
        let mut ts = TextSearcher::new();
        for keyword in &["abc", "def"] {
            ts.add_keyword(keyword.to_string(), None);
        }
        ts.create_blues();
        assert_eq!(
            ts.match_line("...\n.abc.\n\n---def---\n...\nabc"),
            [
                (".abc.".to_string(), 1, 4),
                ("---def---".to_string(), 3, 6),
                ("abc".to_string(), 0, 3)
            ]
        )
    }

    // A fresh searcher holds only the empty root node and no transitions.
    #[test]
    fn test_new() {
        let ts = TextSearcher::new();
        assert_eq!(ts.nodes.len(), 1);
        assert_eq!(ts.blacks.len(), 0);
        assert_eq!(ts.blues.len(), 0);
        assert_eq!(ts.nodes[0].to_string(), "[]/0, , false");
    }

    // Round-trip through TextSearcherForSerde.  Only string LENGTHS are
    // compared because AHashMap iteration order is not deterministic, so the
    // pair order inside "blacks"/"blues" varies from run to run.
    #[test]
    fn test_serde() {
        let mut ts = TextSearcher::new();
        for keyword in &["a", "ab", "bab"] {
            ts.add_keyword(keyword.to_string(), Some(format!("{}!", keyword)));
        }
        ts.create_blues();
        let text = serde_json::to_string(&TextSearcherForSerde::from(&ts)).unwrap();
        // The expected literal is written with layout newlines/spaces that are
        // stripped before taking the length.
        assert_eq!(
            text.len(),
            "{\"nodes\":
[
{
\"letters\":[],
\"length\":0,
\"name\":\"\",
\"is_blue\":false
},
{
\"letters\":[],
\"length\":1,
\"name\":\"a!\",
\"is_blue\":true
},
{
\"letters\":[],
\"length\":2,
\"name\":\"ab!\",
\"is_blue\":true
},
{
\"letters\":[],
\"length\":1,
\"name\":\"\",
\"is_blue\":false
},
{
\"letters\":[],
\"length\":2,
\"name\":\"\",
\"is_blue\":false
},
{
\"letters\":[],
\"length\":3,
\"name\":\"bab!\",
\"is_blue\":true
}
],
\"blacks\":
[
[[2,\"b\"],3],
[[1,\"b\"],4],
[[1,\"a\"],2],
[[5,\"b\"],6],
[[4,\"a\"],5]
],
\"blues\":
[
[3,4],
[6,3],
[5,2]
]
}"
            .replace("\n", "")
            .replace(" ", "")
            .len() );
        let ts = serde_json::from_str::<TextSearcherForSerde>(&text).unwrap().to();
        assert_eq!(ts.nodes.len(), 6);
        assert_eq!(ts.blacks.len(), 5);
        assert_eq!(ts.blues.len(), 3);
        assert_eq!(ts.nodes[5].to_string(), "[]/3, bab!, true");
    }

    // subst: the longer keyword starting at the same position wins ("ab" over "a").
    #[test]
    fn test_subst1() {
        let mut ts = TextSearcher::new();
        for keyword in &["a", "ab", "bab", "bc", "bca", "c", "caa"] {
            ts.add_keyword(keyword.to_string(), Some(format!("x{}y", keyword)));
        }
        ts.create_blues();
        assert_eq!(ts.subst("abccab"), "xabyxcyxcyxaby");
    }

    // subst: unmatched characters between/around matches are copied through.
    #[test]
    fn test_subst2() {
        let mut ts = TextSearcher::new();
        for keyword in &["bcdef", "defghi", "hijk"] {
            ts.add_keyword(keyword.to_string(), Some(format!("x{}y", keyword)));
        }
        ts.create_blues();
        assert_eq!(ts.subst("abcdefghijklmn"), "axbcdefygxhijkylmn");
    }

    // subst: two keywords mapping to the same name; the longer span wins.
    #[test]
    fn test_subst3() {
        let mut ts = TextSearcher::new();
        for keyword in &["bdpk", "dpk"] {
            ts.add_keyword(keyword.to_string(), Some("_keyword_".to_string()));
        }
        ts.create_blues();
        assert_eq!(ts.subst("abdpkz"), "a_keyword_z");
    }
}
/// Serialization mirror of `TextSearcher`: the hash maps are flattened into
/// vectors of pairs so the whole automaton can round-trip through serde.
#[derive(Serialize, Deserialize)]
pub struct TextSearcherForSerde {
    nodes: Vec<KeywordNode>,
    // Flattened `TextSearcher::blacks`: ((node id, letter), next node id).
    blacks: Vec<((usize, char), usize)>,
    // Flattened `TextSearcher::blues`: (node id, fallback node id).
    blues: Vec<(usize, usize)>,
}
impl TextSearcherForSerde {
    /// Snapshot a live `TextSearcher` into the serializable form.
    fn from(ts: &TextSearcher) -> Self {
        Self {
            nodes: ts.nodes.clone(),
            blacks: ts.blacks.iter().map(|(&k, &v)| (k, v)).collect(),
            blues: ts.blues.iter().map(|(&k, &v)| (k, v)).collect(),
        }
    }

    /// Rebuild the runtime `TextSearcher` (inverse of `from`).
    fn to(self) -> TextSearcher {
        TextSearcher {
            nodes: self.nodes,
            // `self` is owned, so move the pairs straight into the maps
            // instead of copying them one by one through references.
            blacks: self.blacks.into_iter().collect(),
            blues: self.blues.into_iter().collect(),
        }
    }
}
/// Registry that owns `TextSearcher` instances and hands out integer ids.
pub struct TextSearcherManager {
    // Last id handed out by `new_text_searcher`; incremented before use,
    // so ids start at 1.
    count: i32,
    // Live searchers, keyed by their id.
    tss: HashMap<i32, TextSearcher>,
}
impl TextSearcherManager {
    /// Register `ts` under `tsid`, replacing any previous entry with that id.
    pub fn add_text_searcher(&mut self, tsid: i32, ts: TextSearcher) {
        self.tss.insert(tsid, ts);
    }

    /// Take the searcher with id `tsid` OUT of the registry (ownership moves
    /// to the caller); errors when the id is unknown.
    #[auto_func_name]
    pub fn get_text_searcher(&mut self, tsid: i32) -> Result<TextSearcher, MoreError> {
        match self.tss.remove(&tsid) {
            Some(ts) => Ok(ts),
            None => Err(m!(__func__, &format!("指定的 TextSearcher={} 无效", tsid), "more")),
        }
    }

    /// Fresh, empty manager (ids start from 1).
    fn new() -> Self {
        Self {
            count: 0,
            tss: HashMap::new(),
        }
    }

    /// Build a ready-to-use searcher from `keywords`, store it, and return
    /// its newly allocated id.
    pub fn new_text_searcher(&mut self, keywords: Vec<(String, Option<String>)>) -> i32 {
        self.count += 1;
        let tsid = self.count;
        let mut searcher = TextSearcher::new();
        for (keyword, name) in keywords {
            searcher.add_keyword(keyword, name);
        }
        searcher.create_blues();
        self.tss.insert(tsid, searcher);
        tsid
    }

    /// Drop the searcher with id `tsid`, if present.
    pub fn remove_text_searcher(&mut self, tsid: i32) {
        self.tss.remove(&tsid);
    }
}
lazy_static! {
    // Process-wide registry of TextSearcher instances, shared behind a mutex.
    static ref TSM: Mutex<TextSearcherManager> = Mutex::new(TextSearcherManager::new());
}