//! The conversion engine: [`ZhConverter`] and its builder [`ZhConverterBuilder`].
use std::collections::HashMap;
use std::iter::IntoIterator;
use std::str::FromStr;
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use once_cell::unsync::Lazy;
use regex::Regex;
use crate::{
pagerules::PageRules,
rule::{Conv, ConvAction, ConvRule},
variant::Variant,
};
// Ref: https://github.com/wikimedia/mediawiki/blob/7bf779524ab1fd8e1d74f79ea4840564d48eea4d/includes/language/LanguageConverter.php#L76
/// Maximum number of inline rules (`-{ ... }-`) that may be open at the same time.
///
/// Once this many rules are open, a further `-{` is not treated as the start of a new nested
/// rule by `ZhConverter::convert_allowing_inline_rules`.
const NESTED_RULE_MAX_DEPTH: usize = 10;
/// A ZhConverter. See also [`ZhConverterBuilder`].
///
/// It pairs an Aho-Corasick automaton, which locates the phrases to replace in a text, with a
/// mapping from each such phrase to its replacement.
pub struct ZhConverter {
// Target variant; handed to inline rules when their display output is written.
variant: Variant,
// Automaton used to find the phrases that should be replaced.
automaton: AhoCorasick,
// Replacement text for every phrase the automaton may report.
mapping: HashMap<String, String>,
}
impl ZhConverter {
/// Create a new converter from a automaton and a mapping.
///
/// It is provided for convenience and not expected to be called directly.
/// [`ZhConverterBuilder`] would take care of these
/// details.
pub fn new(automaton: AhoCorasick, mapping: HashMap<String, String>) -> ZhConverter {
ZhConverter {
variant: Variant::Zh,
automaton,
mapping,
}
}
/// Build a converter out of a sequence of `(from, to)` pairs.
///
/// It uses [`ZhConverterBuilder`] internally: every pair is registered through
/// `add_conv_pair` and the builder is then built.
#[inline(always)]
pub fn from_pairs(pairs: &[(impl AsRef<str>, impl AsRef<str>)]) -> ZhConverter {
    pairs
        .iter()
        .fold(ZhConverterBuilder::new(), |builder, (from, to)| {
            builder.add_conv_pair(from, to)
        })
        .build()
}
/// Convert a text and return the result as a new `String`.
///
/// The result buffer is pre-sized to the byte length of the input.
pub fn convert(&self, text: &str) -> String {
    let mut buffer = String::with_capacity(text.len());
    self.converted(text, &mut buffer);
    buffer
}
/// Same as `convert`, except that the converted text is appended to `output` instead of being
/// returned as a fresh `String`.
///
/// # Panics
/// Panics if the automaton reports a match that has no entry in the mapping.
pub fn converted(&self, text: &str, output: &mut String) {
    // Ref: https://github.dev/rust-lang/regex/blob/5197f21287344d2994f9cf06758a3ea30f5a26c3/src/re_trait.rs#L192
    // Byte offset just past the most recently replaced match.
    let mut cursor = 0;
    for found in self.automaton.find_iter(text) {
        let (start, end) = (found.start(), found.end());
        // Copy the untouched gap between the previous match and this one verbatim.
        if cursor < start {
            output.push_str(&text[cursor..start]);
        }
        let replacement = self.mapping.get(&text[start..end]).unwrap();
        output.push_str(replacement);
        cursor = end;
    }
    // Whatever follows the last match is kept as-is.
    output.push_str(&text[cursor..]);
}
/// Convert a text with inline conv rules (`-{ ... }-`) applied.
///
/// It only processes the display output of inline rules. Mutations to global rules specified
/// via inline rules are just ignored. To activate global rules, build a [`ZhConverterBuilder`]
/// with [`PageRules`](crate::pagerules::PageRules).
///
/// Text outside of rules is converted as by `convert`, except that spans matching
/// `<script>`, `<style>`, `<code>` and `<pre>` elements are copied through untouched.
/// Rules that are never closed are emitted literally, each prefixed by its `-{`.
///
/// The internal implementation intentionally replicates the behavior of
/// [LanguageConverter.php](https://github.com/wikimedia/mediawiki/blob/7bf779524ab1fd8e1d74f79ea4840564d48eea4d/includes/language/LanguageConverter.php#L855)
/// in MediaWiki. But it is not fully compliant with MediaWiki and provides NO PROTECTION
/// against XSS attacks.
///
/// Compared to the plain `convert`, this is known to be much slower due to the inevitable
/// nature of the implementation decision made by MediaWiki.
pub fn convert_allowing_inline_rules(&self, text: &str) -> String {
    // Ref: https://github.com/wikimedia/mediawiki/blob/7bf779524ab1fd8e1d74f79ea4840564d48eea4d/includes/language/LanguageConverter.php#L855
    //  and https://github.com/wikimedia/mediawiki/blob/7bf779524ab1fd8e1d74f79ea4840564d48eea4d/includes/language/LanguageConverter.php#L910
    //  and https://github.com/wikimedia/mediawiki/blob/7bf779524ab1fd8e1d74f79ea4840564d48eea4d/includes/language/LanguageConverter.php#L532
    // TODO: this may degrade to O(n^2)
    let p1 = Lazy::new(|| {
        // start of rule | noHtml | noStyle | no code | no pre
        Regex::new(r#"-\{|<script.*?>.*?</script>|<style.*?>.*?</style>|<code>.*?</code>|<pre.*?>.*?</pre>"#).unwrap()
    });
    // TODO: we need to understand what the hell it is so that to adapt it to compatible syntax
    // $noHtml = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)(*SKIP)(*FAIL)';
    let p2 = Lazy::new(|| Regex::new(r#"-\{|\}-"#).unwrap());

    // Byte offset of the first not-yet-consumed byte of `text`.
    let mut pos = 0;
    let mut converted = String::with_capacity(text.len());
    // Stack of the bodies of the currently open rules; the last element is the innermost one.
    let mut pieces: Vec<String> = vec![];

    while let Some(m1) = p1.find_at(text, pos) {
        // convert anything before the (possible) toplevel -{
        self.converted(&text[pos..m1.start()], &mut converted);
        if m1.as_str() != "-{" {
            // not start of rule, just <foobar></foobar> to exclude
            converted.push_str(m1.as_str()); // kept as-is
            pos = m1.end();
            continue; // i.e. <SKIP><FAIL>
        }

        // found toplevel -{
        pos = m1.end();
        pieces.push(String::new());
        while let Some(m2) = p2.find_at(text, pos) {
            if m2.as_str() == "-{" {
                if pieces.len() >= NESTED_RULE_MAX_DEPTH {
                    // Too many open start tags: do not open another nested rule. Keep the
                    // text up to and including this `-{` as plain content of the innermost
                    // open rule and resume scanning right after the tag, so that nothing is
                    // dropped and `pos` always stays on a match boundary.
                    // NOTE(review): MediaWiki's recursiveConvertRule appears to do the same
                    // (plus emitting a warning, which is not replicated here) -- confirm.
                    pieces
                        .last_mut()
                        .expect("a rule is open while scanning its body")
                        .push_str(&text[pos..m2.end()]);
                    pos = m2.end();
                    continue;
                }
                // start tag: flush the text seen so far into the current rule and open a
                // new, nested one
                pieces
                    .last_mut()
                    .expect("a rule is open while scanning its body")
                    .push_str(&text[pos..m2.start()]);
                pieces.push(String::new()); // e.g. -{ zh: AAA -{
                pos = m2.end();
            } else {
                // end tag: the innermost rule is complete
                let mut piece = pieces
                    .pop()
                    .expect("a rule is open while scanning its body");
                piece.push_str(&text[pos..m2.start()]);
                // The display output goes into the enclosing rule, or into the final output
                // when this was a toplevel rule.
                let upper = pieces.last_mut().unwrap_or(&mut converted);
                // only take its output; mutations to global rules are ignored
                // TODO: decide how rules that fail to parse should be handled; for now the
                // infallible parser is relied upon
                ConvRule::from_str_infallible(&piece)
                    .write_output(upper, self.variant)
                    .unwrap();
                pos = m2.end();
                if pieces.is_empty() {
                    // return to toplevel
                    break;
                }
            }
        }

        // Rules that were never closed: emit them literally, outermost first, so that the
        // output keeps the order of the input. The text after the last tag is handled by the
        // code below since `pos` already points past that tag.
        // TODO: MediaWiki converts the content of unclosed rules; here it is kept raw
        for piece in pieces.drain(..) {
            converted.push_str("-{");
            converted.push_str(&piece);
        }
    }

    if pos < text.len() {
        // no more conv rules, just convert and append directly into the output buffer
        self.converted(&text[pos..], &mut converted);
    }
    converted
}
// TODO: inplace? we need to maintain a stack which could be at most O(n)
// and it requires access to underlying bytes for subtle mutations
// pub fn convert_inplace(&self, text: &mut String) {
// let tbp = VecDeque::<&str>::new(); // to be pushed
// let mut wi = 0; // writing index
// let mut ri = 0; // reading index
// while let Some((s, e)) = self.regex.find_at(text, ri).map(|m| (m.start(), m.end())) {
// while !tbp.is_empty() && s - wi >= tbp[0].len() {
// let raw = unsafe { text.as_bytes_mut() };
// raw[wi..wi + tbp[0].len()].clone_from_slice(tbp[0].as_bytes());
// tbp.pop_front();
// }
// }
// }
}
/// A builder that helps build a `ZhConverter`.
///
/// # Example
/// Build a Zh2CN converter with some additional rules.
/// ```
/// use zhconv::{zhconv, ZhConverterBuilder, Variant, tables::ZH_HANS_CN_TABLE};
/// // extracted from https://zh.wikipedia.org/wiki/Template:CGroup/Template:CGroup/文學.
/// let conv_lines = r"zh-hans:三个火枪手;zh-hant:三劍客;zh-tw:三劍客;
///                    zh-cn:雾都孤儿;zh-tw:孤雛淚;zh-hk:苦海孤雛;zh-sg:雾都孤儿;zh-mo:苦海孤雛;";
/// let converter = ZhConverterBuilder::new()
///                     .target(Variant::ZhCN)
///                     .table(*ZH_HANS_CN_TABLE)
///                     .dfa(true) // dfa enabled: slower build, faster conversion
///                     .conv_lines(conv_lines)
///                     .build();
/// let original = "《三劍客》是亞歷山大·仲馬的作品。《孤雛淚》是查爾斯·狄更斯的作品。";
/// assert_eq!(converter.convert(original), "《三个火枪手》是亚历山大·仲马的作品。《雾都孤儿》是查尔斯·狄更斯的作品。");
/// assert_eq!(zhconv(original, Variant::ZhCN), "《三剑客》是亚历山大·仲马的作品。《孤雏泪》是查尔斯·狄更斯的作品。")
/// ```
#[derive(Debug, Clone, Default)]
pub struct ZhConverterBuilder<'t> {
/// The target variant, used to pick the applicable conv pairs out of rules
target: Variant,
/// The base conversion tables, each a pair of `|`-separated source and target lists
tables: Vec<(&'t str, &'t str)>,
/// Rules to be added, from page rules or cgroups
adds: HashMap<String, String>,
/// Rules to be removed, from page rules or cgroups; only the keys are consulted when building
removes: HashMap<String, String>, // TODO: unnecessary owned type
/// Whether the DFA feature of Aho-Corasick is enabled for the built automaton
dfa: bool,
}
impl<'t> ZhConverterBuilder<'t> {
pub fn new() -> Self {
Default::default()
}
/// Set the target Chinese variant to convert to.
///
/// The target variant is only useful to get proper conv pairs from
/// [`ConvRule`](crate::rule::ConvRule)s. That is, if only tables are specified, the target
/// variant would be useless.
pub fn target(mut self, variant: Variant) -> Self {
self.target = variant;
self
}
/// Append a conversion table, typically one of those in [`tables`](crate::tables).
///
/// Tables are kept in the order they were added.
pub fn table(self, table: (&'t str, &'t str)) -> Self {
    let mut builder = self;
    builder.tables.push(table);
    builder
}
// [CGroup](https://zh.wikipedia.org/wiki/Module:CGroup) (a.k.a 公共轉換組)
/// Add the set of rules extracted from the text of a page.
///
/// This is a helper wrapper around `page_rules`: the text is parsed into `PageRules` first.
///
/// # Panics
/// Panics if parsing the page rules fails, which is not expected to happen for now.
#[inline(always)]
pub fn rules_from_page(self, text: &str) -> Self {
    let parsed = PageRules::from_str(text).expect("Page rules parsing is infallible for now");
    self.page_rules(&parsed)
}
/// Add the conv actions carried by a `PageRules`.
#[inline(always)]
pub fn page_rules(self, page_rules: &PageRules) -> Self {
    let actions = page_rules.as_conv_actions();
    self.conv_actions(actions)
}
/// Add a set of rules.
///
/// Each action contributes the conv pairs applicable to the current target variant, either
/// to the rules to be added or to the rules to be removed, depending on the action.
///
/// These rules take the higher precedence over those specified via `table`.
fn conv_actions<'i>(mut self, conv_actions: impl IntoIterator<Item = &'i ConvAction>) -> Self {
    let target = self.target;
    for action in conv_actions {
        let pairs = action.as_conv().get_conv_pairs(target);
        // Pick the destination once instead of duplicating the insertion code.
        let bucket = if action.adds() {
            &mut self.adds
        } else {
            &mut self.removes
        };
        bucket.extend(
            pairs
                .iter()
                .map(|&(from, to)| (from.to_owned(), to.to_owned())),
        );
    }
    self
}
/// Add a [`Conv`].
///
/// Only the conv pairs applicable to the current target variant are registered.
///
/// For general cases, check [`add_conv_pair`](#method.add_conv_pair) which takes a plain
/// `from -> to` pair.
pub fn add_conv(mut self, conv: Conv) -> Self {
    for &(from, to) in conv.get_conv_pairs(self.target).iter() {
        self.adds.insert(from.to_owned(), to.to_owned());
    }
    self
}
/// Mark a conv as removed.
///
/// Only the conv pairs applicable to the current target variant are recorded.
pub fn remove_conv(mut self, conv: Conv) -> Self {
    for &(from, to) in conv.get_conv_pairs(self.target).iter() {
        self.removes.insert(from.to_owned(), to.to_owned());
    }
    self
}
/// Add a single `from -> to` conversion pair.
///
/// It takes the precedence over those specified via `table`. It shares the same precedence
/// level with those specified via `cgroup`.
///
/// # Panics
/// Panics if `from` is empty.
pub fn add_conv_pair(mut self, from: impl AsRef<str>, to: impl AsRef<str>) -> Self {
    let (source, replacement) = (from.as_ref(), to.as_ref());
    assert!(
        !source.is_empty(),
        "Conv pair should have non-empty from."
    );
    self.adds.insert(source.to_owned(), replacement.to_owned());
    self
}
/// Mark a single conversion pair as removed.
///
/// Any rule with the same `from`, whether specified via `add_conv_pair`, `conv_lines` or
/// `table`, is removed.
pub fn remove_conv_pair(mut self, from: impl AsRef<str>, to: impl AsRef<str>) -> Self {
    let key = from.as_ref().to_owned();
    let value = to.as_ref().to_owned();
    self.removes.insert(key, value);
    self
}
/// Add a text of conv lines, one conv per line.
///
/// Lines are trimmed; blank lines and lines that fail to parse as a conv are skipped.
///
/// e.g.
/// ```text
/// zh-cn:天堂执法者; zh-hk:夏威夷探案; zh-tw:檀島警騎2.0;
/// zh-cn:史蒂芬·'史蒂夫'·麦格瑞特; zh-tw:史提夫·麥加雷; zh-hk:麥星帆;
/// zh-cn:丹尼尔·'丹尼/丹诺'·威廉姆斯; zh-tw:丹尼·威廉斯; zh-hk:韋丹尼;
/// ```
///
/// # Panics
/// Panics if a parsed conv yields a pair with an empty `from`.
pub fn conv_lines(mut self, lines: &str) -> Self {
    let target = self.target;
    let convs = lines
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .filter_map(|line| Conv::from_str(line).ok());
    for conv in convs {
        for &(from, to) in conv.get_conv_pairs(target).iter() {
            assert!(!from.is_empty(), "Conv pair should have non-empty from.");
            self.adds.insert(from.to_owned(), to.to_owned());
        }
    }
    self
}
/// Set whether to activate the feature DFA of Aho-Corasick.
///
/// With DFA enabled, it takes rougly 5x time to build the converter while the conversion
/// speed is < 2x faster. All built-in converters have this feature enabled for better
/// conversion performance.
pub fn dfa(mut self, enabled: bool) -> Self {
self.dfa = enabled;
self
}
/// Do the build.
///
/// It aggregates the previously specified tables, rules and pairs into a mapping, builds a
/// leftmost-longest Aho-Corasick automaton over the keys of that mapping, and feeds both
/// into the new converter.
///
/// Entries are applied in this order: table entries first, then added rules/pairs, which
/// overwrite table entries with the same `from`. Any entry whose `from` has been marked as
/// removed is left out, and so is any entry with an empty `from`, since an empty pattern
/// would make the automaton match at every position.
pub fn build(&self) -> ZhConverter {
    let Self {
        target,
        tables,
        dfa,
        adds,
        removes,
    } = self;

    // Size the map by the number of `|`-separated entries rather than by the byte length of
    // the table strings, which would over-allocate by a large factor.
    let table_entries: usize = tables
        .iter()
        .map(|(froms, _tos)| froms.trim().split('|').count())
        .sum();
    let mut mapping =
        HashMap::with_capacity((table_entries + adds.len()).saturating_sub(removes.len()));

    mapping.extend(
        tables
            .iter()
            .flat_map(|(froms, tos)| froms.trim().split('|').zip(tos.trim().split('|')))
            // an empty `from` would trouble AC, whatever its `to` is
            .filter(|&(from, _to)| !from.is_empty())
            .filter(|&(from, _to)| !removes.contains_key(from))
            .map(|(from, to)| (from.to_owned(), to.to_owned())),
    );
    mapping.extend(
        adds.iter()
            // rules may reach `adds` without an emptiness check, so guard here as well
            .filter(|(from, _to)| !from.is_empty() && !removes.contains_key(from.as_str()))
            .map(|(from, to)| (from.clone(), to.clone())),
    );

    let sequence = mapping.keys();
    let automaton = AhoCorasickBuilder::new()
        .match_kind(MatchKind::LeftmostLongest)
        .dfa(*dfa)
        .build(sequence);
    ZhConverter {
        variant: *target,
        mapping,
        automaton,
    }
}
}