pragmatic_segmenter/lib.rs
1//! Rust port of [pySBD] v3.1.0 and Ruby [pragmatic_segmenter]. **[Documentations]**
2//!
//! rust-pragmatic-segmenter is a rule-based sentence boundary detection (SBD) library. It relies
//! on a large set of regular expressions to split text into sentences.
5//!
6//! ```rust
7//! use pragmatic_segmenter::Segmenter;
8//!
9//! let segmenter = Segmenter::new()?;
10//! let result: Vec<_> = segmenter.segment("Hi Mr. Kim. Let's meet at 3 P.M.").collect();
11//! println!("{:?}", result); // ["Hi Mr. Kim. ", "Let\'s meet at 3 P.M."]
12//! # Ok::<(), Box<dyn std::error::Error>>(())
13//! ```
14//!
15//! [pySBD]: https://github.com/nipunsadvilkar/pySBD
16//! [pragmatic_segmenter]: https://github.com/diasks2/pragmatic_segmenter
17//! [Documentations]: https://docs.rs/pragmatic-segmenter
18
19mod abbreviation_replacer;
20mod list_item_replacer;
21mod rule;
22mod util;
23
24use std::borrow::Cow;
25use std::error::Error;
26use std::iter::Iterator;
27
28use onig::{Captures, Regex};
29
30use abbreviation_replacer::AbbreviationReplacer;
31use list_item_replacer::ListItemReplacer;
32use rule::Rule;
33use util::re;
34
35const PUNCTUATIONS: [char; 7] = ['。', '.', '.', '!', '!', '?', '?'];
36
37/// Segmenter type. It stores the compilation results of regular expressions used internally by
38/// pragmatic-segmenter in memory.
39///
40/// ```rust
41/// use pragmatic_segmenter::Segmenter;
42///
43/// let segmenter = Segmenter::new()?;
44/// let result: Vec<_> = segmenter.segment("Hi Mr. Kim. Let's meet at 3 P.M.").collect();
45/// assert_eq!(result, vec!["Hi Mr. Kim. ", "Let's meet at 3 P.M."]);
46/// # Ok::<(), Box<dyn std::error::Error>>(())
47/// ```
48pub struct Segmenter {
49 list_item_replacer: ListItemReplacer,
50 abbreviation_replacer: AbbreviationReplacer,
51
52 number_rules: [Rule; 5],
53 continuous_punctuation_regex: Regex,
54 numbered_reference: Rule,
55 abbreviation_with_multiple_periods_and_email_regex: regex::Regex,
56 misc_rules: [Rule; 2],
57
58 parens_between_double_quotes_regex: Regex,
59 parens_between_double_quotes_0: Rule,
60 parens_between_double_quotes_1: Rule,
61
62 ellipsis_rules: [Rule; 5],
63
64 exclamation_regex: Regex,
65 sub_escaped_regex_reserved_characters: [Rule; 5],
66
67 word_with_leading_apostrophe: Regex,
68 trailing_apostrophe: Regex,
69 between_single_quotes_regex: Regex,
70 between_single_quote_slanted_regex: Regex,
71 between_double_quotes_regex_2: Regex,
72 between_square_brackets_regex_2: Regex,
73 between_parens_regex_2: Regex,
74 between_quote_arrow_regex_2: Regex,
75 between_em_dashes_regex_2: Regex,
76 between_quote_slanted_regex_2: Regex,
77
78 double_punctuation: Regex,
79 question_mark_in_quotation_and_exclamation_point_rules: [Rule; 4],
80
81 replace_parens: Rule,
82
83 sentence_boundary_regex: Regex,
84 post_process_regex: Regex,
85 quotation_at_end_of_sentence_regex: Regex,
86 split_space_quotation_at_end_of_sentence_regex: Regex,
87}
88
89impl Segmenter {
90 /// Create a new Segmenter instance. The regular expressions used internally by
91 /// pragmatic-segmenter are compiled here.
92 ///
93 /// ```rust
94 /// use pragmatic_segmenter::Segmenter;
95 ///
96 /// let segmenter = Segmenter::new()?;
97 /// # Ok::<(), Box<dyn std::error::Error>>(())
98 /// ```
99 pub fn new() -> Result<Self, Box<dyn Error>> {
100 Ok(Segmenter {
101 list_item_replacer: ListItemReplacer::new()?,
102 abbreviation_replacer: AbbreviationReplacer::new()?,
103
104 number_rules: [
105 // PeriodBeforeNumberRule
106 // Example: https://rubular.com/r/oNyxBOqbyy
107 Rule::new(r"\.(?=\d)", "∯")?,
108 // NumberAfterPeriodBeforeLetterRule
109 // Example: https://rubular.com/r/EMk5MpiUzt
110 Rule::new(r"(?<=\d)\.(?=\S)", "∯")?,
111 // NewLineNumberPeriodSpaceLetterRule
112 // Example: https://rubular.com/r/rf4l1HjtjG
113 Rule::new(r"(?<=\r\d)\.(?=(\s\S)|\))", "∯")?,
114 // StartLineNumberPeriodRule
115 // Example: https://rubular.com/r/HPa4sdc6b9
116 Rule::new(r"(?<=^\d)\.(?=(\s\S)|\))", "∯")?,
117 // StartLineTwoDigitNumberPeriodRule
118 // Example: https://rubular.com/r/NuvWnKleFl
119 Rule::new(r"(?<=^\d\d)\.(?=(\s\S)|\))", "∯")?,
120 ],
121
122 // Example: https://rubular.com/r/mQ8Es9bxtk
123 continuous_punctuation_regex: re(r"(?<=\S)(!|\?){3,}(?=(\s|\Z|$))")?,
124
125 // Example: https://rubular.com/r/UkumQaILKbkeyc
126 numbered_reference: Rule::new(
127 r"(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])",
128 r"∯\2\r\7",
129 )?,
130
131 // English.Abbreviation.WithMultiplePeriodsAndEmailRule,
132 //
133 // NOTE: pySBD와 루비 구현체가 다른 정규표현식을 쓴다. pySBD의 동작을 따라간다.
134 //
135 // Example: https://rubular.com/r/EUbZCNfgei
136 abbreviation_with_multiple_periods_and_email_regex: regex::Regex::new(
137 r"([a-zA-Z0-9_])(?:\.)([a-zA-Z0-9_])",
138 )?,
139
140 misc_rules: [
141 // English.GeoLocationRule,
142 Rule::new(r"(?<=[a-zA-z]°)\.(?=\s*\d+)", "∯")?,
143 // English.FileFormatRule,
144 Rule::new(
145 r"(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)",
146 "∯",
147 )?,
148 ],
149
150 // Example: https://rubular.com/r/6flGnUMEVl
151 parens_between_double_quotes_regex: re(r#"["\”]\s\(.*\)\s["\“]"#)?,
152 parens_between_double_quotes_0: Rule::new(r"\s(?=\()", "\r")?,
153 parens_between_double_quotes_1: Rule::new(r"(?<=\))\s", "\r")?,
154
155 // NOTE: 이부분은 pySBD 구현과 루비 구현이 동작이 다르다. pySBD의 동작을 따른다.
156 // 이 부분을 고치게 되면 ReinsertEllipsisRules도 함께 고쳐야한다.
157 ellipsis_rules: [
158 // ThreeSpaceRule
159 // Example: https://rubular.com/r/YBG1dIHTRu
160 Rule::new(r"(\s\.){3}\s", "♟♟♟♟♟♟♟")?,
161 // FourSpaceRule
162 // Example: https://rubular.com/r/2VvZ8wRbd8
163 Rule::new(r"(?<=[a-z])(\.\s){3}\.($|\\n)", "♝♝♝♝♝♝♝")?,
164 // FourConsecutiveRule
165 // Example: https://rubular.com/r/Hdqpd90owl
166 Rule::new(r"(?<=\S)\.{3}(?=\.\s[A-Z])", "ƪƪƪ")?,
167 // ThreeConsecutiveRule
168 // Example: https://rubular.com/r/i60hCK81fz
169 Rule::new(r"\.\.\.(?=\s+[A-Z])", "☏☏.")?,
170 // OtherThreePeriodRule
171 Rule::new(r"\.\.\.", "ƪƪƪ")?,
172 ],
173
174 exclamation_regex: re(
175 r"!Xũ|!Kung|ǃʼOǃKung|!Xuun|!Kung\-Ekoka|ǃHu|ǃKhung|ǃKu|ǃung|ǃXo|ǃXû|ǃXung|ǃXũ|!Xun|Yahoo!|Y!J|Yum!",
176 )?,
177
178 // NOTE: pySBD에 구현 실수가 있어 루비 구현체와 동작이 전혀 다르지만, pySBD의 동작을
179 // 따르기 위해 버그를 유지하겠다.
180 sub_escaped_regex_reserved_characters: [
181 // SubLeftParen
182 Rule::new(r"\\\(", "(")?,
183 // SubRightParen
184 Rule::new(r"\\\)", ")")?,
185 // SubLeftBracket
186 Rule::new(r"\\\[", "[")?,
187 // SubRightBracket
188 Rule::new(r"\\\]", "]")?,
189 // SubDash
190 Rule::new(r"\\\-", "-")?,
191 ],
192
193 // Example: https://rubular.com/r/mXf8cW025o
194 word_with_leading_apostrophe: re(r"(?<=\s)'(?:[^']|'[a-zA-Z])*'\S")?,
195
196 trailing_apostrophe: re(r"'\s")?,
197
198 // Example: https://rubular.com/r/2YFrKWQUYi
199 between_single_quotes_regex: re(r"(?<=\s)'(?:[^']|'[a-zA-Z])*'")?,
200
201 between_single_quote_slanted_regex: re(r"(?<=\s)‘(?:[^’]|’[a-zA-Z])*’")?,
202
203 // Example: https://regex101.com/r/r6I1bW/1
204 //
205 // NOTE: pySBD에선 파이썬 regex의 기능 한계로 인해 원본인 루비 pragmatic_segmenter와
206 // 동작이 다른데, 우리는 Oniguruma regex engine을 쓰고있으므로 루비 구현을 재현할 수
207 // 있다. 그러나 pySBD와 동작을 맞추기 위해 의도적으로 pySBD 정규표현식을 사용한다.
208 //
209 // NOTE: Python regex와 Oniguruma regex는 named capture group과 backreference 문법이
210 // 다르다. 주의
211 //
212 // Reference: https://stackoverflow.com/a/13577411/13977061
213 between_double_quotes_regex_2: re(r#""(?=(?<tmp>[^\"\\]+|\\{2}|\\.)*)\k<tmp>""#)?,
214 between_square_brackets_regex_2: re(r#"\[(?=(?<tmp>[^\]\\]+|\\{2}|\\.)*)\k<tmp>\]"#)?,
215 between_parens_regex_2: re(r"\((?=(?<tmp>[^\(\)\\]+|\\{2}|\\.)*)\k<tmp>\)")?,
216 between_quote_arrow_regex_2: re(r"\«(?=(?<tmp>[^»\\]+|\\{2}|\\.)*)\k<tmp>\»")?,
217 between_em_dashes_regex_2: re(r"--(?=(?<tmp>[^--]*))\k<tmp>--")?,
218 between_quote_slanted_regex_2: re(r"\“(?=(?<tmp>[^”\\]+|\\{2}|\\.)*)\k<tmp>\”")?,
219
220 double_punctuation: re(r"^(?:\?!|!\?|\?\?|!!)")?,
221 question_mark_in_quotation_and_exclamation_point_rules: [
222 // QuestionMarkInQuotationRule
223 // Example: https://rubular.com/r/aXPUGm6fQh
224 Rule::new(r#"\?(?=(\'|\"))"#, "&ᓷ&")?,
225 // InQuotationRule
226 // Example: https://rubular.com/r/XS1XXFRfM2
227 Rule::new(r#"\!(?=(\'|\"))"#, "&ᓴ&")?,
228 // BeforeCommaMidSentenceRule
229 // Example: https://rubular.com/r/sl57YI8LkA
230 Rule::new(r"\!(?=\,\s[a-z])", "&ᓴ&")?,
231 // MidSentenceRule
232 // Example: https://rubular.com/r/f9zTjmkIPb
233 Rule::new(r"\!(?=\s[a-z])", "&ᓴ&")?,
234 ],
235
236 // Example: https://rubular.com/r/GcnmQt4a3I
237 replace_parens: Rule::new(
238 // ROMAN_NUMERALS_IN_PARENTHESES
239 r"\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])",
240 r"&✂&\1&⌬&",
241 )?,
242
243 // added special case: r"[。..!!?].*" to handle intermittent dots, exclamation, etc.
244 sentence_boundary_regex: re(
245 r#"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!??].*|\S.*?[。..!!??ȸȹ☉☈☇☄]"#,
246 )?,
247 post_process_regex: re(r"\A[a-zA-Z]*\Z")?,
248 // Example: https://rubular.com/r/NqCqv372Ix
249 quotation_at_end_of_sentence_regex: re(r#"[!?\.-][\"\'“”]\s{1}[A-Z]"#)?,
250 // Example: https://rubular.com/r/JMjlZHAT4g
251 split_space_quotation_at_end_of_sentence_regex: re(
252 r#"(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])"#,
253 )?,
254 })
255 }
256
257 /// Separate sentences from given input. Although it is a function that returns an Iterator,
258 /// not all processing is done by streaming. After pre-processing the entire input once,
259 /// processing is performed for each sentence by streaming.
260 ///
261 /// ```rust
262 /// use pragmatic_segmenter::Segmenter;
263 ///
264 /// let segmenter = Segmenter::new()?;
265 /// let mut iter = segmenter.segment("Hi Mr. Kim. Let's meet at 3 P.M.");
266 ///
267 /// assert_eq!(iter.next(), Some("Hi Mr. Kim. "));
268 /// assert_eq!(iter.next(), Some("Let's meet at 3 P.M."));
269 /// assert_eq!(iter.next(), None);
270 /// # Ok::<(), Box<dyn std::error::Error>>(())
271 /// ```
272 pub fn segment<'a>(&'a self, original_input: &'a str) -> impl Iterator<Item = &'a str> {
273 // NOTE: 루비 버전에는 이런 처리가 없으나, pySBD 3.1.0에 이 처리가 들어갔다. pySBD와 동작을
274 // 맞추기위해 동일하게 처리해준다.
275 let text = original_input.replace('\n', "\r");
276
277 let text = self.list_item_replacer.add_line_break(&text);
278
279 // replace_abbreviations()
280 let mut text = self.abbreviation_replacer.replace(&text);
281
282 // replace_numbers()
283 for rule in &self.number_rules {
284 text = rule.replace_all(&text);
285 }
286
287 // replace_continuous_punctuation()
288 let text = self
289 .continuous_punctuation_regex
290 .replace_all(&text, |c: &Captures| {
291 let mat = c.at(0).unwrap(); // Must exists
292 mat.replace('!', "&ᓴ&").replace('?', "&ᓷ&")
293 });
294
295 // replace_periods_before_numeric_references()
296 //
297 // Reference:
298 // https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a35
299 let text = self.numbered_reference.replace_all(&text);
300
301 let mut text = self
302 .abbreviation_with_multiple_periods_and_email_regex
303 .replace_all(&text, "$1∮$2");
304 for rule in &self.misc_rules {
305 text = Cow::Owned(rule.replace_all(&text));
306 }
307
308 //
309 // split_into_segments()
310 //
311
312 // check_for_parens_between_quotes()
313 let text = self
314 .parens_between_double_quotes_regex
315 .replace_all(&text, |c: &Captures| {
316 let mat = c.at(0).unwrap(); // Must exists
317 let mat = self.parens_between_double_quotes_0.replace_all(mat);
318 self.parens_between_double_quotes_1.replace_all(&mat)
319 });
320
321 let mut prior_start_char_idx = 0;
322
323 // TODO: flat_map() 에서 임시 Vec, String 할당 줄이기
324 text.split('\r')
325 .filter(|s| !s.is_empty())
326 .map(|s| s.to_string())
327 .collect::<Vec<_>>() // String을 own하는 버전의 새 split 함수를 만들면 이부분을 제거할 수 있음
328 .into_iter()
329 .flat_map(move |sent| {
330 // English.SingleNewLineRule
331 let mut sent = sent.replace(r"\n", "ȹ");
332 // English.EllipsisRules.All
333 for rule in &self.ellipsis_rules {
334 sent = rule.replace_all(&sent);
335 }
336 // check_for_punctuation()
337 if PUNCTUATIONS.iter().any(|&p| sent.contains(p)) {
338 // process_text()
339 if !sent.ends_with(&PUNCTUATIONS[..]) {
340 sent += "ȸ";
341 }
342
343 // ExclamationWords.apply_rules()
344 sent = self
345 .exclamation_regex
346 .replace_all(&sent, self.replace_punctuation(false));
347
348 // between_punctuation()
349 if self.word_with_leading_apostrophe.find(&sent).is_none()
350 || self.trailing_apostrophe.find(&sent).is_some()
351 {
352 sent = self
353 .between_single_quotes_regex
354 .replace_all(&sent, self.replace_punctuation(true));
355 }
356 sent = self
357 .between_single_quote_slanted_regex
358 .replace_all(&sent, self.replace_punctuation(false));
359 sent = self
360 .between_double_quotes_regex_2
361 .replace_all(&sent, self.replace_punctuation(false));
362 sent = self
363 .between_square_brackets_regex_2
364 .replace_all(&sent, self.replace_punctuation(false));
365 sent = self
366 .between_parens_regex_2
367 .replace_all(&sent, self.replace_punctuation(false));
368 sent = self
369 .between_quote_arrow_regex_2
370 .replace_all(&sent, self.replace_punctuation(false));
371 sent = self
372 .between_em_dashes_regex_2
373 .replace_all(&sent, self.replace_punctuation(false));
374 sent = self
375 .between_quote_slanted_regex_2
376 .replace_all(&sent, self.replace_punctuation(false));
377
378 // handle text having only doublepunctuations
379 if self.double_punctuation.find(&sent).is_none() {
380 sent = sent
381 .replace(r"?!", "☉")
382 .replace(r"!?", "☈")
383 .replace(r"??", "☇")
384 .replace(r"!!", "☄");
385 }
386 for rule in &self.question_mark_in_quotation_and_exclamation_point_rules {
387 sent = rule.replace_all(&sent);
388 }
389
390 // ListItemReplacer(sent).replace_parens()
391 sent = self.replace_parens.replace_all(&sent);
392
393 // sentence_boundary_punctuation()
394 // retain exclamation mark if it is an ending character of a given text
395 sent = sent.replace(r"&ᓴ&$", "!");
396 self.sentence_boundary_regex
397 .find_iter(&sent)
398 .map(|r| sent[r.0..r.1].to_string())
399 .collect::<Vec<_>>()
400 } else {
401 vec![sent]
402 }
403 })
404 .flat_map(move |mut sent| {
405 // SubSymbolsRules
406 sent = sent
407 .replace('∯', ".")
408 .replace('♬', "،")
409 .replace('♭', ":")
410 .replace(r"&ᓰ&", "。")
411 .replace(r"&ᓱ&", ".")
412 .replace(r"&ᓳ&", "!")
413 .replace(r"&ᓴ&", "!")
414 .replace(r"&ᓷ&", "?")
415 .replace(r"&ᓸ&", "?")
416 .replace('☉', "?!")
417 .replace('☇', "??")
418 .replace('☈', "!?")
419 .replace('☄', "!!")
420 .replace(r"&✂&", "(")
421 .replace(r"&⌬&", ")")
422 .replace('ȸ', "")
423 .replace('ȹ', "\n");
424
425 // post_process_segments()
426 //
427 // NOTE: post_process_segments 함수는 pySBD와 루비 pragmatic_segmenter의 동작이 전혀
428 // 다르다. pySBD를 따라간다.
429 if sent.len() > 2 && self.post_process_regex.find(&sent).is_some() {
430 return vec![sent];
431 }
432
433 // ReinsertEllipsisRules
434 // NOTE: 이부분은 pySBD 구현과 루비 구현이 동작이 다르다. pySBD의 동작을 따른다.
435 sent = sent
436 .replace(r"ƪƪƪ", "...")
437 .replace(r"♟♟♟♟♟♟♟", " . . . ")
438 .replace(r"♝♝♝♝♝♝♝", ". . . .")
439 .replace(r"☏☏", "..")
440 .replace('∮', ".");
441
442 if self
443 .quotation_at_end_of_sentence_regex
444 .find(&sent)
445 .is_some()
446 {
447 self.split_space_quotation_at_end_of_sentence_regex
448 .split(&sent)
449 .map(|s| s.to_string())
450 .collect()
451 } else {
452 vec![sent.replace('\n', "").trim().to_string()]
453 }
454 })
455 .map(|sent| sent.replace(r"&⎋&", "'"))
456 // NOTE: pySBD에만 이하의 처리가 존재하고, 원본 루비코드에는 이런 동작이 없다. 일단
457 // 동작을 맞추기 위해 동일한 처리를 해주지만, 아래 코드때문에 성능손실이 크다.
458 .flat_map(move |sent| -> Vec<_> {
459 // since SENTENCE_BOUNDARY_REGEX doesnt account
460 // for trailing whitespaces \s* & is used as suffix
461 // to keep non-destructive text after segments joins
462
463 // NOTE: escape 한 뒤 compile했기 때문에, 실패의 여지가 없다.
464 let re = regex::Regex::new(&format!(r"{}\s*", regex::escape(&sent))).unwrap();
465 re.find_iter(original_input)
466 .filter_map(|mat| {
467 let match_str = mat.as_str();
468 let match_start_idx = mat.start();
469 if match_start_idx >= prior_start_char_idx {
470 prior_start_char_idx = match_start_idx;
471 Some(match_str)
472 // making sure if curren sentence and its span
473 // is either first sentence along with its char spans
474 // or current sent spans adjacent to prior sentence spans
475 } else {
476 None
477 }
478 })
479 .collect()
480 })
481 }
482
483 fn replace_punctuation(&self, is_match_type_single: bool) -> impl Fn(&Captures) -> String + '_ {
484 move |c: &Captures| {
485 let mat = c.at(0).unwrap(); // Must exists
486
487 // NOTE: 원래 이 자리에서 EscapeRegexReservedCharacters.All 규칙이 적용되어야
488 // 하나, pySBD의 구현 버그로 인해 EscapeRegexReservedCharacters.All가 아무일도
489 // 하지 않는다. 버그이지만, pySBD의 동작을 따라가기위해 버그를 유지하겠다.
490
491 let mut mat = mat.replace('.', "∯");
492 mat = mat.replace('。', "&ᓰ&");
493 mat = mat.replace('.', "&ᓱ&");
494 mat = mat.replace('!', "&ᓳ&");
495 mat = mat.replace('!', "&ᓴ&");
496 mat = mat.replace('?', "&ᓷ&");
497 mat = mat.replace('?', "&ᓸ&");
498 if !is_match_type_single {
499 mat = mat.replace('\'', "&⎋&");
500 }
501 for rule in &self.sub_escaped_regex_reserved_characters {
502 mat = rule.replace_all(&mat);
503 }
504 mat
505 }
506 }
507}
508
#[cfg(test)]
mod tests {
    use super::Segmenter;

    /// Result type shared by the tests so that `?` can be used on fallible setup.
    type TestResult = Result<(), Box<dyn std::error::Error>>;

    /// Constructing a `Segmenter` compiles every built-in pattern; it must succeed.
    #[test]
    fn regex_should_be_compiled() -> TestResult {
        Segmenter::new().map(|_| ())
    }

    /// Empty input yields no sentences at all.
    #[test]
    fn empty_string() -> TestResult {
        let segmenter = Segmenter::new()?;
        let sentences: Vec<_> = segmenter.segment("").collect();

        let nothing: [String; 0] = [];
        assert_eq!(sentences, nothing);
        Ok(())
    }
}