pragmatic_segmenter/
lib.rs

1//! Rust port of [pySBD] v3.1.0 and Ruby [pragmatic_segmenter]. **[Documentations]**
2//!
//! rust-pragmatic-segmenter is a rule-based sentence boundary detection (SBD) library. It relies
//! on a large set of regular expressions to split text into sentences.
5//!
6//! ```rust
7//! use pragmatic_segmenter::Segmenter;
8//!
9//! let segmenter = Segmenter::new()?;
10//! let result: Vec<_> = segmenter.segment("Hi Mr. Kim. Let's meet at 3 P.M.").collect();
11//! println!("{:?}", result); // ["Hi Mr. Kim. ", "Let\'s meet at 3 P.M."]
12//! # Ok::<(), Box<dyn std::error::Error>>(())
13//! ```
14//!
15//! [pySBD]: https://github.com/nipunsadvilkar/pySBD
16//! [pragmatic_segmenter]: https://github.com/diasks2/pragmatic_segmenter
17//! [Documentations]: https://docs.rs/pragmatic-segmenter
18
19mod abbreviation_replacer;
20mod list_item_replacer;
21mod rule;
22mod util;
23
24use std::borrow::Cow;
25use std::error::Error;
26use std::iter::Iterator;
27
28use onig::{Captures, Regex};
29
30use abbreviation_replacer::AbbreviationReplacer;
31use list_item_replacer::ListItemReplacer;
32use rule::Rule;
33use util::re;
34
// Characters that `Segmenter::segment` treats as sentence punctuation: a chunk is only run
// through the boundary-detection steps when it contains at least one of them, and a chunk that
// does not end with one of them gets a `ȸ` sentinel appended before further processing.
//
// NOTE(review): '.', '!' and '?' each appear twice as identical code points here; presumably one
// of each pair is meant to be a different (e.g. full-width) form — confirm the file's encoding.
const PUNCTUATIONS: [char; 7] = ['。', '.', '.', '!', '!', '?', '?'];
36
/// Segmenter type. It stores the compilation results of regular expressions used internally by
/// pragmatic-segmenter in memory.
///
/// All fields are private; they are built once in [`Segmenter::new`] and only read by
/// [`Segmenter::segment`], so a single instance can be reused for many inputs.
///
/// ```rust
/// use pragmatic_segmenter::Segmenter;
///
/// let segmenter = Segmenter::new()?;
/// let result: Vec<_> = segmenter.segment("Hi Mr. Kim. Let's meet at 3 P.M.").collect();
/// assert_eq!(result, vec!["Hi Mr. Kim. ", "Let's meet at 3 P.M."]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub struct Segmenter {
    // First pre-processing step: `add_line_break` is called on the newline-normalised input.
    list_item_replacer: ListItemReplacer,
    // Second pre-processing step: `replace` is called on the output of the list-item step.
    abbreviation_replacer: AbbreviationReplacer,

    // Rules, applied in order, that replace a period next to digits with `∯`.
    number_rules: [Rule; 5],
    // Matches runs of three or more `!`/`?`; each mark in a match is replaced by a placeholder.
    continuous_punctuation_regex: Regex,
    // Rule for a period followed by a numeric reference.
    numbered_reference: Rule,
    // `regex`-crate pattern for a period between two word characters; the period becomes `∮`.
    abbreviation_with_multiple_periods_and_email_regex: regex::Regex,
    // Geo-location and file-format period rules, applied in order.
    misc_rules: [Rule; 2],

    // Finds a parenthesised span between double quotes; inside each match the two rules below
    // turn the whitespace before `(` and after `)` into `\r`.
    parens_between_double_quotes_regex: Regex,
    parens_between_double_quotes_0: Rule,
    parens_between_double_quotes_1: Rule,

    // Rules, applied in order, that replace the various ellipsis forms with placeholder symbols.
    ellipsis_rules: [Rule; 5],

    // Words that contain an exclamation mark (e.g. `Yahoo!`); their punctuation is masked.
    exclamation_regex: Regex,
    // Rules run at the end of `replace_punctuation` on every masked span.
    sub_escaped_regex_reserved_characters: [Rule; 5],

    // The first two patterns decide whether `between_single_quotes_regex` is applied at all; the
    // remaining `between_*` patterns select delimited spans whose inner punctuation is masked.
    word_with_leading_apostrophe: Regex,
    trailing_apostrophe: Regex,
    between_single_quotes_regex: Regex,
    between_single_quote_slanted_regex: Regex,
    between_double_quotes_regex_2: Regex,
    between_square_brackets_regex_2: Regex,
    between_parens_regex_2: Regex,
    between_quote_arrow_regex_2: Regex,
    between_em_dashes_regex_2: Regex,
    between_quote_slanted_regex_2: Regex,

    // Matches a chunk that starts with `?!`, `!?`, `??` or `!!`; when it does not match, those
    // pairs are collapsed into single placeholder symbols.
    double_punctuation: Regex,
    // Rules, applied in order, that mask `?`/`!` before a quote, before `, x`, or mid-sentence.
    question_mark_in_quotation_and_exclamation_point_rules: [Rule; 4],

    // Masks the parentheses around a lower-case roman numeral that precedes a capital letter.
    replace_parens: Rule,

    // Every match of this pattern in a prepared chunk becomes one candidate sentence.
    sentence_boundary_regex: Regex,
    // Matches a candidate made only of ASCII letters; such candidates (longer than two bytes)
    // are returned without the remaining post-processing.
    post_process_regex: Regex,
    // Detects `<punct><quote><space><capital>`; when found, the candidate is split with the
    // second pattern, which matches only the space in that position.
    quotation_at_end_of_sentence_regex: Regex,
    split_space_quotation_at_end_of_sentence_regex: Regex,
}
88
89impl Segmenter {
    /// Create a new Segmenter instance. The regular expressions used internally by
    /// pragmatic-segmenter are compiled here.
    ///
    /// # Errors
    ///
    /// Every replacer, rule and pattern is constructed with `?`, so the first one that fails to
    /// build is returned as the error.
    ///
    /// ```rust
    /// use pragmatic_segmenter::Segmenter;
    ///
    /// let segmenter = Segmenter::new()?;
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new() -> Result<Self, Box<dyn Error>> {
        Ok(Segmenter {
            list_item_replacer: ListItemReplacer::new()?,
            abbreviation_replacer: AbbreviationReplacer::new()?,

            number_rules: [
                // PeriodBeforeNumberRule
                // Example: https://rubular.com/r/oNyxBOqbyy
                Rule::new(r"\.(?=\d)", "∯")?,
                // NumberAfterPeriodBeforeLetterRule
                // Example: https://rubular.com/r/EMk5MpiUzt
                Rule::new(r"(?<=\d)\.(?=\S)", "∯")?,
                // NewLineNumberPeriodSpaceLetterRule
                // Example: https://rubular.com/r/rf4l1HjtjG
                Rule::new(r"(?<=\r\d)\.(?=(\s\S)|\))", "∯")?,
                // StartLineNumberPeriodRule
                // Example: https://rubular.com/r/HPa4sdc6b9
                Rule::new(r"(?<=^\d)\.(?=(\s\S)|\))", "∯")?,
                // StartLineTwoDigitNumberPeriodRule
                // Example: https://rubular.com/r/NuvWnKleFl
                Rule::new(r"(?<=^\d\d)\.(?=(\s\S)|\))", "∯")?,
            ],

            // Example: https://rubular.com/r/mQ8Es9bxtk
            continuous_punctuation_regex: re(r"(?<=\S)(!|\?){3,}(?=(\s|\Z|$))")?,

            // Example: https://rubular.com/r/UkumQaILKbkeyc
            numbered_reference: Rule::new(
                r"(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])",
                r"∯\2\r\7",
            )?,

            // English.Abbreviation.WithMultiplePeriodsAndEmailRule,
            //
            // NOTE: pySBD and the Ruby implementation use different regular expressions here.
            // We follow pySBD's behavior.
            //
            // Example: https://rubular.com/r/EUbZCNfgei
            abbreviation_with_multiple_periods_and_email_regex: regex::Regex::new(
                r"([a-zA-Z0-9_])(?:\.)([a-zA-Z0-9_])",
            )?,

            misc_rules: [
                // English.GeoLocationRule,
                //
                // NOTE(review): the class `[a-zA-z]` (lower-case `z` as the upper bound) also
                // covers the ASCII characters between `Z` and `a`; presumably inherited from
                // upstream — confirm before changing.
                Rule::new(r"(?<=[a-zA-z]°)\.(?=\s*\d+)", "∯")?,
                // English.FileFormatRule,
                Rule::new(
                    r"(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)",
                    "∯",
                )?,
            ],

            // Example: https://rubular.com/r/6flGnUMEVl
            parens_between_double_quotes_regex: re(r#"["\”]\s\(.*\)\s["\“]"#)?,
            parens_between_double_quotes_0: Rule::new(r"\s(?=\()", "\r")?,
            parens_between_double_quotes_1: Rule::new(r"(?<=\))\s", "\r")?,

            // NOTE: The pySBD and Ruby implementations behave differently here. We follow pySBD.
            // If this part is changed, ReinsertEllipsisRules must be changed together with it.
            ellipsis_rules: [
                // ThreeSpaceRule
                // Example: https://rubular.com/r/YBG1dIHTRu
                Rule::new(r"(\s\.){3}\s", "♟♟♟♟♟♟♟")?,
                // FourSpaceRule
                // Example: https://rubular.com/r/2VvZ8wRbd8
                Rule::new(r"(?<=[a-z])(\.\s){3}\.($|\\n)", "♝♝♝♝♝♝♝")?,
                // FourConsecutiveRule
                // Example: https://rubular.com/r/Hdqpd90owl
                Rule::new(r"(?<=\S)\.{3}(?=\.\s[A-Z])", "ƪƪƪ")?,
                // ThreeConsecutiveRule
                // Example: https://rubular.com/r/i60hCK81fz
                Rule::new(r"\.\.\.(?=\s+[A-Z])", "☏☏.")?,
                // OtherThreePeriodRule
                Rule::new(r"\.\.\.", "ƪƪƪ")?,
            ],

            exclamation_regex: re(
                r"!Xũ|!Kung|ǃʼOǃKung|!Xuun|!Kung\-Ekoka|ǃHu|ǃKhung|ǃKu|ǃung|ǃXo|ǃXû|ǃXung|ǃXũ|!Xun|Yahoo!|Y!J|Yum!",
            )?,

            // NOTE: Because of an implementation mistake in pySBD its behavior is completely
            // different from the Ruby implementation; the bug is kept on purpose so that we
            // behave like pySBD.
            sub_escaped_regex_reserved_characters: [
                // SubLeftParen
                Rule::new(r"\\\(", "(")?,
                // SubRightParen
                Rule::new(r"\\\)", ")")?,
                // SubLeftBracket
                Rule::new(r"\\\[", "[")?,
                // SubRightBracket
                Rule::new(r"\\\]", "]")?,
                // SubDash
                Rule::new(r"\\\-", "-")?,
            ],

            // Example: https://rubular.com/r/mXf8cW025o
            word_with_leading_apostrophe: re(r"(?<=\s)'(?:[^']|'[a-zA-Z])*'\S")?,

            trailing_apostrophe: re(r"'\s")?,

            // Example: https://rubular.com/r/2YFrKWQUYi
            between_single_quotes_regex: re(r"(?<=\s)'(?:[^']|'[a-zA-Z])*'")?,

            between_single_quote_slanted_regex: re(r"(?<=\s)‘(?:[^’]|’[a-zA-Z])*’")?,

            // Example: https://regex101.com/r/r6I1bW/1
            //
            // NOTE: Because of limitations of Python's regex engine, pySBD behaves differently
            // from the original Ruby pragmatic_segmenter. We use the Oniguruma regex engine and
            // could therefore reproduce the Ruby implementation, but we intentionally use the
            // pySBD regular expressions to stay consistent with pySBD.
            //
            // NOTE: Python regex and Oniguruma regex use different syntax for named capture
            // groups and backreferences. Be careful.
            //
            // Reference: https://stackoverflow.com/a/13577411/13977061
            between_double_quotes_regex_2: re(r#""(?=(?<tmp>[^\"\\]+|\\{2}|\\.)*)\k<tmp>""#)?,
            between_square_brackets_regex_2: re(r#"\[(?=(?<tmp>[^\]\\]+|\\{2}|\\.)*)\k<tmp>\]"#)?,
            between_parens_regex_2: re(r"\((?=(?<tmp>[^\(\)\\]+|\\{2}|\\.)*)\k<tmp>\)")?,
            between_quote_arrow_regex_2: re(r"\«(?=(?<tmp>[^»\\]+|\\{2}|\\.)*)\k<tmp>\»")?,
            between_em_dashes_regex_2: re(r"--(?=(?<tmp>[^--]*))\k<tmp>--")?,
            between_quote_slanted_regex_2: re(r"\“(?=(?<tmp>[^”\\]+|\\{2}|\\.)*)\k<tmp>\”")?,

            double_punctuation: re(r"^(?:\?!|!\?|\?\?|!!)")?,
            question_mark_in_quotation_and_exclamation_point_rules: [
                // QuestionMarkInQuotationRule
                // Example: https://rubular.com/r/aXPUGm6fQh
                Rule::new(r#"\?(?=(\'|\"))"#, "&ᓷ&")?,
                // InQuotationRule
                // Example: https://rubular.com/r/XS1XXFRfM2
                Rule::new(r#"\!(?=(\'|\"))"#, "&ᓴ&")?,
                // BeforeCommaMidSentenceRule
                // Example: https://rubular.com/r/sl57YI8LkA
                Rule::new(r"\!(?=\,\s[a-z])", "&ᓴ&")?,
                // MidSentenceRule
                // Example: https://rubular.com/r/f9zTjmkIPb
                Rule::new(r"\!(?=\s[a-z])", "&ᓴ&")?,
            ],

            // Example: https://rubular.com/r/GcnmQt4a3I
            replace_parens: Rule::new(
                // ROMAN_NUMERALS_IN_PARENTHESES
                r"\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])",
                r"&✂&\1&⌬&",
            )?,

            // added special case: r"[。..!!?].*" to handle intermittent dots, exclamation, etc.
            sentence_boundary_regex: re(
                r#"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!??].*|\S.*?[。..!!??ȸȹ☉☈☇☄]"#,
            )?,
            post_process_regex: re(r"\A[a-zA-Z]*\Z")?,
            // Example: https://rubular.com/r/NqCqv372Ix
            quotation_at_end_of_sentence_regex: re(r#"[!?\.-][\"\'“”]\s{1}[A-Z]"#)?,
            // Example: https://rubular.com/r/JMjlZHAT4g
            split_space_quotation_at_end_of_sentence_regex: re(
                r#"(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])"#,
            )?,
        })
    }
256
257    /// Separate sentences from given input. Although it is a function that returns an Iterator,
258    /// not all processing is done by streaming. After pre-processing the entire input once,
259    /// processing is performed for each sentence by streaming.
260    ///
261    /// ```rust
262    /// use pragmatic_segmenter::Segmenter;
263    ///
264    /// let segmenter = Segmenter::new()?;
265    /// let mut iter = segmenter.segment("Hi Mr. Kim. Let's meet at 3 P.M.");
266    ///
267    /// assert_eq!(iter.next(), Some("Hi Mr. Kim. "));
268    /// assert_eq!(iter.next(), Some("Let's meet at 3 P.M."));
269    /// assert_eq!(iter.next(), None);
270    /// # Ok::<(), Box<dyn std::error::Error>>(())
271    /// ```
272    pub fn segment<'a>(&'a self, original_input: &'a str) -> impl Iterator<Item = &'a str> {
273        // NOTE: 루비 버전에는 이런 처리가 없으나, pySBD 3.1.0에 이 처리가 들어갔다. pySBD와 동작을
274        // 맞추기위해 동일하게 처리해준다.
275        let text = original_input.replace('\n', "\r");
276
277        let text = self.list_item_replacer.add_line_break(&text);
278
279        // replace_abbreviations()
280        let mut text = self.abbreviation_replacer.replace(&text);
281
282        // replace_numbers()
283        for rule in &self.number_rules {
284            text = rule.replace_all(&text);
285        }
286
287        // replace_continuous_punctuation()
288        let text = self
289            .continuous_punctuation_regex
290            .replace_all(&text, |c: &Captures| {
291                let mat = c.at(0).unwrap(); // Must exists
292                mat.replace('!', "&ᓴ&").replace('?', "&ᓷ&")
293            });
294
295        // replace_periods_before_numeric_references()
296        //
297        // Reference:
298        //   https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a35
299        let text = self.numbered_reference.replace_all(&text);
300
301        let mut text = self
302            .abbreviation_with_multiple_periods_and_email_regex
303            .replace_all(&text, "$1∮$2");
304        for rule in &self.misc_rules {
305            text = Cow::Owned(rule.replace_all(&text));
306        }
307
308        //
309        // split_into_segments()
310        //
311
312        // check_for_parens_between_quotes()
313        let text = self
314            .parens_between_double_quotes_regex
315            .replace_all(&text, |c: &Captures| {
316                let mat = c.at(0).unwrap(); // Must exists
317                let mat = self.parens_between_double_quotes_0.replace_all(mat);
318                self.parens_between_double_quotes_1.replace_all(&mat)
319            });
320
321        let mut prior_start_char_idx = 0;
322
323        // TODO: flat_map() 에서 임시 Vec, String 할당 줄이기
324        text.split('\r')
325            .filter(|s| !s.is_empty())
326            .map(|s| s.to_string())
327            .collect::<Vec<_>>() // String을 own하는 버전의 새 split 함수를 만들면 이부분을 제거할 수 있음
328            .into_iter()
329            .flat_map(move |sent| {
330                // English.SingleNewLineRule
331                let mut sent = sent.replace(r"\n", "ȹ");
332                // English.EllipsisRules.All
333                for rule in &self.ellipsis_rules {
334                    sent = rule.replace_all(&sent);
335                }
336                // check_for_punctuation()
337                if PUNCTUATIONS.iter().any(|&p| sent.contains(p)) {
338                    // process_text()
339                    if !sent.ends_with(&PUNCTUATIONS[..]) {
340                        sent += "ȸ";
341                    }
342
343                    // ExclamationWords.apply_rules()
344                    sent = self
345                        .exclamation_regex
346                        .replace_all(&sent, self.replace_punctuation(false));
347
348                    // between_punctuation()
349                    if self.word_with_leading_apostrophe.find(&sent).is_none()
350                        || self.trailing_apostrophe.find(&sent).is_some()
351                    {
352                        sent = self
353                            .between_single_quotes_regex
354                            .replace_all(&sent, self.replace_punctuation(true));
355                    }
356                    sent = self
357                        .between_single_quote_slanted_regex
358                        .replace_all(&sent, self.replace_punctuation(false));
359                    sent = self
360                        .between_double_quotes_regex_2
361                        .replace_all(&sent, self.replace_punctuation(false));
362                    sent = self
363                        .between_square_brackets_regex_2
364                        .replace_all(&sent, self.replace_punctuation(false));
365                    sent = self
366                        .between_parens_regex_2
367                        .replace_all(&sent, self.replace_punctuation(false));
368                    sent = self
369                        .between_quote_arrow_regex_2
370                        .replace_all(&sent, self.replace_punctuation(false));
371                    sent = self
372                        .between_em_dashes_regex_2
373                        .replace_all(&sent, self.replace_punctuation(false));
374                    sent = self
375                        .between_quote_slanted_regex_2
376                        .replace_all(&sent, self.replace_punctuation(false));
377
378                    // handle text having only doublepunctuations
379                    if self.double_punctuation.find(&sent).is_none() {
380                        sent = sent
381                            .replace(r"?!", "☉")
382                            .replace(r"!?", "☈")
383                            .replace(r"??", "☇")
384                            .replace(r"!!", "☄");
385                    }
386                    for rule in &self.question_mark_in_quotation_and_exclamation_point_rules {
387                        sent = rule.replace_all(&sent);
388                    }
389
390                    // ListItemReplacer(sent).replace_parens()
391                    sent = self.replace_parens.replace_all(&sent);
392
393                    // sentence_boundary_punctuation()
394                    // retain exclamation mark if it is an ending character of a given text
395                    sent = sent.replace(r"&ᓴ&$", "!");
396                    self.sentence_boundary_regex
397                        .find_iter(&sent)
398                        .map(|r| sent[r.0..r.1].to_string())
399                        .collect::<Vec<_>>()
400                } else {
401                    vec![sent]
402                }
403            })
404            .flat_map(move |mut sent| {
405                // SubSymbolsRules
406                sent = sent
407                    .replace('∯', ".")
408                    .replace('♬', "،")
409                    .replace('♭', ":")
410                    .replace(r"&ᓰ&", "。")
411                    .replace(r"&ᓱ&", ".")
412                    .replace(r"&ᓳ&", "!")
413                    .replace(r"&ᓴ&", "!")
414                    .replace(r"&ᓷ&", "?")
415                    .replace(r"&ᓸ&", "?")
416                    .replace('☉', "?!")
417                    .replace('☇', "??")
418                    .replace('☈', "!?")
419                    .replace('☄', "!!")
420                    .replace(r"&✂&", "(")
421                    .replace(r"&⌬&", ")")
422                    .replace('ȸ', "")
423                    .replace('ȹ', "\n");
424
425                // post_process_segments()
426                //
427                // NOTE: post_process_segments 함수는 pySBD와 루비 pragmatic_segmenter의 동작이 전혀
428                // 다르다. pySBD를 따라간다.
429                if sent.len() > 2 && self.post_process_regex.find(&sent).is_some() {
430                    return vec![sent];
431                }
432
433                // ReinsertEllipsisRules
434                // NOTE: 이부분은 pySBD 구현과 루비 구현이 동작이 다르다. pySBD의 동작을 따른다.
435                sent = sent
436                    .replace(r"ƪƪƪ", "...")
437                    .replace(r"♟♟♟♟♟♟♟", " . . . ")
438                    .replace(r"♝♝♝♝♝♝♝", ". . . .")
439                    .replace(r"☏☏", "..")
440                    .replace('∮', ".");
441
442                if self
443                    .quotation_at_end_of_sentence_regex
444                    .find(&sent)
445                    .is_some()
446                {
447                    self.split_space_quotation_at_end_of_sentence_regex
448                        .split(&sent)
449                        .map(|s| s.to_string())
450                        .collect()
451                } else {
452                    vec![sent.replace('\n', "").trim().to_string()]
453                }
454            })
455            .map(|sent| sent.replace(r"&⎋&", "'"))
456            // NOTE: pySBD에만 이하의 처리가 존재하고, 원본 루비코드에는 이런 동작이 없다. 일단
457            // 동작을 맞추기 위해 동일한 처리를 해주지만, 아래 코드때문에 성능손실이 크다.
458            .flat_map(move |sent| -> Vec<_> {
459                // since SENTENCE_BOUNDARY_REGEX doesnt account
460                // for trailing whitespaces \s* & is used as suffix
461                // to keep non-destructive text after segments joins
462
463                // NOTE: escape 한 뒤 compile했기 때문에, 실패의 여지가 없다.
464                let re = regex::Regex::new(&format!(r"{}\s*", regex::escape(&sent))).unwrap();
465                re.find_iter(original_input)
466                    .filter_map(|mat| {
467                        let match_str = mat.as_str();
468                        let match_start_idx = mat.start();
469                        if match_start_idx >= prior_start_char_idx {
470                            prior_start_char_idx = match_start_idx;
471                            Some(match_str)
472                        // making sure if curren sentence and its span
473                        // is either first sentence along with its char spans
474                        // or current sent spans adjacent to prior sentence spans
475                        } else {
476                            None
477                        }
478                    })
479                    .collect()
480            })
481    }
482
483    fn replace_punctuation(&self, is_match_type_single: bool) -> impl Fn(&Captures) -> String + '_ {
484        move |c: &Captures| {
485            let mat = c.at(0).unwrap(); // Must exists
486
487            // NOTE: 원래 이 자리에서 EscapeRegexReservedCharacters.All 규칙이 적용되어야
488            // 하나, pySBD의 구현 버그로 인해 EscapeRegexReservedCharacters.All가 아무일도
489            // 하지 않는다. 버그이지만, pySBD의 동작을 따라가기위해 버그를 유지하겠다.
490
491            let mut mat = mat.replace('.', "∯");
492            mat = mat.replace('。', "&ᓰ&");
493            mat = mat.replace('.', "&ᓱ&");
494            mat = mat.replace('!', "&ᓳ&");
495            mat = mat.replace('!', "&ᓴ&");
496            mat = mat.replace('?', "&ᓷ&");
497            mat = mat.replace('?', "&ᓸ&");
498            if !is_match_type_single {
499                mat = mat.replace('\'', "&⎋&");
500            }
501            for rule in &self.sub_escaped_regex_reserved_characters {
502                mat = rule.replace_all(&mat);
503            }
504            mat
505        }
506    }
507}
508
#[cfg(test)]
mod tests {
    use super::*;
    use std::error::Error;

    type TestResult = Result<(), Box<dyn Error>>;

    /// Building a `Segmenter` must succeed, i.e. every internal pattern compiles.
    #[test]
    fn regex_should_be_compiled() -> TestResult {
        Segmenter::new().map(|_| ())
    }

    /// Segmenting an empty input yields no sentences at all.
    #[test]
    fn empty_string() -> TestResult {
        let segmenter = Segmenter::new()?;

        let sentences = segmenter.segment("").collect::<Vec<_>>();
        let nothing: [String; 0] = [];
        assert_eq!(sentences, nothing);

        Ok(())
    }
}