mdbook_i18n_helpers/
lib.rs

1// Copyright 2023 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Helpers for translating `mdbook` projects.
16//!
17//! The functions here are used to implement a robust
18//! internationalization (i18n) workflow for `mdbook`. This allows you
19//! to translate your books into other languages while also making it
20//! easy to keep the translations up to date as you edit the original
21//! source text.
22//!
23//! See <https://github.com/google/mdbook-i18n-helpers> for details on
24//! how to use the supplied `mdbook` plugins.
25
26use polib::catalog::Catalog;
27use pulldown_cmark::{
28    BrokenLinkCallback, CodeBlockKind, DefaultBrokenLinkCallback, Event, LinkType, Tag, TagEnd,
29};
30use pulldown_cmark_to_cmark::{calculate_code_block_token_count, cmark_resume_with_options};
31use pulldown_cmark_to_cmark::{Error as CmarkError, Options, State};
32use std::sync::OnceLock;
33use syntect::easy::ScopeRangeIterator;
34use syntect::parsing::{ParseState, Scope, ScopeStack, SyntaxSet};
35
36pub mod directives;
37pub mod gettext;
38pub mod normalize;
39pub mod preprocessors;
40pub mod renderers;
41pub mod xgettext;
42
43/// Re-wrap the sources field of a message.
44///
45/// This function tries to wrap the `file:lineno` pairs so they look
46/// the same as what you get from `msgcat` or `msgmerge`.
47pub fn wrap_sources(sources: &str) -> String {
48    let options = textwrap::Options::new(76)
49        .break_words(false)
50        .word_splitter(textwrap::WordSplitter::NoHyphenation);
51    textwrap::refill(sources, options)
52}
53
54/// Like `mdbook::utils::new_cmark_parser`, but also passes a
55/// `BrokenLinkCallback`.
56pub fn new_cmark_parser<'input, F: BrokenLinkCallback<'input>>(
57    text: &'input str,
58    broken_link_callback: Option<F>,
59) -> pulldown_cmark::Parser<'input, F> {
60    let mut options = pulldown_cmark::Options::empty();
61    options.insert(pulldown_cmark::Options::ENABLE_TABLES);
62    options.insert(pulldown_cmark::Options::ENABLE_OLD_FOOTNOTES);
63    options.insert(pulldown_cmark::Options::ENABLE_STRIKETHROUGH);
64    options.insert(pulldown_cmark::Options::ENABLE_TASKLISTS);
65    options.insert(pulldown_cmark::Options::ENABLE_HEADING_ATTRIBUTES);
66    pulldown_cmark::Parser::new_with_broken_link_callback(text, options, broken_link_callback)
67}
68
69/// Extract Markdown events from `text`.
70///
71/// The `state` can be used to give the parsing context. In
72/// particular:
73///
74/// - If a code block has started, the text should be parsed
75///   without interpreting special Markdown characters.
76/// - In a table cell, the text should be parsed as inlines.
77///
78/// The events are labeled with the line number where they start in
79/// the document.
80///
81/// # Examples
82///
83/// ```
84/// use mdbook_i18n_helpers::extract_events;
85/// use pulldown_cmark::{Event, Tag, TagEnd};
86///
87/// assert_eq!(
88///     extract_events("Hello,\nworld!", None),
89///     vec![
90///         (1, Event::Start(Tag::Paragraph)),
91///         (1, Event::Text("Hello,".into())),
92///         (1, Event::Text(" ".into())),
93///         (2, Event::Text("world!".into())),
94///         (1, Event::End(TagEnd::Paragraph)),
95///     ]
96/// );
97/// ```
98pub fn extract_events<'a>(text: &'a str, state: Option<State<'a>>) -> Vec<(usize, Event<'a>)> {
99    // Expand a `[foo]` style links into inline links like `[foo](url)`
100    fn expand_shortcut_link(tag: Tag<'_>) -> Tag<'_> {
101        match tag {
102            Tag::Link {
103                link_type: LinkType::Shortcut | LinkType::Collapsed | LinkType::Reference,
104                dest_url,
105                title,
106                id,
107            } => Tag::Link {
108                link_type: LinkType::Inline,
109                dest_url,
110                title,
111                id,
112            },
113            Tag::Image {
114                link_type: LinkType::Shortcut | LinkType::Collapsed | LinkType::Reference,
115                dest_url,
116                title,
117                id,
118            } => Tag::Image {
119                link_type: LinkType::Inline,
120                dest_url,
121                title,
122                id,
123            },
124            _ => tag,
125        }
126    }
127
128    // Perform some common transformations on the events
129    fn convert_event_common(event: Event<'_>) -> Event<'_> {
130        match event {
131            Event::SoftBreak => Event::Text(" ".into()),
132            // Shortcut links like "[foo]" end up as "[foo]"
133            // in output. By changing them to a reference
134            // link, the link is expanded on the fly and the
135            // output becomes self-contained.
136            Event::Start(tag @ (Tag::Link { .. } | Tag::Image { .. })) => {
137                Event::Start(expand_shortcut_link(tag))
138            }
139            _ => event,
140        }
141    }
142
143    // Offsets of each newline in the input, used to calculate line
144    // numbers from byte offsets.
145    let offsets = text
146        .match_indices('\n')
147        .map(|(offset, _)| offset)
148        .collect::<Vec<_>>();
149
150    match state {
151        // If we're in a code block, we disable the normal parsing and
152        // return lines of text. This matches the behavior of the
153        // parser in this case.
154        Some(state) if state.is_in_code_block() => text
155            .split_inclusive('\n')
156            .enumerate()
157            .map(|(idx, line)| (idx + 1, Event::Text(line.into())))
158            .collect(),
159        // If we're in a table cell, we put the text in a minimal table, parse the
160        // table, and return the contents of the cell. This matches the behavior of
161        // the parser in this case.
162        Some(state) if state.in_table_cell => {
163            let text = format!("|{text}|\n|-|");
164            new_cmark_parser::<'_, DefaultBrokenLinkCallback>(&text, None)
165                .filter_map(|event| {
166                    if let Event::Start(Tag::Table(..) | Tag::TableHead | Tag::TableCell)
167                    | Event::End(TagEnd::Table | TagEnd::TableHead | TagEnd::TableCell) = event
168                    {
169                        return None;
170                    }
171                    // The line number is always 1 because tables don't allow newlines
172                    Some((1, convert_event_common(event).into_static()))
173                })
174                .collect()
175        }
176        // Otherwise, we parse the text line normally.
177        _ => new_cmark_parser::<'a, DefaultBrokenLinkCallback>(text, None)
178            .into_offset_iter()
179            .map(|(event, range)| {
180                let lineno = offsets.partition_point(|&o| o < range.start) + 1;
181                (lineno, convert_event_common(event))
182            })
183            .collect(),
184    }
185}
186
187/// Markdown events grouped by type.
188#[derive(Debug, Clone, PartialEq)]
189pub enum Group<'a> {
190    /// Markdown events which should be translated.
191    ///
192    /// This includes `[Text("foo")]` as well as sequences with text
193    /// such as `[Start(Emphasis), Text("foo") End(Emphasis)]`.
194    Translate {
195        events: Vec<(usize, Event<'a>)>,
196        /// A comment that may be associated with the translation text.
197        comment: String,
198    },
199
200    /// Markdown events which should be skipped when translating.
201    ///
202    /// This includes structural events such as `Start(Heading(H1,
203    /// None, vec![]))`.
204    Skip(Vec<(usize, Event<'a>)>),
205}
206
/// Directive state carried from one group to the next while
/// grouping events.
#[derive(Debug, Default)]
struct GroupingContext {
    /// When set, the next translatable group is emitted as a skipped
    /// group instead.
    skip_next_group: bool,
    /// Comments waiting to be attached to the next translatable
    /// group.
    comments: Vec<String>,
}

impl GroupingContext {
    /// Consume the context and return it with `skip_next_group`
    /// reset. The pending comments are kept.
    fn clear_skip_next_group(mut self) -> Self {
        self.skip_next_group = false;
        self
    }
}
221
222/// Group Markdown events into translatable and skipped events.
223///
224/// This function will partition the input events into groups of
225/// events which should be translated or skipped. Concatenating the
226/// events in each group will give you back the original events.
227///
228/// # Examples
229///
230/// ```
231/// use mdbook_i18n_helpers::{extract_events, group_events, Group};
232/// use pulldown_cmark::{Event, Tag, TagEnd};
233///
234/// let events = extract_events("- A list item.", None);
235/// assert_eq!(
236///     events,
237///     vec![
238///         (1, Event::Start(Tag::List(None))),
239///         (1, Event::Start(Tag::Item)),
240///         (1, Event::Text("A list item.".into())),
241///         (1, Event::End(TagEnd::Item)),
242///         (1, Event::End(TagEnd::List(false))),
243///     ],
244/// );
245///
246/// let groups = group_events(&events).unwrap();
247/// assert_eq!(
248///     groups,
249///     vec![
250///         Group::Skip(vec![
251///             (1, Event::Start(Tag::List(None))),
252///             (1, Event::Start(Tag::Item)),
253///         ]),
254///         Group::Translate {
255///             events: vec![
256///                 (1, Event::Text("A list item.".into())),
257///             ], comment: "".into()},
258///         Group::Skip(vec![
259///             (1, Event::End(TagEnd::Item)),
260///             (1, Event::End(TagEnd::List(false))),
261///         ]),
262///     ]
263/// );
264/// ```
265pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Result<Vec<Group<'a>>, CmarkError> {
266    #[derive(Debug)]
267    enum State {
268        Translate(usize),
269        Skip(usize),
270    }
271
272    impl State {
273        /// Creates groups based on the capturing state and context.
274        fn into_groups<'a>(
275            self,
276            idx: usize,
277            events: &'a [(usize, Event<'a>)],
278            mut ctx: GroupingContext,
279        ) -> Result<(Vec<Group<'a>>, GroupingContext), CmarkError> {
280            let groups = match self {
281                State::Translate(start) => {
282                    if ctx.skip_next_group {
283                        (
284                            vec![Group::Skip(events[start..idx].into())],
285                            ctx.clear_skip_next_group(),
286                        )
287                    } else if is_codeblock_group(&events[start..idx]) {
288                        parse_codeblock(&events[start..idx], ctx)?
289                    } else {
290                        (
291                            vec![Group::Translate {
292                                events: events[start..idx].into(),
293                                comment: std::mem::take(&mut ctx.comments).join(" "),
294                            }],
295                            ctx,
296                        )
297                    }
298                }
299                State::Skip(start) => (vec![Group::Skip(events[start..idx].into())], ctx),
300            };
301            Ok(groups)
302        }
303    }
304
305    let mut groups = Vec::new();
306    let mut state = State::Skip(0);
307    let mut ctx = GroupingContext::default();
308
309    for (idx, (_, event)) in events.iter().enumerate() {
310        match event {
311            // These block-level events force new groups. We do this
312            // because we want to include these events in the group to
313            // make the group self-contained.
314            Event::Start(Tag::Paragraph | Tag::CodeBlock(..)) => {
315                // A translatable group starts here.
316                let mut next_groups;
317                (next_groups, ctx) = state.into_groups(idx, events, ctx)?;
318                groups.append(&mut next_groups);
319
320                state = State::Translate(idx);
321            }
322            Event::End(TagEnd::Paragraph | TagEnd::CodeBlock) => {
323                // A translatable group ends after `idx`.
324                let idx = idx + 1;
325                let mut next_groups;
326                (next_groups, ctx) = state.into_groups(idx, events, ctx)?;
327                groups.append(&mut next_groups);
328
329                state = State::Skip(idx);
330            }
331
332            // Inline events start or continue a translating group.
333            Event::Start(
334                Tag::Emphasis
335                | Tag::Strong
336                | Tag::Strikethrough
337                | Tag::Link { .. }
338                | Tag::Image { .. },
339            )
340            | Event::End(
341                TagEnd::Emphasis
342                | TagEnd::Strong
343                | TagEnd::Strikethrough
344                | TagEnd::Link
345                | TagEnd::Image,
346            )
347            | Event::Text(_)
348            | Event::Code(_)
349            | Event::FootnoteReference(_)
350            | Event::SoftBreak
351            | Event::HardBreak => {
352                // If we're currently skipping, then a new
353                // translatable group starts here.
354                if let State::Skip(_) = state {
355                    let mut next_groups;
356                    (next_groups, ctx) = state.into_groups(idx, events, ctx)?;
357                    groups.append(&mut next_groups);
358
359                    state = State::Translate(idx);
360                }
361            }
362
363            Event::Html(s) | Event::InlineHtml(s) => {
364                match directives::find(s) {
365                    Some(directives::Directive::Skip) => {
366                        // If in the middle of translation, finish it.
367                        if let State::Translate(_) = state {
368                            let mut next_groups;
369                            (next_groups, ctx) = state.into_groups(idx, events, ctx)?;
370                            groups.append(&mut next_groups);
371
372                            // Restart translation: subtle but should be
373                            // needed to handle the skipping of the rest of
374                            // the inlined content.
375                            state = State::Translate(idx);
376                        }
377
378                        ctx.skip_next_group = true;
379                    }
380
381                    Some(directives::Directive::Comment(comment)) => {
382                        // If in the middle of translation, finish it.
383                        if let State::Translate(_) = state {
384                            let mut next_groups;
385                            (next_groups, ctx) = state.into_groups(idx, events, ctx)?;
386                            groups.append(&mut next_groups);
387
388                            // Restart translation: subtle but should be
389                            // needed to handle the skipping of the rest of
390                            // the inlined content.
391                            state = State::Translate(idx);
392                        }
393
394                        ctx.comments.push(comment);
395                    }
396                    _ => {
397                        match event {
398                            Event::Html(_) => {
399                                // Otherwise, treat as a skipping group if this is a block level Html tag
400                                if let State::Translate(_) = state {
401                                    let mut next_groups;
402                                    (next_groups, ctx) = state.into_groups(idx, events, ctx)?;
403                                    groups.append(&mut next_groups);
404
405                                    state = State::Skip(idx);
406                                }
407                            }
408                            Event::InlineHtml(_) =>
409                            // If we're currently skipping, then a new
410                            // translatable group starts here.
411                            {
412                                if let State::Skip(_) = state {
413                                    let mut next_groups;
414                                    (next_groups, ctx) = state.into_groups(idx, events, ctx)?;
415                                    groups.append(&mut next_groups);
416
417                                    state = State::Translate(idx);
418                                }
419                            }
420                            // this code is inside a match of Event::{Html|InlineHtml}, other types are not possible
421                            _ => unreachable!(),
422                        }
423                    }
424                }
425            }
426
427            // All other block-level events start or continue a
428            // skipping group.
429            _ => {
430                if let State::Translate(_) = state {
431                    let mut next_groups;
432                    (next_groups, ctx) = state.into_groups(idx, events, ctx)?;
433                    groups.append(&mut next_groups);
434
435                    state = State::Skip(idx);
436                }
437            }
438        }
439    }
440
441    match state {
442        State::Translate(start) => groups.push(Group::Translate {
443            events: events[start..].into(),
444            comment: "".into(),
445        }),
446        State::Skip(start) => groups.push(Group::Skip(events[start..].into())),
447    }
448
449    Ok(groups)
450}
451
452/// Returns true if the events appear to be a codeblock.
453fn is_codeblock_group(events: &[(usize, Event<'_>)]) -> bool {
454    matches!(
455        events,
456        [
457            (_, Event::Start(Tag::CodeBlock(_))),
458            ..,
459            (_, Event::End(TagEnd::CodeBlock))
460        ]
461    )
462}
463
464/// Returns true if the scope should be translated.
465fn is_translate_scope(x: Scope) -> bool {
466    static SCOPE_STRING: OnceLock<Scope> = OnceLock::new();
467    static SCOPE_COMMENT: OnceLock<Scope> = OnceLock::new();
468
469    let scope_string = SCOPE_STRING.get_or_init(|| Scope::new("string").unwrap());
470    let scope_comment = SCOPE_COMMENT.get_or_init(|| Scope::new("comment").unwrap());
471    scope_string.is_prefix_of(x) || scope_comment.is_prefix_of(x)
472}
473
474/// Creates groups by checking codeblock with heuristic way.
475fn heuristic_codeblock<'a>(
476    events: &'a [(usize, Event<'_>)],
477    mut ctx: GroupingContext,
478) -> Result<(Vec<Group<'a>>, GroupingContext), CmarkError> {
479    let is_translate = match events {
480        [(_, Event::Start(Tag::CodeBlock(_))), .., (_, Event::End(TagEnd::CodeBlock))] => {
481            let (codeblock_text, _) = reconstruct_markdown(events, None)?;
482            // Heuristic to check whether the codeblock nether has a
483            // literal string nor a line comment.  We may actually
484            // want to use a lexer here to make this more robust.
485            codeblock_text.contains('"') || codeblock_text.contains("//")
486        }
487        _ => true,
488    };
489
490    let (groups, ctx) = if is_translate {
491        (
492            vec![Group::Translate {
493                events: events.into(),
494                comment: std::mem::take(&mut ctx.comments).join(" "),
495            }],
496            ctx,
497        )
498    } else {
499        (vec![Group::Skip(events.into())], ctx)
500    };
501    Ok((groups, ctx))
502}
503
504/// Special Admonish "codeblock" - extract the body content for translation
505/// If a title is present, also extract that for translation
506/// Note this is from mdbook-admonish: https://github.com/tommilligan/mdbook-admonish
507/// This assumes it is called by `parse_codeblock()` when `is_admonish() == true`
508fn admonish_codeblock<'a>(
509    events: &'a [(usize, Event<'_>)],
510    mut ctx: GroupingContext,
511) -> Result<(Vec<Group<'a>>, GroupingContext), CmarkError> {
512    // Handle the entire block as a single translatable unit
513    // The translate_events function will take care of matching the proper translations
514    // for both the title and body content based on the PO file entries
515    let groups = vec![Group::Translate {
516        events: events.into(),
517        comment: std::mem::take(&mut ctx.comments).join(" "),
518    }];
519
520    Ok((groups, ctx))
521}
522
523/// Check if the code block is an admonish block
524fn is_admonish(events: &[(usize, Event<'_>)]) -> bool {
525    const ADMONISH_CODEBLOCK_NAME: &str = "admonish";
526
527    // pull the info_string (aka language specifier) out of the code block
528    // (the string after the ```)
529    match events {
530        [(_, Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(info_string)))), .., (_, Event::End(TagEnd::CodeBlock))] =>
531        {
532            // Check if the language specifier contains "admonish"
533            matches!(info_string.split_once(' '), Some((keyword, _)) if keyword == ADMONISH_CODEBLOCK_NAME)
534        }
535        _ => false,
536    }
537}
538
539/// Creates groups by parsing codeblock.
540fn parse_codeblock<'a>(
541    events: &'a [(usize, Event<'_>)],
542    mut ctx: GroupingContext,
543) -> Result<(Vec<Group<'a>>, GroupingContext), CmarkError> {
544    // Language detection from language identifier of codeblock.
545    static SYNTAX_SET: OnceLock<SyntaxSet> = OnceLock::new();
546    let ss = SYNTAX_SET.get_or_init(SyntaxSet::load_defaults_newlines);
547
548    let syntax = if let (_, Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(x)))) = &events[0] {
549        ss.find_syntax_by_token(x.split(',').next().unwrap())
550    } else {
551        None
552    };
553
554    let Some(syntax) = syntax else {
555        if is_admonish(events) {
556            return admonish_codeblock(events, ctx);
557        } else {
558            // If there is no language specifier, falling back to heuristic way.
559            return heuristic_codeblock(events, ctx);
560        }
561    };
562
563    let mut ps = ParseState::new(syntax);
564    let mut ret = vec![];
565
566    for (idx, event) in events.iter().enumerate() {
567        match event {
568            (text_line, Event::Text(text)) => {
569                let mut stack = ScopeStack::new();
570                let mut stack_failure = false;
571
572                let Ok(ops) = ps.parse_line(text, ss) else {
573                    // If parse is failed, the text event should be translated.
574                    ret.push(Group::Translate {
575                        events: events[idx..idx + 1].into(),
576                        comment: std::mem::take(&mut ctx.comments).join(" "),
577                    });
578                    continue;
579                };
580
581                let mut translate_events = vec![];
582                let mut groups = vec![];
583
584                for (range, op) in ScopeRangeIterator::new(&ops, text) {
585                    if stack.apply(op).is_err() {
586                        stack_failure = true;
587                        break;
588                    }
589
590                    if range.is_empty() {
591                        continue;
592                    }
593
594                    // Calculate line number of the range
595                    let range_line = if range.start == 0 {
596                        *text_line
597                    } else {
598                        text_line + text[0..range.start].lines().count() - 1
599                    };
600
601                    let text = &text[range];
602
603                    // Whitespaces between translate texts should be added to translate
604                    // group.
605                    // So all whitespaces are added to the translate events buffer temporary,
606                    // and the trailing whitespaces will be remvoed finally.
607                    let is_whitespace = text.trim_matches(&[' ', '\t'] as &[_]).is_empty();
608
609                    let is_translate = stack.scopes.iter().any(|x| is_translate_scope(*x));
610
611                    if is_translate || (is_whitespace && !translate_events.is_empty()) {
612                        translate_events.push((range_line, Event::Text(text.into())));
613                    } else {
614                        let whitespace_events = extract_trailing_whitespaces(&mut translate_events);
615                        if !translate_events.is_empty() {
616                            groups.push(Group::Translate {
617                                events: std::mem::take(&mut translate_events),
618                                comment: std::mem::take(&mut ctx.comments).join(" "),
619                            });
620                        }
621                        if !whitespace_events.is_empty() {
622                            groups.push(Group::Skip(whitespace_events));
623                        }
624                        groups.push(Group::Skip(vec![(range_line, Event::Text(text.into()))]));
625                    }
626                }
627
628                let whitespace_events = extract_trailing_whitespaces(&mut translate_events);
629                if !translate_events.is_empty() {
630                    groups.push(Group::Translate {
631                        events: std::mem::take(&mut translate_events),
632                        comment: std::mem::take(&mut ctx.comments).join(" "),
633                    });
634                }
635                if !whitespace_events.is_empty() {
636                    groups.push(Group::Skip(whitespace_events));
637                }
638
639                if stack_failure {
640                    // If stack operation is failed, the text event should be translated.
641                    ret.push(Group::Translate {
642                        events: events[idx..idx + 1].into(),
643                        comment: std::mem::take(&mut ctx.comments).join(" "),
644                    });
645                } else {
646                    ret.append(&mut groups);
647                }
648            }
649            _ => {
650                ret.push(Group::Skip(events[idx..idx + 1].into()));
651            }
652        }
653    }
654    Ok((ret, ctx))
655}
656
657/// Extract trailing events which have whitespace only.
658fn extract_trailing_whitespaces<'a>(buf: &mut Vec<(usize, Event<'a>)>) -> Vec<(usize, Event<'a>)> {
659    let mut ret = vec![];
660
661    while let Some(last) = buf.last() {
662        match &last.1 {
663            Event::Text(text) if text.as_ref().trim_matches(&[' ', '\t'] as &[_]).is_empty() => {
664                let last = buf.pop().unwrap();
665                ret.push(last);
666            }
667            _ => break,
668        }
669    }
670    ret.reverse();
671    ret
672}
673
674/// Render a slice of Markdown events back to Markdown.
675///
676/// # Examples
677///
678/// ```
679/// use mdbook_i18n_helpers::{extract_events, reconstruct_markdown};
680/// use pulldown_cmark::{Event, Tag};
681///
682/// let group = extract_events("Hello *world!*", None);
683/// let (reconstructed, _) = reconstruct_markdown(&group, None).unwrap();
684/// assert_eq!(reconstructed, "Hello _world!_");
685/// ```
686///
687/// Notice how this will normalize the Markdown to use `_` for
688/// emphasis and `**` for strong emphasis. The style is chosen to
689/// match the [Google developer documentation style
690/// guide](https://developers.google.com/style/text-formatting).
691pub fn reconstruct_markdown<'a>(
692    group: &[(usize, Event<'a>)],
693    state: Option<State<'a>>,
694) -> Result<(String, State<'a>), CmarkError> {
695    let events = group.iter().map(|(_, event)| event);
696    let code_block_token_count = calculate_code_block_token_count(events.clone()).unwrap_or(3);
697    let mut markdown = String::new();
698    let options = Options {
699        code_block_token_count,
700        list_token: '-',
701        emphasis_token: '_',
702        strong_token: "**",
703        ..Options::default()
704    };
705    // Advance the true state, but throw away the rendered Markdown
706    // since it can contain unwanted padding.
707    let new_state = cmark_resume_with_options(
708        events.clone(),
709        String::new(),
710        state.clone(),
711        options.clone(),
712    )?;
713
714    // Block quotes and lists add padding to the state, which is
715    // reflected in the rendered Markdown. We want to capture the
716    // Markdown without the padding to remove the effect of these
717    // structural elements. Similarly, we don't want extra newlines at
718    // the start.
719    let simplified_state = state.map(|mut state| {
720        state.newlines_before_start = 0;
721        state.padding.clear();
722        state
723    });
724    cmark_resume_with_options(events, &mut markdown, simplified_state, options)?;
725    // Even with `newlines_before_start` set to zero, we get a leading
726    // `\n` for code blocks (since they must start on a new line). We
727    // can safely trim this here since we know that we always
728    // reconstruct Markdown for a self-contained group of events.
729    Ok((String::from(markdown.trim_start_matches('\n')), new_state))
730}
731
/// A message extracted for translation, with its comment.
#[derive(Debug, PartialEq)]
pub struct ExtractedMessage {
    /// The text of the message.
    pub message: String,
    /// The comment for the message. Empty when there is none.
    pub comment: String,
}

impl From<&str> for ExtractedMessage {
    /// Create a message with the text `s` and an empty comment.
    fn from(s: &str) -> Self {
        Self {
            message: String::from(s),
            comment: String::new(),
        }
    }
}
745
746/// Extract translatable strings from `document`.
747///
748/// # Examples
749///
750/// Structural markup like headings and lists are removed from the
751/// messages:
752///
753/// ```
754/// use mdbook_i18n_helpers::extract_messages;
755///
756/// assert_eq!(
757///     extract_messages("# A heading").unwrap(),
758///     vec![(1, "A heading".into())],
759/// );
760/// assert_eq!(
761///     extract_messages(
762///         "1. First item\n\
763///          2. Second item\n"
764///     ).unwrap(),
765///     vec![
766///         (1, "First item".into()),
767///         (2, "Second item".into()),
768///     ],
769/// );
770/// ```
771///
772/// Indentation due to structural elements like block quotes and lists
773/// is ignored:
774///
775/// ```
776/// use mdbook_i18n_helpers::extract_messages;
777///
778/// let messages = extract_messages(
779///     "> *   Hello, this is a\n\
780///      >     list in a quote.\n\
781///      >\n\
782///      >     This is the second\n\
783///      >     paragraph.\n"
784/// ).unwrap();
785/// assert_eq!(
786///     messages,
787///     vec![
788///         (1, "Hello, this is a list in a quote.".into()),
789///         (4, "This is the second paragraph.".into()),
790///     ],
791/// );
792/// ```
793pub fn extract_messages(document: &str) -> Result<Vec<(usize, ExtractedMessage)>, CmarkError> {
794    let events = extract_events(document, None);
795    let mut messages = Vec::new();
796    let mut state = None;
797
798    for group in group_events(&events)? {
799        match group {
800            Group::Translate { events, comment } => {
801                if let Some((lineno, _)) = events.first() {
802                    let (text, new_state) = reconstruct_markdown(&events, state)?;
803                    // Skip empty messages since they are special:
804                    // they contains the PO file metadata.
805                    if !text.trim().is_empty() {
806                        messages.push((
807                            *lineno,
808                            ExtractedMessage {
809                                message: text,
810                                comment,
811                            },
812                        ));
813                    }
814                    state = Some(new_state);
815                }
816            }
817            Group::Skip(events) => {
818                let (_, new_state) = reconstruct_markdown(&events, state)?;
819                state = Some(new_state);
820            }
821        }
822    }
823
824    Ok(messages)
825}
826
827/// Trim `new_events` if they're wrapped in an unwanted paragraph.
828///
829/// If `new_events` is wrapped in a paragraph and `old_events` isn't,
830/// then the paragraph is removed. This is useful when a text event
831/// has been wrapped in a paragraph:
832///
833/// ```
834/// use pulldown_cmark::{Event, Tag, TagEnd};
835/// use mdbook_i18n_helpers::{extract_events, reconstruct_markdown, trim_paragraph};
836///
837/// let old_events = vec![(1, Event::Text("A line of text".into()))];
838/// let (markdown, _) = reconstruct_markdown(&old_events, None).unwrap();
839/// let new_events = extract_events(&markdown, None);
840/// // The stand-alone text has been wrapped in an extra paragraph:
841/// assert_eq!(
842///     new_events,
843///     &[
844///         (1, Event::Start(Tag::Paragraph)),
845///         (1, Event::Text("A line of text".into())),
846///         (1, Event::End(TagEnd::Paragraph)),
847///     ],
848/// );
849///
850/// assert_eq!(
851///     trim_paragraph(&new_events, &old_events),
852///     &[(1, Event::Text("A line of text".into()))],
853/// );
854/// ```
855pub fn trim_paragraph<'a, 'event>(
856    new_events: &'a [(usize, Event<'event>)],
857    old_events: &'a [(usize, Event<'event>)],
858) -> &'a [(usize, Event<'event>)] {
859    use pulldown_cmark::Event::{End, Start};
860    use pulldown_cmark::Tag::Paragraph;
861    match new_events {
862        [(_, Start(Paragraph)), inner @ .., (_, End(TagEnd::Paragraph))] => match old_events {
863            [(_, Start(Paragraph)), .., (_, End(TagEnd::Paragraph))] => new_events,
864            [..] => inner,
865        },
866        [..] => new_events,
867    }
868}
869
870/// Translate `events` using `catalog`.
871pub fn translate_events<'a>(
872    events: &'a [(usize, Event<'a>)],
873    catalog: &'a Catalog,
874) -> Result<Vec<(usize, Event<'a>)>, CmarkError> {
875    let mut translated_events = Vec::new();
876    let mut state = None;
877
878    for group in group_events(events)? {
879        match group {
880            Group::Translate { events, .. } => {
881                // Reconstruct the message.
882                let (msgid, new_state) = reconstruct_markdown(&events, state.clone())?;
883                let translated = catalog
884                    .find_message(None, &msgid, None)
885                    .filter(|msg| !msg.flags().is_fuzzy() && msg.is_translated())
886                    .and_then(|msg| msg.msgstr().ok());
887                match translated {
888                    Some(msgstr) => {
889                        // Generate new events for `msgstr`, taking
890                        // care to trim away unwanted paragraphs.
891                        translated_events.extend_from_slice(trim_paragraph(
892                            &extract_events(msgstr, state),
893                            &events,
894                        ));
895                    }
896                    None => translated_events.extend_from_slice(&events),
897                }
898                // Advance the state.
899                state = Some(new_state);
900            }
901            Group::Skip(events) => {
902                // Copy the events unchanged to the output.
903                translated_events.extend_from_slice(&events);
904                // Advance the state.
905                let (_, new_state) = reconstruct_markdown(&events, state)?;
906                state = Some(new_state);
907            }
908        }
909    }
910
911    Ok(translated_events)
912}
913
914#[cfg(test)]
915mod tests {
916    use super::*;
917    use pretty_assertions::assert_eq;
918    use pulldown_cmark::Alignment;
919    use pulldown_cmark::CodeBlockKind;
920    use pulldown_cmark::Event::*;
921    use pulldown_cmark::HeadingLevel::*;
922    use pulldown_cmark::Tag::*;
923
924    /// Extract messages in `document`, assert they match `expected`.
925    #[track_caller]
926    fn assert_extract_messages(document: &str, expected: &[(usize, &str)]) {
927        assert_eq!(
928            extract_messages(document)
929                .unwrap()
930                .iter()
931                .map(|(lineno, msg)| (*lineno, &msg.message[..]))
932                .collect::<Vec<_>>(),
933            expected,
934        );
935    }
936
937    #[test]
938    fn extract_events_empty() {
939        assert_eq!(extract_events("", None), vec![]);
940    }
941
942    #[test]
943    fn extract_events_paragraph() {
944        assert_eq!(
945            extract_events("foo bar", None),
946            vec![
947                (1, Start(Paragraph)),
948                (1, Text("foo bar".into())),
949                (1, End(TagEnd::Paragraph)),
950            ]
951        );
952    }
953
954    #[test]
955    fn extract_events_softbreak() {
956        assert_eq!(
957            extract_events("foo\nbar", None),
958            vec![
959                (1, Start(Paragraph)),
960                (1, Text("foo".into())),
961                (1, Text(" ".into())),
962                (2, Text("bar".into())),
963                (1, End(TagEnd::Paragraph)),
964            ]
965        );
966    }
967
968    #[test]
969    fn extract_events_heading() {
970        assert_eq!(
971            extract_events("# Foo Bar", None),
972            vec![
973                (
974                    1,
975                    Start(Tag::Heading {
976                        level: H1,
977                        id: None,
978                        classes: vec![],
979                        attrs: vec![]
980                    })
981                ),
982                (1, Text("Foo Bar".into())),
983                (1, End(TagEnd::Heading(H1))),
984            ]
985        );
986    }
987
988    #[test]
989    fn extract_events_list_item() {
990        assert_eq!(
991            extract_events("* foo bar", None),
992            vec![
993                (1, Start(List(None))),
994                (1, Start(Item)),
995                (1, Text("foo bar".into())),
996                (1, End(TagEnd::Item)),
997                (1, End(TagEnd::List(false))),
998            ]
999        );
1000    }
1001
    #[test]
    fn extract_events_code_block() {
        // Obtain the state which results from opening an indented
        // code block.
        let (_, state) =
            reconstruct_markdown(&[(1, Start(CodeBlock(CodeBlockKind::Indented)))], None).unwrap();
        // With this state, each line becomes its own text event and
        // the newlines are kept.
        assert_eq!(
            extract_events("foo\nbar\nbaz", Some(state)),
            vec![
                (1, Text("foo\n".into())),
                (2, Text("bar\n".into())),
                (3, Text("baz".into())),
            ]
        );

        // Compare with extraction without state: the same input is
        // a paragraph where the newlines have become spaces.
        assert_eq!(
            extract_events("foo\nbar\nbaz", None),
            vec![
                (1, Start(Paragraph)),
                (1, Text("foo".into())),
                (1, Text(" ".into())),
                (2, Text("bar".into())),
                (2, Text(" ".into())),
                (3, Text("baz".into())),
                (1, End(TagEnd::Paragraph)),
            ]
        );
    }
1029
1030    #[test]
1031    fn extract_events_comments() {
1032        assert_eq!(
1033            extract_events("<!-- mdbook-xgettext:skip -->\nHello", None),
1034            vec![
1035                (1, Start(HtmlBlock)),
1036                (1, Html("<!-- mdbook-xgettext:skip -->\n".into())),
1037                (1, End(TagEnd::HtmlBlock)),
1038                (2, Start(Paragraph)),
1039                (2, Text("Hello".into())),
1040                (2, End(TagEnd::Paragraph)),
1041            ]
1042        );
1043    }
1044
    #[test]
    fn extract_events_html_block() {
        // Obtain the state which results from opening a table, its
        // head, and a cell.
        let (_, state) = reconstruct_markdown(
            &[
                (1, Start(Table(vec![Alignment::None]))),
                (1, Start(TableHead)),
                (1, Start(TableCell)),
            ],
            None,
        )
        .unwrap();
        // Should be parsed as an inline in a table.
        assert_eq!(
            extract_events("<img />", Some(state)),
            vec![(1, InlineHtml("<img />".into()))]
        );

        // Compare with extraction without state: the same input is
        // an HTML block.
        assert_eq!(
            extract_events("<img />", None),
            vec![
                (1, Start(HtmlBlock)),
                (1, Html("<img />".into())),
                (1, End(TagEnd::HtmlBlock)),
            ]
        );
    }
1072
1073    #[test]
1074    fn extract_messages_empty() {
1075        assert_extract_messages("", &[]);
1076    }
1077
1078    #[test]
1079    fn extract_messages_keep_empty_inline_html() {
1080        // Keep inline html tags
1081        assert_extract_messages("<span></span>", &[(1, "<span></span>")]);
1082    }
1083
1084    #[test]
1085    fn extract_messages_keep_whitespace_inline_html() {
1086        // span is an inline html tag so even whitespace is kept as is
1087        assert_extract_messages("<span>  </span>", &[(1, "<span>  </span>")]);
1088    }
1089
1090    #[test]
1091    fn extract_messages_ignore_whitespace_only_block_html() {
1092        // Whitespace in block level html tags is ignored
1093        assert_extract_messages("<p>  </p>", &[]);
1094    }
1095
1096    #[test]
1097    fn extract_messages_single_line() {
1098        assert_extract_messages("This is a paragraph.", &[(1, "This is a paragraph.")]);
1099    }
1100
1101    #[test]
1102    fn extract_messages_simple() {
1103        assert_extract_messages(
1104            "This is\n\
1105             the first\n\
1106             paragraph.🦀\n\
1107             \n\
1108             Second paragraph.",
1109            &[
1110                (1, "This is the first paragraph.🦀"),
1111                (5, "Second paragraph."),
1112            ],
1113        );
1114    }
1115
1116    #[test]
1117    fn extract_messages_leading_newlines() {
1118        assert_extract_messages(
1119            "\n\
1120             \n\
1121             \n\
1122             This is the\n\
1123             first paragraph.",
1124            &[(4, "This is the first paragraph.")],
1125        );
1126    }
1127
1128    #[test]
1129    fn extract_messages_trailing_newlines() {
1130        assert_extract_messages(
1131            "This is\n\
1132             a paragraph.\n\
1133             \n\
1134             \n",
1135            &[(1, "This is a paragraph.")],
1136        );
1137    }
1138
1139    #[test]
1140    fn extract_messages_styled_text() {
1141        // The parser normalizes "*emphasis*" to "_emphasis_" and
1142        // "__strong emphasis__" to "**strong emphasis**".
1143        assert_extract_messages(
1144            "**This** __~~message~~__ _has_ `code` *style*\n",
1145            &[(1, "**This** **~~message~~** _has_ `code` _style_")],
1146        );
1147    }
1148
    #[test]
    fn extract_messages_inline_html() {
        // Inline HTML tag is kept as is in the translation.
        //
        // NOTE(review): the input opens a `<span>` but closes a
        // `</div>`. Presumably this is unintentional; confirm
        // before changing the fixture, since the expected message
        // mirrors the input verbatim.
        assert_extract_messages(
            "Hi from <span dir=\"ltr\">Rust</div>",
            &[(1, "Hi from <span dir=\"ltr\">Rust</div>")],
        );
    }
1157
1158    #[test]
1159    fn extract_messages_block_html() {
1160        // block level HTML tag is skipped, but text inside is extracted.
1161        assert_extract_messages(
1162            "<div class=\"warning\">\n\
1163            \n\
1164            Beware of the dog!\n\
1165            \n\
1166            </div>",
1167            &[(3, "Beware of the dog!")],
1168        );
1169    }
1170
1171    #[test]
1172    fn extract_messages_mixed_html() {
1173        // block level HTML tag is skipped, but text inside is extracted with inline html as is.
1174        assert_extract_messages(
1175            "<div>\n\
1176            \n\
1177            Hi from <span dir=\"ltr\">Rust</span>\n\
1178            \n\
1179            </div>",
1180            &[(3, "Hi from <span dir=\"ltr\">Rust</span>")],
1181        );
1182    }
1183
1184    #[test]
1185    fn extract_messages_inline_link() {
1186        assert_extract_messages(
1187            "See [this page](https://example.com) for more info.",
1188            &[(1, "See [this page](https://example.com) for more info.")],
1189        );
1190    }
1191
1192    #[test]
1193    fn extract_messages_reference_link() {
1194        assert_extract_messages(
1195            "See [this page][1] for more info.\n\n\
1196             [1]: https://example.com",
1197            // The parser expands reference links on the fly.
1198            &[(1, "See [this page](https://example.com) for more info.")],
1199        );
1200    }
1201
1202    #[test]
1203    fn extract_messages_collapsed_link() {
1204        // We make the parser expand collapsed links on the fly.
1205        assert_extract_messages(
1206            "Click [here][]!\n\n\
1207             [here]: http://example.net/",
1208            &[(1, "Click [here](http://example.net/)!")],
1209        );
1210    }
1211
1212    #[test]
1213    fn extract_messages_shortcut_link() {
1214        assert_extract_messages(
1215            "Click [here]!\n\n\
1216             [here]: http://example.net/",
1217            &[(1, "Click [here](http://example.net/)!")],
1218        );
1219    }
1220
1221    #[test]
1222    fn extract_messages_autolink() {
1223        assert_extract_messages(
1224            "Visit <http://example.net>!",
1225            &[(1, "Visit <http://example.net>!")],
1226        );
1227    }
1228
1229    #[test]
1230    fn extract_messages_email() {
1231        assert_extract_messages(
1232            "Contact <info@example.net>!",
1233            &[(1, "Contact <info@example.net>!")],
1234        );
1235    }
1236
1237    #[test]
1238    fn extract_messages_broken_reference_link() {
1239        // A reference link without the corresponding link definition
1240        // results in an escaped link.
1241        //
1242        // See `SourceMap::extract_messages` for a more complex
1243        // approach which can work around this in some cases.
1244        assert_extract_messages("[foo][unknown]", &[(1, r"\[foo\]\[unknown\]")]);
1245    }
1246
1247    #[test]
1248    fn extract_messages_footnotes() {
1249        assert_extract_messages(
1250            "
1251The document[^1] text.
1252
1253[^1]: The footnote text.
1254",
1255            &[
1256                (2, "The document[^1] text."), //
1257                (4, "The footnote text."),
1258            ],
1259        );
1260    }
1261
    #[test]
    fn extract_messages_block_quote() {
        // Each paragraph becomes its own message, both inside and
        // outside the block quote. Note that the leading `--` of the
        // last paragraph comes back with a backslash in front.
        assert_extract_messages(
            r"One of my favorite quotes is:

> Don't believe everything you read on the Internet.
>
> I didn't say this second part, but I needed a paragraph for testing.

--Abraham Lincoln
",
            &[
                (1, "One of my favorite quotes is:"),
                (3, "Don't believe everything you read on the Internet."),
                (
                    5,
                    "I didn't say this second part, but I needed a paragraph for testing.",
                ),
                (7, "\\--Abraham Lincoln"),
            ],
        );
    }
1284
1285    #[test]
1286    fn extract_messages_table() {
1287        let input = "\
1288            | Module Type       | Description\n\
1289            |-------------------|-------------------------\n\
1290            | `rust_binary`     | Produces a Rust binary.\n\
1291            | `rust_library`    | Produces a Rust library.\n\
1292        ";
1293        assert_extract_messages(
1294            input,
1295            &[
1296                (1, "Module Type"),
1297                (1, "Description"),
1298                (3, "`rust_binary`"),
1299                (3, "Produces a Rust binary."),
1300                (4, "`rust_library`"),
1301                (4, "Produces a Rust library."),
1302            ],
1303        );
1304    }
1305
1306    #[test]
1307    fn extract_messages_code_block() {
1308        assert_extract_messages(
1309            "Preamble\n```rust\n// Example:\nfn hello() {\n  some_code()\n\n  todo!()\n}\n```\nPostamble",
1310            &[
1311                (1, "Preamble"),
1312                (
1313                    3,
1314                    "// Example:\n",
1315                ),
1316                (10, "Postamble"),
1317            ],
1318        );
1319    }
1320
1321    #[test]
1322    fn extract_messages_two_code_blocks() {
1323        assert_extract_messages(
1324            "```\n\
1325             \"First\" block\n\
1326             ```\n\
1327             ```\n\
1328             \"Second\" block\n\
1329             ```\n\
1330             ",
1331            &[
1332                (1, "```\n\"First\" block\n```"), //
1333                (4, "```\n\"Second\" block\n```"),
1334            ],
1335        );
1336    }
1337
1338    #[test]
1339    fn extract_messages_quoted_code_block() {
1340        assert_extract_messages(
1341            "\
1342            > Preamble\n\
1343            > ```rust\n\
1344            > fn hello() {\n\
1345            >     some_code()\n\
1346            >\n\
1347            >     // FIXME: do something here!\n\
1348            >     todo!()\n\
1349            > }\n\
1350            > ```\n\
1351            > Postamble",
1352            &[
1353                (1, "Preamble"),
1354                (6, "// FIXME: do something here!\n"),
1355                (10, "Postamble"),
1356            ],
1357        );
1358    }
1359
1360    #[test]
1361    fn extract_messages_code_block_with_block_comment() {
1362        assert_extract_messages(
1363            "```rust\n\
1364            /* block comment\n\
1365             * /* nested block comment\n\
1366             * */\n\
1367             * \n\
1368             * \n\
1369             * \n\
1370             * */\n\
1371            ```\n",
1372            &[(
1373                2,
1374                "/* block comment\n* /* nested block comment\n* */\n* \n* \n* \n* */",
1375            )],
1376        );
1377    }
1378
1379    #[test]
1380    fn extract_messages_code_block_with_continuous_line_comments() {
1381        assert_extract_messages(
1382            r"```rust
1383// continuous
1384// line
1385// comments
1386{
1387    // continuous
1388    // line
1389    // comments
1390    let a = 1; // single line comment
1391    let b = 1; // single line comment
1392}
1393```",
1394            &[
1395                (2, "// continuous\n// line\n// comments\n"),
1396                (6, "// continuous\n    // line\n    // comments\n"),
1397                (9, "// single line comment\n"),
1398                (10, "// single line comment\n"),
1399            ],
1400        );
1401    }
1402
1403    #[test]
1404    fn extract_messages_multi_language_code_blocks() {
1405        assert_extract_messages(
1406            r#"```c
1407// C
1408'C'; "C";
1409```
1410```html
1411<!-- HTML
1412HTML -->
1413```
1414```javascript
1415`JavaScript`
1416```
1417```ruby
1418# Ruby
1419```"#,
1420            &[
1421                (2, "// C\n'C'"),
1422                (3, "\"C\""),
1423                (6, "<!-- HTML\nHTML -->"),
1424                (10, "`JavaScript`"),
1425                (13, "# Ruby\n"),
1426            ],
1427        );
1428    }
1429
    #[test]
    fn extract_messages_details() {
        // This isn't great: we lose the text which directly follows
        // an HTML tag. Here, "Some Details" is not extracted.
        assert_extract_messages(
            "Preamble\n\
             <details>\n\
             Some Details\n\
             </details>\n\
             \n\
             Postamble",
            &[
                (1, "Preamble"), //
                // Missing "Some Details"
                (6, "Postamble"),
            ],
        );
        // It works well enough when `<details>` has blank lines
        // before and after.
        assert_extract_messages(
            "Preamble\n\
             \n\
             <details>\n\
             \n\
             Some Details\n\
             \n\
             </details>\n\
             \n\
             Postamble",
            &[
                (1, "Preamble"), //
                (5, "Some Details"),
                (9, "Postamble"),
            ],
        );
    }
1465
1466    #[test]
1467    fn extract_messages_list() {
1468        assert_extract_messages(
1469            "Some text\n * List item 1🦀\n * List item 2\n\nMore text",
1470            &[
1471                (1, "Some text"), //
1472                (2, "List item 1🦀"),
1473                (3, "List item 2"),
1474                (5, "More text"),
1475            ],
1476        );
1477    }
1478
1479    #[test]
1480    fn extract_messages_multilevel_list() {
1481        assert_extract_messages(
1482            "Some text\n * List item 1\n * List item 2\n    * Sublist 1\n    * Sublist 2\n\nMore text",
1483            &[
1484                (1, "Some text"), //
1485                (2, "List item 1"),
1486                (3, "List item 2"),
1487                (4, "Sublist 1"),
1488                (5, "Sublist 2"),
1489                (7, "More text"),
1490            ],
1491        );
1492    }
1493
1494    #[test]
1495    fn extract_messages_list_with_paragraphs() {
1496        assert_extract_messages(
1497            r"* Item 1.
1498* Item 2,
1499  two lines.
1500
1501  * Sub 1.
1502  * Sub 2.
1503",
1504            &[
1505                (1, "Item 1."),
1506                (2, "Item 2, two lines."),
1507                (5, "Sub 1."),
1508                (6, "Sub 2."),
1509            ],
1510        );
1511    }
1512
1513    #[test]
1514    fn extract_messages_headings() {
1515        assert_extract_messages(
1516            r"Some text
1517# Headline News🦀
1518
1519* A
1520* List
1521
1522## Subheading
1523",
1524            &[
1525                (1, "Some text"),
1526                (2, "Headline News🦀"),
1527                (4, "A"),
1528                (5, "List"),
1529                (7, "Subheading"),
1530            ],
1531        );
1532    }
1533
1534    #[test]
1535    fn extract_messages_code_followed_by_details() {
1536        // This is a regression test for an error that would
1537        // incorrectly combine CodeBlock and HTML.
1538        assert_extract_messages(
1539            r"```bob
1540// BOB
1541```
1542
1543<details>
1544
1545* Blah blah
1546
1547</details>
1548",
1549            &[
1550                (1, "```bob\n// BOB\n```"), //
1551                (7, "Blah blah"),
1552            ],
1553        );
1554    }
1555
1556    #[test]
1557    fn extract_messages_backslashes() {
1558        // Demonstrate how a single backslash in the Markdown becomes
1559        // a backslash-escaped backslash when we extract the text.
1560        // This is consistent with the CommonMark spec:
1561        // https://spec.commonmark.org/0.30/#backslash-escapes.
1562        // However, it causes problems for LaTeX preprocessors:
1563        // https://github.com/google/mdbook-i18n-helpers/issues/105.
1564        assert_extract_messages(
1565            r"
1566$$
1567\sum_{n=1}^{\infty} 2^{-n} = 1
1568$$
1569",
1570            &[(2, r"$$ \\sum\_{n=1}^{\infty} 2^{-n} = 1 $$")],
1571        );
1572    }
1573
1574    #[test]
1575
1576    fn extract_messages_skip_simple() {
1577        assert_extract_messages(
1578            r"<!-- mdbook-xgettext:skip -->
1579
1580This is a paragraph.",
1581            &[],
1582        );
1583    }
1584
1585    #[test]
1586    fn extract_messages_skip_next_paragraph_ok() {
1587        assert_extract_messages(
1588            r"<!-- mdbook-xgettext:skip -->
1589This is a paragraph.
1590
1591This should be translated.
1592",
1593            &[(4, "This should be translated.")],
1594        );
1595    }
1596
1597    #[test]
1598    fn extract_messages_skip_next_codeblock() {
1599        assert_extract_messages(
1600            r"<!-- mdbook-xgettext:skip -->
1601```
1602def f(x): return x * x
1603```
1604This should be translated.
1605",
1606            &[(5, "This should be translated.")],
1607        );
1608    }
1609
1610    #[test]
1611    fn extract_messages_skip_back_to_back() {
1612        assert_extract_messages(
1613            r"<!-- mdbook-xgettext:skip -->
1614```
1615def f(x): return x * x
1616```
1617<!-- mdbook-xgettext:skip -->
1618This should not translated.
1619
1620But *this* should!
1621",
1622            &[(8, "But _this_ should!")],
1623        );
1624    }
1625
1626    #[test]
1627    fn extract_messages_block_html_skip() {
1628        // The comment is a block level html tag.
1629        assert_extract_messages(
1630            "<!-- mdbook-xgettext:skip -->\n\
1631            This is ignored\n\
1632            \n\
1633            but this is not",
1634            &[(4, "but this is not")],
1635        );
1636    }
1637
1638    #[test]
1639    fn extract_messages_inline_html_skips() {
1640        // The comment is an inline html tag.
1641        assert_extract_messages(
1642            "
1643this should be translated <!-- mdbook-xgettext:skip --> but not this.
1644... nor this.
1645
1646But *this* should!",
1647            &[(2, "this should be translated "), (5, "But _this_ should!")],
1648        );
1649    }
1650
1651    #[test]
1652    fn extract_messages_skipping_second_item() {
1653        assert_extract_messages(
1654            "
1655* A
1656<!-- mdbook-xgettext:skip -->
1657* B
1658* C
1659",
1660            &[(2, "A"), (5, "C")],
1661        );
1662    }
1663
1664    #[test]
1665    fn extract_messages_skipping_second_paragraphed_item() {
1666        assert_extract_messages(
1667            "
1668* A
1669
1670<!-- mdbook-xgettext:skip -->
1671* B
1672
1673* C
1674",
1675            &[(2, "A"), (7, "C")],
1676        );
1677    }
1678
    #[test]
    fn extract_messages_skipping_inline_second_item() {
        // This isn't great: we lose the text following an HTML
        // comment. This is very similar to the failure mode of the
        // `extract_messages_details` test.
        //
        // The root cause is the Markdown spec and how the Markdown
        // parser treats HTML blocks: the text that immediately
        // follows an HTML block on the same line is included as part
        // of the HTML block.
        assert_extract_messages(
            "
* A
* <!-- mdbook-xgettext:skip --> B
* C
",
            &[(2, "A")],
        );
    }
1698
1699    #[test]
1700    fn extract_messages_inline_skip_to_end_of_block() {
1701        assert_extract_messages(
1702            "foo <!-- mdbook-xgettext:skip --> **bold** bar
1703still skipped
1704
1705not-skipped",
1706            &[(1, "foo "), (4, "not-skipped")],
1707        );
1708    }
1709
1710    #[test]
1711    fn extract_messages_automatic_skipping_nontranslatable_codeblocks_simple() {
1712        assert_extract_messages(
1713            r"
1714```python
1715def g(x):
1716  this_should_be_skipped_no_strings_or_comments()
1717```
1718",
1719            &[],
1720        );
1721    }
1722
1723    #[test]
1724    fn extract_messages_automatic_skipping_nontranslatable_codeblocks() {
1725        assert_extract_messages(
1726            r#"
1727```python
1728def f(x):
1729  print("this should be translated")
1730```
1731
1732
1733```python
1734def g(x):
1735  but_this_should_not()
1736```
1737"#,
1738            &[(4, "\"this should be translated\"")],
1739        );
1740    }
1741
1742    #[test]
1743    fn extract_messages_without_language_specifier() {
1744        assert_extract_messages(
1745            r#"
1746```
1747def f(x):
1748  print("this should be translated")
1749```
1750
1751
1752```
1753def g(x):
1754  but_this_should_not()
1755```
1756"#,
1757            &[(
1758                2,
1759                "```\ndef f(x):\n  print(\"this should be translated\")\n```",
1760            )],
1761        );
1762    }
1763
1764    #[test]
1765    fn extract_messages_codeblock_in_codeblock() {
1766        assert_extract_messages(
1767            r#"
1768````
1769```
1770// codeblock in codeblock
1771```
1772````
1773"#,
1774            &[(2, "````\n```\n// codeblock in codeblock\n```\n````")],
1775        );
1776    }
1777
1778    #[test]
1779    fn extract_message_comments() {
1780        assert_eq!(
1781            extract_messages(
1782                "
1783<!-- mdbook-xgettext:comment: first comment! -->
1784Hello world!
1785"
1786            )
1787            .unwrap(),
1788            vec![(
1789                3,
1790                ExtractedMessage {
1791                    message: "Hello world!".into(),
1792                    comment: "first comment!".into(),
1793                }
1794            )]
1795        );
1796    }
1797
1798    #[test]
1799    fn extract_message_comments_multiple_joined() {
1800        assert_eq!(
1801            extract_messages(
1802                "
1803<!-- mdbook-xgettext:comment: this is a test -->
1804<!-- mdbook-xgettext:comment: of a comment that spans. -->
1805Greetings!
1806"
1807            )
1808            .unwrap(),
1809            vec![(
1810                4,
1811                ExtractedMessage {
1812                    message: "Greetings!".into(),
1813                    comment: "this is a test of a comment that spans.".into(),
1814                }
1815            )]
1816        );
1817    }
1818
1819    #[test]
1820    fn extract_message_multiple_comments() {
1821        assert_eq!(
1822            extract_messages(
1823                "
1824before-no-comment
1825
1826<!-- mdbook-xgettext:comment: another -->
1827Hello again, this is some text
1828with a comment on it.
1829
1830<!-- mdbook-xgettext:comment: one more comment. -->
1831after
1832
1833after-no-comment
1834"
1835            )
1836            .unwrap(),
1837            vec![
1838                (
1839                    2,
1840                    ExtractedMessage {
1841                        message: "before-no-comment".into(),
1842                        comment: "".into(),
1843                    }
1844                ),
1845                (
1846                    5,
1847                    ExtractedMessage {
1848                        message: "Hello again, this is some text with a comment on it.".into(),
1849                        comment: "another".into(),
1850                    }
1851                ),
1852                (
1853                    9,
1854                    ExtractedMessage {
1855                        message: "after".into(),
1856                        comment: "one more comment.".into(),
1857                    }
1858                ),
1859                (
1860                    11,
1861                    ExtractedMessage {
1862                        message: "after-no-comment".into(),
1863                        comment: "".into(),
1864                    }
1865                ),
1866            ]
1867        );
1868    }
1869
1870    #[test]
1871    fn extract_message_comments_on_codeblock() {
1872        assert_eq!(
1873            extract_messages(
1874                r#"
1875<!-- mdbook-xgettext:comment: greetings! -->
1876```python
1877print("Hello world")
1878```
1879"#
1880            )
1881            .unwrap(),
1882            vec![(
1883                4,
1884                ExtractedMessage {
1885                    message: "\"Hello world\"".into(),
1886                    comment: "greetings!".into(),
1887                }
1888            ),]
1889        );
1890    }
1891
1892    #[test]
1893    fn extract_admonish_codeblock() {
1894        assert_extract_messages(
1895            r#"```admonish tip title="Important Tips"
1896My Message
1897```"#,
1898            &[(
1899                1,
1900                "```admonish tip title=\"Important Tips\"\nMy Message\n```",
1901            )],
1902        );
1903    }
1904
1905    #[test]
1906    fn extract_admonish_codeblock_no_title() {
1907        assert_extract_messages(
1908            r#"```admonish tip
1909My Message
1910```"#,
1911            &[(1, "```admonish tip\nMy Message\n```")],
1912        );
1913    }
1914
1915    #[test]
1916    fn extract_admonish_codeblock_no_close_codeblock() {
1917        assert_extract_messages(
1918            r#"```admonish tip
1919My Message
1920"#,
1921            &[(1, "```admonish tip\nMy Message\n```")],
1922        );
1923    }
1924
1925    #[test]
1926    fn extract_newlang_codeblock_string() {
1927        assert_extract_messages(
1928            r#"```new_lang
1929some_syntax = "My String";
1930```"#,
1931            &[(1, "```new_lang\nsome_syntax = \"My String\";\n```")],
1932        );
1933    }
1934
1935    #[test]
1936    fn extract_nolang_codeblock_string() {
1937        assert_extract_messages(
1938            r#"```
1939some_syntax = "My String";
1940```"#,
1941            &[(1, "```\nsome_syntax = \"My String\";\n```")],
1942        );
1943    }
1944
1945    #[test]
1946    fn extract_nolang_nostring_codeblock() {
1947        assert_extract_messages(
1948            r#"```
1949some_syntax = do_something();
1950```"#,
1951            &[],
1952        );
1953    }
1954}