Skip to main content

lang_check/
scoping.rs

1use std::ops::Range;
2
/// A region of text with an explicitly annotated natural language.
///
/// Produced by [`ScopeParser::parse`] from scope markers such as
/// `<!-- lang: fr -->` or `// @lang: de`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScopedRegion {
    /// Language tag exactly as written in the marker (e.g. "fr", "de",
    /// "en-US"). Intended to be a BCP-47 tag, but only lightly validated:
    /// non-empty, at most 10 bytes, no whitespace.
    pub language: String,
    /// Byte range this scope covers: from the first byte after the marker
    /// line up to the start of the next marker line, or to the end of the
    /// text for the last marker.
    pub byte_range: Range<usize>,
}

/// Parses language scope annotations from document text.
///
/// A marker must be the only content on its line (surrounding whitespace is
/// ignored). Supported marker formats:
/// - `<!-- lang: xx -->` (HTML/Markdown comments)
/// - `// @lang: xx` (line comments)
/// - `/* @lang: xx */` (block comments)
/// - `% @lang: xx` (LaTeX comments)
///
/// In every format the leading `@` and the space after the colon are
/// optional.
pub struct ScopeParser;

impl ScopeParser {
    /// Extract all language scope regions from the given text.
    ///
    /// Returns scoped regions sorted by byte offset. Each region starts right
    /// after its marker line and ends where the next marker line begins (or
    /// at the end of `text`); marker lines themselves never belong to any
    /// region. Text before the first marker - or all of the text when there
    /// are no markers - is *not* included; the caller should fall back to
    /// the default language for those ranges. Markers whose scope would be
    /// empty (e.g. two markers on consecutive lines) produce no region.
    #[must_use]
    pub fn parse(text: &str) -> Vec<ScopedRegion> {
        // Keep the full byte range of every marker line: its end is where
        // that marker's scope begins, and its start is where the previous
        // scope has to stop. Lines yielded by `line_byte_offsets` already
        // include their trailing newline, so no extra skipping is needed.
        let mut markers = line_byte_offsets(text)
            .filter_map(|(line_start, line)| {
                let lang = Self::extract_marker(line)?;
                Some((line_start..line_start + line.len(), lang))
            })
            .peekable();

        let mut regions = Vec::new();
        while let Some((marker_line, language)) = markers.next() {
            let start = marker_line.end;
            let end = markers
                .peek()
                .map_or(text.len(), |(next_line, _)| next_line.start);

            // Back-to-back markers (or a marker on the very last line) would
            // yield an empty scope; those are dropped.
            if end > start {
                regions.push(ScopedRegion {
                    language,
                    byte_range: start..end,
                });
            }
        }

        regions
    }

    /// Look up the language for a given byte offset, if it falls within a scoped region.
    ///
    /// Returns the language of the first region in `regions` whose byte range
    /// contains `byte_offset`, or `None` when no region covers it.
    #[must_use]
    pub fn language_at(regions: &[ScopedRegion], byte_offset: usize) -> Option<&str> {
        regions
            .iter()
            .find(|region| region.byte_range.contains(&byte_offset))
            .map(|region| region.language.as_str())
    }

    /// Recognise a marker line and return its language tag.
    ///
    /// The line is trimmed first, so indentation and a trailing `\n` or
    /// `\r\n` are irrelevant. Returns `None` when the line is not one of the
    /// supported comment shapes or the comment does not hold a valid
    /// `lang:` directive.
    fn extract_marker(line: &str) -> Option<String> {
        let trimmed = line.trim();

        // The four comment openers are mutually exclusive, so at most one of
        // these alternatives can produce the comment's inner text.
        let directive = trimmed
            // <!-- lang: xx -->
            .strip_prefix("<!--")
            .and_then(|rest| rest.strip_suffix("-->"))
            // // @lang: xx
            .or_else(|| trimmed.strip_prefix("//"))
            // /* @lang: xx */
            .or_else(|| {
                trimmed
                    .strip_prefix("/*")
                    .and_then(|rest| rest.strip_suffix("*/"))
            })
            // % @lang: xx (LaTeX)
            .or_else(|| trimmed.strip_prefix('%'))?;

        Self::parse_lang_directive(directive.trim())
    }

    /// Parse the inside of a marker comment.
    ///
    /// Accepts `lang: xx`, `@lang: xx`, `lang:xx` and `@lang:xx`. The tag is
    /// rejected when it is empty, longer than 10 bytes, or contains any
    /// whitespace (spaces, tabs, ...).
    fn parse_lang_directive(s: &str) -> Option<String> {
        let s = s.strip_prefix('@').unwrap_or(s);
        let lang = s.strip_prefix("lang")?.strip_prefix(':')?.trim();

        let plausible =
            !lang.is_empty() && lang.len() <= 10 && !lang.chars().any(char::is_whitespace);
        plausible.then(|| lang.to_string())
    }
}

/// Yields `(byte_offset_of_line_start, line_str)` for each line including the trailing `\n`.
///
/// The final line is yielded without a `\n` when the text does not end with
/// one; an empty text yields nothing.
fn line_byte_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
    // The running state is the byte offset at which the next line starts.
    text.split_inclusive('\n').scan(0usize, |next_start, line| {
        let start = *next_start;
        *next_start += line.len();
        Some((start, line))
    })
}
134
#[cfg(test)]
mod tests {
    use super::*;

    /// Slice of `text` covered by `region`, so tests can pin exact boundaries
    /// instead of only checking the detected language.
    fn scoped<'a>(text: &'a str, region: &ScopedRegion) -> &'a str {
        &text[region.byte_range.clone()]
    }

    #[test]
    fn html_comment_marker() {
        let text = "English text.\n<!-- lang: fr -->\nTexte français.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "fr");
        assert_eq!(scoped(text, &regions[0]), "Texte français.\n");
    }

    #[test]
    fn line_comment_marker() {
        let text = "English.\n// @lang: de\nDeutscher Text.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "de");
        assert_eq!(scoped(text, &regions[0]), "Deutscher Text.\n");
    }

    #[test]
    fn block_comment_marker() {
        let text = "Hello.\n/* @lang: es */\nTexto español.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "es");
        assert_eq!(scoped(text, &regions[0]), "Texto español.\n");
    }

    #[test]
    fn latex_comment_marker() {
        let text = "English.\n% @lang: fr\nFrançais.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "fr");
        assert_eq!(scoped(text, &regions[0]), "Français.\n");
    }

    #[test]
    fn multiple_regions() {
        let text = "\
English paragraph.
<!-- lang: fr -->
Paragraphe français.
<!-- lang: de -->
Deutscher Absatz.
";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 2);
        assert_eq!(regions[0].language, "fr");
        assert_eq!(regions[1].language, "de");
        // Each region holds only its own paragraph; marker lines belong to
        // neither region.
        assert_eq!(scoped(text, &regions[0]), "Paragraphe français.\n");
        assert_eq!(scoped(text, &regions[1]), "Deutscher Absatz.\n");
    }

    #[test]
    fn consecutive_markers_skip_empty_scope() {
        let text = "<!-- lang: fr -->\n<!-- lang: de -->\nText\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "de");
        assert_eq!(scoped(text, &regions[0]), "Text\n");
    }

    #[test]
    fn no_markers() {
        let text = "Just plain English text with no annotations.";
        let regions = ScopeParser::parse(text);
        assert!(regions.is_empty());
    }

    #[test]
    fn language_at_lookup() {
        let text = "Hello.\n<!-- lang: fr -->\nBonjour.\n";
        let regions = ScopeParser::parse(text);
        // "Bonjour" starts somewhere after the marker
        let bonjour_offset = text.find("Bonjour").unwrap();
        assert_eq!(
            ScopeParser::language_at(&regions, bonjour_offset),
            Some("fr")
        );
        assert_eq!(ScopeParser::language_at(&regions, 0), None);
        // The marker line itself is not part of the scope it opens.
        let marker_offset = text.find("<!--").unwrap();
        assert_eq!(ScopeParser::language_at(&regions, marker_offset), None);
        // Byte ranges are half-open: the last byte is covered, one past it is not.
        assert_eq!(
            ScopeParser::language_at(&regions, text.len() - 1),
            Some("fr")
        );
        assert_eq!(ScopeParser::language_at(&regions, text.len()), None);
    }

    #[test]
    fn marker_without_at_sign() {
        let text = "Hello.\n<!-- lang: ja -->\n日本語テキスト.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "ja");
        assert_eq!(scoped(text, &regions[0]), "日本語テキスト.\n");
    }

    #[test]
    fn compact_directive_without_space() {
        let text = "Hi.\n// lang:it\nCiao.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "it");
        assert_eq!(scoped(text, &regions[0]), "Ciao.\n");
    }

    #[test]
    fn ignores_invalid_markers() {
        let text = "<!-- lang: -->\n<!-- lang: this is not a lang -->\n<!-- notlang: fr -->\n";
        let regions = ScopeParser::parse(text);
        assert!(regions.is_empty());
    }
}