use std::ops::Range;
/// A span of text tagged with the language named by a marker comment.
///
/// Values of this type are produced by [`ScopeParser::parse`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ScopedRegion {
    /// Language tag exactly as written in the marker (surrounding whitespace
    /// removed), e.g. `"fr"`.
    pub language: String,
    /// Half-open byte range into the parsed text that this region covers.
    pub byte_range: Range<usize>,
}
/// Stateless namespace for the language-scope parsing functions.
///
/// The type has no fields; its functionality is exposed through associated
/// functions such as `ScopeParser::parse` and `ScopeParser::language_at`.
pub struct ScopeParser;
impl ScopeParser {
    /// Scans `text` for language marker lines and returns the regions they open.
    ///
    /// A marker is a line consisting solely of a comment that carries a
    /// `lang: <tag>` (or `@lang: <tag>`) directive. Recognised comment forms are
    /// `<!-- ... -->`, `// ...`, `/* ... */` and `% ...`.
    ///
    /// Each region starts on the line after its marker (one empty line directly
    /// after the marker is skipped) and ends where the next marker line begins,
    /// or at the end of `text` for the last marker. Text before the first marker
    /// belongs to no region, and markers that would open an empty region are
    /// omitted from the result.
    ///
    /// The returned ranges are byte offsets into `text`, in document order.
    #[must_use]
    pub fn parse(text: &str) -> Vec<ScopedRegion> {
        // One recognised marker line.
        struct Marker {
            // Byte offset at which the marker line itself begins.
            line_start: usize,
            // Byte offset at which the text governed by the marker begins.
            scope_start: usize,
            language: String,
        }

        let markers: Vec<Marker> = line_byte_offsets(text)
            .filter_map(|(line_start, line)| {
                let language = Self::extract_marker(line)?;
                // `line` still contains its terminating newline, so this is
                // already the start of the following line.
                let after_line = line_start + line.len();
                // Skip a single empty line that directly follows the marker.
                let scope_start = if text.as_bytes().get(after_line) == Some(&b'\n') {
                    after_line + 1
                } else {
                    after_line
                };
                Some(Marker {
                    line_start,
                    scope_start,
                    language,
                })
            })
            .collect();

        // A region ends exactly where the next marker line starts. Using the
        // recorded line start (instead of searching backwards for newlines from
        // the next scope start) stays correct when the next marker is the last
        // line without a trailing newline, or is followed by a blank line.
        let region_ends: Vec<usize> = markers
            .iter()
            .skip(1)
            .map(|next| next.line_start)
            .chain(std::iter::once(text.len()))
            .collect();

        markers
            .into_iter()
            .zip(region_ends)
            .filter(|(marker, end)| *end > marker.scope_start)
            .map(|(marker, end)| ScopedRegion {
                language: marker.language,
                byte_range: marker.scope_start..end,
            })
            .collect()
    }

    /// Returns the language of the first region in `regions` whose byte range
    /// contains `byte_offset`, or `None` when no region covers that offset.
    #[must_use]
    pub fn language_at(regions: &[ScopedRegion], byte_offset: usize) -> Option<&str> {
        regions
            .iter()
            .find(|region| region.byte_range.contains(&byte_offset))
            .map(|region| region.language.as_str())
    }

    /// Returns the language tag if `line` is a marker comment, `None` otherwise.
    ///
    /// The line is trimmed first, so indentation and the line terminator are
    /// ignored. Delimited comment forms must open and close on the same line.
    fn extract_marker(line: &str) -> Option<String> {
        let trimmed = line.trim();
        // Comment forms are tried in a fixed order; the first one whose
        // delimiters match supplies the directive text.
        let directive = trimmed
            .strip_prefix("<!--")
            .and_then(|rest| rest.strip_suffix("-->"))
            .or_else(|| trimmed.strip_prefix("//"))
            .or_else(|| {
                trimmed
                    .strip_prefix("/*")
                    .and_then(|rest| rest.strip_suffix("*/"))
            })
            .or_else(|| trimmed.strip_prefix('%'))?;
        Self::parse_lang_directive(directive.trim())
    }

    /// Parses the inside of a comment as `lang:<tag>` with an optional leading
    /// `@`, returning the trimmed tag.
    ///
    /// Returns `None` when the `lang:` prefix is missing, or when the tag is
    /// empty, longer than ten bytes, or contains a space.
    fn parse_lang_directive(s: &str) -> Option<String> {
        // Upper bound, in bytes, on what is accepted as a language tag.
        const MAX_TAG_LEN: usize = 10;

        let body = s.strip_prefix('@').unwrap_or(s);
        let tag = body.strip_prefix("lang")?.strip_prefix(':')?.trim();
        if tag.is_empty() || tag.len() > MAX_TAG_LEN || tag.contains(' ') {
            return None;
        }
        Some(tag.to_string())
    }
}
/// Iterates over the lines of `text`, pairing each line with the byte offset
/// at which it starts.
///
/// Lines are split after every `'\n'` and keep that terminator, so the offset
/// of a line plus its length is the offset of the next line.
fn line_byte_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
    text.split_inclusive('\n').scan(0, |next_start, line| {
        let line_start = *next_start;
        *next_start += line.len();
        Some((line_start, line))
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An HTML comment marker opens a region covering the following line.
    #[test]
    fn html_comment_marker() {
        let text = "English text.\n<!-- lang: fr -->\nTexte français.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "fr");
        let scoped_text = &text[regions[0].byte_range.clone()];
        assert!(scoped_text.contains("Texte français"));
    }

    /// A `//` line comment with an `@lang:` directive is recognised.
    #[test]
    fn line_comment_marker() {
        let text = "English.\n// @lang: de\nDeutscher Text.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "de");
    }

    /// A single-line `/* ... */` block comment is recognised.
    #[test]
    fn block_comment_marker() {
        let text = "Hello.\n/* @lang: es */\nTexto español.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "es");
    }

    /// A `%` comment is recognised.
    #[test]
    fn latex_comment_marker() {
        let text = "English.\n% @lang: fr\nFrançais.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "fr");
    }

    /// Consecutive markers yield one region each, in document order.
    #[test]
    fn multiple_regions() {
        let text = "\
English paragraph.
<!-- lang: fr -->
Paragraphe français.
<!-- lang: de -->
Deutscher Absatz.
";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 2);
        assert_eq!(regions[0].language, "fr");
        assert_eq!(regions[1].language, "de");
    }

    /// Text without any marker produces no regions.
    #[test]
    fn no_markers() {
        let text = "Just plain English text with no annotations.";
        let regions = ScopeParser::parse(text);
        assert!(regions.is_empty());
    }

    /// Offsets inside a region resolve to its language; offsets before the
    /// first marker resolve to `None`.
    #[test]
    fn language_at_lookup() {
        let text = "Hello.\n<!-- lang: fr -->\nBonjour.\n";
        let regions = ScopeParser::parse(text);
        let bonjour_offset = text.find("Bonjour").unwrap();
        assert_eq!(
            ScopeParser::language_at(&regions, bonjour_offset),
            Some("fr")
        );
        assert_eq!(ScopeParser::language_at(&regions, 0), None);
    }

    /// The leading `@` of the directive is optional.
    #[test]
    fn marker_without_at_sign() {
        let text = "Hello.\n<!-- lang: ja -->\n日本語テキスト.\n";
        let regions = ScopeParser::parse(text);
        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0].language, "ja");
    }

    /// Empty tags, tags containing spaces, and other directive names are not
    /// treated as markers.
    #[test]
    fn ignores_invalid_markers() {
        let text = "<!-- lang: -->\n<!-- lang: this is not a lang -->\n<!-- notlang: fr -->\n";
        let regions = ScopeParser::parse(text);
        assert!(regions.is_empty());
    }
}