Skip to main content

meta_language/
mixed_regions.rs

1use crate::configuration::RegionDetectionPolicy;
2use crate::source::{ByteRange, Point, SourceSpan};
3
/// Language label for plain text: matched as a host language name and used as
/// the fallback label when content sniffing recognises nothing.
const TXT_LANGUAGE: &str = "txt";
5
6/// Embedded region discovered inside a mixed-language document.
7#[derive(Clone, Debug, PartialEq, Eq)]
8pub struct EmbeddedRegion {
9    language: String,
10    span: SourceSpan,
11}
12
13impl EmbeddedRegion {
14    pub(crate) const fn new(language: String, span: SourceSpan) -> Self {
15        Self { language, span }
16    }
17
18    /// Language detected for the embedded region.
19    #[must_use]
20    pub fn language(&self) -> &str {
21        &self.language
22    }
23
24    /// Source span covered by the embedded region.
25    #[must_use]
26    pub const fn span(&self) -> SourceSpan {
27        self.span
28    }
29}
30
31pub(crate) fn detect_embedded_regions(
32    text: &str,
33    language: &str,
34    policy: RegionDetectionPolicy,
35) -> Vec<EmbeddedRegion> {
36    let mut regions = Vec::new();
37    match language.to_ascii_lowercase().as_str() {
38        TXT_LANGUAGE => regions.push(region_for(text, TXT_LANGUAGE.to_string(), 0, text.len())),
39        "markdown" => {
40            regions.extend(detect_markdown_fenced_regions(text, policy));
41            regions.extend(detect_markdown_html_regions(text));
42        }
43        "html" => {
44            regions.extend(detect_html_element_regions(text, "script", "JavaScript"));
45            regions.extend(detect_html_element_regions(text, "style", "CSS"));
46            regions.extend(detect_html_style_attributes(text));
47        }
48        _ => {}
49    }
50    regions
51}
52
53fn detect_markdown_fenced_regions(
54    text: &str,
55    policy: RegionDetectionPolicy,
56) -> Vec<EmbeddedRegion> {
57    let mut regions = Vec::new();
58    let mut offset = 0;
59    let mut open_fence: Option<(String, usize)> = None;
60
61    for line in text.split_inclusive('\n') {
62        let trimmed = line.trim_end_matches(['\r', '\n']).trim_start();
63        if let Some((language_tag, content_start)) = open_fence.take() {
64            if trimmed.starts_with("```") {
65                if let Some(language) = region_language_from_tag_or_content(
66                    &language_tag,
67                    &text[content_start..offset],
68                    policy,
69                ) {
70                    regions.push(region_for(text, language, content_start, offset));
71                }
72            } else {
73                open_fence = Some((language_tag, content_start));
74            }
75        } else if let Some(rest) = trimmed.strip_prefix("```") {
76            let language_tag = rest
77                .split_whitespace()
78                .next()
79                .unwrap_or_default()
80                .to_string();
81            open_fence = Some((language_tag, offset + line.len()));
82        }
83        offset += line.len();
84    }
85
86    if let Some((language_tag, content_start)) = open_fence {
87        if let Some(language) =
88            region_language_from_tag_or_content(&language_tag, &text[content_start..], policy)
89        {
90            regions.push(region_for(text, language, content_start, text.len()));
91        }
92    }
93
94    regions
95}
96
97fn region_language_from_tag_or_content(
98    language_tag: &str,
99    content: &str,
100    policy: RegionDetectionPolicy,
101) -> Option<String> {
102    match policy {
103        RegionDetectionPolicy::NameDriven => {
104            (!language_tag.is_empty()).then(|| language_tag.to_string())
105        }
106        RegionDetectionPolicy::ContentDriven => {
107            Some(sniff_language(content).unwrap_or(TXT_LANGUAGE).to_string())
108        }
109        RegionDetectionPolicy::Both => {
110            if language_tag.is_empty() {
111                Some(sniff_language(content).unwrap_or(TXT_LANGUAGE).to_string())
112            } else {
113                Some(language_tag.to_string())
114            }
115        }
116    }
117}
118
119fn detect_markdown_html_regions(text: &str) -> Vec<EmbeddedRegion> {
120    let mut regions = Vec::new();
121    let mut search_start = 0;
122
123    while let Some(relative_start) = text[search_start..].find('<') {
124        let start = search_start + relative_start;
125        let Some(next) = text[start + 1..].chars().next() else {
126            break;
127        };
128        if !next.is_ascii_alphabetic() {
129            search_start = start + 1;
130            continue;
131        }
132
133        let Some(close) = text[start..].find('>') else {
134            break;
135        };
136        let first_tag_end = start + close + 1;
137        let tag_name = text[start + 1..first_tag_end - 1]
138            .split_whitespace()
139            .next()
140            .unwrap_or_default()
141            .trim_matches('/')
142            .to_ascii_lowercase();
143        if tag_name.is_empty() {
144            search_start = first_tag_end;
145            continue;
146        }
147
148        let closing_tag = format!("</{tag_name}>");
149        let region_end = text[first_tag_end..]
150            .to_ascii_lowercase()
151            .find(&closing_tag)
152            .map_or(first_tag_end, |relative_end| {
153                first_tag_end + relative_end + closing_tag.len()
154            });
155        regions.push(region_for(text, "HTML".to_string(), start, region_end));
156        search_start = region_end;
157    }
158
159    regions
160}
161
162fn detect_html_element_regions(text: &str, element: &str, language: &str) -> Vec<EmbeddedRegion> {
163    let mut regions = Vec::new();
164    let lower = text.to_ascii_lowercase();
165    let open = format!("<{element}");
166    let close = format!("</{element}>");
167    let mut search_start = 0;
168
169    while let Some(relative_start) = lower[search_start..].find(&open) {
170        let start = search_start + relative_start;
171        let Some(open_end_relative) = lower[start..].find('>') else {
172            break;
173        };
174        let content_start = start + open_end_relative + 1;
175        let Some(close_relative) = lower[content_start..].find(&close) else {
176            break;
177        };
178        let content_end = content_start + close_relative;
179        regions.push(region_for(
180            text,
181            language.to_string(),
182            content_start,
183            content_end,
184        ));
185        search_start = content_end + close.len();
186    }
187
188    regions
189}
190
191fn detect_html_style_attributes(text: &str) -> Vec<EmbeddedRegion> {
192    let mut regions = Vec::new();
193    let lower = text.to_ascii_lowercase();
194    let mut search_start = 0;
195
196    while let Some(relative_start) = lower[search_start..].find("style=\"") {
197        let value_start = search_start + relative_start + "style=\"".len();
198        let Some(value_end_relative) = text[value_start..].find('"') else {
199            break;
200        };
201        let value_end = value_start + value_end_relative;
202        regions.push(region_for(text, "CSS".to_string(), value_start, value_end));
203        search_start = value_end + 1;
204    }
205
206    regions
207}
208
/// Guesses the language of `content` from a handful of textual cues, after
/// skipping leading whitespace.
///
/// Checks run in this order and the first hit wins:
///
/// 1. contains `fn main` → `"rust"`
/// 2. starts with `def ` → `"Python"`
/// 3. starts with `<` → `"HTML"`
/// 4. starts with `SELECT ` (ASCII case-insensitive) → `"SQL"`
/// 5. contains `function `, `const ` or `let ` → `"JavaScript"`
///
/// Returns `None` when nothing matches. The SQL prefix test deliberately runs
/// before the JavaScript substring tests: a query such as
/// `SELECT outlet FROM …` contains `let ` and would otherwise be mislabelled.
fn sniff_language(content: &str) -> Option<&'static str> {
    const SQL_PREFIX: &[u8] = b"SELECT ";

    let trimmed = content.trim_start();
    // Compare only the leading bytes instead of upper-casing the whole input.
    let leading = trimmed.as_bytes();
    let starts_with_select = leading.len() >= SQL_PREFIX.len()
        && leading[..SQL_PREFIX.len()].eq_ignore_ascii_case(SQL_PREFIX);

    if trimmed.contains("fn main") {
        Some("rust")
    } else if trimmed.starts_with("def ") {
        Some("Python")
    } else if trimmed.starts_with('<') {
        Some("HTML")
    } else if starts_with_select {
        Some("SQL")
    } else if trimmed.contains("function ")
        || trimmed.contains("const ")
        || trimmed.contains("let ")
    {
        Some("JavaScript")
    } else {
        None
    }
}
230
231fn region_for(text: &str, language: String, start: usize, end: usize) -> EmbeddedRegion {
232    EmbeddedRegion::new(
233        language,
234        SourceSpan::new(
235            ByteRange::new(start, end),
236            point_at_byte(text, start),
237            point_at_byte(text, end),
238        ),
239    )
240}
241
242fn point_at_byte(text: &str, byte: usize) -> Point {
243    let mut row = 0;
244    let mut column = 0;
245
246    for (index, character) in text.char_indices() {
247        if index >= byte {
248            break;
249        }
250        if character == '\n' {
251            row += 1;
252            column = 0;
253        } else {
254            column += 1;
255        }
256    }
257
258    Point::new(row, column)
259}