meta_language/
mixed_regions.rs1use crate::configuration::RegionDetectionPolicy;
2use crate::source::{ByteRange, Point, SourceSpan};
3
/// Language name used for plain-text regions and as the fallback when content sniffing finds
/// no match.
const TXT_LANGUAGE: &str = "txt";
5
6#[derive(Clone, Debug, PartialEq, Eq)]
8pub struct EmbeddedRegion {
9 language: String,
10 span: SourceSpan,
11}
12
13impl EmbeddedRegion {
14 pub(crate) const fn new(language: String, span: SourceSpan) -> Self {
15 Self { language, span }
16 }
17
18 #[must_use]
20 pub fn language(&self) -> &str {
21 &self.language
22 }
23
24 #[must_use]
26 pub const fn span(&self) -> SourceSpan {
27 self.span
28 }
29}
30
31pub(crate) fn detect_embedded_regions(
32 text: &str,
33 language: &str,
34 policy: RegionDetectionPolicy,
35) -> Vec<EmbeddedRegion> {
36 let mut regions = Vec::new();
37 match language.to_ascii_lowercase().as_str() {
38 TXT_LANGUAGE => regions.push(region_for(text, TXT_LANGUAGE.to_string(), 0, text.len())),
39 "markdown" => {
40 regions.extend(detect_markdown_fenced_regions(text, policy));
41 regions.extend(detect_markdown_html_regions(text));
42 }
43 "html" => {
44 regions.extend(detect_html_element_regions(text, "script", "JavaScript"));
45 regions.extend(detect_html_element_regions(text, "style", "CSS"));
46 regions.extend(detect_html_style_attributes(text));
47 }
48 _ => {}
49 }
50 regions
51}
52
53fn detect_markdown_fenced_regions(
54 text: &str,
55 policy: RegionDetectionPolicy,
56) -> Vec<EmbeddedRegion> {
57 let mut regions = Vec::new();
58 let mut offset = 0;
59 let mut open_fence: Option<(String, usize)> = None;
60
61 for line in text.split_inclusive('\n') {
62 let trimmed = line.trim_end_matches(['\r', '\n']).trim_start();
63 if let Some((language_tag, content_start)) = open_fence.take() {
64 if trimmed.starts_with("```") {
65 if let Some(language) = region_language_from_tag_or_content(
66 &language_tag,
67 &text[content_start..offset],
68 policy,
69 ) {
70 regions.push(region_for(text, language, content_start, offset));
71 }
72 } else {
73 open_fence = Some((language_tag, content_start));
74 }
75 } else if let Some(rest) = trimmed.strip_prefix("```") {
76 let language_tag = rest
77 .split_whitespace()
78 .next()
79 .unwrap_or_default()
80 .to_string();
81 open_fence = Some((language_tag, offset + line.len()));
82 }
83 offset += line.len();
84 }
85
86 if let Some((language_tag, content_start)) = open_fence {
87 if let Some(language) =
88 region_language_from_tag_or_content(&language_tag, &text[content_start..], policy)
89 {
90 regions.push(region_for(text, language, content_start, text.len()));
91 }
92 }
93
94 regions
95}
96
97fn region_language_from_tag_or_content(
98 language_tag: &str,
99 content: &str,
100 policy: RegionDetectionPolicy,
101) -> Option<String> {
102 match policy {
103 RegionDetectionPolicy::NameDriven => {
104 (!language_tag.is_empty()).then(|| language_tag.to_string())
105 }
106 RegionDetectionPolicy::ContentDriven => {
107 Some(sniff_language(content).unwrap_or(TXT_LANGUAGE).to_string())
108 }
109 RegionDetectionPolicy::Both => {
110 if language_tag.is_empty() {
111 Some(sniff_language(content).unwrap_or(TXT_LANGUAGE).to_string())
112 } else {
113 Some(language_tag.to_string())
114 }
115 }
116 }
117}
118
119fn detect_markdown_html_regions(text: &str) -> Vec<EmbeddedRegion> {
120 let mut regions = Vec::new();
121 let mut search_start = 0;
122
123 while let Some(relative_start) = text[search_start..].find('<') {
124 let start = search_start + relative_start;
125 let Some(next) = text[start + 1..].chars().next() else {
126 break;
127 };
128 if !next.is_ascii_alphabetic() {
129 search_start = start + 1;
130 continue;
131 }
132
133 let Some(close) = text[start..].find('>') else {
134 break;
135 };
136 let first_tag_end = start + close + 1;
137 let tag_name = text[start + 1..first_tag_end - 1]
138 .split_whitespace()
139 .next()
140 .unwrap_or_default()
141 .trim_matches('/')
142 .to_ascii_lowercase();
143 if tag_name.is_empty() {
144 search_start = first_tag_end;
145 continue;
146 }
147
148 let closing_tag = format!("</{tag_name}>");
149 let region_end = text[first_tag_end..]
150 .to_ascii_lowercase()
151 .find(&closing_tag)
152 .map_or(first_tag_end, |relative_end| {
153 first_tag_end + relative_end + closing_tag.len()
154 });
155 regions.push(region_for(text, "HTML".to_string(), start, region_end));
156 search_start = region_end;
157 }
158
159 regions
160}
161
162fn detect_html_element_regions(text: &str, element: &str, language: &str) -> Vec<EmbeddedRegion> {
163 let mut regions = Vec::new();
164 let lower = text.to_ascii_lowercase();
165 let open = format!("<{element}");
166 let close = format!("</{element}>");
167 let mut search_start = 0;
168
169 while let Some(relative_start) = lower[search_start..].find(&open) {
170 let start = search_start + relative_start;
171 let Some(open_end_relative) = lower[start..].find('>') else {
172 break;
173 };
174 let content_start = start + open_end_relative + 1;
175 let Some(close_relative) = lower[content_start..].find(&close) else {
176 break;
177 };
178 let content_end = content_start + close_relative;
179 regions.push(region_for(
180 text,
181 language.to_string(),
182 content_start,
183 content_end,
184 ));
185 search_start = content_end + close.len();
186 }
187
188 regions
189}
190
191fn detect_html_style_attributes(text: &str) -> Vec<EmbeddedRegion> {
192 let mut regions = Vec::new();
193 let lower = text.to_ascii_lowercase();
194 let mut search_start = 0;
195
196 while let Some(relative_start) = lower[search_start..].find("style=\"") {
197 let value_start = search_start + relative_start + "style=\"".len();
198 let Some(value_end_relative) = text[value_start..].find('"') else {
199 break;
200 };
201 let value_end = value_start + value_end_relative;
202 regions.push(region_for(text, "CSS".to_string(), value_start, value_end));
203 search_start = value_end + 1;
204 }
205
206 regions
207}
208
/// Guesses a language name from `content` using a few textual heuristics.
///
/// Leading whitespace is ignored. The checks run in this order and the first hit wins:
///
/// 1. contains `fn main` -> `"rust"`
/// 2. starts with `def ` -> `"Python"`
/// 3. starts with `<` -> `"HTML"`
/// 4. contains `function `, `const ` or `let ` -> `"JavaScript"`
/// 5. starts with `SELECT ` (ASCII-case-insensitive) -> `"SQL"`
///
/// Returns `None` when nothing matches.
fn sniff_language(content: &str) -> Option<&'static str> {
    const SQL_PREFIX: &[u8] = b"SELECT ";

    let trimmed = content.trim_start();

    if trimmed.contains("fn main") {
        return Some("rust");
    }
    if trimmed.starts_with("def ") {
        return Some("Python");
    }
    if trimmed.starts_with('<') {
        return Some("HTML");
    }
    if ["function ", "const ", "let "]
        .iter()
        .any(|keyword| trimmed.contains(keyword))
    {
        return Some("JavaScript");
    }

    // Compare only the prefix bytes rather than uppercasing (and allocating) the whole content.
    let starts_with_select = matches!(
        trimmed.as_bytes().get(..SQL_PREFIX.len()),
        Some(prefix) if prefix.eq_ignore_ascii_case(SQL_PREFIX)
    );
    starts_with_select.then_some("SQL")
}
230
231fn region_for(text: &str, language: String, start: usize, end: usize) -> EmbeddedRegion {
232 EmbeddedRegion::new(
233 language,
234 SourceSpan::new(
235 ByteRange::new(start, end),
236 point_at_byte(text, start),
237 point_at_byte(text, end),
238 ),
239 )
240}
241
242fn point_at_byte(text: &str, byte: usize) -> Point {
243 let mut row = 0;
244 let mut column = 0;
245
246 for (index, character) in text.char_indices() {
247 if index >= byte {
248 break;
249 }
250 if character == '\n' {
251 row += 1;
252 column = 0;
253 } else {
254 column += 1;
255 }
256 }
257
258 Point::new(row, column)
259}