use memchr::memmem;
use oxc_span::SourceType;
/// Returns `true` when `b` is a space, horizontal tab, line feed, or carriage return.
///
/// No other byte counts as whitespace here (form feed, for example, does not).
#[inline(always)]
const fn is_ws(b: u8) -> bool {
    b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
}
/// Returns `true` when `b` terminates an attribute name.
///
/// Terminators are `=`, `>`, `/`, and the four whitespace bytes (space, tab, LF, CR).
#[inline(always)]
const fn is_attr_name_end(b: u8) -> bool {
    match b {
        // Structural characters of a tag.
        b'=' | b'>' | b'/' => true,
        // Whitespace separating attributes.
        b' ' | b'\t' | b'\n' | b'\r' => true,
        _ => false,
    }
}
/// Returns `true` when `b` terminates an unquoted attribute value.
///
/// Terminators are `>`, `/`, and the four whitespace bytes (space, tab, LF, CR).
/// Unlike an attribute name, `=` does not end an unquoted value.
#[inline(always)]
const fn is_unquoted_value_end(b: u8) -> bool {
    match b {
        b'>' | b'/' => true,
        b' ' | b'\t' | b'\n' | b'\r' => true,
        _ => false,
    }
}
/// Language of a `<script>` block, as named by its `lang` attribute.
///
/// [`ScriptLanguage::TypeScript`] is the `Default` variant.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub enum ScriptLanguage {
    /// Selected by `lang="js"` or `lang="javascript"`.
    JavaScript,
    /// Selected by `lang="ts"` or `lang="typescript"`; also the default.
    #[default]
    TypeScript,
    /// Selected by `lang="jsx"`.
    JSX,
    /// Selected by `lang="tsx"`.
    TSX,
    /// Any `lang` value that is not one of the recognised spellings above.
    Unknown,
}
impl ScriptLanguage {
    /// Classifies the raw bytes of a `lang` attribute value.
    ///
    /// The comparison is exact (case-sensitive, no trimming). Recognised
    /// spellings are `js`/`javascript`, `jsx`, `ts`/`typescript`, and `tsx`;
    /// everything else, including the empty slice, yields [`Self::Unknown`].
    pub fn from_bytes(lang: &[u8]) -> Self {
        match lang {
            b"js" | b"javascript" => Self::JavaScript,
            b"jsx" => Self::JSX,
            b"ts" | b"typescript" => Self::TypeScript,
            b"tsx" => Self::TSX,
            _ => Self::Unknown,
        }
    }

    /// Converts this language into the parser's [`SourceType`].
    ///
    /// `JavaScript` maps to `SourceType::mjs()`, `TypeScript` to
    /// `SourceType::ts()`, `JSX` to `SourceType::jsx()`, `TSX` to
    /// `SourceType::tsx()`, and `Unknown` to `SourceType::cjs()`.
    pub fn to_source_type(&self) -> SourceType {
        match *self {
            Self::JavaScript => SourceType::mjs(),
            Self::JSX => SourceType::jsx(),
            Self::TypeScript => SourceType::ts(),
            Self::TSX => SourceType::tsx(),
            Self::Unknown => SourceType::cjs(),
        }
    }
}
/// Outcome of [`ScriptDetector::detect`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub struct DetectResult {
    /// Detected language; `TypeScript` when no script tag or no `lang` attribute was found.
    pub language: ScriptLanguage,
    /// Whether a `<script` tag outside of an HTML comment was located.
    pub script_found: bool,
    /// Whether that tag carried a `lang` attribute.
    pub lang_attr_found: bool,
    /// Byte offset into the scanned input where the `lang` value starts (0 when absent).
    pub lang_start: usize,
    /// Byte offset one past the end of the `lang` value (0 when absent).
    pub lang_end: usize,
}
/// Locates the first `<script` tag that is not inside an HTML comment and
/// reads its `lang` attribute.
///
/// Holds one prebuilt `memmem::Finder` per needle so they can be reused
/// across calls.
pub struct ScriptDetector {
    /// Searches for `<script`.
    script_finder: memmem::Finder<'static>,
    /// Searches for the comment opener `<!--`.
    comment_start_finder: memmem::Finder<'static>,
    /// Searches for the comment closer `-->`.
    comment_end_finder: memmem::Finder<'static>,
}
impl Default for ScriptDetector {
    /// Equivalent to [`ScriptDetector::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl ScriptDetector {
pub fn new() -> Self {
Self {
script_finder: memmem::Finder::new(b"<script"),
comment_start_finder: memmem::Finder::new(b"<!--"),
comment_end_finder: memmem::Finder::new(b"-->"),
}
}
#[inline(always)]
fn validate_script_tag(bytes: &[u8], script_end: usize) -> Option<usize> {
match bytes.get(script_end) {
Some(b' ') => Some(script_end + 1), Some(b'>') => Some(script_end), Some(b'\t') | Some(b'\n') | Some(b'\r') => Some(script_end + 1), _ => None, }
}
#[inline(always)]
pub fn find_script(&self, bytes: &[u8]) -> Option<usize> {
if bytes.starts_with(b"<script") {
return Self::validate_script_tag(bytes, 7);
}
let mut search_start: usize = 0;
loop {
let script_offset = self.script_finder.find(&bytes[search_start..])?;
let script_pos = search_start + script_offset;
let script_end = script_pos + 7;
let Some(attr_start) = Self::validate_script_tag(bytes, script_end) else {
search_start = script_end;
continue;
};
if script_pos < 4 {
return Some(attr_start);
}
const WINDOW_SIZE: usize = 1024;
let window_start = script_pos.saturating_sub(WINDOW_SIZE).max(search_start);
let window = &bytes[window_start..script_pos];
let comment_offset = match self.comment_start_finder.find(window) {
Some(rel_offset) => window_start + rel_offset,
None => {
if window_start > search_start {
let before_window = &bytes[search_start..window_start];
if memchr::memchr(b'!', before_window).is_some() {
if let Some(early_comment) =
self.comment_start_finder.find(before_window)
{
let early_pos = search_start + early_comment + 4;
if let Some(end_off) = self
.comment_end_finder
.find(&bytes[early_pos..window_start])
{
search_start = early_pos + end_off + 3;
continue;
}
let Some(end_off) =
self.comment_end_finder.find(&bytes[early_pos..])
else {
return cold_unclosed_comment();
};
search_start = early_pos + end_off + 3;
continue;
}
}
}
return Some(attr_start);
}
};
let comment_content_start = comment_offset + 4;
if comment_content_start < script_pos {
let region_before_script = &bytes[comment_content_start..script_pos];
if let Some(end_in_region) = self.comment_end_finder.find(region_before_script) {
let after_comment = comment_content_start + end_in_region + 3;
if after_comment >= script_pos
|| self
.comment_start_finder
.find(&bytes[after_comment..script_pos])
.is_none()
{
return Some(attr_start);
}
search_start = after_comment;
continue;
}
}
let Some(end_offset) = self
.comment_end_finder
.find(&bytes[comment_content_start..])
else {
return cold_unclosed_comment();
};
search_start = comment_content_start + end_offset + 3;
}
}
#[inline(always)]
pub fn detect(&self, bytes: &[u8]) -> DetectResult {
let Some(n) = self.find_script(bytes) else {
return DetectResult {
language: ScriptLanguage::TypeScript,
script_found: false,
lang_attr_found: false,
lang_start: 0,
lang_end: 0,
};
};
if let Some(fast_result) = parse_script_tag_for_lang_fast(&bytes[n..]) {
return match fast_result {
Some(info) => DetectResult {
language: info.language,
script_found: true,
lang_attr_found: true,
lang_start: n + info.start,
lang_end: n + info.end,
},
None => DetectResult {
language: ScriptLanguage::TypeScript,
script_found: true,
lang_attr_found: false,
lang_start: 0,
lang_end: 0,
},
};
}
if let Some(info) = parse_script_tag_for_lang(&bytes[n..]) {
return DetectResult {
language: info.language,
script_found: true,
lang_attr_found: true,
lang_start: n + info.start,
lang_end: n + info.end,
};
}
DetectResult {
language: ScriptLanguage::TypeScript,
script_found: true,
lang_attr_found: false,
lang_start: 0,
lang_end: 0,
}
}
}
/// A `lang` attribute located by one of the tag parsers.
struct LangInfo {
    /// Language the attribute value maps to.
    language: ScriptLanguage,
    /// Start of the value, as an offset into the slice given to the parser.
    start: usize,
    /// One past the end of the value, as an offset into the same slice.
    end: usize,
}
/// Single-pass scan of a script tag's attribute area for `lang=...`.
///
/// `bytes` starts where the tag's attributes start (or at the closing `>`).
///
/// * `Some(Some(info))` — a `lang` attribute with a value was found.
/// * `Some(None)` — the end of the tag (`>` or `/>`) was reached first, or
///   `bytes` is empty.
/// * `None` — the input ended before a decision could be made (no closing
///   `>`, or an unterminated quoted `lang` value).
#[inline(always)]
fn parse_script_tag_for_lang_fast(bytes: &[u8]) -> Option<Option<LangInfo>> {
    if matches!(bytes.first(), None | Some(&b'>')) {
        return Some(None);
    }

    let len = bytes.len();
    // Index of the first non-whitespace byte at or after `from`, if any.
    let skip_ws = |from: usize| {
        bytes[from..]
            .iter()
            .position(|&c| !is_ws(c))
            .map(|off| from + off)
    };

    // Quote character of the attribute value currently being skipped, if any.
    let mut open_quote: Option<u8> = None;
    let mut i = 0;
    while i < len {
        let b = bytes[i];

        if let Some(quote) = open_quote {
            if b == quote {
                open_quote = None;
            }
            i += 1;
            continue;
        }

        match b {
            b'>' => return Some(None),
            b'/' if bytes.get(i + 1) == Some(&b'>') => return Some(None),
            b'"' | b'\'' => {
                open_quote = Some(b);
                i += 1;
                continue;
            }
            _ => {}
        }

        // `lang` only counts at the start of the slice or right after
        // whitespace, and at least one more byte must follow it.
        let at_name_start = i == 0 || is_ws(bytes[i - 1]);
        if !(at_name_start && i + 4 < len && bytes[i..].starts_with(b"lang")) {
            i += 1;
            continue;
        }

        let eq_pos = skip_ws(i + 4)?;
        if bytes[eq_pos] != b'=' {
            // Some longer name (e.g. `language`) or a bare `lang`; keep scanning.
            i += 1;
            continue;
        }

        let value_pos = skip_ws(eq_pos + 1)?;
        let opener = bytes[value_pos];
        let (value_start, value_end) = if opener == b'"' || opener == b'\'' {
            let start = value_pos + 1;
            let close = bytes[start..].iter().position(|&c| c == opener)?;
            (start, start + close)
        } else {
            let stop = bytes[value_pos..]
                .iter()
                .position(|&c| is_unquoted_value_end(c))
                .map_or(len, |off| value_pos + off);
            (value_pos, stop)
        };

        return Some(Some(LangInfo {
            language: match_lang_value(&bytes[value_start..value_end]),
            start: value_start,
            end: value_end,
        }));
    }

    None
}
/// Attribute-by-attribute parse of a script tag, looking for `lang`.
///
/// `bytes` starts where the tag's attributes start (or at the closing `>`).
/// Returns the `lang` attribute's value range and language, or `None` when
/// the tag ends (or the input runs out) without one. A `lang` attribute that
/// is immediately followed by `>` or `/` is reported as `TypeScript` with an
/// empty range at that position.
#[inline(always)]
fn parse_script_tag_for_lang(bytes: &[u8]) -> Option<LangInfo> {
    if matches!(bytes.first(), None | Some(&b'>')) {
        return None;
    }

    let len = bytes.len();
    let mut pos = 0;
    loop {
        // Whitespace between attributes.
        while pos < len && is_ws(bytes[pos]) {
            pos += 1;
        }
        match bytes.get(pos).copied() {
            None | Some(b'>') => return None,
            Some(b'/') => {
                if bytes.get(pos + 1) == Some(&b'>') {
                    return None;
                }
                pos += 1;
                continue;
            }
            Some(_) => {}
        }

        // Attribute name.
        let name_start = pos;
        while pos < len && !is_attr_name_end(bytes[pos]) {
            pos += 1;
        }
        let is_lang_attr = &bytes[name_start..pos] == b"lang";

        while pos < len && is_ws(bytes[pos]) {
            pos += 1;
        }
        let after_name = *bytes.get(pos)?;
        match after_name {
            b'>' | b'/' => {
                if is_lang_attr {
                    return Some(LangInfo {
                        language: ScriptLanguage::TypeScript,
                        start: pos,
                        end: pos,
                    });
                }
                if after_name == b'>' {
                    return None;
                }
                continue;
            }
            b'=' => pos += 1,
            // Valueless attribute; `pos` already sits on the next name.
            _ => continue,
        }

        // Attribute value.
        while pos < len && is_ws(bytes[pos]) {
            pos += 1;
        }
        let opener = *bytes.get(pos)?;
        let (value_start, value_end) = if opener == b'"' || opener == b'\'' {
            pos += 1;
            let start = pos;
            if is_lang_attr {
                // An unterminated `lang` value runs to the end of the input.
                while pos < len && bytes[pos] != opener {
                    pos += 1;
                }
            } else {
                match memchr::memchr(opener, &bytes[pos..]) {
                    Some(off) => pos += off,
                    None => return cold_unclosed_quote(),
                }
            }
            let end = pos;
            if pos < len {
                // Step over the closing quote.
                pos += 1;
            }
            (start, end)
        } else {
            let start = pos;
            while pos < len && !is_unquoted_value_end(bytes[pos]) {
                pos += 1;
            }
            (start, pos)
        };

        if is_lang_attr {
            return Some(LangInfo {
                language: match_lang_value(&bytes[value_start..value_end]),
                start: value_start,
                end: value_end,
            });
        }
    }
}
/// Result for the case where a `<!--` has no matching `-->`: always `None`.
///
/// Kept as a separate `#[cold]`, never-inlined function; it is what
/// `ScriptDetector::find_script` returns on that path.
#[cold]
#[inline(never)]
fn cold_unclosed_comment() -> Option<usize> {
    None
}
/// Result for the case where a quoted attribute value is never closed: always `None`.
///
/// Kept as a separate `#[cold]`, never-inlined function; it is what
/// `parse_script_tag_for_lang` returns when a non-`lang` attribute's quote
/// has no closing counterpart.
#[cold]
#[inline(never)]
fn cold_unclosed_quote() -> Option<LangInfo> {
    None
}
/// Maps the raw bytes of a `lang` attribute value to a [`ScriptLanguage`].
///
/// Delegates to [`ScriptLanguage::from_bytes`] so the table of recognised
/// spellings (`js`/`javascript`, `jsx`, `ts`/`typescript`, `tsx`, anything
/// else being `Unknown`) is defined in exactly one place.
#[inline(always)]
fn match_lang_value(value: &[u8]) -> ScriptLanguage {
    ScriptLanguage::from_bytes(value)
}
// Unit tests for `ScriptDetector::detect`: tag discovery, HTML-comment
// skipping, `lang` attribute parsing, and the reported value offsets.
#[cfg(test)]
mod tests {
    use super::*;
    // Input without any `<script` tag: nothing is reported as found.
    #[test]
    fn no_script_tag() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<template><div>Hello</div></template>");
        assert!(!result.script_found);
        assert!(!result.lang_attr_found);
    }
    // A bare `<script>` is found and the language falls back to TypeScript.
    #[test]
    fn script_no_lang() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<template/><script></script>");
        assert!(result.script_found);
        assert!(!result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::TypeScript);
    }
    // The next four tests cover each recognised short `lang` spelling.
    #[test]
    fn script_lang_ts() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<template/><script lang=\"ts\"></script>");
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::TypeScript);
    }
    #[test]
    fn script_lang_tsx() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<template/><script lang=\"tsx\"></script>");
        assert_eq!(result.language, ScriptLanguage::TSX);
    }
    #[test]
    fn script_lang_jsx() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<template/><script lang=\"jsx\"></script>");
        assert_eq!(result.language, ScriptLanguage::JSX);
    }
    #[test]
    fn script_lang_js() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<template/><script lang=\"js\"></script>");
        assert_eq!(result.language, ScriptLanguage::JavaScript);
    }
    // Tag at offset 0 exercises the `starts_with` fast path of `find_script`.
    #[test]
    fn script_at_beginning() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<script lang=\"tsx\"></script><template/>");
        assert_eq!(result.language, ScriptLanguage::TSX);
    }
    // A `<script` inside `<!-- ... -->` is ignored; the later real tag wins.
    #[test]
    fn skips_script_in_html_comment() {
        let detector = ScriptDetector::new();
        let result =
            detector.detect(b"<!-- <script lang=\"js\"> -->\n<script lang=\"ts\">\n</script>");
        assert!(result.script_found);
        assert_eq!(result.language, ScriptLanguage::TypeScript);
    }
    // The only `<script` is commented out, so no script is reported.
    #[test]
    fn no_script_when_only_commented() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<template/><!-- <script lang=\"ts\"></script> -->");
        assert!(!result.script_found);
    }
    // Many `!` bytes that are not part of `<!--` must not hide the tag.
    #[test]
    fn handles_bang_heavy_template() {
        let detector = ScriptDetector::new();
        let result = detector.detect(
            b"<template><div v-if=\"!a && !b && !c\">Hello!</div></template>\n<script lang=\"ts\"></script>"
        );
        assert!(result.script_found);
        assert_eq!(result.language, ScriptLanguage::TypeScript);
    }
    // Two consecutive comments, the second one containing a `<script`.
    #[test]
    fn handles_multiple_comments() {
        let detector = ScriptDetector::new();
        let result = detector
            .detect(b"<!-- c1 --><!-- <script lang=\"js\"> -->\n<script lang=\"ts\"></script>");
        assert!(result.script_found);
        assert_eq!(result.language, ScriptLanguage::TypeScript);
    }
    // `lang` is found regardless of where it sits among other attributes.
    #[test]
    fn lang_after_setup() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<script setup lang=\"ts\"></script>");
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::TypeScript);
    }
    #[test]
    fn lang_before_setup() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<script lang=\"tsx\" setup></script>");
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::TSX);
    }
    #[test]
    fn lang_after_multiple_attrs() {
        let detector = ScriptDetector::new();
        let result = detector
            .detect(b"<script setup async defer custom-attr=\"value\" lang=\"jsx\"></script>");
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::JSX);
    }
    // Long attribute names and a newline between attributes.
    #[test]
    fn lang_after_many_attrs_long() {
        let detector = ScriptDetector::new();
        let result = detector
            .detect(b"<script setup async super-random-prop-that-takes-a-very-long-name-but-should-not-affect-anything-whatsoever\n defer custom-attr=\"value\" lang=\"jsx\"></script>");
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::JSX);
    }
    // A `lang` attribute on a later, unrelated element must not be picked up.
    #[test]
    fn no_lang_but_lang_in_template() {
        let detector = ScriptDetector::new();
        let result = detector
            .detect(b"<script></script><template>\n<Comp lang=\"jsx\">Hello</Comp>\n</template>");
        assert!(result.script_found);
        assert!(!result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::TypeScript);
    }
    // Only the first script tag is inspected; here it has no `lang`.
    #[test]
    fn multiple_scripts_close_together() {
        let detector = ScriptDetector::new();
        let result = detector.detect(b"<script setup>\n</script>\n<script lang=\"js\"></script>");
        assert!(result.script_found);
        assert!(!result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::TypeScript);
    }
    // Only the first script tag is inspected; here it carries `lang="tsx"`.
    #[test]
    fn multiple_scripts_first_has_lang() {
        let detector = ScriptDetector::new();
        let result = detector
            .detect(b"<script setup lang=\"tsx\">\n</script>\n<script lang=\"js\"></script>");
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::TSX);
    }
    // Comment opener lies ~100 MB before the first `<script`, far outside the
    // 1024-byte look-behind window. NOTE: allocates roughly 100 MB.
    #[test]
    fn very_long_comment_script_input() {
        let mut input = Vec::new();
        input.extend_from_slice(b"<!-- ");
        input.extend(std::iter::repeat_n(b'!', 100_000_024));
        input.extend_from_slice(b"<script lang=\"ts\">-->\n<script lang=\"jsx\"></script>");
        let detector = ScriptDetector::new();
        let result = detector.detect(&input);
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::JSX);
    }
    // A `>` inside a quoted attribute value must not be taken as the tag end.
    #[test]
    fn generic_attr_with_angle_brackets() {
        let detector = ScriptDetector::new();
        let result = detector
            .detect(b"<script setup lang=\"ts\" generic=\"T extends Foo<Bar>\">\n</script>");
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::TypeScript);
    }
    // Same as above, with the quoted `>` appearing before `lang`.
    #[test]
    fn generic_attr_before_lang() {
        let detector = ScriptDetector::new();
        let result = detector
            .detect(b"<script setup generic=\"T extends Map<K, V>\" lang=\"tsx\">\n</script>");
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::TSX);
    }
    // Unrecognised values yield `Unknown` but still report their byte range.
    #[test]
    fn unknown_language_coffee() {
        let detector = ScriptDetector::new();
        let input = b"<script lang=\"coffee\"></script>";
        let result = detector.detect(input);
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::Unknown);
        assert_eq!(&input[result.lang_start..result.lang_end], b"coffee");
    }
    #[test]
    fn unknown_language_custom() {
        let detector = ScriptDetector::new();
        let input = b"<script lang=\"my-custom-lang\"></script>";
        let result = detector.detect(input);
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::Unknown);
        assert_eq!(
            &input[result.lang_start..result.lang_end],
            b"my-custom-lang"
        );
    }
    // Offsets are absolute: `<script lang="` is 14 bytes, so `tsx` spans 14..17.
    #[test]
    fn lang_offsets() {
        let detector = ScriptDetector::new();
        let input = b"<script lang=\"tsx\"></script>";
        let result = detector.detect(input);
        assert!(result.script_found);
        assert!(result.lang_attr_found);
        assert_eq!(result.language, ScriptLanguage::TSX);
        assert_eq!(result.lang_start, 14);
        assert_eq!(result.lang_end, 17);
        assert_eq!(&input[result.lang_start..result.lang_end], b"tsx");
    }
    // Single-quoted value: the range excludes the quotes.
    #[test]
    fn lang_offsets_single_quotes() {
        let detector = ScriptDetector::new();
        let input = b"<script lang='typescript'></script>";
        let result = detector.detect(input);
        assert_eq!(&input[result.lang_start..result.lang_end], b"typescript");
    }
    // Unquoted value: the range stops at the closing `>`.
    #[test]
    fn lang_offsets_unquoted() {
        let detector = ScriptDetector::new();
        let input = b"<script lang=js></script>";
        let result = detector.detect(input);
        assert_eq!(&input[result.lang_start..result.lang_end], b"js");
    }
    // Offsets stay relative to the whole input when the tag is not at the start.
    #[test]
    fn lang_offsets_after_template() {
        let detector = ScriptDetector::new();
        let input = b"<template><div>Hello</div></template>\n<script setup lang=\"tsx\"></script>";
        let result = detector.detect(input);
        assert_eq!(&input[result.lang_start..result.lang_end], b"tsx");
    }
}