use core::ops::Range;
/// Highlight category attached to a byte range of tokenized source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum TokenKind {
    /// A word from the language's keyword table (the C tokenizer also uses
    /// this for whole preprocessor lines).
    Keyword,
    /// A number, or a named constant such as `true`, `false` or `null`.
    Literal,
    /// A quoted string or character literal, including its delimiters.
    StringLit,
    /// A line or block comment, including its opening marker.
    Comment,
}
pub(crate) fn tokenize(lang: &str, src: &str) -> Vec<(Range<usize>, TokenKind)> {
match lang.trim().to_ascii_lowercase().as_str() {
"rust" | "rs" => tokenize_rust(src),
"json" | "jsonc" | "json5" => tokenize_json(src),
"toml" => tokenize_toml(src),
"python" | "py" => tokenize_python(src),
"javascript" | "js" | "typescript" | "ts" | "jsx" | "tsx" => tokenize_javascript(src),
"shell" | "sh" | "bash" | "zsh" | "console" => tokenize_shell(src),
"c" | "cpp" | "c++" | "h" | "hpp" | "cc" => tokenize_c(src),
_ => Vec::new(),
}
}
/// Words the Rust tokenizer reports as keywords, grouped by first letter.
#[rustfmt::skip]
const RUST_KW: &[&str] = &[
    "as", "async", "await",
    "break",
    "const", "continue", "crate",
    "dyn",
    "else", "enum", "extern",
    "fn", "for",
    "if", "impl", "in",
    "let", "loop",
    "match", "mod", "move", "mut",
    "pub",
    "ref", "return",
    "self", "Self", "static", "struct", "super",
    "trait", "type",
    "union", "unsafe", "use",
    "where", "while",
];
/// Tokenizes Rust source into highlight spans.
///
/// Handles `//` comments, nested `/* */` comments, plain, byte and raw string
/// literals, character literals, numbers, keywords and the `true` / `false`
/// literals. Lifetimes and loop labels (`'name`) are skipped without emitting
/// a token. Ranges are byte offsets into `src`.
fn tokenize_rust(src: &str) -> Vec<(Range<usize>, TokenKind)> {
    const CONSTANTS: &[&str] = &["true", "false"];

    let bytes = src.as_bytes();
    let mut spans = Vec::new();
    let mut at = 0;
    while let Some(&head) = bytes.get(at) {
        let peek = bytes.get(at + 1).copied();
        // Arms that find a delimited token yield its end and kind; every
        // other arm advances `at` itself and moves on to the next byte.
        let (stop, kind) = match (head, peek) {
            (b'/', Some(b'/')) => (scan_line_end(bytes, at + 2), TokenKind::Comment),
            (b'/', Some(b'*')) => (scan_block_nested(bytes, at + 2), TokenKind::Comment),
            (b'"', _) => (scan_quoted(bytes, at + 1, b'"'), TokenKind::StringLit),
            // `r` / `b` may introduce a prefixed string; otherwise it is an
            // ordinary identifier that happens to start with that letter.
            (b'r' | b'b', _) => match scan_rust_prefixed_string(bytes, at) {
                Some(stop) => (stop, TokenKind::StringLit),
                None => {
                    at = push_ident(bytes, at, RUST_KW, CONSTANTS, &mut spans);
                    continue;
                }
            },
            // Escaped character literal such as '\n' or '\''.
            (b'\'', Some(b'\\')) => (scan_quoted(bytes, at + 1, b'\''), TokenKind::StringLit),
            (b'\'', Some(first)) => {
                let width = utf8_len(first);
                if bytes.get(at + 1 + width) == Some(&b'\'') {
                    // Exactly one character between the quotes: a char literal.
                    (at + 2 + width, TokenKind::StringLit)
                } else {
                    // A lifetime / label is stepped over; a stray quote is
                    // skipped on its own.
                    at = if is_ident_byte(first) && !first.is_ascii_digit() {
                        scan_ident_end(bytes, at + 1)
                    } else {
                        at + 1
                    };
                    continue;
                }
            }
            (digit, _) if digit.is_ascii_digit() => {
                (scan_code_number(bytes, at), TokenKind::Literal)
            }
            (letter, _) if is_ident_start(letter) => {
                at = push_ident(bytes, at, RUST_KW, CONSTANTS, &mut spans);
                continue;
            }
            _ => {
                at += 1;
                continue;
            }
        };
        spans.push((at..stop, kind));
        at = stop;
    }
    spans
}
/// Tokenizes JSON (and the JSONC / JSON5 dialects) into highlight spans.
///
/// Besides double-quoted strings, numbers and `true` / `false` / `null`, this
/// also accepts single-quoted strings and `//` and `/* */` comments. A `-`
/// directly followed by a digit is included in the number token.
fn tokenize_json(src: &str) -> Vec<(Range<usize>, TokenKind)> {
    const CONSTANTS: &[&str] = &["true", "false", "null"];

    let bytes = src.as_bytes();
    let mut spans = Vec::new();
    let mut at = 0;
    while let Some(&head) = bytes.get(at) {
        let peek = bytes.get(at + 1).copied();
        let (stop, kind) = match (head, peek) {
            (quote @ (b'"' | b'\''), _) => {
                (scan_quoted(bytes, at + 1, quote), TokenKind::StringLit)
            }
            (b'/', Some(b'/')) => (scan_line_end(bytes, at + 2), TokenKind::Comment),
            (b'/', Some(b'*')) => (scan_block_end(bytes, at + 2), TokenKind::Comment),
            (c, next)
                if c.is_ascii_digit()
                    || (c == b'-' && next.is_some_and(|d| d.is_ascii_digit())) =>
            {
                // Start scanning digits after the sign, if there is one.
                let digits_from = at + usize::from(c == b'-');
                (scan_code_number(bytes, digits_from).max(at + 1), TokenKind::Literal)
            }
            (c, _) if is_ident_start(c) => {
                at = push_ident(bytes, at, &[], CONSTANTS, &mut spans);
                continue;
            }
            _ => {
                at += 1;
                continue;
            }
        };
        spans.push((at..stop, kind));
        at = stop;
    }
    spans
}
/// Tokenizes TOML source into highlight spans.
///
/// Recognizes `#` comments, basic (`"…"`) and literal (`'…'`) strings in both
/// their single-line and multi-line (`"""…"""`, `'''…'''`) forms, numbers,
/// and the `true` / `false` literals. Ranges are byte offsets into `src`.
///
/// TOML allows one or two quote characters directly before the closing
/// delimiter of a multi-line string, e.g.
/// `""""This," she said, "is just a pointless statement.""""`. Those quotes
/// are part of the string, so the token is extended over them rather than
/// letting the leftover quote open a bogus second string.
fn tokenize_toml(src: &str) -> Vec<(Range<usize>, TokenKind)> {
    // Extends `end` over at most two `quote` bytes that immediately follow a
    // multi-line string whose closing delimiter was found at `end - 3..end`.
    fn absorb_trailing_quotes(b: &[u8], end: usize, quote: u8) -> usize {
        let extra = b.get(end..).map_or(0, |rest| {
            rest.iter().take(2).take_while(|&&c| c == quote).count()
        });
        end + extra
    }

    let b = src.as_bytes();
    let mut out = Vec::new();
    let mut i = 0;
    while i < b.len() {
        match b[i] {
            b'#' => {
                let end = scan_line_end(b, i + 1);
                out.push((i..end, TokenKind::Comment));
                i = end;
            }
            // Multi-line basic string: backslash escapes are honoured.
            b'"' if b.get(i + 1) == Some(&b'"') && b.get(i + 2) == Some(&b'"') => {
                let closed = scan_triple(b, i + 3, b'"', true);
                let end = absorb_trailing_quotes(b, closed, b'"');
                out.push((i..end, TokenKind::StringLit));
                i = end;
            }
            b'"' => {
                let end = scan_quoted(b, i + 1, b'"');
                out.push((i..end, TokenKind::StringLit));
                i = end;
            }
            // Multi-line literal string: no escape processing.
            b'\'' if b.get(i + 1) == Some(&b'\'') && b.get(i + 2) == Some(&b'\'') => {
                let closed = scan_triple(b, i + 3, b'\'', false);
                let end = absorb_trailing_quotes(b, closed, b'\'');
                out.push((i..end, TokenKind::StringLit));
                i = end;
            }
            b'\'' => {
                let end = scan_raw_quoted(b, i + 1, b'\'');
                out.push((i..end, TokenKind::StringLit));
                i = end;
            }
            c if c.is_ascii_digit() => {
                let end = scan_code_number(b, i);
                out.push((i..end, TokenKind::Literal));
                i = end;
            }
            c if is_ident_start(c) => {
                i = push_ident(b, i, &[], &["true", "false"], &mut out);
            }
            _ => i += 1,
        }
    }
    out
}
/// Words the Python tokenizer reports as keywords, grouped by first letter.
#[rustfmt::skip]
const PY_KW: &[&str] = &[
    "and", "as", "assert", "async", "await",
    "break",
    "case", "class", "continue",
    "def", "del",
    "elif", "else", "except",
    "finally", "for", "from",
    "global",
    "if", "import", "in", "is",
    "lambda",
    "match",
    "nonlocal", "not",
    "or",
    "pass",
    "raise", "return",
    "try",
    "while", "with",
    "yield",
];
/// Tokenizes Python source into highlight spans.
///
/// Handles `#` comments, single- and triple-quoted strings (optionally
/// preceded by a one- or two-letter prefix drawn from `r`, `b`, `f`, `u` in
/// either case), numbers, keywords, and the `True` / `False` / `None`
/// literals. A string prefix is included in the string's range.
fn tokenize_python(src: &str) -> Vec<(Range<usize>, TokenKind)> {
    const CONSTANTS: &[&str] = &["True", "False", "None"];
    const PREFIX_LETTERS: &[u8] = b"rbfuRBFU";

    let bytes = src.as_bytes();
    let mut spans = Vec::new();
    let mut at = 0;
    while let Some(&head) = bytes.get(at) {
        let (stop, kind) = match head {
            b'#' => (scan_line_end(bytes, at + 1), TokenKind::Comment),
            quote @ (b'"' | b'\'') => (scan_py_string(bytes, at, quote), TokenKind::StringLit),
            digit if digit.is_ascii_digit() => (scan_code_number(bytes, at), TokenKind::Literal),
            letter if is_ident_start(letter) => {
                let word_end = scan_ident_end(bytes, at);
                let word = &bytes[at..word_end];
                // The quote, if any, that directly follows the word.
                let opening = bytes
                    .get(word_end)
                    .copied()
                    .filter(|q| matches!(q, b'"' | b'\''));
                match opening {
                    Some(quote)
                        if word.len() <= 2
                            && word.iter().all(|c| PREFIX_LETTERS.contains(c)) =>
                    {
                        (scan_py_string(bytes, word_end, quote), TokenKind::StringLit)
                    }
                    _ => {
                        at = push_ident(bytes, at, PY_KW, CONSTANTS, &mut spans);
                        continue;
                    }
                }
            }
            _ => {
                at += 1;
                continue;
            }
        };
        spans.push((at..stop, kind));
        at = stop;
    }
    spans
}
/// Words the JavaScript / TypeScript tokenizer reports as keywords, grouped
/// by first letter.
#[rustfmt::skip]
const JS_KW: &[&str] = &[
    "abstract", "as", "async", "await",
    "break",
    "case", "catch", "class", "const", "continue",
    "debugger", "declare", "default", "delete", "do",
    "else", "enum", "export", "extends",
    "finally", "for", "from", "function",
    "if", "implements", "import", "in", "instanceof", "interface",
    "keyof",
    "let",
    "namespace", "new",
    "of",
    "private", "protected", "public",
    "readonly", "return",
    "satisfies", "static", "super", "switch",
    "this", "throw", "try", "type", "typeof",
    "var", "void",
    "while", "with",
    "yield",
];
/// Tokenizes JavaScript / TypeScript source into highlight spans.
///
/// Handles `//` and `/* */` comments, strings delimited by `"`, `'` or a
/// backtick (all scanned with backslash escapes), numbers, keywords, and the
/// literals `true`, `false`, `null`, `undefined` and `NaN`.
fn tokenize_javascript(src: &str) -> Vec<(Range<usize>, TokenKind)> {
    const CONSTANTS: &[&str] = &["true", "false", "null", "undefined", "NaN"];

    let bytes = src.as_bytes();
    let mut spans = Vec::new();
    let mut at = 0;
    while let Some(&head) = bytes.get(at) {
        let peek = bytes.get(at + 1).copied();
        let (stop, kind) = match (head, peek) {
            (b'/', Some(b'/')) => (scan_line_end(bytes, at + 2), TokenKind::Comment),
            (b'/', Some(b'*')) => (scan_block_end(bytes, at + 2), TokenKind::Comment),
            (quote @ (b'"' | b'\'' | b'`'), _) => {
                (scan_quoted(bytes, at + 1, quote), TokenKind::StringLit)
            }
            (digit, _) if digit.is_ascii_digit() => {
                (scan_code_number(bytes, at), TokenKind::Literal)
            }
            (letter, _) if is_ident_start(letter) => {
                at = push_ident(bytes, at, JS_KW, CONSTANTS, &mut spans);
                continue;
            }
            _ => {
                at += 1;
                continue;
            }
        };
        spans.push((at..stop, kind));
        at = stop;
    }
    spans
}
/// Words the shell tokenizer reports as keywords, grouped by first letter.
#[rustfmt::skip]
const SH_KW: &[&str] = &[
    "case",
    "do", "done",
    "elif", "else", "esac", "export",
    "fi", "for", "function",
    "if", "in",
    "local",
    "readonly", "return",
    "select",
    "then",
    "until",
    "while",
];
/// Tokenizes shell source (sh / bash / zsh, console transcripts) into
/// highlight spans.
///
/// Handles `#` comments, single-quoted strings (no escapes), double-quoted
/// strings (backslash escapes) and keywords. A `#` only starts a comment at
/// the beginning of input or after whitespace or one of `; | & (` or a
/// backtick, so `$#` and `foo#bar` are left alone.
///
/// Outside of quotes a backslash escapes the byte that follows it, so an
/// escaped quote such as the one in `it\'s` does not open a string.
fn tokenize_shell(src: &str) -> Vec<(Range<usize>, TokenKind)> {
    let b = src.as_bytes();
    let mut out = Vec::new();
    let mut i = 0;
    while i < b.len() {
        match b[i] {
            // Skip the backslash together with the byte it escapes. Without
            // this, `\'` or `\"` would start a string running to the next
            // quote or to the end of input. Overshooting `b.len()` on a
            // trailing backslash is fine: the loop condition ends the scan.
            b'\\' => i += 2,
            b'#' if i == 0
                || matches!(
                    b[i - 1],
                    b' ' | b'\t' | b'\n' | b';' | b'|' | b'&' | b'(' | b'`'
                ) =>
            {
                let end = scan_line_end(b, i + 1);
                out.push((i..end, TokenKind::Comment));
                i = end;
            }
            b'\'' => {
                let end = scan_raw_quoted(b, i + 1, b'\'');
                out.push((i..end, TokenKind::StringLit));
                i = end;
            }
            b'"' => {
                let end = scan_quoted(b, i + 1, b'"');
                out.push((i..end, TokenKind::StringLit));
                i = end;
            }
            c if is_ident_start(c) => {
                i = push_ident(b, i, SH_KW, &[], &mut out);
            }
            _ => i += 1,
        }
    }
    out
}
/// Words the C / C++ tokenizer reports as keywords.
///
/// Kept in alphabetical order. Includes `register` and `union`, so that every
/// C89 keyword is covered alongside the C++ additions listed here.
const C_KW: &[&str] = &[
    "auto", "bool", "break", "case", "catch", "char", "class", "const", "constexpr",
    "continue", "decltype", "default", "delete", "do", "double", "else", "enum", "explicit",
    "extern", "final", "float", "for", "friend", "goto", "if", "inline", "int", "long",
    "mutable", "namespace", "new", "noexcept", "operator", "override", "private",
    "protected", "public", "register", "return", "short", "signed", "sizeof", "static",
    "struct", "switch", "template", "this", "throw", "try", "typedef", "typename", "union",
    "unsigned", "using", "virtual", "void", "volatile", "while",
];
fn tokenize_c(src: &str) -> Vec<(Range<usize>, TokenKind)> {
let b = src.as_bytes();
let mut out = Vec::new();
let mut i = 0;
while i < b.len() {
match b[i] {
b'#' if at_line_start(b, i) => {
let end = scan_line_end(b, i + 1);
out.push((i..end, TokenKind::Keyword));
i = end;
}
b'/' if b.get(i + 1) == Some(&b'/') => {
let end = scan_line_end(b, i + 2);
out.push((i..end, TokenKind::Comment));
i = end;
}
b'/' if b.get(i + 1) == Some(&b'*') => {
let end = scan_block_end(b, i + 2);
out.push((i..end, TokenKind::Comment));
i = end;
}
q @ (b'"' | b'\'') => {
let end = scan_quoted(b, i + 1, q);
out.push((i..end, TokenKind::StringLit));
i = end;
}
c if c.is_ascii_digit() => {
let end = scan_code_number(b, i);
out.push((i..end, TokenKind::Literal));
i = end;
}
c if is_ident_start(c) => {
i = push_ident(b, i, C_KW, &["true", "false", "nullptr", "NULL"], &mut out);
}
_ => i += 1,
}
}
out
}
/// Scans the identifier starting at `i` and records it if it is a known word.
///
/// The word is pushed onto `out` as a `Literal` when it appears in
/// `literals`, otherwise as a `Keyword` when it appears in `keywords`; any
/// other identifier produces no token. Returns the index just past the
/// identifier either way.
fn push_ident(
    b: &[u8],
    i: usize,
    keywords: &[&str],
    literals: &[&str],
    out: &mut Vec<(Range<usize>, TokenKind)>,
) -> usize {
    let end = scan_ident_end(b, i);
    let word = &b[i..end];
    let listed_in = |table: &[&str]| table.iter().any(|entry| entry.as_bytes() == word);

    // Literals are checked first, so they win if a word is in both tables.
    let kind = if listed_in(literals) {
        Some(TokenKind::Literal)
    } else if listed_in(keywords) {
        Some(TokenKind::Keyword)
    } else {
        None
    };
    if let Some(kind) = kind {
        out.push((i..end, kind));
    }
    end
}
/// Finds the end of a quoted string whose body starts at `i`.
///
/// A backslash skips itself and the following byte. Returns the index just
/// past the first unescaped `quote`, or `b.len()` if the string never closes.
fn scan_quoted(b: &[u8], mut i: usize, quote: u8) -> usize {
    while let Some(&byte) = b.get(i) {
        match byte {
            b'\\' => i += 2,
            closing if closing == quote => return i + 1,
            _ => i += 1,
        }
    }
    b.len()
}
/// Finds the end of a quoted string with no escape processing.
///
/// Starting at `i`, returns the index just past the first `quote` byte, or
/// `b.len()` if there is none.
fn scan_raw_quoted(b: &[u8], i: usize, quote: u8) -> usize {
    b.get(i..)
        .and_then(|rest| rest.iter().position(|&byte| byte == quote))
        .map_or(b.len(), |offset| i + offset + 1)
}
/// Finds the end of a Python string whose opening quote `q` is at index `i`.
///
/// If the opening quote is followed by two more `q` bytes the string is
/// scanned as triple-quoted; otherwise as an ordinary quoted string. Both
/// forms honour backslash escapes.
fn scan_py_string(b: &[u8], i: usize, q: u8) -> usize {
    let tripled = matches!(
        b.get(i + 1..i + 3),
        Some(pair) if pair.iter().all(|&byte| byte == q)
    );
    if tripled {
        scan_triple(b, i + 3, q, true)
    } else {
        scan_quoted(b, i + 1, q)
    }
}
/// Finds the end of a triple-quoted string whose body starts at `i`.
///
/// Returns the index just past the first run of three `quote` bytes, or
/// `b.len()` if the string never closes. When `esc` is set, a backslash skips
/// itself and the following byte.
fn scan_triple(b: &[u8], mut i: usize, quote: u8, esc: bool) -> usize {
    while i < b.len() {
        match &b[i..] {
            [b'\\', ..] if esc => i += 2,
            [x, y, z, ..] if [*x, *y, *z] == [quote; 3] => return i + 3,
            _ => i += 1,
        }
    }
    b.len()
}
/// Returns the index of the first `\n` at or after `i`, or `b.len()` if the
/// rest of the input has no newline. An `i` past the end is returned as-is.
fn scan_line_end(b: &[u8], i: usize) -> usize {
    match b.get(i..) {
        Some(rest) => {
            let run = rest
                .iter()
                .position(|&byte| byte == b'\n')
                .unwrap_or(rest.len());
            i + run
        }
        None => i,
    }
}
/// Finds the end of a non-nesting block comment whose body starts at `i`.
///
/// Returns the index just past the first `*/`, or `b.len()` if the comment
/// never closes.
fn scan_block_end(b: &[u8], i: usize) -> usize {
    b.get(i..)
        .and_then(|rest| rest.windows(2).position(|pair| pair == b"*/"))
        .map_or(b.len(), |offset| i + offset + 2)
}
/// Finds the end of a nestable block comment whose body starts at `i`.
///
/// Every `/*` opens another level and every `*/` closes one. Returns the
/// index just past the `*/` that closes the outermost level, or `b.len()` if
/// the comment never closes.
fn scan_block_nested(b: &[u8], mut i: usize) -> usize {
    // The caller has already consumed the opening `/*`.
    let mut open = 1usize;
    while let Some(pair) = b.get(i..i + 2) {
        match pair {
            [b'/', b'*'] => {
                open += 1;
                i += 2;
            }
            [b'*', b'/'] => {
                open -= 1;
                i += 2;
                if open == 0 {
                    return i;
                }
            }
            _ => i += 1,
        }
    }
    b.len()
}
/// Tries to scan a prefixed Rust string literal starting at `i`.
///
/// Recognizes `b"…"` (scanned with backslash escapes) and the raw forms
/// `r"…"`, `r#"…"#`, `br"…"`, `br#"…"#` with any number of `#`. Returns the
/// index just past the literal (`b.len()` if it never closes), or `None` when
/// the bytes at `i` do not start such a literal.
fn scan_rust_prefixed_string(b: &[u8], i: usize) -> Option<usize> {
    let after_b = if b.get(i) == Some(&b'b') { i + 1 } else { i };

    if b.get(after_b) != Some(&b'r') {
        // Not raw: only a byte string qualifies, which needs the `b` prefix
        // immediately followed by the opening quote.
        let is_byte_string = after_b > i && b.get(after_b) == Some(&b'"');
        return is_byte_string.then(|| scan_quoted(b, after_b + 1, b'"'));
    }

    // Raw string: count the `#`s between `r` and the opening quote.
    let hash_start = after_b + 1;
    let hashes = b[hash_start..]
        .iter()
        .take_while(|&&byte| byte == b'#')
        .count();
    let open_quote = hash_start + hashes;
    if b.get(open_quote) != Some(&b'"') {
        return None;
    }

    // The literal closes at the first quote followed by at least `hashes`
    // `#` bytes.
    let close_quote = (open_quote + 1..b.len()).find(|&j| {
        b[j] == b'"'
            && b[j + 1..]
                .get(..hashes)
                .is_some_and(|tail| tail.iter().all(|&byte| byte == b'#'))
    });
    Some(close_quote.map_or(b.len(), |j| j + 1 + hashes))
}
fn scan_code_number(b: &[u8], start: usize) -> usize {
let mut i = start;
if b.get(i) == Some(&b'0') && matches!(b.get(i + 1), Some(b'x' | b'X' | b'o' | b'O' | b'b' | b'B')) {
i += 2;
while i < b.len() && (b[i].is_ascii_hexdigit() || b[i] == b'_') {
i += 1;
}
} else {
while i < b.len() && (b[i].is_ascii_digit() || b[i] == b'_') {
i += 1;
}
if b.get(i) == Some(&b'.') && b.get(i + 1).is_some_and(u8::is_ascii_digit) {
i += 1;
while i < b.len() && (b[i].is_ascii_digit() || b[i] == b'_') {
i += 1;
}
}
if matches!(b.get(i), Some(b'e' | b'E')) {
let mut j = i + 1;
if matches!(b.get(j), Some(b'+' | b'-')) {
j += 1;
}
if b.get(j).is_some_and(u8::is_ascii_digit) {
i = j;
}
}
}
while i < b.len() && is_ident_byte(b[i]) {
i += 1;
}
i.max(start + 1)
}
/// Returns the index of the first byte at or after `i` that cannot be part
/// of an identifier, or `b.len()` if the identifier runs to the end of input.
fn scan_ident_end(b: &[u8], i: usize) -> usize {
    let run = b.get(i..).map_or(0, |rest| {
        rest.iter().take_while(|&&byte| is_ident_byte(byte)).count()
    });
    i + run
}
/// Reports whether `c` can begin an identifier: an ASCII letter or `_`.
fn is_ident_start(c: u8) -> bool {
    matches!(c, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
/// Reports whether `c` can appear inside an identifier: an ASCII letter, an
/// ASCII digit, or `_`.
fn is_ident_byte(c: u8) -> bool {
    matches!(c, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
/// Reports whether index `i` is the first non-blank position on its line.
///
/// Looking backwards from `i`, spaces and tabs are ignored; the result is
/// `true` if a newline or the start of input is reached first, and `false`
/// if any other byte is.
fn at_line_start(b: &[u8], i: usize) -> bool {
    b[..i]
        .iter()
        .rev()
        .find(|&&byte| !matches!(byte, b' ' | b'\t'))
        .map_or(true, |&byte| byte == b'\n')
}
/// Returns how many bytes a UTF-8 sequence occupies, judged from its first
/// byte.
///
/// Lead bytes with two, three or four leading one-bits give that count;
/// every other byte (ASCII, continuation bytes, and invalid lead bytes)
/// gives 1.
fn utf8_len(first: u8) -> usize {
    match first.leading_ones() {
        2 => 2,
        3 => 3,
        4 => 4,
        _ => 1,
    }
}