use std::path::Path;

use oxc_allocator::Allocator;
use oxc_ast_visit::Visit;
use oxc_parser::Parser;
use oxc_span::{SourceType, Span};

pub use super::token_types::{
    FileTokens, KeywordType, OperatorType, PunctuationType, SourceToken, TokenKind,
};
use super::token_visitor::TokenExtractor;
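
/// Tokenizes a single file into a [`FileTokens`] bundle of source tokens.
///
/// Dispatch on the file's extension happens internally: single-file
/// components, `.astro`, and `.mdx` files have their embedded script content
/// extracted first, while `.css`/`.scss` files yield no tokens. Type-only
/// syntax is kept; use [`tokenize_file_cross_language`] to strip it.
///
/// Illustrative sketch (not compiled as a doctest; the calling path is an
/// assumption and depends on where this module is re-exported):
///
/// ```ignore
/// use std::path::Path;
///
/// let source = "const answer = 42;";
/// let tokens = tokenize_file(Path::new("answer.ts"), source, false);
/// assert_eq!(tokens.line_count, 1);
/// ```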
#[must_use]
pub fn tokenize_file(path: &Path, source: &str, skip_imports: bool) -> FileTokens {
    tokenize_file_inner(path, source, false, skip_imports)
}
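
/// Variant of [`tokenize_file`] for cross-language comparison: `strip_types`
/// is caller-controlled instead of being fixed to `false`, so type-only
/// TypeScript syntax can be dropped before tokens are compared against plain
/// JavaScript sources.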
#[must_use]
pub fn tokenize_file_cross_language(
    path: &Path,
    source: &str,
    strip_types: bool,
    skip_imports: bool,
) -> FileTokens {
    tokenize_file_inner(path, source, strip_types, skip_imports)
}
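
/// Shared implementation: routes the file to a format-specific tokenizer based
/// on `is_sfc_file` and the path's extension, falling back to plain JS/TS
/// parsing for everything else. Stylesheets (`.css`/`.scss`) produce no tokens.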
fn tokenize_file_inner(
    path: &Path,
    source: &str,
    strip_types: bool,
    skip_imports: bool,
) -> FileTokens {
    use crate::extract::{extract_astro_frontmatter, extract_mdx_statements, is_sfc_file};

    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
    if is_sfc_file(path) {
        return tokenize_sfc(source, strip_types, skip_imports);
    }
    if ext == "astro" {
        return tokenize_astro(source, strip_types, skip_imports, extract_astro_frontmatter);
    }
    if ext == "mdx" {
        return tokenize_mdx(source, strip_types, skip_imports, extract_mdx_statements);
    }
    if ext == "css" || ext == "scss" {
        return empty_tokens(source);
    }
    tokenize_js_ts(path, source, strip_types, skip_imports)
}
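
/// Tokenizes every `<script>` block of a single-file component. Each block is
/// parsed with the dialect implied by its TypeScript/JSX flags, and token spans
/// are shifted by the block's byte offset so they index into the whole file.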
#[expect(
    clippy::cast_possible_truncation,
    reason = "byte offsets are bounded by source size"
)]
fn tokenize_sfc(source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
    let scripts = crate::extract::extract_sfc_scripts(source);
    let mut all_tokens = Vec::new();
    for script in &scripts {
        // Pick the parser dialect that matches the script block's TypeScript/JSX flags.
        let source_type = match (script.is_typescript, script.is_jsx) {
            (true, true) => SourceType::tsx(),
            (true, false) => SourceType::ts(),
            (false, true) => SourceType::jsx(),
            (false, false) => SourceType::mjs(),
        };
        let allocator = Allocator::default();
        let parser_return = Parser::new(&allocator, &script.body, source_type).parse();
        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
        extractor.visit_program(&parser_return.program);
        // Rebase spans from script-local offsets to whole-file byte offsets.
        let offset = script.byte_offset as u32;
        for token in &mut extractor.tokens {
            token.span = Span::new(token.span.start + offset, token.span.end + offset);
        }
        all_tokens.extend(extractor.tokens);
    }
    FileTokens {
        tokens: all_tokens,
        source: source.to_string(),
        line_count: source.lines().count().max(1),
    }
}
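
/// Tokenizes the frontmatter script of an `.astro` file as TypeScript, shifting
/// token spans by the frontmatter's byte offset. Files without extractable
/// frontmatter produce no tokens.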
#[expect(
    clippy::cast_possible_truncation,
    reason = "byte offsets are bounded by source size"
)]
fn tokenize_astro(
    source: &str,
    strip_types: bool,
    skip_imports: bool,
    extract_fn: fn(&str) -> Option<fallow_extract::sfc::SfcScript>,
) -> FileTokens {
    if let Some(script) = extract_fn(source) {
        let allocator = Allocator::default();
        let parser_return = Parser::new(&allocator, &script.body, SourceType::ts()).parse();
        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
        extractor.visit_program(&parser_return.program);
        // Rebase spans from frontmatter-local offsets to whole-file byte offsets.
        let offset = script.byte_offset as u32;
        for token in &mut extractor.tokens {
            token.span = Span::new(token.span.start + offset, token.span.end + offset);
        }
        return FileTokens {
            tokens: extractor.tokens,
            source: source.to_string(),
            line_count: source.lines().count().max(1),
        };
    }
    empty_tokens(source)
}
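
/// Tokenizes the statements extracted from an `.mdx` document, parsed as JSX.
/// Documents with no extractable statements produce no tokens.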
fn tokenize_mdx(
    source: &str,
    strip_types: bool,
    skip_imports: bool,
    extract_fn: fn(&str) -> String,
) -> FileTokens {
    let statements = extract_fn(source);
    if !statements.is_empty() {
        let allocator = Allocator::default();
        let parser_return = Parser::new(&allocator, &statements, SourceType::jsx()).parse();
        let mut extractor = TokenExtractor::new(strip_types, skip_imports);
        extractor.visit_program(&parser_return.program);
        return FileTokens {
            tokens: extractor.tokens,
            source: source.to_string(),
            line_count: source.lines().count().max(1),
        };
    }
    empty_tokens(source)
}
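
/// Builds a [`FileTokens`] with no tokens while preserving the original source
/// text and its line count.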
fn empty_tokens(source: &str) -> FileTokens {
    FileTokens {
        tokens: Vec::new(),
        source: source.to_string(),
        line_count: source.lines().count().max(1),
    }
}
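
/// Tokenizes a plain JavaScript/TypeScript file, inferring the dialect from the
/// path. A suspiciously sparse result for a non-trivial file triggers one retry
/// with the JSX variant of the dialect, keeping whichever parse yields more
/// tokens.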
fn tokenize_js_ts(path: &Path, source: &str, strip_types: bool, skip_imports: bool) -> FileTokens {
    let source_type = SourceType::from_path(path).unwrap_or_default();
    let allocator = Allocator::default();
    let parser_return = Parser::new(&allocator, source, source_type).parse();
    let mut extractor = TokenExtractor::new(strip_types, skip_imports);
    extractor.visit_program(&parser_return.program);
    // Heuristic retry: a near-empty token stream from a non-trivial file usually
    // means the inferred dialect choked on JSX syntax, so re-parse with the JSX
    // variant and keep whichever pass produced more tokens.
    if extractor.tokens.len() < 5 && source.len() > 100 && !source_type.is_jsx() {
        let jsx_type = if source_type.is_typescript() {
            SourceType::tsx()
        } else {
            SourceType::jsx()
        };
        let allocator2 = Allocator::default();
        let retry_return = Parser::new(&allocator2, source, jsx_type).parse();
        let mut retry_extractor = TokenExtractor::new(strip_types, skip_imports);
        retry_extractor.visit_program(&retry_return.program);
        if retry_extractor.tokens.len() > extractor.tokens.len() {
            extractor = retry_extractor;
        }
    }
    FileTokens {
        tokens: extractor.tokens,
        source: source.to_string(),
        line_count: source.lines().count().max(1),
    }
}

#[cfg(test)]
mod tests;