Skip to main content

jscpd_rs/
tokenizer.rs

1mod apex;
2mod blocks;
3mod embedded;
4mod generic;
5mod hash;
6mod ignore;
7mod line_index;
8mod markdown;
9mod markup_attrs;
10mod oxc;
11mod scan;
12mod tap;
13
14use serde::Serialize;
15
16use crate::cli::{Mode, Options};
17
18use generic::tokenize_generic;
19use hash::hash_token;
20use ignore::find_ignore_regions;
21use line_index::LineIndex;
22use oxc::{is_oxc_format, tokenize_oxc_maps};
23use scan::count_prism_whitespace_tokens;
24
25/// One-based source location used in tokens, fragments, and reports.
26#[derive(Clone, Debug, Serialize)]
27pub struct Location {
28    /// One-based line number.
29    pub line: usize,
30    /// Zero-based column number.
31    pub column: usize,
32    /// Zero-based byte position in the original source text.
33    pub position: usize,
34}
35
36/// Detection token after mode filtering and jscpd-compatible hashing.
37#[derive(Clone, Debug)]
38pub struct DetectionToken {
39    /// Stable token hash used by the duplicate detector.
40    pub hash: u64,
41    /// Start location of the token.
42    pub start: Location,
43    /// End location of the token.
44    pub end: Location,
45    /// Byte range in the original source text.
46    pub range: [usize; 2],
47}
48
49/// Token map for a single detected format block.
50///
51/// Embedded formats can produce more than one map for one source document, for
52/// example script/style blocks extracted from markup-like files.
53#[derive(Clone, Debug)]
54pub struct TokenMap {
55    /// Format name associated with this token map.
56    pub format: String,
57    /// Detection tokens in source order.
58    pub tokens: Vec<DetectionToken>,
59    positions_assigned: bool,
60}
61
62/// Token map associated with a source identifier and line count.
63#[derive(Clone, Debug)]
64pub struct SourceTokenMap {
65    /// Stable source identifier, usually a file path.
66    pub source_id: String,
67    /// Format name associated with this token map.
68    pub format: String,
69    /// Detection tokens in source order.
70    pub tokens: Vec<DetectionToken>,
71    /// Total source lines represented by this map.
72    pub lines: usize,
73}
74
75/// Native tokenizer used by the detector.
76///
77/// JS/TS/JSX/TSX formats use Oxc-backed tokenization. Long-tail formats use
78/// the generic native tokenizer unless a format has a dedicated implementation.
79#[derive(Clone, Debug)]
80pub struct Tokenizer {
81    options: Options,
82}
83
84impl Default for Tokenizer {
85    fn default() -> Self {
86        Self::new()
87    }
88}
89
90impl Tokenizer {
91    /// Create a tokenizer with default detector options.
92    pub fn new() -> Self {
93        Self {
94            options: Options::default(),
95        }
96    }
97
98    /// Create a tokenizer with caller-provided options.
99    pub fn with_options(options: Options) -> Self {
100        Self { options }
101    }
102
103    /// Return the options used by this tokenizer.
104    pub fn options(&self) -> &Options {
105        &self.options
106    }
107
108    /// Mutably access tokenizer options.
109    pub fn options_mut(&mut self) -> &mut Options {
110        &mut self.options
111    }
112
113    /// Tokenize a source string and return the first token stream.
114    ///
115    /// Use [`Tokenizer::tokenize_maps`] when a format can produce multiple
116    /// embedded token maps.
117    pub fn tokenize(&self, content: &str, format: &str) -> Vec<DetectionToken> {
118        self.tokenize_maps(content, format)
119            .into_iter()
120            .next()
121            .map(|map| map.tokens)
122            .unwrap_or_default()
123    }
124
125    /// Tokenize source text into one or more format-specific token maps.
126    pub fn tokenize_maps(&self, content: &str, format: &str) -> Vec<TokenMap> {
127        tokenize_maps_for_detection(content, format, &self.options)
128    }
129
130    /// Tokenize source text and attach a source identifier to each generated map.
131    pub fn generate_maps(
132        &self,
133        source_id: impl Into<String>,
134        content: &str,
135        format: &str,
136    ) -> Vec<SourceTokenMap> {
137        let source_id = source_id.into();
138        self.tokenize_maps(content, format)
139            .into_iter()
140            .map(|map| SourceTokenMap {
141                source_id: source_id.clone(),
142                lines: token_map_line_count(&map.tokens),
143                format: map.format,
144                tokens: map.tokens,
145            })
146            .collect()
147    }
148}
149
/// Classification attached to a scanned token before it is hashed.
///
/// `push_token` drops `Comment` tokens in weak mode; `NewLine` and `Empty`
/// are the kinds emitted by the whitespace scanner.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum TokenKind {
    Comment,
    Constant,
    Empty,
    Keyword,
    NewLine,
    Number,
    Operator,
    Punctuation,
    String,
    Default,
}
163
/// Byte range `start..end` into the source text.
#[derive(Copy, Clone)]
struct ByteSpan {
    /// Offset of the first byte of the span.
    start: usize,
    /// Offset one past the last byte of the span.
    end: usize,
}
169
170struct TokenContext<'a> {
171    content: &'a str,
172    options: &'a Options,
173    ignore_regions: &'a [[usize; 2]],
174}
175
176impl TokenContext<'_> {
177    fn slice(&self, span: ByteSpan) -> &str {
178        &self.content[span.start..span.end]
179    }
180
181    fn overlaps_ignore_region(&self, span: ByteSpan) -> bool {
182        self.ignore_regions
183            .iter()
184            .any(|[region_start, region_end]| span.start < *region_end && span.end > *region_start)
185    }
186}
187
/// Test helper: tokenize `content` and keep only the first map's tokens,
/// or an empty vector when no map is produced.
#[cfg(test)]
fn tokenize_for_detection(content: &str, format: &str, options: &Options) -> Vec<DetectionToken> {
    let first_map = tokenize_maps_for_detection(content, format, options)
        .into_iter()
        .next();
    match first_map {
        Some(map) => map.tokens,
        None => Vec::new(),
    }
}
196
197pub fn tokenize_maps_for_detection(
198    content: &str,
199    format: &str,
200    options: &Options,
201) -> Vec<TokenMap> {
202    let ignore_regions = find_ignore_regions(content, options);
203    let mut maps = if format == "markdown" {
204        markdown::tokenize_maps(content, options, &ignore_regions)
205    } else if format == "apex" {
206        apex::tokenize_maps(content, options, &ignore_regions)
207    } else if format == "tap" {
208        tap::tokenize_maps(content, options, &ignore_regions)
209    } else if matches!(format, "markup" | "vue" | "svelte" | "astro") {
210        blocks::tokenize_maps(content, format, options, &ignore_regions)
211    } else if is_oxc_format(format) {
212        tokenize_oxc_maps(content, format, options, &ignore_regions)
213    } else {
214        vec![TokenMap {
215            format: format.to_string(),
216            tokens: tokenize_generic(content, format, options, &ignore_regions),
217            positions_assigned: false,
218        }]
219    };
220    for map in &mut maps {
221        if !map.positions_assigned {
222            assign_token_positions(content, &map.format, options, &mut map.tokens);
223        }
224    }
225    maps
226}
227
228fn token_map_line_count(tokens: &[DetectionToken]) -> usize {
229    match (tokens.first(), tokens.last()) {
230        (Some(first), Some(last)) => last.end.line.saturating_sub(first.start.line),
231        _ => 0,
232    }
233}
234
235fn assign_token_positions(
236    content: &str,
237    format: &str,
238    options: &Options,
239    tokens: &mut [DetectionToken],
240) {
241    let needs_report_positions =
242        options.reporters.iter().any(|reporter| reporter == "json") || !options.silent;
243    if !needs_report_positions || !matches!(format, "javascript" | "typescript" | "jsx" | "tsx") {
244        for (position, token) in tokens.iter_mut().enumerate() {
245            token.start.position = position;
246            token.end.position = position;
247        }
248        return;
249    }
250
251    let mut position = 0usize;
252    let mut previous_end = 0usize;
253    for token in tokens {
254        if token.range[0] > previous_end {
255            position += count_prism_whitespace_tokens(content, previous_end, token.range[0]);
256        }
257        token.start.position = position;
258        token.end.position = position;
259        position += 1;
260        previous_end = previous_end.max(token.range[1]);
261    }
262}
263
264fn push_token(
265    tokens: &mut Vec<DetectionToken>,
266    context: &TokenContext<'_>,
267    kind: TokenKind,
268    span: ByteSpan,
269    start: Location,
270    end: Location,
271) {
272    if context.options.mode == Mode::Weak && kind == TokenKind::Comment {
273        return;
274    }
275    if context.overlaps_ignore_region(span) {
276        return;
277    }
278    tokens.push(DetectionToken {
279        hash: hash_token(kind, context.slice(span), context.options.ignore_case),
280        start,
281        end,
282        range: [span.start, span.end],
283    });
284}
285
286fn push_strict_whitespace_tokens(
287    tokens: &mut Vec<DetectionToken>,
288    context: &TokenContext<'_>,
289    span: ByteSpan,
290    line_index: &LineIndex,
291) {
292    if context.options.mode != Mode::Strict {
293        return;
294    }
295    let mut start = span.start;
296    while start < span.end {
297        let (end, kind) = scan_whitespace_token(context.content, start, span.end);
298        push_token(
299            tokens,
300            context,
301            kind,
302            ByteSpan { start, end },
303            line_index.location(start),
304            line_index.location(end),
305        );
306        start = end.max(start + 1);
307    }
308}
309
310fn scan_whitespace_token(content: &str, start: usize, limit: usize) -> (usize, TokenKind) {
311    let bytes = content.as_bytes();
312    if bytes[start] == b'\n' {
313        return (start + 1, TokenKind::NewLine);
314    }
315
316    let mut end = start;
317    while end < limit {
318        let ch = content[end..].chars().next().unwrap_or('\0');
319        if ch == '\n' || !ch.is_whitespace() {
320            break;
321        }
322        end += ch.len_utf8();
323    }
324    (end, TokenKind::Empty)
325}
326
327#[cfg(test)]
328mod tests;