Skip to main content

fallow_v8_coverage/
lib.rs

1//! V8 `ScriptCoverage` JSON parser and UTF-16 source-offset mapper.
2//!
3//! This is the open-source layer of fallow's runtime-coverage pipeline. It
4//! provides the two things the `fallow` CLI consumes:
5//!
6//! 1. Serde input types for the V8 coverage dump format emitted by
7//!    `node --experimental-test-coverage`, `c8`, the Inspector protocol, or
8//!    any V8 isolate ([`V8CoverageDump`] and friends).
9//! 2. [`LineOffsetTable`], which converts V8 source offsets into 1-indexed
10//!    line / 0-indexed column [`IstanbulPosition`]s.
11//!
12//! ## Offset semantics (load-bearing)
13//!
14//! V8 reports coverage offsets in **UTF-16 code units**, not UTF-8 bytes (V8
15//! strings are UTF-16). Verified against real Node: a function preceded by a
16//! `😀` (2 UTF-16 units / 4 UTF-8 bytes) on the same line is reported at the
17//! UTF-16 offset, not the byte offset. [`LineOffsetTable`] therefore stores
18//! line starts in UTF-16 units. This is the invariant the `line_table_*` tests
19//! pin, and the one a byte-offset implementation gets wrong.
20//!
21//! ## Relationship to `oxc_coverage_v8`
22//!
23//! `oxc_coverage_v8` (in `oxc-coverage-instrument`) solves the inverse problem:
24//! it takes an AST-built Istanbul `FileCoverage` and fills its statement /
25//! function / branch counts by converting Istanbul positions into **byte**
26//! offsets. The two crates are intentionally not consolidated: opposite
27//! directions, opposite unit spaces, and different producers (real Node V8
28//! dumps here vs. an instrumenter-controlled pipeline there). See
29//! `decisions/010-v8-coverage-vs-oxc-coverage-boundary.md`.
30//!
31//! The closed-source cross-reference, combined scoring, hot-path heuristics and
32//! verdict generation live in `fallow-cov` (private) and consume the CLI's
33//! remapped function output via the `fallow-cov-protocol` envelope.
34
35#![forbid(unsafe_code)]
36#![cfg_attr(
37    test,
38    allow(
39        clippy::unwrap_used,
40        clippy::expect_used,
41        reason = "tests use unwrap and expect to keep fixture setup concise"
42    )
43)]
44
45use serde::{Deserialize, Deserializer, Serialize};
46
47/// Top-level shape emitted by Node's `NODE_V8_COVERAGE` directory: one file
48/// per worker / process containing a `result` array of [`ScriptCoverage`].
49#[derive(Debug, Clone, Serialize, Deserialize)]
50pub struct V8CoverageDump {
51    /// Per-script coverage entries.
52    pub result: Vec<ScriptCoverage>,
53    /// Optional source-map cache emitted by Node 13+.
54    #[serde(default, rename = "source-map-cache")]
55    pub source_map_cache: Option<serde_json::Value>,
56}
57
58/// V8's per-script coverage record. Field names mirror the V8 inspector
59/// protocol verbatim.
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct ScriptCoverage {
62    /// V8 script identifier.
63    #[serde(rename = "scriptId")]
64    pub script_id: String,
65    /// File URL — typically `file:///abs/path` for Node, `https://…` for
66    /// browsers. Callers normalize to absolute paths before merging.
67    pub url: String,
68    /// One entry per function (including the implicit module-level function).
69    pub functions: Vec<FunctionCoverage>,
70}
71
72/// V8 per-function coverage. Each function carries one or more
73/// [`CoverageRange`]s — block-level for instrumented coverage, function-level
74/// for `--coverage=best-effort`.
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct FunctionCoverage {
77    /// Source-as-written function name. Empty for the module-level wrapper
78    /// and anonymous functions.
79    #[serde(rename = "functionName")]
80    pub function_name: String,
81    /// Coverage ranges, UTF-16 code-unit offsets relative to the script's
82    /// source text (see the crate-level "Offset semantics" note).
83    pub ranges: Vec<CoverageRange>,
84    /// True when V8 emitted block-level data for this function (instrumented
85    /// coverage). False when only the outer function range is reliable
86    /// (best-effort / runtime coverage).
87    #[serde(rename = "isBlockCoverage", default)]
88    pub is_block_coverage: bool,
89}
90
91/// A single coverage range. `count == 0` means the range was never hit.
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct CoverageRange {
94    /// Inclusive UTF-16 code-unit offset into the script's source.
95    #[serde(rename = "startOffset")]
96    pub start_offset: u32,
97    /// Exclusive UTF-16 code-unit offset into the script's source.
98    #[serde(rename = "endOffset")]
99    pub end_offset: u32,
100    /// Number of times the range was executed.
101    pub count: u64,
102}
103
104/// 1-indexed line + 0-indexed column.
105#[derive(Debug, Clone, Serialize, Deserialize)]
106pub struct IstanbulPosition {
107    /// 1-indexed line number.
108    pub line: u32,
109    /// 0-indexed column within the line.
110    ///
111    /// Some real Istanbul producers (including Vitest in certain transforms)
112    /// emit `null` for end columns. We normalize those to `0` at parse time
113    /// so downstream CRAP/prod-coverage consumers can still ingest the file.
114    #[serde(deserialize_with = "deserialize_nullable_u32")]
115    pub column: u32,
116}
117
118fn deserialize_nullable_u32<'de, D>(deserializer: D) -> Result<u32, D::Error>
119where
120    D: Deserializer<'de>,
121{
122    Ok(Option::<u32>::deserialize(deserializer)?.unwrap_or(0))
123}
124
/// Line-start index used to turn V8 source offsets into Istanbul
/// line/column positions with an O(log n) lookup.
///
/// V8 offsets are JavaScript source positions, i.e. UTF-16 code units rather
/// than UTF-8 bytes. Istanbul columns follow the same 0-indexed
/// source-position model, so the table keeps its line starts in UTF-16 units.
///
/// The table is built once from the source (or from line lengths); lookups
/// afterwards do not allocate.
#[derive(Debug)]
pub struct LineOffsetTable {
    /// UTF-16 offset at which each line begins, in line order. The first
    /// entry is always `0`, the start of the file.
    line_starts: Vec<u32>,
}
140
141impl LineOffsetTable {
142    /// Build a table from the full source text. The source must be UTF-8 with
143    /// LF, CRLF, or CR line endings (mixed endings are tolerated).
144    #[must_use]
145    pub fn from_source(source: &str) -> Self {
146        let mut line_starts = Vec::with_capacity(source.lines().count() + 1);
147        line_starts.push(0);
148        let mut offset = 0u32;
149        let mut chars = source.chars().peekable();
150        while let Some(ch) = chars.next() {
151            match ch {
152                '\n' => {
153                    offset = offset.saturating_add(1);
154                    line_starts.push(offset);
155                }
156                '\r' => {
157                    offset = offset.saturating_add(1);
158                    if chars.peek() == Some(&'\n') {
159                        chars.next();
160                        offset = offset.saturating_add(1);
161                    }
162                    line_starts.push(offset);
163                }
164                _ => offset = offset.saturating_add(ch.len_utf16() as u32),
165            }
166        }
167        Self { line_starts }
168    }
169
170    /// Build a table from V8's `source-map-cache.lineLengths` data.
171    ///
172    /// `lineLengths` are already measured in JavaScript source positions. The
173    /// cache does not carry line-ending widths, so this preserves the existing
174    /// Node fallback behavior and advances one source position between lines.
175    #[must_use]
176    pub fn from_v8_line_lengths(line_lengths: &[u32]) -> Option<Self> {
177        if line_lengths.is_empty() {
178            return None;
179        }
180
181        let mut line_starts = Vec::with_capacity(line_lengths.len());
182        line_starts.push(0);
183        let mut offset = 0u32;
184        for length in line_lengths
185            .iter()
186            .take(line_lengths.len().saturating_sub(1))
187        {
188            offset = offset.saturating_add(*length).saturating_add(1);
189            line_starts.push(offset);
190        }
191        Some(Self { line_starts })
192    }
193
194    /// Convert a V8 source offset to a 1-indexed line + 0-indexed column.
195    ///
196    /// Offsets at or past the end of the source clamp to the last line +
197    /// remaining column.
198    #[must_use]
199    pub fn position(&self, source_offset: u32) -> IstanbulPosition {
200        let line_zero_indexed = match self.line_starts.binary_search(&source_offset) {
201            Ok(exact) => exact,
202            Err(insertion_point) => insertion_point.saturating_sub(1),
203        };
204        let line_start = self.line_starts[line_zero_indexed];
205        IstanbulPosition {
206            line: (line_zero_indexed as u32) + 1,
207            column: source_offset.saturating_sub(line_start),
208        }
209    }
210}
211
212impl Copy for CoverageRange {}
213impl Copy for IstanbulPosition {}
214
#[cfg(test)]
mod tests {
    use super::*;

    /// Resolve `offset` through `table` and hand back `(line, column)`, so a
    /// single assertion can pin both coordinates.
    fn line_col(table: &LineOffsetTable, offset: u32) -> (u32, u32) {
        let IstanbulPosition { line, column } = table.position(offset);
        (line, column)
    }

    #[test]
    fn line_table_handles_lf() {
        let table = LineOffsetTable::from_source("a\nbb\nccc");
        // (offset, expected (line, column)) for the first unit of each line.
        for (offset, expected) in [(0, (1, 0)), (2, (2, 0)), (5, (3, 0))] {
            assert_eq!(line_col(&table, offset), expected);
        }
    }

    #[test]
    fn line_table_handles_crlf() {
        let table = LineOffsetTable::from_source("a\r\nbb\r\nccc");
        assert_eq!(line_col(&table, 3), (2, 0));
    }

    #[test]
    fn line_table_handles_lone_cr() {
        let table = LineOffsetTable::from_source("a\rbb");
        assert_eq!(line_col(&table, 2), (2, 0));
    }

    #[test]
    fn line_table_uses_utf16_offsets_for_non_ascii_source() {
        let source = "const smile = \"😀\";\nfunction after_emoji() {}\n";
        let byte_offset = source
            .find("function")
            .expect("test source should contain function");
        let v8_offset = source[..byte_offset].encode_utf16().count() as u32;

        // The fixture is only meaningful if the two unit spaces disagree.
        assert_ne!(v8_offset, byte_offset as u32);

        let table = LineOffsetTable::from_source(source);
        assert_eq!(line_col(&table, v8_offset), (2, 0));
    }

    /// The discriminating case: the multibyte characters sit on the SAME line
    /// as the offset under test, so a UTF-16-vs-byte mix-up surfaces in the
    /// *column* rather than only in line counting. A byte-offset
    /// implementation would report a strictly larger column. `😀` is 2 UTF-16
    /// units / 4 UTF-8 bytes, so two of them push the byte offset 4 past the
    /// V8 (UTF-16) offset. This mirrors what real Node emits
    /// (`Profiler.takePreciseCoverage` reports the UTF-16 offset, e.g. 18
    /// rather than the byte offset 22 for this shape).
    #[test]
    fn line_table_maps_columns_in_utf16_units_within_a_line() {
        let source = "const e = \"😀😀\"; function f(){}\n";
        let byte_index = source
            .find("function")
            .expect("test source should contain function");
        let function_byte_offset = byte_index as u32;
        let function_v8_offset = source[..byte_index].encode_utf16().count() as u32;

        assert!(
            function_v8_offset < function_byte_offset,
            "fixture must place a multibyte char before the function",
        );

        let table = LineOffsetTable::from_source(source);
        let (line, column) = line_col(&table, function_v8_offset);

        assert_eq!(line, 1);
        assert_eq!(column, function_v8_offset);
        assert!(
            column < function_byte_offset,
            "column must be measured in UTF-16 units, not bytes",
        );
    }

    #[test]
    fn line_table_builds_from_v8_line_lengths() {
        let table = LineOffsetTable::from_v8_line_lengths(&[20, 12])
            .expect("line lengths should build table");

        // Offset 20 is the separator slot of line 1; line 2 begins at 21.
        assert_eq!(line_col(&table, 20), (1, 20));
        assert_eq!(line_col(&table, 21), (2, 0));
    }

    #[test]
    fn line_table_clamps_past_end() {
        let table = LineOffsetTable::from_source("abc");
        assert_eq!(line_col(&table, 100), (1, 100));
    }

    #[test]
    fn parse_node_v8_coverage_dump() {
        let payload = serde_json::json!({
            "result": [{
                "scriptId": "42",
                "url": "file:///t/x.js",
                "functions": [{
                    "functionName": "a",
                    "ranges": [{"startOffset": 0, "endOffset": 10, "count": 3}],
                    "isBlockCoverage": false
                }]
            }]
        });
        let dump: V8CoverageDump = serde_json::from_value(payload).unwrap();
        assert_eq!(dump.result.len(), 1);

        let first_function = &dump.result[0].functions[0];
        assert_eq!(first_function.function_name, "a");
    }

    /// Some real Istanbul producers (e.g. Vitest under certain transforms)
    /// emit `null` for end columns. [`IstanbulPosition`] accepts that through
    /// `deserialize_nullable_u32`, turning `null` into `0` so a downstream
    /// consumer deserializing positions does not choke. The check lives on
    /// the position type because that is where the custom deserializer is
    /// attached.
    #[test]
    fn istanbul_position_normalizes_null_column_to_zero() {
        let null_column = serde_json::json!({ "line": 76, "column": null });
        let with_null: IstanbulPosition = serde_json::from_value(null_column).unwrap();
        assert_eq!((with_null.line, with_null.column), (76, 0));

        let numeric_column = serde_json::json!({ "line": 66, "column": 4 });
        let with_value: IstanbulPosition = serde_json::from_value(numeric_column).unwrap();
        assert_eq!(with_value.column, 4);
    }

    /// Property tests for the UTF-16-offset-to-line/column mapper.
    ///
    /// `position` backs every Istanbul range fallow emits for runtime
    /// coverage, so its invariants are stated as properties instead of
    /// resting on hand-picked examples. The line-boundary tests assemble
    /// their input from known line bodies joined by a chosen ending, which
    /// keeps the expected offsets independent of the char-walking
    /// construction loop.
    mod proptests {
        use super::*;
        use proptest::prelude::*;

        /// A line body over an alphabet that exercises the UTF-16 width
        /// branch: ASCII (1 unit), `€` (1 unit / 3 UTF-8 bytes), and `😀` (a
        /// surrogate pair, 2 units / 4 UTF-8 bytes). It never contains CR or
        /// LF, so the only line breaks are the ones the harness inserts.
        fn line_body() -> impl Strategy<Value = String> {
            let alphabet = vec!['a', 'b', ' ', '€', '😀'];
            prop::collection::vec(prop::sample::select(alphabet), 0..12)
                .prop_map(|chars| chars.into_iter().collect())
        }

        /// Length of `s` in UTF-16 code units — the unit `LineOffsetTable`
        /// stores.
        fn utf16_len(s: &str) -> u32 {
            s.encode_utf16().count() as u32
        }

        proptest! {
            /// `position` is monotonic: a larger-or-equal offset never maps
            /// to an earlier `(line, column)`. This guards the
            /// `binary_search` Err-branch `saturating_sub(1)` and the
            /// saturating column subtraction against off-by-one regressions,
            /// for arbitrary sources and past-end offsets alike.
            #[test]
            fn position_is_monotonic_in_offset(
                source in prop::collection::vec(any::<char>(), 0..200)
                    .prop_map(|chars| chars.into_iter().collect::<String>()),
                a in any::<u32>(),
                b in any::<u32>(),
            ) {
                let table = LineOffsetTable::from_source(&source);
                let lo = a.min(b);
                let hi = a.max(b);
                let p_lo = table.position(lo);
                let p_hi = table.position(hi);
                prop_assert!(p_lo.line >= 1, "line numbers are 1-indexed");
                prop_assert!(
                    (p_lo.line, p_lo.column) <= (p_hi.line, p_hi.column),
                    "position({lo}) = {p_lo:?} should not exceed position({hi}) = {p_hi:?}",
                );
            }

            /// Every true line boundary maps to column 0 of the right line,
            /// and every offset inside a line recovers its column. The input
            /// is built from known bodies plus an ending, so the expectation
            /// does not lean on the mapper's own line splitting.
            #[test]
            fn line_starts_and_columns_round_trip(
                bodies in prop::collection::vec(line_body(), 1..8),
                ending in prop::sample::select(vec!["\n", "\r\n", "\r"]),
            ) {
                let source = bodies.join(ending);
                let table = LineOffsetTable::from_source(&source);
                let separator_units = utf16_len(ending);
                // `bodies` always holds at least one element (see strategy).
                let last_index = bodies.len() - 1;

                let mut line_start = 0u32;
                for (index, body) in bodies.iter().enumerate() {
                    let expected_line = index as u32 + 1;
                    let body_units = utf16_len(body);

                    prop_assert_eq!(line_col(&table, line_start), (expected_line, 0));
                    for column in 0..=body_units {
                        prop_assert_eq!(
                            line_col(&table, line_start + column),
                            (expected_line, column)
                        );
                    }

                    line_start += body_units;
                    if index < last_index {
                        line_start += separator_units;
                    }
                }
            }

            /// `from_v8_line_lengths` advances one source position per line.
            /// The cumulative line starts strictly increase and each maps to
            /// column 0 of its line; offsets inside a non-final line recover
            /// the column. Lengths are bounded so the running offset never
            /// saturates, which keeps the reference model exact.
            #[test]
            fn v8_line_lengths_build_consistent_table(
                lengths in prop::collection::vec(0u32..1000, 1..20),
            ) {
                let table = LineOffsetTable::from_v8_line_lengths(&lengths)
                    .expect("non-empty lengths build a table");

                // Reference model: line 1 starts at 0, each later line starts
                // one position past the end of the previous one.
                let starts: Vec<u32> = std::iter::once(0)
                    .chain(lengths[..lengths.len() - 1].iter().scan(0u32, |acc, &length| {
                        *acc += length + 1;
                        Some(*acc)
                    }))
                    .collect();

                for pair in starts.windows(2) {
                    prop_assert!(pair[1] > pair[0], "line starts must strictly increase");
                }

                for (index, &start) in starts.iter().enumerate() {
                    let expected_line = index as u32 + 1;
                    prop_assert_eq!(line_col(&table, start), (expected_line, 0));

                    // The final line has no successor bounding it, so only
                    // non-final lines get the per-column sweep.
                    if index + 1 < lengths.len() {
                        for column in 0..=lengths[index] {
                            prop_assert_eq!(
                                line_col(&table, start + column),
                                (expected_line, column)
                            );
                        }
                    }
                }
            }
        }
    }
}