Skip to main content

shuck_indexer/
lib.rs

1#![warn(missing_docs)]
2#![cfg_attr(not(test), warn(clippy::unwrap_used))]
3
4//! Positional and structural indexes over parsed shell scripts.
5//!
6//! The indexer complements `shuck-parser` by building compact lookup tables for
7//! source lines, comments, syntactic regions, heredoc bodies, and physical line
8//! continuations. It is intended to be built once from parser output and then
9//! shared by semantic analysis, lint rules, suppressions, formatters, and report
10//! rendering.
11//!
12//! All positions are byte offsets represented with `shuck_ast::TextSize` and
13//! `shuck_ast::TextRange`. The crate does not build a character index: callers
14//! that need display columns should combine these byte offsets with the original
15//! source text at the UI boundary.
16//!
17//! [`Indexer`] is the preferred construction path when parser output is
18//! available. The lower-level indexes are also exported for integrations that
19//! only need line mapping or that already have an AST-shaped source of comments
20//! or regions.
21mod comment_index;
22mod line_index;
23mod region_index;
24
25/// Comment lookup types derived from parser output.
26pub use comment_index::{CommentIndex, IndexedComment};
27/// Line-based offset lookup utilities.
28pub use line_index::{LineEndingStyle, LineIndex};
29/// Structural region indexes over parsed shell source.
30pub use region_index::{IndexedHeredoc, RegionIndex, RegionKind};
31
32use line_index::{RawContinuationCandidate, RawContinuationMode};
33use shuck_ast::{File, TextSize};
34use shuck_parser::parser::ParseResult;
35
/// Optional index families that are not needed by every consumer.
///
/// The default options build the indexes used by linting and semantic analysis.
/// Source-layout indexes retain formatter-oriented lookup tables such as raw
/// continuation backslash offsets and heredoc closing-marker ranges.
///
/// The type is `Copy` and is configured builder-style: start from
/// [`IndexerOptions::new`] and chain the `with_*` methods, each of which
/// returns the updated value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct IndexerOptions {
    // Whether formatter-oriented layout tables are retained. The derived
    // `Default` leaves this `false`.
    source_layout_indexes: bool,
}

impl IndexerOptions {
    /// Return default indexer options.
    ///
    /// This is the same value as [`IndexerOptions::default`], so source-layout
    /// indexes start out disabled.
    pub fn new() -> Self {
        Self::default()
    }

    /// Enable or disable formatter-oriented source-layout indexes.
    ///
    /// When enabled, [`LineIndex::raw_continuation_line_starts`],
    /// [`LineIndex::raw_continuation_backslashes`], and
    /// [`RegionIndex::heredoc_closing_marker_range`] retain their lookup data.
    /// The default keeps those retained indexes disabled while still computing
    /// semantic continuation lines for [`Indexer::continuation_line_starts`].
    ///
    /// The receiver is taken by value and the updated options are returned, so
    /// the result must be used: calling this as a bare statement changes
    /// nothing.
    #[must_use = "this returns the updated options; the value it was called on is left unchanged"]
    pub fn with_source_layout_indexes(mut self, enabled: bool) -> Self {
        self.source_layout_indexes = enabled;
        self
    }

    /// Return whether formatter-oriented source-layout indexes are enabled.
    pub fn source_layout_indexes(self) -> bool {
        self.source_layout_indexes
    }
}
69
/// Pre-computed positional and structural index over a parsed shell script.
///
/// `Indexer` owns the line, comment, and syntactic-region indexes for one source
/// file. It also filters raw backslash-newline candidates into the continuation
/// lines that matter to shell analysis: continuations in comments, quoted text,
/// and heredoc bodies are excluded.
///
/// Build one `Indexer` for a parse result and pass references to downstream
/// analysis code. Query methods borrow precomputed data and do not walk the AST,
/// rescan the full source, or allocate.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Indexer {
    /// Line lookup table built from the source text.
    line_index: LineIndex,
    /// Comment lookup table built from the source, the line index, and the
    /// parsed file.
    comment_index: CommentIndex,
    /// Syntactic region lookup table built from the source and the parsed file.
    region_index: RegionIndex,
    /// Line-start offsets of the continuation lines that survived filtering.
    ///
    /// Membership is tested with a binary search, so this is expected to be in
    /// ascending order. NOTE(review): that relies on `LineIndex::build`
    /// returning its raw candidates in source order; confirm there.
    continuation_lines: Vec<TextSize>,
}
87
88impl Indexer {
89    /// Build all indexes from parser output and the original source text.
90    ///
91    /// `source` must be the exact text used to produce `output`; ranges in the
92    /// parse result are interpreted as byte offsets into that string. Mismatched
93    /// source text can make line and region queries meaningless, even though the
94    /// constructor defensively avoids panicking on malformed comment ranges.
95    pub fn new(source: &str, output: &ParseResult) -> Self {
96        Self::for_file(source, &output.file)
97    }
98
99    /// Build indexes from parser output using explicit options.
100    ///
101    /// `source` must be the exact text used to produce `output`; ranges in the
102    /// parse result are interpreted as byte offsets into that string.
103    pub fn new_with_options(source: &str, output: &ParseResult, options: IndexerOptions) -> Self {
104        Self::for_file_with_options(source, &output.file, options)
105    }
106
107    /// Build all indexes from an already parsed file and the original source text.
108    ///
109    /// `source` must be the exact text used to produce `file`; ranges in the
110    /// AST are interpreted as byte offsets into that string.
111    pub fn for_file(source: &str, file: &File) -> Self {
112        Self::for_file_with_options(source, file, IndexerOptions::default())
113    }
114
115    /// Build indexes from an already parsed file using explicit options.
116    ///
117    /// `source` must be the exact text used to produce `file`; ranges in the
118    /// AST are interpreted as byte offsets into that string.
119    pub fn for_file_with_options(source: &str, file: &File, options: IndexerOptions) -> Self {
120        let raw_mode = if options.source_layout_indexes() {
121            RawContinuationMode::StoreAndReturn
122        } else {
123            RawContinuationMode::ReturnOnly
124        };
125        let (line_index, raw_continuations) = LineIndex::build(source, raw_mode);
126        let comment_index = CommentIndex::new(source, &line_index, file);
127        let region_index = RegionIndex::new_with_source_layout_indexes(
128            source,
129            file,
130            options.source_layout_indexes(),
131        );
132        let continuation_lines =
133            collect_continuation_lines(&raw_continuations, &comment_index, &region_index);
134
135        Self {
136            line_index,
137            comment_index,
138            region_index,
139            continuation_lines,
140        }
141    }
142
143    /// Return the line index for this source text.
144    ///
145    /// This is useful for converting diagnostic byte offsets to 1-based line
146    /// numbers or for extracting line-local snippets from the original source.
147    pub fn line_index(&self) -> &LineIndex {
148        &self.line_index
149    }
150
151    /// Return the comment index extracted from parser-owned comments.
152    ///
153    /// Comments are exposed in source order and include parser-recognized
154    /// comments inside nested shell constructs.
155    pub fn comment_index(&self) -> &CommentIndex {
156        &self.comment_index
157    }
158
159    /// Return the syntactic region index for quoted, heredoc, and related spans.
160    ///
161    /// Region lookups are intended for rules and formatters that need to avoid
162    /// interpreting bytes the same way in every syntactic context.
163    pub fn region_index(&self) -> &RegionIndex {
164        &self.region_index
165    }
166
167    /// Return byte offsets for the start of each semantic continuation line.
168    ///
169    /// Each offset points at the first byte of a physical line that continues
170    /// the previous one because that previous line ended with an active
171    /// backslash-newline. Continuations inside comments, quotes, and heredocs
172    /// are filtered out.
173    pub fn continuation_line_starts(&self) -> &[TextSize] {
174        &self.continuation_lines
175    }
176
177    /// Return whether `offset` is on a semantic continuation line.
178    ///
179    /// The query first maps `offset` to its containing 1-based line, then checks
180    /// whether that line starts at one of [`Self::continuation_line_starts`].
181    /// Offsets past the final byte of the source are treated according to the
182    /// last indexed line.
183    pub fn is_continuation(&self, offset: TextSize) -> bool {
184        let line = self.line_index.line_number(offset);
185        let Some(line_start) = self.line_index.line_start(line) else {
186            return false;
187        };
188
189        contains_offset(&self.continuation_lines, line_start)
190    }
191}
192
193fn collect_continuation_lines(
194    raw_continuations: &[RawContinuationCandidate],
195    comment_index: &CommentIndex,
196    region_index: &RegionIndex,
197) -> Vec<TextSize> {
198    let mut continuation_lines = Vec::new();
199
200    for continuation in raw_continuations {
201        if comment_index.is_comment(continuation.backslash)
202            || region_index.is_heredoc(continuation.backslash)
203            || region_index.is_quoted(continuation.backslash)
204        {
205            continue;
206        }
207
208        continuation_lines.push(continuation.line_start);
209    }
210
211    continuation_lines
212}
213
214fn contains_offset(offsets: &[TextSize], offset: TextSize) -> bool {
215    offsets.binary_search(&offset).is_ok()
216}
217
#[cfg(test)]
mod tests {
    use super::*;
    use shuck_parser::parser::Parser;

    /// Parse `source` and index it with the default options.
    fn index(source: &str) -> Indexer {
        let parsed = Parser::new(source).parse().unwrap();
        Indexer::new(source, &parsed)
    }

    /// Byte offset of the first occurrence of `needle` in `source`.
    fn offset_of(source: &str, needle: &str) -> TextSize {
        TextSize::new(source.find(needle).unwrap() as u32)
    }

    #[test]
    fn detects_continuation_lines_without_allocating_source_copies() {
        // The first backslash-newline is active shell syntax; the second one
        // sits inside a double-quoted string and must be ignored.
        let source = "echo foo \\\n  bar\necho \"foo\\\nbar\"\n";
        let indexer = index(source);

        let starts = indexer.continuation_line_starts();
        assert_eq!(starts.len(), 1);
        assert!(indexer.is_continuation(TextSize::new(11)));
        assert!(!indexer.is_continuation(TextSize::new(28)));

        // Default options do not retain the formatter-oriented tables.
        let raw_backslashes = indexer.line_index().raw_continuation_backslashes();
        assert!(raw_backslashes.is_empty());
        assert!(indexer.region_index().heredocs().is_empty());
    }

    #[test]
    fn retains_source_layout_indexes_only_when_requested() {
        let source = "echo foo \\\n  bar\ncat <<EOF\nbody\nEOF\n";
        let parsed = Parser::new(source).parse().unwrap();
        let options = IndexerOptions::new().with_source_layout_indexes(true);
        let indexer = Indexer::new_with_options(source, &parsed, options);

        assert_eq!(
            indexer.line_index().raw_continuation_backslashes(),
            &[offset_of(source, "\\")]
        );
        assert_eq!(indexer.region_index().heredocs().len(), 1);
    }

    #[test]
    fn round_trips_parser_output_into_regions_comments_and_lines() {
        let source = "\
#!/bin/bash
echo \"$(printf '%s' \"$name\")\" # inline
cat <<'EOF'
literal $body
EOF
";
        let indexer = index(source);
        let regions = indexer.region_index();

        assert_eq!(indexer.line_index().line_count(), 6);
        assert_eq!(indexer.comment_index().comments().len(), 2);

        let heredoc_offset = offset_of(source, "literal $body");
        assert_eq!(regions.region_at(heredoc_offset), Some(RegionKind::Heredoc));
        assert!(regions.is_quoted(heredoc_offset));

        let name_offset = offset_of(source, "$name");
        assert_eq!(
            regions.region_at(name_offset),
            Some(RegionKind::DoubleQuoted)
        );
    }
}