include_preprocessor/
include_preprocessor.rs

1use std::collections::{HashMap, HashSet, hash_map::DefaultHasher};
2use std::hash::{Hash, Hasher};
3use std::io::Error as IOError;
4use std::ops::Range;
5use std::path::{Path, PathBuf};
6use std::sync::{Arc, mpsc};
7use std::{fs, mem, slice};
8
9use threadpool::ThreadPool;
10
11use crate::line_parser::{IncludePath, Line, parse_line};
12
/// Passed to [preprocess] or [preprocess_with_source_tracker] to configure file resolution.
///
/// Holds two ordered lists of directories: "base paths" and "quoted paths". For details on file
/// resolution, see the [File Resolution](crate#file-resolution) section of the top-level
/// documentation.
#[derive(Clone, Debug, Default)]
pub struct SearchPaths {
    /// Directories registered through [SearchPaths::push_base_path], in insertion order.
    base_paths: Vec<PathBuf>,
    /// Directories registered through [SearchPaths::push_quoted_path], in insertion order.
    quoted_paths: Vec<PathBuf>,
}

impl SearchPaths {
    /// Creates a new empty [SearchPaths] instance that does not yet contain any search-paths.
    ///
    /// Equivalent to [SearchPaths::default].
    pub fn new() -> Self {
        Self::default()
    }

    /// Adds a new "base path" to the search paths.
    ///
    /// The path is stored exactly as given; it is neither canonicalized nor checked for existence.
    ///
    /// Refer to the [File Resolution](crate#file-resolution) section of the top-level documentation
    /// for details on how base paths are used to resolve included file paths.
    pub fn push_base_path<P>(&mut self, path: P)
    where
        P: AsRef<Path>,
    {
        self.base_paths.push(path.as_ref().to_path_buf());
    }

    /// Adds a new "quoted path" to the search paths.
    ///
    /// The path is stored exactly as given; it is neither canonicalized nor checked for existence.
    ///
    /// Refer to the [File Resolution](crate#file-resolution) section of the top-level documentation
    /// for details on how quoted paths are used to resolve included file paths.
    pub fn push_quoted_path<P>(&mut self, path: P)
    where
        P: AsRef<Path>,
    {
        self.quoted_paths.push(path.as_ref().to_path_buf());
    }

    /// Returns an iterator over the "base paths" registered with this [SearchPaths] instance, in
    /// the order in which they were added.
    ///
    /// Refer to the [File Resolution](crate#file-resolution) section of the top-level documentation
    /// for details on how base paths are used to resolve included file paths.
    pub fn base_paths(&self) -> impl Iterator<Item = &PathBuf> {
        self.base_paths.iter()
    }

    /// Returns an iterator over the directories that are searched for quoted includes.
    ///
    /// Yields the "quoted paths" registered with this [SearchPaths] instance first, in the order
    /// in which they were added, followed by all "base paths", again in insertion order.
    ///
    /// Refer to the [File Resolution](crate#file-resolution) section of the top-level documentation
    /// for details on how quoted paths are used to resolve included file paths.
    pub fn quoted_paths(&self) -> impl Iterator<Item = &PathBuf> {
        self.quoted_paths.iter().chain(self.base_paths.iter())
    }
}
79
/// Error returned when [preprocess] or [preprocess_with_source_tracker] fails.
///
/// Implements [std::error::Error], so it can be boxed or propagated with `?` by callers that use
/// `Box<dyn std::error::Error>`.
#[derive(Debug)]
pub enum Error {
    /// Variant returned when an included file path cannot be resolved to a file.
    FileNotFound(FileNotFoundError),

    /// Variant returned when an interaction with the underlying OS failed.
    IO(IOError),

    /// Variant returned when either the entry-point file or any included file contains invalid
    /// directives.
    Parse(ParseError),
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::FileNotFound(err) => std::fmt::Display::fmt(err, f),
            Error::IO(err) => write!(f, "I/O error: {}", err),
            Error::Parse(err) => std::fmt::Display::fmt(err, f),
        }
    }
}

impl std::error::Error for Error {
    /// Exposes the wrapped OS error for the `IO` variant; the other variants have no deeper cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Error::IO(err) => Some(err),
            Error::FileNotFound(_) | Error::Parse(_) => None,
        }
    }
}

impl From<FileNotFoundError> for Error {
    fn from(err: FileNotFoundError) -> Self {
        Error::FileNotFound(err)
    }
}

impl From<IOError> for Error {
    fn from(err: IOError) -> Self {
        Error::IO(err)
    }
}

impl From<ParseError> for Error {
    fn from(err: ParseError) -> Self {
        Error::Parse(err)
    }
}

/// Error returned by [preprocess] or [preprocess_with_source_tracker] when an included file path
/// cannot be resolved to a file.
///
/// See also [Error].
#[derive(Debug)]
pub struct FileNotFoundError {
    included_path: PathBuf,
    source_file: PathBuf,
    source: String,
    line_number: usize,
}

impl FileNotFoundError {
    /// The path the preprocessor tried and failed to resolve.
    pub fn included_path(&self) -> &Path {
        &self.included_path
    }

    /// The path of the file that contains the `#include` directive the preprocessor tried to
    /// resolve.
    pub fn source_file(&self) -> &Path {
        &self.source_file
    }

    /// The text of the file that contains the `#include` directive the preprocessor tried to
    /// resolve.
    pub fn source(&self) -> &str {
        &self.source
    }

    /// The line number of the `#include` directive the preprocessor tried to resolve.
    ///
    /// The preprocessor counts lines from zero: the first line of a file is line `0`.
    pub fn line_number(&self) -> usize {
        self.line_number
    }
}

impl std::fmt::Display for FileNotFoundError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "could not resolve include path `{}` referenced in `{}` (zero-based line {})",
            self.included_path.display(),
            self.source_file.display(),
            self.line_number
        )
    }
}

impl std::error::Error for FileNotFoundError {}

/// Error returned by [preprocess] or [preprocess_with_source_tracker] when either the entry-point
/// file or any included file contains invalid directives.
///
/// See also [Error].
#[derive(Debug)]
pub struct ParseError {
    message: String,
    source_file: PathBuf,
    source: String,
    line_number: usize,
}

impl ParseError {
    /// Message describing what failed to parse.
    pub fn message(&self) -> &str {
        &self.message
    }

    /// The path of the file that contains the invalid directive.
    pub fn source_file(&self) -> &Path {
        &self.source_file
    }

    /// The text of the file that contains the invalid directive.
    pub fn source(&self) -> &str {
        &self.source
    }

    /// The line number of the invalid directive.
    ///
    /// The preprocessor counts lines from zero: the first line of a file is line `0`.
    pub fn line_number(&self) -> usize {
        self.line_number
    }
}

impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "invalid directive in `{}` (zero-based line {}): {}",
            self.source_file.display(),
            self.line_number,
            self.message
        )
    }
}

impl std::error::Error for ParseError {}
181
182/// Processes the `entry_point` file, potentially inlining files referenced by `#include`
183/// directives.
184///
185/// Outputs string chunks to the given `output_sink` which, when concatenated in order, represent
186/// the intended result string.
187///
188/// For details on directive processing and inlining, refer to the [top-level documentation](crate).
189///
190/// Returns an error when:
191///
192/// - An invalid directive is encountered.
193/// - A path referenced by an `#include` directive fails to resolve to a valid file.
194/// - An error occurred while interacting with the underlying OS.
195///
196pub fn preprocess<P, S>(
197    entry_point: P,
198    search_paths: SearchPaths,
199    output_sink: S,
200) -> Result<S, Error>
201where
202    P: AsRef<Path>,
203    S: OutputSink,
204{
205    preprocess_with_source_tracker(entry_point, search_paths, output_sink, NoTrack)
206}
207
208/// Processes the `entry_point` file, potentially inlining files referenced by `#include`
209/// directives and notifying the `source_tracker` of included files.
210///
211/// Behaves exactly like [preprocess], except in that it takes an additional `source_tracker`
212/// argument. The `source_tracker` will be notified once for each file that is included.
213///
214/// See also the [SourceTracker] trait and the [Source Tracking](crate#source-tracking) section of
215/// the top-level-documentation.
216pub fn preprocess_with_source_tracker<P, S, T>(
217    entry_point: P,
218    search_paths: SearchPaths,
219    mut output_sink: S,
220    mut source_tracker: T,
221) -> Result<S, Error>
222where
223    P: AsRef<Path>,
224    S: OutputSink,
225    T: SourceTracker,
226{
227    let parsed = Parsed::try_init(entry_point, search_paths)?;
228
229    parsed.write(&mut output_sink, &mut source_tracker);
230
231    Ok(output_sink)
232}
233
234enum LoadState {
235    Loaded(ParsedNode),
236    Pending,
237}
238
239impl LoadState {
240    fn loaded(&self) -> Option<&ParsedNode> {
241        if let LoadState::Loaded(node) = self {
242            Some(node)
243        } else {
244            None
245        }
246    }
247}
248
249struct Parsed {
250    lookup: HashMap<u64, LoadState>,
251    root_key: u64,
252}
253
254impl Parsed {
255    fn try_init<P>(entry_point: P, search_paths: SearchPaths) -> Result<Self, Error>
256    where
257        P: AsRef<Path>,
258    {
259        let mut lookup = HashMap::new();
260        let (tx, rx) = mpsc::channel();
261        let pool = ThreadPool::new(num_cpus::get());
262        let entry_path = entry_point.as_ref().canonicalize()?;
263
264        let mut hasher = DefaultHasher::new();
265
266        entry_path.hash(&mut hasher);
267
268        let root_key = hasher.finish();
269        let root_node = ParsedNode::try_parse(entry_path, &search_paths);
270
271        lookup.insert(root_key, LoadState::Pending);
272
273        tx.send(root_node).unwrap();
274
275        let search_paths = Arc::new(search_paths);
276        let mut balance = 1;
277
278        loop {
279            if balance == 0 {
280                break;
281            }
282
283            let node = rx.recv().unwrap()?;
284
285            balance -= 1;
286
287            // Load and parse any files included by this node.
288            'inner: for chunk in node.chunks() {
289                if let NodeChunk::Include(path) = chunk {
290                    let mut hasher = DefaultHasher::new();
291
292                    path.hash(&mut hasher);
293
294                    let key = hasher.finish();
295
296                    if lookup.contains_key(&key) {
297                        // File has been/is being loaded, skip
298                        continue 'inner;
299                    }
300
301                    // Not yet loaded, try and load
302                    lookup.insert(key, LoadState::Pending);
303                    balance += 1;
304
305                    let tx_clone = tx.clone();
306                    let search_paths_clone = search_paths.clone();
307                    let path_buf = path.to_path_buf();
308
309                    pool.execute(move || {
310                        tx_clone
311                            .send(ParsedNode::try_parse(path_buf, &search_paths_clone))
312                            .unwrap();
313                    });
314                }
315            }
316
317            lookup.insert(node.key(), LoadState::Loaded(node));
318        }
319
320        Ok(Parsed { lookup, root_key })
321    }
322
323    fn get_by_key(&self, key: u64) -> Option<&ParsedNode> {
324        self.lookup.get(&key).and_then(|node| node.loaded())
325    }
326
327    fn get_by_path<P>(&self, path: P) -> Option<&ParsedNode>
328    where
329        P: AsRef<Path>,
330    {
331        let mut hasher = DefaultHasher::new();
332
333        path.as_ref().hash(&mut hasher);
334
335        let key = hasher.finish();
336
337        self.get_by_key(key)
338    }
339
340    fn write<S, T>(&self, output_sink: &mut S, source_tracker: &mut T)
341    where
342        S: OutputSink,
343        T: SourceTracker,
344    {
345        let mut stack = Vec::new();
346        let mut seen = HashSet::new();
347
348        let root_node = self.get_by_key(self.root_key).unwrap();
349
350        if root_node.once() {
351            seen.insert(root_node.key());
352        }
353
354        let mut current_node = root_node;
355        let mut current_chunk = 0;
356
357        loop {
358            if let Some(chunk) = current_node.get_chunk(current_chunk) {
359                match chunk {
360                    NodeChunk::Text(chunk) => {
361                        output_sink.sink_source_mapped(SourceMappedChunk {
362                            text: chunk.text(),
363                            source_path: current_node.path(),
364                            source_range: chunk.byte_range(),
365                        });
366
367                        current_chunk += 1;
368                    }
369                    NodeChunk::Include(path) => {
370                        let node = self.get_by_path(path).unwrap();
371
372                        if node.once() && seen.contains(&node.key()) {
373                            current_chunk += 1;
374                        } else {
375                            seen.insert(node.key());
376
377                            stack.push((current_node.key(), current_chunk));
378
379                            current_node = node;
380                            current_chunk = 0;
381                        }
382                    }
383                }
384            } else {
385                if let Some((parent_key, child_chunk)) = stack.pop() {
386                    // Ensure newline after included chunk
387                    output_sink.sink("\n");
388
389                    current_node = self.get_by_key(parent_key).unwrap();
390                    current_chunk = child_chunk + 1;
391                } else {
392                    break;
393                }
394            }
395        }
396
397        for node in self.lookup.values() {
398            let node = node.loaded().unwrap();
399
400            source_tracker.track(node.path(), node.source());
401        }
402    }
403}
404
/// Owned form of a chunk, as stored in the chunk buffer of a [ParsedNode].
#[derive(Debug)]
enum NodeChunkInternal {
    /// Byte range of a run of text within the source of the owning node.
    Text(Range<usize>),
    /// Resolved path of a file referenced by an `#include` directive.
    Include(PathBuf),
}
410
/// A run of plain text borrowed from a parsed file, together with its position in that file.
struct TextChunk<'a> {
    /// Byte range that `text` occupies within the source of the file.
    byte_range: Range<usize>,
    /// The text of the chunk.
    text: &'a str,
}

impl<'a> TextChunk<'a> {
    /// The text of the chunk.
    ///
    /// The returned slice borrows from the underlying source (`'a`), not from this chunk, so it
    /// may outlive the chunk value.
    fn text(&self) -> &'a str {
        self.text
    }

    /// The byte range of the chunk within the source of its file.
    fn byte_range(&self) -> Range<usize> {
        self.byte_range.clone()
    }
}
425
426enum NodeChunk<'a> {
427    Text(TextChunk<'a>),
428    Include(&'a Path),
429}
430
431struct ParsedNode {
432    path: PathBuf,
433    key: u64,
434    once: bool,
435    source: String,
436    chunk_buffer: Vec<NodeChunkInternal>,
437}
438
439impl ParsedNode {
440    fn try_parse(path: PathBuf, search_paths: &SearchPaths) -> Result<Self, Error> {
441        let source = fs::read_to_string(&path)?;
442        let source_len = source.len();
443
444        let mut remainder = source.as_str();
445        let mut line_number = 0;
446        let mut chunk_buffer = Vec::new();
447        let mut once = false;
448        let mut current_text_range = 0..0;
449
450        while remainder.len() > 0 {
451            let (new_remainder, line) = parse_line(remainder).map_err(|err| {
452                let mut buf = PathBuf::new();
453
454                buf.push(&path);
455
456                ParseError {
457                    source_file: buf,
458                    line_number,
459                    source: source.clone(),
460                    message: err.to_string(),
461                }
462            })?;
463
464            let pos = source_len - new_remainder.len();
465
466            if line == Line::Text {
467                current_text_range.end = pos;
468            } else {
469                let range = mem::replace(&mut current_text_range, pos..pos);
470
471                if range.len() > 0 {
472                    chunk_buffer.push(NodeChunkInternal::Text(range))
473                }
474            }
475
476            match line {
477                Line::Include(target) => {
478                    let resolved = try_resolve_include_path(
479                        target,
480                        (path.as_ref(), &source, line_number),
481                        search_paths,
482                    )?;
483
484                    chunk_buffer.push(NodeChunkInternal::Include(resolved));
485                }
486                Line::PragmaOnce => {
487                    once = true;
488                }
489                Line::Text => (),
490            }
491
492            remainder = new_remainder;
493            line_number += 1;
494        }
495
496        if current_text_range.len() != 0 {
497            chunk_buffer.push(NodeChunkInternal::Text(current_text_range))
498        }
499
500        let mut hasher = DefaultHasher::new();
501
502        path.hash(&mut hasher);
503
504        let key = hasher.finish();
505
506        Ok(ParsedNode {
507            path,
508            key,
509            once,
510            source,
511            chunk_buffer,
512        })
513    }
514
515    fn path(&self) -> &Path {
516        self.path.as_ref()
517    }
518
519    fn key(&self) -> u64 {
520        self.key
521    }
522
523    fn source(&self) -> &str {
524        &self.source
525    }
526
527    fn once(&self) -> bool {
528        self.once
529    }
530
531    fn get_chunk(&self, index: usize) -> Option<NodeChunk<'_>> {
532        self.chunk_buffer.get(index).map(|chunk| match chunk {
533            NodeChunkInternal::Text(range) => NodeChunk::Text(TextChunk {
534                byte_range: range.clone(),
535                text: &self.source[range.clone()],
536            }),
537            NodeChunkInternal::Include(path) => NodeChunk::Include(path.as_path()),
538        })
539    }
540
541    fn chunks(&self) -> NodeChunks<'_> {
542        let ParsedNode {
543            source,
544            chunk_buffer,
545            ..
546        } = self;
547
548        NodeChunks {
549            source,
550            chunks: chunk_buffer.iter(),
551        }
552    }
553}
554
555struct NodeChunks<'a> {
556    source: &'a String,
557    chunks: slice::Iter<'a, NodeChunkInternal>,
558}
559
560impl<'a> Iterator for NodeChunks<'a> {
561    type Item = NodeChunk<'a>;
562
563    fn next(&mut self) -> Option<Self::Item> {
564        let NodeChunks { source, chunks } = self;
565
566        if let Some(chunk) = chunks.next() {
567            let chunk = match chunk {
568                NodeChunkInternal::Text(range) => NodeChunk::Text(TextChunk {
569                    byte_range: range.clone(),
570                    text: &source[range.clone()],
571                }),
572                NodeChunkInternal::Include(path) => NodeChunk::Include(path),
573            };
574
575            Some(chunk)
576        } else {
577            None
578        }
579    }
580}
581
/// A chunk of source text along with a path and range that identify the origin of the text.
///
/// See also [OutputSink::sink_source_mapped] and the [Custom OutputSink](crate#custom-outputsink)
/// section of the top-level documentation.
#[derive(Clone, Debug)]
pub struct SourceMappedChunk<'a> {
    text: &'a str,
    source_path: &'a Path,
    source_range: Range<usize>,
}

impl<'a> SourceMappedChunk<'a> {
    /// The text of the chunk.
    ///
    /// The returned slice borrows from the underlying source (`'a`), not from this chunk, so it
    /// may be kept after the chunk value has been dropped.
    pub fn text(&self) -> &'a str {
        self.text
    }

    /// The path of the file from which the chunk originates.
    ///
    /// Like [SourceMappedChunk::text], the returned path is not tied to the lifetime of the chunk
    /// value itself.
    pub fn source_path(&self) -> &'a Path {
        self.source_path
    }

    /// The range in bytes of the span within the source file from which the chunk originates.
    pub fn source_range(&self) -> Range<usize> {
        self.source_range.clone()
    }
}
608
609/// Trait implemented by types into which the preprocessor may sink its output.
610///
611/// The preprocessor outputs chunk of text, that when concatenated in order represent the intended
612/// output string of the preprocessor.
613///
614/// Most chunks sunk by the preprocessor can be mapped to a source text-span within a file in the
615/// filesystem. In this case the preprocessor will call [OutputSink::sink_source_mapped] to sink
616/// the text-chunk, along with information identifying the source text-span.
617///
618/// The preprocessor may generate some chunks that cannot be mapped to a source text-span. If this
619/// is the case, preprocessor will call [OutputSink::sink] to the text-chunk, without any additional
620/// information.
621///
622/// See also the [Custom OutputSink](crate#custom-outputsink) section of the top-level
623/// documentation.
624pub trait OutputSink {
625    /// Called with chunks that cannot be mapped to a source span.
626    fn sink(&mut self, chunk: &str);
627
628    /// Called with chunks that can be mapped to a source span.
629    fn sink_source_mapped(&mut self, source_mapped_chunk: SourceMappedChunk);
630}
631
632impl OutputSink for String {
633    fn sink(&mut self, chunk: &str) {
634        self.push_str(chunk);
635    }
636
637    fn sink_source_mapped(&mut self, source_mapped_chunk: SourceMappedChunk) {
638        self.push_str(source_mapped_chunk.text)
639    }
640}
641
/// Trait implemented by types that track source-file use by [preprocess_with_source_tracker].
///
/// [preprocess_with_source_tracker] calls [SourceTracker::track] for each file that contributed
/// to the preprocessor's output. Every such file is reported exactly once, no matter how often it
/// is referenced or included.
///
/// See also the [Source Tracking](crate#source-tracking) section of the top-level documentation.
pub trait SourceTracker {
    /// Called for each file successfully included by the preprocessor, with the path of the file
    /// and a reference to its source text.
    fn track(&mut self, path: &Path, source: &str);
}
654
655impl<T> SourceTracker for &'_ mut T
656where
657    T: SourceTracker,
658{
659    fn track(&mut self, path: &Path, source: &str) {
660        <T as SourceTracker>::track(self, path, source)
661    }
662}
663
664struct NoTrack;
665
666impl SourceTracker for NoTrack {
667    fn track(&mut self, _path: &Path, _source: &str) {}
668}
669
670fn try_resolve_include_path(
671    include_path: IncludePath,
672    included_from: (&Path, &str, usize),
673    search_paths: &SearchPaths,
674) -> Result<PathBuf, Error> {
675    let mut resolved = None;
676
677    let path = match include_path {
678        IncludePath::Angle(path) => {
679            for search_path in search_paths.base_paths() {
680                let join = search_path.join(path);
681
682                if join.is_file() {
683                    resolved = Some(join);
684
685                    break;
686                }
687            }
688
689            path
690        }
691        IncludePath::Quote(path) => {
692            let join = included_from.0.parent().unwrap().join(path);
693
694            if join.is_file() {
695                resolved = Some(join);
696            } else {
697                for search_path in search_paths.quoted_paths() {
698                    let join = search_path.join(path);
699
700                    if join.is_file() {
701                        resolved = Some(join);
702
703                        break;
704                    }
705                }
706            }
707
708            path
709        }
710    };
711
712    if let Some(resolved) = resolved {
713        Ok(resolved.canonicalize()?)
714    } else {
715        Err(FileNotFoundError {
716            included_path: path.to_path_buf(),
717            source_file: included_from.0.to_path_buf(),
718            source: included_from.1.to_string(),
719            line_number: included_from.2,
720        }
721        .into())
722    }
723}