Skip to main content

wdl_engine/eval/
trie.rs

1//! Implementation of the inputs prefix trie.
2//!
3//! The inputs prefix trie is used to map input host paths to guest paths for
4//! task evaluation.
5
6use std::collections::HashMap;
7use std::path::Component;
8use std::path::PathBuf;
9
10use anyhow::Context;
11use anyhow::Result;
12use anyhow::bail;
13use url::Url;
14
15use crate::ContentKind;
16use crate::EvaluationPath;
17use crate::EvaluationPathKind;
18use crate::GuestPath;
19use crate::backend::Input;
20use crate::eval::ROOT_NAME;
21
22/// Represents a node in an input trie.
23#[derive(Debug)]
24struct InputTrieNode {
25    /// The children of this node.
26    children: HashMap<String, Self>,
27    /// The identifier of the node in the trie.
28    ///
29    /// A node's identifier is used when formatting guest paths of children.
30    id: usize,
31    /// The index into the trie's `inputs` collection.
32    ///
33    /// This is `Some` only for terminal nodes in the trie.
34    index: Option<usize>,
35}
36
37impl InputTrieNode {
38    /// Constructs a new input trie node with the given id.
39    fn new(id: usize) -> Self {
40        Self {
41            children: Default::default(),
42            id,
43            index: None,
44        }
45    }
46}
47
48/// Represents a prefix trie based on input paths.
49///
50/// This is used to determine guest paths for inputs.
51///
52/// From the root to a terminal node represents a unique input.
53#[derive(Debug)]
54pub struct InputTrie {
55    /// The guest inputs root directory.
56    ///
57    /// This is `None` for backends that don't use containers.
58    guest_inputs_dir: Option<&'static str>,
59    /// The URL path children of the tree.
60    ///
61    /// The key in the map is the scheme of each URL.
62    urls: HashMap<String, InputTrieNode>,
63    /// The local path children of the tree.
64    ///
65    /// The key in the map is the first component of each path.
66    paths: HashMap<String, InputTrieNode>,
67    /// The inputs in the trie.
68    inputs: Vec<Input>,
69    /// The next node identifier.
70    next_id: usize,
71}
72
73impl InputTrie {
74    /// Constructs a new inputs trie without a guest inputs directory.
75    ///
76    /// Terminal nodes in the trie will not be mapped to guest paths.
77    pub fn new() -> Self {
78        Self {
79            guest_inputs_dir: None,
80            urls: Default::default(),
81            paths: Default::default(),
82            inputs: Default::default(),
83            // The first id starts at 1 as 0 is considered the "virtual root" of the trie
84            next_id: 1,
85        }
86    }
87
88    /// Constructs a new inputs trie with a guest inputs directory.
89    ///
90    /// Inputs with a host path will be mapped to a guest path relative to the
91    /// guest inputs directory.
92    ///
93    /// Note: a guest inputs directory is always a Unix-style path.
94    ///
95    /// # Panics
96    ///
97    /// Panics if the guest inputs directory does not end with a slash.
98    pub fn new_with_guest_dir(guest_inputs_dir: &'static str) -> Self {
99        assert!(guest_inputs_dir.ends_with('/'));
100
101        let mut trie = Self::new();
102        trie.guest_inputs_dir = Some(guest_inputs_dir);
103        trie
104    }
105
106    /// Inserts a new input into the trie.
107    ///
108    /// The path is either a local or remote input path.
109    ///
110    /// Relative paths are made absolute via the provided base path.
111    ///
112    /// If an input was added, returns `Ok(Some(index))` where `index` is the
113    /// index of the input in the trie.
114    ///
115    /// Returns `Ok(None)` if the provided path was already a guest input path.
116    ///
117    /// Returns an error for an invalid input path.
118    pub fn insert(
119        &mut self,
120        kind: ContentKind,
121        path: &str,
122        base_dir: &EvaluationPath,
123    ) -> Result<Option<usize>> {
124        match base_dir.join(path)?.into_kind() {
125            EvaluationPathKind::Local(path) => {
126                // Check to see if the path being inserted is already a guest path
127                if let Some(dir) = self.guest_inputs_dir
128                    && path.starts_with(dir)
129                {
130                    return Ok(None);
131                }
132
133                self.insert_path(kind, path).map(Some)
134            }
135            EvaluationPathKind::Remote(url) => self.insert_url(kind, url).map(Some),
136        }
137    }
138
139    /// Gets the inputs of the trie as a slice.
140    pub fn as_slice(&self) -> &[Input] {
141        &self.inputs
142    }
143
144    /// Gets the inputs of the trie as a mutable slice.
145    pub fn as_slice_mut(&mut self) -> &mut [Input] {
146        &mut self.inputs
147    }
148
149    /// Inserts an input with a local path into the trie.
150    fn insert_path(&mut self, kind: ContentKind, path: PathBuf) -> Result<usize> {
151        let mut components = path.components();
152
153        let component = components
154            .next()
155            .context("input path cannot be empty")?
156            .as_os_str()
157            .to_str()
158            .with_context(|| format!("input path `{path}` is not UTF-8", path = path.display()))?;
159
160        let mut parent_id = 0;
161        let mut node = self.paths.entry(component.to_string()).or_insert_with(|| {
162            let node = InputTrieNode::new(self.next_id);
163            self.next_id += 1;
164            node
165        });
166
167        let mut last_component = None;
168        for component in components {
169            match component {
170                Component::CurDir | Component::ParentDir => {
171                    bail!(
172                        "input path `{path}` may not contain `.` or `..`",
173                        path = path.display()
174                    );
175                }
176                _ => {}
177            }
178
179            let component = component.as_os_str().to_str().with_context(|| {
180                format!("input path `{path}` is not UTF-8", path = path.display())
181            })?;
182
183            parent_id = node.id;
184
185            node = node
186                .children
187                .entry(component.to_string())
188                .or_insert_with(|| {
189                    let node = InputTrieNode::new(self.next_id);
190                    self.next_id += 1;
191                    node
192                });
193
194            last_component = Some(component);
195        }
196
197        // Check to see if the input already exists in the trie
198        if let Some(index) = node.index {
199            return Ok(index);
200        }
201
202        let guest_path = self.guest_inputs_dir.map(|d| {
203            GuestPath::new(format!(
204                "{d}{parent_id}/{last}",
205                // On Windows, `last_component` might be `Some` despite being a root due to the
206                // prefix (e.g. `C:`); instead check if the path has a parent
207                last = if path.parent().is_none() {
208                    ROOT_NAME
209                } else {
210                    last_component.unwrap_or(ROOT_NAME)
211                }
212            ))
213        });
214
215        let index = self.inputs.len();
216        self.inputs.push(Input::new(
217            kind,
218            EvaluationPath::from_local_path(path),
219            guest_path,
220        ));
221        node.index = Some(index);
222        Ok(index)
223    }
224
225    /// Inserts an input with a URL into the trie.
226    fn insert_url(&mut self, kind: ContentKind, url: Url) -> Result<usize> {
227        // Insert for scheme
228        let mut node = self
229            .urls
230            .entry(url.scheme().to_string())
231            .or_insert_with(|| {
232                let node = InputTrieNode::new(self.next_id);
233                self.next_id += 1;
234                node
235            });
236
237        // Insert the authority; if the URL's path is empty, we'll
238        let mut parent_id = node.id;
239        node = node
240            .children
241            .entry(url.authority().to_string())
242            .or_insert_with(|| {
243                let node = InputTrieNode::new(self.next_id);
244                self.next_id += 1;
245                node
246            });
247
248        // Insert the path segments
249        let mut last_segment = None;
250        if let Some(segments) = url.path_segments() {
251            for segment in segments {
252                parent_id = node.id;
253                node = node.children.entry(segment.to_string()).or_insert_with(|| {
254                    let node = InputTrieNode::new(self.next_id);
255                    self.next_id += 1;
256                    node
257                });
258
259                if !segment.is_empty() {
260                    last_segment = Some(segment);
261                }
262            }
263        }
264
265        // Check to see if the input already exists in the trie
266        if let Some(index) = node.index {
267            return Ok(index);
268        }
269
270        let guest_path = self.guest_inputs_dir.as_ref().map(|d| {
271            GuestPath::new(format!(
272                "{d}{parent_id}/{last}",
273                last = last_segment.unwrap_or(ROOT_NAME)
274            ))
275        });
276
277        let index = self.inputs.len();
278        self.inputs
279            .push(Input::new(kind, EvaluationPath::try_from(url)?, guest_path));
280        node.index = Some(index);
281        Ok(index)
282    }
283}
284
285#[cfg(test)]
286mod test {
287    use pretty_assertions::assert_eq;
288
289    use super::*;
290
291    #[test]
292    fn empty_trie() {
293        let empty = InputTrie::new();
294        assert!(empty.as_slice().is_empty());
295    }
296
297    #[cfg(unix)]
298    #[test]
299    fn unmapped_inputs_unix() {
300        let mut trie = InputTrie::new();
301        let base_dir: EvaluationPath = "/base".parse().unwrap();
302        trie.insert(ContentKind::File, "/foo/bar/baz", &base_dir)
303            .unwrap();
304        assert_eq!(trie.as_slice().len(), 1);
305        assert_eq!(trie.as_slice()[0].path().to_string(), "/foo/bar/baz");
306        assert!(trie.as_slice()[0].guest_path().is_none());
307    }
308
309    #[cfg(windows)]
310    #[test]
311    fn unmapped_inputs_windows() {
312        let mut trie = InputTrie::new();
313        let base_dir: EvaluationPath = "C:\\base".parse().unwrap();
314        trie.insert(ContentKind::File, "C:\\foo\\bar\\baz", &base_dir)
315            .unwrap();
316        assert_eq!(trie.as_slice().len(), 1);
317        assert_eq!(trie.as_slice()[0].path().to_string(), "C:\\foo\\bar\\baz");
318        assert!(trie.as_slice()[0].guest_path().is_none());
319    }
320
321    #[cfg(unix)]
322    #[test]
323    fn non_empty_trie_unix() {
324        let mut trie = InputTrie::new_with_guest_dir("/inputs/");
325        let base_dir: EvaluationPath = "/base".parse().unwrap();
326        trie.insert(ContentKind::Directory, "/", &base_dir)
327            .unwrap()
328            .unwrap();
329        trie.insert(ContentKind::File, "/foo/bar/foo.txt", &base_dir)
330            .unwrap()
331            .unwrap();
332        trie.insert(ContentKind::File, "/foo/bar/bar.txt", &base_dir)
333            .unwrap()
334            .unwrap();
335        trie.insert(ContentKind::File, "/foo/baz/foo.txt", &base_dir)
336            .unwrap()
337            .unwrap();
338        trie.insert(ContentKind::File, "/foo/baz/bar.txt", &base_dir)
339            .unwrap()
340            .unwrap();
341        trie.insert(ContentKind::File, "/bar/foo/foo.txt", &base_dir)
342            .unwrap()
343            .unwrap();
344        trie.insert(ContentKind::File, "/bar/foo/bar.txt", &base_dir)
345            .unwrap()
346            .unwrap();
347        trie.insert(ContentKind::Directory, "/baz", &base_dir)
348            .unwrap()
349            .unwrap();
350        trie.insert(ContentKind::File, "https://example.com/", &base_dir)
351            .unwrap()
352            .unwrap();
353        trie.insert(
354            ContentKind::File,
355            "https://example.com/foo/bar/foo.txt",
356            &base_dir,
357        )
358        .unwrap()
359        .unwrap();
360        trie.insert(
361            ContentKind::File,
362            "https://example.com/foo/bar/bar.txt",
363            &base_dir,
364        )
365        .unwrap()
366        .unwrap();
367        trie.insert(
368            ContentKind::File,
369            "https://example.com/foo/baz/foo.txt",
370            &base_dir,
371        )
372        .unwrap()
373        .unwrap();
374        trie.insert(
375            ContentKind::File,
376            "https://example.com/foo/baz/bar.txt",
377            &base_dir,
378        )
379        .unwrap()
380        .unwrap();
381        trie.insert(
382            ContentKind::File,
383            "https://example.com/bar/foo/foo.txt",
384            &base_dir,
385        )
386        .unwrap()
387        .unwrap();
388        trie.insert(
389            ContentKind::File,
390            "https://example.com/bar/foo/bar.txt",
391            &base_dir,
392        )
393        .unwrap()
394        .unwrap();
395        trie.insert(ContentKind::File, "https://foo.com/bar", &base_dir)
396            .unwrap()
397            .unwrap();
398        trie.insert(ContentKind::File, "foo.txt", &base_dir)
399            .unwrap()
400            .unwrap();
401
402        // The important part of the guest paths are:
403        // 1) The guest file name should be the same (or `.root` if the path is
404        //    considered to be root)
405        // 2) Paths with the same parent should have the same guest parent
406        let paths: Vec<_> = trie
407            .as_slice()
408            .iter()
409            .map(|i| {
410                (
411                    i.path().to_string(),
412                    i.guest_path().expect("should have guest path").as_str(),
413                )
414            })
415            .collect();
416
417        assert_eq!(
418            paths,
419            [
420                ("/".to_string(), "/inputs/0/.root"),
421                ("/foo/bar/foo.txt".to_string(), "/inputs/3/foo.txt"),
422                ("/foo/bar/bar.txt".to_string(), "/inputs/3/bar.txt"),
423                ("/foo/baz/foo.txt".to_string(), "/inputs/6/foo.txt"),
424                ("/foo/baz/bar.txt".to_string(), "/inputs/6/bar.txt"),
425                ("/bar/foo/foo.txt".to_string(), "/inputs/10/foo.txt"),
426                ("/bar/foo/bar.txt".to_string(), "/inputs/10/bar.txt"),
427                ("/baz".to_string(), "/inputs/1/baz"),
428                ("https://example.com/".to_string(), "/inputs/15/.root"),
429                (
430                    "https://example.com/foo/bar/foo.txt".to_string(),
431                    "/inputs/18/foo.txt"
432                ),
433                (
434                    "https://example.com/foo/bar/bar.txt".to_string(),
435                    "/inputs/18/bar.txt"
436                ),
437                (
438                    "https://example.com/foo/baz/foo.txt".to_string(),
439                    "/inputs/21/foo.txt"
440                ),
441                (
442                    "https://example.com/foo/baz/bar.txt".to_string(),
443                    "/inputs/21/bar.txt"
444                ),
445                (
446                    "https://example.com/bar/foo/foo.txt".to_string(),
447                    "/inputs/25/foo.txt"
448                ),
449                (
450                    "https://example.com/bar/foo/bar.txt".to_string(),
451                    "/inputs/25/bar.txt"
452                ),
453                ("https://foo.com/bar".to_string(), "/inputs/28/bar"),
454                ("/base/foo.txt".to_string(), "/inputs/30/foo.txt"),
455            ]
456        );
457    }
458
459    #[cfg(windows)]
460    #[test]
461    fn non_empty_trie_windows() {
462        let mut trie = InputTrie::new_with_guest_dir("/inputs/");
463        let base_dir: EvaluationPath = "C:\\base".parse().unwrap();
464        trie.insert(ContentKind::Directory, "C:\\", &base_dir)
465            .unwrap()
466            .unwrap();
467        trie.insert(ContentKind::File, "C:\\foo\\bar\\foo.txt", &base_dir)
468            .unwrap()
469            .unwrap();
470        trie.insert(ContentKind::File, "C:\\foo\\bar\\bar.txt", &base_dir)
471            .unwrap()
472            .unwrap();
473        trie.insert(ContentKind::File, "C:\\foo\\baz\\foo.txt", &base_dir)
474            .unwrap()
475            .unwrap();
476        trie.insert(ContentKind::File, "C:\\foo\\baz\\bar.txt", &base_dir)
477            .unwrap()
478            .unwrap();
479        trie.insert(ContentKind::File, "C:\\bar\\foo\\foo.txt", &base_dir)
480            .unwrap()
481            .unwrap();
482        trie.insert(ContentKind::File, "C:\\bar\\foo\\bar.txt", &base_dir)
483            .unwrap()
484            .unwrap();
485        trie.insert(ContentKind::Directory, "C:\\baz", &base_dir)
486            .unwrap()
487            .unwrap();
488        trie.insert(ContentKind::File, "https://example.com/", &base_dir)
489            .unwrap()
490            .unwrap();
491        trie.insert(
492            ContentKind::File,
493            "https://example.com/foo/bar/foo.txt",
494            &base_dir,
495        )
496        .unwrap()
497        .unwrap();
498        trie.insert(
499            ContentKind::File,
500            "https://example.com/foo/bar/bar.txt",
501            &base_dir,
502        )
503        .unwrap()
504        .unwrap();
505        trie.insert(
506            ContentKind::File,
507            "https://example.com/foo/baz/foo.txt",
508            &base_dir,
509        )
510        .unwrap()
511        .unwrap();
512        trie.insert(
513            ContentKind::File,
514            "https://example.com/foo/baz/bar.txt",
515            &base_dir,
516        )
517        .unwrap()
518        .unwrap();
519        trie.insert(
520            ContentKind::File,
521            "https://example.com/bar/foo/foo.txt",
522            &base_dir,
523        )
524        .unwrap()
525        .unwrap();
526        trie.insert(
527            ContentKind::File,
528            "https://example.com/bar/foo/bar.txt",
529            &base_dir,
530        )
531        .unwrap()
532        .unwrap();
533        trie.insert(ContentKind::File, "https://foo.com/bar", &base_dir)
534            .unwrap()
535            .unwrap();
536        trie.insert(ContentKind::File, "foo.txt", &base_dir)
537            .unwrap()
538            .unwrap();
539
540        // The important part of the guest paths are:
541        // 1) The guest file name should be the same (or `.root` if the path is
542        //    considered to be root)
543        // 2) Paths with the same parent should have the same guest parent
544        let paths: Vec<_> = trie
545            .as_slice()
546            .iter()
547            .map(|i| {
548                (
549                    i.path().to_string(),
550                    i.guest_path().expect("should have guest path").as_str(),
551                )
552            })
553            .collect();
554
555        assert_eq!(
556            paths,
557            [
558                ("C:\\".to_string(), "/inputs/1/.root"),
559                ("C:\\foo\\bar\\foo.txt".to_string(), "/inputs/4/foo.txt"),
560                ("C:\\foo\\bar\\bar.txt".to_string(), "/inputs/4/bar.txt"),
561                ("C:\\foo\\baz\\foo.txt".to_string(), "/inputs/7/foo.txt"),
562                ("C:\\foo\\baz\\bar.txt".to_string(), "/inputs/7/bar.txt"),
563                ("C:\\bar\\foo\\foo.txt".to_string(), "/inputs/11/foo.txt"),
564                ("C:\\bar\\foo\\bar.txt".to_string(), "/inputs/11/bar.txt"),
565                ("C:\\baz".to_string(), "/inputs/2/baz"),
566                ("https://example.com/".to_string(), "/inputs/16/.root"),
567                (
568                    "https://example.com/foo/bar/foo.txt".to_string(),
569                    "/inputs/19/foo.txt"
570                ),
571                (
572                    "https://example.com/foo/bar/bar.txt".to_string(),
573                    "/inputs/19/bar.txt"
574                ),
575                (
576                    "https://example.com/foo/baz/foo.txt".to_string(),
577                    "/inputs/22/foo.txt"
578                ),
579                (
580                    "https://example.com/foo/baz/bar.txt".to_string(),
581                    "/inputs/22/bar.txt"
582                ),
583                (
584                    "https://example.com/bar/foo/foo.txt".to_string(),
585                    "/inputs/26/foo.txt"
586                ),
587                (
588                    "https://example.com/bar/foo/bar.txt".to_string(),
589                    "/inputs/26/bar.txt"
590                ),
591                ("https://foo.com/bar".to_string(), "/inputs/29/bar"),
592                ("C:\\base\\foo.txt".to_string(), "/inputs/31/foo.txt"),
593            ]
594        );
595    }
596}