domain_lookup_tree/lib.rs
1use std::collections::HashMap;
2
3/// DomainLookupTree is a data structure which provides efficient domain name lookup matching with
4/// support for wildcard entries.
5///
6/// Requirements for this implementation:
7/// - Given a domain name, determine if it matches an entry in the tree
8/// - There can be an ever-growing amount of tree entries
9/// - Entries can be absolute matches, e.g.: www.google.com
10/// - Entries may be wildcard entries, which is denoted in the entry by providing a leading dot,
11/// e.g.: .twitter.com, .en.wikipedia.org, .giggl.app
12/// - Wilcard entries can not be embedded
13///
14/// To achieve this, we implement a simple tree-style structure which has a root structure that
15/// contains a vector of nodes. These nodes can then contain other node decendants, and also be
16/// marked as "wildcard" which means theres a rule that matches that domain level and all of its
17/// decendants.
18///
19/// If, when performing a lookup, the search domain contains segments deeper than the wildcard
20/// match, it can continue to traverse the tree until it exhausts its lookup options. At that
21/// point, the deepest wildcard entry found would be returned, if no absolute match was found.
22///
23/// It's good to keep in mind that, when traversing the tree, domain names are sorted by top level
24/// to infinite n-level, or in simpler terms, in reverse. This means that if "google.com" is looked
25/// up in the tree, it would split by ".", reverse the vector, then first perform a root node
26/// lookup for "com", and so on.
27///
28/// Walking down the tree - the story of a lookup:
29/// Let's say have a DomainLookupTree with an entry ".giggl.app" which means that the tree looks
30/// like this:
31///
32/// app
33/// └── giggl [wildcard]
34///
35/// A domain lookup for "canary.giggl.app" is requested. First, "app" is matched, but it's not a
36/// wildcard, so it's ignored. We now check the decendants of "app" for "giggl" - it matches, and
37/// it's a wildcard match, so we store it within the context of the lookup. This lookup will now
38/// 100% return a match, even if it isn't absolute. Anyway, we now check the decendants of "giggl"
39/// for "canary", though it doesn't exist, and the traversal ends. Now, we didn't have an absolute
40/// match, but we did have a wildcard match earlier on for ".giggl.app", so we successfully return
41/// the result ".giggl.app" from the lookup function.
42///
43///
44
45type NodeList = HashMap<String, Node>;
46
47#[derive(Debug)]
48pub struct DomainLookupTree {
49 nodes: NodeList,
50 minimum_level: usize,
51}
52
53#[derive(Debug)]
54pub struct Node {
55 wildcard: bool,
56 nodes: NodeList,
57 data: String,
58}
59
60impl Node {
61 fn new(wildcard: bool, data: &str) -> Self {
62 Self {
63 wildcard,
64 nodes: Default::default(),
65 data: data.to_owned(),
66 }
67 }
68}
69
70impl DomainLookupTree {
71 pub fn new() -> DomainLookupTree {
72 DomainLookupTree {
73 nodes: Default::default(),
74 minimum_level: 0,
75 }
76 }
77
78 pub fn insert(&mut self, domain: &str) {
79 let is_wildcard = domain.starts_with(".");
80 let segments = domain_to_rseg(domain);
81 let n_segments = segments.len();
82
83 let mut head = &mut self.nodes;
84 for (i, segment) in segments.iter().copied().enumerate() {
85 let node = head
86 .entry(segment.to_owned())
87 .or_insert_with(|| Node::new(i == n_segments - 2 && is_wildcard, segment));
88
89 if i == n_segments - 2 && is_wildcard {
90 return;
91 }
92
93 head = &mut node.nodes;
94 }
95 }
96
97 pub fn lookup(&self, domain: &str) -> Option<String> {
98 match self.traverse(domain) {
99 None => None,
100 Some((fqdn, node)) => {
101 return Some(format!("{}{}", if node.wildcard { "." } else { "" }, fqdn))
102 }
103 }
104 }
105
106 pub fn traverse(&self, domain: &str) -> Option<(String, &Node)> {
107 let segments = domain_to_rseg(domain);
108 let mut wildcard_match = None;
109 // We start the traversal at the root
110 let mut head: &NodeList = &self.nodes;
111
112 let mut fqdn = String::new();
113
114 // We traverse the tree in level-reverse order
115 for (i, segment) in segments.iter().copied().enumerate() {
116 // Now we look up the children of the latest matched node
117 // If this is the first iteration, then it's the root NodeList
118 if let Some(child) = head.get(segment) {
119 fqdn = format!(
120 "{}{}{}",
121 segment.to_owned(),
122 if i == 0 { "" } else { "." },
123 fqdn
124 );
125 head = &child.nodes;
126 // We have exhausted the traversal. If the traversal depth is equal to the segment
127 // length, then we've found an absolute match!
128 if i == segments.len() - 1 {
129 return Some((fqdn, child));
130 } else if child.wildcard {
131 // Current node is wildcard, so we now 100% have a value to return
132 wildcard_match = Some(child);
133 }
134 } else {
135 // We have exhausted the traversal.
136 break;
137 }
138 }
139
140 if let Some(m) = wildcard_match {
141 Some((fqdn, m))
142 } else {
143 None
144 }
145 }
146}
147
148fn domain_to_rseg(domain: &str) -> Vec<&str> {
149 domain.rsplit(".").collect::<Vec<&str>>()
150}