Skip to main content

libdd_trace_obfuscation/
ip_address.rs

// Copyright 2024-Present Datadog, Inc. https://www.datadoghq.com/
// SPDX-License-Identifier: Apache-2.0
3
4use libdd_common::regex_engine::Regex;
5use std::{borrow::Cow, collections::HashSet, net::Ipv6Addr, sync::LazyLock};
6
/// IP addresses that are never replaced by the quantization.
///
/// `quantize_ip` looks the parsed address up in this list with an exact string comparison, so
/// only these exact spellings are exempt.
const ALLOWED_IP_ADDRESSES: [&str; 5] = [
    // localhost
    "127.0.0.1",
    "::1",
    // link-local cloud provider metadata server addresses
    "169.254.169.254",
    "fd00:ec2::254",
    // ECS task metadata
    "169.254.170.2",
];
17
18const PREFIX_REGEX_LITERAL: &str = r"^((?:dnspoll|ftp|file|http|https):/{2,3})";
19static PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| {
20    #[allow(clippy::unwrap_used)]
21    Regex::new(PREFIX_REGEX_LITERAL).unwrap()
22});
23
24/// Quantizes a comma separated list of hosts.
25///
26/// Each entry which is an IP address is replaced using quantizeIP. Duplicate entries
27/// post-quantization or collapsed into a single unique value. Entries which are not IP addresses
28/// are left unchanged. Comma-separated host lists are common for peer tags like
29/// peer.cassandra.contact.points, peer.couchbase.seed.nodes, peer.kafka.bootstrap.servers
30///
31/// The quantized value is return as a `Cow` containing the input slice `s` if no modification was
32/// done or a new String if the value has been modified.
33///
34/// This quantization is used to reduce cardinality on peer tags in trace metrics. As such it is
35/// not exhaustive and some ip format may not be obfuscated.
36/// The reference implementation lives in [dd-go](https://github.com/DataDog/dd-go/blob/393e6de733807b20597d80b1e5103d6e823d8a0c/trace/pkg/peertags/peer_tags.go#L56)
37#[must_use]
38pub fn quantize_peer_ip_addresses<'a>(s: &'a str) -> Cow<'a, str> {
39    let values = s.split(',');
40    let mut should_return_new_string = false; // Set to true if the function should return a modified
41                                              // version of the string
42
43    let quantized_values = values
44        .map(|v| {
45            if let Some(quantize_string) = quantize_ip(v) {
46                should_return_new_string = true;
47                Cow::from(quantize_string)
48            } else {
49                Cow::from(v)
50            }
51        })
52        .collect::<Vec<Cow<'a, str>>>();
53
54    // Quantized value list without duplicates
55    let mut quantized_values_dedup: Vec<&str> = Vec::new();
56    let mut quantized_values_set: HashSet<&str> = HashSet::new();
57
58    for quantized_value in &quantized_values {
59        if quantized_values_set.insert(quantized_value) {
60            quantized_values_dedup.push(quantized_value);
61        } else {
62            should_return_new_string = true;
63        }
64    }
65    if should_return_new_string {
66        Cow::from(quantized_values_dedup.join(","))
67    } else {
68        Cow::from(s)
69    }
70}
71
72/// Replace valid ip address in `s` to allow quantization.
73///
74/// The ip is replaced if it is a valid IPv4 or v6
75///
76/// # Caveats
77/// - IPv6 with zone specifier '%' are not detected
78/// - IPv6 with suffix are not detected e.g. `::1-foo`
79fn quantize_ip(s: &str) -> Option<String> {
80    let (prefix, stripped_s) = split_prefix(s);
81    if let Some((ip, suffix)) = parse_ip(stripped_s) {
82        if !ALLOWED_IP_ADDRESSES.contains(&ip) {
83            return Some(format!("{prefix}blocked-ip-address{suffix}"));
84        }
85    }
86    None
87}
88
89/// Split the ip prefix, can be either a provider specific prefix or a protocol
90fn split_prefix(s: &str) -> (&str, &str) {
91    if let Some(tail) = s.strip_prefix("ip-") {
92        ("ip-", tail)
93    } else if let Some(protocol) = PREFIX_REGEX.find(s) {
94        s.split_at(protocol.end())
95    } else {
96        ("", s)
97    }
98}
99
100/// Check if `s` starts with a valid ip. If it does return Some((ip, suffix)), else return None.
101fn parse_ip(s: &str) -> Option<(&str, &str)> {
102    for ch in s.chars() {
103        // Determine the version of the ip
104        match ch {
105            '0'..='9' => {}
106            '.' | '-' | '_' => return parse_ip_v4(s, ch),
107            ':' | 'A'..='F' | 'a'..='f' if s.parse::<Ipv6Addr>().is_ok() => {
108                return Some((s, ""));
109            }
110            '[' => {
111                // Parse IPv6 in [host]:port format
112                if let Some((host, port)) = s[1..].split_once(']') {
113                    if host.parse::<Ipv6Addr>().is_ok() {
114                        return Some((host, port));
115                    }
116                }
117                return None;
118            }
119            _ => return None,
120        }
121    }
122    None
123}
124
/// Check if `s` starts with a valid ipv4. If it does return Some((ip, suffix)), else return None.
/// We implement a custom ipv4 parsing to allow `-` and `_` as separator.
///
/// `sep` is the single separator expected between the four fields. Each field must hold at
/// least one digit, must not have a leading zero and must not exceed 255. Parsing stops at the
/// first character that is neither a digit nor `sep`, or at a `sep` following the fourth field;
/// everything from that point on is returned as the suffix.
fn parse_ip_v4(s: &str, sep: char) -> Option<(&str, &str)> {
    let mut field_value = 0;
    let mut field_digits = 0;
    let mut current_field = 0;
    // Byte offset where the ip ends; defaults to the whole string when nothing follows it.
    let mut last_index = s.len();
    for (i, ch) in s.char_indices() {
        if let Some(digit) = ch.to_digit(10) {
            // A field can't have a leading 0
            if field_digits == 1 && field_value == 0 {
                return None;
            }
            field_value = field_value * 10 + digit;
            field_digits += 1;
            if field_value > 255 {
                return None;
            }
        } else if ch == sep {
            // A valid field has at least one digit
            if field_digits == 0 {
                return None;
            }
            // If we already have 4 fields, parsing is over
            if current_field == 3 {
                last_index = i;
                break;
            }
            field_value = 0;
            field_digits = 0;
            current_field += 1;
        } else {
            // An invalid character ends parsing
            last_index = i;
            break;
        }
    }
    // Check that we found 4 fields and that the last one has at least one digit
    if field_digits > 0 && current_field == 3 {
        Some(s.split_at(last_index))
    } else {
        None
    }
}
169
#[cfg(test)]
mod tests {
    use super::*;

    /// Prefix splitting recognises `ip-` and the supported protocols, and nothing else.
    #[test]
    fn test_split_prefix() {
        assert_eq!(split_prefix("ip-1.1.1.1"), ("ip-", "1.1.1.1"));
        assert_eq!(split_prefix("https://1.1.1.1"), ("https://", "1.1.1.1"));
        assert_eq!(split_prefix("ftp:///1.1.1.1"), ("ftp:///", "1.1.1.1"));
        assert_eq!(split_prefix("1.1.1.1"), ("", "1.1.1.1"));
        assert_eq!(split_prefix("foo,bar-1.1.1.1"), ("", "foo,bar-1.1.1.1"));
    }

    /// Direct coverage of the custom IPv4 parser.
    #[test]
    fn test_parse_ip_v4() {
        assert_eq!(parse_ip_v4("10.0.0.1", '.'), Some(("10.0.0.1", "")));
        assert_eq!(parse_ip_v4("10.0.0.1:53", '.'), Some(("10.0.0.1", ":53")));
        assert_eq!(parse_ip_v4("10-0-0-1-foo", '-'), Some(("10-0-0-1", "-foo")));
        assert_eq!(parse_ip_v4("01.0.0.1", '.'), None);
        assert_eq!(parse_ip_v4("1.2.3.", '.'), None);
        assert_eq!(parse_ip_v4("1.2.3.256", '.'), None);
    }

    /// Direct coverage of the ip version dispatch.
    #[test]
    fn test_parse_ip() {
        assert_eq!(parse_ip("10.0.0.1:53"), Some(("10.0.0.1", ":53")));
        assert_eq!(parse_ip("[::1]:80"), Some(("::1", ":80")));
        assert_eq!(parse_ip("fe80::1"), Some(("fe80::1", "")));
        // A bracket is only accepted as the first character
        assert_eq!(parse_ip("1[::1]:80"), None);
        assert_eq!(parse_ip("12345"), None);
        assert_eq!(parse_ip("foo.dog"), None);
        assert_eq!(parse_ip(""), None);
    }

    #[test]
    #[allow(clippy::cognitive_complexity)]
    fn test_quantize_peer_ip_addresses() {
        // Special cases
        // - localhost
        assert_eq!(quantize_peer_ip_addresses("127.0.0.1"), "127.0.0.1");
        assert_eq!(quantize_peer_ip_addresses("::1"), "::1");
        assert_eq!(
            quantize_peer_ip_addresses("127.0.0.1:8080"),
            "127.0.0.1:8080"
        );
        // - link-local IP address, aka "metadata server" for various cloud providers
        assert_eq!(
            quantize_peer_ip_addresses("169.254.169.254"),
            "169.254.169.254"
        );
        assert_eq!(quantize_peer_ip_addresses("fd00:ec2::254"), "fd00:ec2::254");
        assert_eq!(quantize_peer_ip_addresses("169.254.170.2"), "169.254.170.2");
        // unchanged input is returned borrowed
        assert!(matches!(
            quantize_peer_ip_addresses("foo.dog"),
            std::borrow::Cow::Borrowed(_)
        ));
        assert!(matches!(
            quantize_peer_ip_addresses("foo.dog,bar.dog"),
            std::borrow::Cow::Borrowed(_)
        ));
        // duplicates are collapsed even when no entry is an ip
        assert_eq!(quantize_peer_ip_addresses("foo.dog,foo.dog"), "foo.dog");
        // blocking cases
        assert_eq!(quantize_peer_ip_addresses(""), "");
        assert_eq!(quantize_peer_ip_addresses("foo.dog"), "foo.dog");
        assert_eq!(
            quantize_peer_ip_addresses("192.168.1.1"),
            "blocked-ip-address"
        );
        assert_eq!(
            quantize_peer_ip_addresses("192.168.1.1.foo"),
            "blocked-ip-address.foo"
        );
        assert_eq!(
            quantize_peer_ip_addresses("192.168.1.1.2.3.4.5"),
            "blocked-ip-address.2.3.4.5"
        );
        assert_eq!(
            quantize_peer_ip_addresses("192_168_1_1"),
            "blocked-ip-address"
        );
        assert_eq!(
            quantize_peer_ip_addresses("192-168-1-1"),
            "blocked-ip-address"
        );
        assert_eq!(
            quantize_peer_ip_addresses("192-168-1-1.foo"),
            "blocked-ip-address.foo"
        );
        assert_eq!(
            quantize_peer_ip_addresses("192-168-1-1-foo"),
            "blocked-ip-address-foo"
        );
        assert_eq!(
            quantize_peer_ip_addresses("2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF"),
            "blocked-ip-address"
        );
        assert_eq!(
            quantize_peer_ip_addresses("2001:db8:3c4d:15::1a2f:1a2b"),
            "blocked-ip-address"
        );
        assert_eq!(
            quantize_peer_ip_addresses("[fe80::1ff:fe23:4567:890a]:8080"),
            "blocked-ip-address:8080"
        );
        assert_eq!(
            quantize_peer_ip_addresses("192.168.1.1:1234"),
            "blocked-ip-address:1234"
        );
        assert_eq!(
            quantize_peer_ip_addresses("dnspoll:///10.21.120.145:6400"),
            "dnspoll:///blocked-ip-address:6400"
        );
        assert_eq!(
            quantize_peer_ip_addresses("http://10.21.120.145:6400"),
            "http://blocked-ip-address:6400"
        );
        assert_eq!(
            quantize_peer_ip_addresses("https://10.21.120.145:6400"),
            "https://blocked-ip-address:6400"
        );
        assert_eq!(
            quantize_peer_ip_addresses(
                "192.168.1.1:1234,10.23.1.1:53,10.23.1.1,fe80::1ff:fe23:4567:890a,foo.dog"
            ),
            "blocked-ip-address:1234,blocked-ip-address:53,blocked-ip-address,foo.dog"
        );
        assert_eq!(quantize_peer_ip_addresses("http://172.24.160.151:8091,172.24.163.33:8091,172.24.164.111:8091,172.24.165.203:8091,172.24.168.235:8091,172.24.170.130:8091"), "http://blocked-ip-address:8091,blocked-ip-address:8091");
        assert_eq!(
            quantize_peer_ip_addresses("10-60-160-172.my-service.namespace.svc.abc.cluster.local"),
            "blocked-ip-address.my-service.namespace.svc.abc.cluster.local"
        );
        assert_eq!(
            quantize_peer_ip_addresses("ip-10-152-4-129.ec2.internal"),
            "ip-blocked-ip-address.ec2.internal"
        );
        assert_eq!(quantize_peer_ip_addresses("1-foo"), "1-foo");
        assert_eq!(quantize_peer_ip_addresses("1-2-foo"), "1-2-foo");
        assert_eq!(quantize_peer_ip_addresses("1-2-3-foo"), "1-2-3-foo");
        assert_eq!(quantize_peer_ip_addresses("1-2-3-999"), "1-2-3-999");
        assert_eq!(quantize_peer_ip_addresses("1-2-999-foo"), "1-2-999-foo");
        assert_eq!(quantize_peer_ip_addresses("1-2-3-999-foo"), "1-2-3-999-foo");
        assert_eq!(
            quantize_peer_ip_addresses("1-2-3-4-foo"),
            "blocked-ip-address-foo"
        );
        assert_eq!(
            quantize_peer_ip_addresses("7-55-2-app.agent.datadoghq.com"),
            "7-55-2-app.agent.datadoghq.com"
        );
    }
}