libdd_trace_normalization/
normalize_utils.rs

1// Copyright 2024-Present Datadog, Inc. https://www.datadoghq.com/
2// SPDX-License-Identifier: Apache-2.0
3
4use std::time::SystemTime;
5
/// MAX_TYPE_LEN the maximum size for a span type
7pub(crate) const MAX_TYPE_LEN: usize = 100;
8/// an arbitrary cutoff to spot weird-looking values
9/// nanoseconds since epoch on Jan 1, 2000
10const YEAR_2000_NANOSEC_TS: i64 = 946684800000000000;
11/// DEFAULT_SPAN_NAME is the default name we assign a span if it's missing and we have no reasonable
12/// fallback
13pub(crate) const DEFAULT_SPAN_NAME: &str = "unnamed_operation";
14/// DEFAULT_SERVICE_NAME is the default name we assign a service if it's missing and we have no
15/// reasonable fallback
16pub(crate) const DEFAULT_SERVICE_NAME: &str = "unnamed-service";
17/// MAX_NAME_LEN the maximum length a name can have
18pub(crate) const MAX_NAME_LEN: usize = 100;
19/// MAX_SERVICE_LEN the maximum length a service can have
20const MAX_SERVICE_LEN: usize = 100;
/// MAX_TAG_LEN the maximum length a tag can have
22const MAX_TAG_LEN: usize = 200;
23
24// normalize_service normalizes a span service
25pub fn normalize_service(svc: &mut String) {
26    truncate_utf8(svc, MAX_SERVICE_LEN);
27    normalize_tag(svc);
28    if svc.is_empty() {
29        DEFAULT_SERVICE_NAME.clone_into(svc);
30    }
31}
32
33// normalize_name normalizes a span name or an error describing why normalization failed.
34pub fn normalize_name(name: &mut String) {
35    truncate_utf8(name, MAX_NAME_LEN);
36    normalize_metric_name(name);
37    if name.is_empty() {
38        DEFAULT_SPAN_NAME.clone_into(name);
39    }
40}
41
/// Defaults an empty resource to the span name; a non-empty resource is left
/// untouched.
#[allow(clippy::ptr_arg)]
pub fn normalize_resource(resource: &mut String, name: &str) {
    if !resource.is_empty() {
        return;
    }
    *resource = name.to_owned();
}
48
/// Normalizes a span type in place by truncating it to at most
/// `MAX_TYPE_LEN` bytes without splitting a UTF-8 character.
pub fn normalize_span_type(span_type: &mut String) {
    truncate_utf8(span_type, MAX_TYPE_LEN);
}
52
53pub fn normalize_span_start_duration(start: &mut i64, duration: &mut i64) {
54    // Start & Duration as nanoseconds timestamps
55    // if s.Start is very little, less than year 2000 probably a unit issue so discard
56    if *duration < 0 {
57        *duration = 0;
58    }
59    if *duration > i64::MAX - *start {
60        *duration = 0;
61    }
62
63    if *start < YEAR_2000_NANOSEC_TS {
64        let now = SystemTime::UNIX_EPOCH.elapsed().map_or_else(
65            |e| -(e.duration().as_nanos() as i64),
66            |t| t.as_nanos() as i64,
67        );
68        *start = now - *duration;
69        if *start < 0 {
70            *start = now;
71        }
72    }
73}
74
/// Zeroes the parent id when it equals both the trace id and the span id.
///
/// Clients may set ParentID == TraceID == SpanID for the root span, which is
/// compliant with the Zipkin implementation. As described in
/// https://github.com/openzipkin/zipkin/pull/851 the constraint that the root
/// span's `trace id == span id` has been removed, so such a parent id carries
/// no information and is reset to 0.
pub fn normalize_parent_id(parent_id: &mut u64, trace_id: u64, span_id: u64) {
    let is_self_referential = trace_id == *parent_id && span_id == *parent_id;
    if is_self_referential {
        *parent_id = 0;
    }
}
85
/// Normalizes a span tag value in place so it only contains characters the
/// backend accepts:
/// - ASCII uppercase is lowercased; Unicode uppercase is lowercased only when
///   the lowercase form is no wider in bytes.
/// - Letters and `:` are kept anywhere; digits, `.`, `/`, `-` are kept except
///   in leading position; every other run of characters collapses into a
///   single `_`.
/// - Output is capped at `MAX_TAG_LEN` codepoints / `2 * MAX_TAG_LEN` bytes,
///   and a trailing `_` is dropped.
pub fn normalize_tag(tag: &mut String) {
    // Since we know that we're only going to write valid utf8 we can work with the Vec directly.
    // SAFETY: every write below is either a single ASCII byte or a full
    // codepoint re-encoded via `encode_utf8`, and the final truncate lands on
    // a codepoint boundary, so the buffer stays valid UTF-8.
    let bytes = unsafe { tag.as_mut_vec() };
    if bytes.is_empty() {
        return;
    }
    let mut read_cursor = 0;
    let mut write_cursor = 0;
    // Starting "inside an illegal span" makes leading illegal characters and
    // leading underscores get skipped rather than emitting a leading `_`.
    let mut is_in_illegal_span = true;
    let mut codepoints_written = 0;

    loop {
        // Stop at end of input, or once the output hits the byte or codepoint cap.
        if read_cursor >= bytes.len()
            || write_cursor >= 2 * MAX_TAG_LEN
            || codepoints_written >= MAX_TAG_LEN
        {
            break;
        }

        let b = bytes[read_cursor];
        // ascii fast-path
        match b {
            b'a'..=b'z' | b':' => {
                bytes[write_cursor] = b;
                is_in_illegal_span = false;
                write_cursor += 1;
                codepoints_written += 1;
                read_cursor += 1;
                continue;
            }
            b'A'..=b'Z' => {
                // Lowercase ASCII uppercase in place.
                bytes[write_cursor] = b - b'A' + b'a';
                is_in_illegal_span = false;
                write_cursor += 1;
                codepoints_written += 1;
                read_cursor += 1;
                continue;
            }
            b'0'..=b'9' | b'.' | b'/' | b'-' => {
                // Digits and this punctuation are legal anywhere except as
                // the first output character.
                if write_cursor != 0 {
                    bytes[write_cursor] = b;
                    is_in_illegal_span = false;
                    write_cursor += 1;
                    codepoints_written += 1;
                }
                read_cursor += 1;
                continue;
            }
            b'_' if !is_in_illegal_span => {
                // Keep one underscore and enter an "illegal span" so any
                // directly following underscores collapse into this one.
                if write_cursor != 0 {
                    bytes[write_cursor] = b;
                    is_in_illegal_span = true;
                    write_cursor += 1;
                    codepoints_written += 1;
                }
                read_cursor += 1;
                continue;
            }
            // ASCII range: first illegal character of a run becomes `_`.
            0x00..=0x7F if !is_in_illegal_span => {
                bytes[write_cursor] = b'_';
                is_in_illegal_span = true;
                write_cursor += 1;
                codepoints_written += 1;
                read_cursor += 1;
                continue;
            }
            // Subsequent illegal ASCII characters of the run are dropped.
            0x00..=0x7F if is_in_illegal_span => {
                read_cursor += 1;
                continue;
            }
            // Non-ASCII: fall through to the slow Unicode path below.
            _ => {}
        }

        // Grab current unicode codepoint
        let mut c = {
            let mut it = bytes[read_cursor..].iter();
            // This won't panic because we know bytes is a valid utf8 array, and next_code_point
            // returns an actual utf8 codepoint
            #[allow(clippy::unwrap_used)]
            std::char::from_u32(crate::utf8_helpers::next_code_point(&mut it).unwrap()).unwrap()
        };
        let mut len_utf8 = c.len_utf8();
        read_cursor += len_utf8;

        if c.is_lowercase() {
            c.encode_utf8(&mut bytes[write_cursor..write_cursor + len_utf8]);
            is_in_illegal_span = false;
            write_cursor += len_utf8;
            codepoints_written += 1;
            continue;
        }
        if c.is_uppercase() {
            // Take only first codepoint of the lowercase conversion.
            // Lowercase the current character only if its lowercase form is
            // no wider than the original, so the in-place write cannot
            // clobber bytes that have not been read yet.
            if let Some(lower) = c.to_lowercase().next() {
                if lower.len_utf8() <= len_utf8 {
                    c = lower;
                    len_utf8 = c.len_utf8();
                }
            }
        }

        // The method in the agent checks if the character is of a Letter unicode class,
        // which is not exactly the same. Alphabetics also contains Nl and Other_Alphabetic
        // unicode character classes https://www.unicode.org/reports/tr44/#Alphabetic , but
        // close enough
        if c.is_alphabetic() {
            c.encode_utf8(&mut bytes[write_cursor..write_cursor + len_utf8]);
            is_in_illegal_span = false;
            write_cursor += len_utf8;
            codepoints_written += 1;
        } else if c.is_numeric() {
            // Like ASCII digits: legal anywhere except in leading position.
            if write_cursor != 0 {
                c.encode_utf8(&mut bytes[write_cursor..write_cursor + len_utf8]);
                is_in_illegal_span = false;
                write_cursor += len_utf8;
                codepoints_written += 1;
            }
        } else if !is_in_illegal_span {
            bytes[write_cursor] = b'_';
            is_in_illegal_span = true;
            write_cursor += 1;
            codepoints_written += 1;
        }
    }
    // If we end up in an illegal span, remove the last written _
    // (the last write in an illegal span is always a single `_` byte, so
    // dropping one byte cannot split a codepoint).
    if is_in_illegal_span && write_cursor > 0 {
        write_cursor -= 1;
    }
    bytes.truncate(write_cursor);
}
218
/// Normalizes a metric/operation name in place:
/// keeps ASCII alphanumerics, `_` and `.`; replaces any other character with
/// `_`; collapses separator runs (a `.` overrides a preceding `_`); strips
/// everything before the first ASCII letter; and drops a trailing `_`.
/// An input with no ASCII letter normalizes to the empty string.
fn normalize_metric_name(name: &mut String) {
    // Since we know that we're only going to write valid utf8 we can work with the Vec directly.
    // SAFETY: every position below the final write cursor has been written
    // with an ASCII byte, so truncating there keeps the buffer valid UTF-8.
    let bytes = unsafe { name.as_mut_vec() };
    if bytes.is_empty() {
        return;
    }

    // Find first alpha character, if none is found the metric name is empty
    let Some((mut read_cursor, _)) = bytes
        .iter()
        .enumerate()
        .find(|(_, c)| c.is_ascii_alphabetic())
    else {
        *name = String::new();
        return;
    };
    let mut write_cursor = 0;
    let mut last_written_char = 0;
    while read_cursor < bytes.len() {
        match (bytes[read_cursor], last_written_char) {
            (b @ (b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9'), _) => {
                bytes[write_cursor] = b;
                last_written_char = b;
            }
            // If we've written a _ last, replace it with a .
            (b'.', b'_') => {
                // This is safe because the first character is alpha so
                // we don't go back to the beginning
                write_cursor -= 1;
                bytes[write_cursor] = b'.';
                last_written_char = b'.'
            }
            // If we've written a _ or a . last, skip this character entirely.
            // Bug fix: the previous version fell through to the unconditional
            // `write_cursor += 1` below, keeping whatever stale byte sat at
            // that position — e.g. "a._b" came out as "a._b" and "a.&b" as
            // "a.&b" instead of "a.b".
            (_, b'_' | b'.') => {
                read_cursor += 1;
                continue;
            }
            (b @ (b'_' | b'.'), _) => {
                bytes[write_cursor] = b;
                last_written_char = b;
            }
            // Otherwise write _ instead of any non conforming char
            (_, _) => {
                bytes[write_cursor] = b'_';
                last_written_char = b'_';
            }
        }
        write_cursor += 1;
        read_cursor += 1;
    }
    // Drop a single trailing underscore.
    if last_written_char == b'_' {
        write_cursor -= 1;
    }
    bytes.truncate(write_cursor);
}
274
// truncate_utf8 truncates the given string to make sure it uses less than limit bytes.
// If the last character is a utf8 character that would be split, it removes it
// entirely to make sure the resulting string is not broken.
pub(crate) fn truncate_utf8(s: &mut String, limit: usize) {
    if limit >= s.len() {
        // Already within the limit; nothing to cut.
        return;
    }
    // Walk backwards from `limit` to the nearest char boundary so we never
    // split a multi-byte UTF-8 sequence (byte 0 is always a boundary).
    let mut cut = limit;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    s.truncate(cut);
}
282
#[cfg(test)]
mod tests {

    use super::*;
    // `duplicate_item` expands each row of the tables below into its own
    // `#[test]` function, substituting `test_name`, `input` and `expected`.
    use duplicate::duplicate_item;

    // normalize_name: empty/invalid names fall back to "unnamed_operation",
    // long names are truncated to MAX_NAME_LEN before normalization, and
    // non-conforming characters become underscores (case is preserved).
    #[duplicate_item(
        test_name                       input                               expected;
        [test_normalize_empty_string]   [""]                                ["unnamed_operation"];
        [test_normalize_valid_string]   ["good"]                            ["good"];
        [test_normalize_long_string]    ["Too-Long-.".repeat(20).as_str()]  ["Too_Long.".repeat(10)];
        [test_normalize_dash_string]    ["bad-name"]                        ["bad_name"];
        [test_normalize_invalid_string] ["&***"]                            ["unnamed_operation"];
        [test_normalize_invalid_prefix] ["&&&&&&&_test-name-"]              ["test_name"];
    )]
    #[test]
    fn test_name() {
        let mut val = input.to_owned();
        normalize_name(&mut val);
        assert_eq!(val, expected);
    }

    // normalize_service: empty services fall back to "unnamed-service";
    // values go through tag normalization, so they are also lowercased.
    #[duplicate_item(
        test_name                       input                               expected;
        [test_normalize_empty_service]  [""]                                ["unnamed-service"];
        [test_normalize_valid_service]  ["good"]                            ["good"];
        [test_normalize_long_service]   ["Too$Long$.".repeat(20).as_str()]  ["too_long_.".repeat(10)];
        [test_normalize_dash_service]   ["bad&service"]                     ["bad_service"];
    )]
    #[test]
    fn test_name() {
        let mut val = input.to_owned();
        normalize_service(&mut val);
        assert_eq!(val, expected);
    }

    // normalize_tag: lowercasing, collapsing of illegal-character runs into a
    // single underscore, stripping of illegal leading/trailing characters,
    // Unicode handling, and the MAX_TAG_LEN codepoint / 2*MAX_TAG_LEN byte caps.
    #[duplicate_item(
        test_name               input   expected;
        [test_normalize_tag_1]  ["#test_starting_hash"] ["test_starting_hash"];
        [test_normalize_tag_2]  ["TestCAPSandSuch"] ["testcapsandsuch"];
        [test_normalize_tag_3]  ["Test Conversion Of Weird !@#$%^&**() Characters"] ["test_conversion_of_weird_characters"];
        [test_normalize_tag_4]  ["$#weird_starting"] ["weird_starting"];
        [test_normalize_tag_5]  ["allowed:c0l0ns"] ["allowed:c0l0ns"];
        [test_normalize_tag_6]  ["1love"] ["love"];
        [test_normalize_tag_7]  ["ünicöde"] ["ünicöde"];
        [test_normalize_tag_8]  ["ünicöde:metäl"] ["ünicöde:metäl"];
        [test_normalize_tag_9]  ["Data🐨dog🐶 繋がっ⛰てて"] ["data_dog_繋がっ_てて"];
        [test_normalize_tag_10] [" spaces   "] ["spaces"];
        [test_normalize_tag_11] [" #hashtag!@#spaces #__<>#  "] ["hashtag_spaces"];
        [test_normalize_tag_12] [":testing"] [":testing"];
        [test_normalize_tag_13] ["_foo"] ["foo"];
        [test_normalize_tag_14] [":::test"] [":::test"];
        [test_normalize_tag_15] ["contiguous_____underscores"] ["contiguous_underscores"];
        [test_normalize_tag_16] ["foo_"] ["foo"];
        [test_normalize_tag_17] ["\u{017F}odd_\u{017F}case\u{017F}"] ["\u{017F}odd_\u{017F}case\u{017F}"] ; // edge-case
        [test_normalize_tag_18] [""] [""];
        [test_normalize_tag_19] [" "] [""];
        [test_normalize_tag_20] ["ok"] ["ok"];
        [test_normalize_tag_21] ["™Ö™Ö™™Ö™"] ["ö_ö_ö"];
        [test_normalize_tag_22] ["AlsO:ök"] ["also:ök"];
        [test_normalize_tag_23] [":still_ok"] [":still_ok"];
        [test_normalize_tag_24] ["___trim"] ["trim"];
        [test_normalize_tag_25] ["12.:trim@"] [":trim"];
        [test_normalize_tag_26] ["12.:trim@@"] [":trim"];
        [test_normalize_tag_27] ["fun:ky__tag/1"] ["fun:ky_tag/1"];
        [test_normalize_tag_28] ["fun:ky@tag/2"] ["fun:ky_tag/2"];
        [test_normalize_tag_29] ["fun:ky@@@tag/3"] ["fun:ky_tag/3"];
        [test_normalize_tag_30] ["tag:1/2.3"] ["tag:1/2.3"];
        [test_normalize_tag_31] ["---fun:k####y_ta@#g/1_@@#"]["fun:k_y_ta_g/1"];
        [test_normalize_tag_32] ["AlsO:œ#@ö))œk"] ["also:œ_ö_œk"];
        [test_normalize_tag_33] ["a".repeat(888).as_str()] ["a".repeat(200)];
        [test_normalize_tag_34] [("a".to_owned() + &"🐶".repeat(799)).as_str()] ["a"];
        [test_normalize_tag_35] [("a".to_string() + &char::REPLACEMENT_CHARACTER.to_string()).as_str()] ["a"];
        [test_normalize_tag_36] [("a".to_string() + &char::REPLACEMENT_CHARACTER.to_string() + &char::REPLACEMENT_CHARACTER.to_string()).as_str()] ["a"];
        [test_normalize_tag_37] [("a".to_string() + &char::REPLACEMENT_CHARACTER.to_string() + &char::REPLACEMENT_CHARACTER.to_string() + "b").as_str()] ["a_b"];
        [test_normalize_tag_38]
            ["A00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000"]
            ["a00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000_0"]
           ;
    )]
    #[test]
    fn test_name() {
        let mut v = input.to_owned();
        normalize_tag(&mut v);
        assert_eq!(v, expected)
    }
}
369}