Skip to main content

zerobox_utils_string/
lib.rs

1mod truncate;
2
3pub use truncate::approx_bytes_for_tokens;
4pub use truncate::approx_token_count;
5pub use truncate::approx_tokens_from_byte_count;
6pub use truncate::truncate_middle_chars;
7pub use truncate::truncate_middle_with_token_budget;
8
/// Returns the longest prefix of `s` that fits in `maxb` bytes without
/// splitting a UTF-8 character.
///
/// When `s` already fits in the budget it is returned unchanged. Otherwise the
/// cut point is moved back from `maxb` to the nearest char boundary, so the
/// result may be shorter than `maxb` bytes (and is empty when the first
/// character alone exceeds the budget).
#[inline]
pub fn take_bytes_at_char_boundary(s: &str, maxb: usize) -> &str {
    if s.len() <= maxb {
        return s;
    }
    // Here `maxb < s.len()`, and index 0 is always a char boundary, so the
    // loop terminates; a UTF-8 char is at most 4 bytes, so it takes at most
    // three steps instead of scanning the whole prefix.
    let mut end = maxb;
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}
25
/// Sanitize a tag value to comply with metric tag validation rules:
/// only ASCII alphanumeric, '.', '_', '-', and '/' are allowed.
///
/// Every other character is replaced with '_', leading and trailing
/// underscores are trimmed, and the result is capped at 256 bytes. If what
/// remains contains no ASCII alphanumeric character, `"unspecified"` is
/// returned instead.
pub fn sanitize_metric_tag_value(value: &str) -> String {
    const MAX_LEN: usize = 256;
    let sanitized: String = value
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '.' | '_' | '-' | '/') {
                ch
            } else {
                '_'
            }
        })
        .collect();
    let trimmed = sanitized.trim_matches('_');
    // `sanitized` is pure ASCII, so a byte offset can never split a character.
    let capped = &trimmed[..trimmed.len().min(MAX_LEN)];
    // Validate after truncation: a long run of punctuation followed by the
    // only alphanumeric characters would otherwise be cut down to a value
    // with no alphanumeric content at all.
    if !capped.chars().any(|ch| ch.is_ascii_alphanumeric()) {
        return "unspecified".to_string();
    }
    capped.to_string()
}
50
51/// Find all UUIDs in a string.
52#[allow(clippy::unwrap_used)]
53pub fn find_uuids(s: &str) -> Vec<String> {
54    static RE: std::sync::OnceLock<regex_lite::Regex> = std::sync::OnceLock::new();
55    let re = RE.get_or_init(|| {
56        regex_lite::Regex::new(
57            r"[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{12}",
58        )
59        .unwrap() // Unwrap is safe thanks to the tests.
60    });
61
62    re.find_iter(s).map(|m| m.as_str().to_string()).collect()
63}
64
65/// Convert a markdown-style `#L..` location suffix into a terminal-friendly
66/// `:line[:column][-line[:column]]` suffix.
67pub fn normalize_markdown_hash_location_suffix(suffix: &str) -> Option<String> {
68    let fragment = suffix.strip_prefix('#')?;
69    let (start, end) = match fragment.split_once('-') {
70        Some((start, end)) => (start, Some(end)),
71        None => (fragment, None),
72    };
73    let (start_line, start_column) = parse_markdown_hash_location_point(start)?;
74    let mut normalized = String::from(":");
75    normalized.push_str(start_line);
76    if let Some(column) = start_column {
77        normalized.push(':');
78        normalized.push_str(column);
79    }
80    if let Some(end) = end {
81        let (end_line, end_column) = parse_markdown_hash_location_point(end)?;
82        normalized.push('-');
83        normalized.push_str(end_line);
84        if let Some(column) = end_column {
85            normalized.push(':');
86            normalized.push_str(column);
87        }
88    }
89    Some(normalized)
90}
91
/// Parse one `L<line>[C<column>]` point of a markdown hash location.
///
/// Returns the line and the optional column as slices of `point`. Returns
/// `None` when the `L` prefix is missing, or when the line or a present
/// column is not a non-empty run of ASCII digits.
fn parse_markdown_hash_location_point(point: &str) -> Option<(&str, Option<&str>)> {
    /// True when `text` is a non-empty run of ASCII digits.
    fn is_number(text: &str) -> bool {
        !text.is_empty() && text.bytes().all(|b| b.is_ascii_digit())
    }

    let point = point.strip_prefix('L')?;
    let (line, column) = match point.split_once('C') {
        Some((line, column)) => (line, Some(column)),
        None => (point, None),
    };
    // Reject malformed points such as `L`, `Labc` or `L7C` instead of
    // passing them through as an empty or non-numeric line/column.
    if !is_number(line) || column.is_some_and(|c| !is_number(c)) {
        return None;
    }
    Some((line, column))
}
99
/// Unit tests for the string helpers defined in this file.
#[cfg(test)]
#[allow(warnings, clippy::all)]
mod tests {
    use super::find_uuids;
    use super::normalize_markdown_hash_location_suffix;
    use super::sanitize_metric_tag_value;
    use super::take_bytes_at_char_boundary;
    use pretty_assertions::assert_eq;

    #[test]
    fn take_bytes_at_char_boundary_returns_input_within_budget() {
        assert_eq!(take_bytes_at_char_boundary("abc", 3), "abc");
        assert_eq!(take_bytes_at_char_boundary("abc", 10), "abc");
        assert_eq!(take_bytes_at_char_boundary("", 0), "");
    }

    #[test]
    fn take_bytes_at_char_boundary_never_splits_multibyte_chars() {
        // 'é' occupies bytes 1..3, so a budget of 2 must stop before it.
        assert_eq!(take_bytes_at_char_boundary("héllo", 2), "h");
        assert_eq!(take_bytes_at_char_boundary("héllo", 3), "hé");
        // The emoji is 4 bytes; a budget of 3 cannot hold any of it.
        assert_eq!(take_bytes_at_char_boundary("🙂x", 3), "");
    }

    #[test]
    fn find_uuids_finds_multiple() {
        let input =
            "x 00112233-4455-6677-8899-aabbccddeeff-k y 12345678-90ab-cdef-0123-456789abcdef";
        assert_eq!(
            find_uuids(input),
            vec![
                "00112233-4455-6677-8899-aabbccddeeff".to_string(),
                "12345678-90ab-cdef-0123-456789abcdef".to_string(),
            ]
        );
    }

    #[test]
    fn find_uuids_ignores_invalid() {
        let input = "not-a-uuid-1234-5678-9abc-def0-123456789abc";
        assert_eq!(find_uuids(input), Vec::<String>::new());
    }

    #[test]
    fn find_uuids_handles_non_ascii_without_overlap() {
        let input = "🙂 55e5d6f7-8a7f-4d2a-8d88-123456789012abc";
        assert_eq!(
            find_uuids(input),
            vec!["55e5d6f7-8a7f-4d2a-8d88-123456789012".to_string()]
        );
    }

    #[test]
    fn sanitize_metric_tag_value_trims_and_fills_unspecified() {
        let msg = "///";
        assert_eq!(sanitize_metric_tag_value(msg), "unspecified");
    }

    #[test]
    fn sanitize_metric_tag_value_replaces_invalid_chars() {
        let msg = "bad value!";
        assert_eq!(sanitize_metric_tag_value(msg), "bad_value");
    }

    #[test]
    fn sanitize_metric_tag_value_truncates_long_values() {
        let msg = "a".repeat(300);
        assert_eq!(sanitize_metric_tag_value(&msg), "a".repeat(256));
    }

    #[test]
    fn normalize_markdown_hash_location_suffix_converts_single_location() {
        assert_eq!(
            normalize_markdown_hash_location_suffix("#L74C3"),
            Some(":74:3".to_string())
        );
    }

    #[test]
    fn normalize_markdown_hash_location_suffix_converts_line_only_location() {
        assert_eq!(
            normalize_markdown_hash_location_suffix("#L74"),
            Some(":74".to_string())
        );
    }

    #[test]
    fn normalize_markdown_hash_location_suffix_converts_ranges() {
        assert_eq!(
            normalize_markdown_hash_location_suffix("#L74C3-L76C9"),
            Some(":74:3-76:9".to_string())
        );
    }

    #[test]
    fn normalize_markdown_hash_location_suffix_rejects_missing_prefixes() {
        // No leading '#'.
        assert_eq!(normalize_markdown_hash_location_suffix("L74C3"), None);
        // No 'L' marker on the point.
        assert_eq!(normalize_markdown_hash_location_suffix("#74"), None);
    }
}