Skip to main content

zerobox_utils_string/
lib.rs

1mod truncate;
2
3pub use truncate::approx_bytes_for_tokens;
4pub use truncate::approx_token_count;
5pub use truncate::approx_tokens_from_byte_count;
6pub use truncate::truncate_middle_chars;
7pub use truncate::truncate_middle_with_token_budget;
8
/// Returns the longest prefix of `s` that occupies at most `maxb` bytes and
/// ends on a UTF-8 character boundary.
///
/// If `s` already fits in the budget it is returned unchanged. If the budget
/// is smaller than the first character, the result is the empty string.
#[inline]
pub fn take_bytes_at_char_boundary(s: &str, maxb: usize) -> &str {
    if maxb >= s.len() {
        return s;
    }
    // `maxb` is strictly inside the string here. Step back until the cut no
    // longer splits a multi-byte character; index 0 is always a boundary, so
    // the walk cannot underflow.
    let mut cut = maxb;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    &s[..cut]
}
25
/// Returns the longest suffix of `s` that occupies at most `maxb` bytes and
/// starts on a UTF-8 character boundary.
///
/// If `s` already fits in the budget it is returned unchanged. If the budget
/// is smaller than the last character, the result is the empty string.
#[inline]
pub fn take_last_bytes_at_char_boundary(s: &str, maxb: usize) -> &str {
    if maxb >= s.len() {
        return s;
    }
    // Earliest byte offset the budget allows. If it lands inside a multi-byte
    // character, move forward to the next boundary; `s.len()` is always a
    // boundary, so the walk terminates.
    let mut from = s.len() - maxb;
    while !s.is_char_boundary(from) {
        from += 1;
    }
    &s[from..]
}
47
/// Sanitize a tag value to comply with metric tag validation rules:
/// only ASCII alphanumeric, '.', '_', '-', and '/' are allowed.
///
/// Every disallowed character is replaced with `'_'`, leading and trailing
/// underscores are stripped, and the result is capped at 256 bytes. After the
/// cap is applied the value is re-trimmed and re-checked, so a truncated
/// result never ends in `'_'` and always contains at least one alphanumeric
/// character.
///
/// Returns `"unspecified"` when no alphanumeric character survives.
pub fn sanitize_metric_tag_value(value: &str) -> String {
    const MAX_LEN: usize = 256;
    const FALLBACK: &str = "unspecified";

    let sanitized: String = value
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '.' | '_' | '-' | '/') {
                ch
            } else {
                '_'
            }
        })
        .collect();

    let trimmed = sanitized.trim_matches('_');
    // `sanitized` contains only ASCII at this point, so every byte index is a
    // char boundary and slicing at MAX_LEN cannot panic.
    let capped = &trimmed[..trimmed.len().min(MAX_LEN)];
    // Cutting can expose underscores that used to be interior; strip them so
    // the truncated value obeys the same rule as an untruncated one.
    let capped = capped.trim_end_matches('_');

    // Checked after truncation: the only alphanumerics may have been beyond
    // the cap. This also covers the empty case.
    if !capped.bytes().any(|b| b.is_ascii_alphanumeric()) {
        return FALLBACK.to_string();
    }
    capped.to_string()
}
72
73/// Find all UUIDs in a string.
74#[allow(clippy::unwrap_used)]
75pub fn find_uuids(s: &str) -> Vec<String> {
76    static RE: std::sync::OnceLock<regex_lite::Regex> = std::sync::OnceLock::new();
77    let re = RE.get_or_init(|| {
78        regex_lite::Regex::new(
79            r"[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{12}",
80        )
81        .unwrap() // Unwrap is safe thanks to the tests.
82    });
83
84    re.find_iter(s).map(|m| m.as_str().to_string()).collect()
85}
86
/// Convert a markdown-style `#L..` location suffix into a terminal-friendly
/// `:line[:column][-line[:column]]` suffix.
///
/// Accepted inputs are `#L<line>`, `#L<line>C<column>`, and ranges that join
/// two such points with `-` (the end point also carries its own `L`), for
/// example `#L74C3-L76C9` becomes `:74:3-76:9`.
///
/// Returns `None` when `suffix` does not start with `#`, when a point does not
/// start with `L`, or when a line or column is not a non-empty run of ASCII
/// digits.
pub fn normalize_markdown_hash_location_suffix(suffix: &str) -> Option<String> {
    let fragment = suffix.strip_prefix('#')?;
    // Only the first '-' separates start from end; anything odd in the
    // remainder is rejected by the point parser.
    let (start, end) = match fragment.split_once('-') {
        Some((start, end)) => (start, Some(end)),
        None => (fragment, None),
    };
    let (start_line, start_column) = parse_markdown_hash_location_point(start)?;
    let mut normalized = String::from(":");
    normalized.push_str(start_line);
    if let Some(column) = start_column {
        normalized.push(':');
        normalized.push_str(column);
    }
    if let Some(end) = end {
        let (end_line, end_column) = parse_markdown_hash_location_point(end)?;
        normalized.push('-');
        normalized.push_str(end_line);
        if let Some(column) = end_column {
            normalized.push(':');
            normalized.push_str(column);
        }
    }
    Some(normalized)
}

/// Splits a single `L<line>[C<column>]` point into its line and optional
/// column digits.
///
/// Returns `None` if the leading `L` is missing or if either number is empty
/// or contains anything other than ASCII digits.
fn parse_markdown_hash_location_point(point: &str) -> Option<(&str, Option<&str>)> {
    fn is_number(text: &str) -> bool {
        !text.is_empty() && text.bytes().all(|b| b.is_ascii_digit())
    }

    let point = point.strip_prefix('L')?;
    let (line, column) = match point.split_once('C') {
        Some((line, column)) => (line, Some(column)),
        None => (point, None),
    };
    // Reject malformed numbers here so the caller never emits output such as
    // ":" or ":12:" for inputs like "#L" or "#L12C".
    (is_number(line) && column.map_or(true, is_number)).then_some((line, column))
}
121
#[cfg(test)]
#[allow(warnings, clippy::all)]
mod tests {
    use super::{find_uuids, normalize_markdown_hash_location_suffix, sanitize_metric_tag_value};
    use pretty_assertions::assert_eq;

    // --- find_uuids ---------------------------------------------------------

    /// Two UUIDs separated by other text are both reported, in order; the
    /// trailing "-k" after the first one is not part of the match.
    #[test]
    fn find_uuids_finds_multiple() {
        let found = find_uuids(
            "x 00112233-4455-6677-8899-aabbccddeeff-k y 12345678-90ab-cdef-0123-456789abcdef",
        );
        let expected = vec![
            String::from("00112233-4455-6677-8899-aabbccddeeff"),
            String::from("12345678-90ab-cdef-0123-456789abcdef"),
        ];
        assert_eq!(found, expected);
    }

    /// Text that merely looks hyphenated like a UUID yields nothing.
    #[test]
    fn find_uuids_ignores_invalid() {
        let found = find_uuids("not-a-uuid-1234-5678-9abc-def0-123456789abc");
        let expected: Vec<String> = Vec::new();
        assert_eq!(found, expected);
    }

    /// A multi-byte character before the UUID and extra hex digits after it
    /// do not disturb the match.
    #[test]
    fn find_uuids_handles_non_ascii_without_overlap() {
        let found = find_uuids("🙂 55e5d6f7-8a7f-4d2a-8d88-123456789012abc");
        let expected = vec![String::from("55e5d6f7-8a7f-4d2a-8d88-123456789012")];
        assert_eq!(found, expected);
    }

    // --- sanitize_metric_tag_value -----------------------------------------

    /// A value without any alphanumeric character falls back to the
    /// placeholder.
    #[test]
    fn sanitize_metric_tag_value_trims_and_fills_unspecified() {
        assert_eq!(sanitize_metric_tag_value("///"), "unspecified");
    }

    /// Disallowed characters become underscores, and edge underscores are
    /// trimmed away.
    #[test]
    fn sanitize_metric_tag_value_replaces_invalid_chars() {
        assert_eq!(sanitize_metric_tag_value("bad value!"), "bad_value");
    }

    // --- normalize_markdown_hash_location_suffix ---------------------------

    #[test]
    fn normalize_markdown_hash_location_suffix_converts_single_location() {
        let normalized = normalize_markdown_hash_location_suffix("#L74C3");
        assert_eq!(normalized, Some(String::from(":74:3")));
    }

    #[test]
    fn normalize_markdown_hash_location_suffix_converts_ranges() {
        let normalized = normalize_markdown_hash_location_suffix("#L74C3-L76C9");
        assert_eq!(normalized, Some(String::from(":74:3-76:9")));
    }
}