zerobox_utils_string/
lib.rs1mod truncate;
2
3pub use truncate::approx_bytes_for_tokens;
4pub use truncate::approx_token_count;
5pub use truncate::approx_tokens_from_byte_count;
6pub use truncate::truncate_middle_chars;
7pub use truncate::truncate_middle_with_token_budget;
8
/// Returns the longest prefix of `s` that is at most `maxb` bytes long and
/// ends on a UTF-8 character boundary.
///
/// If `s` already fits within `maxb` bytes it is returned unchanged. A
/// character that would straddle the byte limit is dropped entirely, so the
/// result may be shorter than `maxb` bytes (and is empty when not even the
/// first character fits).
#[inline]
pub fn take_bytes_at_char_boundary(s: &str, maxb: usize) -> &str {
    if s.len() <= maxb {
        return s;
    }
    // Here `maxb < s.len()`, so `maxb` is a valid index to probe. Index 0 is
    // always a boundary, so stepping backwards terminates without underflow,
    // and a UTF-8 scalar is at most four bytes, so at most three steps occur.
    let mut end = maxb;
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}
25
/// Returns the longest suffix of `s` that is at most `maxb` bytes long and
/// starts on a UTF-8 character boundary.
///
/// If `s` already fits within `maxb` bytes it is returned unchanged. A
/// character that would straddle the byte limit is dropped entirely, so the
/// result may be shorter than `maxb` bytes (and is empty when not even the
/// last character fits).
#[inline]
pub fn take_last_bytes_at_char_boundary(s: &str, maxb: usize) -> &str {
    if s.len() <= maxb {
        return s;
    }
    // Here `maxb < s.len()`, so the subtraction cannot underflow. `s.len()`
    // is always a boundary, so stepping forwards terminates, and a UTF-8
    // scalar is at most four bytes, so at most three steps occur.
    let mut start = s.len() - maxb;
    while !s.is_char_boundary(start) {
        start += 1;
    }
    &s[start..]
}
47
/// Sanitizes `value` so it can be used as a metric tag value.
///
/// Every character other than an ASCII letter, ASCII digit, `.`, `_`, `-` or
/// `/` is replaced with `_`. Leading and trailing `_` are then removed and the
/// result is capped at 256 bytes. If nothing alphanumeric survives, the
/// placeholder `"unspecified"` is returned instead.
///
/// The trailing-`_` trim and the alphanumeric check are applied *after* the
/// length cap, so a truncated value never ends in `_` and is never made up of
/// punctuation only.
pub fn sanitize_metric_tag_value(value: &str) -> String {
    const MAX_LEN: usize = 256;
    let sanitized: String = value
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '.' | '_' | '-' | '/') {
                ch
            } else {
                '_'
            }
        })
        .collect();
    let trimmed = sanitized.trim_matches('_');
    // `sanitized` contains only ASCII (everything else was mapped to `_`), so
    // slicing by byte count cannot split a character.
    let bounded = &trimmed[..trimmed.len().min(MAX_LEN)];
    // Truncation may have exposed a `_` at the end; drop it again.
    let bounded = bounded.trim_end_matches('_');
    if !bounded.chars().any(|ch| ch.is_ascii_alphanumeric()) {
        return "unspecified".to_string();
    }
    bounded.to_string()
}
72
73#[allow(clippy::unwrap_used)]
75pub fn find_uuids(s: &str) -> Vec<String> {
76 static RE: std::sync::OnceLock<regex_lite::Regex> = std::sync::OnceLock::new();
77 let re = RE.get_or_init(|| {
78 regex_lite::Regex::new(
79 r"[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{12}",
80 )
81 .unwrap() });
83
84 re.find_iter(s).map(|m| m.as_str().to_string()).collect()
85}
86
/// Converts a markdown-style `#L…` location suffix into a colon-separated one.
///
/// Accepted input has the shape `#L<line>[C<col>][-L<line>[C<col>]]` and is
/// rewritten to `:<line>[:<col>][-<line>[:<col>]]`, for example
/// `#L74C3-L76C9` becomes `:74:3-76:9`.
///
/// Returns `None` when `suffix` does not start with `#`, when either point
/// lacks the `L` prefix, or when a line or column is not a non-empty run of
/// ASCII digits.
pub fn normalize_markdown_hash_location_suffix(suffix: &str) -> Option<String> {
    /// Appends `<line>` or `<line>:<col>` to `out`.
    fn push_point(out: &mut String, line: &str, column: Option<&str>) {
        out.push_str(line);
        if let Some(column) = column {
            out.push(':');
            out.push_str(column);
        }
    }

    let fragment = suffix.strip_prefix('#')?;
    // Everything after the first `-` is the optional end of a range.
    let (start, end) = match fragment.split_once('-') {
        Some((start, end)) => (start, Some(end)),
        None => (fragment, None),
    };

    let (start_line, start_column) = parse_markdown_hash_location_point(start)?;
    let mut normalized = String::with_capacity(suffix.len());
    normalized.push(':');
    push_point(&mut normalized, start_line, start_column);

    if let Some(end) = end {
        let (end_line, end_column) = parse_markdown_hash_location_point(end)?;
        normalized.push('-');
        push_point(&mut normalized, end_line, end_column);
    }
    Some(normalized)
}

/// Splits a single `L<line>[C<col>]` point into its line and optional column.
///
/// Returns `None` when the `L` prefix is missing or when the line or column is
/// not a non-empty run of ASCII digits.
fn parse_markdown_hash_location_point(point: &str) -> Option<(&str, Option<&str>)> {
    fn is_number(text: &str) -> bool {
        !text.is_empty() && text.bytes().all(|byte| byte.is_ascii_digit())
    }

    let point = point.strip_prefix('L')?;
    let (line, column) = match point.split_once('C') {
        Some((line, column)) => (line, Some(column)),
        None => (point, None),
    };
    // Reject inputs such as `#L`, `#Lorem` or `#L12C` instead of echoing
    // arbitrary text back as if it were a line/column number.
    if !is_number(line) || column.is_some_and(|column| !is_number(column)) {
        return None;
    }
    Some((line, column))
}
121
#[cfg(test)]
#[allow(warnings, clippy::all)]
mod tests {
    use super::find_uuids;
    use super::normalize_markdown_hash_location_suffix;
    use super::sanitize_metric_tag_value;
    use pretty_assertions::assert_eq;

    /// Both UUIDs are reported in order, even though the first one is
    /// immediately followed by `-k`.
    #[test]
    fn find_uuids_finds_multiple() {
        let found = find_uuids(
            "x 00112233-4455-6677-8899-aabbccddeeff-k y 12345678-90ab-cdef-0123-456789abcdef",
        );
        let expected = vec![
            String::from("00112233-4455-6677-8899-aabbccddeeff"),
            String::from("12345678-90ab-cdef-0123-456789abcdef"),
        ];
        assert_eq!(found, expected);
    }

    /// Text that merely resembles a UUID yields no matches.
    #[test]
    fn find_uuids_ignores_invalid() {
        let found = find_uuids("not-a-uuid-1234-5678-9abc-def0-123456789abc");
        let nothing: Vec<String> = Vec::new();
        assert_eq!(found, nothing);
    }

    /// A multi-byte character before the UUID and trailing text after it do
    /// not disturb the match.
    #[test]
    fn find_uuids_handles_non_ascii_without_overlap() {
        let found = find_uuids("🙂 55e5d6f7-8a7f-4d2a-8d88-123456789012abc");
        let expected = vec![String::from("55e5d6f7-8a7f-4d2a-8d88-123456789012")];
        assert_eq!(found, expected);
    }

    /// A value with no alphanumeric characters falls back to the placeholder.
    #[test]
    fn sanitize_metric_tag_value_trims_and_fills_unspecified() {
        assert_eq!(sanitize_metric_tag_value("///"), "unspecified");
    }

    /// Disallowed characters become `_`, and the trailing one is trimmed.
    #[test]
    fn sanitize_metric_tag_value_replaces_invalid_chars() {
        assert_eq!(sanitize_metric_tag_value("bad value!"), "bad_value");
    }

    /// A single `#L<line>C<col>` point maps to `:<line>:<col>`.
    #[test]
    fn normalize_markdown_hash_location_suffix_converts_single_location() {
        let normalized = normalize_markdown_hash_location_suffix("#L74C3");
        assert_eq!(normalized, Some(String::from(":74:3")));
    }

    /// A range keeps both end points, joined by `-`.
    #[test]
    fn normalize_markdown_hash_location_suffix_converts_ranges() {
        let normalized = normalize_markdown_hash_location_suffix("#L74C3-L76C9");
        assert_eq!(normalized, Some(String::from(":74:3-76:9")));
    }
}