1use crate::ensure;
19use crate::error::Error;
20use crate::util::is_latin;
21use std::sync::OnceLock;
22
/// Length limit for meta strings: the encoder's `ensure!` checks compare `input.len()`
/// (a byte count) against this value with `<`.
const SHORT_MAX_VALUE: usize = 32767;
/// Encoder configured with `.` and `_` as its two special characters
/// (by its name, meant for namespaces).
pub static NAMESPACE_ENCODER: MetaStringEncoder = MetaStringEncoder::new('.', '_');
/// Encoder configured with `$` and `_` as its two special characters
/// (by its name, meant for type names).
pub static TYPE_NAME_ENCODER: MetaStringEncoder = MetaStringEncoder::new('$', '_');
/// Encoder configured with `$` and `_` as its two special characters
/// (by its name, meant for field names).
pub static FIELD_NAME_ENCODER: MetaStringEncoder = MetaStringEncoder::new('$', '_');

/// Decoder configured with `.` and `_`, mirroring [`NAMESPACE_ENCODER`].
pub static NAMESPACE_DECODER: MetaStringDecoder = MetaStringDecoder::new('.', '_');
/// Decoder configured with `$` and `_`, mirroring [`FIELD_NAME_ENCODER`].
pub static FIELD_NAME_DECODER: MetaStringDecoder = MetaStringDecoder::new('$', '_');
/// Decoder configured with `$` and `_`, mirroring [`TYPE_NAME_ENCODER`].
pub static TYPE_NAME_DECODER: MetaStringDecoder = MetaStringDecoder::new('$', '_');
34
/// Encoding scheme used for the bytes of a [`MetaString`].
///
/// The numeric values are fixed by `#[repr(i16)]` and the explicit discriminants below.
#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy, Default)]
#[repr(i16)]
pub enum Encoding {
    /// Plain UTF-8 bytes. This is the default, and the fallback the encoder uses when no
    /// packed encoding applies.
    #[default]
    Utf8 = 0x00,
    /// 5 bits per character over `a`-`z`, `.`, `_`, `$` and `|`.
    LowerSpecial = 0x01,
    /// 6 bits per character over `a`-`z`, `A`-`Z`, `0`-`9` and the two configured special
    /// characters.
    LowerUpperDigitSpecial = 0x02,
    /// 5-bit packing of the input with its first character lowercased; the decoder
    /// uppercases the first character again.
    FirstToLowerSpecial = 0x03,
    /// 5-bit packing in which every uppercase character is written as `|` followed by its
    /// lowercase form.
    AllToLowerSpecial = 0x04,
}
45
/// A string together with its encoded byte form and the parameters used to produce it.
///
/// The `PartialEq` and `Hash` implementations in this file look only at `bytes`.
#[derive(Debug, Clone, Default)]
pub struct MetaString {
    /// The text this value represents: the encoder's input or the decoder's output.
    pub original: String,
    /// Scheme that `bytes` is encoded with.
    pub encoding: Encoding,
    /// Encoded form of `original`.
    pub bytes: Vec<u8>,
    /// Set by `MetaString::new` from the top bit of `bytes[0]` for non-UTF-8 encodings;
    /// always `false` for UTF-8.
    pub strip_last_char: bool,
    /// First special character of the encoder/decoder that produced this value.
    pub special_char1: char,
    /// Second special character of the encoder/decoder that produced this value.
    pub special_char2: char,
}
55
56impl PartialEq for MetaString {
57 fn eq(&self, other: &Self) -> bool {
58 self.bytes == other.bytes
59 }
60}
61
62impl Eq for MetaString {}
63
64impl std::hash::Hash for MetaString {
65 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
66 self.bytes.hash(state);
67 }
68}
69
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone `0xff` byte is not valid UTF-8, so decoding it as `Encoding::Utf8` must fail
    /// with the dedicated error message.
    #[test]
    fn rejects_invalid_utf8_meta_string() {
        let invalid: [u8; 1] = [0xff];
        let outcome = TYPE_NAME_DECODER.decode(&invalid, Encoding::Utf8);
        let err = outcome.unwrap_err();
        let rendered = err.to_string();
        assert!(
            rendered.contains("invalid UTF-8 meta string"),
            "unexpected error: {err}"
        );
    }
}
85
/// Lazily initialised storage for the shared empty `MetaString` returned by
/// `MetaString::get_empty`.
static EMPTY: OnceLock<MetaString> = OnceLock::new();
87
88impl MetaString {
89 pub fn new(
90 original: String,
91 encoding: Encoding,
92 bytes: Vec<u8>,
93 special_char1: char,
94 special_char2: char,
95 ) -> Result<Self, Error> {
96 let mut strip_last_char = false;
97 if encoding != Encoding::Utf8 {
98 if bytes.is_empty() {
99 return Err(Error::encode_error("Encoded data cannot be empty"));
100 }
101 strip_last_char = (bytes[0] & 0x80) != 0;
102 }
103 Ok(MetaString {
104 original,
105 encoding,
106 bytes,
107 strip_last_char,
108 special_char1,
109 special_char2,
110 })
111 }
112
113 pub fn write_to(&self, writer: &mut crate::buffer::Writer) {
114 writer.write_var_u32(self.bytes.len() as u32);
115 writer.write_bytes(&self.bytes);
116 }
117
118 pub fn get_empty() -> &'static MetaString {
119 EMPTY.get_or_init(|| MetaString {
120 original: "".to_string(),
121 encoding: Encoding::default(),
122 bytes: vec![],
123 strip_last_char: false,
124 special_char1: '\0',
125 special_char2: '\0',
126 })
127 }
128}
129
/// Decodes meta-string bytes back into text.
///
/// The two special characters are what the 6-bit encoding's codes 62 and 63 decode to.
#[derive(Clone)]
pub struct MetaStringDecoder {
    /// Character produced for 6-bit code 62.
    pub special_char1: char,
    /// Character produced for 6-bit code 63.
    pub special_char2: char,
}
135
/// Encodes text into the compact meta-string byte forms.
///
/// The two special characters are the extra symbols the 6-bit encoding accepts besides
/// ASCII letters and digits.
#[derive(Clone)]
pub struct MetaStringEncoder {
    /// Character mapped to 6-bit code 62.
    pub special_char1: char,
    /// Character mapped to 6-bit code 63.
    pub special_char2: char,
}
141
/// Per-string facts gathered by `compute_statistics` and consumed by `compute_encoding`.
#[derive(Debug)]
struct StringStatistics {
    /// Number of ASCII digits in the string.
    digit_count: usize,
    /// Number of uppercase characters in the string.
    upper_count: usize,
    /// `true` when every character is a letter, an ASCII digit, or one of the encoder's two
    /// special characters.
    can_lower_upper_digit_special_encoded: bool,
    /// `true` when every character is a lowercase letter or one of `.`, `_`, `$`, `|`.
    can_lower_special_encoded: bool,
}
149
150impl MetaStringEncoder {
151 pub const fn new(special_char1: char, special_char2: char) -> Self {
152 Self {
153 special_char1,
154 special_char2,
155 }
156 }
157
158 fn is_latin(&self, s: &str) -> bool {
159 is_latin(s)
160 }
161
162 fn _encode(&self, input: &str) -> Result<Option<MetaString>, Error> {
163 if input.is_empty() {
164 return Ok(Some(MetaString::new(
165 input.to_string(),
166 Encoding::Utf8,
167 vec![],
168 self.special_char1,
169 self.special_char2,
170 )?));
171 }
172
173 ensure!(
174 input.len() < SHORT_MAX_VALUE,
175 Error::encode_error(format!(
176 "Meta string is too long, max:{SHORT_MAX_VALUE}, current:{}",
177 input.len()
178 ))
179 );
180
181 if !self.is_latin(input) {
182 return Ok(Some(MetaString::new(
183 input.to_string(),
184 Encoding::Utf8,
185 input.as_bytes().to_vec(),
186 self.special_char1,
187 self.special_char2,
188 )?));
189 }
190
191 Ok(None)
192 }
193
194 pub fn encode(&self, input: &str) -> Result<MetaString, Error> {
195 if let Some(ms) = self._encode(input)? {
196 return Ok(ms);
197 }
198 let encoding = self.compute_encoding(input, None);
199 self.encode_with_encoding(input, encoding)
200 }
201
202 pub fn encode_with_encodings(
203 &self,
204 input: &str,
205 encodings: &[Encoding],
206 ) -> Result<MetaString, Error> {
207 if let Some(ms) = self._encode(input)? {
208 return Ok(ms);
209 }
210 let encoding = self.compute_encoding(input, Some(encodings));
211 self.encode_with_encoding(input, encoding)
212 }
213
214 fn compute_encoding(&self, input: &str, encodings: Option<&[Encoding]>) -> Encoding {
215 let allow = |e: Encoding| encodings.map_or(true, |opts| opts.contains(&e));
216 let statistics = self.compute_statistics(input);
217 if statistics.can_lower_special_encoded && allow(Encoding::LowerSpecial) {
218 return Encoding::LowerSpecial;
219 }
220 if statistics.can_lower_upper_digit_special_encoded {
221 if statistics.digit_count != 0 && allow(Encoding::LowerUpperDigitSpecial) {
222 return Encoding::LowerUpperDigitSpecial;
223 }
224 let upper_count: usize = statistics.upper_count;
225 if upper_count == 1
226 && input.chars().next().unwrap().is_uppercase()
227 && allow(Encoding::FirstToLowerSpecial)
228 {
229 return Encoding::FirstToLowerSpecial;
230 }
231 if ((input.len() + upper_count) * 5) < (input.len() * 6)
232 && allow(Encoding::AllToLowerSpecial)
233 {
234 return Encoding::AllToLowerSpecial;
235 }
236 if allow(Encoding::LowerUpperDigitSpecial) {
237 return Encoding::LowerUpperDigitSpecial;
238 }
239 }
240 Encoding::Utf8
241 }
242
243 fn compute_statistics(&self, chars: &str) -> StringStatistics {
244 let mut can_lower_upper_digit_special_encoded = true;
245 let mut can_lower_special_encoded = true;
246 let mut digit_count = 0;
247 let mut upper_count = 0;
248 for c in chars.chars() {
249 if can_lower_upper_digit_special_encoded
250 && !(c.is_lowercase()
251 || c.is_uppercase()
252 || c.is_ascii_digit()
253 || (c == self.special_char1 || c == self.special_char2))
254 {
255 can_lower_upper_digit_special_encoded = false;
256 }
257 if can_lower_special_encoded
258 && !(c.is_lowercase() || matches!(c, '.' | '_' | '$' | '|'))
259 {
260 can_lower_special_encoded = false;
261 }
262 if c.is_ascii_digit() {
263 digit_count += 1;
264 }
265 if c.is_uppercase() {
266 upper_count += 1;
267 }
268 }
269 StringStatistics {
270 digit_count,
271 upper_count,
272 can_lower_upper_digit_special_encoded,
273 can_lower_special_encoded,
274 }
275 }
276
277 pub fn encode_with_encoding(
278 &self,
279 input: &str,
280 encoding: Encoding,
281 ) -> Result<MetaString, Error> {
282 if input.is_empty() {
283 return MetaString::new(
284 input.to_string(),
285 Encoding::Utf8,
286 vec![],
287 self.special_char1,
288 self.special_char2,
289 );
290 }
291 ensure!(
292 input.len() < SHORT_MAX_VALUE,
293 Error::encode_error(format!(
294 "Meta string is too long, max:{SHORT_MAX_VALUE}, current:{}",
295 input.len()
296 ))
297 );
298 ensure!(
299 encoding == Encoding::Utf8 || self.is_latin(input),
300 Error::encode_error("Non-ASCII characters in meta string are not allowed")
301 );
302
303 if input.is_empty() {
304 return MetaString::new(
305 input.to_string(),
306 Encoding::Utf8,
307 vec![],
308 self.special_char1,
309 self.special_char2,
310 );
311 };
312
313 match encoding {
314 Encoding::LowerSpecial => {
315 let encoded_data = self.encode_lower_special(input)?;
316 MetaString::new(
317 input.to_string(),
318 encoding,
319 encoded_data,
320 self.special_char1,
321 self.special_char2,
322 )
323 }
324 Encoding::LowerUpperDigitSpecial => {
325 let encoded_data = self.encode_lower_upper_digit_special(input)?;
326 MetaString::new(
327 input.to_string(),
328 encoding,
329 encoded_data,
330 self.special_char1,
331 self.special_char2,
332 )
333 }
334 Encoding::FirstToLowerSpecial => {
335 let encoded_data = self.encode_first_to_lower_special(input)?;
336 MetaString::new(
337 input.to_string(),
338 encoding,
339 encoded_data,
340 self.special_char1,
341 self.special_char2,
342 )
343 }
344 Encoding::AllToLowerSpecial => {
345 let upper_count = input.chars().filter(|c| c.is_uppercase()).count();
346 let encoded_data = self.encode_all_to_lower_special(input, upper_count)?;
347 MetaString::new(
348 input.to_string(),
349 encoding,
350 encoded_data,
351 self.special_char1,
352 self.special_char2,
353 )
354 }
355 Encoding::Utf8 => {
356 let encoded_data = input.as_bytes().to_vec();
357 MetaString::new(
358 input.to_string(),
359 Encoding::Utf8,
360 encoded_data,
361 self.special_char1,
362 self.special_char2,
363 )
364 }
365 }
366 }
367
368 fn encode_generic(&self, input: &str, bits_per_char: u8) -> Result<Vec<u8>, Error> {
369 let total_bits: usize = input.len() * bits_per_char as usize + 1;
370 let byte_length: usize = (total_bits + 7) / 8;
371 let mut bytes = vec![0; byte_length];
372 let mut current_bit = 1;
373 for c in input.chars() {
374 let value = self.char_to_value(c, bits_per_char)?;
375 for i in (0..bits_per_char).rev() {
376 if (value & (1 << i)) != 0 {
377 let byte_pos: usize = current_bit / 8;
378 let bit_pos: usize = current_bit % 8;
379 bytes[byte_pos] |= 1 << (7 - bit_pos);
380 }
381 current_bit += 1;
382 }
383 }
384 if byte_length * 8 >= total_bits + bits_per_char as usize {
385 bytes[0] |= 0x80;
386 }
387 Ok(bytes)
388 }
389 pub fn encode_lower_special(&self, input: &str) -> Result<Vec<u8>, Error> {
390 self.encode_generic(input, 5)
391 }
392
393 pub fn encode_lower_upper_digit_special(&self, input: &str) -> Result<Vec<u8>, Error> {
394 self.encode_generic(input, 6)
395 }
396
397 pub fn encode_first_to_lower_special(&self, input: &str) -> Result<Vec<u8>, Error> {
398 if input.is_empty() {
399 return self.encode_generic("", 5);
400 }
401
402 let mut iter = input.char_indices();
403 let (first_idx, first_char) = iter.next().unwrap();
404
405 let lower = first_char.to_lowercase().to_string();
406
407 if lower.len() == first_char.len_utf8() && first_char.is_ascii() {
410 let mut bytes = input.as_bytes().to_owned();
411 bytes[first_idx] = lower.as_bytes()[0];
412 return self.encode_generic(std::str::from_utf8(&bytes).unwrap(), 5);
413 }
414
415 let (_, rest) = input.split_at(first_char.len_utf8());
417 let mut result = String::with_capacity(input.len() + lower.len() - first_char.len_utf8());
418 result.push_str(&lower);
419 result.push_str(rest);
420 self.encode_generic(&result, 5)
421 }
422
423 pub fn encode_all_to_lower_special(
424 &self,
425 input: &str,
426 upper_count: usize,
427 ) -> Result<Vec<u8>, Error> {
428 let mut new_chars = Vec::with_capacity(input.len() + upper_count);
429 for c in input.chars() {
430 if c.is_uppercase() {
431 new_chars.push('|');
432 new_chars.push(c.to_lowercase().next().unwrap());
433 } else {
434 new_chars.push(c);
435 }
436 }
437 self.encode_generic(&new_chars.iter().collect::<String>(), 5)
438 }
439
440 fn char_to_value(&self, c: char, bits_per_char: u8) -> Result<u8, Error> {
441 match bits_per_char {
442 5 => match c {
443 'a'..='z' => Ok(c as u8 - b'a'),
444 '.' => Ok(26),
445 '_' => Ok(27),
446 '$' => Ok(28),
447 '|' => Ok(29),
448 _ => Err(Error::encode_error(format!(
449 "Unsupported character for LOWER_UPPER_DIGIT_SPECIAL encoding: {c}",
450 )))?,
451 },
452 6 => match c {
453 'a'..='z' => Ok(c as u8 - b'a'),
454 'A'..='Z' => Ok(c as u8 - b'A' + 26),
455 '0'..='9' => Ok(c as u8 - b'0' + 52),
456 _ => {
457 if c == self.special_char1 {
458 Ok(62)
459 } else if c == self.special_char2 {
460 Ok(63)
461 } else {
462 Err(Error::encode_error(format!(
463 "Invalid character value for LOWER_SPECIAL decoding: {c:?}",
464 )))?
465 }
466 }
467 },
468 _ => unreachable!(),
469 }
470 }
471}
472
473impl MetaStringDecoder {
474 pub const fn new(special_char1: char, special_char2: char) -> Self {
475 MetaStringDecoder {
476 special_char1,
477 special_char2,
478 }
479 }
480
481 pub fn decode(&self, encoded_data: &[u8], encoding: Encoding) -> Result<MetaString, Error> {
482 let str = {
483 if encoded_data.is_empty() {
484 Ok("".to_string())
485 } else {
486 match encoding {
487 Encoding::LowerSpecial => self.decode_lower_special(encoded_data),
488 Encoding::LowerUpperDigitSpecial => {
489 self.decode_lower_upper_digit_special(encoded_data)
490 }
491 Encoding::FirstToLowerSpecial => {
492 self.decode_rep_first_lower_special(encoded_data)
493 }
494 Encoding::AllToLowerSpecial => {
495 self.decode_rep_all_to_lower_special(encoded_data)
496 }
497 Encoding::Utf8 => std::str::from_utf8(encoded_data)
498 .map(str::to_owned)
499 .map_err(|_| Error::encoding_error("invalid UTF-8 meta string")),
500 }
501 }
502 }?;
503 MetaString::new(
504 str,
505 encoding,
506 Vec::from(encoded_data),
507 self.special_char1,
508 self.special_char2,
509 )
510 }
511
512 fn decode_lower_special(&self, data: &[u8]) -> Result<String, Error> {
513 let mut decoded = String::new();
514 let total_bits: usize = data.len() * 8;
515 let strip_last_char = (data[0] & 0x80) != 0;
516 let bit_mask: usize = 0b11111;
517 let mut bit_index = 1;
518 while bit_index + 5 <= total_bits && !(strip_last_char && (bit_index + 2 * 5 > total_bits))
519 {
520 let byte_index = bit_index / 8;
521 let intra_byte_index = bit_index % 8;
522 let char_value: usize = if intra_byte_index > 3 {
523 ((((data[byte_index] as usize) << 8)
524 | if byte_index + 1 < data.len() {
525 data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF
526 } else {
527 0
528 })
529 >> (11 - intra_byte_index))
530 & bit_mask
531 } else {
532 ((data[byte_index] as usize) >> (3 - intra_byte_index)) & bit_mask
533 };
534 bit_index += 5;
535 decoded.push(self.decode_lower_special_char(char_value as u8)?);
536 }
537 Ok(decoded)
538 }
539
540 fn decode_lower_upper_digit_special(&self, data: &[u8]) -> Result<String, Error> {
541 let mut decoded = String::new();
542 let num_bits = data.len() * 8;
543 let strip_last_char = (data[0] & 0x80) != 0;
544 let mut bit_index = 1;
545 let bit_mask: usize = 0b111111;
546 while bit_index + 6 <= num_bits && !(strip_last_char && (bit_index + 2 * 6 > num_bits)) {
547 let byte_index = bit_index / 8;
548 let intra_byte_index = bit_index % 8;
549 let char_value: usize = if intra_byte_index > 2 {
550 ((((data[byte_index] as usize) << 8)
551 | if byte_index + 1 < data.len() {
552 data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF
553 } else {
554 0
555 })
556 >> (10 - intra_byte_index))
557 & bit_mask
558 } else {
559 ((data[byte_index] as usize) >> (2 - intra_byte_index)) & bit_mask
560 };
561 bit_index += 6;
562 decoded.push(self.decode_lower_upper_digit_special_char(char_value as u8)?);
563 }
564 Ok(decoded)
565 }
566
567 fn decode_lower_special_char(&self, char_value: u8) -> Result<char, Error> {
568 match char_value {
569 0..=25 => Ok((b'a' + char_value) as char),
570 26 => Ok('.'),
571 27 => Ok('_'),
572 28 => Ok('$'),
573 29 => Ok('|'),
574 _ => Err(Error::encode_error(format!(
575 "Invalid character value for LOWER_SPECIAL decoding: {char_value}",
576 )))?,
577 }
578 }
579
580 fn decode_lower_upper_digit_special_char(&self, char_value: u8) -> Result<char, Error> {
581 match char_value {
582 0..=25 => Ok((b'a' + char_value) as char),
583 26..=51 => Ok((b'A' + char_value - 26) as char),
584 52..=61 => Ok((b'0' + char_value - 52) as char),
585 62 => Ok(self.special_char1),
586 63 => Ok(self.special_char2),
587 _ => Err(Error::encode_error(format!(
588 "Invalid character value for LOWER_UPPER_DIGIT_SPECIAL decoding: {char_value}",
589 )))?,
590 }
591 }
592
593 fn decode_rep_first_lower_special(&self, data: &[u8]) -> Result<String, Error> {
594 let decoded_str = self.decode_lower_special(data)?;
595 let mut chars = decoded_str.chars();
596 match chars.next() {
597 Some(first_char) => {
598 let mut result = first_char.to_ascii_uppercase().to_string();
599 result.extend(chars);
600 Ok(result)
601 }
602 None => Ok(decoded_str),
603 }
604 }
605 fn decode_rep_all_to_lower_special(&self, data: &[u8]) -> Result<String, Error> {
606 let decoded_str = self.decode_lower_special(data)?;
607 let mut result = String::new();
608 let mut skip = false;
609 for (i, char) in decoded_str.chars().enumerate() {
610 if skip {
611 skip = false;
612 continue;
613 }
614 if char == '|' {
617 if let Some(next_char) = decoded_str.chars().nth(i + 1) {
618 result.push(next_char.to_ascii_uppercase());
619 }
620 skip = true;
621 } else {
622 result.push(char);
623 }
624 }
625 Ok(result)
626 }
627}