Skip to main content

fory_core/meta/
meta_string.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::ensure;
19use crate::error::Error;
20use crate::util::is_latin;
21use std::sync::OnceLock;
22
23// equal to "std::i16::MAX"
24const SHORT_MAX_VALUE: usize = 32767;
25// const HEADER_MASK:i64 = 0xff;
26
27pub static NAMESPACE_ENCODER: MetaStringEncoder = MetaStringEncoder::new('.', '_');
28pub static TYPE_NAME_ENCODER: MetaStringEncoder = MetaStringEncoder::new('$', '_');
29pub static FIELD_NAME_ENCODER: MetaStringEncoder = MetaStringEncoder::new('$', '_');
30
31pub static NAMESPACE_DECODER: MetaStringDecoder = MetaStringDecoder::new('.', '_');
32pub static FIELD_NAME_DECODER: MetaStringDecoder = MetaStringDecoder::new('$', '_');
33pub static TYPE_NAME_DECODER: MetaStringDecoder = MetaStringDecoder::new('$', '_');
34
/// Wire encodings available for a meta string.
///
/// The discriminants are fixed by `#[repr(i16)]` and the explicit values below.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
#[repr(i16)]
pub enum Encoding {
    /// Raw UTF-8 bytes; also the encoding used for the empty string.
    #[default]
    Utf8 = 0x00,
    /// 5 bits per character over `a-z`, `.`, `_`, `$` and `|`.
    LowerSpecial = 0x01,
    /// 6 bits per character over `a-z`, `A-Z`, `0-9` and the two configured
    /// special characters.
    LowerUpperDigitSpecial = 0x02,
    /// The first character is lower-cased, then the text is packed as
    /// `LowerSpecial`; the decoder upper-cases the first character again.
    FirstToLowerSpecial = 0x03,
    /// Every uppercase character is rewritten as `|` followed by its lowercase
    /// form, then the text is packed as `LowerSpecial`.
    AllToLowerSpecial = 0x04,
}
45
46#[derive(Debug, Clone, Default)]
47pub struct MetaString {
48    pub original: String,
49    pub encoding: Encoding,
50    pub bytes: Vec<u8>,
51    pub strip_last_char: bool,
52    pub special_char1: char,
53    pub special_char2: char,
54}
55
56impl PartialEq for MetaString {
57    fn eq(&self, other: &Self) -> bool {
58        self.bytes == other.bytes
59    }
60}
61
62impl Eq for MetaString {}
63
64impl std::hash::Hash for MetaString {
65    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
66        self.bytes.hash(state);
67    }
68}
69
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone `0xff` byte is not valid UTF-8, so decoding it as `Utf8` must
    /// fail with the dedicated error message.
    #[test]
    fn rejects_invalid_utf8_meta_string() {
        let outcome = TYPE_NAME_DECODER.decode(&[0xff], Encoding::Utf8);
        let err = outcome.unwrap_err();
        let message = err.to_string();
        assert!(
            message.contains("invalid UTF-8 meta string"),
            "unexpected error: {err}"
        );
    }
}
85
86static EMPTY: OnceLock<MetaString> = OnceLock::new();
87
88impl MetaString {
89    pub fn new(
90        original: String,
91        encoding: Encoding,
92        bytes: Vec<u8>,
93        special_char1: char,
94        special_char2: char,
95    ) -> Result<Self, Error> {
96        let mut strip_last_char = false;
97        if encoding != Encoding::Utf8 {
98            if bytes.is_empty() {
99                return Err(Error::encode_error("Encoded data cannot be empty"));
100            }
101            strip_last_char = (bytes[0] & 0x80) != 0;
102        }
103        Ok(MetaString {
104            original,
105            encoding,
106            bytes,
107            strip_last_char,
108            special_char1,
109            special_char2,
110        })
111    }
112
113    pub fn write_to(&self, writer: &mut crate::buffer::Writer) {
114        writer.write_var_u32(self.bytes.len() as u32);
115        writer.write_bytes(&self.bytes);
116    }
117
118    pub fn get_empty() -> &'static MetaString {
119        EMPTY.get_or_init(|| MetaString {
120            original: "".to_string(),
121            encoding: Encoding::default(),
122            bytes: vec![],
123            strip_last_char: false,
124            special_char1: '\0',
125            special_char2: '\0',
126        })
127    }
128}
129
/// Decodes meta-string payloads back into text.
#[derive(Clone)]
pub struct MetaStringDecoder {
    /// Character produced for 6-bit value 62.
    pub special_char1: char,
    /// Character produced for 6-bit value 63.
    pub special_char2: char,
}
135
/// Encodes text into the compact meta-string representations.
#[derive(Clone)]
pub struct MetaStringEncoder {
    /// Character mapped to 6-bit value 62.
    pub special_char1: char,
    /// Character mapped to 6-bit value 63.
    pub special_char2: char,
}
141
/// Character-class summary of one input string, used to choose an encoding.
#[derive(Debug)]
struct StringStatistics {
    /// Number of ASCII digits seen.
    digit_count: usize,
    /// Number of uppercase characters seen.
    upper_count: usize,
    /// `true` while every character is a letter, a digit or one of the
    /// encoder's two special characters.
    can_lower_upper_digit_special_encoded: bool,
    /// `true` while every character is a lowercase letter or one of
    /// `.`, `_`, `$`, `|`.
    can_lower_special_encoded: bool,
}
149
150impl MetaStringEncoder {
151    pub const fn new(special_char1: char, special_char2: char) -> Self {
152        Self {
153            special_char1,
154            special_char2,
155        }
156    }
157
158    fn is_latin(&self, s: &str) -> bool {
159        is_latin(s)
160    }
161
162    fn _encode(&self, input: &str) -> Result<Option<MetaString>, Error> {
163        if input.is_empty() {
164            return Ok(Some(MetaString::new(
165                input.to_string(),
166                Encoding::Utf8,
167                vec![],
168                self.special_char1,
169                self.special_char2,
170            )?));
171        }
172
173        ensure!(
174            input.len() < SHORT_MAX_VALUE,
175            Error::encode_error(format!(
176                "Meta string is too long, max:{SHORT_MAX_VALUE}, current:{}",
177                input.len()
178            ))
179        );
180
181        if !self.is_latin(input) {
182            return Ok(Some(MetaString::new(
183                input.to_string(),
184                Encoding::Utf8,
185                input.as_bytes().to_vec(),
186                self.special_char1,
187                self.special_char2,
188            )?));
189        }
190
191        Ok(None)
192    }
193
194    pub fn encode(&self, input: &str) -> Result<MetaString, Error> {
195        if let Some(ms) = self._encode(input)? {
196            return Ok(ms);
197        }
198        let encoding = self.compute_encoding(input, None);
199        self.encode_with_encoding(input, encoding)
200    }
201
202    pub fn encode_with_encodings(
203        &self,
204        input: &str,
205        encodings: &[Encoding],
206    ) -> Result<MetaString, Error> {
207        if let Some(ms) = self._encode(input)? {
208            return Ok(ms);
209        }
210        let encoding = self.compute_encoding(input, Some(encodings));
211        self.encode_with_encoding(input, encoding)
212    }
213
214    fn compute_encoding(&self, input: &str, encodings: Option<&[Encoding]>) -> Encoding {
215        let allow = |e: Encoding| encodings.map_or(true, |opts| opts.contains(&e));
216        let statistics = self.compute_statistics(input);
217        if statistics.can_lower_special_encoded && allow(Encoding::LowerSpecial) {
218            return Encoding::LowerSpecial;
219        }
220        if statistics.can_lower_upper_digit_special_encoded {
221            if statistics.digit_count != 0 && allow(Encoding::LowerUpperDigitSpecial) {
222                return Encoding::LowerUpperDigitSpecial;
223            }
224            let upper_count: usize = statistics.upper_count;
225            if upper_count == 1
226                && input.chars().next().unwrap().is_uppercase()
227                && allow(Encoding::FirstToLowerSpecial)
228            {
229                return Encoding::FirstToLowerSpecial;
230            }
231            if ((input.len() + upper_count) * 5) < (input.len() * 6)
232                && allow(Encoding::AllToLowerSpecial)
233            {
234                return Encoding::AllToLowerSpecial;
235            }
236            if allow(Encoding::LowerUpperDigitSpecial) {
237                return Encoding::LowerUpperDigitSpecial;
238            }
239        }
240        Encoding::Utf8
241    }
242
243    fn compute_statistics(&self, chars: &str) -> StringStatistics {
244        let mut can_lower_upper_digit_special_encoded = true;
245        let mut can_lower_special_encoded = true;
246        let mut digit_count = 0;
247        let mut upper_count = 0;
248        for c in chars.chars() {
249            if can_lower_upper_digit_special_encoded
250                && !(c.is_lowercase()
251                    || c.is_uppercase()
252                    || c.is_ascii_digit()
253                    || (c == self.special_char1 || c == self.special_char2))
254            {
255                can_lower_upper_digit_special_encoded = false;
256            }
257            if can_lower_special_encoded
258                && !(c.is_lowercase() || matches!(c, '.' | '_' | '$' | '|'))
259            {
260                can_lower_special_encoded = false;
261            }
262            if c.is_ascii_digit() {
263                digit_count += 1;
264            }
265            if c.is_uppercase() {
266                upper_count += 1;
267            }
268        }
269        StringStatistics {
270            digit_count,
271            upper_count,
272            can_lower_upper_digit_special_encoded,
273            can_lower_special_encoded,
274        }
275    }
276
277    pub fn encode_with_encoding(
278        &self,
279        input: &str,
280        encoding: Encoding,
281    ) -> Result<MetaString, Error> {
282        if input.is_empty() {
283            return MetaString::new(
284                input.to_string(),
285                Encoding::Utf8,
286                vec![],
287                self.special_char1,
288                self.special_char2,
289            );
290        }
291        ensure!(
292            input.len() < SHORT_MAX_VALUE,
293            Error::encode_error(format!(
294                "Meta string is too long, max:{SHORT_MAX_VALUE}, current:{}",
295                input.len()
296            ))
297        );
298        ensure!(
299            encoding == Encoding::Utf8 || self.is_latin(input),
300            Error::encode_error("Non-ASCII characters in meta string are not allowed")
301        );
302
303        if input.is_empty() {
304            return MetaString::new(
305                input.to_string(),
306                Encoding::Utf8,
307                vec![],
308                self.special_char1,
309                self.special_char2,
310            );
311        };
312
313        match encoding {
314            Encoding::LowerSpecial => {
315                let encoded_data = self.encode_lower_special(input)?;
316                MetaString::new(
317                    input.to_string(),
318                    encoding,
319                    encoded_data,
320                    self.special_char1,
321                    self.special_char2,
322                )
323            }
324            Encoding::LowerUpperDigitSpecial => {
325                let encoded_data = self.encode_lower_upper_digit_special(input)?;
326                MetaString::new(
327                    input.to_string(),
328                    encoding,
329                    encoded_data,
330                    self.special_char1,
331                    self.special_char2,
332                )
333            }
334            Encoding::FirstToLowerSpecial => {
335                let encoded_data = self.encode_first_to_lower_special(input)?;
336                MetaString::new(
337                    input.to_string(),
338                    encoding,
339                    encoded_data,
340                    self.special_char1,
341                    self.special_char2,
342                )
343            }
344            Encoding::AllToLowerSpecial => {
345                let upper_count = input.chars().filter(|c| c.is_uppercase()).count();
346                let encoded_data = self.encode_all_to_lower_special(input, upper_count)?;
347                MetaString::new(
348                    input.to_string(),
349                    encoding,
350                    encoded_data,
351                    self.special_char1,
352                    self.special_char2,
353                )
354            }
355            Encoding::Utf8 => {
356                let encoded_data = input.as_bytes().to_vec();
357                MetaString::new(
358                    input.to_string(),
359                    Encoding::Utf8,
360                    encoded_data,
361                    self.special_char1,
362                    self.special_char2,
363                )
364            }
365        }
366    }
367
368    fn encode_generic(&self, input: &str, bits_per_char: u8) -> Result<Vec<u8>, Error> {
369        let total_bits: usize = input.len() * bits_per_char as usize + 1;
370        let byte_length: usize = (total_bits + 7) / 8;
371        let mut bytes = vec![0; byte_length];
372        let mut current_bit = 1;
373        for c in input.chars() {
374            let value = self.char_to_value(c, bits_per_char)?;
375            for i in (0..bits_per_char).rev() {
376                if (value & (1 << i)) != 0 {
377                    let byte_pos: usize = current_bit / 8;
378                    let bit_pos: usize = current_bit % 8;
379                    bytes[byte_pos] |= 1 << (7 - bit_pos);
380                }
381                current_bit += 1;
382            }
383        }
384        if byte_length * 8 >= total_bits + bits_per_char as usize {
385            bytes[0] |= 0x80;
386        }
387        Ok(bytes)
388    }
389    pub fn encode_lower_special(&self, input: &str) -> Result<Vec<u8>, Error> {
390        self.encode_generic(input, 5)
391    }
392
393    pub fn encode_lower_upper_digit_special(&self, input: &str) -> Result<Vec<u8>, Error> {
394        self.encode_generic(input, 6)
395    }
396
397    pub fn encode_first_to_lower_special(&self, input: &str) -> Result<Vec<u8>, Error> {
398        if input.is_empty() {
399            return self.encode_generic("", 5);
400        }
401
402        let mut iter = input.char_indices();
403        let (first_idx, first_char) = iter.next().unwrap();
404
405        let lower = first_char.to_lowercase().to_string();
406
407        // Fast path: if lowercase has the same byte length and is ASCII,
408        // we can modify the first byte directly without rebuilding the string.
409        if lower.len() == first_char.len_utf8() && first_char.is_ascii() {
410            let mut bytes = input.as_bytes().to_owned();
411            bytes[first_idx] = lower.as_bytes()[0];
412            return self.encode_generic(std::str::from_utf8(&bytes).unwrap(), 5);
413        }
414
415        // rebuild only the necessary prefix + suffix (still efficient).
416        let (_, rest) = input.split_at(first_char.len_utf8());
417        let mut result = String::with_capacity(input.len() + lower.len() - first_char.len_utf8());
418        result.push_str(&lower);
419        result.push_str(rest);
420        self.encode_generic(&result, 5)
421    }
422
423    pub fn encode_all_to_lower_special(
424        &self,
425        input: &str,
426        upper_count: usize,
427    ) -> Result<Vec<u8>, Error> {
428        let mut new_chars = Vec::with_capacity(input.len() + upper_count);
429        for c in input.chars() {
430            if c.is_uppercase() {
431                new_chars.push('|');
432                new_chars.push(c.to_lowercase().next().unwrap());
433            } else {
434                new_chars.push(c);
435            }
436        }
437        self.encode_generic(&new_chars.iter().collect::<String>(), 5)
438    }
439
440    fn char_to_value(&self, c: char, bits_per_char: u8) -> Result<u8, Error> {
441        match bits_per_char {
442            5 => match c {
443                'a'..='z' => Ok(c as u8 - b'a'),
444                '.' => Ok(26),
445                '_' => Ok(27),
446                '$' => Ok(28),
447                '|' => Ok(29),
448                _ => Err(Error::encode_error(format!(
449                    "Unsupported character for LOWER_UPPER_DIGIT_SPECIAL encoding: {c}",
450                )))?,
451            },
452            6 => match c {
453                'a'..='z' => Ok(c as u8 - b'a'),
454                'A'..='Z' => Ok(c as u8 - b'A' + 26),
455                '0'..='9' => Ok(c as u8 - b'0' + 52),
456                _ => {
457                    if c == self.special_char1 {
458                        Ok(62)
459                    } else if c == self.special_char2 {
460                        Ok(63)
461                    } else {
462                        Err(Error::encode_error(format!(
463                            "Invalid character value for LOWER_SPECIAL decoding: {c:?}",
464                        )))?
465                    }
466                }
467            },
468            _ => unreachable!(),
469        }
470    }
471}
472
473impl MetaStringDecoder {
474    pub const fn new(special_char1: char, special_char2: char) -> Self {
475        MetaStringDecoder {
476            special_char1,
477            special_char2,
478        }
479    }
480
481    pub fn decode(&self, encoded_data: &[u8], encoding: Encoding) -> Result<MetaString, Error> {
482        let str = {
483            if encoded_data.is_empty() {
484                Ok("".to_string())
485            } else {
486                match encoding {
487                    Encoding::LowerSpecial => self.decode_lower_special(encoded_data),
488                    Encoding::LowerUpperDigitSpecial => {
489                        self.decode_lower_upper_digit_special(encoded_data)
490                    }
491                    Encoding::FirstToLowerSpecial => {
492                        self.decode_rep_first_lower_special(encoded_data)
493                    }
494                    Encoding::AllToLowerSpecial => {
495                        self.decode_rep_all_to_lower_special(encoded_data)
496                    }
497                    Encoding::Utf8 => std::str::from_utf8(encoded_data)
498                        .map(str::to_owned)
499                        .map_err(|_| Error::encoding_error("invalid UTF-8 meta string")),
500                }
501            }
502        }?;
503        MetaString::new(
504            str,
505            encoding,
506            Vec::from(encoded_data),
507            self.special_char1,
508            self.special_char2,
509        )
510    }
511
512    fn decode_lower_special(&self, data: &[u8]) -> Result<String, Error> {
513        let mut decoded = String::new();
514        let total_bits: usize = data.len() * 8;
515        let strip_last_char = (data[0] & 0x80) != 0;
516        let bit_mask: usize = 0b11111;
517        let mut bit_index = 1;
518        while bit_index + 5 <= total_bits && !(strip_last_char && (bit_index + 2 * 5 > total_bits))
519        {
520            let byte_index = bit_index / 8;
521            let intra_byte_index = bit_index % 8;
522            let char_value: usize = if intra_byte_index > 3 {
523                ((((data[byte_index] as usize) << 8)
524                    | if byte_index + 1 < data.len() {
525                        data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF
526                    } else {
527                        0
528                    })
529                    >> (11 - intra_byte_index))
530                    & bit_mask
531            } else {
532                ((data[byte_index] as usize) >> (3 - intra_byte_index)) & bit_mask
533            };
534            bit_index += 5;
535            decoded.push(self.decode_lower_special_char(char_value as u8)?);
536        }
537        Ok(decoded)
538    }
539
540    fn decode_lower_upper_digit_special(&self, data: &[u8]) -> Result<String, Error> {
541        let mut decoded = String::new();
542        let num_bits = data.len() * 8;
543        let strip_last_char = (data[0] & 0x80) != 0;
544        let mut bit_index = 1;
545        let bit_mask: usize = 0b111111;
546        while bit_index + 6 <= num_bits && !(strip_last_char && (bit_index + 2 * 6 > num_bits)) {
547            let byte_index = bit_index / 8;
548            let intra_byte_index = bit_index % 8;
549            let char_value: usize = if intra_byte_index > 2 {
550                ((((data[byte_index] as usize) << 8)
551                    | if byte_index + 1 < data.len() {
552                        data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF
553                    } else {
554                        0
555                    })
556                    >> (10 - intra_byte_index))
557                    & bit_mask
558            } else {
559                ((data[byte_index] as usize) >> (2 - intra_byte_index)) & bit_mask
560            };
561            bit_index += 6;
562            decoded.push(self.decode_lower_upper_digit_special_char(char_value as u8)?);
563        }
564        Ok(decoded)
565    }
566
567    fn decode_lower_special_char(&self, char_value: u8) -> Result<char, Error> {
568        match char_value {
569            0..=25 => Ok((b'a' + char_value) as char),
570            26 => Ok('.'),
571            27 => Ok('_'),
572            28 => Ok('$'),
573            29 => Ok('|'),
574            _ => Err(Error::encode_error(format!(
575                "Invalid character value for LOWER_SPECIAL decoding: {char_value}",
576            )))?,
577        }
578    }
579
580    fn decode_lower_upper_digit_special_char(&self, char_value: u8) -> Result<char, Error> {
581        match char_value {
582            0..=25 => Ok((b'a' + char_value) as char),
583            26..=51 => Ok((b'A' + char_value - 26) as char),
584            52..=61 => Ok((b'0' + char_value - 52) as char),
585            62 => Ok(self.special_char1),
586            63 => Ok(self.special_char2),
587            _ => Err(Error::encode_error(format!(
588                "Invalid character value for LOWER_UPPER_DIGIT_SPECIAL decoding: {char_value}",
589            )))?,
590        }
591    }
592
593    fn decode_rep_first_lower_special(&self, data: &[u8]) -> Result<String, Error> {
594        let decoded_str = self.decode_lower_special(data)?;
595        let mut chars = decoded_str.chars();
596        match chars.next() {
597            Some(first_char) => {
598                let mut result = first_char.to_ascii_uppercase().to_string();
599                result.extend(chars);
600                Ok(result)
601            }
602            None => Ok(decoded_str),
603        }
604    }
605    fn decode_rep_all_to_lower_special(&self, data: &[u8]) -> Result<String, Error> {
606        let decoded_str = self.decode_lower_special(data)?;
607        let mut result = String::new();
608        let mut skip = false;
609        for (i, char) in decoded_str.chars().enumerate() {
610            if skip {
611                skip = false;
612                continue;
613            }
614            // Encounter a '|', capitalize the next character
615            // and skip the following character.
616            if char == '|' {
617                if let Some(next_char) = decoded_str.chars().nth(i + 1) {
618                    result.push(next_char.to_ascii_uppercase());
619                }
620                skip = true;
621            } else {
622                result.push(char);
623            }
624        }
625        Ok(result)
626    }
627}