Skip to main content

fory_core/meta/
meta_string.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::ensure;
19use crate::error::Error;
20use crate::meta::string_util;
21use std::sync::OnceLock;
22
// Exclusive upper bound on the byte length of a meta string accepted by the
// encoder. Numerically equal to `i16::MAX` (32767).
const SHORT_MAX_VALUE: usize = i16::MAX as usize;
25// const HEADER_MASK:i64 = 0xff;
26
27pub static NAMESPACE_ENCODER: MetaStringEncoder = MetaStringEncoder::new('.', '_');
28pub static TYPE_NAME_ENCODER: MetaStringEncoder = MetaStringEncoder::new('$', '_');
29pub static FIELD_NAME_ENCODER: MetaStringEncoder = MetaStringEncoder::new('$', '_');
30
31pub static NAMESPACE_DECODER: MetaStringDecoder = MetaStringDecoder::new('.', '_');
32pub static FIELD_NAME_DECODER: MetaStringDecoder = MetaStringDecoder::new('$', '_');
33pub static TYPE_NAME_DECODER: MetaStringDecoder = MetaStringDecoder::new('$', '_');
34
/// Encoding scheme used for the bytes of a [`MetaString`].
///
/// The discriminants are fixed by `#[repr(i16)]` and the explicit values below.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
#[repr(i16)]
pub enum Encoding {
    /// Raw UTF-8 bytes; the fallback when no compact encoding applies.
    #[default]
    Utf8 = 0,
    /// 5 bits per character: `a`-`z` plus `.`, `_`, `$` and `|`.
    LowerSpecial = 1,
    /// 6 bits per character: `a`-`z`, `A`-`Z`, `0`-`9` plus the two
    /// encoder-specific special characters.
    LowerUpperDigitSpecial = 2,
    /// `LowerSpecial` applied after lower-casing the first character.
    FirstToLowerSpecial = 3,
    /// `LowerSpecial` applied after rewriting every uppercase character as
    /// `|` followed by its lowercase form.
    AllToLowerSpecial = 4,
}
45
46#[derive(Debug, Clone, Default)]
47pub struct MetaString {
48    pub original: String,
49    pub encoding: Encoding,
50    pub bytes: Vec<u8>,
51    pub strip_last_char: bool,
52    pub special_char1: char,
53    pub special_char2: char,
54}
55
56impl PartialEq for MetaString {
57    fn eq(&self, other: &Self) -> bool {
58        self.bytes == other.bytes
59    }
60}
61
62impl Eq for MetaString {}
63
64impl std::hash::Hash for MetaString {
65    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
66        self.bytes.hash(state);
67    }
68}
69
70static EMPTY: OnceLock<MetaString> = OnceLock::new();
71
72impl MetaString {
73    pub fn new(
74        original: String,
75        encoding: Encoding,
76        bytes: Vec<u8>,
77        special_char1: char,
78        special_char2: char,
79    ) -> Result<Self, Error> {
80        let mut strip_last_char = false;
81        if encoding != Encoding::Utf8 {
82            if bytes.is_empty() {
83                return Err(Error::encode_error("Encoded data cannot be empty"));
84            }
85            strip_last_char = (bytes[0] & 0x80) != 0;
86        }
87        Ok(MetaString {
88            original,
89            encoding,
90            bytes,
91            strip_last_char,
92            special_char1,
93            special_char2,
94        })
95    }
96
97    pub fn write_to(&self, writer: &mut crate::buffer::Writer) {
98        writer.write_var_uint32(self.bytes.len() as u32);
99        writer.write_bytes(&self.bytes);
100    }
101
102    pub fn get_empty() -> &'static MetaString {
103        EMPTY.get_or_init(|| MetaString {
104            original: "".to_string(),
105            encoding: Encoding::default(),
106            bytes: vec![],
107            strip_last_char: false,
108            special_char1: '\0',
109            special_char2: '\0',
110        })
111    }
112}
113
/// Decodes meta string bytes back into text.
///
/// The two special characters are the symbols produced for the 6-bit values
/// 62 and 63 of the LowerUpperDigitSpecial alphabet.
#[derive(Debug, Clone)]
pub struct MetaStringDecoder {
    /// Character decoded from the 6-bit value 62.
    pub special_char1: char,
    /// Character decoded from the 6-bit value 63.
    pub special_char2: char,
}
119
/// Encodes text into the compact meta string representations.
///
/// The two special characters are the symbols mapped to the 6-bit values 62
/// and 63 of the LowerUpperDigitSpecial alphabet.
#[derive(Debug, Clone)]
pub struct MetaStringEncoder {
    /// Character encoded as the 6-bit value 62.
    pub special_char1: char,
    /// Character encoded as the 6-bit value 63.
    pub special_char2: char,
}
125
/// Character statistics gathered in one pass over an input string and used to
/// choose an encoding.
#[derive(Debug)]
struct StringStatistics {
    /// Number of ASCII digits in the input.
    digit_count: usize,
    /// Number of uppercase characters in the input.
    upper_count: usize,
    /// `true` while every character fits the 6-bit alphabet (letters, digits,
    /// or one of the encoder's two special characters).
    can_lower_upper_digit_special_encoded: bool,
    /// `true` while every character fits the 5-bit alphabet (lowercase
    /// letters or one of `.`, `_`, `$`, `|`).
    can_lower_special_encoded: bool,
}
133
134impl MetaStringEncoder {
135    pub const fn new(special_char1: char, special_char2: char) -> Self {
136        Self {
137            special_char1,
138            special_char2,
139        }
140    }
141
142    fn is_latin(&self, s: &str) -> bool {
143        string_util::is_latin(s)
144    }
145
146    fn _encode(&self, input: &str) -> Result<Option<MetaString>, Error> {
147        if input.is_empty() {
148            return Ok(Some(MetaString::new(
149                input.to_string(),
150                Encoding::Utf8,
151                vec![],
152                self.special_char1,
153                self.special_char2,
154            )?));
155        }
156
157        ensure!(
158            input.len() < SHORT_MAX_VALUE,
159            Error::encode_error(format!(
160                "Meta string is too long, max:{SHORT_MAX_VALUE}, current:{}",
161                input.len()
162            ))
163        );
164
165        if !self.is_latin(input) {
166            return Ok(Some(MetaString::new(
167                input.to_string(),
168                Encoding::Utf8,
169                input.as_bytes().to_vec(),
170                self.special_char1,
171                self.special_char2,
172            )?));
173        }
174
175        Ok(None)
176    }
177
178    pub fn encode(&self, input: &str) -> Result<MetaString, Error> {
179        if let Some(ms) = self._encode(input)? {
180            return Ok(ms);
181        }
182        let encoding = self.compute_encoding(input, None);
183        self.encode_with_encoding(input, encoding)
184    }
185
186    pub fn encode_with_encodings(
187        &self,
188        input: &str,
189        encodings: &[Encoding],
190    ) -> Result<MetaString, Error> {
191        if let Some(ms) = self._encode(input)? {
192            return Ok(ms);
193        }
194        let encoding = self.compute_encoding(input, Some(encodings));
195        self.encode_with_encoding(input, encoding)
196    }
197
198    fn compute_encoding(&self, input: &str, encodings: Option<&[Encoding]>) -> Encoding {
199        let allow = |e: Encoding| encodings.map_or(true, |opts| opts.contains(&e));
200        let statistics = self.compute_statistics(input);
201        if statistics.can_lower_special_encoded && allow(Encoding::LowerSpecial) {
202            return Encoding::LowerSpecial;
203        }
204        if statistics.can_lower_upper_digit_special_encoded {
205            if statistics.digit_count != 0 && allow(Encoding::LowerUpperDigitSpecial) {
206                return Encoding::LowerUpperDigitSpecial;
207            }
208            let upper_count: usize = statistics.upper_count;
209            if upper_count == 1
210                && input.chars().next().unwrap().is_uppercase()
211                && allow(Encoding::FirstToLowerSpecial)
212            {
213                return Encoding::FirstToLowerSpecial;
214            }
215            if ((input.len() + upper_count) * 5) < (input.len() * 6)
216                && allow(Encoding::AllToLowerSpecial)
217            {
218                return Encoding::AllToLowerSpecial;
219            }
220            if allow(Encoding::LowerUpperDigitSpecial) {
221                return Encoding::LowerUpperDigitSpecial;
222            }
223        }
224        Encoding::Utf8
225    }
226
227    fn compute_statistics(&self, chars: &str) -> StringStatistics {
228        let mut can_lower_upper_digit_special_encoded = true;
229        let mut can_lower_special_encoded = true;
230        let mut digit_count = 0;
231        let mut upper_count = 0;
232        for c in chars.chars() {
233            if can_lower_upper_digit_special_encoded
234                && !(c.is_lowercase()
235                    || c.is_uppercase()
236                    || c.is_ascii_digit()
237                    || (c == self.special_char1 || c == self.special_char2))
238            {
239                can_lower_upper_digit_special_encoded = false;
240            }
241            if can_lower_special_encoded
242                && !(c.is_lowercase() || matches!(c, '.' | '_' | '$' | '|'))
243            {
244                can_lower_special_encoded = false;
245            }
246            if c.is_ascii_digit() {
247                digit_count += 1;
248            }
249            if c.is_uppercase() {
250                upper_count += 1;
251            }
252        }
253        StringStatistics {
254            digit_count,
255            upper_count,
256            can_lower_upper_digit_special_encoded,
257            can_lower_special_encoded,
258        }
259    }
260
261    pub fn encode_with_encoding(
262        &self,
263        input: &str,
264        encoding: Encoding,
265    ) -> Result<MetaString, Error> {
266        if input.is_empty() {
267            return MetaString::new(
268                input.to_string(),
269                Encoding::Utf8,
270                vec![],
271                self.special_char1,
272                self.special_char2,
273            );
274        }
275        ensure!(
276            input.len() < SHORT_MAX_VALUE,
277            Error::encode_error(format!(
278                "Meta string is too long, max:{SHORT_MAX_VALUE}, current:{}",
279                input.len()
280            ))
281        );
282        ensure!(
283            encoding == Encoding::Utf8 || self.is_latin(input),
284            Error::encode_error("Non-ASCII characters in meta string are not allowed")
285        );
286
287        if input.is_empty() {
288            return MetaString::new(
289                input.to_string(),
290                Encoding::Utf8,
291                vec![],
292                self.special_char1,
293                self.special_char2,
294            );
295        };
296
297        match encoding {
298            Encoding::LowerSpecial => {
299                let encoded_data = self.encode_lower_special(input)?;
300                MetaString::new(
301                    input.to_string(),
302                    encoding,
303                    encoded_data,
304                    self.special_char1,
305                    self.special_char2,
306                )
307            }
308            Encoding::LowerUpperDigitSpecial => {
309                let encoded_data = self.encode_lower_upper_digit_special(input)?;
310                MetaString::new(
311                    input.to_string(),
312                    encoding,
313                    encoded_data,
314                    self.special_char1,
315                    self.special_char2,
316                )
317            }
318            Encoding::FirstToLowerSpecial => {
319                let encoded_data = self.encode_first_to_lower_special(input)?;
320                MetaString::new(
321                    input.to_string(),
322                    encoding,
323                    encoded_data,
324                    self.special_char1,
325                    self.special_char2,
326                )
327            }
328            Encoding::AllToLowerSpecial => {
329                let upper_count = input.chars().filter(|c| c.is_uppercase()).count();
330                let encoded_data = self.encode_all_to_lower_special(input, upper_count)?;
331                MetaString::new(
332                    input.to_string(),
333                    encoding,
334                    encoded_data,
335                    self.special_char1,
336                    self.special_char2,
337                )
338            }
339            Encoding::Utf8 => {
340                let encoded_data = input.as_bytes().to_vec();
341                MetaString::new(
342                    input.to_string(),
343                    Encoding::Utf8,
344                    encoded_data,
345                    self.special_char1,
346                    self.special_char2,
347                )
348            }
349        }
350    }
351
352    fn encode_generic(&self, input: &str, bits_per_char: u8) -> Result<Vec<u8>, Error> {
353        let total_bits: usize = input.len() * bits_per_char as usize + 1;
354        let byte_length: usize = (total_bits + 7) / 8;
355        let mut bytes = vec![0; byte_length];
356        let mut current_bit = 1;
357        for c in input.chars() {
358            let value = self.char_to_value(c, bits_per_char)?;
359            for i in (0..bits_per_char).rev() {
360                if (value & (1 << i)) != 0 {
361                    let byte_pos: usize = current_bit / 8;
362                    let bit_pos: usize = current_bit % 8;
363                    bytes[byte_pos] |= 1 << (7 - bit_pos);
364                }
365                current_bit += 1;
366            }
367        }
368        if byte_length * 8 >= total_bits + bits_per_char as usize {
369            bytes[0] |= 0x80;
370        }
371        Ok(bytes)
372    }
373    pub fn encode_lower_special(&self, input: &str) -> Result<Vec<u8>, Error> {
374        self.encode_generic(input, 5)
375    }
376
377    pub fn encode_lower_upper_digit_special(&self, input: &str) -> Result<Vec<u8>, Error> {
378        self.encode_generic(input, 6)
379    }
380
381    pub fn encode_first_to_lower_special(&self, input: &str) -> Result<Vec<u8>, Error> {
382        if input.is_empty() {
383            return self.encode_generic("", 5);
384        }
385
386        let mut iter = input.char_indices();
387        let (first_idx, first_char) = iter.next().unwrap();
388
389        let lower = first_char.to_lowercase().to_string();
390
391        // Fast path: if lowercase has the same byte length and is ASCII,
392        // we can modify the first byte directly without rebuilding the string.
393        if lower.len() == first_char.len_utf8() && first_char.is_ascii() {
394            let mut bytes = input.as_bytes().to_owned();
395            bytes[first_idx] = lower.as_bytes()[0];
396            return self.encode_generic(std::str::from_utf8(&bytes).unwrap(), 5);
397        }
398
399        // rebuild only the necessary prefix + suffix (still efficient).
400        let (_, rest) = input.split_at(first_char.len_utf8());
401        let mut result = String::with_capacity(input.len() + lower.len() - first_char.len_utf8());
402        result.push_str(&lower);
403        result.push_str(rest);
404        self.encode_generic(&result, 5)
405    }
406
407    pub fn encode_all_to_lower_special(
408        &self,
409        input: &str,
410        upper_count: usize,
411    ) -> Result<Vec<u8>, Error> {
412        let mut new_chars = Vec::with_capacity(input.len() + upper_count);
413        for c in input.chars() {
414            if c.is_uppercase() {
415                new_chars.push('|');
416                new_chars.push(c.to_lowercase().next().unwrap());
417            } else {
418                new_chars.push(c);
419            }
420        }
421        self.encode_generic(&new_chars.iter().collect::<String>(), 5)
422    }
423
424    fn char_to_value(&self, c: char, bits_per_char: u8) -> Result<u8, Error> {
425        match bits_per_char {
426            5 => match c {
427                'a'..='z' => Ok(c as u8 - b'a'),
428                '.' => Ok(26),
429                '_' => Ok(27),
430                '$' => Ok(28),
431                '|' => Ok(29),
432                _ => Err(Error::encode_error(format!(
433                    "Unsupported character for LOWER_UPPER_DIGIT_SPECIAL encoding: {c}",
434                )))?,
435            },
436            6 => match c {
437                'a'..='z' => Ok(c as u8 - b'a'),
438                'A'..='Z' => Ok(c as u8 - b'A' + 26),
439                '0'..='9' => Ok(c as u8 - b'0' + 52),
440                _ => {
441                    if c == self.special_char1 {
442                        Ok(62)
443                    } else if c == self.special_char2 {
444                        Ok(63)
445                    } else {
446                        Err(Error::encode_error(format!(
447                            "Invalid character value for LOWER_SPECIAL decoding: {c:?}",
448                        )))?
449                    }
450                }
451            },
452            _ => unreachable!(),
453        }
454    }
455}
456
457impl MetaStringDecoder {
458    pub const fn new(special_char1: char, special_char2: char) -> Self {
459        MetaStringDecoder {
460            special_char1,
461            special_char2,
462        }
463    }
464
465    pub fn decode(&self, encoded_data: &[u8], encoding: Encoding) -> Result<MetaString, Error> {
466        let str = {
467            if encoded_data.is_empty() {
468                Ok("".to_string())
469            } else {
470                match encoding {
471                    Encoding::LowerSpecial => self.decode_lower_special(encoded_data),
472                    Encoding::LowerUpperDigitSpecial => {
473                        self.decode_lower_upper_digit_special(encoded_data)
474                    }
475                    Encoding::FirstToLowerSpecial => {
476                        self.decode_rep_first_lower_special(encoded_data)
477                    }
478                    Encoding::AllToLowerSpecial => {
479                        self.decode_rep_all_to_lower_special(encoded_data)
480                    }
481                    Encoding::Utf8 => Ok(String::from_utf8_lossy(encoded_data).into_owned()),
482                }
483            }
484        }?;
485        MetaString::new(
486            str,
487            encoding,
488            Vec::from(encoded_data),
489            self.special_char1,
490            self.special_char2,
491        )
492    }
493
494    fn decode_lower_special(&self, data: &[u8]) -> Result<String, Error> {
495        let mut decoded = String::new();
496        let total_bits: usize = data.len() * 8;
497        let strip_last_char = (data[0] & 0x80) != 0;
498        let bit_mask: usize = 0b11111;
499        let mut bit_index = 1;
500        while bit_index + 5 <= total_bits && !(strip_last_char && (bit_index + 2 * 5 > total_bits))
501        {
502            let byte_index = bit_index / 8;
503            let intra_byte_index = bit_index % 8;
504            let char_value: usize = if intra_byte_index > 3 {
505                ((((data[byte_index] as usize) << 8)
506                    | if byte_index + 1 < data.len() {
507                        data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF
508                    } else {
509                        0
510                    })
511                    >> (11 - intra_byte_index))
512                    & bit_mask
513            } else {
514                ((data[byte_index] as usize) >> (3 - intra_byte_index)) & bit_mask
515            };
516            bit_index += 5;
517            decoded.push(self.decode_lower_special_char(char_value as u8)?);
518        }
519        Ok(decoded)
520    }
521
522    fn decode_lower_upper_digit_special(&self, data: &[u8]) -> Result<String, Error> {
523        let mut decoded = String::new();
524        let num_bits = data.len() * 8;
525        let strip_last_char = (data[0] & 0x80) != 0;
526        let mut bit_index = 1;
527        let bit_mask: usize = 0b111111;
528        while bit_index + 6 <= num_bits && !(strip_last_char && (bit_index + 2 * 6 > num_bits)) {
529            let byte_index = bit_index / 8;
530            let intra_byte_index = bit_index % 8;
531            let char_value: usize = if intra_byte_index > 2 {
532                ((((data[byte_index] as usize) << 8)
533                    | if byte_index + 1 < data.len() {
534                        data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF
535                    } else {
536                        0
537                    })
538                    >> (10 - intra_byte_index))
539                    & bit_mask
540            } else {
541                ((data[byte_index] as usize) >> (2 - intra_byte_index)) & bit_mask
542            };
543            bit_index += 6;
544            decoded.push(self.decode_lower_upper_digit_special_char(char_value as u8)?);
545        }
546        Ok(decoded)
547    }
548
549    fn decode_lower_special_char(&self, char_value: u8) -> Result<char, Error> {
550        match char_value {
551            0..=25 => Ok((b'a' + char_value) as char),
552            26 => Ok('.'),
553            27 => Ok('_'),
554            28 => Ok('$'),
555            29 => Ok('|'),
556            _ => Err(Error::encode_error(format!(
557                "Invalid character value for LOWER_SPECIAL decoding: {char_value}",
558            )))?,
559        }
560    }
561
562    fn decode_lower_upper_digit_special_char(&self, char_value: u8) -> Result<char, Error> {
563        match char_value {
564            0..=25 => Ok((b'a' + char_value) as char),
565            26..=51 => Ok((b'A' + char_value - 26) as char),
566            52..=61 => Ok((b'0' + char_value - 52) as char),
567            62 => Ok(self.special_char1),
568            63 => Ok(self.special_char2),
569            _ => Err(Error::encode_error(format!(
570                "Invalid character value for LOWER_UPPER_DIGIT_SPECIAL decoding: {char_value}",
571            )))?,
572        }
573    }
574
575    fn decode_rep_first_lower_special(&self, data: &[u8]) -> Result<String, Error> {
576        let decoded_str = self.decode_lower_special(data)?;
577        let mut chars = decoded_str.chars();
578        match chars.next() {
579            Some(first_char) => {
580                let mut result = first_char.to_ascii_uppercase().to_string();
581                result.extend(chars);
582                Ok(result)
583            }
584            None => Ok(decoded_str),
585        }
586    }
587    fn decode_rep_all_to_lower_special(&self, data: &[u8]) -> Result<String, Error> {
588        let decoded_str = self.decode_lower_special(data)?;
589        let mut result = String::new();
590        let mut skip = false;
591        for (i, char) in decoded_str.chars().enumerate() {
592            if skip {
593                skip = false;
594                continue;
595            }
596            // Encounter a '|', capitalize the next character
597            // and skip the following character.
598            if char == '|' {
599                if let Some(next_char) = decoded_str.chars().nth(i + 1) {
600                    result.push(next_char.to_ascii_uppercase());
601                }
602                skip = true;
603            } else {
604                result.push(char);
605            }
606        }
607        Ok(result)
608    }
609}