Struct magnus::encoding::RbEncoding

source · [−]

#[repr(transparent)]
pub struct RbEncoding(_);

Expand description

Ruby’s internal encoding type.

This type contains the data for an encoding, and is used with operations such as converting a string from one encoding to another, or reading a string character by character.

Implementations

source

impl RbEncoding

source

pub fn ascii8bit() -> Self

Returns the encoding that represents ASCII-8BIT a.k.a. binary.

source

pub fn utf8() -> Self

Returns the encoding that represents UTF-8.

source

pub fn usascii() -> Self

Returns the encoding that represents US-ASCII.

source

pub fn locale() -> Self

Returns the encoding that represents the process’ current locale.

This is dynamic. If you change the process’ locale, that should also change the return value of this function.

source

pub fn filesystem() -> Self

Returns the filesystem encoding.

This is the encoding that Ruby expects data from the OS’ file system to be encoded as, such as directory names.

source

pub fn default_external() -> Self

Returns the default external encoding.

This is the encoding used for anything out-of-process, such as reading from files or sockets.

source

pub fn default_internal() -> Option<Self>

Returns the default internal encoding.

If set, any out-of-process data is transcoded from the default external encoding to the default internal encoding.

source

pub fn find(name: &str) -> Option<Self>

Returns the encoding whose name or alias is name.

pub fn name(&self) -> &str

Returns the canonical name of the encoding.

Examples

use magnus::{eval, encoding::RbEncoding};

assert_eq!(RbEncoding::utf8().name(), "UTF-8");

Panics

Panics if the name is not valid UTF-8. Encoding names are expected to be ASCII only.

source

pub fn mbminlen(&self) -> usize

Returns the minimum number of bytes the encoding needs to represent a single character.

pub fn mbmaxlen(&self) -> usize

Returns the maximum number of bytes the encoding may need to represent a single character.

pub fn mbclen(&self, slice: &[u8 ]) -> usize

Returns the number of bytes of the first character in slice.

If the first byte of slice is midway through a character, this will return the number of bytes until the next character boundary.

If the slice ends before the last byte of the character this will return the number of bytes until the end of the slice.

See also fast_mbclen and precise_mbclen.

pub fn fast_mbclen(&self, slice: &[u8 ]) -> usize

Returns the number of bytes of the first character in slice.

If the first byte of slice is midway through a character, this will return the number of bytes until the next character boundary.

If the slice ends before the last byte of the character this will return the theoretical number of bytes until the end of the character, which will be past the end of the slice. If the string has been read from an IO source this may indicate more data needs to be read.

Examples

use magnus::{eval, encoding::{EncodingCapable, RbEncoding}, RString};

let s = RString::new("🦀 café");
let encoding: RbEncoding = s.enc_get().into();
let mut chars = 0;

unsafe {
    let mut bytes = s.as_slice();
    assert_eq!(bytes.len(), 10);

    while !bytes.is_empty() {
        chars += 1;
        let len = encoding.fast_mbclen(bytes);
        bytes = &bytes[len..];
    }
}

assert_eq!(chars, 6);

source

pub fn precise_mbclen(&self, slice: &[u8 ]) -> MbcLen

Returns the number of bytes of the first character in slice.

Examples

use magnus::{eval, encoding::{EncodingCapable, MbcLen, RbEncoding}, RString};

let s = RString::new("🦀 café");
let encoding: RbEncoding = s.enc_get().into();
let mut chars = 0;

unsafe {
    let mut bytes = s.as_slice();
    assert_eq!(bytes.len(), 10);

    while !bytes.is_empty() {
        chars += 1;
        match encoding.precise_mbclen(bytes) {
            MbcLen::CharFound(len) => bytes = &bytes[len..],
            MbcLen::NeedMore(len) => panic!("Met end of string expecting {} bytes", len),
            MbcLen::Invalid => panic!("corrupted string"),
        }
    }
}

assert_eq!(chars, 6);

source

pub fn ascget(&self, slice: &[u8 ]) -> Option<(u8, usize )>

If the first character in slice is included in ASCII, returns it and its encoded length in slice; otherwise returns None.

Typically the length will be 1, but some encodings such as UTF-16 will encode ASCII characters in 2 bytes.

Examples

use magnus::{eval, encoding::{EncodingCapable, RbEncoding}, RString};

let s = RString::new("example");
let encoding: RbEncoding = s.enc_get().into();
let mut chars = Vec::new();

unsafe {
    let mut bytes = s.as_slice();

    while !bytes.is_empty() {
        match encoding.ascget(bytes) {
            Some((char, len)) => {
                chars.push(char);
                bytes = &bytes[len..];
            }
            None => panic!("string not ASCII"),
        }
    }
}

assert_eq!(chars, [101, 120, 97, 109, 112, 108, 101]);

source

pub fn codepoint_len(&self, slice: &[u8 ]) -> Result<(u32, usize ), Error>

Returns the codepoint and length in bytes of the first character in slice.

Examples

use magnus::{eval, encoding::{EncodingCapable, RbEncoding}, RString};

let s = RString::new("🦀 café");
let encoding: RbEncoding = s.enc_get().into();
let mut codepoints = Vec::new();

unsafe {
    let mut bytes = s.as_slice();

    while !bytes.is_empty() {
        let (codepoint, len) = encoding.codepoint_len(bytes).unwrap();
        codepoints.push(codepoint);
        bytes = &bytes[len..];
    }
}

assert_eq!(codepoints, [129408, 32, 99, 97, 102, 233]);

source

pub fn codelen(&self, code: u32) -> Result<usize, Error>

Returns the number of bytes required to represent the code point code in the encoding of self.

Examples

use magnus::{eval, encoding::RbEncoding};

assert_eq!(RbEncoding::utf8().codelen(97).unwrap(), 1);
assert_eq!(RbEncoding::utf8().codelen(129408).unwrap(), 4);

source

pub fn chr(&self, code: u32) -> Result<RString, Error>

Encodes the codepoint code as a series of bytes in the encoding self and returns the result as a Ruby string.

Examples

use magnus::{eval, encoding::RbEncoding};

let c = RbEncoding::usascii().chr(97).unwrap();
let res: bool = eval!(r#"c == "a""#, c).unwrap();
assert!(res);

use magnus::{eval, encoding::RbEncoding};

let c = RbEncoding::utf8().chr(129408).unwrap();
let res: bool = eval!(r#"c == "🦀""#, c).unwrap();
assert!(res);

source

pub fn is_mbc_newline(&self, slice: &[u8 ]) -> bool

Returns true if the first character in slice is a newline in the encoding self, false otherwise.

Examples

use magnus::{eval, encoding::RbEncoding};

assert!(RbEncoding::utf8().is_mbc_newline(&[10]));
assert!(!RbEncoding::utf8().is_mbc_newline(&[32]));

source

pub fn is_code_ctype(&self, code: u32, ctype: CType) -> bool

Returns whether the given codepoint code is of the character type ctype in the encoding self.

Examples

use magnus::{eval, encoding::{CType, RbEncoding}};

assert!(RbEncoding::utf8().is_code_ctype(9, CType::Space));   // "\t"
assert!(RbEncoding::utf8().is_code_ctype(32, CType::Space));  // " "
assert!(!RbEncoding::utf8().is_code_ctype(65, CType::Space)); // "A"
assert!(RbEncoding::utf8().is_code_ctype(65, CType::Alnum));  // "A"
assert!(RbEncoding::utf8().is_code_ctype(65, CType::Upper));  // "A"