rwkv_tokenizer/
lib.rs

1mod trie;
2use std::{str, env};
3use std::collections::HashMap;
4use std::fs::File;
5use std::io::{self, BufRead};
6use std::path::{Path};
7use std::str::Utf8Error;
8use regex::Regex;
9use trie::Trie;
10use unescape::unescape;
11use rayon::prelude::*;
12
13
14#[derive(Debug)]
15pub struct WorldTokenizer {
16    tokens: Vec<Vec<u8>>,
17    trie: Trie
18}
19
20impl WorldTokenizer {
21    pub fn new(vocab_filepath: Option<&str>) -> io::Result<Self> {
22        let mut tokenizer = WorldTokenizer {
23            tokens: Vec::new(),
24            trie: Trie::new()
25        };
26        let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("assets").join("rwkv_vocab_v20230424.txt");
27        let vocab_filepath = vocab_filepath.unwrap_or(manifest_path.to_str().unwrap());
28        let file = File::open(vocab_filepath)?;
29        let reader = io::BufReader::new(file);
30
31        let re = Regex::new(r"(\d+)\s+(b?)(.+)\s+(\d+)").unwrap();
32        tokenizer.tokens.push(vec![0]);
33        for line in reader.lines() {
34            let line = line?;
35            if let Some(captures) = re.captures(&line) {
36                let id = captures[1].parse::<u16>().unwrap();
37                let is_byte = captures[2].to_string();
38                let length = captures[4].parse::<usize>().unwrap();
39                let mut string: String = captures[3].to_string();
40                string = string[1..string.len()-1].parse().unwrap();
41                let sbytes: Vec<u8>;
42                if is_byte.len() == 0 {
43                    string = unescape(string.as_str()).unwrap();
44                    sbytes = string.clone().into_bytes();
45                    tokenizer.tokens.push(Vec::from(string.as_bytes()));
46                } else {
47                    sbytes = WorldTokenizer::hex_to_bytes(string.as_str()).unwrap();
48                    tokenizer.tokens.push(sbytes.clone());
49                }
50                assert_eq!(sbytes.len(), length);
51                tokenizer.trie.insert(&sbytes, id);
52            }
53            else {
54                println!("Line with issue: {:?}", line)
55            }
56        }
57        Ok(tokenizer)
58    }
59
60    pub fn encode(&self, word: &str) -> Vec<u16> {
61        self.trie.tokenize(word)
62    }
63
64    pub fn encode_batch(&self, word_list: Vec<String>) -> Vec<Vec<u16>> {
65        word_list.par_iter().map(|word| self.trie.tokenize(word)).collect()
66    }
67
68    pub fn decode(&self, vec: Vec<u16>) -> Result<String, Utf8Error> {
69        let mut result: Vec<u8> = Vec::new();
70        for index in vec.iter() {
71            let mut current_tokens = self.tokens[*index as usize].clone();
72            result.append(&mut current_tokens);
73        }
74        Ok(str::from_utf8(&*result)?.to_string())
75    }
76
77    pub fn vocab_size(&self) -> usize {
78        self.tokens.len()
79    }
80
81    pub fn get_vocab(&self) -> HashMap<String, usize> {
82        let mut vocabularies: HashMap<String, usize> = HashMap::new();
83        for (index, value) in self.tokens.iter().enumerate() {
84            let text: String = String::from_utf8((*value).to_owned()).unwrap_or_else(|_e| "Binary string (TODO)".to_string());
85            vocabularies.insert(text, index);
86        }
87        vocabularies
88    }
89
90    fn hex_to_bytes(hex: &str) -> Option<Vec<u8>> {
91        let hex = hex.replace("\\x", "");
92        if hex.len() % 2 == 0 {
93            (0..hex.len())
94                .step_by(2)
95                .map(|i| hex.get(i..i + 2)
96                    .and_then(|sub| u8::from_str_radix(sub, 16).ok()))
97                .collect()
98        } else {
99            None
100        }
101    }
102}
103
104#[cfg(test)]
105mod tests {
106    use super::*;
107    const BEAUTIFUL_DAY: &str = "Today is a beautiful day. 今天是美好的一天。";
108
109    const JAPANESE: &str = "起業家イーロン・マスク氏が創業した宇宙開発企業「スペースX（エックス）」の巨大新型ロケット「スターシップ」が20日朝、初めて打ち上げられたが、爆発した。
110打ち上げは米テキサス州の東海岸で行われた。無人の試験で、負傷者はいなかった。
111打ち上げから2～3分後、史上最大のロケットが制御不能になり、まもなく搭載された装置で破壊された。
112マスク氏は、数カ月後に再挑戦すると表明した。
113スペースXのエンジニアたちは、それでもこの日のミッションは成功だったとしている。「早期に頻繁に試験する」ことを好む人たちなので、破壊を恐れていない。次のフライトに向け、大量のデータを収集したはずだ。2機目のスターシップは、ほぼ飛行準備が整っている。
114マスク氏は、「SpaceXチームの皆さん、スターシップのエキサイティングな試験打ち上げ、おめでとう！　数カ月後に行われる次の試験打ち上げに向けて、多くを学んだ」とツイートした。
115アメリカでのロケット打ち上げを認可する米連邦航空局（NASA）は、事故調査を監督するとした。広報担当者は、飛行中に機体が失われた場合の通常の対応だと述べた。
116マスク氏は打ち上げ前、期待値を下げようとしていた。発射台の設備を破壊せずに機体を打ち上げるだけでも「成功」だとしていた。
117その願いはかなった。スターシップは打ち上げ施設からどんどん上昇し、メキシコ湾の上空へと向かっていった。しかし1分もしないうち、すべてが計画通りに進んでいるのではないことが明らかになった。";
118
119    const LONG_UTF8_TEXT: &str = r#"UTF-8 decoder capability and stress test
120----------------------------------------
121
122Markus Kuhn <https://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
123
124This test file can help you examine, how your UTF-8 decoder handles
125various types of correct, malformed, or otherwise interesting UTF-8
126sequences. This file is not meant to be a conformance test. It does
127not prescribe any particular outcome. Therefore, there is no way to
128"pass" or "fail" this test file, even though the text does suggest a
129preferable decoder behaviour at some places. Its aim is, instead, to
130help you think about, and test, the behaviour of your UTF-8 decoder on a
131systematic collection of unusual inputs. Experience so far suggests
132that most first-time authors of UTF-8 decoders find at least one
133serious problem in their decoder using this file.
134
135The test lines below cover boundary conditions, malformed UTF-8
136sequences, as well as correctly encoded UTF-8 sequences of Unicode code
137points that should never occur in a correct UTF-8 file.
138
139According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
140receiving UTF-8 shall interpret a "malformed sequence in the same way
141that it interprets a character that is outside the adopted subset" and
142"characters that are not within the adopted subset shall be indicated
143to the user" by a receiving device. One commonly used approach in
144UTF-8 decoders is to replace any malformed UTF-8 sequence by a
145replacement character (U+FFFD), which looks a bit like an inverted
146question mark, or a similar symbol. It might be a good idea to
147visually distinguish a malformed UTF-8 sequence from a correctly
148encoded Unicode character that is just not available in the current
149font but otherwise fully legal, even though ISO 10646-1 doesn't
150mandate this. In any case, just ignoring malformed sequences or
151unavailable characters does not conform to ISO 10646, will make
152debugging more difficult, and can lead to user confusion.
153
154Please check, whether a malformed UTF-8 sequence is (1) represented at
155all, (2) represented by exactly one single replacement character (or
156equivalent signal), and (3) the following quotation mark after an
157illegal UTF-8 sequence is correctly displayed, i.e. proper
158resynchronization takes place immediately after any malformed
159sequence. This file says "THE END" in the last line, so if you don't
160see that, your decoder crashed somehow before, which should always be
161cause for concern.
162
163All lines in this file are exactly 79 characters long (plus the line
164feed). In addition, all lines end with "|", except for the two test
165lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
166U+0000 and U+007F. If you display this file with a fixed-width font,
167these "|" characters should all line up in column 79 (right margin).
168This allows you to test quickly, whether your UTF-8 decoder finds the
169correct number of characters in every line, that is whether each
170malformed sequences is replaced by a single replacement character.
171
172Note that, as an alternative to the notion of malformed sequence used
173here, it is also a perfectly acceptable (and in some situations even
174preferable) solution to represent each individual byte of a malformed
175sequence with a replacement character. If you follow this strategy in
176your decoder, then please ignore the "|" column.
177
178
179Here come the tests:                                                          |
180                                                                              |
1811  Some correct UTF-8 text                                                    |
182                                                                              |
183You should see the Greek word 'kosme':       "κόσμε"                          |
184                                                                              |
1852  Boundary condition test cases                                              |
186                                                                              |
1872.1  First possible sequence of a certain length                              |
188                                                                              |
1892.1.1  1 byte  (U-00000000):        "�"
1902.1.2  2 bytes (U-00000080):        ""                                       |
1912.1.3  3 bytes (U-00000800):        "ࠀ"                                       |
1922.1.4  4 bytes (U-00010000):        "𐀀"                                       |
1932.1.5  5 bytes (U-00200000):        "�����"                                       |
1942.1.6  6 bytes (U-04000000):        "������"                                       |
195                                                                              |
1962.2  Last possible sequence of a certain length                               |
197                                                                              |
1982.2.1  1 byte  (U-0000007F):        ""
1992.2.2  2 bytes (U-000007FF):        "߿"                                       |
2002.2.3  3 bytes (U-0000FFFF):        ""                                       |
2012.2.4  4 bytes (U-001FFFFF):        "����"                                       |
2022.2.5  5 bytes (U-03FFFFFF):        "�����"                                       |
2032.2.6  6 bytes (U-7FFFFFFF):        "������"                                       |
204                                                                              |
2052.3  Other boundary conditions                                                |
206                                                                              |
2072.3.1  U-0000D7FF = ed 9f bf = "퟿"                                            |
2082.3.2  U-0000E000 = ee 80 80 = ""                                            |
2092.3.3  U-0000FFFD = ef bf bd = "�"                                            |
2102.3.4  U-0010FFFF = f4 8f bf bf = "􏿿"                                         |
2112.3.5  U-00110000 = f4 90 80 80 = "����"                                         |
212                                                                              |
2133  Malformed sequences                                                        |
214                                                                              |
2153.1  Unexpected continuation bytes                                            |
216                                                                              |
217Each unexpected continuation byte should be separately signalled as a         |
218malformed sequence of its own.                                                |
219                                                                              |
2203.1.1  First continuation byte 0x80: "�"                                      |
2213.1.2  Last  continuation byte 0xbf: "�"                                      |
222                                                                              |
2233.1.3  2 continuation bytes: "��"                                             |
2243.1.4  3 continuation bytes: "���"                                            |
2253.1.5  4 continuation bytes: "����"                                           |
2263.1.6  5 continuation bytes: "�����"                                          |
2273.1.7  6 continuation bytes: "������"                                         |
2283.1.8  7 continuation bytes: "�������"                                        |
229                                                                              |
2303.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf):            |
231                                                                              |
232   "����������������                                                          |
233    ����������������                                                          |
234    ����������������                                                          |
235    ����������������"                                                         |
236                                                                              |
2373.2  Lonely start characters                                                  |
238                                                                              |
2393.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf),                    |
240       each followed by a space character:                                    |
241                                                                              |
242   "� � � � � � � � � � � � � � � �                                           |
243    � � � � � � � � � � � � � � � � "                                         |
244                                                                              |
2453.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef),                    |
246       each followed by a space character:                                    |
247                                                                              |
248   "� � � � � � � � � � � � � � � � "                                         |
249                                                                              |
2503.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7),                     |
251       each followed by a space character:                                    |
252                                                                              |
253   "� � � � � � � � "                                                         |
254                                                                              |
2553.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb),                     |
256       each followed by a space character:                                    |
257                                                                              |
258   "� � � � "                                                                 |
259                                                                              |
2603.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd),                     |
261       each followed by a space character:                                    |
262                                                                              |
263   "� � "                                                                     |
264                                                                              |
2653.3  Sequences with last continuation byte missing                            |
266                                                                              |
267All bytes of an incomplete sequence should be signalled as a single           |
268malformed sequence, i.e., you should see only a single replacement            |
269character in each of the next 10 tests. (Characters as in section 2)          |
270                                                                              |
2713.3.1  2-byte sequence with last byte missing (U+0000):     "�"               |
2723.3.2  3-byte sequence with last byte missing (U+0000):     "��"               |
2733.3.3  4-byte sequence with last byte missing (U+0000):     "���"               |
2743.3.4  5-byte sequence with last byte missing (U+0000):     "����"               |
2753.3.5  6-byte sequence with last byte missing (U+0000):     "�����"               |
2763.3.6  2-byte sequence with last byte missing (U-000007FF): "�"               |
2773.3.7  3-byte sequence with last byte missing (U-0000FFFF): "�"               |
2783.3.8  4-byte sequence with last byte missing (U-001FFFFF): "���"               |
2793.3.9  5-byte sequence with last byte missing (U-03FFFFFF): "����"               |
2803.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "�����"               |
281                                                                              |
2823.4  Concatenation of incomplete sequences                                    |
283                                                                              |
284All the 10 sequences of 3.3 concatenated, you should see 10 malformed         |
285sequences being signalled:                                                    |
286                                                                              |
287   "�����������������������������"                                                               |
288                                                                              |
2893.5  Impossible bytes                                                         |
290                                                                              |
291The following two bytes cannot appear in a correct UTF-8 string               |
292                                                                              |
2933.5.1  fe = "�"                                                               |
2943.5.2  ff = "�"                                                               |
2953.5.3  fe fe ff ff = "����"                                                   |
296                                                                              |
2974  Overlong sequences                                                         |
298                                                                              |
299The following sequences are not malformed according to the letter of          |
300the Unicode 2.0 standard. However, they are longer then necessary and         |
301a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8         |
302decoder" should reject them just like malformed sequences for two             |
303reasons: (1) It helps to debug applications if overlong sequences are         |
304not treated as valid representations of characters, because this helps        |
305to spot problems more quickly. (2) Overlong sequences provide                 |
306alternative representations of characters, that could maliciously be          |
307used to bypass filters that check only for ASCII characters. For              |
308instance, a 2-byte encoded line feed (LF) would not be caught by a            |
309line counter that counts only 0x0a bytes, but it would still be               |
310processed as a line feed by an unsafe UTF-8 decoder later in the              |
311pipeline. From a security point of view, ASCII compatibility of UTF-8         |
312sequences means also, that ASCII characters are *only* allowed to be          |
313represented by ASCII bytes in the range 0x00-0x7f. To ensure this             |
314aspect of ASCII compatibility, use only "safe UTF-8 decoders" that            |
315reject overlong UTF-8 sequences for which a shorter encoding exists.          |
316                                                                              |
3174.1  Examples of an overlong ASCII character                                  |
318                                                                              |
319With a safe UTF-8 decoder, all of the following five overlong                 |
320representations of the ASCII character slash ("/") should be rejected         |
321like a malformed UTF-8 sequence, for instance by substituting it with         |
322a replacement character. If you see a slash below, you do not have a          |
323safe UTF-8 decoder!                                                           |
324                                                                              |
3254.1.1 U+002F = c0 af             = "��"                                        |
3264.1.2 U+002F = e0 80 af          = "���"                                        |
3274.1.3 U+002F = f0 80 80 af       = "����"                                        |
3284.1.4 U+002F = f8 80 80 80 af    = "�����"                                        |
3294.1.5 U+002F = fc 80 80 80 80 af = "������"                                        |
330                                                                              |
3314.2  Maximum overlong sequences                                               |
332                                                                              |
333Below you see the highest Unicode value that is still resulting in an         |
334overlong sequence if represented with the given number of bytes. This         |
335is a boundary test for safe UTF-8 decoders. All five characters should        |
336be rejected like malformed UTF-8 sequences.                                   |
337                                                                              |
3384.2.1  U-0000007F = c1 bf             = "��"                                   |
3394.2.2  U-000007FF = e0 9f bf          = "���"                                   |
3404.2.3  U-0000FFFF = f0 8f bf bf       = "����"                                   |
3414.2.4  U-001FFFFF = f8 87 bf bf bf    = "�����"                                   |
3424.2.5  U-03FFFFFF = fc 83 bf bf bf bf = "������"                                   |
343                                                                              |
3444.3  Overlong representation of the NUL character                             |
345                                                                              |
346The following five sequences should also be rejected like malformed           |
347UTF-8 sequences and should not be treated like the ASCII NUL                  |
348character.                                                                    |
349                                                                              |
3504.3.1  U+0000 = c0 80             = "��"                                       |
3514.3.2  U+0000 = e0 80 80          = "���"                                       |
3524.3.3  U+0000 = f0 80 80 80       = "����"                                       |
3534.3.4  U+0000 = f8 80 80 80 80    = "�����"                                       |
3544.3.5  U+0000 = fc 80 80 80 80 80 = "������"                                       |
355                                                                              |
3565  Illegal code positions                                                     |
357                                                                              |
358The following UTF-8 sequences should be rejected like malformed               |
359sequences, because they never represent valid ISO 10646 characters and        |
360a UTF-8 decoder that accepts them might introduce security problems           |
361comparable to overlong UTF-8 sequences.                                       |
362                                                                              |
3635.1 Single UTF-16 surrogates                                                  |
364                                                                              |
3655.1.1  U+D800 = ed a0 80 = "���"                                                |
3665.1.2  U+DB7F = ed ad bf = "���"                                                |
3675.1.3  U+DB80 = ed ae 80 = "���"                                                |
3685.1.4  U+DBFF = ed af bf = "���"                                                |
3695.1.5  U+DC00 = ed b0 80 = "���"                                                |
3705.1.6  U+DF80 = ed be 80 = "���"                                                |
3715.1.7  U+DFFF = ed bf bf = "���"                                                |
372                                                                              |
3735.2 Paired UTF-16 surrogates                                                  |
374                                                                              |
3755.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80 = "������"                               |
3765.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf = "������"                               |
3775.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80 = "������"                               |
3785.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf = "������"                               |
3795.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80 = "������"                               |
3805.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf = "������"                               |
3815.2.7  U+DBFF U+DC00 = ed af bf ed b0 80 = "������"                               |
3825.2.8  U+DBFF U+DFFF = ed af bf ed bf bf = "������"                               |
383                                                                              |
3845.3 Noncharacter code positions                                               |
385                                                                              |
386The following "noncharacters" are "reserved for internal use" by              |
387applications, and according to older versions of the Unicode Standard         |
388"should never be interchanged". Unicode Corrigendum #9 dropped the            |
389latter restriction. Nevertheless, their presence in incoming UTF-8 data       |
390can remain a potential security risk, depending on what use is made of        |
391these codes subsequently. Examples of such internal use:                      |
392                                                                              |
393 - Some file APIs with 16-bit characters may use the integer value -1         |
394   = U+FFFF to signal an end-of-file (EOF) or error condition.                |
395                                                                              |
396 - In some UTF-16 receivers, code point U+FFFE might trigger a                |
397   byte-swap operation (to convert between UTF-16LE and UTF-16BE).            |
398                                                                              |
399With such internal use of noncharacters, it may be desirable and safer        |
400to block those code points in UTF-8 decoders, as they should never            |
401occur legitimately in incoming UTF-8 data, and could trigger unsafe           |
402behaviour in subsequent processing.                                           |
403                                                                              |
404Particularly problematic noncharacters in 16-bit applications:                |
405                                                                              |
4065.3.1  U+FFFE = ef bf be = ""                                                |
4075.3.2  U+FFFF = ef bf bf = ""                                                |
408                                                                              |
409Other noncharacters:                                                          |
410                                                                              |
4115.3.3  U+FDD0 .. U+FDEF = "﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬﷭﷮﷯"|
412                                                                              |
4135.3.4  U+nFFFE U+nFFFF (for n = 1..10)                                        |
414                                                                              |
415       "🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿                                    |
416        򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿"                                   |
417                                                                              |
418THE END                                                                       |
419
420
421UTF-8 encoded sample plain-text file
422‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
423
424Markus Kuhn [ˈmaʳkʊs kuːn] <https://www.cl.cam.ac.uk/~mgk25/> — 2002-07-25 CC BY
425
426
427The ASCII compatible UTF-8 encoding used in this plain-text file
428is defined in Unicode, ISO 10646-1, and RFC 2279.
429
430
431Using Unicode/UTF-8, you can write in emails and source code things such as
432
433Mathematics and sciences:
434
435  ∮ E⋅da = Q,  n → ∞, ∑ f(i) = ∏ g(i),      ⎧⎡⎛┌─────┐⎞⎤⎫
436                                            ⎪⎢⎜│a²+b³ ⎟⎥⎪
437  ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),    ⎪⎢⎜│───── ⎟⎥⎪
438                                            ⎪⎢⎜⎷ c₈   ⎟⎥⎪
439  ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ,                   ⎨⎢⎜       ⎟⎥⎬
440                                            ⎪⎢⎜ ∞     ⎟⎥⎪
441  ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫),      ⎪⎢⎜ ⎲     ⎟⎥⎪
442                                            ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
443  2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm     ⎩⎣⎝i=1    ⎠⎦⎭
444
445Linguistics and dictionaries:
446
447  ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
448  Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
449
450APL:
451
452  ((V⍳V)=⍳⍴V)/V←,V    ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
453
454Nicer typography in plain text files:
455
456  ╔══════════════════════════════════════════╗
457  ║                                          ║
458  ║   • ‘single’ and “double” quotes         ║
459  ║                                          ║
460  ║   • Curly apostrophes: “We’ve been here” ║
461  ║                                          ║
462  ║   • Latin-1 apostrophe and accents: '´`  ║
463  ║                                          ║
464  ║   • ‚deutsche‘ „Anführungszeichen“       ║
465  ║                                          ║
466  ║   • †, ‡, ‰, •, 3–4, —, −5/+5, ™, …      ║
467  ║                                          ║
468  ║   • ASCII safety test: 1lI|, 0OD, 8B     ║
469  ║                      ╭─────────╮         ║
470  ║   • the euro symbol: │ 14.95 € │         ║
471  ║                      ╰─────────╯         ║
472  ╚══════════════════════════════════════════╝
473
474Combining characters:
475
476  STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
477
478Greek (in Polytonic):
479
480  The Greek anthem:
481
482  Σὲ γνωρίζω ἀπὸ τὴν κόψη
483  τοῦ σπαθιοῦ τὴν τρομερή,
484  σὲ γνωρίζω ἀπὸ τὴν ὄψη
485  ποὺ μὲ βία μετράει τὴ γῆ.
486
487  ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
488  τῶν ῾Ελλήνων τὰ ἱερά
489  καὶ σὰν πρῶτα ἀνδρειωμένη
490  χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
491
492  From a speech of Demosthenes in the 4th century BC:
493
494  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
495  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
496  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
497  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
498  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
499  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
500  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
501  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
502  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
503  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
504  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
505  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
506  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
507  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
508  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
509  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
510
511  Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
512
513Georgian:
514
515  From a Unicode conference invitation:
516
517  გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
518  კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
519  ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
520  ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
521  ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
522  ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
523  ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
524
525Russian:
526
527  From a Unicode conference invitation:
528
529  Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
530  Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
531  Конференция соберет широкий круг экспертов по  вопросам глобального
532  Интернета и Unicode, локализации и интернационализации, воплощению и
533  применению Unicode в различных операционных системах и программных
534  приложениях, шрифтах, верстке и многоязычных компьютерных системах.
535
536Thai (UCS Level 2):
537
538  Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
539  classic 'San Gua'):
540
541  [----------------------------|------------------------]
542    ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช  พระปกเกศกองบู๊กู้ขึ้นใหม่
543  สิบสองกษัตริย์ก่อนหน้าแลถัดไป       สององค์ไซร้โง่เขลาเบาปัญญา
544    ทรงนับถือขันทีเป็นที่พึ่ง           บ้านเมืองจึงวิปริตเป็นนักหนา
545  โฮจิ๋นเรียกทัพทั่วหัวเมืองมา         หมายจะฆ่ามดชั่วตัวสำคัญ
546    เหมือนขับไสไล่เสือจากเคหา      รับหมาป่าเข้ามาเลยอาสัญ
547  ฝ่ายอ้องอุ้นยุแยกให้แตกกัน          ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
548    พลันลิฉุยกุยกีกลับก่อเหตุ          ช่างอาเพศจริงหนาฟ้าร้องไห้
549  ต้องรบราฆ่าฟันจนบรรลัย           ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
550
551  (The above is a two-column text. If combining characters are handled
552  correctly, the lines of the second column should be aligned with the
553  | character above.)
554
555Ethiopian:
556
557  Proverbs in the Amharic language:
558
559  ሰማይ አይታረስ ንጉሥ አይከሰስ።
560  ብላ ካለኝ እንደአባቴ በቆመጠኝ።
561  ጌጥ ያለቤቱ ቁምጥና ነው።
562  ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
563  የአፍ ወለምታ በቅቤ አይታሽም።
564  አይጥ በበላ ዳዋ ተመታ።
565  ሲተረጉሙ ይደረግሙ።
566  ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
567  ድር ቢያብር አንበሳ ያስር።
568  ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
569  እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
570  የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
571  ሥራ ከመፍታት ልጄን ላፋታት።
572  ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
573  የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
574  ተንጋሎ ቢተፉ ተመልሶ ባፉ።
575  ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
576  እግርህን በፍራሽህ ልክ ዘርጋ።
577
578Runes:
579
580  ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
581
582  (Old English, which transcribed into Latin reads 'He cwaeth that he
583  bude thaem lande northweardum with tha Westsae.' and means 'He said
584  that he lived in the northern land near the Western Sea.')
585
586Braille:
587
588  ⡌⠁⠧⠑ ⠼⠁⠒  ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
589
590  ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
591  ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
592  ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
593  ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
594  ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
595  ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
596
597  ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
598
599  ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
600  ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
601  ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
602  ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
603  ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
604  ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
605  ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
606  ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
607  ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
608
609  (The first couple of paragraphs of "A Christmas Carol" by Dickens)
610
611Compact font selection example text:
612
613  ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
614  abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
615  –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
616  ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ ﬁ�⑀₂ἠḂӥẄɐː⍎אԱა
617
618Greetings in various languages:
619
620  Hello world, Καλημέρα κόσμε, コンニチハ
621
622Box drawing alignment tests:                                          █
623                                                                      ▉
624  ╔══╦══╗  ┌──┬──┐  ╭──┬──╮  ╭──┬──╮  ┏━━┳━━┓  ┎┒┏┑   ╷  ╻ ┏┯┓ ┌┰┐    ▊ ╱╲╱╲╳╳╳
625  ║┌─╨─┐║  │╔═╧═╗│  │╒═╪═╕│  │╓─╁─╖│  ┃┌─╂─┐┃  ┗╃╄┙  ╶┼╴╺╋╸┠┼┨ ┝╋┥    ▋ ╲╱╲╱╳╳╳
626  ║│╲ ╱│║  │║   ║│  ││ │ ││  │║ ┃ ║│  ┃│ ╿ │┃  ┍╅╆┓   ╵  ╹ ┗┷┛ └┸┘    ▌ ╱╲╱╲╳╳╳
627  ╠╡ ╳ ╞╣  ├╢   ╟┤  ├┼─┼─┼┤  ├╫─╂─╫┤  ┣┿╾┼╼┿┫  ┕┛┖┚     ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
628  ║│╱ ╲│║  │║   ║│  ││ │ ││  │║ ┃ ║│  ┃│ ╽ │┃  ░░▒▒▓▓██ ┊  ┆ ╎ ╏  ┇ ┋ ▎
629  ║└─╥─┘║  │╚═╤═╝│  │╘═╪═╛│  │╙─╀─╜│  ┃└─╂─┘┃  ░░▒▒▓▓██ ┊  ┆ ╎ ╏  ┇ ┋ ▏
630  ╚══╩══╝  └──┴──┘  ╰──┴──╯  ╰──┴──╯  ┗━━┻━━┛  ▗▄▖▛▀▜   └╌╌┘ ╎ ┗╍╍┛ ┋  ▁▂▃▄▅▆▇█
631                                               ▝▀▘▙▄▟
632
633Sanskrit: काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥
634Sanskrit (standard transcription): kācaṃ śaknomyattum; nopahinasti mām.
635Classical Greek: ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει.
636Greek (monotonic): Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα.
637Greek (polytonic): Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα.
638Etruscan: (NEEDED)
639Latin: Vitrum edere possum; mihi non nocet.
640Old French: Je puis mangier del voirre. Ne me nuit.
641French: Je peux manger du verre, ça ne me fait pas mal.
642Provençal / Occitan: Pòdi manjar de veire, me nafrariá pas.
643Québécois: J'peux manger d'la vitre, ça m'fa pas mal.
644Walloon: Dji pou magnî do vêre, çoula m' freut nén må.
645Champenois: (NEEDED)
646Lorrain: (NEEDED)
647Picard: Ch'peux mingi du verre, cha m'foé mie n'ma.
648Corsican/Corsu: (NEEDED)
649Jèrriais: (NEEDED)
650Kreyòl Ayisyen (Haitï): Mwen kap manje vè, li pa blese'm.
651Basque: Kristala jan dezaket, ez dit minik ematen.
652Catalan / Català: Puc menjar vidre, que no em fa mal.
653Spanish: Puedo comer vidrio, no me hace daño.
654Aragonés: Puedo minchar beire, no me'n fa mal .
655Aranés: (NEEDED)
656Mallorquín: (NEEDED)
657Galician: Eu podo xantar cristais e non cortarme.
658European Portuguese: Posso comer vidro, não me faz mal.
659Brazilian Portuguese (8): Posso comer vidro, não me machuca.
660Caboverdiano/Kabuverdianu (Cape Verde): M' podê cumê vidru, ca ta maguâ-m'.
661Papiamentu: Ami por kome glas anto e no ta hasimi daño.
662Italian: Posso mangiare il vetro e non mi fa male.
663Milanese: Sôn bôn de magnà el véder, el me fa minga mal.
664Roman: Me posso magna' er vetro, e nun me fa male.
665Napoletano: M' pozz magna' o'vetr, e nun m' fa mal.
666Venetian: Mi posso magnare el vetro, no'l me fa mae.
667Zeneise (Genovese): Pòsso mangiâ o veddro e o no me fà mâ.
668Sicilian: Puotsu mangiari u vitru, nun mi fa mali.
669Campinadese (Sardinia): (NEEDED)
670Lugudorese (Sardinia): (NEEDED)
671Romansch (Grischun): Jau sai mangiar vaider, senza che quai fa donn a mai.
672Romany / Tsigane: (NEEDED)
673Romanian: Pot să mănânc sticlă și ea nu mă rănește.
674Esperanto: Mi povas manĝi vitron, ĝi ne damaĝas min.
675Pictish: (NEEDED)
676Breton: (NEEDED)
677Cornish: Mý a yl dybry gwéder hag éf ny wra ow ankenya.
678Welsh: Dw i'n gallu bwyta gwydr, 'dyw e ddim yn gwneud dolur i mi.
679Manx Gaelic: Foddym gee glonney agh cha jean eh gortaghey mee.
680Old Irish (Ogham): ᚛᚛ᚉᚑᚅᚔᚉᚉᚔᚋ ᚔᚈᚔ ᚍᚂᚐᚅᚑ ᚅᚔᚋᚌᚓᚅᚐ᚜
681Old Irish (Latin): Con·iccim ithi nglano. Ním·géna.
682Irish: Is féidir liom gloinne a ithe. Ní dhéanann sí dochar ar bith dom.
683Ulster Gaelic: Ithim-sa gloine agus ní miste damh é.
684Scottish Gaelic: S urrainn dhomh gloinne ithe; cha ghoirtich i mi.
685Anglo-Saxon (Runes): ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬
686Anglo-Saxon (Latin): Ic mæg glæs eotan ond hit ne hearmiað me.
687Middle English: Ich canne glas eten and hit hirtiþ me nouȝt.
688English: I can eat glass and it doesn't hurt me.
689English (IPA): [aɪ kæn iːt glɑːs ænd ɪt dɐz nɒt hɜːt miː] (Received Pronunciation)
690English (Braille): ⠊⠀⠉⠁⠝⠀⠑⠁⠞⠀⠛⠇⠁⠎⠎⠀⠁⠝⠙⠀⠊⠞⠀⠙⠕⠑⠎⠝⠞⠀⠓⠥⠗⠞⠀⠍⠑
691Jamaican: Mi kian niam glas han i neba hot mi.
692Lalland Scots / Doric: Ah can eat gless, it disnae hurt us.
693Glaswegian: (NEEDED)
694Gothic (4): 𐌼𐌰𐌲 𐌲𐌻𐌴𐍃 𐌹̈𐍄𐌰𐌽, 𐌽𐌹 𐌼𐌹𐍃 𐍅𐌿 𐌽𐌳𐌰𐌽 𐌱𐍂𐌹𐌲𐌲𐌹𐌸.
695Old Norse (Runes): ᛖᚴ ᚷᛖᛏ ᛖᛏᛁ ᚧ ᚷᛚᛖᚱ ᛘᚾ ᚦᛖᛋᛋ ᚨᚧ ᚡᛖ ᚱᚧᚨ ᛋᚨᚱ
696Old Norse (Latin): Ek get etið gler án þess að verða sár.
697Norsk / Norwegian (Nynorsk): Eg kan eta glas utan å skada meg.
698Norsk / Norwegian (Bokmål): Jeg kan spise glass uten å skade meg.
699Føroyskt / Faroese: Eg kann eta glas, skaðaleysur.
700Íslenska / Icelandic: Ég get etið gler án þess að meiða mig.
701Svenska / Swedish: Jag kan äta glas utan att skada mig.
702Dansk / Danish: Jeg kan spise glas, det gør ikke ondt på mig.
703Sønderjysk: Æ ka æe glass uhen at det go mæ naue.
704Frysk / Frisian: Ik kin glês ite, it docht me net sear.
705Nederlands / Dutch: Ik kan glas eten, het doet mĳ geen kwaad.
706Kirchröadsj/Bôchesserplat: Iech ken glaas èèse, mer 't deet miech jing pieng.
707Afrikaans: Ek kan glas eet, maar dit doen my nie skade nie.
708Lëtzebuergescht / Luxemburgish: Ech kan Glas iessen, daat deet mir nët wei.
709Deutsch / German: Ich kann Glas essen, ohne mir zu schaden.
710Ruhrdeutsch: Ich kann Glas verkasematuckeln, ohne dattet mich wat jucken tut.
711Langenfelder Platt: Isch kann Jlaas kimmeln, uuhne datt mich datt weh dääd.
712Lausitzer Mundart ("Lusatian"): Ich koann Gloos assn und doas dudd merr ni wii.
713Odenwälderisch: Iech konn glaasch voschbachteln ohne dass es mir ebbs daun doun dud.
714Sächsisch / Saxon: 'sch kann Glos essn, ohne dass'sch mer wehtue.
715Pfälzisch: Isch konn Glass fresse ohne dasses mer ebbes ausmache dud.
716Schwäbisch / Swabian: I kå Glas frässa, ond des macht mr nix!
717Deutsch (Voralberg): I ka glas eassa, ohne dass mar weh tuat.
718Bayrisch / Bavarian: I koh Glos esa, und es duard ma ned wei.
719Allemannisch: I kaun Gloos essen, es tuat ma ned weh.
720Schwyzerdütsch (Zürich): Ich chan Glaas ässe, das schadt mir nöd.
721Schwyzerdütsch (Luzern): Ech cha Glâs ässe, das schadt mer ned.
722Plautdietsch: (NEEDED)
723Hungarian: Meg tudom enni az üveget, nem lesz tőle bajom.
724Suomi / Finnish: Voin syödä lasia, se ei vahingoita minua.
725Sami (Northern): Sáhtán borrat lása, dat ii leat bávččas.
726Erzian: Мон ярсан суликадо, ды зыян эйстэнзэ а ули.
727Northern Karelian: Mie voin syvvä lasie ta minla ei ole kipie.
728Southern Karelian: Minä voin syvvä st'oklua dai minule ei ole kibie.
729Vepsian: (NEEDED)
730Votian: (NEEDED)
731Livonian: (NEEDED)
732Estonian: Ma võin klaasi süüa, see ei tee mulle midagi.
733Latvian: Es varu ēst stiklu, tas man nekaitē.
734Lithuanian: Aš galiu valgyti stiklą ir jis manęs nežeidžia
735Old Prussian: (NEEDED)
736Sorbian (Wendish): (NEEDED)
737Czech: Mohu jíst sklo, neublíží mi.
738Slovak: Môžem jesť sklo. Nezraní ma.
739Polska / Polish: Mogę jeść szkło i mi nie szkodzi.
740Slovenian: Lahko jem steklo, ne da bi mi škodovalo.
741Bosnian, Croatian, Montenegrin and Serbian (Latin): Ja mogu jesti staklo, i to mi ne šteti.
742Bosnian, Montenegrin and Serbian (Cyrillic): Ја могу јести стакло, и то ми не штети.
743Macedonian: Можам да јадам стакло, а не ме штета.
744Russian: Я могу есть стекло, оно мне не вредит.
745Belarusian (Cyrillic): Я магу есці шкло, яно мне не шкодзіць.
746Belarusian (Lacinka): Ja mahu jeści škło, jano mne ne škodzić.
747Ukrainian: Я можу їсти скло, і воно мені не зашкодить.
748Bulgarian: Мога да ям стъкло, то не ми вреди.
749Georgian: მინას ვჭამ და არა მტკივა.
750Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։
751Albanian: Unë mund të ha qelq dhe nuk më gjen gjë.
752Turkish: Cam yiyebilirim, bana zararı dokunmaz.
753Turkish (Ottoman): جام ييه بلورم بڭا ضررى طوقونمز
754Tatar: Алам да бар, пыяла, әмма бу ранит мине.
755Uzbek / O’zbekcha: (Roman): Men shisha yeyishim mumkin, ammo u menga zarar keltirmaydi.
756Uzbek / Ўзбекча (Cyrillic): Мен шиша ейишим мумкин, аммо у менга зарар келтирмайди.
757Bangla / Bengali: আমি কাঁচ খেতে পারি, তাতে আমার কোনো ক্ষতি হয় না।
758Marathi (masculine): मी काच खाऊ शकतो, मला ते दुखत नाही.
759Marathi (feminine):   मी काच खाऊ शकते, मला ते दुखत नाही.
760Kannada: ನನಗೆ ಹಾನಿ ಆಗದೆ, ನಾನು ಗಜನ್ನು ತಿನಬಹುದು
761Hindi (masculine): मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती.
762Hindi (feminine):   मैं काँच खा सकती हूँ और मुझे उससे कोई चोट नहीं पहुंचती.
763Malayalam: എനിക്ക് ഗ്ലാസ് തിന്നാം. അതെന്നെ വേദനിപ്പിക്കില്ല.
764Tamil: நான் கண்ணாடி சாப்பிடுவேன், அதனால் எனக்கு ஒரு கேடும் வராது.
765Telugu: నేను గాజు తినగలను మరియు అలా చేసినా నాకు ఏమి ఇబ్బంది లేదు
766Sinhalese: මට වීදුරු කෑමට හැකියි. එයින් මට කිසි හානියක් සිදු නොවේ.
767Urdu(3): میں کانچ کھا سکتا ہوں اور مجھے تکلیف نہیں ہوتی ۔
768Pashto(3): زه شيشه خوړلې شم، هغه ما نه خوږوي
769Farsi / Persian(3): .من می توانم بدونِ احساس درد شيشه بخورم
770Arabic(3): أنا قادر على أكل الزجاج و هذا لا يؤلمني.
771Aramaic: (NEEDED)
772Maltese: Nista' niekol il-ħġieġ u ma jagħmilli xejn.
773Hebrew(3): אני יכול לאכול זכוכית וזה לא מזיק לי.
774Yiddish(3): איך קען עסן גלאָז און עס טוט מיר נישט װײ.
775Judeo-Arabic: (NEEDED)
776Ladino: (NEEDED)
777Gǝʼǝz: (NEEDED)
778Amharic: (NEEDED)
779Twi: Metumi awe tumpan, ɜnyɜ me hwee.
780Hausa (Latin): Inā iya taunar gilāshi kuma in gamā lāfiyā.
781Hausa (Ajami) (2): إِنا إِىَ تَونَر غِلَاشِ كُمَ إِن غَمَا لَافِىَا
782Yoruba(4): Mo lè je̩ dígí, kò ní pa mí lára.
783Lingala: Nakokí kolíya biténi bya milungi, ekosála ngáí mabé tɛ́.
784(Ki)Swahili: Naweza kula bilauri na sikunyui.
785Malay: Saya boleh makan kaca dan ia tidak mencederakan saya.
786Tagalog: Kaya kong kumain nang bubog at hindi ako masaktan.
787Chamorro: Siña yo' chumocho krestat, ti ha na'lalamen yo'.
788Fijian: Au rawa ni kana iloilo, ia au sega ni vakacacani kina.
789Javanese: Aku isa mangan beling tanpa lara.
790Burmese (Unicode 4.0): က္ယ္ဝန္‌တော္‌၊က္ယ္ဝန္‌မ မ္ယက္‌စားနုိင္‌သည္‌။ ၎က္ရောင္‌့ ထိခုိက္‌မ္ဟု မရ္ဟိပာ။ (9)
791Burmese (Unicode 5.0): ကျွန်တော် ကျွန်မ မှန်စားနိုင်တယ်။ ၎င်းကြောင့် ထိခိုက်မှုမရှိပါ။ (9)
792Vietnamese (quốc ngữ): Tôi có thể ăn thủy tinh mà không hại gì.
793Vietnamese (nôm) (4): 些 𣎏 世 咹 水 晶 𦓡 空 𣎏 害 咦
794Khmer: ខ្ញុំអាចញុំកញ្ចក់បាន ដោយគ្មានបញ្ហារ
795Lao: ຂອ້ຍກິນແກ້ວໄດ້ໂດຍທີ່ມັນບໍ່ໄດ້ເຮັດໃຫ້ຂອ້ຍເຈັບ.
796Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ
797Mongolian (Cyrillic): Би шил идэй чадна, надад хортой биш
798Mongolian (Classic) (5): ᠪᠢ ᠰᠢᠯᠢ ᠢᠳᠡᠶᠦ ᠴᠢᠳᠠᠨᠠ ᠂ ᠨᠠᠳᠤᠷ ᠬᠣᠤᠷᠠᠳᠠᠢ ᠪᠢᠰᠢ
799Dzongkha: (NEEDED)
800Nepali: म काँच खान सक्छू र मलाई केहि नी हुन्‍न् ।
801Tibetan: ཤེལ་སྒོ་ཟ་ནས་ང་ན་གི་མ་རེད།
802Chinese: 我能吞下玻璃而不伤身体。
803Chinese (Traditional): 我能吞下玻璃而不傷身體。
804Taiwanese(6): Góa ē-tàng chia̍h po-lê, mā bē tio̍h-siong.
805Japanese: 私はガラスを食べられます。それは私を傷つけません。
806Korean: 나는 유리를 먹을 수 있어요. 그래도 아프지 않아요
807Bislama: Mi save kakae glas, hemi no save katem mi.
808Hawaiian: Hiki iaʻu ke ʻai i ke aniani; ʻaʻole nō lā au e ʻeha.
809Marquesan: E koʻana e kai i te karahi, mea ʻā, ʻaʻe hauhau.
810Inuktitut (10): ᐊᓕᒍᖅ ᓂᕆᔭᕌᖓᒃᑯ ᓱᕋᙱᑦᑐᓐᓇᖅᑐᖓ
811Chinook Jargon: Naika məkmək kakshət labutay, pi weyk ukuk munk-sik nay.
812Navajo: Tsésǫʼ yishą́ągo bííníshghah dóó doo shił neezgai da.
813Cherokee (and Cree, Chickasaw, Cree, Micmac, Ojibwa, Lakota, Náhuatl, Quechua, Aymara, and other American languages): (NEEDED)
814Garifuna: (NEEDED)
815Gullah: (NEEDED)
816Lojban: mi kakne le nu citka le blaci .iku'i le se go'i na xrani mi
817Nórdicg: Ljœr ye caudran créneþ ý jor cẃran."#;
818
819    #[test]
820    fn test_encoding_beautiful_day() {
821        let tokenizer = WorldTokenizer::new(None).unwrap();
822        let token_ids = tokenizer.encode(BEAUTIFUL_DAY);
823        assert_eq!(token_ids, [33520, 4600, 332, 59219, 21509, 47, 33, 10381, 11639, 13091, 15597, 11685, 14734, 10250, 11639, 10080]);
824    }
825
826    #[test]
827    fn test_encoding_decoding_beautiful_day() {
828        let tokenizer = WorldTokenizer::new(None).unwrap();
829        let token_ids = tokenizer.encode(BEAUTIFUL_DAY);
830        let text = tokenizer.decode(token_ids).unwrap();
831        assert_eq!(text, BEAUTIFUL_DAY);
832    }
833
834    #[test]
835    fn test_encoding_decoding_japanese() {
836        let tokenizer = WorldTokenizer::new(None).unwrap();
837        let token_ids = tokenizer.encode(JAPANESE);
838        let text = tokenizer.decode(token_ids).unwrap();
839        assert_eq!(text, JAPANESE);
840    }
841
842    #[test]
843    fn test_utf8_tokenization() {
844        let tokenizer = WorldTokenizer::new(None).unwrap();
845        let token_ids = tokenizer.encode(LONG_UTF8_TEXT);
846        let text = tokenizer.decode(token_ids).unwrap();
847        assert_eq!(text, LONG_UTF8_TEXT);
848    }
849
850    #[test]
851    fn test_get_vocab() {
852        let tokenizer = WorldTokenizer::new(None).unwrap();
853        let vocab = tokenizer.get_vocab();
854        // The vocab size should be 65529, but currently, the binary keys/strings are not included,
855        // therefore it is only 65044. It will be added later.
856        assert_eq!(vocab.len(), 65044);
857    }
858}
rwkv_tokenizer/lib.rs

rwkv_tokenizer/
lib.rs