1mod trie;
2use std::{str, env};
3use std::collections::HashMap;
4use std::fs::File;
5use std::io::{self, BufRead};
6use std::path::{Path};
7use std::str::Utf8Error;
8use regex::Regex;
9use trie::Trie;
10use unescape::unescape;
11use rayon::prelude::*;
12
13
14#[derive(Debug)]
15pub struct WorldTokenizer {
16 tokens: Vec<Vec<u8>>,
17 trie: Trie
18}
19
20impl WorldTokenizer {
21 pub fn new(vocab_filepath: Option<&str>) -> io::Result<Self> {
22 let mut tokenizer = WorldTokenizer {
23 tokens: Vec::new(),
24 trie: Trie::new()
25 };
26 let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("assets").join("rwkv_vocab_v20230424.txt");
27 let vocab_filepath = vocab_filepath.unwrap_or(manifest_path.to_str().unwrap());
28 let file = File::open(vocab_filepath)?;
29 let reader = io::BufReader::new(file);
30
31 let re = Regex::new(r"(\d+)\s+(b?)(.+)\s+(\d+)").unwrap();
32 tokenizer.tokens.push(vec![0]);
33 for line in reader.lines() {
34 let line = line?;
35 if let Some(captures) = re.captures(&line) {
36 let id = captures[1].parse::<u16>().unwrap();
37 let is_byte = captures[2].to_string();
38 let length = captures[4].parse::<usize>().unwrap();
39 let mut string: String = captures[3].to_string();
40 string = string[1..string.len()-1].parse().unwrap();
41 let sbytes: Vec<u8>;
42 if is_byte.len() == 0 {
43 string = unescape(string.as_str()).unwrap();
44 sbytes = string.clone().into_bytes();
45 tokenizer.tokens.push(Vec::from(string.as_bytes()));
46 } else {
47 sbytes = WorldTokenizer::hex_to_bytes(string.as_str()).unwrap();
48 tokenizer.tokens.push(sbytes.clone());
49 }
50 assert_eq!(sbytes.len(), length);
51 tokenizer.trie.insert(&sbytes, id);
52 }
53 else {
54 println!("Line with issue: {:?}", line)
55 }
56 }
57 Ok(tokenizer)
58 }
59
60 pub fn encode(&self, word: &str) -> Vec<u16> {
61 self.trie.tokenize(word)
62 }
63
64 pub fn encode_batch(&self, word_list: Vec<String>) -> Vec<Vec<u16>> {
65 word_list.par_iter().map(|word| self.trie.tokenize(word)).collect()
66 }
67
68 pub fn decode(&self, vec: Vec<u16>) -> Result<String, Utf8Error> {
69 let mut result: Vec<u8> = Vec::new();
70 for index in vec.iter() {
71 let mut current_tokens = self.tokens[*index as usize].clone();
72 result.append(&mut current_tokens);
73 }
74 Ok(str::from_utf8(&*result)?.to_string())
75 }
76
77 pub fn vocab_size(&self) -> usize {
78 self.tokens.len()
79 }
80
81 pub fn get_vocab(&self) -> HashMap<String, usize> {
82 let mut vocabularies: HashMap<String, usize> = HashMap::new();
83 for (index, value) in self.tokens.iter().enumerate() {
84 let text: String = String::from_utf8((*value).to_owned()).unwrap_or_else(|_e| "Binary string (TODO)".to_string());
85 vocabularies.insert(text, index);
86 }
87 vocabularies
88 }
89
90 fn hex_to_bytes(hex: &str) -> Option<Vec<u8>> {
91 let hex = hex.replace("\\x", "");
92 if hex.len() % 2 == 0 {
93 (0..hex.len())
94 .step_by(2)
95 .map(|i| hex.get(i..i + 2)
96 .and_then(|sub| u8::from_str_radix(sub, 16).ok()))
97 .collect()
98 } else {
99 None
100 }
101 }
102}
103
104#[cfg(test)]
105mod tests {
106 use super::*;
107 const BEAUTIFUL_DAY: &str = "Today is a beautiful day. 今天是美好的一天。";
108
109 const JAPANESE: &str = "起業家イーロン・マスク氏が創業した宇宙開発企業「スペースX(エックス)」の巨大新型ロケット「スターシップ」が20日朝、初めて打ち上げられたが、爆発した。
110打ち上げは米テキサス州の東海岸で行われた。無人の試験で、負傷者はいなかった。
111打ち上げから2~3分後、史上最大のロケットが制御不能になり、まもなく搭載された装置で破壊された。
112マスク氏は、数カ月後に再挑戦すると表明した。
113スペースXのエンジニアたちは、それでもこの日のミッションは成功だったとしている。「早期に頻繁に試験する」ことを好む人たちなので、破壊を恐れていない。次のフライトに向け、大量のデータを収集したはずだ。2機目のスターシップは、ほぼ飛行準備が整っている。
114マスク氏は、「SpaceXチームの皆さん、スターシップのエキサイティングな試験打ち上げ、おめでとう! 数カ月後に行われる次の試験打ち上げに向けて、多くを学んだ」とツイートした。
115アメリカでのロケット打ち上げを認可する米連邦航空局(NASA)は、事故調査を監督するとした。広報担当者は、飛行中に機体が失われた場合の通常の対応だと述べた。
116マスク氏は打ち上げ前、期待値を下げようとしていた。発射台の設備を破壊せずに機体を打ち上げるだけでも「成功」だとしていた。
117その願いはかなった。スターシップは打ち上げ施設からどんどん上昇し、メキシコ湾の上空へと向かっていった。しかし1分もしないうち、すべてが計画通りに進んでいるのではないことが明らかになった。";
118
119 const LONG_UTF8_TEXT: &str = r#"UTF-8 decoder capability and stress test
120----------------------------------------
121
122Markus Kuhn <https://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
123
124This test file can help you examine, how your UTF-8 decoder handles
125various types of correct, malformed, or otherwise interesting UTF-8
126sequences. This file is not meant to be a conformance test. It does
127not prescribe any particular outcome. Therefore, there is no way to
128"pass" or "fail" this test file, even though the text does suggest a
129preferable decoder behaviour at some places. Its aim is, instead, to
130help you think about, and test, the behaviour of your UTF-8 decoder on a
131systematic collection of unusual inputs. Experience so far suggests
132that most first-time authors of UTF-8 decoders find at least one
133serious problem in their decoder using this file.
134
135The test lines below cover boundary conditions, malformed UTF-8
136sequences, as well as correctly encoded UTF-8 sequences of Unicode code
137points that should never occur in a correct UTF-8 file.
138
139According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
140receiving UTF-8 shall interpret a "malformed sequence in the same way
141that it interprets a character that is outside the adopted subset" and
142"characters that are not within the adopted subset shall be indicated
143to the user" by a receiving device. One commonly used approach in
144UTF-8 decoders is to replace any malformed UTF-8 sequence by a
145replacement character (U+FFFD), which looks a bit like an inverted
146question mark, or a similar symbol. It might be a good idea to
147visually distinguish a malformed UTF-8 sequence from a correctly
148encoded Unicode character that is just not available in the current
149font but otherwise fully legal, even though ISO 10646-1 doesn't
150mandate this. In any case, just ignoring malformed sequences or
151unavailable characters does not conform to ISO 10646, will make
152debugging more difficult, and can lead to user confusion.
153
154Please check, whether a malformed UTF-8 sequence is (1) represented at
155all, (2) represented by exactly one single replacement character (or
156equivalent signal), and (3) the following quotation mark after an
157illegal UTF-8 sequence is correctly displayed, i.e. proper
158resynchronization takes place immediately after any malformed
159sequence. This file says "THE END" in the last line, so if you don't
160see that, your decoder crashed somehow before, which should always be
161cause for concern.
162
163All lines in this file are exactly 79 characters long (plus the line
164feed). In addition, all lines end with "|", except for the two test
165lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
166U+0000 and U+007F. If you display this file with a fixed-width font,
167these "|" characters should all line up in column 79 (right margin).
168This allows you to test quickly, whether your UTF-8 decoder finds the
169correct number of characters in every line, that is whether each
170malformed sequences is replaced by a single replacement character.
171
172Note that, as an alternative to the notion of malformed sequence used
173here, it is also a perfectly acceptable (and in some situations even
174preferable) solution to represent each individual byte of a malformed
175sequence with a replacement character. If you follow this strategy in
176your decoder, then please ignore the "|" column.
177
178
179Here come the tests: |
180 |
1811 Some correct UTF-8 text |
182 |
183You should see the Greek word 'kosme': "κόσμε" |
184 |
1852 Boundary condition test cases |
186 |
1872.1 First possible sequence of a certain length |
188 |
1892.1.1 1 byte (U-00000000): "�"
1902.1.2 2 bytes (U-00000080): "" |
1912.1.3 3 bytes (U-00000800): "ࠀ" |
1922.1.4 4 bytes (U-00010000): "𐀀" |
1932.1.5 5 bytes (U-00200000): "�����" |
1942.1.6 6 bytes (U-04000000): "������" |
195 |
1962.2 Last possible sequence of a certain length |
197 |
1982.2.1 1 byte (U-0000007F): ""
1992.2.2 2 bytes (U-000007FF): "߿" |
2002.2.3 3 bytes (U-0000FFFF): "" |
2012.2.4 4 bytes (U-001FFFFF): "����" |
2022.2.5 5 bytes (U-03FFFFFF): "�����" |
2032.2.6 6 bytes (U-7FFFFFFF): "������" |
204 |
2052.3 Other boundary conditions |
206 |
2072.3.1 U-0000D7FF = ed 9f bf = "" |
2082.3.2 U-0000E000 = ee 80 80 = "" |
2092.3.3 U-0000FFFD = ef bf bd = "�" |
2102.3.4 U-0010FFFF = f4 8f bf bf = "" |
2112.3.5 U-00110000 = f4 90 80 80 = "����" |
212 |
2133 Malformed sequences |
214 |
2153.1 Unexpected continuation bytes |
216 |
217Each unexpected continuation byte should be separately signalled as a |
218malformed sequence of its own. |
219 |
2203.1.1 First continuation byte 0x80: "�" |
2213.1.2 Last continuation byte 0xbf: "�" |
222 |
2233.1.3 2 continuation bytes: "��" |
2243.1.4 3 continuation bytes: "���" |
2253.1.5 4 continuation bytes: "����" |
2263.1.6 5 continuation bytes: "�����" |
2273.1.7 6 continuation bytes: "������" |
2283.1.8 7 continuation bytes: "�������" |
229 |
2303.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): |
231 |
232 "���������������� |
233 ���������������� |
234 ���������������� |
235 ����������������" |
236 |
2373.2 Lonely start characters |
238 |
2393.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), |
240 each followed by a space character: |
241 |
242 "� � � � � � � � � � � � � � � � |
243 � � � � � � � � � � � � � � � � " |
244 |
2453.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), |
246 each followed by a space character: |
247 |
248 "� � � � � � � � � � � � � � � � " |
249 |
2503.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), |
251 each followed by a space character: |
252 |
253 "� � � � � � � � " |
254 |
2553.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), |
256 each followed by a space character: |
257 |
258 "� � � � " |
259 |
2603.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), |
261 each followed by a space character: |
262 |
263 "� � " |
264 |
2653.3 Sequences with last continuation byte missing |
266 |
267All bytes of an incomplete sequence should be signalled as a single |
268malformed sequence, i.e., you should see only a single replacement |
269character in each of the next 10 tests. (Characters as in section 2) |
270 |
2713.3.1 2-byte sequence with last byte missing (U+0000): "�" |
2723.3.2 3-byte sequence with last byte missing (U+0000): "��" |
2733.3.3 4-byte sequence with last byte missing (U+0000): "���" |
2743.3.4 5-byte sequence with last byte missing (U+0000): "����" |
2753.3.5 6-byte sequence with last byte missing (U+0000): "�����" |
2763.3.6 2-byte sequence with last byte missing (U-000007FF): "�" |
2773.3.7 3-byte sequence with last byte missing (U-0000FFFF): "�" |
2783.3.8 4-byte sequence with last byte missing (U-001FFFFF): "���" |
2793.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "����" |
2803.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "�����" |
281 |
2823.4 Concatenation of incomplete sequences |
283 |
284All the 10 sequences of 3.3 concatenated, you should see 10 malformed |
285sequences being signalled: |
286 |
287 "�����������������������������" |
288 |
2893.5 Impossible bytes |
290 |
291The following two bytes cannot appear in a correct UTF-8 string |
292 |
2933.5.1 fe = "�" |
2943.5.2 ff = "�" |
2953.5.3 fe fe ff ff = "����" |
296 |
2974 Overlong sequences |
298 |
299The following sequences are not malformed according to the letter of |
300the Unicode 2.0 standard. However, they are longer then necessary and |
301a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 |
302decoder" should reject them just like malformed sequences for two |
303reasons: (1) It helps to debug applications if overlong sequences are |
304not treated as valid representations of characters, because this helps |
305to spot problems more quickly. (2) Overlong sequences provide |
306alternative representations of characters, that could maliciously be |
307used to bypass filters that check only for ASCII characters. For |
308instance, a 2-byte encoded line feed (LF) would not be caught by a |
309line counter that counts only 0x0a bytes, but it would still be |
310processed as a line feed by an unsafe UTF-8 decoder later in the |
311pipeline. From a security point of view, ASCII compatibility of UTF-8 |
312sequences means also, that ASCII characters are *only* allowed to be |
313represented by ASCII bytes in the range 0x00-0x7f. To ensure this |
314aspect of ASCII compatibility, use only "safe UTF-8 decoders" that |
315reject overlong UTF-8 sequences for which a shorter encoding exists. |
316 |
3174.1 Examples of an overlong ASCII character |
318 |
319With a safe UTF-8 decoder, all of the following five overlong |
320representations of the ASCII character slash ("/") should be rejected |
321like a malformed UTF-8 sequence, for instance by substituting it with |
322a replacement character. If you see a slash below, you do not have a |
323safe UTF-8 decoder! |
324 |
3254.1.1 U+002F = c0 af = "��" |
3264.1.2 U+002F = e0 80 af = "���" |
3274.1.3 U+002F = f0 80 80 af = "����" |
3284.1.4 U+002F = f8 80 80 80 af = "�����" |
3294.1.5 U+002F = fc 80 80 80 80 af = "������" |
330 |
3314.2 Maximum overlong sequences |
332 |
333Below you see the highest Unicode value that is still resulting in an |
334overlong sequence if represented with the given number of bytes. This |
335is a boundary test for safe UTF-8 decoders. All five characters should |
336be rejected like malformed UTF-8 sequences. |
337 |
3384.2.1 U-0000007F = c1 bf = "��" |
3394.2.2 U-000007FF = e0 9f bf = "���" |
3404.2.3 U-0000FFFF = f0 8f bf bf = "����" |
3414.2.4 U-001FFFFF = f8 87 bf bf bf = "�����" |
3424.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "������" |
343 |
3444.3 Overlong representation of the NUL character |
345 |
346The following five sequences should also be rejected like malformed |
347UTF-8 sequences and should not be treated like the ASCII NUL |
348character. |
349 |
3504.3.1 U+0000 = c0 80 = "��" |
3514.3.2 U+0000 = e0 80 80 = "���" |
3524.3.3 U+0000 = f0 80 80 80 = "����" |
3534.3.4 U+0000 = f8 80 80 80 80 = "�����" |
3544.3.5 U+0000 = fc 80 80 80 80 80 = "������" |
355 |
3565 Illegal code positions |
357 |
358The following UTF-8 sequences should be rejected like malformed |
359sequences, because they never represent valid ISO 10646 characters and |
360a UTF-8 decoder that accepts them might introduce security problems |
361comparable to overlong UTF-8 sequences. |
362 |
3635.1 Single UTF-16 surrogates |
364 |
3655.1.1 U+D800 = ed a0 80 = "���" |
3665.1.2 U+DB7F = ed ad bf = "���" |
3675.1.3 U+DB80 = ed ae 80 = "���" |
3685.1.4 U+DBFF = ed af bf = "���" |
3695.1.5 U+DC00 = ed b0 80 = "���" |
3705.1.6 U+DF80 = ed be 80 = "���" |
3715.1.7 U+DFFF = ed bf bf = "���" |
372 |
3735.2 Paired UTF-16 surrogates |
374 |
3755.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "������" |
3765.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "������" |
3775.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "������" |
3785.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "������" |
3795.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "������" |
3805.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "������" |
3815.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "������" |
3825.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "������" |
383 |
3845.3 Noncharacter code positions |
385 |
386The following "noncharacters" are "reserved for internal use" by |
387applications, and according to older versions of the Unicode Standard |
388"should never be interchanged". Unicode Corrigendum #9 dropped the |
389latter restriction. Nevertheless, their presence in incoming UTF-8 data |
390can remain a potential security risk, depending on what use is made of |
391these codes subsequently. Examples of such internal use: |
392 |
393 - Some file APIs with 16-bit characters may use the integer value -1 |
394 = U+FFFF to signal an end-of-file (EOF) or error condition. |
395 |
396 - In some UTF-16 receivers, code point U+FFFE might trigger a |
397 byte-swap operation (to convert between UTF-16LE and UTF-16BE). |
398 |
399With such internal use of noncharacters, it may be desirable and safer |
400to block those code points in UTF-8 decoders, as they should never |
401occur legitimately in incoming UTF-8 data, and could trigger unsafe |
402behaviour in subsequent processing. |
403 |
404Particularly problematic noncharacters in 16-bit applications: |
405 |
4065.3.1 U+FFFE = ef bf be = "" |
4075.3.2 U+FFFF = ef bf bf = "" |
408 |
409Other noncharacters: |
410 |
4115.3.3 U+FDD0 .. U+FDEF = ""|
412 |
4135.3.4 U+nFFFE U+nFFFF (for n = 1..10) |
414 |
415 " |
416 " |
417 |
418THE END |
419
420
421UTF-8 encoded sample plain-text file
422‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
423
424Markus Kuhn [ˈmaʳkʊs kuːn] <https://www.cl.cam.ac.uk/~mgk25/> — 2002-07-25 CC BY
425
426
427The ASCII compatible UTF-8 encoding used in this plain-text file
428is defined in Unicode, ISO 10646-1, and RFC 2279.
429
430
431Using Unicode/UTF-8, you can write in emails and source code things such as
432
433Mathematics and sciences:
434
435 ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
436 ⎪⎢⎜│a²+b³ ⎟⎥⎪
437 ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
438 ⎪⎢⎜⎷ c₈ ⎟⎥⎪
439 ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
440 ⎪⎢⎜ ∞ ⎟⎥⎪
441 ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
442 ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
443 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
444
445Linguistics and dictionaries:
446
447 ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
448 Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
449
450APL:
451
452 ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
453
454Nicer typography in plain text files:
455
456 ╔══════════════════════════════════════════╗
457 ║ ║
458 ║ • ‘single’ and “double” quotes ║
459 ║ ║
460 ║ • Curly apostrophes: “We’ve been here” ║
461 ║ ║
462 ║ • Latin-1 apostrophe and accents: '´` ║
463 ║ ║
464 ║ • ‚deutsche‘ „Anführungszeichen“ ║
465 ║ ║
466 ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
467 ║ ║
468 ║ • ASCII safety test: 1lI|, 0OD, 8B ║
469 ║ ╭─────────╮ ║
470 ║ • the euro symbol: │ 14.95 € │ ║
471 ║ ╰─────────╯ ║
472 ╚══════════════════════════════════════════╝
473
474Combining characters:
475
476 STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
477
478Greek (in Polytonic):
479
480 The Greek anthem:
481
482 Σὲ γνωρίζω ἀπὸ τὴν κόψη
483 τοῦ σπαθιοῦ τὴν τρομερή,
484 σὲ γνωρίζω ἀπὸ τὴν ὄψη
485 ποὺ μὲ βία μετράει τὴ γῆ.
486
487 ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
488 τῶν ῾Ελλήνων τὰ ἱερά
489 καὶ σὰν πρῶτα ἀνδρειωμένη
490 χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
491
492 From a speech of Demosthenes in the 4th century BC:
493
494 Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
495 ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
496 λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
497 τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
498 εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
499 πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
500 οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
501 οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
502 ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
503 τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
504 γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
505 προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
506 σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
507 τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
508 τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
509 τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
510
511 Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
512
513Georgian:
514
515 From a Unicode conference invitation:
516
517 გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
518 კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
519 ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
520 ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
521 ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
522 ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
523 ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
524
525Russian:
526
527 From a Unicode conference invitation:
528
529 Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
530 Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
531 Конференция соберет широкий круг экспертов по вопросам глобального
532 Интернета и Unicode, локализации и интернационализации, воплощению и
533 применению Unicode в различных операционных системах и программных
534 приложениях, шрифтах, верстке и многоязычных компьютерных системах.
535
536Thai (UCS Level 2):
537
538 Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
539 classic 'San Gua'):
540
541 [----------------------------|------------------------]
542 ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
543 สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
544 ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
545 โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
546 เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
547 ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
548 พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
549 ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
550
551 (The above is a two-column text. If combining characters are handled
552 correctly, the lines of the second column should be aligned with the
553 | character above.)
554
555Ethiopian:
556
557 Proverbs in the Amharic language:
558
559 ሰማይ አይታረስ ንጉሥ አይከሰስ።
560 ብላ ካለኝ እንደአባቴ በቆመጠኝ።
561 ጌጥ ያለቤቱ ቁምጥና ነው።
562 ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
563 የአፍ ወለምታ በቅቤ አይታሽም።
564 አይጥ በበላ ዳዋ ተመታ።
565 ሲተረጉሙ ይደረግሙ።
566 ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
567 ድር ቢያብር አንበሳ ያስር።
568 ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
569 እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
570 የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
571 ሥራ ከመፍታት ልጄን ላፋታት።
572 ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
573 የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
574 ተንጋሎ ቢተፉ ተመልሶ ባፉ።
575 ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
576 እግርህን በፍራሽህ ልክ ዘርጋ።
577
578Runes:
579
580 ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
581
582 (Old English, which transcribed into Latin reads 'He cwaeth that he
583 bude thaem lande northweardum with tha Westsae.' and means 'He said
584 that he lived in the northern land near the Western Sea.')
585
586Braille:
587
588 ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
589
590 ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
591 ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
592 ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
593 ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
594 ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
595 ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
596
597 ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
598
599 ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
600 ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
601 ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
602 ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
603 ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
604 ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
605 ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
606 ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
607 ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
608
609 (The first couple of paragraphs of "A Christmas Carol" by Dickens)
610
611Compact font selection example text:
612
613 ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
614 abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
615 –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
616 ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
617
618Greetings in various languages:
619
620 Hello world, Καλημέρα κόσμε, コンニチハ
621
622Box drawing alignment tests: █
623 ▉
624 ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
625 ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
626 ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
627 ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
628 ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
629 ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
630 ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
631 ▝▀▘▙▄▟
632
633Sanskrit: काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥
634Sanskrit (standard transcription): kācaṃ śaknomyattum; nopahinasti mām.
635Classical Greek: ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει.
636Greek (monotonic): Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα.
637Greek (polytonic): Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα.
638Etruscan: (NEEDED)
639Latin: Vitrum edere possum; mihi non nocet.
640Old French: Je puis mangier del voirre. Ne me nuit.
641French: Je peux manger du verre, ça ne me fait pas mal.
642Provençal / Occitan: Pòdi manjar de veire, me nafrariá pas.
643Québécois: J'peux manger d'la vitre, ça m'fa pas mal.
644Walloon: Dji pou magnî do vêre, çoula m' freut nén må.
645Champenois: (NEEDED)
646Lorrain: (NEEDED)
647Picard: Ch'peux mingi du verre, cha m'foé mie n'ma.
648Corsican/Corsu: (NEEDED)
649Jèrriais: (NEEDED)
650Kreyòl Ayisyen (Haitï): Mwen kap manje vè, li pa blese'm.
651Basque: Kristala jan dezaket, ez dit minik ematen.
652Catalan / Català: Puc menjar vidre, que no em fa mal.
653Spanish: Puedo comer vidrio, no me hace daño.
654Aragonés: Puedo minchar beire, no me'n fa mal .
655Aranés: (NEEDED)
656Mallorquín: (NEEDED)
657Galician: Eu podo xantar cristais e non cortarme.
658European Portuguese: Posso comer vidro, não me faz mal.
659Brazilian Portuguese (8): Posso comer vidro, não me machuca.
660Caboverdiano/Kabuverdianu (Cape Verde): M' podê cumê vidru, ca ta maguâ-m'.
661Papiamentu: Ami por kome glas anto e no ta hasimi daño.
662Italian: Posso mangiare il vetro e non mi fa male.
663Milanese: Sôn bôn de magnà el véder, el me fa minga mal.
664Roman: Me posso magna' er vetro, e nun me fa male.
665Napoletano: M' pozz magna' o'vetr, e nun m' fa mal.
666Venetian: Mi posso magnare el vetro, no'l me fa mae.
667Zeneise (Genovese): Pòsso mangiâ o veddro e o no me fà mâ.
668Sicilian: Puotsu mangiari u vitru, nun mi fa mali.
669Campinadese (Sardinia): (NEEDED)
670Lugudorese (Sardinia): (NEEDED)
671Romansch (Grischun): Jau sai mangiar vaider, senza che quai fa donn a mai.
672Romany / Tsigane: (NEEDED)
673Romanian: Pot să mănânc sticlă și ea nu mă rănește.
674Esperanto: Mi povas manĝi vitron, ĝi ne damaĝas min.
675Pictish: (NEEDED)
676Breton: (NEEDED)
677Cornish: Mý a yl dybry gwéder hag éf ny wra ow ankenya.
678Welsh: Dw i'n gallu bwyta gwydr, 'dyw e ddim yn gwneud dolur i mi.
679Manx Gaelic: Foddym gee glonney agh cha jean eh gortaghey mee.
680Old Irish (Ogham): ᚛᚛ᚉᚑᚅᚔᚉᚉᚔᚋ ᚔᚈᚔ ᚍᚂᚐᚅᚑ ᚅᚔᚋᚌᚓᚅᚐ᚜
681Old Irish (Latin): Con·iccim ithi nglano. Ním·géna.
682Irish: Is féidir liom gloinne a ithe. Ní dhéanann sí dochar ar bith dom.
683Ulster Gaelic: Ithim-sa gloine agus ní miste damh é.
684Scottish Gaelic: S urrainn dhomh gloinne ithe; cha ghoirtich i mi.
685Anglo-Saxon (Runes): ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬
686Anglo-Saxon (Latin): Ic mæg glæs eotan ond hit ne hearmiað me.
687Middle English: Ich canne glas eten and hit hirtiþ me nouȝt.
688English: I can eat glass and it doesn't hurt me.
689English (IPA): [aɪ kæn iːt glɑːs ænd ɪt dɐz nɒt hɜːt miː] (Received Pronunciation)
690English (Braille): ⠊⠀⠉⠁⠝⠀⠑⠁⠞⠀⠛⠇⠁⠎⠎⠀⠁⠝⠙⠀⠊⠞⠀⠙⠕⠑⠎⠝⠞⠀⠓⠥⠗⠞⠀⠍⠑
691Jamaican: Mi kian niam glas han i neba hot mi.
692Lalland Scots / Doric: Ah can eat gless, it disnae hurt us.
693Glaswegian: (NEEDED)
694Gothic (4): 𐌼𐌰𐌲 𐌲𐌻𐌴𐍃 𐌹̈𐍄𐌰𐌽, 𐌽𐌹 𐌼𐌹𐍃 𐍅𐌿 𐌽𐌳𐌰𐌽 𐌱𐍂𐌹𐌲𐌲𐌹𐌸.
695Old Norse (Runes): ᛖᚴ ᚷᛖᛏ ᛖᛏᛁ ᚧ ᚷᛚᛖᚱ ᛘᚾ ᚦᛖᛋᛋ ᚨᚧ ᚡᛖ ᚱᚧᚨ ᛋᚨᚱ
696Old Norse (Latin): Ek get etið gler án þess að verða sár.
697Norsk / Norwegian (Nynorsk): Eg kan eta glas utan å skada meg.
698Norsk / Norwegian (Bokmål): Jeg kan spise glass uten å skade meg.
699Føroyskt / Faroese: Eg kann eta glas, skaðaleysur.
700Íslenska / Icelandic: Ég get etið gler án þess að meiða mig.
701Svenska / Swedish: Jag kan äta glas utan att skada mig.
702Dansk / Danish: Jeg kan spise glas, det gør ikke ondt på mig.
703Sønderjysk: Æ ka æe glass uhen at det go mæ naue.
704Frysk / Frisian: Ik kin glês ite, it docht me net sear.
705Nederlands / Dutch: Ik kan glas eten, het doet mij geen kwaad.
706Kirchröadsj/Bôchesserplat: Iech ken glaas èèse, mer 't deet miech jing pieng.
707Afrikaans: Ek kan glas eet, maar dit doen my nie skade nie.
708Lëtzebuergescht / Luxemburgish: Ech kan Glas iessen, daat deet mir nët wei.
709Deutsch / German: Ich kann Glas essen, ohne mir zu schaden.
710Ruhrdeutsch: Ich kann Glas verkasematuckeln, ohne dattet mich wat jucken tut.
711Langenfelder Platt: Isch kann Jlaas kimmeln, uuhne datt mich datt weh dääd.
712Lausitzer Mundart ("Lusatian"): Ich koann Gloos assn und doas dudd merr ni wii.
713Odenwälderisch: Iech konn glaasch voschbachteln ohne dass es mir ebbs daun doun dud.
714Sächsisch / Saxon: 'sch kann Glos essn, ohne dass'sch mer wehtue.
715Pfälzisch: Isch konn Glass fresse ohne dasses mer ebbes ausmache dud.
716Schwäbisch / Swabian: I kå Glas frässa, ond des macht mr nix!
717Deutsch (Voralberg): I ka glas eassa, ohne dass mar weh tuat.
718Bayrisch / Bavarian: I koh Glos esa, und es duard ma ned wei.
719Allemannisch: I kaun Gloos essen, es tuat ma ned weh.
720Schwyzerdütsch (Zürich): Ich chan Glaas ässe, das schadt mir nöd.
721Schwyzerdütsch (Luzern): Ech cha Glâs ässe, das schadt mer ned.
722Plautdietsch: (NEEDED)
723Hungarian: Meg tudom enni az üveget, nem lesz tőle bajom.
724Suomi / Finnish: Voin syödä lasia, se ei vahingoita minua.
725Sami (Northern): Sáhtán borrat lása, dat ii leat bávččas.
726Erzian: Мон ярсан суликадо, ды зыян эйстэнзэ а ули.
727Northern Karelian: Mie voin syvvä lasie ta minla ei ole kipie.
728Southern Karelian: Minä voin syvvä st'oklua dai minule ei ole kibie.
729Vepsian: (NEEDED)
730Votian: (NEEDED)
731Livonian: (NEEDED)
732Estonian: Ma võin klaasi süüa, see ei tee mulle midagi.
733Latvian: Es varu ēst stiklu, tas man nekaitē.
734Lithuanian: Aš galiu valgyti stiklą ir jis manęs nežeidžia
735Old Prussian: (NEEDED)
736Sorbian (Wendish): (NEEDED)
737Czech: Mohu jíst sklo, neublíží mi.
738Slovak: Môžem jesť sklo. Nezraní ma.
739Polska / Polish: Mogę jeść szkło i mi nie szkodzi.
740Slovenian: Lahko jem steklo, ne da bi mi škodovalo.
741Bosnian, Croatian, Montenegrin and Serbian (Latin): Ja mogu jesti staklo, i to mi ne šteti.
742Bosnian, Montenegrin and Serbian (Cyrillic): Ја могу јести стакло, и то ми не штети.
743Macedonian: Можам да јадам стакло, а не ме штета.
744Russian: Я могу есть стекло, оно мне не вредит.
745Belarusian (Cyrillic): Я магу есці шкло, яно мне не шкодзіць.
746Belarusian (Lacinka): Ja mahu jeści škło, jano mne ne škodzić.
747Ukrainian: Я можу їсти скло, і воно мені не зашкодить.
748Bulgarian: Мога да ям стъкло, то не ми вреди.
749Georgian: მინას ვჭამ და არა მტკივა.
750Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։
751Albanian: Unë mund të ha qelq dhe nuk më gjen gjë.
752Turkish: Cam yiyebilirim, bana zararı dokunmaz.
753Turkish (Ottoman): جام ييه بلورم بڭا ضررى طوقونمز
754Tatar: Алам да бар, пыяла, әмма бу ранит мине.
755Uzbek / O’zbekcha: (Roman): Men shisha yeyishim mumkin, ammo u menga zarar keltirmaydi.
756Uzbek / Ўзбекча (Cyrillic): Мен шиша ейишим мумкин, аммо у менга зарар келтирмайди.
757Bangla / Bengali: আমি কাঁচ খেতে পারি, তাতে আমার কোনো ক্ষতি হয় না।
758Marathi (masculine): मी काच खाऊ शकतो, मला ते दुखत नाही.
759Marathi (feminine): मी काच खाऊ शकते, मला ते दुखत नाही.
760Kannada: ನನಗೆ ಹಾನಿ ಆಗದೆ, ನಾನು ಗಜನ್ನು ತಿನಬಹುದು
761Hindi (masculine): मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती.
762Hindi (feminine): मैं काँच खा सकती हूँ और मुझे उससे कोई चोट नहीं पहुंचती.
763Malayalam: എനിക്ക് ഗ്ലാസ് തിന്നാം. അതെന്നെ വേദനിപ്പിക്കില്ല.
764Tamil: நான் கண்ணாடி சாப்பிடுவேன், அதனால் எனக்கு ஒரு கேடும் வராது.
765Telugu: నేను గాజు తినగలను మరియు అలా చేసినా నాకు ఏమి ఇబ్బంది లేదు
766Sinhalese: මට වීදුරු කෑමට හැකියි. එයින් මට කිසි හානියක් සිදු නොවේ.
767Urdu(3): میں کانچ کھا سکتا ہوں اور مجھے تکلیف نہیں ہوتی ۔
768Pashto(3): زه شيشه خوړلې شم، هغه ما نه خوږوي
769Farsi / Persian(3): .من می توانم بدونِ احساس درد شيشه بخورم
770Arabic(3): أنا قادر على أكل الزجاج و هذا لا يؤلمني.
771Aramaic: (NEEDED)
772Maltese: Nista' niekol il-ħġieġ u ma jagħmilli xejn.
773Hebrew(3): אני יכול לאכול זכוכית וזה לא מזיק לי.
774Yiddish(3): איך קען עסן גלאָז און עס טוט מיר נישט װײ.
775Judeo-Arabic: (NEEDED)
776Ladino: (NEEDED)
777Gǝʼǝz: (NEEDED)
778Amharic: (NEEDED)
779Twi: Metumi awe tumpan, ɜnyɜ me hwee.
780Hausa (Latin): Inā iya taunar gilāshi kuma in gamā lāfiyā.
781Hausa (Ajami) (2): إِنا إِىَ تَونَر غِلَاشِ كُمَ إِن غَمَا لَافِىَا
782Yoruba(4): Mo lè je̩ dígí, kò ní pa mí lára.
783Lingala: Nakokí kolíya biténi bya milungi, ekosála ngáí mabé tɛ́.
784(Ki)Swahili: Naweza kula bilauri na sikunyui.
785Malay: Saya boleh makan kaca dan ia tidak mencederakan saya.
786Tagalog: Kaya kong kumain nang bubog at hindi ako masaktan.
787Chamorro: Siña yo' chumocho krestat, ti ha na'lalamen yo'.
788Fijian: Au rawa ni kana iloilo, ia au sega ni vakacacani kina.
789Javanese: Aku isa mangan beling tanpa lara.
790Burmese (Unicode 4.0): က္ယ္ဝန္တော္၊က္ယ္ဝန္မ မ္ယက္စားနုိင္သည္။ ၎က္ရောင့္ ထိခုိက္မ္ဟု မရ္ဟိပာ။ (9)
791Burmese (Unicode 5.0): ကျွန်တော် ကျွန်မ မှန်စားနိုင်တယ်။ ၎င်းကြောင့် ထိခိုက်မှုမရှိပါ။ (9)
792Vietnamese (quốc ngữ): Tôi có thể ăn thủy tinh mà không hại gì.
793Vietnamese (nôm) (4): 些 𣎏 世 咹 水 晶 𦓡 空 𣎏 害 咦
794Khmer: ខ្ញុំអាចញុំកញ្ចក់បាន ដោយគ្មានបញ្ហារ
795Lao: ຂອ້ຍກິນແກ້ວໄດ້ໂດຍທີ່ມັນບໍ່ໄດ້ເຮັດໃຫ້ຂອ້ຍເຈັບ.
796Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ
797Mongolian (Cyrillic): Би шил идэй чадна, надад хортой биш
798Mongolian (Classic) (5): ᠪᠢ ᠰᠢᠯᠢ ᠢᠳᠡᠶᠦ ᠴᠢᠳᠠᠨᠠ ᠂ ᠨᠠᠳᠤᠷ ᠬᠣᠤᠷᠠᠳᠠᠢ ᠪᠢᠰᠢ
799Dzongkha: (NEEDED)
800Nepali: म काँच खान सक्छू र मलाई केहि नी हुन्न् ।
801Tibetan: ཤེལ་སྒོ་ཟ་ནས་ང་ན་གི་མ་རེད།
802Chinese: 我能吞下玻璃而不伤身体。
803Chinese (Traditional): 我能吞下玻璃而不傷身體。
804Taiwanese(6): Góa ē-tàng chia̍h po-lê, mā bē tio̍h-siong.
805Japanese: 私はガラスを食べられます。それは私を傷つけません。
806Korean: 나는 유리를 먹을 수 있어요. 그래도 아프지 않아요
807Bislama: Mi save kakae glas, hemi no save katem mi.
808Hawaiian: Hiki iaʻu ke ʻai i ke aniani; ʻaʻole nō lā au e ʻeha.
809Marquesan: E koʻana e kai i te karahi, mea ʻā, ʻaʻe hauhau.
810Inuktitut (10): ᐊᓕᒍᖅ ᓂᕆᔭᕌᖓᒃᑯ ᓱᕋᙱᑦᑐᓐᓇᖅᑐᖓ
811Chinook Jargon: Naika məkmək kakshət labutay, pi weyk ukuk munk-sik nay.
812Navajo: Tsésǫʼ yishą́ągo bííníshghah dóó doo shił neezgai da.
813Cherokee (and Cree, Chickasaw, Cree, Micmac, Ojibwa, Lakota, Náhuatl, Quechua, Aymara, and other American languages): (NEEDED)
814Garifuna: (NEEDED)
815Gullah: (NEEDED)
816Lojban: mi kakne le nu citka le blaci .iku'i le se go'i na xrani mi
817Nórdicg: Ljœr ye caudran créneþ ý jor cẃran."#;
818
819 #[test]
820 fn test_encoding_beautiful_day() {
821 let tokenizer = WorldTokenizer::new(None).unwrap();
822 let token_ids = tokenizer.encode(BEAUTIFUL_DAY);
823 assert_eq!(token_ids, [33520, 4600, 332, 59219, 21509, 47, 33, 10381, 11639, 13091, 15597, 11685, 14734, 10250, 11639, 10080]);
824 }
825
826 #[test]
827 fn test_encoding_decoding_beautiful_day() {
828 let tokenizer = WorldTokenizer::new(None).unwrap();
829 let token_ids = tokenizer.encode(BEAUTIFUL_DAY);
830 let text = tokenizer.decode(token_ids).unwrap();
831 assert_eq!(text, BEAUTIFUL_DAY);
832 }
833
834 #[test]
835 fn test_encoding_decoding_japanese() {
836 let tokenizer = WorldTokenizer::new(None).unwrap();
837 let token_ids = tokenizer.encode(JAPANESE);
838 let text = tokenizer.decode(token_ids).unwrap();
839 assert_eq!(text, JAPANESE);
840 }
841
842 #[test]
843 fn test_utf8_tokenization() {
844 let tokenizer = WorldTokenizer::new(None).unwrap();
845 let token_ids = tokenizer.encode(LONG_UTF8_TEXT);
846 let text = tokenizer.decode(token_ids).unwrap();
847 assert_eq!(text, LONG_UTF8_TEXT);
848 }
849
850 #[test]
851 fn test_get_vocab() {
852 let tokenizer = WorldTokenizer::new(None).unwrap();
853 let vocab = tokenizer.get_vocab();
854 assert_eq!(vocab.len(), 65044);
857 }
858}