1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
// Copyright 2018 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![doc(html_root_url = "https://docs.rs/shift_or_euc/0.1.0")]

//! A Japanese legacy encoding detector for detecting between Shift_JIS,
//! EUC-JP, and, optionally, ISO-2022-JP _given_ the assumption that the
//! encoding is one of those.
//!
//! This detector is generally more accurate (but see below about the failure
//! mode on half-width katakana) and decides much sooner than machine
//! learning-based detectors. To decide EUC-JP, machine learning-based
//! detectors try to gain confidence that the input looks like EUC-JP. To
//! decide EUC-JP, this detector instead looks for two simple rule-based
//! signs of the input not being Shift_JIS.
//!
//! As a consequence of not containing machine learning tables, the binary
//! size footprint that this crate adds on top of
//! [`encoding_rs`](https://docs.rs/crate/encoding_rs) is tiny.
//!
//! # Licensing
//!
//! See the file named [COPYRIGHT](https://github.com/hsivonen/shift_or_euc/blob/master/COPYRIGHT).
//!
//! # Principle of Operation
//!
//! The detector is based on two observations:
//!
//! 1. The ISO-2022-JP escape sequences don't normally occur in Shift_JIS or
//!    EUC-JP, so encountering such an escape sequence (before non-ASCII has been
//!    encountered) can be taken as indication of ISO-2022-JP.
//! 2. When normal (full-width) kana or common kanji encoded as Shift_JIS is
//!    decoded as EUC-JP, or vice versa, the result is either an error or
//!    half-width katakana, and it's very uncommon for Japanese HTML to have
//!    a half-width katakana character before a normal kana or common kanji
//!    character.
Therefore, if decoding as Shift_JIS results in error or //! have-width katakana, the detector decides that the content is EUC-JP, and //! vice versa. //! //! # Failure Modes //! //! The detector gives the wrong answer if the text has a half-width katakana //! character before normal kana or common kanji. Some uncommon kanji are //! undecidable. (All JIS X 0208 Level 1 kanji are decidable.) //! //! The half-width katakana issue is mainly relevant for old 8-bit JIS X //! 0201-only text files that would decode correctly as Shift_JIS but that the //! detector detects as EUC-JP. //! //! The undecidable kanji issue does not realistically show up when a full //! document is fed to the detector, because, realistically, in a full //! document, there is at least one kana or common kanji. It can occur, //! though, if the detector is only run on a prefix of a document and the //! prefix only contains the title of the document. It is possible for //! document title to consist entirely of undecidable kanji. (Indeed, //! Japanese Wikipedia has articles with such titles.) If the detector is //! undecided, falling back to Shift_JIS is typically the Web oriented better //! guess. use encoding_rs::Decoder; use encoding_rs::DecoderResult; use encoding_rs::Encoding; use encoding_rs::EUC_JP; use encoding_rs::ISO_2022_JP; use encoding_rs::SHIFT_JIS; /// Returns the index of the first non-ASCII byte or the first /// 0x1B, whichever comes first, or the length of the buffer /// if neither is found. fn find_non_ascii_or_escape(buffer: &[u8]) -> usize { let ascii_up_to = Encoding::ascii_valid_up_to(buffer); if let Some(escape) = memchr::memchr(0x1B, &buffer[..ascii_up_to]) { escape } else { ascii_up_to } } /// Feed decoder with one byte (if `last` is `false`) or EOF (if `last` is /// `true`). `byte` is ignored if `last` is `true`. /// Returns `true` if there was no rejection or `false` upon rejecting the /// encoding hypothesis represented by this decoder. 
#[inline(always)] fn feed_decoder(decoder: &mut Decoder, byte: u8, last: bool) -> bool { let mut output = [0u16; 1]; let input = [byte]; let (result, _read, written) = decoder.decode_to_utf16_without_replacement( if last { b"" } else { &input }, &mut output, last, ); match result { DecoderResult::InputEmpty => { if written == 1 { match output[0] { 0xFF61...0xFF9F => { return false; } _ => {} } } } DecoderResult::Malformed(_, _) => { return false; } DecoderResult::OutputFull => { unreachable!(); } } true } /// A detector for detecting the character encoding of input on the /// precondition that the encoding is a Japanese legacy encoding. pub struct Detector { shift_jis_decoder: Decoder, euc_jp_decoder: Decoder, second_byte_in_escape: u8, iso_2022_jp_disqualified: bool, escape_seen: bool, finished: bool, } impl Detector { /// Instantiates the detector. If `allow_2022` is `true` the possible /// guesses are Shift_JIS, EUC-JP, ISO-2022-JP, and undecided. If /// `allow_2022` is `false`, the possible guesses are Shift_JIS, EUC-JP, /// and undecided. pub fn new(allow_2022: bool) -> Self { Detector { shift_jis_decoder: SHIFT_JIS.new_decoder_without_bom_handling(), euc_jp_decoder: EUC_JP.new_decoder_without_bom_handling(), second_byte_in_escape: 0, iso_2022_jp_disqualified: !allow_2022, escape_seen: false, finished: false, } } /// Feeds bytes to the detector. If `last` is `true` the end of the stream /// is considered to occur immediately after the end of `buffer`. /// Otherwise, the stream is expected to continue. `buffer` may be empty. /// /// If you're running the detector only on a prefix of a complete /// document, _do not_ pass `last` as `true` after the prefix if the /// stream as a whole still contains more content. /// /// Returns `Some(encoding_rs::SHIFT_JIS)` if the detector guessed /// Shift_JIS. Returns `Some(encoding_rs::EUC_JP)` if the detector /// guessed EUC-JP. 
Returns `Some(encoding_rs::ISO_2022_JP)` if the /// detector guessed ISO-2022-JP (only possible if `true` was passed as /// `allow_2022` when instantiating the detector). Returns `None` if the /// detector is undecided. If `None` is returned even when passing `true` /// as `last`, falling back to Shift_JIS is the best guess for Web /// purposes. /// /// Do not call again after the method has returned `Some(_)` or after /// the method has been called with `true` as `last`. /// /// # Panics /// /// If called after the method has returned `Some(_)` or after the method /// has been called with `true` as `last`. pub fn feed(&mut self, buffer: &[u8], last: bool) -> Option<&'static Encoding> { assert!( !self.finished, "Tried to used a detector that has finished." ); self.finished = true; // Will change back to false unless we return early let mut i = 0; if !self.iso_2022_jp_disqualified { if !self.escape_seen { i = find_non_ascii_or_escape(buffer); } while i < buffer.len() { let byte = buffer[i]; if byte > 0x7F { self.iso_2022_jp_disqualified = true; break; } if !self.escape_seen && byte == 0x1B { self.escape_seen = true; i += 1; continue; } if self.escape_seen && self.second_byte_in_escape == 0 { self.second_byte_in_escape = byte; i += 1; continue; } match (self.second_byte_in_escape, byte) { (0x28, 0x42) | (0x28, 0x4A) | (0x28, 0x49) | (0x24, 0x40) | (0x24, 0x42) => { return Some(ISO_2022_JP); } _ => {} } if self.escape_seen { self.iso_2022_jp_disqualified = true; break; } i += 1; } } for &byte in &buffer[i..] 
{ if !feed_decoder(&mut self.euc_jp_decoder, byte, false) { return Some(SHIFT_JIS); } if !feed_decoder(&mut self.shift_jis_decoder, byte, false) { return Some(EUC_JP); } } if last { if !feed_decoder(&mut self.euc_jp_decoder, 0, true) { return Some(SHIFT_JIS); } if !feed_decoder(&mut self.shift_jis_decoder, 0, true) { return Some(EUC_JP); } return None; } self.finished = false; None } } // Any copyright to the test code below this comment is dedicated to the // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ #[cfg(test)] mod tests { use super::*; #[test] fn test_iso_2022_jp() { let mut detector = Detector::new(true); assert_eq!( detector.feed(b"abc\x1B\x28\x42\xFF", true), Some(ISO_2022_JP) ); } #[test] fn test_error_precedence() { let mut detector = Detector::new(true); assert_eq!(detector.feed(b"abc\xFF", true), Some(SHIFT_JIS)); } #[test] fn test_invalid_euc_jp() { let mut detector = Detector::new(true); assert_eq!(detector.feed(b"abc\x81\x40", true), Some(SHIFT_JIS)); } #[test] fn test_invalid_shift_jis() { let mut detector = Detector::new(true); assert_eq!(detector.feed(b"abc\xEB\xA8", true), Some(EUC_JP)); } #[test] fn test_invalid_shift_jis_before_invalid_euc_jp() { let mut detector = Detector::new(true); assert_eq!(detector.feed(b"abc\xEB\xA8\x81\x40", true), Some(EUC_JP)); } #[test] fn test_undecided() { let mut detector = Detector::new(true); assert_eq!(detector.feed(b"abc", false), None); assert_eq!(detector.feed(b"abc", false), None); } }