1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
// Copyright 2018 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![doc(html_root_url = "https://docs.rs/shift_or_euc/0.1.0")]

//! A Japanese legacy encoding detector for detecting between Shift_JIS,
//! EUC-JP, and, optionally, ISO-2022-JP _given_ the assumption that the
//! encoding is one of those.
//!
//! This detector is generally more accurate (but see below about the failure
//! mode on half-width katakana) and decides much sooner than machine
//! learning-based detectors. To decide EUC-JP, machine learning-based
//! detectors try to gain confidence that the input looks like EUC-JP. To
//! decide EUC-JP, this detector instead looks for two simple rule-based
//! signs of the input not being Shift_JIS.
//!
//! As a consequence of not containing machine learning tables, the binary
//! size footprint that this crate adds on top of
//! [`encoding_rs`](https://docs.rs/crate/encoding_rs) is tiny.
//!
//! # Licensing
//!
//! See the file named [COPYRIGHT](https://github.com/hsivonen/shift_or_euc/blob/master/COPYRIGHT).
//!
//! # Principle of Operation
//!
//! The detector is based on two observations:
//!
//! 1. The ISO-2022-JP escape sequences don't normally occur in Shift_JIS or
//! EUC-JP, so encountering such an escape sequence (before non-ASCII has been
//! encountered) can be taken as indication of ISO-2022-JP.
//! 2. When normal (full-width) kana or common kanji encoded as Shift_JIS is
//! decoded as EUC-JP, or vice versa, the result is either an error or
//! half-width katakana, and it's very uncommon for Japanese HTML to have
//! a half-width katakana character before a normal kana or common kanji
//! character. Therefore, if decoding as Shift_JIS results in an error or
//! half-width katakana, the detector decides that the content is EUC-JP, and
//! vice versa.
//!
//! # Failure Modes
//!
//! The detector gives the wrong answer if the text has a half-width katakana
//! character before normal kana or common kanji. Some uncommon kanji are
//! undecidable. (All JIS X 0208 Level 1 kanji are decidable.)
//!
//! The half-width katakana issue is mainly relevant for old 8-bit JIS X
//! 0201-only text files that would decode correctly as Shift_JIS but that the
//! detector detects as EUC-JP.
//!
//! The undecidable kanji issue does not realistically show up when a full
//! document is fed to the detector, because, realistically, in a full
//! document, there is at least one kana or common kanji. It can occur,
//! though, if the detector is only run on a prefix of a document and the
//! prefix only contains the title of the document. It is possible for a
//! document title to consist entirely of undecidable kanji. (Indeed,
//! Japanese Wikipedia has articles with such titles.) If the detector is
//! undecided, falling back to Shift_JIS is typically the better guess for
//! Web content.

use encoding_rs::Decoder;
use encoding_rs::DecoderResult;
use encoding_rs::Encoding;
use encoding_rs::EUC_JP;
use encoding_rs::ISO_2022_JP;
use encoding_rs::SHIFT_JIS;

/// Locates the point at which plain-ASCII scanning has to stop: the index
/// of the earliest byte in `buffer` that is either non-ASCII or an escape
/// (0x1B). Yields `buffer.len()` when the buffer contains neither.
fn find_non_ascii_or_escape(buffer: &[u8]) -> usize {
    // Only the leading all-ASCII run can contain an escape that precedes
    // the first non-ASCII byte, so the search is limited to that run. If
    // the run has no escape, its end is the answer.
    let ascii_len = Encoding::ascii_valid_up_to(buffer);
    let ascii_prefix = &buffer[..ascii_len];
    memchr::memchr(0x1B, ascii_prefix).unwrap_or(ascii_len)
}

/// Feeds `decoder` one byte (if `last` is `false`) or EOF (if `last` is
/// `true`). `byte` is ignored if `last` is `true`.
///
/// The encoding hypothesis represented by `decoder` is rejected when the
/// decoder reports malformed input, or when the single UTF-16 code unit it
/// produced lies in U+FF61..=U+FF9F (the half-width katakana range that the
/// module documentation describes as the sign of a wrong hypothesis).
///
/// Returns `true` if there was no rejection or `false` upon rejecting the
/// encoding hypothesis represented by this decoder.
#[inline(always)]
fn feed_decoder(decoder: &mut Decoder, byte: u8, last: bool) -> bool {
    let mut output = [0u16; 1];
    let input = [byte];
    let (result, _read, written) = decoder.decode_to_utf16_without_replacement(
        if last { b"" } else { &input },
        &mut output,
        last,
    );
    match result {
        DecoderResult::InputEmpty => {
            // A code unit is only present when the byte completed a
            // character; a pending lead byte writes nothing.
            if written == 1 {
                // Inclusive range written with `..=`; the older `...`
                // pattern syntax is deprecated and rejected by newer
                // editions.
                match output[0] {
                    0xFF61..=0xFF9F => return false,
                    _ => {}
                }
            }
            true
        }
        DecoderResult::Malformed(_, _) => false,
        // NOTE(review): treated as impossible because at most one byte is
        // fed per call into a one-unit buffer; confirm against the
        // encoding_rs output-size guarantees.
        DecoderResult::OutputFull => unreachable!(),
    }
}

/// A detector for detecting the character encoding of input on the
/// precondition that the encoding is a Japanese legacy encoding.
///
/// The detector is stateful: bytes are supplied incrementally through
/// `feed`, and the fields below carry the state between calls.
pub struct Detector {
    /// Decoder that is fed every examined byte to test the Shift_JIS
    /// hypothesis.
    shift_jis_decoder: Decoder,
    /// Decoder that is fed every examined byte to test the EUC-JP
    /// hypothesis.
    euc_jp_decoder: Decoder,
    /// The byte that followed 0x1B in the escape sequence being examined,
    /// or 0 while no such byte has been recorded.
    second_byte_in_escape: u8,
    /// `true` once ISO-2022-JP is no longer a possible guess. Starts out
    /// `true` when the detector is created with `allow_2022` set to `false`.
    iso_2022_jp_disqualified: bool,
    /// `true` once a 0x1B byte has been encountered while ISO-2022-JP was
    /// still a candidate.
    escape_seen: bool,
    /// `true` once `feed` has returned a guess or has been called with
    /// `last` set to `true`; feeding again after that panics.
    finished: bool,
}

impl Detector {
    /// Creates a fresh detector.
    ///
    /// When `allow_2022` is `true`, the detector may guess Shift_JIS,
    /// EUC-JP, or ISO-2022-JP, or stay undecided. When `allow_2022` is
    /// `false`, ISO-2022-JP is never guessed, leaving Shift_JIS, EUC-JP,
    /// and undecided as the possible outcomes.
    pub fn new(allow_2022: bool) -> Self {
        let shift_jis_decoder = SHIFT_JIS.new_decoder_without_bom_handling();
        let euc_jp_decoder = EUC_JP.new_decoder_without_bom_handling();
        Detector {
            shift_jis_decoder,
            euc_jp_decoder,
            second_byte_in_escape: 0,
            // Disallowing ISO-2022-JP is modelled as starting out with it
            // already ruled out.
            iso_2022_jp_disqualified: !allow_2022,
            escape_seen: false,
            finished: false,
        }
    }

    /// Feeds one byte, or EOF when `last` is `true`, to both decoders.
    ///
    /// The EUC-JP hypothesis is checked first, so when it is rejected the
    /// guess is Shift_JIS; otherwise a rejection of the Shift_JIS hypothesis
    /// yields EUC-JP. Returns `None` when neither hypothesis was rejected.
    // Forced inline so that the constant `last` at each call site is still
    // visible to the always-inlined `feed_decoder`.
    #[inline(always)]
    fn feed_both_decoders(&mut self, byte: u8, last: bool) -> Option<&'static Encoding> {
        if !feed_decoder(&mut self.euc_jp_decoder, byte, last) {
            return Some(SHIFT_JIS);
        }
        if !feed_decoder(&mut self.shift_jis_decoder, byte, last) {
            return Some(EUC_JP);
        }
        None
    }

    /// Supplies the next chunk of the stream to the detector.
    ///
    /// `buffer` holds the bytes that follow those passed in earlier calls
    /// and may be empty. Pass `true` as `last` only when the stream ends
    /// right after `buffer`; pass `false` while more input may follow.
    ///
    /// When the detector is run on just a prefix of a document, _do not_
    /// pass `true` as `last` after the prefix if the complete stream has
    /// more content.
    ///
    /// # Return value
    ///
    /// * `Some(encoding_rs::SHIFT_JIS)` if the guess is Shift_JIS.
    /// * `Some(encoding_rs::EUC_JP)` if the guess is EUC-JP.
    /// * `Some(encoding_rs::ISO_2022_JP)` if the guess is ISO-2022-JP,
    ///   which can only happen when the detector was instantiated with
    ///   `allow_2022` set to `true`.
    /// * `None` if the detector is undecided. If that is still the case
    ///   after passing `true` as `last`, falling back to Shift_JIS is the
    ///   best guess for Web purposes.
    ///
    /// Once this method has returned `Some(_)` or has been called with
    /// `true` as `last`, it must not be called again.
    ///
    /// # Panics
    ///
    /// If called after the method has returned `Some(_)` or after the method
    /// has been called with `true` as `last`.
    pub fn feed(&mut self, buffer: &[u8], last: bool) -> Option<&'static Encoding> {
        assert!(
            !self.finished,
            "Tried to used a detector that has finished."
        );
        // Every early return below ends the detector's life, so mark it
        // finished up front and undo that only on the one path that lets
        // the caller continue feeding.
        self.finished = true;

        let mut pos = 0;
        if !self.iso_2022_jp_disqualified {
            if !self.escape_seen {
                // Nothing before the first escape or non-ASCII byte can
                // affect the outcome, so jump straight to it.
                pos = find_non_ascii_or_escape(buffer);
            }
            while let Some(&byte) = buffer.get(pos) {
                if byte > 0x7F {
                    // Non-ASCII before a recognized escape sequence rules
                    // out ISO-2022-JP; this byte is left for the decoders.
                    self.iso_2022_jp_disqualified = true;
                    break;
                }
                if self.escape_seen {
                    if self.second_byte_in_escape == 0 {
                        // First byte after 0x1B: remember it and wait for
                        // the byte that completes the sequence.
                        self.second_byte_in_escape = byte;
                    } else {
                        match (self.second_byte_in_escape, byte) {
                            (0x28, 0x42)
                            | (0x28, 0x4A)
                            | (0x28, 0x49)
                            | (0x24, 0x40)
                            | (0x24, 0x42) => {
                                return Some(ISO_2022_JP);
                            }
                            _ => {
                                // An escape sequence other than the ones
                                // listed above rules out ISO-2022-JP.
                                self.iso_2022_jp_disqualified = true;
                                break;
                            }
                        }
                    }
                } else if byte == 0x1B {
                    self.escape_seen = true;
                }
                pos += 1;
            }
        }

        // Whatever the escape scan did not consume goes to both decoders.
        for &byte in &buffer[pos..] {
            if let Some(guess) = self.feed_both_decoders(byte, false) {
                return Some(guess);
            }
        }

        if last {
            // Signal EOF to the decoders. The detector stays finished
            // whether or not that produces a guess.
            return self.feed_both_decoders(0, true);
        }

        self.finished = false;
        None
    }
}

// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a fresh detector (with ISO-2022-JP allowed) over `input`
    /// treated as a complete stream and returns its guess.
    fn guess_complete(input: &[u8]) -> Option<&'static Encoding> {
        Detector::new(true).feed(input, true)
    }

    /// A recognized escape sequence before any non-ASCII byte yields
    /// ISO-2022-JP, regardless of what follows it.
    #[test]
    fn test_iso_2022_jp() {
        let guess = guess_complete(b"abc\x1B\x28\x42\xFF");
        assert_eq!(guess, Some(ISO_2022_JP));
    }

    /// The EUC-JP hypothesis is checked before the Shift_JIS one, so the
    /// guess for the 0xFF byte is Shift_JIS.
    #[test]
    fn test_error_precedence() {
        let guess = guess_complete(b"abc\xFF");
        assert_eq!(guess, Some(SHIFT_JIS));
    }

    /// Input rejected under the EUC-JP hypothesis is guessed as Shift_JIS.
    #[test]
    fn test_invalid_euc_jp() {
        let guess = guess_complete(b"abc\x81\x40");
        assert_eq!(guess, Some(SHIFT_JIS));
    }

    /// Input rejected under the Shift_JIS hypothesis is guessed as EUC-JP.
    #[test]
    fn test_invalid_shift_jis() {
        let guess = guess_complete(b"abc\xEB\xA8");
        assert_eq!(guess, Some(EUC_JP));
    }

    /// The first rejection decides: bytes later in the stream that would
    /// reject the other hypothesis no longer matter.
    #[test]
    fn test_invalid_shift_jis_before_invalid_euc_jp() {
        let guess = guess_complete(b"abc\xEB\xA8\x81\x40");
        assert_eq!(guess, Some(EUC_JP));
    }

    /// Pure ASCII leaves the detector undecided, and an undecided detector
    /// that has not seen the end of the stream can be fed again.
    #[test]
    fn test_undecided() {
        let mut detector = Detector::new(true);
        for _ in 0..2 {
            assert_eq!(detector.feed(b"abc", false), None);
        }
    }
}