1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![doc(html_root_url = "https://docs.rs/chardetng_c/0.1.0")]

//! C API for [`chardetng`](https://docs.rs/chardetng/)
//!
//! # Panics
//!
//! This crate is designed to be used only in a `panic=abort` scenario.
//! Panic propagation across FFI is not handled!
//!
//! # Licensing
//!
//! See the file named [COPYRIGHT](https://github.com/hsivonen/chardetng_c/blob/master/COPYRIGHT).

use encoding_rs::Encoding;
use chardetng::EncodingDetector;

/// Creates a new Web browser-oriented detector that guesses which
/// character encoding a stream of bytes is encoded in.
///
/// Input is supplied to the detector incrementally through the
/// `chardetng_encoding_detector_feed` function, and the detector's current
/// guess is read back through the `chardetng_encoding_detector_guess`
/// function. The guessing parameters are arguments to
/// `chardetng_encoding_detector_guess` rather than to this constructor so
/// that the application can check whether those arguments affect the
/// guessing outcome. (The specific use case is to disable UI for re-running
/// the detector with UTF-8 allowed and the top-level domain name ignored if
/// those arguments don't change the guess.)
///
/// The returned detector is heap-allocated and owned by the caller. It must
/// be released after use with `chardetng_encoding_detector_free`.
#[no_mangle]
pub unsafe extern "C" fn chardetng_encoding_detector_new() -> *mut EncodingDetector {
    let detector = Box::new(EncodingDetector::new());
    // Hand ownership of the allocation to the caller; it is reclaimed by
    // `chardetng_encoding_detector_free`.
    Box::into_raw(detector)
}

/// Deallocates a detector obtained from `chardetng_encoding_detector_new`.
///
/// Passing `NULL` is allowed and does nothing, mirroring the behavior of
/// C `free()`.
///
/// # Undefined Behavior
///
/// UB ensues if
///
/// * `detector` is non-`NULL` and does not point to a detector obtained from
///   `chardetng_encoding_detector_new`.
/// * `detector` has already been freed with this function.
/// * `detector` is used in any way after this function returns.
#[no_mangle]
pub unsafe extern "C" fn chardetng_encoding_detector_free(detector: *mut EncodingDetector) {
    // `Box::from_raw` must never be given a null pointer, so treat `NULL`
    // as "nothing to free" instead of invoking undefined behavior.
    if detector.is_null() {
        return;
    }
    // SAFETY: per the contract above, a non-null `detector` came from
    // `Box::into_raw` in `chardetng_encoding_detector_new` and has not been
    // freed yet, so reconstituting the `Box` and dropping it is sound.
    drop(Box::from_raw(detector));
}

/// Inform the detector of a chunk of input.
///
/// The byte stream is represented as a sequence of calls to this
/// function such that the concatenation of the arguments to this
/// function form the byte stream. It does not matter how the application
/// chooses to chunk the stream. It is OK to call this function with
/// a zero-length buffer.
///
/// The end of the stream is indicated by calling this function with
/// `last` set to `true`. In that case, the end of the stream is
/// considered to occur after the last byte of the `buffer` (which
/// may be zero-length) passed in the same call. Once this function
/// has been called with `last` set to `true` this function must not
/// be called again.
///
/// If you want to perform detection on just the prefix of a longer
/// stream, do not pass `last=true` after the prefix if the stream
/// actually still continues.
///
/// Returns `true` if after processing `buffer` the stream has
/// contained at least one non-ASCII byte and `false` if only
/// ASCII has been seen so far.
///
/// # Panics
///
/// If this function has previously been called with `last` set to `true`.
///
/// # Undefined Behavior
///
/// UB ensues if
///
/// * `detector` does not point to a detector obtained from
///   `chardetng_encoding_detector_new` but not yet freed with
///   `chardetng_encoding_detector_free`.
/// * `buffer_len` is non-zero and `buffer` is `NULL`. (When `buffer_len` is
///   0, `buffer` is never read, so it may be `NULL` or a bogus pointer.)
/// * `buffer_len` is non-zero and `buffer` and `buffer_len` don't designate
///   a range of memory valid for reading.
#[no_mangle]
pub unsafe extern "C" fn chardetng_encoding_detector_feed(
    detector: *mut EncodingDetector,
    buffer: *const u8,
    buffer_len: usize,
    last: bool,
) -> bool {
    // `slice::from_raw_parts` requires a non-null pointer even for a
    // zero-length slice, but C callers commonly pass `(NULL, 0)` for empty
    // input. Use a static empty slice in that case so the pointer is never
    // touched.
    let chunk: &[u8] = if buffer_len == 0 {
        &[]
    } else {
        // SAFETY: the caller guarantees that `buffer` and `buffer_len`
        // designate readable memory whenever `buffer_len` is non-zero.
        ::std::slice::from_raw_parts(buffer, buffer_len)
    };
    // SAFETY: the caller guarantees `detector` points to a live detector.
    (*detector).feed(chunk, last)
}

/// Guess the encoding given the bytes pushed to the detector so far
/// (via `chardetng_encoding_detector_feed()`), the top-level domain name
/// from which the bytes were loaded, and an indication of whether to
/// consider UTF-8 as a permissible guess.
///
/// The `tld` argument takes the rightmost DNS label of the hostname of the
/// host the stream was loaded from in lower-case ASCII form. That is, if
/// the label is an internationalized top-level domain name, it must be
/// provided in its Punycode form. If the TLD that the stream was loaded
/// from is unavailable, `NULL` may be passed instead (and 0 as `tld_len`),
/// which is equivalent to passing a pointer to "com" as `tld` and 3 as
/// `tld_len`.
///
/// If the `allow_utf8` argument is set to `false`, the return value of
/// this function won't be `UTF_8_ENCODING`. When performing detection
/// on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
/// unless the user has taken a specific contextual action to request an
/// override. This way, Web developers cannot start depending on UTF-8
/// detection. Such reliance would make the Web Platform more brittle.
///
/// Returns the guessed encoding (never `NULL`).
///
/// # Panics
///
/// If `tld` is `NULL` but `tld_len` is not zero.
///
/// If `tld` contains non-ASCII, period, or upper-case letters. (The panic
/// condition is intentionally limited to signs of failing to extract the
/// label correctly, failing to provide it in its Punycode form, and failure
/// to lower-case it. Full DNS label validation is intentionally not performed
/// to avoid panics when the reality doesn't match the specs.)
///
/// # Undefined Behavior
///
/// UB ensues if
///
/// * `detector` does not point to a detector obtained from
///   `chardetng_encoding_detector_new` but not yet freed with
///   `chardetng_encoding_detector_free`.
/// * `tld` is non-NULL and `tld_len` is non-zero but `tld` and `tld_len`
///   don't designate a range of memory valid for reading.
#[no_mangle]
pub unsafe extern "C" fn chardetng_encoding_detector_guess(
    detector: *const EncodingDetector,
    tld: *const u8,
    tld_len: usize,
    allow_utf8: bool,
) -> *const Encoding {
    // A NULL `tld` means "no TLD available"; a non-zero length alongside it
    // is a caller bug and is reported by panicking.
    let label = match tld.is_null() {
        true => {
            assert_eq!(tld_len, 0);
            None
        }
        false => Some(::std::slice::from_raw_parts(tld, tld_len)),
    };
    let detector_ref = &*detector;
    detector_ref.guess(label, allow_utf8)
}