utf8proc/transform/
options.rs

1//! Defines [`TransformOptions`] and related types.
2
3#[allow(unused_imports, reason = "used by docs")]
4use super::advanced;
5#[allow(unused_imports, reason = "used by docs")]
6use super::{UnicodeNormalizationForm, decompose_buffer, decompose_char, map};
7#[allow(unused_imports, reason = "used by docs")]
8use crate::ErrorKind;
9#[allow(unused_imports, reason = "used by docs")]
10use utf8proc_sys::utf8proc_option_t;
11
/// Options for the [`map`], [`decompose_buffer`], and [`decompose_char`] functions.
///
/// Used to flexibly support multiple transformations
/// through a single interface.
///
/// Some options are specific to composition/decomposition,
/// and are stored in [`CompositionOptions`].
///
/// The [`Default`] value enables no transformation at all and
/// rejects unassigned codepoints ([`UnassignedCodepointHandling::Forbid`]).
///
/// ## Limitation
/// Certain options are only supported in the [advanced] interface,
/// because they have the potential to produce invalid UTF8.
///
/// This currently includes the [`grapheme_boundary_markers`](Self::grapheme_boundary_markers) option,
/// and [`unassigned_codepoint_handling`](Self::unassigned_codepoint_handling) set to [`UnassignedCodepointHandling::Allow`].
#[derive(Clone, Debug, Default)]
#[non_exhaustive]
#[must_use]
pub struct TransformOptions {
    /// Specify how to handle unassigned codepoints.
    ///
    /// By default, this is set to [`UnassignedCodepointHandling::Forbid`].
    pub unassigned_codepoint_handling: UnassignedCodepointHandling,
    /// Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE.
    ///
    /// This is equivalent to the [`UTF8PROC_IGNORE`] option in the C library.
    ///
    /// [`UTF8PROC_IGNORE`]: utf8proc_option_t::UTF8PROC_IGNORE
    pub ignore: bool,
    /// Apply Unicode case-folding,
    /// to be able to do a case-insensitive
    /// string comparison.
    ///
    /// This is equivalent to the [`UTF8PROC_CASEFOLD`] option in the C library.
    ///
    /// [`UTF8PROC_CASEFOLD`]: utf8proc_option_t::UTF8PROC_CASEFOLD
    pub case_fold: bool,
    /// Inserts marker values at the beginning of each sequence which is representing
    /// a single grapheme cluster (see UAX#29).
    ///
    /// This is only usable in the [`advanced`] interface,
    /// because it produces invalid UTF8 or codepoints.
    /// Using this option in the simple interface *will panic*.
    ///
    /// The same functionality is also available through the [`crate::grapheme`] module.
    ///
    /// This is equivalent to the [`UTF8PROC_CHARBOUND`] option in the C library.
    ///
    /// [`UTF8PROC_CHARBOUND`]: utf8proc_option_t::UTF8PROC_CHARBOUND
    pub grapheme_boundary_markers: bool,
    /// Replace certain characters with their compatibility decomposition.
    ///
    /// This is used to implement [NFKD] and [NFKC] Unicode normalization.
    ///
    /// This is equivalent to the [`UTF8PROC_COMPAT`] option in the C library.
    ///
    /// [`UTF8PROC_COMPAT`]: utf8proc_option_t::UTF8PROC_COMPAT
    /// [NFKD]: UnicodeNormalizationForm::NFKD
    /// [NFKC]: UnicodeNormalizationForm::NFKC
    pub compat: bool,
    /// If not `None`, enables Unicode composition or decomposition.
    ///
    /// Use [`CompositionOptions::compose`] and [`CompositionOptions::decompose`]
    /// for default compose/decompose options.
    ///
    /// Equivalent to either [`UTF8PROC_COMPOSE`] or [`UTF8PROC_DECOMPOSE`] in the C library,
    /// depending on the [`CompositionDirection`].
    ///
    /// [`UTF8PROC_COMPOSE`]: utf8proc_option_t::UTF8PROC_COMPOSE
    /// [`UTF8PROC_DECOMPOSE`]: utf8proc_option_t::UTF8PROC_DECOMPOSE
    pub composition: Option<CompositionOptions>,
    /// Lump certain characters together.
    ///
    /// For example, HYPHEN U+2010 and MINUS U+2212 are converted to ASCII "-".
    /// Documented in [`lump.md`] in the utf8proc repository (link valid as of version v2.10.0).
    ///
    /// If the [`nlf_conversion`](Self::nlf_conversion) option is set,
    /// this includes a transformation of paragraph and
    /// line separators to ASCII line-feed (LF).
    ///
    /// This is equivalent to the [`UTF8PROC_LUMP`] option in the C library.
    ///
    /// [`lump.md`]: https://github.com/JuliaStrings/utf8proc/blob/v2.10.0/lump.md
    /// [`UTF8PROC_LUMP`]: utf8proc_option_t::UTF8PROC_LUMP
    pub lump: bool,
    /// Customize the conversion of NLF-sequences (LF, CRLF, CR, NEL).
    ///
    /// If this is `None`, no conversions are applied.
    /// Can be used to customize the [`strip_control_codes`](Self::strip_control_codes) option.
    pub nlf_conversion: Option<NlfConversionMode>,
    /// Strips and/or converts control characters.
    ///
    /// NLF-sequences are transformed into spaces, unless the
    /// [`nlf_conversion`](Self::nlf_conversion) option is specified.
    /// `HorizontalTab` (HT) and `FormFeed` (FF)
    /// are treated as an NLF-sequence in this case.
    /// All other control characters are simply removed.
    ///
    /// This is equivalent to the [`UTF8PROC_STRIPCC`] option in the C library.
    ///
    /// [`UTF8PROC_STRIPCC`]: utf8proc_option_t::UTF8PROC_STRIPCC
    pub strip_control_codes: bool,
    /// Prohibit combining characters that would violate [Unicode versioning stability].
    ///
    /// This is equivalent to the [`UTF8PROC_STABLE`] option in the C library.
    ///
    /// [Unicode versioning stability]: https://www.unicode.org/policies/stability_policy.html
    /// [`UTF8PROC_STABLE`]: utf8proc_option_t::UTF8PROC_STABLE
    pub stable: bool,
}
112impl TransformOptions {
113    /// Panic if options are used that could produce non-UTF8 data.
114    ///
115    /// These are only allowed in the [advanced] interface.
116    #[track_caller]
117    #[inline] // potential to be constant-folded
118    pub(crate) fn validate_utf8(&self) -> &Self {
119        assert!(
120            !self.grapheme_boundary_markers,
121            "Enabling `grapheme_boundary_markers` is forbidden in the simple interface",
122        );
123        match self.unassigned_codepoint_handling {
124            UnassignedCodepointHandling::Forbid | UnassignedCodepointHandling::Strip => { /* acceptable */ }
125            UnassignedCodepointHandling::Allow => {
126                panic!("Setting `unassigned_codepoint_handling=Allow` is forbidden in the simple interface")
127            }
128        }
129        self
130    }
131    /// Convert this into a FFI option.
132    ///
133    /// The returned option should be semantically valid,
134    /// and will not trigger a [`ErrorKind::InvalidOptions`] error.
135    ///
136    /// ## Safety
137    /// Certain options have the potential to produce non-UTF8 data,
138    /// which will trigger undefined behavior if passed to [`std::str::from_utf8_unchecked`].
139    ///
140    /// Call [`Self::validate_utf8`] to make sure these options are not present,
141    /// and if you allow them don't later convert to UTF8.
142    ///
143    /// This function itself can not trigger undefined behavior,
144    /// but may invalidate future assumptions (see above)
145    #[track_caller]
146    #[inline]
147    #[deny(unused_variables)]
148    pub(crate) unsafe fn to_ffi(&self) -> utf8proc_option_t {
149        let TransformOptions {
150            unassigned_codepoint_handling,
151            ignore,
152            case_fold,
153            grapheme_boundary_markers,
154            compat,
155            ref composition,
156            lump,
157            nlf_conversion,
158            strip_control_codes,
159            stable,
160        } = *self;
161        let mut res = utf8proc_option_t::NONE;
162        res |= match unassigned_codepoint_handling {
163            UnassignedCodepointHandling::Forbid => utf8proc_option_t::UTF8PROC_REJECTNA,
164            UnassignedCodepointHandling::Strip => utf8proc_option_t::UTF8PROC_STRIPNA,
165            UnassignedCodepointHandling::Allow => utf8proc_option_t::NONE,
166        };
167        if ignore {
168            res |= utf8proc_option_t::UTF8PROC_IGNORE;
169        }
170        if case_fold {
171            res |= utf8proc_option_t::UTF8PROC_CASEFOLD;
172        }
173        if grapheme_boundary_markers {
174            res |= utf8proc_option_t::UTF8PROC_CHARBOUND;
175        }
176        if compat {
177            res |= utf8proc_option_t::UTF8PROC_COMPAT;
178        }
179        if let Some(composition) = composition {
180            res |= match composition.direction {
181                CompositionDirection::Compose => utf8proc_option_t::UTF8PROC_COMPOSE,
182                CompositionDirection::Decompose => utf8proc_option_t::UTF8PROC_DECOMPOSE,
183            }
184        }
185        if lump {
186            res |= utf8proc_option_t::UTF8PROC_LUMP;
187        }
188        res |= match nlf_conversion {
189            None => utf8proc_option_t::NONE,
190            Some(NlfConversionMode::LineSeparation) => utf8proc_option_t::UTF8PROC_NLF2LS,
191            Some(NlfConversionMode::ParagraphSeparator) => utf8proc_option_t::UTF8PROC_NLF2PS,
192            Some(NlfConversionMode::Unknown) => utf8proc_option_t::UTF8PROC_NLF2LF,
193        };
194        if strip_control_codes {
195            res |= utf8proc_option_t::UTF8PROC_STRIPCC;
196        }
197        if stable {
198            res |= utf8proc_option_t::UTF8PROC_STABLE;
199        }
200        res
201    }
202}
203
/// Indicates how to handle unassigned codepoints.
///
/// The default is [`Forbid`](Self::Forbid).
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Default)]
#[must_use]
pub enum UnassignedCodepointHandling {
    /// Return an [`ErrorKind::NotAssigned`] error if an unassigned codepoint is encountered.
    ///
    /// This corresponds to the [`UTF8PROC_REJECTNA`] option in the C library.
    ///
    /// [`UTF8PROC_REJECTNA`]: utf8proc_option_t::UTF8PROC_REJECTNA
    #[default]
    Forbid,
    /// Remove unassigned codepoints.
    ///
    /// This corresponds to the [`UTF8PROC_STRIPNA`] option in the C library.
    ///
    /// [`UTF8PROC_STRIPNA`]: utf8proc_option_t::UTF8PROC_STRIPNA
    Strip,
    /// Allow unassigned codepoints, without returning an error or ignoring them.
    ///
    /// This corresponds to passing neither of the two flags above to the C library.
    ///
    /// This option can only be set using the [advanced] interface,
    /// as unassigned codepoints have the potential to produce invalid UTF8.
    Allow,
}
227
/// Controls Unicode composition and decomposition.
///
/// There is no type-wide default, because you must choose a direction.
/// Use [`Self::compose`] or [`Self::decompose`] instead.
#[derive(Clone, Debug)]
#[non_exhaustive]
#[must_use]
pub struct CompositionOptions {
    /// Whether composition or decomposition should be performed
    pub direction: CompositionDirection,
    /// Strips all character markings.
    ///
    /// This includes non-spacing, spacing and enclosing (i.e. accents).
    ///
    /// This is equivalent to the [`UTF8PROC_STRIPMARK`] option in the C library.
    ///
    /// [`UTF8PROC_STRIPMARK`]: utf8proc_option_t::UTF8PROC_STRIPMARK
    pub strip_marks: bool,
}
247impl CompositionOptions {
248    /// Enable composition, with no additional options.
249    #[inline]
250    pub const fn compose() -> CompositionOptions {
251        CompositionOptions {
252            direction: CompositionDirection::Compose,
253            strip_marks: false,
254        }
255    }
256
257    /// Enable decomposition, with no additional options.
258    #[inline]
259    pub const fn decompose() -> CompositionOptions {
260        CompositionOptions {
261            direction: CompositionDirection::Decompose,
262            ..Self::compose()
263        }
264    }
265}
/// Controls whether composition or decomposition is being performed.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
#[must_use]
pub enum CompositionDirection {
    /// Enable composition, recomposing characters by canonical equivalence.
    ///
    /// This is equivalent to the [`UTF8PROC_COMPOSE`] option in the C library.
    ///
    /// [`UTF8PROC_COMPOSE`]: utf8proc_option_t::UTF8PROC_COMPOSE
    Compose,
    /// Enable decomposition, decomposing characters by canonical equivalence.
    ///
    /// This is equivalent to the [`UTF8PROC_DECOMPOSE`] option in the C library.
    ///
    /// [`UTF8PROC_DECOMPOSE`]: utf8proc_option_t::UTF8PROC_DECOMPOSE
    Decompose,
}
283
/// Indicates how NLF-sequences (LF, CRLF, CR, NEL) should be converted.
///
/// Like the other option enums in this module, values can be compared
/// for equality and hashed.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
#[non_exhaustive]
#[must_use]
pub enum NlfConversionMode {
    /// Indicates that NLF-sequences are representing a
    /// line break, and should be converted to the codepoint for line
    /// separation (LS).
    ///
    /// This is equivalent to the [`UTF8PROC_NLF2LS`] option in the C library.
    ///
    /// [`UTF8PROC_NLF2LS`]: utf8proc_option_t::UTF8PROC_NLF2LS
    LineSeparation,
    /// Indicates that NLF-sequences are representing a paragraph break, and
    /// should be converted to the codepoint for paragraph separation (PS).
    ///
    /// This is equivalent to the [`UTF8PROC_NLF2PS`] option in the C library.
    ///
    /// [`UTF8PROC_NLF2PS`]: utf8proc_option_t::UTF8PROC_NLF2PS
    ParagraphSeparator,
    /// Indicates that the meaning of NLF-sequences is unknown.
    ///
    /// Note that this option is distinct from disabling NLF conversion.
    ///
    /// This is equivalent to the [`UTF8PROC_NLF2LF`] option in the C library.
    ///
    /// [`UTF8PROC_NLF2LF`]: utf8proc_option_t::UTF8PROC_NLF2LF
    Unknown,
}