utf8proc/transform/options.rs
1//! Defines [`TransformOptions`] and related types.
2
3#[allow(unused_imports, reason = "used by docs")]
4use super::advanced;
5#[allow(unused_imports, reason = "used by docs")]
6use super::{UnicodeNormalizationForm, decompose_buffer, decompose_char, map};
7#[allow(unused_imports, reason = "used by docs")]
8use crate::ErrorKind;
9#[allow(unused_imports, reason = "used by docs")]
10use utf8proc_sys::utf8proc_option_t;
11
12/// Options for the [`map`], [`decompose_buffer`], and [`decompose_char`] functions.
13///
14/// Used to flexibly support multiple transformations
15/// through a single interface.
16///
17/// Some options are specific to composition/decomposition,
18/// and are stored in [`CompositionOptions`].
19///
20/// ## Limitation
21/// Certain options are only supported in the [advanced] interface,
22/// because they have the potential to produce invalid UTF8.
23///
24/// This currently includes the [`grapheme_boundary_markers`](Self::grapheme_boundary_markers) option,
25/// and [`unassigned_codepoint_handling`](Self::unassigned_codepoint_handling) set to [`UnassignedCodepointHandling::Allow`].
26#[derive(Clone, Debug, Default)]
27#[non_exhaustive]
28#[must_use]
29pub struct TransformOptions {
30 /// Specify how to handle unassigned codepoints.
31 ///
32 /// By default, this is set to [`UnassignedCodepointHandling::Forbid`].
33 pub unassigned_codepoint_handling: UnassignedCodepointHandling,
34 /// Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE..
35 ///
36 /// This is equivalent to the [`UTF8PROC_IGNORE`] option in the C library.
37 ///
38 /// [`UTF8PROC_IGNORE`]: utf8proc_option_t::UTF8PROC_IGNORE
39 pub ignore: bool,
40 /// Apply Unicode case-folding,
41 /// to be able to do a case-insensitive
42 /// string comparison.
43 ///
44 /// This is equivalent to the [`UTF8PROC_CASEFOLD`] option in the C library.
45 ///
46 /// [`UTF8PROC_CASEFOLD`]: utf8proc_option_t::UTF8PROC_CASEFOLD
47 pub case_fold: bool,
48 /// Inserts marker values at the beginning of each sequence which is representing
49 /// a single grapheme cluster (see UAX#29)..
50 ///
51 /// This is only usable in the [`advanced`] interface,
52 /// because it produces invalid UTF8 or codepoints.
53 /// Using this option in the simple interface *will panic*.
54 ///
55 /// The same functionality is also available through the [`crate::grapheme`] module.
56 ///
57 /// This is equivalent to the [`UTF8PROC_CHARBOUND`] option in the C library.
58 ///
59 /// [`UTF8PROC_CHARBOUND`]: utf8proc_option_t::UTF8PROC_CHARBOUND
60 pub grapheme_boundary_markers: bool,
61 /// Replace certain characters with their compatibility decomposition.
62 ///
63 /// This is used to implement [NFKD] and [NFKC] Unicode normalization.
64 ///
65 /// This is equivalent to the [`UTF8PROC_COMPAT`] option in the C library.
66 ///
67 /// [`UTF8PROC_COMPAT`]: utf8proc_option_t::UTF8PROC_COMPAT
68 /// [NFKD]: UnicodeNormalizationForm::NFKD
69 /// [NFKC]: UnicodeNormalizationForm::NFKC
70 pub compat: bool,
71 /// If not `None`, enables composition/decomposition of control characters.
72 ///
73 /// Use [`CompositionOptions::compose`] and [`CompositionOptions::decompose`]
74 /// for default compose/decompose options.
75 ///
76 /// Equivalent to either [`UTF8PROC_COMPOSE`] or [`UTF8PROC_DECOMPOSE`] in the C library,
77 /// depending on the [`CompositionDirection`].
78 ///
79 ///
80 /// [`UTF8PROC_COMPOSE`]: utf8proc_option_t::UTF8PROC_COMPOSE
81 /// [`UTF8PROC_DECOMPOSE`]: utf8proc_option_t::UTF8PROC_DECOMPOSE
82 pub composition: Option<CompositionOptions>,
83 /// Lump certain characters together.
84 ///
85 /// For example, HYPHEN U+2010 and MINUS U+2212 are converted to ASCII "-".
86 /// Documented in [`lump.md`] in the utf8proc repository (link valid as of version v2.10.0).
87 ///
88 /// If the [`nlf_conversion`](Self::nlf_conversion) option is set,
89 /// this includes a transformation of paragraph and
90 /// line separators to ASCII line-feed (LF).
91 ///
92 /// [`lump.md`]: https://github.com/JuliaStrings/utf8proc/blob/v2.10.0/lump.md
93 pub lump: bool,
94 /// Customize the conversion of NLF-sequences (LF, CRLF, CR, NEL).
95 ///
96 /// If this is `None`, no conversions are applied.
97 /// Can be used to customize the [`strip_control_codes`](Self::strip_control_codes) option.
98 pub nlf_conversion: Option<NlfConversionMode>,
99 /// Strips and/or converts control characters.
100 ///
101 /// NLF-sequences are transformed into spaces, except if of the
102 /// [`nlf_conversion`](Self::nlf_conversion) option is specified.
103 /// `HorizontalTab` (HT) and `FormFeed` (FF)
104 /// are treated as a NLF-sequence in this case.
105 /// All other control characters are simply removed.
106 pub strip_control_codes: bool,
107 /// Prohibit combining characters that would violate [Unicode versioning stability].
108 ///
109 /// [Unicode versioning stability]: https://www.unicode.org/policies/stability_policy.html
110 pub stable: bool,
111}
112impl TransformOptions {
113 /// Panic if options are used that could produce non-UTF8 data.
114 ///
115 /// These are only allowed in the [advanced] interface.
116 #[track_caller]
117 #[inline] // potential to be constant-folded
118 pub(crate) fn validate_utf8(&self) -> &Self {
119 assert!(
120 !self.grapheme_boundary_markers,
121 "Enabling `grapheme_boundary_markers` is forbidden in the simple interface",
122 );
123 match self.unassigned_codepoint_handling {
124 UnassignedCodepointHandling::Forbid | UnassignedCodepointHandling::Strip => { /* acceptable */ }
125 UnassignedCodepointHandling::Allow => {
126 panic!("Setting `unassigned_codepoint_handling=Allow` is forbidden in the simple interface")
127 }
128 }
129 self
130 }
131 /// Convert this into a FFI option.
132 ///
133 /// The returned option should be semantically valid,
134 /// and will not trigger a [`ErrorKind::InvalidOptions`] error.
135 ///
136 /// ## Safety
137 /// Certain options have the potential to produce non-UTF8 data,
138 /// which will trigger undefined behavior if passed to [`std::str::from_utf8_unchecked`].
139 ///
140 /// Call [`Self::validate_utf8`] to make sure these options are not present,
141 /// and if you allow them don't later convert to UTF8.
142 ///
143 /// This function itself can not trigger undefined behavior,
144 /// but may invalidate future assumptions (see above)
145 #[track_caller]
146 #[inline]
147 #[deny(unused_variables)]
148 pub(crate) unsafe fn to_ffi(&self) -> utf8proc_option_t {
149 let TransformOptions {
150 unassigned_codepoint_handling,
151 ignore,
152 case_fold,
153 grapheme_boundary_markers,
154 compat,
155 ref composition,
156 lump,
157 nlf_conversion,
158 strip_control_codes,
159 stable,
160 } = *self;
161 let mut res = utf8proc_option_t::NONE;
162 res |= match unassigned_codepoint_handling {
163 UnassignedCodepointHandling::Forbid => utf8proc_option_t::UTF8PROC_REJECTNA,
164 UnassignedCodepointHandling::Strip => utf8proc_option_t::UTF8PROC_STRIPNA,
165 UnassignedCodepointHandling::Allow => utf8proc_option_t::NONE,
166 };
167 if ignore {
168 res |= utf8proc_option_t::UTF8PROC_IGNORE;
169 }
170 if case_fold {
171 res |= utf8proc_option_t::UTF8PROC_CASEFOLD;
172 }
173 if grapheme_boundary_markers {
174 res |= utf8proc_option_t::UTF8PROC_CHARBOUND;
175 }
176 if compat {
177 res |= utf8proc_option_t::UTF8PROC_COMPAT;
178 }
179 if let Some(composition) = composition {
180 res |= match composition.direction {
181 CompositionDirection::Compose => utf8proc_option_t::UTF8PROC_COMPOSE,
182 CompositionDirection::Decompose => utf8proc_option_t::UTF8PROC_DECOMPOSE,
183 }
184 }
185 if lump {
186 res |= utf8proc_option_t::UTF8PROC_LUMP;
187 }
188 res |= match nlf_conversion {
189 None => utf8proc_option_t::NONE,
190 Some(NlfConversionMode::LineSeparation) => utf8proc_option_t::UTF8PROC_NLF2LS,
191 Some(NlfConversionMode::ParagraphSeparator) => utf8proc_option_t::UTF8PROC_NLF2PS,
192 Some(NlfConversionMode::Unknown) => utf8proc_option_t::UTF8PROC_NLF2LF,
193 };
194 if strip_control_codes {
195 res |= utf8proc_option_t::UTF8PROC_STRIPCC;
196 }
197 if stable {
198 res |= utf8proc_option_t::UTF8PROC_STABLE;
199 }
200 res
201 }
202}
203
/// Describes what should happen when an unassigned codepoint is encountered.
#[must_use]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub enum UnassignedCodepointHandling {
    /// Fail with an [`ErrorKind::NotAssigned`] error when an unassigned codepoint is found.
    ///
    /// This is the default, and corresponds to the [`UTF8PROC_REJECTNA`] option in the C library.
    ///
    /// [`UTF8PROC_REJECTNA`]: utf8proc_option_t::UTF8PROC_REJECTNA
    #[default]
    Forbid,
    /// Drop unassigned codepoints from the output.
    ///
    /// This corresponds to the [`UTF8PROC_STRIPNA`] option in the C library.
    ///
    /// [`UTF8PROC_STRIPNA`]: utf8proc_option_t::UTF8PROC_STRIPNA
    Strip,
    /// Pass unassigned codepoints through, neither raising an error nor removing them.
    ///
    /// This option can only be set using the [advanced] interface,
    /// as unassigned codepoints have the potential to produce invalid UTF8.
    Allow,
}
227
228/// Controls Unicode composition and decomposition.
229///
230/// There is no type-wide default, because you must choose a direction.
231/// Use [`Self::compose`] or [`Self::decompose`] instead.
232#[derive(Clone, Debug)]
233#[non_exhaustive]
234#[must_use]
235pub struct CompositionOptions {
236 /// Whether composition or decomposition should be performed
237 pub direction: CompositionDirection,
238 /// Strips all character markings.
239 ///
240 /// This includes non-spacing, spacing and enclosing (i.e. accents).
241 ///
242 /// This is equivalent to the [`UTF8PROC_CASEFOLD`] option in the C library.
243 ///
244 /// [`UTF8PROC_CASEFOLD`]: utf8proc_option_t::UTF8PROC_CASEFOLD
245 pub strip_marks: bool,
246}
247impl CompositionOptions {
248 /// Enable composition, with no additional options.
249 #[inline]
250 pub const fn compose() -> CompositionOptions {
251 CompositionOptions {
252 direction: CompositionDirection::Compose,
253 strip_marks: false,
254 }
255 }
256
257 /// Enable decomposition, with no additional options.
258 #[inline]
259 pub const fn decompose() -> CompositionOptions {
260 CompositionOptions {
261 direction: CompositionDirection::Decompose,
262 ..Self::compose()
263 }
264 }
265}
/// Selects between performing composition and performing decomposition.
#[must_use]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CompositionDirection {
    /// Perform composition, recomposing characters by canonical equivalence.
    ///
    /// This is equivalent to the [`UTF8PROC_COMPOSE`] option in the C library.
    ///
    /// [`UTF8PROC_COMPOSE`]: utf8proc_option_t::UTF8PROC_COMPOSE
    Compose,
    /// Perform decomposition, decomposing characters by canonical equivalence.
    ///
    /// This is equivalent to the [`UTF8PROC_DECOMPOSE`] option in the C library.
    ///
    /// [`UTF8PROC_DECOMPOSE`]: utf8proc_option_t::UTF8PROC_DECOMPOSE
    Decompose,
}
283
/// Indicates how NLF-sequences (LF, CRLF, CR, NEL) should be converted.
///
/// Like the other option enums in this module, values can be compared
/// for equality and hashed.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
#[non_exhaustive]
#[must_use]
pub enum NlfConversionMode {
    /// Indicates that NLF-sequences are representing a
    /// line break, and should be converted to the codepoint for line
    /// separation (LS).
    ///
    /// This is equivalent to the [`UTF8PROC_NLF2LS`] option in the C library.
    ///
    /// [`UTF8PROC_NLF2LS`]: utf8proc_option_t::UTF8PROC_NLF2LS
    LineSeparation,
    /// Indicates that NLF-sequences are representing a paragraph break, and
    /// should be converted to the codepoint for paragraph separation (PS).
    ///
    /// This is equivalent to the [`UTF8PROC_NLF2PS`] option in the C library.
    ///
    /// [`UTF8PROC_NLF2PS`]: utf8proc_option_t::UTF8PROC_NLF2PS
    ParagraphSeparator,
    /// Indicates that the meaning of NLF-sequences is unknown.
    ///
    /// Note that this option is distinct from disabling NLF conversion.
    ///
    /// This is equivalent to the [`UTF8PROC_NLF2LF`] option in the C library.
    ///
    /// [`UTF8PROC_NLF2LF`]: utf8proc_option_t::UTF8PROC_NLF2LF
    Unknown,
}