languagetool_rust/api/check/
data_annotations.rs

1//! Structures for handling data annotations.
2
3use crate::error::{Error, Result};
4
5use std::{borrow::Cow, mem};
6
7use lifetime::IntoStatic;
8use serde::{Deserialize, Serialize};
9
/// A portion of text to be checked.
///
/// An annotation carries either plain `text` or `markup` (optionally with an
/// `interpret_as` replacement). All three fields are optional at the type
/// level; fields that are `None` are left out when serializing, and field
/// names are serialized in camelCase (so `interpret_as` becomes
/// `interpretAs`).
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize, Hash, IntoStatic)]
#[non_exhaustive]
#[serde(rename_all = "camelCase")]
pub struct DataAnnotation<'source> {
    /// Text that should be treated as normal text.
    ///
    /// This or `markup` is required.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub text: Option<Cow<'source, str>>,
    /// Text that should be treated as markup.
    ///
    /// This or `text` is required.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub markup: Option<Cow<'source, str>>,
    /// If set, the markup will be interpreted as this.
    ///
    /// Only meaningful together with `markup`; the constructors in this
    /// module never set it alongside `text`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub interpret_as: Option<Cow<'source, str>>,
}
29
30impl<'source> DataAnnotation<'source> {
31    /// Instantiate a new `DataAnnotation` with text only.
32    #[inline]
33    #[must_use]
34    pub fn new_text<T: Into<Cow<'source, str>>>(text: T) -> Self {
35        Self {
36            text: Some(text.into()),
37            markup: None,
38            interpret_as: None,
39        }
40    }
41
42    /// Instantiate a new `DataAnnotation` with markup only.
43    #[inline]
44    #[must_use]
45    pub fn new_markup<M: Into<Cow<'source, str>>>(markup: M) -> Self {
46        Self {
47            text: None,
48            markup: Some(markup.into()),
49            interpret_as: None,
50        }
51    }
52
53    /// Instantiate a new `DataAnnotation` with markup and its interpretation.
54    #[inline]
55    #[must_use]
56    pub fn new_interpreted_markup<M: Into<Cow<'source, str>>, I: Into<Cow<'source, str>>>(
57        markup: M,
58        interpret_as: I,
59    ) -> Self {
60        Self {
61            interpret_as: Some(interpret_as.into()),
62            markup: Some(markup.into()),
63            text: None,
64        }
65    }
66
67    /// Return the text or markup within the data annotation.
68    ///
69    /// # Errors
70    ///
71    /// If this data annotation does not contain text or markup.
72    pub fn try_get_text(&self) -> Result<Cow<'source, str>> {
73        if let Some(ref text) = self.text {
74            Ok(text.clone())
75        } else if let Some(ref markup) = self.markup {
76            Ok(markup.clone())
77        } else {
78            Err(Error::InvalidDataAnnotation(format!(
79                "missing either text or markup field in {self:?}"
80            )))
81        }
82    }
83}
84
#[cfg(test)]
mod data_annotation_tests {

    use super::DataAnnotation;

    /// A text-only annotation stores the text and nothing else.
    #[test]
    fn test_text() {
        let annotation = DataAnnotation::new_text("Hello");

        assert_eq!(annotation.text.as_deref(), Some("Hello"));
        assert_eq!(annotation.markup, None);
        assert_eq!(annotation.interpret_as, None);
    }

    /// A markup-only annotation stores the markup and nothing else.
    #[test]
    fn test_markup() {
        let annotation = DataAnnotation::new_markup("<a>Hello</a>");

        assert_eq!(annotation.text, None);
        assert_eq!(annotation.markup.as_deref(), Some("<a>Hello</a>"));
        assert_eq!(annotation.interpret_as, None);
    }

    /// Interpreted markup stores both the markup and its interpretation,
    /// but no plain text.
    #[test]
    fn test_interpreted_markup() {
        let annotation = DataAnnotation::new_interpreted_markup("<a>Hello</a>", "Hello");

        assert_eq!(annotation.text, None);
        assert_eq!(annotation.markup.as_deref(), Some("<a>Hello</a>"));
        assert_eq!(annotation.interpret_as.as_deref(), Some("Hello"));
    }
}
117
/// Alternative text to be checked.
///
/// Holds an ordered list of [`DataAnnotation`]s. `Deserialize` is derived,
/// while `Serialize` is implemented by hand elsewhere in this module.
#[derive(Clone, Debug, Default, Deserialize, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub struct Data<'source> {
    /// Vector of markup text, see [`DataAnnotation`].
    pub annotation: Vec<DataAnnotation<'source>>,
}
125
126impl Data<'_> {
127    /// Split data into as few fragments as possible, where each fragment
128    /// contains (if possible) a maximum of `n` characters in it's
129    /// annotations' markup and text fields.
130    ///
131    /// Pattern str `pat` is used for splitting.
132    #[must_use]
133    pub fn split(self, n: usize, pat: &str) -> Vec<Self> {
134        // Build vec of breakpoints and the length of the text + markup at that
135        // potential breakpoint
136        let mut break_point_lengths = vec![];
137        let mut len = 0;
138        for (i, ann) in self.annotation.iter().enumerate() {
139            len +=
140                ann.text.as_deref().unwrap_or("").len() + ann.markup.as_deref().unwrap_or("").len();
141            if ann.text.as_ref().is_some_and(|t| t.contains(pat)) {
142                break_point_lengths.push((i, len));
143            }
144        }
145
146        // Decide which breakpoints to split the annotations at
147        let mut break_points: Vec<usize> = vec![];
148        if break_point_lengths.len() > 1 {
149            let (mut i, mut ii) = (0, 1);
150            let (mut base, mut curr) = (0, 0);
151            while ii < break_point_lengths.len() {
152                curr += break_point_lengths[i].1 - base;
153
154                if break_point_lengths[ii].1 - base + curr > n {
155                    break_points.push(break_point_lengths[i].0);
156                    base = break_point_lengths[i].1;
157                    curr = 0;
158                }
159
160                i += 1;
161                ii += 1;
162            }
163        }
164
165        // Split annotations based on calculated break points
166        let mut split = Vec::with_capacity(break_points.len());
167        let mut iter = self.into_iter();
168        let mut taken = 0;
169        let mut annotations = vec![];
170        for break_point in break_points {
171            while taken != break_point + 1 {
172                annotations.push(iter.next().unwrap());
173                taken += 1;
174            }
175            split.push(Data::from_iter(mem::take(&mut annotations)));
176        }
177
178        split
179    }
180}
181
182impl IntoStatic for Data<'_> {
183    type Static = Data<'static>;
184    fn into_static(self) -> Self::Static {
185        Data {
186            annotation: self
187                .annotation
188                .into_iter()
189                .map(IntoStatic::into_static)
190                .collect(),
191        }
192    }
193}
194
195impl<'source, T: Into<DataAnnotation<'source>>> FromIterator<T> for Data<'source> {
196    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
197        let annotation = iter.into_iter().map(std::convert::Into::into).collect();
198        Data { annotation }
199    }
200}
201
202impl<'source> IntoIterator for Data<'source> {
203    type Item = DataAnnotation<'source>;
204    type IntoIter = std::vec::IntoIter<Self::Item>;
205
206    fn into_iter(self) -> Self::IntoIter {
207        self.annotation.into_iter()
208    }
209}
210
211impl Serialize for Data<'_> {
212    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
213    where
214        S: serde::Serializer,
215    {
216        let mut map = std::collections::HashMap::new();
217        map.insert("annotation", &self.annotation);
218
219        serializer.serialize_str(&serde_json::to_string(&map).unwrap())
220    }
221}
222
#[cfg(feature = "cli")]
impl std::str::FromStr for Data<'_> {
    type Err = Error;

    /// Parse `Data` from its JSON representation.
    ///
    /// # Errors
    ///
    /// Returns the JSON deserialization failure, converted into [`Error`].
    fn from_str(s: &str) -> Result<Self> {
        serde_json::from_str::<Self>(s).map_err(Error::from)
    }
}
232
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use super::super::{Data, DataAnnotation};

    /// Minimal token type used to exercise collecting into `Data`.
    #[derive(Debug)]
    enum Token<'source> {
        /// A word made only of ASCII letters; becomes a text annotation.
        Text(&'source str),
        /// Anything else; becomes a markup annotation.
        Skip(&'source str),
    }

    impl<'source> From<&'source str> for Token<'source> {
        fn from(s: &'source str) -> Self {
            let has_non_letter = s.chars().any(|c| !c.is_ascii_alphabetic());
            if has_non_letter {
                Token::Skip(s)
            } else {
                Token::Text(s)
            }
        }
    }

    impl<'source> From<Token<'source>> for DataAnnotation<'source> {
        fn from(token: Token<'source>) -> Self {
            match token {
                Token::Skip(markup) => DataAnnotation::new_markup(markup),
                Token::Text(text) => DataAnnotation::new_text(text),
            }
        }
    }

    /// Collecting tokens yields one annotation per token, in order.
    #[test]
    fn test_data_annotation() {
        let data: Data = "My name is Q34XY".split(' ').map(Token::from).collect();

        let expected_annotations = vec![
            DataAnnotation::new_text("My"),
            DataAnnotation::new_text("name"),
            DataAnnotation::new_text("is"),
            DataAnnotation::new_markup("Q34XY"),
        ];

        assert_eq!(
            data,
            Data {
                annotation: expected_annotations
            }
        );
    }

    /// `try_get_text` returns text or markup, and fails when both are unset.
    #[test]
    fn test_try_get_text() {
        const TEXT: &str = "Lorem Ipsum";

        let from_text = DataAnnotation::new_text(TEXT).try_get_text();
        assert_eq!(from_text.unwrap(), Cow::from(TEXT));

        let from_markup = DataAnnotation::new_markup(TEXT).try_get_text();
        assert_eq!(from_markup.unwrap(), Cow::from(TEXT));

        let empty = DataAnnotation {
            text: None,
            markup: None,
            interpret_as: None,
        };
        assert!(empty.try_get_text().is_err());
    }
}
300}