Skip to main content

oxicuda_seq/tagging/
bioes.rs

1//! BIO / BIOES tag-scheme conversion, validation, and span extraction.
2//!
3//! Sequence-labelling tasks (NER, chunking) encode multi-token spans with a
4//! *tagging scheme*.  Two common schemes are:
5//!
6//! * **BIO** (a.k.a. IOB2): `B-X` begins a span of type `X`, `I-X` continues it,
7//!   `O` is outside any span.
8//! * **BIOES** (a.k.a. BILOU): adds `E-X` (end of a multi-token span) and `S-X`
9//!   (single-token span), which gives the model an explicit "end" signal and
10//!   often improves boundary accuracy.
11//!
12//! A tag is represented as a [`Tag`] enum.  Tags are parsed from / formatted to
13//! the canonical `"<prefix>-<type>"` string (or `"O"`).  This module provides:
14//!
15//! * [`Tag::parse`] / [`Tag::to_tag_string`] — string ⇆ enum.
16//! * [`bio_to_bioes`] / [`bioes_to_bio`] — scheme conversion.
17//! * [`validate_bio`] / [`validate_bioes`] — well-formedness checks.
18//! * [`extract_spans`] — robust span extraction (seqeval semantics).
19
20use crate::error::{SeqError, SeqResult};
21
22// ─── Tag ─────────────────────────────────────────────────────────────────────
23
24/// A single tagging-scheme tag.
25#[derive(Debug, Clone, PartialEq, Eq)]
26pub enum Tag {
27    /// Outside any span.
28    Outside,
29    /// Begin a span of the given type.
30    Begin(String),
31    /// Inside (continue) a span of the given type.
32    Inside(String),
33    /// End of a multi-token span of the given type (BIOES only).
34    End(String),
35    /// Single-token span of the given type (BIOES only).
36    Single(String),
37}
38
39impl Tag {
40    /// Parse a tag string such as `"B-PER"`, `"I-LOC"`, `"S-ORG"`, or `"O"`.
41    ///
42    /// The case-sensitive single-letter prefix selects the variant; the
43    /// remainder after the `-` is the entity type.
44    ///
45    /// # Errors
46    ///
47    /// [`SeqError::InvalidObservation`] for an unrecognised prefix or a missing
48    /// `-type` body on a non-`O` tag.
49    pub fn parse(s: &str) -> SeqResult<Tag> {
50        if s == "O" {
51            return Ok(Tag::Outside);
52        }
53        let mut parts = s.splitn(2, '-');
54        let prefix = parts.next().unwrap_or("");
55        let body = parts.next();
56        let ty = match body {
57            Some(b) if !b.is_empty() => b.to_string(),
58            _ => {
59                return Err(SeqError::InvalidObservation(format!(
60                    "tag '{s}' is missing an entity type"
61                )));
62            }
63        };
64        match prefix {
65            "B" => Ok(Tag::Begin(ty)),
66            "I" => Ok(Tag::Inside(ty)),
67            "E" => Ok(Tag::End(ty)),
68            "S" => Ok(Tag::Single(ty)),
69            other => Err(SeqError::InvalidObservation(format!(
70                "unknown tag prefix '{other}' in '{s}'"
71            ))),
72        }
73    }
74
75    /// Format this tag back into its canonical string.
76    #[must_use]
77    pub fn to_tag_string(&self) -> String {
78        match self {
79            Tag::Outside => "O".to_string(),
80            Tag::Begin(t) => format!("B-{t}"),
81            Tag::Inside(t) => format!("I-{t}"),
82            Tag::End(t) => format!("E-{t}"),
83            Tag::Single(t) => format!("S-{t}"),
84        }
85    }
86
87    /// The entity type (`None` for [`Tag::Outside`]).
88    #[must_use]
89    pub fn entity_type(&self) -> Option<&str> {
90        match self {
91            Tag::Outside => None,
92            Tag::Begin(t) | Tag::Inside(t) | Tag::End(t) | Tag::Single(t) => Some(t.as_str()),
93        }
94    }
95}
96
97/// Parse a whole slice of tag strings into [`Tag`] values.
98///
99/// # Errors
100///
101/// Propagates the first [`Tag::parse`] failure.
102pub fn parse_tags(tags: &[&str]) -> SeqResult<Vec<Tag>> {
103    tags.iter().map(|t| Tag::parse(t)).collect()
104}
105
106// ─── Span ────────────────────────────────────────────────────────────────────
107
108/// An entity span: `(entity_type, start, end)` with `end` **inclusive**.
109#[derive(Debug, Clone, PartialEq, Eq, Hash)]
110pub struct Span {
111    /// Entity type label.
112    pub entity_type: String,
113    /// Inclusive start index.
114    pub start: usize,
115    /// Inclusive end index.
116    pub end: usize,
117}
118
119/// Extract entity spans from a tag sequence using seqeval-style semantics.
120///
121/// A new span opens whenever:
122/// * the tag is `B-X` or `S-X`, **or**
123/// * the tag is `I-X`/`E-X` but the previous position did not continue an open
124///   span of the same type `X` (robust handling of malformed input).
125///
126/// A span closes at an `E-X`/`S-X`, when the type changes, or at `O`.  This
127/// matches the behaviour of `seqeval.metrics.sequence_labeling.get_entities`.
128#[must_use]
129pub fn extract_spans(tags: &[Tag]) -> Vec<Span> {
130    let mut spans = Vec::new();
131    let mut open: Option<(String, usize)> = None; // (type, start)
132
133    for (i, tag) in tags.iter().enumerate() {
134        // Decide whether the currently-open span must close *before* this tag.
135        let close_before = match (&open, tag) {
136            (Some(_), Tag::Begin(_)) | (Some(_), Tag::Single(_)) | (Some(_), Tag::Outside) => true,
137            (Some((ot, _)), Tag::Inside(t)) | (Some((ot, _)), Tag::End(t)) => ot != t,
138            _ => false,
139        };
140        if close_before {
141            if let Some((ot, os)) = open.take() {
142                spans.push(Span {
143                    entity_type: ot,
144                    start: os,
145                    end: i - 1,
146                });
147            }
148        }
149
150        match tag {
151            Tag::Outside => {}
152            Tag::Begin(t) | Tag::Single(t) => {
153                // Open a fresh span starting here.
154                open = Some((t.clone(), i));
155            }
156            Tag::Inside(t) | Tag::End(t) => {
157                if open.is_none() {
158                    // Dangling I/E: treat as the beginning of a new span.
159                    open = Some((t.clone(), i));
160                }
161            }
162        }
163
164        // S/E close immediately *after* being counted.
165        if matches!(tag, Tag::Single(_) | Tag::End(_)) {
166            if let Some((ot, os)) = open.take() {
167                spans.push(Span {
168                    entity_type: ot,
169                    start: os,
170                    end: i,
171                });
172            }
173        }
174    }
175    // Flush a trailing open span.
176    if let Some((ot, os)) = open.take() {
177        spans.push(Span {
178            entity_type: ot,
179            start: os,
180            end: tags.len() - 1,
181        });
182    }
183    spans
184}
185
186// ─── Conversion ──────────────────────────────────────────────────────────────
187
188/// Convert a BIO tag sequence to BIOES.
189///
190/// Single-token spans become `S-X`; the final token of a multi-token span
191/// becomes `E-X`.  The input need not be perfectly well-formed: spans are first
192/// extracted with [`extract_spans`] (which repairs dangling `I-`), then
193/// re-emitted in BIOES.
194///
195/// # Errors
196///
197/// [`SeqError::EmptyInput`] if `tags` is empty.
198pub fn bio_to_bioes(tags: &[Tag]) -> SeqResult<Vec<Tag>> {
199    if tags.is_empty() {
200        return Err(SeqError::EmptyInput);
201    }
202    let spans = extract_spans(tags);
203    let mut out = vec![Tag::Outside; tags.len()];
204    for sp in spans {
205        if sp.start == sp.end {
206            out[sp.start] = Tag::Single(sp.entity_type);
207        } else {
208            out[sp.start] = Tag::Begin(sp.entity_type.clone());
209            for idx in sp.start + 1..sp.end {
210                out[idx] = Tag::Inside(sp.entity_type.clone());
211            }
212            out[sp.end] = Tag::End(sp.entity_type);
213        }
214    }
215    Ok(out)
216}
217
218/// Convert a BIOES tag sequence back to BIO.
219///
220/// `S-X` becomes `B-X`; `E-X` becomes `I-X`.  Spans are extracted first to
221/// normalise any malformed input.
222///
223/// # Errors
224///
225/// [`SeqError::EmptyInput`] if `tags` is empty.
226pub fn bioes_to_bio(tags: &[Tag]) -> SeqResult<Vec<Tag>> {
227    if tags.is_empty() {
228        return Err(SeqError::EmptyInput);
229    }
230    let spans = extract_spans(tags);
231    let mut out = vec![Tag::Outside; tags.len()];
232    for sp in spans {
233        out[sp.start] = Tag::Begin(sp.entity_type.clone());
234        for idx in sp.start + 1..=sp.end {
235            out[idx] = Tag::Inside(sp.entity_type.clone());
236        }
237    }
238    Ok(out)
239}
240
241// ─── Validation ──────────────────────────────────────────────────────────────
242
243/// Validate that a tag sequence is well-formed **BIO** (IOB2): every `I-X` must
244/// be preceded by a `B-X` or `I-X` of the *same* type, and no `E`/`S` tags are
245/// allowed.
246///
247/// # Errors
248///
249/// [`SeqError::GraphInvariantViolated`] describing the first violation.
250pub fn validate_bio(tags: &[Tag]) -> SeqResult<()> {
251    let mut prev: Option<&Tag> = None;
252    for (i, tag) in tags.iter().enumerate() {
253        match tag {
254            Tag::End(_) | Tag::Single(_) => {
255                return Err(SeqError::GraphInvariantViolated(format!(
256                    "BIO sequence contains BIOES tag '{}' at {i}",
257                    tag.to_tag_string()
258                )));
259            }
260            Tag::Inside(t) => {
261                let ok = matches!(prev, Some(Tag::Begin(p)) | Some(Tag::Inside(p)) if p == t);
262                if !ok {
263                    return Err(SeqError::GraphInvariantViolated(format!(
264                        "I-{t} at {i} not preceded by B-{t}/I-{t}"
265                    )));
266                }
267            }
268            _ => {}
269        }
270        prev = Some(tag);
271    }
272    Ok(())
273}
274
275/// Validate that a tag sequence is well-formed **BIOES**.
276///
277/// Rules enforced:
278/// * `I-X` and `E-X` must continue an *open* span of type `X` (i.e. the previous
279///   tag is `B-X` or `I-X`).
280/// * `B-X` must be *closed* by a subsequent `E-X` (via `I-X`*) — i.e. a span
281///   that starts with `B` may not be followed by `O`/`B`/`S` or a type change
282///   before an `E`.
283///
284/// # Errors
285///
286/// [`SeqError::GraphInvariantViolated`] describing the first violation.
287pub fn validate_bioes(tags: &[Tag]) -> SeqResult<()> {
288    let n = tags.len();
289    for i in 0..n {
290        match &tags[i] {
291            Tag::Inside(t) | Tag::End(t) => {
292                let prev_ok =
293                    i > 0 && matches!(&tags[i - 1], Tag::Begin(p) | Tag::Inside(p) if p == t);
294                if !prev_ok {
295                    return Err(SeqError::GraphInvariantViolated(format!(
296                        "{} at {i} does not continue an open span of type {t}",
297                        tags[i].to_tag_string()
298                    )));
299                }
300            }
301            Tag::Begin(t) => {
302                // Must be followed by I-t or E-t (a span cannot end with B).
303                let next_ok =
304                    i + 1 < n && matches!(&tags[i + 1], Tag::Inside(p) | Tag::End(p) if p == t);
305                if !next_ok {
306                    return Err(SeqError::GraphInvariantViolated(format!(
307                        "B-{t} at {i} is not continued by I-{t}/E-{t}"
308                    )));
309                }
310            }
311            _ => {}
312        }
313    }
314    Ok(())
315}
316
317// ─── Tests ───────────────────────────────────────────────────────────────────
318
319#[cfg(test)]
320mod tests {
321    use super::*;
322
323    fn tags(strs: &[&str]) -> Vec<Tag> {
324        parse_tags(strs).expect("parse")
325    }
326
327    #[test]
328    fn parse_outside_and_typed() {
329        assert_eq!(Tag::parse("O").expect("o"), Tag::Outside);
330        assert_eq!(Tag::parse("B-PER").expect("b"), Tag::Begin("PER".into()));
331        assert_eq!(Tag::parse("I-LOC").expect("i"), Tag::Inside("LOC".into()));
332        assert_eq!(Tag::parse("E-ORG").expect("e"), Tag::End("ORG".into()));
333        assert_eq!(Tag::parse("S-MISC").expect("s"), Tag::Single("MISC".into()));
334    }
335
336    #[test]
337    fn parse_handles_hyphenated_types() {
338        // Entity type may itself contain a hyphen.
339        assert_eq!(
340            Tag::parse("B-GPE-CITY").expect("ok"),
341            Tag::Begin("GPE-CITY".into())
342        );
343    }
344
345    #[test]
346    fn parse_rejects_bad_prefix_and_empty_type() {
347        assert!(Tag::parse("X-PER").is_err());
348        assert!(Tag::parse("B-").is_err());
349        assert!(Tag::parse("B").is_err());
350    }
351
352    #[test]
353    fn to_tag_string_roundtrip() {
354        for s in ["O", "B-PER", "I-PER", "E-PER", "S-LOC"] {
355            assert_eq!(Tag::parse(s).expect("p").to_tag_string(), s);
356        }
357    }
358
359    #[test]
360    fn extract_single_and_multi_spans() {
361        // "B-PER I-PER O S-LOC" → PER[0..1], LOC[3..3]
362        let t = tags(&["B-PER", "I-PER", "O", "S-LOC"]);
363        let spans = extract_spans(&t);
364        assert_eq!(spans.len(), 2);
365        assert_eq!(
366            spans[0],
367            Span {
368                entity_type: "PER".into(),
369                start: 0,
370                end: 1
371            }
372        );
373        assert_eq!(
374            spans[1],
375            Span {
376                entity_type: "LOC".into(),
377                start: 3,
378                end: 3
379            }
380        );
381    }
382
383    #[test]
384    fn extract_adjacent_same_type_spans() {
385        // "B-PER B-PER" → two separate PER spans.
386        let t = tags(&["B-PER", "B-PER"]);
387        let spans = extract_spans(&t);
388        assert_eq!(spans.len(), 2);
389        assert_eq!(spans[0].start, 0);
390        assert_eq!(spans[0].end, 0);
391        assert_eq!(spans[1].start, 1);
392        assert_eq!(spans[1].end, 1);
393    }
394
395    #[test]
396    fn extract_dangling_inside_starts_new_span() {
397        // Malformed: "I-PER I-PER" with no B → still one PER span [0..1].
398        let t = tags(&["I-PER", "I-PER"]);
399        let spans = extract_spans(&t);
400        assert_eq!(spans.len(), 1);
401        assert_eq!(spans[0].start, 0);
402        assert_eq!(spans[0].end, 1);
403    }
404
405    #[test]
406    fn extract_type_change_splits() {
407        // "B-PER I-LOC" → PER[0] then LOC[1].
408        let t = tags(&["B-PER", "I-LOC"]);
409        let spans = extract_spans(&t);
410        assert_eq!(spans.len(), 2);
411        assert_eq!(spans[0].entity_type, "PER");
412        assert_eq!(spans[0].end, 0);
413        assert_eq!(spans[1].entity_type, "LOC");
414        assert_eq!(spans[1].start, 1);
415    }
416
417    #[test]
418    fn bio_to_bioes_basic() {
419        // B-PER I-PER O B-LOC  →  B-PER E-PER O S-LOC
420        let bio = tags(&["B-PER", "I-PER", "O", "B-LOC"]);
421        let bioes = bio_to_bioes(&bio).expect("conv");
422        let got: Vec<String> = bioes.iter().map(Tag::to_tag_string).collect();
423        assert_eq!(got, vec!["B-PER", "E-PER", "O", "S-LOC"]);
424    }
425
426    #[test]
427    fn bioes_to_bio_basic() {
428        // B-PER E-PER O S-LOC → B-PER I-PER O B-LOC
429        let bioes = tags(&["B-PER", "E-PER", "O", "S-LOC"]);
430        let bio = bioes_to_bio(&bioes).expect("conv");
431        let got: Vec<String> = bio.iter().map(Tag::to_tag_string).collect();
432        assert_eq!(got, vec!["B-PER", "I-PER", "O", "B-LOC"]);
433    }
434
435    #[test]
436    fn conversion_roundtrip_preserves_spans() {
437        let bio = tags(&["O", "B-PER", "I-PER", "I-PER", "O", "B-LOC", "O"]);
438        let bioes = bio_to_bioes(&bio).expect("a");
439        let back = bioes_to_bio(&bioes).expect("b");
440        assert_eq!(extract_spans(&bio), extract_spans(&back));
441    }
442
443    #[test]
444    fn convert_rejects_empty() {
445        assert!(matches!(bio_to_bioes(&[]), Err(SeqError::EmptyInput)));
446        assert!(matches!(bioes_to_bio(&[]), Err(SeqError::EmptyInput)));
447    }
448
449    #[test]
450    fn validate_bio_accepts_well_formed() {
451        let t = tags(&["O", "B-PER", "I-PER", "O", "B-LOC"]);
452        assert!(validate_bio(&t).is_ok());
453    }
454
455    #[test]
456    fn validate_bio_rejects_dangling_inside() {
457        let t = tags(&["O", "I-PER"]);
458        assert!(matches!(
459            validate_bio(&t),
460            Err(SeqError::GraphInvariantViolated(_))
461        ));
462    }
463
464    #[test]
465    fn validate_bio_rejects_type_mismatch_continuation() {
466        let t = tags(&["B-PER", "I-LOC"]);
467        assert!(validate_bio(&t).is_err());
468    }
469
470    #[test]
471    fn validate_bio_rejects_bioes_tags() {
472        let t = tags(&["S-PER"]);
473        assert!(validate_bio(&t).is_err());
474    }
475
476    #[test]
477    fn validate_bioes_accepts_well_formed() {
478        let t = tags(&["B-PER", "I-PER", "E-PER", "O", "S-LOC"]);
479        assert!(validate_bioes(&t).is_ok());
480    }
481
482    #[test]
483    fn validate_bioes_rejects_begin_without_end() {
484        // B-PER followed by O is illegal in BIOES.
485        let t = tags(&["B-PER", "O"]);
486        assert!(matches!(
487            validate_bioes(&t),
488            Err(SeqError::GraphInvariantViolated(_))
489        ));
490    }
491
492    #[test]
493    fn validate_bioes_rejects_dangling_end() {
494        let t = tags(&["E-PER"]);
495        assert!(validate_bioes(&t).is_err());
496    }
497}