1use crate::error::{SeqError, SeqResult};
21
22#[derive(Debug, Clone, PartialEq, Eq)]
26pub enum Tag {
27 Outside,
29 Begin(String),
31 Inside(String),
33 End(String),
35 Single(String),
37}
38
39impl Tag {
40 pub fn parse(s: &str) -> SeqResult<Tag> {
50 if s == "O" {
51 return Ok(Tag::Outside);
52 }
53 let mut parts = s.splitn(2, '-');
54 let prefix = parts.next().unwrap_or("");
55 let body = parts.next();
56 let ty = match body {
57 Some(b) if !b.is_empty() => b.to_string(),
58 _ => {
59 return Err(SeqError::InvalidObservation(format!(
60 "tag '{s}' is missing an entity type"
61 )));
62 }
63 };
64 match prefix {
65 "B" => Ok(Tag::Begin(ty)),
66 "I" => Ok(Tag::Inside(ty)),
67 "E" => Ok(Tag::End(ty)),
68 "S" => Ok(Tag::Single(ty)),
69 other => Err(SeqError::InvalidObservation(format!(
70 "unknown tag prefix '{other}' in '{s}'"
71 ))),
72 }
73 }
74
75 #[must_use]
77 pub fn to_tag_string(&self) -> String {
78 match self {
79 Tag::Outside => "O".to_string(),
80 Tag::Begin(t) => format!("B-{t}"),
81 Tag::Inside(t) => format!("I-{t}"),
82 Tag::End(t) => format!("E-{t}"),
83 Tag::Single(t) => format!("S-{t}"),
84 }
85 }
86
87 #[must_use]
89 pub fn entity_type(&self) -> Option<&str> {
90 match self {
91 Tag::Outside => None,
92 Tag::Begin(t) | Tag::Inside(t) | Tag::End(t) | Tag::Single(t) => Some(t.as_str()),
93 }
94 }
95}
96
97pub fn parse_tags(tags: &[&str]) -> SeqResult<Vec<Tag>> {
103 tags.iter().map(|t| Tag::parse(t)).collect()
104}
105
106#[derive(Debug, Clone, PartialEq, Eq, Hash)]
110pub struct Span {
111 pub entity_type: String,
113 pub start: usize,
115 pub end: usize,
117}
118
119#[must_use]
129pub fn extract_spans(tags: &[Tag]) -> Vec<Span> {
130 let mut spans = Vec::new();
131 let mut open: Option<(String, usize)> = None; for (i, tag) in tags.iter().enumerate() {
134 let close_before = match (&open, tag) {
136 (Some(_), Tag::Begin(_)) | (Some(_), Tag::Single(_)) | (Some(_), Tag::Outside) => true,
137 (Some((ot, _)), Tag::Inside(t)) | (Some((ot, _)), Tag::End(t)) => ot != t,
138 _ => false,
139 };
140 if close_before {
141 if let Some((ot, os)) = open.take() {
142 spans.push(Span {
143 entity_type: ot,
144 start: os,
145 end: i - 1,
146 });
147 }
148 }
149
150 match tag {
151 Tag::Outside => {}
152 Tag::Begin(t) | Tag::Single(t) => {
153 open = Some((t.clone(), i));
155 }
156 Tag::Inside(t) | Tag::End(t) => {
157 if open.is_none() {
158 open = Some((t.clone(), i));
160 }
161 }
162 }
163
164 if matches!(tag, Tag::Single(_) | Tag::End(_)) {
166 if let Some((ot, os)) = open.take() {
167 spans.push(Span {
168 entity_type: ot,
169 start: os,
170 end: i,
171 });
172 }
173 }
174 }
175 if let Some((ot, os)) = open.take() {
177 spans.push(Span {
178 entity_type: ot,
179 start: os,
180 end: tags.len() - 1,
181 });
182 }
183 spans
184}
185
186pub fn bio_to_bioes(tags: &[Tag]) -> SeqResult<Vec<Tag>> {
199 if tags.is_empty() {
200 return Err(SeqError::EmptyInput);
201 }
202 let spans = extract_spans(tags);
203 let mut out = vec![Tag::Outside; tags.len()];
204 for sp in spans {
205 if sp.start == sp.end {
206 out[sp.start] = Tag::Single(sp.entity_type);
207 } else {
208 out[sp.start] = Tag::Begin(sp.entity_type.clone());
209 for idx in sp.start + 1..sp.end {
210 out[idx] = Tag::Inside(sp.entity_type.clone());
211 }
212 out[sp.end] = Tag::End(sp.entity_type);
213 }
214 }
215 Ok(out)
216}
217
218pub fn bioes_to_bio(tags: &[Tag]) -> SeqResult<Vec<Tag>> {
227 if tags.is_empty() {
228 return Err(SeqError::EmptyInput);
229 }
230 let spans = extract_spans(tags);
231 let mut out = vec![Tag::Outside; tags.len()];
232 for sp in spans {
233 out[sp.start] = Tag::Begin(sp.entity_type.clone());
234 for idx in sp.start + 1..=sp.end {
235 out[idx] = Tag::Inside(sp.entity_type.clone());
236 }
237 }
238 Ok(out)
239}
240
241pub fn validate_bio(tags: &[Tag]) -> SeqResult<()> {
251 let mut prev: Option<&Tag> = None;
252 for (i, tag) in tags.iter().enumerate() {
253 match tag {
254 Tag::End(_) | Tag::Single(_) => {
255 return Err(SeqError::GraphInvariantViolated(format!(
256 "BIO sequence contains BIOES tag '{}' at {i}",
257 tag.to_tag_string()
258 )));
259 }
260 Tag::Inside(t) => {
261 let ok = matches!(prev, Some(Tag::Begin(p)) | Some(Tag::Inside(p)) if p == t);
262 if !ok {
263 return Err(SeqError::GraphInvariantViolated(format!(
264 "I-{t} at {i} not preceded by B-{t}/I-{t}"
265 )));
266 }
267 }
268 _ => {}
269 }
270 prev = Some(tag);
271 }
272 Ok(())
273}
274
275pub fn validate_bioes(tags: &[Tag]) -> SeqResult<()> {
288 let n = tags.len();
289 for i in 0..n {
290 match &tags[i] {
291 Tag::Inside(t) | Tag::End(t) => {
292 let prev_ok =
293 i > 0 && matches!(&tags[i - 1], Tag::Begin(p) | Tag::Inside(p) if p == t);
294 if !prev_ok {
295 return Err(SeqError::GraphInvariantViolated(format!(
296 "{} at {i} does not continue an open span of type {t}",
297 tags[i].to_tag_string()
298 )));
299 }
300 }
301 Tag::Begin(t) => {
302 let next_ok =
304 i + 1 < n && matches!(&tags[i + 1], Tag::Inside(p) | Tag::End(p) if p == t);
305 if !next_ok {
306 return Err(SeqError::GraphInvariantViolated(format!(
307 "B-{t} at {i} is not continued by I-{t}/E-{t}"
308 )));
309 }
310 }
311 _ => {}
312 }
313 }
314 Ok(())
315}
316
317#[cfg(test)]
320mod tests {
321 use super::*;
322
323 fn tags(strs: &[&str]) -> Vec<Tag> {
324 parse_tags(strs).expect("parse")
325 }
326
327 #[test]
328 fn parse_outside_and_typed() {
329 assert_eq!(Tag::parse("O").expect("o"), Tag::Outside);
330 assert_eq!(Tag::parse("B-PER").expect("b"), Tag::Begin("PER".into()));
331 assert_eq!(Tag::parse("I-LOC").expect("i"), Tag::Inside("LOC".into()));
332 assert_eq!(Tag::parse("E-ORG").expect("e"), Tag::End("ORG".into()));
333 assert_eq!(Tag::parse("S-MISC").expect("s"), Tag::Single("MISC".into()));
334 }
335
336 #[test]
337 fn parse_handles_hyphenated_types() {
338 assert_eq!(
340 Tag::parse("B-GPE-CITY").expect("ok"),
341 Tag::Begin("GPE-CITY".into())
342 );
343 }
344
345 #[test]
346 fn parse_rejects_bad_prefix_and_empty_type() {
347 assert!(Tag::parse("X-PER").is_err());
348 assert!(Tag::parse("B-").is_err());
349 assert!(Tag::parse("B").is_err());
350 }
351
352 #[test]
353 fn to_tag_string_roundtrip() {
354 for s in ["O", "B-PER", "I-PER", "E-PER", "S-LOC"] {
355 assert_eq!(Tag::parse(s).expect("p").to_tag_string(), s);
356 }
357 }
358
359 #[test]
360 fn extract_single_and_multi_spans() {
361 let t = tags(&["B-PER", "I-PER", "O", "S-LOC"]);
363 let spans = extract_spans(&t);
364 assert_eq!(spans.len(), 2);
365 assert_eq!(
366 spans[0],
367 Span {
368 entity_type: "PER".into(),
369 start: 0,
370 end: 1
371 }
372 );
373 assert_eq!(
374 spans[1],
375 Span {
376 entity_type: "LOC".into(),
377 start: 3,
378 end: 3
379 }
380 );
381 }
382
383 #[test]
384 fn extract_adjacent_same_type_spans() {
385 let t = tags(&["B-PER", "B-PER"]);
387 let spans = extract_spans(&t);
388 assert_eq!(spans.len(), 2);
389 assert_eq!(spans[0].start, 0);
390 assert_eq!(spans[0].end, 0);
391 assert_eq!(spans[1].start, 1);
392 assert_eq!(spans[1].end, 1);
393 }
394
395 #[test]
396 fn extract_dangling_inside_starts_new_span() {
397 let t = tags(&["I-PER", "I-PER"]);
399 let spans = extract_spans(&t);
400 assert_eq!(spans.len(), 1);
401 assert_eq!(spans[0].start, 0);
402 assert_eq!(spans[0].end, 1);
403 }
404
405 #[test]
406 fn extract_type_change_splits() {
407 let t = tags(&["B-PER", "I-LOC"]);
409 let spans = extract_spans(&t);
410 assert_eq!(spans.len(), 2);
411 assert_eq!(spans[0].entity_type, "PER");
412 assert_eq!(spans[0].end, 0);
413 assert_eq!(spans[1].entity_type, "LOC");
414 assert_eq!(spans[1].start, 1);
415 }
416
417 #[test]
418 fn bio_to_bioes_basic() {
419 let bio = tags(&["B-PER", "I-PER", "O", "B-LOC"]);
421 let bioes = bio_to_bioes(&bio).expect("conv");
422 let got: Vec<String> = bioes.iter().map(Tag::to_tag_string).collect();
423 assert_eq!(got, vec!["B-PER", "E-PER", "O", "S-LOC"]);
424 }
425
426 #[test]
427 fn bioes_to_bio_basic() {
428 let bioes = tags(&["B-PER", "E-PER", "O", "S-LOC"]);
430 let bio = bioes_to_bio(&bioes).expect("conv");
431 let got: Vec<String> = bio.iter().map(Tag::to_tag_string).collect();
432 assert_eq!(got, vec!["B-PER", "I-PER", "O", "B-LOC"]);
433 }
434
435 #[test]
436 fn conversion_roundtrip_preserves_spans() {
437 let bio = tags(&["O", "B-PER", "I-PER", "I-PER", "O", "B-LOC", "O"]);
438 let bioes = bio_to_bioes(&bio).expect("a");
439 let back = bioes_to_bio(&bioes).expect("b");
440 assert_eq!(extract_spans(&bio), extract_spans(&back));
441 }
442
443 #[test]
444 fn convert_rejects_empty() {
445 assert!(matches!(bio_to_bioes(&[]), Err(SeqError::EmptyInput)));
446 assert!(matches!(bioes_to_bio(&[]), Err(SeqError::EmptyInput)));
447 }
448
449 #[test]
450 fn validate_bio_accepts_well_formed() {
451 let t = tags(&["O", "B-PER", "I-PER", "O", "B-LOC"]);
452 assert!(validate_bio(&t).is_ok());
453 }
454
455 #[test]
456 fn validate_bio_rejects_dangling_inside() {
457 let t = tags(&["O", "I-PER"]);
458 assert!(matches!(
459 validate_bio(&t),
460 Err(SeqError::GraphInvariantViolated(_))
461 ));
462 }
463
464 #[test]
465 fn validate_bio_rejects_type_mismatch_continuation() {
466 let t = tags(&["B-PER", "I-LOC"]);
467 assert!(validate_bio(&t).is_err());
468 }
469
470 #[test]
471 fn validate_bio_rejects_bioes_tags() {
472 let t = tags(&["S-PER"]);
473 assert!(validate_bio(&t).is_err());
474 }
475
476 #[test]
477 fn validate_bioes_accepts_well_formed() {
478 let t = tags(&["B-PER", "I-PER", "E-PER", "O", "S-LOC"]);
479 assert!(validate_bioes(&t).is_ok());
480 }
481
482 #[test]
483 fn validate_bioes_rejects_begin_without_end() {
484 let t = tags(&["B-PER", "O"]);
486 assert!(matches!(
487 validate_bioes(&t),
488 Err(SeqError::GraphInvariantViolated(_))
489 ));
490 }
491
492 #[test]
493 fn validate_bioes_rejects_dangling_end() {
494 let t = tags(&["E-PER"]);
495 assert!(validate_bioes(&t).is_err());
496 }
497}