// anno_core/core/historical.rs

//! Historical provenance types for ancient/historical text processing.
//!
//! # Research Context
//!
//! When processing ancient languages and historical texts, entities require
//! additional metadata that modern NLP typically ignores:
//!
//! - **Temporal provenance**: When was the text written? BCE/CE dates.
//! - **Epigraphic medium**: Stone inscription vs. papyrus vs. clay tablet.
//! - **Script/writing system**: Cuneiform, hieroglyphic, Linear B, etc.
//! - **Archaeological context**: Where found, current location, preservation.
//!
//! This module provides types for capturing this metadata, inspired by:
//! - Sommerschield et al. (2023): "Machine Learning for Ancient Languages"
//! - Digital epigraphy standards (EpiDoc, CIDOC-CRM)
//! - Ancient language corpora (ORACC, Perseus, TLG)
//!
//! # Example
//!
//! ```rust
//! use anno_core::core::historical::{HistoricalProvenance, EpigraphicMedium, HistoricalDate, Era};
//!
//! let provenance = HistoricalProvenance::new()
//!     .with_date(HistoricalDate::range_bce(1500, 1150))
//!     .with_medium(EpigraphicMedium::ClayTablet)
//!     .with_script("Cypro-Minoan")
//!     .with_find_spot("Enkomi, Cyprus")
//!     .with_corpus("ENKO");
//!
//! // Check whether the text dates to the Bronze Age
//! assert!(provenance.is_bronze_age());
//! ```
33
34use serde::{Deserialize, Serialize};
35
36// =============================================================================
37// Historical Date
38// =============================================================================
39
40/// Era designation for historical dates.
41#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
42pub enum Era {
43    /// Before Common Era (= BC)
44    BCE,
45    /// Common Era (= AD)
46    #[default]
47    CE,
48}
49
50impl std::fmt::Display for Era {
51    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
52        match self {
53            Era::BCE => write!(f, "BCE"),
54            Era::CE => write!(f, "CE"),
55        }
56    }
57}
58
59/// A historical date, possibly imprecise.
60///
61/// Ancient dates are often imprecise (e.g., "circa 1500 BCE", "15th century BCE").
62/// This type captures both point-in-time and range dates.
63#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
64pub struct HistoricalDate {
65    /// Start year (negative for BCE in internal representation)
66    pub year_start: i32,
67    /// End year (if a range)
68    pub year_end: Option<i32>,
69    /// Era for display purposes
70    pub era: Era,
71    /// Whether the date is approximate ("circa")
72    pub circa: bool,
73    /// Textual note (e.g., "Late Bronze Age")
74    pub note: Option<String>,
75}
76
77impl HistoricalDate {
78    /// Create a point-in-time CE date.
79    pub fn ce(year: i32) -> Self {
80        Self {
81            year_start: year,
82            year_end: None,
83            era: Era::CE,
84            circa: false,
85            note: None,
86        }
87    }
88
89    /// Create a point-in-time BCE date.
90    pub fn bce(year: i32) -> Self {
91        Self {
92            year_start: -year.abs(),
93            year_end: None,
94            era: Era::BCE,
95            circa: false,
96            note: None,
97        }
98    }
99
100    /// Create a range of years (BCE).
101    ///
102    /// Note: `start` and `end` should be positive; internally stored as negative.
103    pub fn range_bce(start: i32, end: i32) -> Self {
104        Self {
105            year_start: -start.abs(),
106            year_end: Some(-end.abs()),
107            era: Era::BCE,
108            circa: false,
109            note: None,
110        }
111    }
112
113    /// Create a range of years (CE).
114    pub fn range_ce(start: i32, end: i32) -> Self {
115        Self {
116            year_start: start,
117            year_end: Some(end),
118            era: Era::CE,
119            circa: false,
120            note: None,
121        }
122    }
123
124    /// Mark as approximate ("circa").
125    pub fn circa(mut self) -> Self {
126        self.circa = true;
127        self
128    }
129
130    /// Add a textual note.
131    pub fn with_note(mut self, note: impl Into<String>) -> Self {
132        self.note = Some(note.into());
133        self
134    }
135
136    /// Check if this date is in the Bronze Age (roughly 3300-1200 BCE).
137    pub fn is_bronze_age(&self) -> bool {
138        // Bronze Age: ~3300 BCE to ~1200 BCE
139        self.year_start <= -1200 && self.year_start >= -3300
140    }
141
142    /// Check if this date is in the Iron Age (roughly 1200-500 BCE).
143    pub fn is_iron_age(&self) -> bool {
144        self.year_start <= -500 && self.year_start >= -1200
145    }
146
147    /// Check if this is an ancient date (before 500 CE).
148    pub fn is_ancient(&self) -> bool {
149        self.year_start < 500
150    }
151
152    /// Get the midpoint year (useful for sorting/comparison).
153    pub fn midpoint(&self) -> i32 {
154        match self.year_end {
155            Some(end) => (self.year_start + end) / 2,
156            None => self.year_start,
157        }
158    }
159}
160
161impl std::fmt::Display for HistoricalDate {
162    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
163        let prefix = if self.circa { "c. " } else { "" };
164
165        let display_year = |y: i32| -> (i32, Era) {
166            if y < 0 {
167                (-y, Era::BCE)
168            } else {
169                (y, Era::CE)
170            }
171        };
172
173        let (start_abs, start_era) = display_year(self.year_start);
174
175        if let Some(end) = self.year_end {
176            let (end_abs, _) = display_year(end);
177            write!(f, "{}{}-{} {}", prefix, start_abs, end_abs, start_era)?;
178        } else {
179            write!(f, "{}{} {}", prefix, start_abs, start_era)?;
180        }
181
182        if let Some(ref note) = self.note {
183            write!(f, " ({})", note)?;
184        }
185
186        Ok(())
187    }
188}
189
190// =============================================================================
191// Epigraphic Medium
192// =============================================================================
193
194/// The physical medium on which text is written.
195///
196/// Different media require different OCR/HTR approaches and have
197/// characteristic preservation patterns.
198#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
199#[non_exhaustive]
200pub enum EpigraphicMedium {
201    /// Clay tablet (Mesopotamian cuneiform, Linear B, etc.)
202    ClayTablet,
203    /// Stone inscription (Greek/Roman epigraphy, Egyptian hieroglyphs)
204    Stone,
205    /// Papyrus (Egyptian, Greek papyri)
206    Papyrus,
207    /// Parchment/vellum (medieval manuscripts)
208    Parchment,
209    /// Metal (bronze, lead tablets)
210    Metal,
211    /// Pottery/ceramic (ostraca, vessel inscriptions)
212    Pottery,
213    /// Wax tablet (Roman/medieval note-taking)
214    WaxTablet,
215    /// Wood (wooden tablets, bamboo slips)
216    Wood,
217    /// Seal impression (cylinder seals, stamp seals)
218    Seal,
219    /// Coin (numismatic inscriptions)
220    Coin,
221    /// Other medium with description
222    Other(String),
223}
224
225impl EpigraphicMedium {
226    /// Get typical preservation characteristics.
227    pub fn preservation_notes(&self) -> &'static str {
228        match self {
229            EpigraphicMedium::ClayTablet => "Surcernos fire; damaged by water",
230            EpigraphicMedium::Stone => "Durable; may have erosion/damage",
231            EpigraphicMedium::Papyrus => "Fragile; surcernos in dry climates only",
232            EpigraphicMedium::Parchment => "Durable; may have damage/palimpsest",
233            EpigraphicMedium::Metal => "Durable; may have corrosion",
234            EpigraphicMedium::Pottery => "Durable; often fragmentary",
235            EpigraphicMedium::WaxTablet => "Extremely rare survival",
236            EpigraphicMedium::Wood => "Rare survival except in dry/waterlogged contexts",
237            EpigraphicMedium::Seal => "Durable; small scale",
238            EpigraphicMedium::Coin => "Durable; standardized format",
239            EpigraphicMedium::Other(_) => "Variable preservation",
240        }
241    }
242
243    /// Whether this medium typically requires specialized OCR/HTR.
244    pub fn requires_specialized_ocr(&self) -> bool {
245        matches!(
246            self,
247            EpigraphicMedium::ClayTablet | EpigraphicMedium::Papyrus | EpigraphicMedium::Seal
248        )
249    }
250}
251
252impl std::fmt::Display for EpigraphicMedium {
253    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
254        match self {
255            EpigraphicMedium::ClayTablet => write!(f, "Clay tablet"),
256            EpigraphicMedium::Stone => write!(f, "Stone"),
257            EpigraphicMedium::Papyrus => write!(f, "Papyrus"),
258            EpigraphicMedium::Parchment => write!(f, "Parchment"),
259            EpigraphicMedium::Metal => write!(f, "Metal"),
260            EpigraphicMedium::Pottery => write!(f, "Pottery/Ostracon"),
261            EpigraphicMedium::WaxTablet => write!(f, "Wax tablet"),
262            EpigraphicMedium::Wood => write!(f, "Wood"),
263            EpigraphicMedium::Seal => write!(f, "Seal"),
264            EpigraphicMedium::Coin => write!(f, "Coin"),
265            EpigraphicMedium::Other(s) => write!(f, "{}", s),
266        }
267    }
268}
269
270// =============================================================================
271// Writing System
272// =============================================================================
273
274/// Ancient writing system classification.
275///
276/// Important for understanding character-level processing requirements.
277#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
278#[non_exhaustive]
279pub enum WritingSystem {
280    /// Alphabetic (Greek, Latin, Phoenician)
281    Alphabetic,
282    /// Syllabic (Linear A/B, Cypro-Minoan, Cherokee)
283    Syllabic,
284    /// Logographic (Chinese, Egyptian hieroglyphs)
285    Logographic,
286    /// Logosyllabic (Cuneiform, Maya)
287    Logosyllabic,
288    /// Abjad - consonantal alphabet (Hebrew, Arabic, Phoenician)
289    Abjad,
290    /// Abugida - consonant-vowel combinations (Brahmic scripts)
291    Abugida,
292    /// Undeciphered (script system unknown)
293    Undeciphered,
294    /// Other with description
295    Other(String),
296}
297
298impl WritingSystem {
299    /// Whether this system is fully deciphered.
300    pub fn is_deciphered(&self) -> bool {
301        !matches!(self, WritingSystem::Undeciphered)
302    }
303
304    /// Whether word boundaries are typically explicit.
305    pub fn has_word_boundaries(&self) -> bool {
306        matches!(self, WritingSystem::Alphabetic | WritingSystem::Abjad)
307    }
308}
309
310impl std::fmt::Display for WritingSystem {
311    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
312        match self {
313            WritingSystem::Alphabetic => write!(f, "Alphabetic"),
314            WritingSystem::Syllabic => write!(f, "Syllabic"),
315            WritingSystem::Logographic => write!(f, "Logographic"),
316            WritingSystem::Logosyllabic => write!(f, "Logosyllabic"),
317            WritingSystem::Abjad => write!(f, "Abjad (consonantal)"),
318            WritingSystem::Abugida => write!(f, "Abugida"),
319            WritingSystem::Undeciphered => write!(f, "Undeciphered"),
320            WritingSystem::Other(s) => write!(f, "{}", s),
321        }
322    }
323}
324
325// =============================================================================
326// Historical Provenance
327// =============================================================================
328
329/// Full provenance information for historical/ancient text.
330///
331/// Captures the archaeological, temporal, and linguistic context
332/// that is essential for proper interpretation of ancient texts.
333#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
334pub struct HistoricalProvenance {
335    /// Date or date range of the text
336    pub date: Option<HistoricalDate>,
337    /// Physical medium
338    pub medium: Option<EpigraphicMedium>,
339    /// Script name (e.g., "Cypro-Minoan", "Linear B", "Demotic")
340    pub script: Option<String>,
341    /// Writing system classification
342    pub writing_system: Option<WritingSystem>,
343    /// Language (if known)
344    pub language: Option<String>,
345    /// Find spot / provenance (e.g., "Enkomi, Cyprus")
346    pub find_spot: Option<String>,
347    /// Current location (museum, collection)
348    pub current_location: Option<String>,
349    /// Corpus/catalog identifier (e.g., "ENKO", "KN", "P.Oxy.")
350    pub corpus: Option<String>,
351    /// Object number within corpus
352    pub object_number: Option<String>,
353    /// Publication reference
354    pub publication: Option<String>,
355    /// Preservation state (0.0 = destroyed, 1.0 = perfect)
356    pub preservation: Option<f64>,
357    /// Whether text is fragmentary
358    pub fragmentary: bool,
359    /// Additional notes
360    pub notes: Option<String>,
361}
362
363impl HistoricalProvenance {
364    /// Create empty provenance.
365    pub fn new() -> Self {
366        Self::default()
367    }
368
369    /// Set the date.
370    pub fn with_date(mut self, date: HistoricalDate) -> Self {
371        self.date = Some(date);
372        self
373    }
374
375    /// Set the medium.
376    pub fn with_medium(mut self, medium: EpigraphicMedium) -> Self {
377        self.medium = Some(medium);
378        self
379    }
380
381    /// Set the script name.
382    pub fn with_script(mut self, script: impl Into<String>) -> Self {
383        self.script = Some(script.into());
384        self
385    }
386
387    /// Set the writing system.
388    pub fn with_writing_system(mut self, system: WritingSystem) -> Self {
389        self.writing_system = Some(system);
390        self
391    }
392
393    /// Set the language.
394    pub fn with_language(mut self, lang: impl Into<String>) -> Self {
395        self.language = Some(lang.into());
396        self
397    }
398
399    /// Set the find spot.
400    pub fn with_find_spot(mut self, spot: impl Into<String>) -> Self {
401        self.find_spot = Some(spot.into());
402        self
403    }
404
405    /// Set the current location.
406    pub fn with_current_location(mut self, loc: impl Into<String>) -> Self {
407        self.current_location = Some(loc.into());
408        self
409    }
410
411    /// Set the corpus identifier.
412    pub fn with_corpus(mut self, corpus: impl Into<String>) -> Self {
413        self.corpus = Some(corpus.into());
414        self
415    }
416
417    /// Set the object number.
418    pub fn with_object_number(mut self, num: impl Into<String>) -> Self {
419        self.object_number = Some(num.into());
420        self
421    }
422
423    /// Set preservation state (0.0-1.0).
424    pub fn with_preservation(mut self, pres: f64) -> Self {
425        self.preservation = Some(pres.clamp(0.0, 1.0));
426        self
427    }
428
429    /// Mark as fragmentary.
430    pub fn fragmentary(mut self) -> Self {
431        self.fragmentary = true;
432        self
433    }
434
435    /// Add notes.
436    pub fn with_notes(mut self, notes: impl Into<String>) -> Self {
437        self.notes = Some(notes.into());
438        self
439    }
440
441    /// Check if from Bronze Age.
442    pub fn is_bronze_age(&self) -> bool {
443        self.date
444            .as_ref()
445            .map(|d| d.is_bronze_age())
446            .unwrap_or(false)
447    }
448
449    /// Check if from Iron Age.
450    pub fn is_iron_age(&self) -> bool {
451        self.date.as_ref().map(|d| d.is_iron_age()).unwrap_or(false)
452    }
453
454    /// Check if ancient (before 500 CE).
455    pub fn is_ancient(&self) -> bool {
456        self.date.as_ref().map(|d| d.is_ancient()).unwrap_or(false)
457    }
458
459    /// Check if the text is in an undeciphered script.
460    pub fn is_undeciphered(&self) -> bool {
461        self.writing_system == Some(WritingSystem::Undeciphered)
462    }
463
464    /// Format a citation string.
465    pub fn citation(&self) -> String {
466        let mut parts = Vec::new();
467
468        if let Some(ref corpus) = self.corpus {
469            if let Some(ref num) = self.object_number {
470                parts.push(format!("{} {}", corpus, num));
471            } else {
472                parts.push(corpus.clone());
473            }
474        }
475
476        if let Some(ref date) = self.date {
477            parts.push(date.to_string());
478        }
479
480        if let Some(ref spot) = self.find_spot {
481            parts.push(spot.clone());
482        }
483
484        parts.join(", ")
485    }
486}
487
488impl std::fmt::Display for HistoricalProvenance {
489    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
490        write!(f, "{}", self.citation())
491    }
492}
493
494// =============================================================================
495// Tests
496// =============================================================================
497
/// Unit tests for the date, medium, writing-system and provenance types.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_historical_date_bce() {
        let date = HistoricalDate::bce(1500);
        assert_eq!(date.year_start, -1500);
        assert!(date.is_bronze_age());
        assert!(!date.is_iron_age());
    }

    #[test]
    fn test_historical_date_range() {
        let date = HistoricalDate::range_bce(1500, 1150);
        assert_eq!(date.year_start, -1500);
        assert_eq!(date.year_end, Some(-1150));
        assert!(date.is_bronze_age());
    }

    #[test]
    fn test_historical_date_display() {
        let date = HistoricalDate::range_bce(1500, 1150).circa();
        let s = date.to_string();
        assert!(s.contains("c."));
        assert!(s.contains("1500"));
        assert!(s.contains("BCE"));
    }

    #[test]
    fn test_historical_date_ce() {
        let date = HistoricalDate::ce(2024);
        assert_eq!(date.year_start, 2024);
        assert!(!date.is_ancient());
    }

    #[test]
    fn test_epigraphic_medium() {
        let medium = EpigraphicMedium::ClayTablet;
        assert!(medium.requires_specialized_ocr());
        assert!(medium.preservation_notes().contains("fire"));
    }

    #[test]
    fn test_writing_system() {
        let undeciphered = WritingSystem::Undeciphered;
        assert!(!undeciphered.is_deciphered());

        let alphabetic = WritingSystem::Alphabetic;
        assert!(alphabetic.has_word_boundaries());
    }

    #[test]
    fn test_historical_provenance_builder() {
        let prov = HistoricalProvenance::new()
            .with_date(HistoricalDate::range_bce(1500, 1150))
            .with_medium(EpigraphicMedium::ClayTablet)
            .with_script("Cypro-Minoan")
            .with_writing_system(WritingSystem::Undeciphered)
            .with_find_spot("Enkomi, Cyprus")
            .with_corpus("ENKO")
            .with_object_number("001")
            .fragmentary();

        assert!(prov.is_bronze_age());
        assert!(prov.is_undeciphered());
        assert!(prov.fragmentary);
        assert_eq!(prov.script, Some("Cypro-Minoan".to_string()));
    }

    #[test]
    fn test_historical_provenance_citation() {
        let prov = HistoricalProvenance::new()
            .with_corpus("ENKO")
            .with_object_number("001")
            .with_date(HistoricalDate::range_bce(1500, 1150))
            .with_find_spot("Enkomi, Cyprus");

        let citation = prov.citation();
        assert!(citation.contains("ENKO 001"));
        assert!(citation.contains("BCE"));
        assert!(citation.contains("Enkomi"));
    }

    #[test]
    fn test_midpoint() {
        let point = HistoricalDate::bce(1500);
        assert_eq!(point.midpoint(), -1500);

        let range = HistoricalDate::range_bce(1500, 1200);
        assert_eq!(range.midpoint(), -1350);
    }

    #[test]
    fn test_era_display() {
        assert_eq!(Era::BCE.to_string(), "BCE");
        assert_eq!(Era::CE.to_string(), "CE");
    }

    #[test]
    fn test_preservation() {
        let prov = HistoricalProvenance::new().with_preservation(0.75);
        assert_eq!(prov.preservation, Some(0.75));

        // Clamping
        let clamped = HistoricalProvenance::new().with_preservation(1.5);
        assert_eq!(clamped.preservation, Some(1.0));
    }

    #[test]
    fn test_epic_literature_provenance() {
        // Mahābhārata-style provenance: composed ~400 BCE - 400 CE
        // but set in mythological time (Dwapara Yuga).
        //
        // `range_bce` forces both bounds negative, so `range_bce(400, -400)`
        // would describe 400-400 BCE. A span that crosses into the Common Era
        // has to be built from the public fields directly.
        let composition = HistoricalDate {
            year_start: -400,
            year_end: Some(400),
            era: Era::BCE,
            circa: false,
            note: Some("composition period".to_string()),
        };
        assert_eq!(composition.midpoint(), 0);

        let mahabharata = HistoricalProvenance::new()
            .with_date(composition)
            .with_script("Devanagari")
            .with_corpus("Mahābhārata");

        // The composition period spans BCE to CE
        assert!(!mahabharata.is_bronze_age());
        assert!(mahabharata.is_ancient());
        assert_eq!(
            mahabharata.date.as_ref().and_then(|d| d.year_end),
            Some(400)
        );
        assert_eq!(mahabharata.script, Some("Devanagari".to_string()));
    }

    #[test]
    fn test_circa_display() {
        let approx = HistoricalDate::bce(1500).circa();
        let display = approx.to_string();
        assert!(display.contains("c."));
        assert!(display.contains("1500"));
    }

    #[test]
    fn test_date_ordering() {
        let bronze = HistoricalDate::bce(1500);
        let iron = HistoricalDate::bce(800);
        let modern = HistoricalDate::ce(2000);

        // Midpoint ordering works for chronological sorting
        assert!(bronze.midpoint() < iron.midpoint());
        assert!(iron.midpoint() < modern.midpoint());
    }
}