//! Crate root for `astorion` (`src/lib.rs`): internal token/rule/stash types
//! shared by the parsing engine; the public API is re-exported from `api`.
1extern crate self as astorion;
2
3use regex::Regex;
4
5#[macro_use]
6mod macros;
7mod api;
8mod engine;
9mod rules;
10
11mod time_expr;
12
13pub use api::{
14    Context, Entity, NodeSummary, Options, ParseDetails, ParseResult, RegexProfilingOptions, parse, parse_verbose_with,
15    parse_with,
16};
17
18use crate::time_expr::TimeExpr;
19
20// --- Internal types ---------------------------------------------------------
21
/// Coarse semantic category of a token.
///
/// Used to tag tokens for rule matching (`Rule::deps`) and for
/// deduplication in `Stash::union`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) enum Dimension {
    /// A temporal expression (payload: `TokenKind::TimeExpr`).
    Time,
    /// A raw regex match (payload: `TokenKind::RegexMatch`).
    RegexMatch,
    /// A numeric value (payload: `TokenKind::Numeral`).
    Numeral,
    // later: AmountOfMoney, Ordinal, ...
}
29
/// A value discovered during parsing: a coarse `Dimension` tag plus the
/// dimension-specific payload in `kind`.
#[derive(Debug, Clone)]
pub(crate) struct Token {
    /// Coarse dimension tag, kept alongside `kind` for cheap filtering.
    pub dim: Dimension,
    /// Dimension-specific payload.
    pub kind: TokenKind,
}
35
/// Payload of a `Numeral` token.
#[derive(Debug, Clone)]
pub(crate) struct NumeralData {
    /// Parsed numeric value.
    pub value: f64,
    /// Magnitude/precision hint — NOTE(review): exact semantics are not
    /// visible in this file; confirm against the numeral rules.
    pub grain: Option<u32>,
    /// Whether this numeral may combine with a multiplier — TODO confirm
    /// against the rules that read it.
    pub multipliable: bool,
}
42
/// Dimension-specific token payloads.
#[derive(Debug, Clone)]
pub(crate) enum TokenKind {
    /// Numeric value data.
    Numeral(NumeralData),
    /// A structured time expression.
    TimeExpr(TimeExpr),
    /// Strings captured by a regex match. `Stash::union` compares only the
    /// first element when deduplicating — presumably the full match text.
    RegexMatch(Vec<String>),
}
49
/// Converts rule production results into tokens.
///
/// Returning `None` lets a production reject a match even after its
/// pattern succeeded.
pub(crate) trait IntoToken {
    fn into_token(self) -> Option<Token>;
}
54
55impl IntoToken for TimeExpr {
56    fn into_token(self) -> Option<Token> {
57        Some(Token { dim: Dimension::Time, kind: TokenKind::TimeExpr(self) })
58    }
59}
60
61impl IntoToken for NumeralData {
62    fn into_token(self) -> Option<Token> {
63        Some(Token { dim: Dimension::Numeral, kind: TokenKind::Numeral(self) })
64    }
65}
66
// Pattern items used by rules: either a Regex to match text, or a Predicate
// that matches an existing token in the stash.
#[derive(Debug)]
pub(crate) enum Pattern {
    /// Match a regular expression against the original input. The `Regex`
    /// is stored as a static reference (created via a `regex!` helper macro
    /// in `src/macros.rs`), so patterns are compiled once and shared.
    Regex(&'static Regex),

    /// Match an already-discovered `Token` using a predicate function. This
    /// allows rules to combine previously found tokens (from the `Stash`).
    /// A plain `fn` pointer (not a closure) keeps the enum `Debug`-able.
    Predicate(fn(&Token) -> bool),
}
80
/// Production function of a rule: receives the matched tokens (presumably
/// one per pattern item — confirm in `engine`) and optionally returns a new
/// `Token`. Boxed so heterogeneous closures can be stored uniformly;
/// `Send + Sync` allows rules to be shared across threads.
pub(crate) type Production = Box<dyn Fn(&[Token]) -> Option<Token> + Send + Sync>;
82
/// A parsing rule: a name, a positional `pattern` (vector of `Pattern` items)
/// and a `production` function that receives the matched tokens and
/// optionally returns a new `Token`.
///
/// Optional metadata fields enable selective rule activation (Step 3-4).
pub(crate) struct Rule {
    /// Rule name; recorded on produced `Node`s (`Node::rule_name`) for
    /// ranking/classification.
    pub name: &'static str,
    /// Positional pattern items the rule matches against.
    pub pattern: Vec<Pattern>,
    /// Builds the output token from the matched tokens.
    pub production: Production,
    /// Required phrases - ALL must appear in input for this rule to activate (AND logic).
    pub required_phrases: &'static [&'static str],
    /// Optional phrases - ANY one must appear in input for this rule to activate (OR logic).
    pub optional_phrases: &'static [&'static str],
    /// Bucket mask - rule only activates if input has matching buckets.
    pub buckets: u32,
    /// Required dimensions in stash before this rule activates.
    pub deps: &'static [Dimension],
    /// Priority for deterministic tie-breaking (higher = preferred).
    pub priority: u16,
}
103
104impl std::fmt::Debug for Rule {
105    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
106        f.debug_struct("Rule")
107            .field("name", &self.name)
108            .field("pattern", &self.pattern)
109            .field("production", &"<function>")
110            .field("buckets", &self.buckets)
111            .finish()
112    }
113}
114
/// Byte span in the original input.
///
/// Plain-old-data (two `usize`s), so `Copy`, `PartialEq`, `Eq`, and `Hash`
/// are derived: ranges are compared and used in sort keys throughout the
/// stash code, and `Copy` avoids needless `.clone()` calls. These derives
/// are purely additive and backward-compatible.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) struct Range {
    /// Start byte index (inclusive).
    pub start: usize,
    /// End byte index (exclusive).
    pub end: usize,
}
122
/// Internal resolved token: a `Node` (parse-tree leaf), its resolved string
/// value, and a `latent` flag. This is converted to the public `Entity`
/// by higher-level API functions (not implemented in v1).
#[derive(Debug, Clone)]
pub(crate) struct ResolvedToken {
    /// The parse-tree node this resolution came from.
    pub node: Node,
    /// Resolved value; for now, just a `String` (later a structured value).
    pub value: String,
    /// Latent flag — NOTE(review): presumably marks lower-confidence /
    /// implicit matches; confirm semantics in the resolver (`api`/`engine`).
    pub latent: bool,
}
132
/// Basic parse tree node produced by rules. `Node` pairs a `Token` with the
/// consumed `Range` from the original input.
#[derive(Debug, Clone)]
pub(crate) struct Node {
    /// Byte span of input consumed by this node.
    pub range: Range,
    /// The token produced for that span.
    pub token: Token,
    /// Name of the rule that produced this node (used for ranking/classification).
    pub rule_name: &'static str,
    /// Names of rules that directly contributed to producing this node.
    ///
    /// This is derived from the matched route (the tokens consumed by the rule),
    /// and is used as classifier "features".
    pub evidence: Vec<&'static str>,
}
147
148// --- Stash: lightweight container for discovered nodes ----------------------
149
/// Lightweight, clonable container for the `Node`s discovered so far.
#[derive(Debug, Clone)]
pub(crate) struct Stash {
    // Nodes in insertion order; position ordering is imposed lazily by the
    // `to_pos_ordered_list*` accessors.
    nodes: Vec<Node>,
}
154
155impl Stash {
156    /// Create an empty `Stash`.
157    pub fn empty() -> Self {
158        Stash { nodes: Vec::new() }
159    }
160
    /// Return true if the stash is empty.
    ///
    /// NOTE(review): the name `null` is unidiomatic Rust (`is_empty` would
    /// be conventional) — presumably kept for parity with an upstream API;
    /// renaming would break callers, so it is only flagged here.
    pub fn null(&self) -> bool {
        self.nodes.is_empty()
    }
165
166    /// Get the nodes in this stash.
167    pub fn get_nodes(&self) -> Vec<Node> {
168        self.nodes.clone()
169    }
170
171    /// Return nodes sorted by `(start, end)`.
172    pub fn to_pos_ordered_list(&self) -> Vec<Node> {
173        let mut v = self.nodes.clone();
174        v.sort_by_key(|n| (n.range.start, n.range.end));
175        v
176    }
177
178    /// Return nodes sorted and filtered to those starting at or after `position`.
179    pub fn to_pos_ordered_list_from(&self, position: usize) -> Vec<Node> {
180        self.to_pos_ordered_list().into_iter().filter(|n| n.range.start >= position).collect()
181    }
182
    /// Union two stashes; keeps nodes deduplicated by (start,end,dim[,numeral value]).
    ///
    /// When two nodes share the same position and dimension they are
    /// de-duplicated; for `Numeral` tokens the numeric value is also
    /// compared to avoid merging distinct numbers.
    ///
    /// NOTE(review): `dedup_by` removes only *consecutive* duplicates, and
    /// the sort key is `(start, end)` alone — two equal nodes separated by a
    /// different node with the same span would both survive. Confirm whether
    /// that interleaving can occur given how rules emit nodes.
    pub fn union(&self, other: &Stash) -> Stash {
        let mut combined = self.nodes.clone();
        combined.extend(other.nodes.clone());

        // Deduplicate by position + dimension + token content.
        //
        // This must stay in sync with `Parser::node_key` semantics: many rules
        // can produce multiple distinct `Time` values for the same span (e.g.
        // raw-input vs a normalized holiday description), and we must not
        // collapse them before resolution.
        combined.sort_by_key(|n| (n.range.start, n.range.end));
        combined.dedup_by(|a, b| {
            // Cheap structural checks first; any mismatch means "keep both".
            if a.range.start != b.range.start
                || a.range.end != b.range.end
                || a.token.dim != b.token.dim
                || a.rule_name != b.rule_name
                || a.evidence != b.evidence
            {
                return false;
            }

            // Same span/dim/rule/evidence: compare payloads per dimension.
            match (&a.token.kind, &b.token.kind) {
                (crate::TokenKind::Numeral(da), crate::TokenKind::Numeral(db)) => da.value == db.value,
                (crate::TokenKind::TimeExpr(ea), crate::TokenKind::TimeExpr(eb)) => ea == eb,
                (crate::TokenKind::RegexMatch(ga), crate::TokenKind::RegexMatch(gb)) => ga.first() == gb.first(),
                _ => false,
            }
        });

        Stash { nodes: combined }
    }
219
    /// Insert a node into the stash (appends to the internal vector; no
    /// deduplication or ordering is applied here).
    pub fn insert(&mut self, node: Node) {
        self.nodes.push(node);
    }
224}
225
226// (Public API lives in `src/api.rs`.)
227
228// --- Internal pipeline ------------------------------------------------------
229
// For v1: a placeholder "analyzer" sketch, kept for reference only.
// Real rules / models plug in here later.
//
// NOTE(review): the commented-out code below predates the current type
// definitions (`Token` now has a `kind` field; `ResolvedToken` holds a
// `node`, not a `range`), so it would not compile as written. Treat it as
// a design sketch, not working code.
//
// For demonstration, it:
//   - looks for the word "tomorrow"
//   - returns a single Time token if found
236// fn analyze(input: &str) -> Vec<ResolvedToken> {
237//     let needle = "tomorrow";
238//     if let Some(start) = input.find(needle) {
239//         let end = start + needle.len();
240//
241//         let token = Token { dim: Dimension::Time };
242//
243//         let resolved = ResolvedToken {
244//             range: Range { start, end },
245//             token,
246//             // Later this could be a structured datetime value
247//             value: "2025-12-12".to_string(), // dummy example
248//             latent: false,
249//         };
250//
251//         vec![resolved]
252//     } else {
253//         vec![]
254//     }
255// }
256
257// Convert internal representation to API `Entity`.
258// fn format_token(input: &str, resolved: ResolvedToken) -> Entity {
259//     let Range { start, end } = resolved.range;
260//     let dim = resolved.token.dim;
261//
262//     let body = input.get(start..end).unwrap_or("").to_string();
263//     let name = to_name(dim).to_string();
264//
265//     Entity {
266//         name,
267//         body,
268//         value: resolved.value,
269//         start,
270//         end,
271//         latent: resolved.latent,
272//     }
273// }
274
275// Map dimension to its string name.
276// fn to_name(dim: Dimension) -> &'static str {
277//     match dim {
278//         Dimension::Time => "time",
279//     }
280// }