astorion/lib.rs
1extern crate self as astorion;
2
3use regex::Regex;
4
5#[macro_use]
6mod macros;
7mod api;
8mod engine;
9mod rules;
10
11mod time_expr;
12
13pub use api::{
14 Context, Entity, NodeSummary, Options, ParseDetails, ParseResult, RegexProfilingOptions, parse, parse_verbose_with,
15 parse_with,
16};
17
18use crate::time_expr::TimeExpr;
19
20// --- Internal types ---------------------------------------------------------
21
/// Broad semantic category of a parsed value.
///
/// Compared when deduplicating nodes in `Stash::union` and listed in
/// `Rule::deps` to gate rule activation on previously found dimensions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) enum Dimension {
    /// A date/time expression (payload: `TokenKind::TimeExpr`).
    Time,
    /// A raw regex match against the input (payload: `TokenKind::RegexMatch`).
    RegexMatch,
    /// A numeric value (payload: `TokenKind::Numeral`).
    Numeral,
    // later: Number, AmountOfMoney, ...
}
29
/// A parsed value discovered by a rule: its broad `Dimension` plus the
/// dimension-specific payload in `kind`.
#[derive(Debug, Clone)]
pub(crate) struct Token {
    /// Category of this token; expected to agree with the `kind` variant.
    pub dim: Dimension,
    /// Dimension-specific payload.
    pub kind: TokenKind,
}
35
/// Payload of a `Dimension::Numeral` token.
#[derive(Debug, Clone)]
pub(crate) struct NumeralData {
    /// Parsed numeric value. Compared by equality in `Stash::union` dedup.
    pub value: f64,
    // NOTE(review): `grain` semantics (magnitude? digit count?) are not
    // visible in this file — confirm against the numeral rules.
    pub grain: Option<u32>,
    // Whether this numeral may combine with a multiplier — presumably
    // consumed by composing rules; verify in `rules`.
    pub multipliable: bool,
}
42
/// Dimension-specific token payload; variants mirror `Dimension`.
#[derive(Debug, Clone)]
pub(crate) enum TokenKind {
    /// Numeric value (`Dimension::Numeral`).
    Numeral(NumeralData),
    /// Time expression (`Dimension::Time`).
    TimeExpr(TimeExpr),
    /// Captured strings from a regex match (`Dimension::RegexMatch`).
    /// Only the first element is compared during `Stash::union` dedup.
    RegexMatch(Vec<String>),
}
49
/// Converts a rule production's result into a `Token`.
///
/// Returning `None` lets an implementation decline to emit a token.
pub(crate) trait IntoToken {
    fn into_token(self) -> Option<Token>;
}
54
55impl IntoToken for TimeExpr {
56 fn into_token(self) -> Option<Token> {
57 Some(Token { dim: Dimension::Time, kind: TokenKind::TimeExpr(self) })
58 }
59}
60
61impl IntoToken for NumeralData {
62 fn into_token(self) -> Option<Token> {
63 Some(Token { dim: Dimension::Numeral, kind: TokenKind::Numeral(self) })
64 }
65}
66
// Pattern items used by rules: either a Regex to match text, or a Predicate
// that matches an existing token in the stash. A `Rule::pattern` is a
// positional sequence of these items.
#[derive(Debug)]
pub(crate) enum Pattern {
    /// Match a regular expression against the original input. The `Regex`
    /// is stored as a static reference (created via a `regex!` helper macro
    /// in `src/macros.rs`), so patterns are compiled once.
    Regex(&'static Regex),

    /// Match an already-discovered `Token` using a predicate function. This
    /// allows rules to combine previously found tokens (from the `Stash`).
    Predicate(fn(&Token) -> bool),
}
80
/// Production function of a rule: receives the matched tokens and optionally
/// returns the produced `Token`. Boxed so rules can hold arbitrary closures;
/// `Send + Sync` so rule tables can be shared across threads.
pub(crate) type Production = Box<dyn Fn(&[Token]) -> Option<Token> + Send + Sync>;
82
/// A parsing rule: a name, a positional `pattern` (vector of `Pattern` items)
/// and a `production` function that receives the matched tokens and
/// optionally returns a new `Token`.
///
/// Optional metadata fields enable selective rule activation (Step 3-4).
pub(crate) struct Rule {
    /// Human-readable rule identifier; recorded on produced nodes
    /// (`Node::rule_name`) for ranking/classification.
    pub name: &'static str,
    /// Positional sequence of patterns the rule matches against input/stash.
    pub pattern: Vec<Pattern>,
    /// Builds the resulting token from the matched tokens; may return `None`.
    pub production: Production,
    /// Required phrases - ALL must appear in input for this rule to activate (AND logic).
    pub required_phrases: &'static [&'static str],
    /// Optional phrases - ANY one must appear in input for this rule to activate (OR logic).
    pub optional_phrases: &'static [&'static str],
    /// Bucket mask - rule only activates if input has matching buckets.
    pub buckets: u32,
    /// Required dimensions in stash before this rule activates.
    pub deps: &'static [Dimension],
    /// Priority for deterministic tie-breaking (higher = preferred).
    pub priority: u16,
}
103
104impl std::fmt::Debug for Rule {
105 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
106 f.debug_struct("Rule")
107 .field("name", &self.name)
108 .field("pattern", &self.pattern)
109 .field("production", &"<function>")
110 .field("buckets", &self.buckets)
111 .finish()
112 }
113}
114
/// Half-open byte range `[start, end)` into the original input.
///
/// Two `usize`s of plain data, so it derives `Copy` (cheap pass-by-value,
/// no forced `.clone()` calls) plus the comparison/hashing traits so ranges
/// can be compared directly and used in keys — the same derive set as
/// `Dimension`. Purely additive: existing `Clone`/`Debug` users are
/// unaffected.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) struct Range {
    /// Start byte index (inclusive).
    pub start: usize,
    /// End byte index (exclusive).
    pub end: usize,
}
122
/// Internal resolved token: a `Node` (parse-tree leaf), its resolved string
/// value, and a `latent` flag. This is converted to the public `Entity`
/// by higher-level API functions (not implemented in v1).
#[derive(Debug, Clone)]
pub(crate) struct ResolvedToken {
    /// Parse-tree node this resolution came from (carries range and token).
    pub node: Node,
    /// Resolved value — for now just a `String`; may become structured later.
    pub value: String,
    // NOTE(review): `latent` presumably marks low-confidence/context-dependent
    // matches — confirm against the resolver that sets it.
    pub latent: bool,
}
132
/// Basic parse tree node produced by rules. `Node` pairs a `Token` with the
/// consumed `Range` from the original input.
#[derive(Debug, Clone)]
pub(crate) struct Node {
    /// Byte span of the input consumed by this node.
    pub range: Range,
    /// The token produced by the rule.
    pub token: Token,
    /// Name of the rule that produced this node (used for ranking/classification).
    pub rule_name: &'static str,
    /// Names of rules that directly contributed to producing this node.
    ///
    /// This is derived from the matched route (the tokens consumed by the rule),
    /// and is used as classifier "features".
    pub evidence: Vec<&'static str>,
}
147
148// --- Stash: lightweight container for discovered nodes ----------------------
149
/// Lightweight container for discovered nodes.
///
/// Nodes are kept in insertion order; sorting happens on demand in the
/// accessor methods and duplicates are only removed by `union`.
#[derive(Debug, Clone)]
pub(crate) struct Stash {
    // Insertion-ordered; `insert` appends unconditionally.
    nodes: Vec<Node>,
}
154
155impl Stash {
156 /// Create an empty `Stash`.
157 pub fn empty() -> Self {
158 Stash { nodes: Vec::new() }
159 }
160
161 /// Return true if the stash is empty.
162 pub fn null(&self) -> bool {
163 self.nodes.is_empty()
164 }
165
166 /// Get the nodes in this stash.
167 pub fn get_nodes(&self) -> Vec<Node> {
168 self.nodes.clone()
169 }
170
171 /// Return nodes sorted by `(start, end)`.
172 pub fn to_pos_ordered_list(&self) -> Vec<Node> {
173 let mut v = self.nodes.clone();
174 v.sort_by_key(|n| (n.range.start, n.range.end));
175 v
176 }
177
178 /// Return nodes sorted and filtered to those starting at or after `position`.
179 pub fn to_pos_ordered_list_from(&self, position: usize) -> Vec<Node> {
180 self.to_pos_ordered_list().into_iter().filter(|n| n.range.start >= position).collect()
181 }
182
183 /// Union two stashes; keeps nodes deduplicated by (start,end,dim[,numeral value]).
184 ///
185 /// When two nodes share the same position and dimension they are
186 /// de-duplicated; for `Numeral` tokens the numeric value is also
187 /// compared to avoid merging distinct numbers.
188 pub fn union(&self, other: &Stash) -> Stash {
189 let mut combined = self.nodes.clone();
190 combined.extend(other.nodes.clone());
191
192 // Deduplicate by position + dimension + token content.
193 //
194 // This must stay in sync with `Parser::node_key` semantics: many rules
195 // can produce multiple distinct `Time` values for the same span (e.g.
196 // raw-input vs a normalized holiday description), and we must not
197 // collapse them before resolution.
198 combined.sort_by_key(|n| (n.range.start, n.range.end));
199 combined.dedup_by(|a, b| {
200 if a.range.start != b.range.start
201 || a.range.end != b.range.end
202 || a.token.dim != b.token.dim
203 || a.rule_name != b.rule_name
204 || a.evidence != b.evidence
205 {
206 return false;
207 }
208
209 match (&a.token.kind, &b.token.kind) {
210 (crate::TokenKind::Numeral(da), crate::TokenKind::Numeral(db)) => da.value == db.value,
211 (crate::TokenKind::TimeExpr(ea), crate::TokenKind::TimeExpr(eb)) => ea == eb,
212 (crate::TokenKind::RegexMatch(ga), crate::TokenKind::RegexMatch(gb)) => ga.first() == gb.first(),
213 _ => false,
214 }
215 });
216
217 Stash { nodes: combined }
218 }
219
220 /// Insert a node into the stash (appends to internal vector).
221 pub fn insert(&mut self, node: Node) {
222 self.nodes.push(node);
223 }
224}
225
226// (Public API lives in `src/api.rs`.)
227
228// --- Internal pipeline ------------------------------------------------------
229
230// For v1: dumb "analyzer".
231// Here is where you’ll later plug in real rules / models.
232//
233// For demonstration, we:
234// - look for the word "tomorrow"
235// - return a single Time token if found
236// fn analyze(input: &str) -> Vec<ResolvedToken> {
237// let needle = "tomorrow";
238// if let Some(start) = input.find(needle) {
239// let end = start + needle.len();
240//
241// let token = Token { dim: Dimension::Time };
242//
243// let resolved = ResolvedToken {
244// range: Range { start, end },
245// token,
246// // Later this could be a structured datetime value
247// value: "2025-12-12".to_string(), // dummy example
248// latent: false,
249// };
250//
251// vec![resolved]
252// } else {
253// vec![]
254// }
255// }
256
257// Convert internal representation to API `Entity`.
258// fn format_token(input: &str, resolved: ResolvedToken) -> Entity {
259// let Range { start, end } = resolved.range;
260// let dim = resolved.token.dim;
261//
262// let body = input.get(start..end).unwrap_or("").to_string();
263// let name = to_name(dim).to_string();
264//
265// Entity {
266// name,
267// body,
268// value: resolved.value,
269// start,
270// end,
271// latent: resolved.latent,
272// }
273// }
274
275// Map dimension to its string name.
276// fn to_name(dim: Dimension) -> &'static str {
277// match dim {
278// Dimension::Time => "time",
279// }
280// }