Skip to main content

text_processing_rs/taggers/
measure.rs

1//! Measure tagger.
2//!
3//! Converts spoken measurements to written form:
4//! - "two hundred meters" → "200 m"
5//! - "eighteen point five kilometers" → "18.5 km"
6//! - "two hundred kilometers per hour" → "200 km/h"
7//! - "thirty one thousand square feet" → "31000 sq ft"
8
9use super::cardinal::words_to_number;
10use super::decimal;
11
12/// Parse spoken measurement expression to written form.
13pub fn parse(input: &str) -> Option<String> {
14    let input = input.to_lowercase();
15    let input = input.trim();
16
17    // Try compound units first (most specific)
18    if let Some(result) = parse_compound_unit(input) {
19        return Some(result);
20    }
21
22    // Try simple unit
23    if let Some(result) = parse_simple_unit(input) {
24        return Some(result);
25    }
26
27    None
28}
29
30/// Parse compound units like "kilometers per hour" → "km/h"
31fn parse_compound_unit(input: &str) -> Option<String> {
32    // Special case: "X miles per hour" → "X mph"
33    if input.ends_with(" miles per hour") {
34        let num_part = input.strip_suffix(" miles per hour")?;
35        let num_value = parse_number_value(num_part.trim())?;
36        return Some(format!("{} mph", num_value));
37    }
38
39    // Special case: "X kilograms force per square centimeter" → "X kgf/cm²"
40    if input.ends_with(" kilograms force per square centimeter") {
41        let num_part = input.strip_suffix(" kilograms force per square centimeter")?;
42        let num_value = parse_number_value(num_part.trim())?;
43        return Some(format!("{} kgf/cm²", num_value));
44    }
45
46    // Special case: "X per square Y" without unit (e.g., "fifty six per square kilometer")
47    if let Some(idx) = input.find(" per square ") {
48        let num_part = &input[..idx];
49        let denom_part = &input[idx + 12..]; // " per square " is 12 chars
50
51        // Parse numerator (just number, no unit)
52        let num_value = parse_number_value(num_part.trim())?;
53        let denom_unit = get_unit_symbol(denom_part)?;
54
55        return Some(format!("{} /{}²", num_value, denom_unit));
56    }
57
58    // "X per cubic Y" pattern
59    if let Some(idx) = input.find(" per cubic ") {
60        let num_part = &input[..idx];
61        let denom_part = &input[idx + 11..];
62
63        let num_value = parse_number_value(num_part.trim())?;
64        let denom_unit = get_unit_symbol(denom_part)?;
65
66        return Some(format!("{} /{}³", num_value, denom_unit));
67    }
68
69    // "X unit per Y" pattern (e.g., "kilometers per hour")
70    if let Some(idx) = input.find(" per ") {
71        let num_unit_part = &input[..idx];
72        let denom_part = &input[idx + 5..];
73
74        // Try to parse as number + unit
75        if let Some((num_value, num_unit)) = parse_number_and_unit(num_unit_part) {
76            let denom_unit = get_unit_symbol(denom_part)?;
77            return Some(format!("{} {}/{}", num_value, num_unit, denom_unit));
78        }
79    }
80
81    None
82}
83
84/// Parse simple measurement: number + unit
85fn parse_simple_unit(input: &str) -> Option<String> {
86    let (value, unit) = parse_number_and_unit(input)?;
87    Some(format!("{} {}", value, unit))
88}
89
90/// Parse number and unit from input, returning (formatted_number, unit_symbol)
91fn parse_number_and_unit(input: &str) -> Option<(String, String)> {
92    // Handle negative
93    let (is_negative, rest) = if input.starts_with("minus ") {
94        (true, input.strip_prefix("minus ")?)
95    } else {
96        (false, input)
97    };
98
99    // Try to find unit at the end
100    let (num_part, unit_symbol) = extract_unit(rest)?;
101
102    // Parse the number part
103    let num_value = parse_number_value(num_part.trim())?;
104
105    let sign = if is_negative { "-" } else { "" };
106    Some((format!("{}{}", sign, num_value), unit_symbol))
107}
108
109/// Extract unit from end of string, return (number_part, unit_symbol)
110fn extract_unit(input: &str) -> Option<(&str, String)> {
111    // Check for "miles per hour" first - special case for mph
112    if input.ends_with(" miles per hour") {
113        let num_part = input.strip_suffix(" miles per hour")?;
114        return Some((num_part, "mph".to_string()));
115    }
116
117    // Check for square/cubic prefixes
118    let (prefix, rest, modifier) = if input.contains(" square ") {
119        let idx = input.rfind(" square ")?;
120        let after_square = &input[idx + 8..];
121        (&input[..idx], after_square, "sq")
122    } else if input.contains(" cubic ") {
123        let idx = input.rfind(" cubic ")?;
124        let after_cubic = &input[idx + 7..];
125        (&input[..idx], after_cubic, "³")
126    } else {
127        (input, "", "")
128    };
129
130    // If we have a modifier (square/cubic), parse the unit from rest
131    if !modifier.is_empty() {
132        let unit = get_unit_symbol(rest)?;
133        // Use "sq ft", "sq mi" format for imperial, "m²", "km²" for metric
134        let formatted = if modifier == "sq" {
135            match unit {
136                "ft" => "sq ft".to_string(),
137                "mi" => "sq mi".to_string(),
138                _ => format!("{}²", unit),
139            }
140        } else {
141            format!("{}{}", unit, modifier)
142        };
143        return Some((prefix, formatted));
144    }
145
146    // Try each unit pattern from longest to shortest
147    for (spoken, symbol) in get_unit_mappings() {
148        if input.ends_with(spoken) {
149            let num_part = input.strip_suffix(spoken)?.trim();
150            return Some((num_part, symbol.to_string()));
151        }
152    }
153
154    None
155}
156
157/// Get unit symbol from spoken unit name
158fn get_unit_symbol(unit_name: &str) -> Option<&'static str> {
159    let unit_name = unit_name.trim();
160
161    for (spoken, symbol) in get_unit_mappings() {
162        // Remove leading space from spoken pattern for matching
163        let spoken_trimmed = spoken.trim();
164        if unit_name == spoken_trimmed || unit_name == spoken_trimmed.trim_end_matches('s') {
165            return Some(symbol);
166        }
167    }
168
169    // Handle singular/plural variations
170    match unit_name {
171        "meter" | "meters" => Some("m"),
172        "kilometer" | "kilometers" => Some("km"),
173        "centimeter" | "centimeters" => Some("cm"),
174        "decimeter" | "decimeters" | "deci meter" | "deci meters" => Some("dm"),
175        "millimeter" | "millimeters" => Some("mm"),
176        "micrometer" | "micrometers" => Some("μm"),
177        "nanometer" | "nanometers" => Some("nm"),
178        "foot" | "feet" => Some("ft"),
179        "mile" | "miles" => Some("mi"),
180        "hour" | "hours" => Some("h"),
181        "second" | "seconds" => Some("s"),
182        "minute" | "minutes" => Some("min"),
183        "gram" | "grams" => Some("g"),
184        "kilogram" | "kilograms" => Some("kg"),
185        "hectare" | "hectares" => Some("ha"),
186        "liter" | "liters" | "litre" | "litres" => Some("l"),
187        "milliliter" | "milliliters" => Some("ml"),
188        _ => None,
189    }
190}
191
/// Return the spoken-suffix → written-symbol table.
///
/// Every spoken form starts with a single space so that it only matches a
/// whole trailing word (" meters" does not match "kilometers"). Entries are
/// grouped by category; within the table, an entry always appears before any
/// other entry that is a suffix of it (for example " kilo watt hours" before
/// " watt hours" before " hours"), so a first-match scan picks the most
/// specific unit.
///
/// NOTE(review): some symbols do not follow SI capitalisation (e.g. "mW" for
/// megawatts, "gWh", "ms" for mega siemens) — presumably they mirror a fixed
/// expected output; confirm before changing them.
fn get_unit_mappings() -> Vec<(&'static str, &'static str)> {
    const UNIT_TABLE: &[(&str, &str)] = &[
        // Compound/special units (longest first)
        (" kilo watt hours", "kWh"),
        (" giga watt hours", "gWh"),
        (" mega watt hours", "MWh"),
        (" watt hours", "Wh"),
        (" kilograms force", "kgf"),
        (" astronomical units", "au"),
        (" miles per hour", "mph"),
        (" kilometers per hour", "km/h"),
        // Square/cubic variations
        (" square kilometers", "km²"),
        (" square kilometer", "km²"),
        (" square meters", "m²"),
        (" square meter", "m²"),
        (" square feet", "sq ft"),
        (" square foot", "sq ft"),
        (" square miles", "sq mi"),
        (" square mile", "sq mi"),
        (" cubic meters", "m³"),
        (" cubic meter", "m³"),
        (" cubic deci meters", "dm³"),
        (" cubic decimeters", "dm³"),
        // Data units
        (" peta bytes", "pb"),
        (" petabytes", "pb"),
        (" giga bytes", "gb"),
        (" gigabytes", "gb"),
        (" mega bytes", "mb"),
        (" megabytes", "mb"),
        (" kilo bytes", "kb"),
        (" kilobytes", "kb"),
        (" kilobits", "kb"),
        (" bytes", "b"),
        // Power/Energy
        (" megawatts", "mW"),
        (" megawatt", "mW"),
        (" kilowatts", "kW"),
        (" kilowatt", "kW"),
        (" gigawatts", "gW"),
        (" watts", "W"),
        (" watt", "W"),
        (" horsepower", "hp"),
        // Data rates
        (" gigabits per second", "gbps"),
        (" gigabit per second", "gbps"),
        (" megabits per second", "mbps"),
        (" megabit per second", "mbps"),
        // Temperature
        (" degrees celsius", "°C"),
        (" degree celsius", "°C"),
        (" degrees fahrenheit", "°F"),
        (" degree fahrenheit", "°F"),
        (" kelvin", "K"),
        // Frequency
        (" megahertz", "mhz"),
        (" kilohertz", "khz"),
        (" hertz", "hz"),
        // Electrical
        (" milli volt", "mv"),
        (" millivolts", "mv"),
        (" volts", "v"),
        (" volt", "v"),
        (" mega siemens", "ms"),
        // Length
        (" micrometers", "μm"),
        (" micrometer", "μm"),
        (" nanometers", "nm"),
        (" nanometer", "nm"),
        (" millimeters", "mm"),
        (" millimeter", "mm"),
        (" centimeters", "cm"),
        (" centimeter", "cm"),
        (" kilometers", "km"),
        (" kilometer", "km"),
        (" meters", "m"),
        (" meter", "m"),
        (" feet", "ft"),
        (" foot", "ft"),
        (" miles", "mi"),
        (" mile", "mi"),
        (" ounces", "oz"),
        (" ounce", "oz"),
        // Mass
        (" kilograms", "kg"),
        (" kilogram", "kg"),
        (" grams", "g"),
        (" gram", "g"),
        // Volume
        (" kilo liters", "kl"),
        (" milliliters", "ml"),
        (" milliliter", "ml"),
        (" liters", "l"),
        (" liter", "l"),
        (" c c", "cc"),
        // Area
        (" hectares", "ha"),
        (" hectare", "ha"),
        // Time
        (" hours", "h"),
        (" hour", "h"),
        // Light
        (" lumens", "lm"),
        (" lumen", "lm"),
        // Percent
        (" percent", "%"),
    ];

    UNIT_TABLE.to_vec()
}
303
304/// Parse number value (cardinal, decimal, or with "point")
305fn parse_number_value(input: &str) -> Option<String> {
306    // Try decimal first (handles "point" patterns)
307    if input.contains(" point ") || input.starts_with("point ") {
308        return decimal::parse(input);
309    }
310
311    // Try cardinal
312    if let Some(num) = words_to_number(input) {
313        return Some((num as i64).to_string());
314    }
315
316    None
317}
318
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `parse` rewrites `spoken` as exactly `written`.
    #[track_caller]
    fn assert_written(spoken: &str, written: &str) {
        assert_eq!(parse(spoken), Some(written.to_string()));
    }

    #[test]
    fn test_simple_units() {
        assert_written("two hundred meters", "200 m");
        assert_written("ninety grams", "90 g");
        assert_written("three hours", "3 h");
    }

    #[test]
    fn test_decimal_units() {
        assert_written("eighteen point five kilometers", "18.5 km");
    }

    #[test]
    fn test_negative() {
        assert_written("minus sixty six kilograms", "-66 kg");
    }

    #[test]
    fn test_square_units() {
        assert_written("two square meters", "2 m²");
        assert_written("sixty five thousand square kilometers", "65000 km²");
    }

    #[test]
    fn test_compound_units() {
        assert_written("two hundred kilometers per hour", "200 km/h");
    }

    #[test]
    fn test_special_units() {
        assert_written("two kilo watt hours", "2 kWh");
        assert_written("one hundred fifty c c", "150 cc");
    }

    #[test]
    fn test_percent() {
        assert_written("eighteen point one four percent", "18.14 %");
    }
}