synox/blinkfill/
language.rs

1use super::token::Token;
2use crate::private::Sealed;
3use crate::StringProgram;
4use std::fmt::Debug;
5
/// Zero-based index of a column within an input row.
///
/// `SubstringExpression::run` uses the wrapped value directly as the index passed to
/// `row.get(..)`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct ColumnIndex(pub usize);
8
9#[derive(Debug, PartialEq, Eq, Clone)]
10pub struct StringExpression(pub Vec<SubstringExpression>);
11
12impl Sealed for StringExpression {}
13
14impl StringProgram for StringExpression {
15    fn run<S: AsRef<str>>(&self, row: &[S]) -> Option<String> {
16        self.0.iter().fold(Some(String::new()), |acc, e| {
17            acc.and_then(|mut s| {
18                e.run(row).map(|part| {
19                    s.push_str(&part);
20                    s
21                })
22            })
23        })
24    }
25}
26
27#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)]
28pub enum SubstringExpression {
29    ConstantString(String),
30    Substring(ColumnIndex, Position, Position),
31}
32
33impl SubstringExpression {
34    pub fn run<S: AsRef<str>>(&self, row: &[S]) -> Option<String> {
35        match self {
36            SubstringExpression::ConstantString(s) => Some(s.clone()),
37            SubstringExpression::Substring(ci, p_start, p_end) => {
38                let s = row.get(ci.0)?;
39                let p_start = p_start.run(s.as_ref())?;
40                let p_end = p_end.run(s.as_ref())?;
41                if p_start.0 >= p_end.0 {
42                    return None;
43                }
44                Some(String::from(&s.as_ref()[p_start.0 - 1..p_end.0 - 1]))
45            }
46        }
47    }
48}
49
/// A one-based index into a string.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct StringIndex(pub usize);
53
/// A signed occurrence number used by [`Position`].
///
/// When a position is resolved, positive values count from the start and non-positive
/// values are taken relative to the end.
#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)]
pub struct Occurrence(pub isize);

impl Occurrence {
    /// Returns the negated magnitude of the occurrence number, `-|k|`.
    ///
    /// The result is never positive and is largest (closest to zero) for small magnitudes.
    /// It is computed without `abs()`, so it does not overflow for `isize::MIN`.
    pub fn weight(&self) -> isize {
        // prefer occurrences closer to ends
        let k = self.0;
        if k < 0 {
            // -|k| == k for negative k; avoids the overflow of `isize::MIN.abs()`.
            k
        } else {
            // Negating a non-negative isize can never overflow.
            -k
        }
    }
}
63
64#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)]
65pub enum Position {
66    Match(Token, Occurrence, Direction),
67    ConstantPosition(Occurrence),
68}
69
70impl Position {
71    fn run(&self, s: &str) -> Option<StringIndex> {
72        match self {
73            Position::Match(token, k, dir) => {
74                let k = k.0;
75                let matches = token.all_matches(s);
76                let n = matches.len() as isize;
77                let k = if k > 0 { k - 1 } else { n + k };
78                if !(0 <= k && k < n) {
79                    return None;
80                }
81                // now, k is a 0-based index, and we know that it's in bounds
82                let r = &matches[k as usize];
83                match dir {
84                    Direction::Start => Some(StringIndex(r.start)),
85                    Direction::End => Some(StringIndex(r.end)),
86                }
87            }
88            Position::ConstantPosition(k) => {
89                let k = k.0;
90                let n = s.len() as isize;
91                let k = if k > 0 { k } else { n + k + 1 };
92                if !(0 < k && k <= n + 1) {
93                    None
94                } else {
95                    Some(StringIndex(k as usize))
96                }
97            }
98        }
99    }
100}
101
/// Selects which edge of a token match a `Position::Match` resolves to.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Direction {
    /// Use the match's `start`.
    Start,
    /// Use the match's `end`.
    End,
}
107
#[cfg(test)]
mod tests {
    use super::*;
    use Direction::*;
    use Position::*;
    use SubstringExpression::*;

    /// Runs `p` on a one-column row containing `s` and checks that it produces `expected`.
    fn assert_eval_single(p: &impl StringProgram, s: &str, expected: &str) {
        let row = [s];
        let actual = p.run(&row).unwrap();
        assert_eq!(actual, expected);
    }

    /// Checks every `(input, expected)` pair against `p`.
    fn assert_eval_all(p: &impl StringProgram, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eval_single(p, input, expected);
        }
    }

    /// Shorthand for a literal token.
    fn literal(text: &str) -> Token {
        Token::Literal(String::from(text))
    }

    /// Shorthand for a constant-string part.
    fn constant(text: &str) -> SubstringExpression {
        ConstantString(String::from(text))
    }

    #[test]
    fn extract_country() {
        let after_comma = Match(literal(", "), Occurrence(1), End);
        let end_of_input = Match(Token::End, Occurrence(-1), Start);
        let p = StringExpression(vec![Substring(
            ColumnIndex(0),
            after_comma,
            end_of_input,
        )]);
        assert_eval_all(
            &p,
            &[
                ("Mumbai, India", "India"),
                (
                    "Los Angeles, United States of America",
                    "United States of America",
                ),
                ("Newark, United States", "United States"),
            ],
        );
    }

    #[test]
    fn extract_initials() {
        let first_initial = Substring(
            ColumnIndex(0),
            Match(Token::CapsWithSpaces, Occurrence(1), Start),
            Match(Token::Caps, Occurrence(1), End),
        );
        let last_initial = Substring(
            ColumnIndex(0),
            Match(Token::Whitespace, Occurrence(-1), End),
            Match(Token::Lowercase, Occurrence(-1), Start),
        );
        let p = StringExpression(vec![
            first_initial,
            constant("."),
            last_initial,
            constant("."),
        ]);
        assert_eval_all(
            &p,
            &[
                ("Brandon Henry Saunders", "B.S."),
                ("William Lee", "W.L."),
                ("Dafna Q. Chen", "D.C."),
                ("Danielle D. Saunders", "D.S."),
            ],
        );
    }

    #[test]
    fn add_bracket() {
        let through_digits = Substring(
            ColumnIndex(0),
            Match(Token::Start, Occurrence(1), End),
            Match(Token::Digits, Occurrence(1), End),
        );
        let p = StringExpression(vec![through_digits, constant("]")]);
        assert_eval_all(
            &p,
            &[
                ("[CPT-00350", "[CPT-00350]"),
                ("[CPT-00340", "[CPT-00340]"),
                ("[CPT-11536]", "[CPT-11536]"),
                ("[CPT-115]", "[CPT-115]"),
            ],
        );
    }

    #[test]
    fn constant_strings() {
        let after_prefix = Match(literal("nextData "), Occurrence(1), End);
        let before_suffix = Match(literal(" moreInfo"), Occurrence(1), Start);
        let p = StringExpression(vec![Substring(
            ColumnIndex(0),
            after_prefix,
            before_suffix,
        )]);
        assert_eval_all(
            &p,
            &[
                ("nextData 12 Street moreInfo 35", "12 Street"),
                ("nextData Main moreInfo 36", "Main"),
                ("nextData Albany Street moreInfo 37", "Albany Street"),
                ("nextData Green Street moreInfo 39", "Green Street"),
            ],
        );
    }

    #[test]
    fn constant_position() {
        let third_char = ConstantPosition(Occurrence(3));
        let before_pipe = Match(literal("|"), Occurrence(1), Start);
        let p = StringExpression(vec![Substring(ColumnIndex(0), third_char, before_pipe)]);
        assert_eval_single(&p, "xzHello|asdofij", "Hello");
    }
}