1use super::token::Token;
2use crate::private::Sealed;
3use crate::StringProgram;
4use std::fmt::Debug;
5
/// Zero-based index of a column within an input row.
///
/// The wrapped value is used directly as a slice index when a row is looked
/// up, so `ColumnIndex(0)` refers to the first column.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct ColumnIndex(pub usize);
8
9#[derive(Debug, PartialEq, Eq, Clone)]
10pub struct StringExpression(pub Vec<SubstringExpression>);
11
12impl Sealed for StringExpression {}
13
14impl StringProgram for StringExpression {
15 fn run<S: AsRef<str>>(&self, row: &[S]) -> Option<String> {
16 self.0.iter().fold(Some(String::new()), |acc, e| {
17 acc.and_then(|mut s| {
18 e.run(row).map(|part| {
19 s.push_str(&part);
20 s
21 })
22 })
23 })
24 }
25}
26
27#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)]
28pub enum SubstringExpression {
29 ConstantString(String),
30 Substring(ColumnIndex, Position, Position),
31}
32
33impl SubstringExpression {
34 pub fn run<S: AsRef<str>>(&self, row: &[S]) -> Option<String> {
35 match self {
36 SubstringExpression::ConstantString(s) => Some(s.clone()),
37 SubstringExpression::Substring(ci, p_start, p_end) => {
38 let s = row.get(ci.0)?;
39 let p_start = p_start.run(s.as_ref())?;
40 let p_end = p_end.run(s.as_ref())?;
41 if p_start.0 >= p_end.0 {
42 return None;
43 }
44 Some(String::from(&s.as_ref()[p_start.0 - 1..p_end.0 - 1]))
45 }
46 }
47 }
48}
49
/// A resolved position inside a string, as produced by `Position::run`.
///
/// `SubstringExpression::run` subtracts one from the wrapped value before
/// slicing, i.e. it is treated as a 1-based byte offset.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct StringIndex(pub usize);
53
/// Signed selector used by [`Position`].
///
/// Positive values count from the front and non-positive values count from
/// the back; the exact mapping is defined where the position is resolved.
#[derive(Debug, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)]
pub struct Occurrence(pub isize);

impl Occurrence {
    /// Returns the negated magnitude of the occurrence, `-|k|`.
    ///
    /// The result is always `<= 0`, and values closer to zero receive a
    /// larger weight. It is computed without `abs()` so that `isize::MIN`
    /// (whose absolute value is not representable) maps to itself instead of
    /// overflowing.
    pub fn weight(&self) -> isize {
        if self.0 > 0 {
            // Negating a positive `isize` can never overflow.
            -self.0
        } else {
            // Zero and negative values already equal `-|k|`.
            self.0
        }
    }
}
63
64#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)]
65pub enum Position {
66 Match(Token, Occurrence, Direction),
67 ConstantPosition(Occurrence),
68}
69
70impl Position {
71 fn run(&self, s: &str) -> Option<StringIndex> {
72 match self {
73 Position::Match(token, k, dir) => {
74 let k = k.0;
75 let matches = token.all_matches(s);
76 let n = matches.len() as isize;
77 let k = if k > 0 { k - 1 } else { n + k };
78 if !(0 <= k && k < n) {
79 return None;
80 }
81 let r = &matches[k as usize];
83 match dir {
84 Direction::Start => Some(StringIndex(r.start)),
85 Direction::End => Some(StringIndex(r.end)),
86 }
87 }
88 Position::ConstantPosition(k) => {
89 let k = k.0;
90 let n = s.len() as isize;
91 let k = if k > 0 { k } else { n + k + 1 };
92 if !(0 < k && k <= n + 1) {
93 None
94 } else {
95 Some(StringIndex(k as usize))
96 }
97 }
98 }
99 }
100}
101
/// Which edge of a token match a [`Position::Match`] refers to.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Direction {
    /// Use the position where the match begins.
    Start,
    /// Use the position where the match ends.
    End,
}
107
#[cfg(test)]
mod tests {
    use super::*;
    use Direction::*;
    use Position::*;
    use SubstringExpression::*;

    /// Runs `p` on a one-column row holding `s` and checks the output.
    ///
    /// Panics if the program yields `None` or a string other than `expected`.
    fn assert_eval_single(p: &impl StringProgram, s: &str, expected: &str) {
        let row = [s];
        let actual: String = p.run(&row).unwrap();
        assert_eq!(actual, expected);
    }

    /// Checks every `(input, expected)` pair against `p`, in order.
    fn assert_all(p: &impl StringProgram, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eval_single(p, input, expected);
        }
    }

    /// Shorthand for a literal token.
    fn literal(text: &str) -> Token {
        Token::Literal(String::from(text))
    }

    /// Shorthand for a constant-string part.
    fn constant(text: &str) -> SubstringExpression {
        ConstantString(String::from(text))
    }

    #[test]
    fn extract_country() {
        // Everything after the first ", " up to the end of the input.
        let program = StringExpression(vec![Substring(
            ColumnIndex(0),
            Match(literal(", "), Occurrence(1), End),
            Match(Token::End, Occurrence(-1), Start),
        )]);
        assert_all(
            &program,
            &[
                ("Mumbai, India", "India"),
                (
                    "Los Angeles, United States of America",
                    "United States of America",
                ),
                ("Newark, United States", "United States"),
            ],
        );
    }

    #[test]
    fn extract_initials() {
        let first_initial = Substring(
            ColumnIndex(0),
            Match(Token::CapsWithSpaces, Occurrence(1), Start),
            Match(Token::Caps, Occurrence(1), End),
        );
        let last_initial = Substring(
            ColumnIndex(0),
            Match(Token::Whitespace, Occurrence(-1), End),
            Match(Token::Lowercase, Occurrence(-1), Start),
        );
        let program = StringExpression(vec![
            first_initial,
            constant("."),
            last_initial,
            constant("."),
        ]);
        assert_all(
            &program,
            &[
                ("Brandon Henry Saunders", "B.S."),
                ("William Lee", "W.L."),
                ("Dafna Q. Chen", "D.C."),
                ("Danielle D. Saunders", "D.S."),
            ],
        );
    }

    #[test]
    fn add_bracket() {
        // Keep everything up to the end of the first digit run, then close
        // the bracket.
        let program = StringExpression(vec![
            Substring(
                ColumnIndex(0),
                Match(Token::Start, Occurrence(1), End),
                Match(Token::Digits, Occurrence(1), End),
            ),
            constant("]"),
        ]);
        assert_all(
            &program,
            &[
                ("[CPT-00350", "[CPT-00350]"),
                ("[CPT-00340", "[CPT-00340]"),
                ("[CPT-11536]", "[CPT-11536]"),
                ("[CPT-115]", "[CPT-115]"),
            ],
        );
    }

    #[test]
    fn constant_strings() {
        // The text enclosed by the two literal markers.
        let program = StringExpression(vec![Substring(
            ColumnIndex(0),
            Match(literal("nextData "), Occurrence(1), End),
            Match(literal(" moreInfo"), Occurrence(1), Start),
        )]);
        assert_all(
            &program,
            &[
                ("nextData 12 Street moreInfo 35", "12 Street"),
                ("nextData Main moreInfo 36", "Main"),
                ("nextData Albany Street moreInfo 37", "Albany Street"),
                ("nextData Green Street moreInfo 39", "Green Street"),
            ],
        );
    }

    #[test]
    fn constant_position() {
        // From the third position up to the first "|".
        let program = StringExpression(vec![Substring(
            ColumnIndex(0),
            ConstantPosition(Occurrence(3)),
            Match(literal("|"), Occurrence(1), Start),
        )]);
        assert_eval_single(&program, "xzHello|asdofij", "Hello");
    }
}