kvc/
lib.rs

1
2extern crate regex;
3use std::collections::HashMap;
4use std::io::BufRead;
5use std::io::Lines;
/// Report the library version as an owned `String`.
///
/// The value follows semantic versioning: "Major.Minor.Build".
///
/// # Examples
/// ```rust
/// // Confirm the expected release is in use:
/// assert_eq!(kvc::version(),"1.1.3");
/// ```
pub fn version() -> String {
    String::from("1.1.3")
}
16
17/// Get the reserved keyword matchers as a HashMap<String,regex::Regex> 
18/// You can add to this to change the way text is parsed 
19/// 
20/// For each whitespace delimited token in the line, see if the regex matches
21/// it. If it does, that token is added as a "String" key,value pair under the
22/// corresponding name.
23/// 
24/// Default matchers pull out "Date" fields of the form YYYY-MM-DD and return
25/// tuples of the form ("Date",<date string>)
26/// 
27/// You can add regexes and names by inserting into the returned hashmap and 
28/// passing that to all kvc:: functions
29/// 
30/// # Examples
31/// 
32/// The default result
33/// 
34/// ```rust
35///  let (counts,strs) =kvc::read_kvc_line_default(&"    2021-01-01 ".to_string());
36///  assert_eq!(strs.len(),1);
37///  assert_eq!(counts.len(),0);
38///  assert_eq!(strs[0],("Date".to_string(),"2021-01-01".to_string()));
39/// ```
40/// 
41/// Adding a custom keyword matcher:
42/// ```rust
43///     let mut keywords = kvc::get_reserved_matchers();
44///     keywords.push(
45///         ( "One-plus-one".to_string(), regex::Regex::new(r"^\d{1}\+\d{1}$").unwrap()) );
46///     let (counts,strs) =kvc::read_kvc_line(&"    2021-01-01 \n 1+1   ".to_string(),&keywords,&"");
47///     assert_eq!(counts.len(),0);
48///     assert_eq!(strs.len(),2);
49///     for pairing in strs{
50///         let (name,val) = pairing;
51///         match &name[..]{
52///             "One-plus-one"=>assert_eq!(val,"1+1"),
53///             "Date"=>assert_eq!(val,"2021-01-01"),
54///             _=>assert!(false,"Unknown value!")
55///         }
56///     }
57/// ```
58/// 
59/// # Returns
60/// 
61/// A keyword matcher, populated with the following matchers
62///  
63/// - {"^\d{4}-\d{2}-\d{2}" --> "Date"}
64/// 
65/// # See also
66/// 
67/// - [kvc::read_kvc_line]
68/// 
69pub fn get_reserved_matchers() -> Vec<(String,regex::Regex)>
70{
71    let mut retvals:HashMap<String,regex::Regex> = HashMap::new();
72    retvals.insert(
73        "Date".to_string(),
74        regex::Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap());
75    retvals.into_iter().collect()
76}
77
//TODO 0.4: [x] return vec of tuples
79pub fn read_kvc_line_default( input_line: &String ) -> 
80(
81    Vec<(String,f32)>,
82    Vec<(String,String)>
83)
84{
85    read_kvc_line( input_line, &get_reserved_matchers(),&"")
86}
87
//TODO 0.4: [x] return vec of tuples
89pub fn read_kvc_line( line: &String, keywords: &Vec<(String,regex::Regex)>, start_sequence: &str) -> 
90(
91    Vec<(String,f32)>,
92    Vec<(String,String)>
93)
94{
95    if line.len()==0 {
96        return (
97            vec![],
98            vec![]
99        );
100    }
101    let mut line_strings: HashMap<String,String> = HashMap::new();
102    let mut line_counter: HashMap<String,f32> = HashMap::new();
103    let input_line = match start_sequence.len()>0{
104        true=>{
105            let mut strings = line.split(start_sequence);
106            let _ = strings.next();
107            strings.collect()
108        },
109        false=>line.clone(),
110    };
111    let mut tok_iter = input_line.split_whitespace();
112    'nexttok: while let Some(kvpair) = tok_iter.next(){
113
114        //sure hope I understand what that split_whitespace() was up to.
115        assert!(kvpair.len() > 0);
116        if kvpair.chars().next().unwrap()=='#'{
117            break;
118        }
119        let mut kvitr = kvpair.split(":");
120        if let Some(key)=kvitr.next(){
121            //got a key, that's good.
122            //if it's a date-matching key, we can specially process that one
123            for (name,matcher) in keywords{
124                if matcher.is_match(key)
125                {
126                    line_strings.insert(name.clone(),key.to_string().clone());
127                    continue 'nexttok;
128                }
129            }
130
131            //It's not one of the speically formatted keys, so let's just parse as accumulator keys
132            //These are of the form K K K K K , which should compress to K:5
133            //or K:4 K, which should compress also to K:5
134            //e.g., of the form K:I, and if no :I, then let's assume :1.
135            //get val -- thestuff after ':'
136            let val=match kvitr.next(){
137                None=>1.0,
138                Some(s)=>{
139                    if let Ok(f_val) = s.parse::<f32>(){
140                        f_val
141                    } else {
142                        eprintln!("Got a non-accumulator (int/float) here: {}:{}",key,s);
143                        continue 'nexttok;
144                    }
145                },
146            };
147            let countref = line_counter.entry(key.to_string()).or_insert(0.0);
148            *countref =  *countref + val;
149        } else {
150            panic!("Bug! Cannot process: '{}' from '{}'",kvpair,input_line);
151        }
152    }
153    return (
154        line_counter.into_iter().collect(),
155        line_strings.into_iter().collect(),
156    );
157}
158
159pub fn load_table_from_kvc_stream<B:BufRead> (
160    lines_input:Lines<B>, 
161    keywords :&Vec<(String,regex::Regex)> ,
162    start_sequence: &str
163)->
164(
165    (usize,usize),  //size
166    Vec<((usize,usize),String)> , //entries
167    Vec<String>  // col_names
168)
169{
170    let mut rows = 0;
171    let mut col_to_name: HashMap<usize,String> = HashMap::new();
172    let mut name_to_col: HashMap<String,usize> = HashMap::new();
173    let mut string_entries: HashMap< (usize,usize), String> = HashMap::new();
174
175    for line_res in lines_input{
176        //check for bad line
177        let line = match line_res{
178            Ok(l)=>l,
179            Err(_)=> continue,
180        };
181        //parse it (or try)
182        let (key_counts,key_strings)=read_kvc_line(&line,&keywords,start_sequence);
183
184        //see if we got nothing, if so skip it
185        if key_counts.len() + key_strings.len()==0
186        {
187            continue;
188        } 
189
190        //record what we may have gotten
191        for (key,val) in key_strings{
192            let colsize = name_to_col.len();
193            let colidx = name_to_col.entry(key.to_string()).or_insert(colsize);
194            col_to_name.insert(*colidx,key.to_string());
195            string_entries.insert( (rows,*colidx), val);
196        }
197        for (key,count) in key_counts{
198            let colsize = name_to_col.len();
199            let colidx = name_to_col.entry(key.to_string()).or_insert(colsize);
200            col_to_name.insert(*colidx,key.to_string());
201            string_entries.insert( (rows,*colidx), count.to_string());
202        }
203        //record that we tallied a row
204        rows+=1;
205    }
206
207    //trial by fire: Assume the hash map is correctly set up 0..col_to_name.len() 
208    let cols = col_to_name.len();
209    let mut col_names:Vec<String> = vec!["".to_string(); cols];
210    for (idx,name) in col_to_name{
211        assert!(col_names[idx].len()==0,"Found non-zero column name! Error in read_kvc_line?");
212        col_names[idx]+=&name.to_string();
213    }
214    for idx in 0..cols{
215        assert!(col_names[idx].len()!=0,"Found zero-length column name! Error in read_kvc_line?")
216    }
217
218    return ( 
219        (rows,cols),
220        string_entries.into_iter().collect(),
221        col_names 
222    )
223}
224
225pub fn load_table_from_kvc_stream_default<B:BufRead> (lines_input:Lines<B>)->
226(
227    (usize,usize),
228    Vec<((usize,usize),String)> , //entries 
229    Vec<String> // col_names
230)
231{
232    return load_table_from_kvc_stream(lines_input, &get_reserved_matchers(),&"");
233}
234
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// The default matcher list holds exactly one entry, named "Date".
    #[test]
    fn keywords_are_returned() {
        let matchers = get_reserved_matchers();
        assert_eq!(matchers.len(), 1);
        let (name, _) = matchers.into_iter().next().unwrap();
        assert_eq!(name, "Date");
    }

    /// A user-supplied matcher is applied alongside the default one.
    #[test]
    fn line_accepts_keywords() {
        let mut keywords = get_reserved_matchers();
        let one_plus_one = regex::Regex::new(r"^\d{1}\+\d{1}$").unwrap();
        keywords.push(("One-plus-one".to_string(), one_plus_one));

        let input = "    2021-01-01 \n 1+1   ".to_string();
        let (counts, strs) = read_kvc_line(&input, &keywords, "");
        assert_eq!(counts.len(), 0);
        assert_eq!(strs.len(), 2);
        for (name, val) in strs {
            match name.as_str() {
                "One-plus-one" => assert_eq!(val, "1+1"),
                "Date" => assert_eq!(val, "2021-01-01"),
                _ => panic!("Unknown value!"),
            }
        }
    }

    /// A lone date token is returned as a "Date" string and not counted.
    #[test]
    fn line_gets_date() {
        let input = "    2021-01-01 ".to_string();
        let (counts, strs) = read_kvc_line_default(&input);
        assert_eq!(strs.len(), 1);
        assert_eq!(counts.len(), 0);
        assert_eq!(strs[0], ("Date".to_string(), "2021-01-01".to_string()));
    }

    /// Repeated keys and `key:value` tokens accumulate into one total per key.
    #[test]
    fn line_counts_tokens() {
        let input = " A A A B  B C Z:4 Y:2 Y:3 ".to_string();
        let (counts, strs) = read_kvc_line_default(&input);
        assert_eq!(strs.len(), 0);
        assert_eq!(counts.len(), 5);
        for (key, val) in counts {
            let expected = match key.as_str() {
                "A" => 3.0,
                "B" => 2.0,
                "C" => 1.0,
                "Y" => 5.0,
                "Z" => 4.0,
                _ => panic!("Found unexpected token:{}", key),
            };
            assert_eq!(val, expected);
        }
    }

    /// Everything from a `#` token onward is ignored.
    #[test]
    fn line_ignores_comments() {
        let input = " A # A A B  B C Z:4 Y:2 Y:3 ".to_string();
        let (counts, strs) = read_kvc_line_default(&input);
        assert_eq!(strs.len(), 0);
        assert_eq!(counts.len(), 1);
        for (key, val) in counts {
            match key.as_str() {
                "A" => assert_eq!(val, 1.0),
                _ => panic!("Found unexpected token:{}", key),
            }
        }
    }

    /// Blank lines add no rows; three populated lines share a single column.
    #[test]
    fn table_size() {
        let data = Cursor::new("A # NO\n A A # \n A A A\n\n");
        let ((r, c), _entries, names) = load_table_from_kvc_stream_default(data.lines());
        assert_eq!(r, 3);
        assert_eq!(c, 1);
        assert_eq!(names[0], "A");
        assert_eq!(names.len(), c);
    }

    /// Only a token that is exactly a date is treated as a "Date".
    #[test]
    fn date_matches_only_date() {
        let data = Cursor::new(" 2021-01-01AAAAAA \n 2021-01-012021-01-02 \n 2021-02-02      ");
        let ((r, c), entries, names) = load_table_from_kvc_stream_default(data.lines());
        //should get three entries. Two weirdly named tokens and one Date
        assert_eq!(r, 3);
        assert_eq!(c, 3);
        assert_eq!(names[0], "2021-01-01AAAAAA");
        assert_eq!(names[1], "2021-01-012021-01-02");
        assert_eq!(names[2], "Date");
        assert_eq!(entries.len(), 3);

        for ((row, col), entry) in entries {
            eprintln!("Checking {}", entry);
            match (row, col) {
                (0, 0) => assert_eq!(entry, (1.0).to_string()),
                (1, 1) => assert_eq!(entry, (1.0).to_string()),
                (2, 2) => assert_eq!(entry, "2021-02-02"),
                _ => panic!("Found unexpected entry: ({},{}) {}", row, col, entry),
            }
        }
    }
}