oca_parser_xls/xls_parser/
oca.rs

1use calamine::{open_workbook_auto, DataType, Range, Reader};
2use isolang::Language;
3use oca_ast_semantics::ast::NestedAttrType;
4use oca_bundle_semantics::state::{
5    attribute::Attribute,
6    encoding::Encoding,
7    entries::EntriesElement,
8    entry_codes::EntryCodes as EntryCodesValue,
9    oca::overlay::cardinality::Cardinalitys,
10    oca::overlay::character_encoding::CharacterEncodings,
11    oca::overlay::conformance::Conformances,
12    oca::overlay::entry::Entries,
13    oca::overlay::entry_code::EntryCodes,
14    oca::overlay::format::Formats,
15    oca::overlay::information::Information,
16    oca::overlay::label::Labels,
17    oca::overlay::meta::Metas,
18    oca::OCABox,
19};
20use std::collections::HashMap;
21
22pub struct ParsedResult {
23    pub oca: OCABox,
24    pub languages: Vec<Language>,
25}
26
27const SUPPORTED_TEMPLATES: &str = "^1.0.0";
28
29const SAMPLE_TEMPLATE_MSG: &str = "Template file can be found here: https://github.com/THCLab/oca-parser-xls/blob/main/templates/template.xlsx";
30
31pub fn parse(path: String) -> Result<ParsedResult, Vec<std::string::String>> {
32    let mut errors: Vec<String> = vec![];
33    let mut workbook = open_workbook_auto(path).map_err(|_| {
34        errors.push(
35            "Provided file cannot be parsed. Check if file exists and format is XLS(X)".to_string(),
36        );
37        errors.clone()
38    })?;
39    let mut sheet_names = workbook.sheet_names().to_vec();
40    let mut languages = vec![];
41    sheet_names
42        .retain(|n| n != "READ ME" && n != "README" && n != "Start Here" && n != "Documentation");
43
44    let main_sheet_name = sheet_names.first().ok_or_else(|| {
45        errors.push(format!("Missing sheets. {SAMPLE_TEMPLATE_MSG}"));
46        errors.clone()
47    })?;
48    if !(main_sheet_name.eq("Main") || main_sheet_name.eq("main")) {
49        errors.push(format!(
50            "Provided XLS file does not match template. Missing Main sheet. {SAMPLE_TEMPLATE_MSG}"
51        ));
52    }
53
54    if !errors.is_empty() {
55        return Err(errors);
56    }
57    let main_sheet = workbook.worksheet_range(main_sheet_name).unwrap().unwrap();
58    let sheet_version = main_sheet.get_value((0, 0));
59    if let Some(DataType::String(version_value)) = sheet_version {
60        let version = semver::Version::parse(version_value).unwrap();
61        let req = semver::VersionReq::parse(self::SUPPORTED_TEMPLATES).unwrap();
62        if !req.matches(&version) {
63            errors.push(format!("Provided version of template ({}) is not supported. Please provide template matching {} version.", version_value, self::SUPPORTED_TEMPLATES));
64        }
65    }
66
67    if !errors.is_empty() {
68        return Err(errors);
69    }
70    let translation_sheet_names = sheet_names.split_off(1);
71    let mut translation_sheets: Vec<(Language, _)> = vec![];
72
73    for translation_sheet_name in translation_sheet_names {
74        languages.push(
75            Language::from_639_1(&translation_sheet_name.clone()).unwrap());
76        translation_sheets.push((
77            Language::from_639_1(&translation_sheet_name.clone()).unwrap(),
78            workbook
79                .worksheet_range(&translation_sheet_name.clone())
80                .unwrap()
81                .unwrap(),
82        ));
83    }
84
85    let mut column_indicies: HashMap<&str, u32> = HashMap::new();
86    for row in main_sheet.rows().filter(|&row| {
87        if let DataType::String(v) = &row[0] {
88            v.starts_with("CB") || v.starts_with("OL")
89        } else {
90            false
91        }
92    }) {
93        for (i, value) in row.iter().enumerate() {
94            if let DataType::String(v) = value {
95                if v.starts_with("CB-CL:") {
96                    column_indicies.insert("CLASSIFICATION_INDEX", i as u32);
97                } else if v.starts_with("CB-AN:") {
98                    column_indicies.insert("ATTR_NAME_INDEX", i as u32);
99                } else if v.starts_with("CB-AT:") {
100                    column_indicies.insert("ATTR_TYPE_INDEX", i as u32);
101                } else if v.starts_with("CB-RS:") {
102                    column_indicies.insert("REFERENCE_SAI_INDEX", i as u32);
103                } else if v.starts_with("CB-FA:") {
104                    column_indicies.insert("FLAGGED_INDEX", i as u32);
105                } else if v.starts_with("OL-CH:") {
106                    column_indicies.insert("ENCODING_INDEX", i as u32);
107                } else if v.starts_with("OL-ST:") {
108                    column_indicies.insert("STANDARD_INDEX", i as u32);
109                } else if v.starts_with("OL-FT:") {
110                    column_indicies.insert("FORMAT_INDEX", i as u32);
111                } else if v.starts_with("OL-EC:") {
112                    column_indicies.insert("ENTRY_CODES_INDEX", i as u32);
113                } else if v.starts_with("OL-UT:") {
114                    column_indicies.insert("UNIT_INDEX", i as u32);
115                } else if v.starts_with("OL-CC:") {
116                    column_indicies.insert("CONDITION_INDEX", i as u32);
117                } else if v.starts_with("OL-CD:") {
118                    column_indicies.insert("DEPENDENCIES_INDEX", i as u32);
119                } else if v.starts_with("OL-CR:") {
120                    column_indicies.insert("CARDINALITY_INDEX", i as u32);
121                } else if v.starts_with("OL-CN:") {
122                    column_indicies.insert("CONFORMANCE_INDEX", i as u32);
123                } else if v.starts_with("OL-AM:") {
124                    column_indicies.insert("ATTRIBUTE_MAPPING_INDEX", i as u32);
125                } else if v.starts_with("OL-EM:") {
126                    column_indicies.insert("ENTRY_CODE_MAPPING_INDEX", i as u32);
127                }
128            }
129        }
130    }
131
132    if column_indicies.get("ATTR_NAME_INDEX").is_none() {
133        errors.push("Not found column with Attribute Names definiton".to_string());
134    }
135    if column_indicies.get("ATTR_NAME_INDEX").is_none() {
136        errors.push("Not found column with Attribute Names definiton".to_string());
137    }
138    if !errors.is_empty() {
139        return Err(errors);
140    }
141
142    let start: u32 = 3;
143    let end_offset = if sheet_version.is_some() { 1 } else { 3 };
144    let mut end: u32 = main_sheet.height() as u32;
145    for (i, row) in main_sheet.rows().enumerate().rev() {
146        if row
147            .iter()
148            .any(|cell| cell != &DataType::Empty && cell.get_string().unwrap().trim().ne(""))
149        {
150            end = end_offset + i as u32;
151            break;
152        }
153    }
154    let oca_range = (start, end);
155
156    let mut oca = OCABox::new();
157
158    let mut classification = String::new();
159    if let Some(classification_index) = column_indicies.get("CLASSIFICATION_INDEX") {
160        let classification_value = main_sheet.get_value((oca_range.0, *classification_index));
161        if let Some(class) = classification_value {
162            classification = class.to_string().trim().to_string();
163        }
164    }
165    oca.add_classification(classification);
166
167    // let mut attribute_names = vec![];
168    let mut attributes: Vec<(u32, Attribute)> = vec![];
169    fn parse_row(
170        attr_index: u32,
171        main_sheet: &Range<DataType>,
172        column_indicies: HashMap<&str, u32>,
173    ) -> Result<Attribute, String> {
174        let attribute_name = main_sheet
175            .get_value((attr_index, *column_indicies.get("ATTR_NAME_INDEX").unwrap()))
176            .unwrap()
177            .to_string()
178            .trim()
179            .to_string();
180        if attribute_name.is_empty() {
181            return Err(format!(
182                "Parsing attribute in row {} failed. Attribute name is empty.",
183                attr_index + 1
184            ));
185        }
186        /*
187        let attribute_count = attribute_names
188            .iter()
189            .filter(|&name| *name == attribute_name)
190            .count();
191        if attribute_count > 0 {
192            attribute_name = format!("{}-{}", attribute_name, attribute_count);
193        }
194
195        attribute_names.push(attribute_name.clone());
196        */
197        let attribute_type = &format!(
198            r#"{}"#,
199            &main_sheet
200                .get_value((attr_index, *column_indicies.get("ATTR_TYPE_INDEX").unwrap()))
201                .unwrap()
202        )
203        .trim()
204        .to_string();
205        let mut attribute = Attribute::new(attribute_name.clone());
206        attribute.set_attribute_type(
207            serde_json::from_str::<NestedAttrType>(format!("\"{attribute_type}\"").as_str())
208                .map_err(|e| {
209                    format!(
210                        "Parsing attribute type in row {} ({}) failed. {}",
211                        attr_index + 1,
212                        attribute_name,
213                        e
214                    )
215                })?,
216        );
217        if let Some(flagged_index) = column_indicies.get("FLAGGED_INDEX") {
218            if let Some(DataType::String(_value)) =
219                main_sheet.get_value((attr_index, *flagged_index))
220            {
221                attribute.set_flagged();
222            }
223        }
224        if let Some(encoding_index) = column_indicies.get("ENCODING_INDEX") {
225            if let Some(DataType::String(encoding_value)) =
226                main_sheet.get_value((attr_index, *encoding_index))
227            {
228                let encoding =
229                    serde_json::from_str::<Encoding>(&format!(r#""{}""#, encoding_value.trim()))
230                        .map_err(|e| {
231                            format!(
232                                "Parsing character encoding in row {} failed. {}",
233                                attr_index + 1,
234                                e
235                            )
236                        })?;
237                attribute.set_encoding(encoding);
238            }
239        }
240
241        if let Some(format_index) = column_indicies.get("FORMAT_INDEX") {
242            if let Some(DataType::String(format_value)) =
243                main_sheet.get_value((attr_index, *format_index))
244            {
245                attribute.set_format(format_value.clone().trim().to_string());
246            }
247        }
248
249        /* TODO: Add standard
250        if let Some(standard_index) = column_indicies.get("STANDARD_INDEX") {
251            if let Some(DataType::String(standard_value)) =
252                main_sheet.get_value((attr_index, *standard_index))
253            {
254                attribute_builder.add_standard(standard_value.clone().trim().to_string());
255            }
256        }
257        */
258
259        if let Some(entry_codes_index) = column_indicies.get("ENTRY_CODES_INDEX") {
260            if let Some(DataType::String(entry_codes_value)) =
261                main_sheet.get_value((attr_index, *entry_codes_index))
262            {
263                if entry_codes_value != &"[SAI]".to_string() {
264                    let entry_codes: EntryCodesValue = if entry_codes_value.starts_with("SAI:") {
265                        let sai = entry_codes_value.strip_prefix("SAI:").unwrap();
266                        EntryCodesValue::Sai(sai.to_string())
267                    } else {
268                        let codes: Vec<String> = entry_codes_value
269                            .trim()
270                            .to_string()
271                            .split('|')
272                            .collect::<Vec<&str>>()
273                            .iter()
274                            .map(|c| c.to_string().trim().to_string())
275                            .collect();
276                        EntryCodesValue::Array(codes)
277                    };
278                    attribute.set_entry_codes(entry_codes);
279                }
280            }
281        }
282
283        /* TODO: Add condition
284        if let Some(condition_index) = column_indicies.get("CONDITION_INDEX") {
285            if let Some(DataType::String(condition_value)) =
286                main_sheet.get_value((attr_index, *condition_index))
287            {
288                if let Some(dependencies_index) = column_indicies.get("DEPENDENCIES_INDEX") {
289                    if let Some(DataType::String(dependencies_value)) =
290                        main_sheet.get_value((attr_index, *dependencies_index))
291                    {
292                        attribute_builder = attribute_builder.add_condition(
293                            condition_value.clone().trim().to_string(),
294                            dependencies_value
295                                .split(',')
296                                .collect::<Vec<&str>>()
297                                .iter()
298                                .map(|c| c.to_string().trim().to_string())
299                                .collect(),
300                        );
301                    }
302                }
303            }
304        }
305        */
306
307        if let Some(cardinality_index) = column_indicies.get("CARDINALITY_INDEX") {
308            if let Some(DataType::String(cardinality_value)) =
309                main_sheet.get_value((attr_index, *cardinality_index))
310            {
311                attribute.set_cardinality(cardinality_value.to_string().trim().to_string());
312            }
313        }
314
315        if let Some(conformance_index) = column_indicies.get("CONFORMANCE_INDEX") {
316            if let Some(DataType::String(conformance_value)) =
317                main_sheet.get_value((attr_index, *conformance_index))
318            {
319                attribute.set_conformance(conformance_value.clone().trim().to_string());
320            }
321        }
322
323        /* TODO: Add unit
324        if let Some(unit_index) = column_indicies.get("UNIT_INDEX") {
325            if let Some(DataType::String(unit_value)) =
326                main_sheet.get_value((attr_index, *unit_index))
327            {
328                let mut metric_system = String::new();
329                let mut unit = unit_value.clone().trim().to_string();
330
331                let mut splitted: Vec<String> = unit_value
332                    .trim()
333                    .to_string()
334                    .split('|')
335                    .collect::<Vec<&str>>()
336                    .iter()
337                    .map(|c| c.to_string().trim().to_string())
338                    .collect();
339                if splitted.len() > 1 {
340                    unit = splitted.pop().unwrap();
341                    metric_system = splitted.join("|");
342                }
343
344                attribute_builder = attribute_builder.add_unit(metric_system, unit);
345            }
346        }
347        */
348
349        /* TODO: Add mappings
350        if let Some(attribute_mapping_index) = column_indicies.get("ATTRIBUTE_MAPPING_INDEX") {
351            if let Some(DataType::String(mapping_value)) =
352                main_sheet.get_value((attr_index, *attribute_mapping_index))
353            {
354                attribute_builder =
355                    attribute_builder.add_mapping(mapping_value.clone().trim().to_string());
356            }
357        }
358
359        if let Some(entry_code_mapping_index) = column_indicies.get("ENTRY_CODE_MAPPING_INDEX") {
360            if let Some(DataType::String(mapping_value)) =
361                main_sheet.get_value((attr_index, *entry_code_mapping_index))
362            {
363                attribute_builder = attribute_builder.add_entry_codes_mapping(
364                    mapping_value
365                        .trim()
366                        .to_string()
367                        .split('|')
368                        .collect::<Vec<&str>>()
369                        .iter()
370                        .map(|c| c.to_string().trim().to_string())
371                        .collect(),
372                );
373            }
374        }
375        */
376        Ok(attribute)
377    }
378
379    for attr_index in oca_range.0..oca_range.1 {
380        match parse_row(attr_index, &main_sheet, column_indicies.clone()) {
381            Ok(attribute) => attributes.push((attr_index, attribute)),
382            Err(e) => errors.push(e),
383        }
384    }
385
386    let mut meta_trans: HashMap<String, HashMap<Language, String>> = HashMap::new();
387
388    let mut label_trans: HashMap<u32, HashMap<Language, String>> = HashMap::new();
389    let mut entries_trans: HashMap<u32, HashMap<Language, EntriesElement>> = HashMap::new();
390    let mut information_trans: HashMap<u32, HashMap<Language, String>> = HashMap::new();
391
392    for (lang, sheet) in translation_sheets.iter() {
393        let mut sheet_column_indicies: HashMap<&str, u32> = HashMap::new();
394        for row in sheet.rows().filter(|&row| {
395            if let DataType::String(v) = &row[0] {
396                v.starts_with("CB") || v.starts_with("OL")
397            } else {
398                false
399            }
400        }) {
401            for (i, value) in row.iter().enumerate() {
402                if let DataType::String(v) = value {
403                    if v.starts_with("OL-MN:") {
404                        sheet_column_indicies.insert("META_NAME_INDEX", i as u32);
405                    } else if v.starts_with("OL-MV:") {
406                        sheet_column_indicies.insert("META_VALUE_INDEX", i as u32);
407                    } else if v.starts_with("CB-AN:") {
408                        sheet_column_indicies.insert("ATTR_NAME_INDEX", i as u32);
409                    } else if v.starts_with("OL-LA:") {
410                        sheet_column_indicies.insert("LABEL_INDEX", i as u32);
411                    } else if v.starts_with("OL-EN:") {
412                        sheet_column_indicies.insert("ENTRIES_INDEX", i as u32);
413                    } else if v.starts_with("OL-IN:") {
414                        sheet_column_indicies.insert("INFORMATION_INDEX", i as u32);
415                    }
416                }
417            }
418        }
419
420        for attr_index in (oca_range.0)..(sheet.height() as u32) {
421            if let Some(name_index) = sheet_column_indicies.get("META_NAME_INDEX") {
422                if let Some(DataType::String(name_value)) =
423                    sheet.get_value((attr_index, *name_index))
424                {
425                    if let Some(value_index) = sheet_column_indicies.get("META_VALUE_INDEX") {
426                        if let Some(DataType::String(value_value)) =
427                            sheet.get_value((attr_index, *value_index))
428                        {
429                            match meta_trans.get_mut(name_value) {
430                                Some(m_trans) => {
431                                    m_trans.insert(*lang, value_value.to_string());
432                                }
433                                None => {
434                                    let mut m_trans = HashMap::new();
435                                    m_trans.insert(*lang, value_value.to_string());
436                                    meta_trans.insert(name_value.to_string(), m_trans);
437                                }
438                            }
439                        }
440                    }
441                }
442            }
443        }
444
445        for attr_index in (oca_range.0)..(oca_range.1) {
446            if let Some(label_index) = sheet_column_indicies.get("LABEL_INDEX") {
447                if let Some(DataType::String(label_value)) =
448                    sheet.get_value((attr_index, *label_index))
449                {
450                    match label_trans.get_mut(&attr_index) {
451                        Some(attr_label_tr) => {
452                            attr_label_tr
453                                .insert(*lang, label_value.clone().trim().to_string());
454                        }
455                        None => {
456                            let mut attr_label_tr: HashMap<Language, String> = HashMap::new();
457                            attr_label_tr
458                                .insert(*lang, label_value.clone().trim().to_string());
459                            label_trans.insert(attr_index, attr_label_tr);
460                        }
461                    }
462                }
463            }
464
465            if let Some(entries_index) = sheet_column_indicies.get("ENTRIES_INDEX") {
466                if let Some(DataType::String(entries_value)) =
467                    sheet.get_value((attr_index, *entries_index))
468                {
469                    let entries_el: EntriesElement = if entries_value.starts_with("SAI:") {
470                        let sai = entries_value
471                            .strip_prefix("SAI:")
472                            .unwrap()
473                            .trim()
474                            .to_string();
475                        EntriesElement::Sai(sai.to_string())
476                    } else {
477                        let entries_values = entries_value.split('|').collect::<Vec<&str>>();
478                        for (i, entries_value_element) in entries_values.iter().enumerate() {
479                            let splitted = entries_value_element.split(':').collect::<Vec<&str>>();
480                            let entry_key = splitted.first().unwrap().trim().to_string();
481                            if entry_key.is_empty() {
482                                errors.push(format!(
483                                        "Parsing attribute in row {} failed. Invalid Entry Overlay definition for {} language. Missing entry key in position {}",
484                                                    attr_index + 1,
485                                                    lang,
486                                                    i + 1)
487                                    );
488                                return Err(errors);
489                            }
490                            splitted.get(1).ok_or_else(|| {
491                                errors.push(format!(
492                                        "Parsing attribute in row {} failed. Invalid Entry Overlay definition for {} language. Missing entry value in position {}",
493                                                    attr_index + 1,
494                                                    lang,
495                                                    i + 1)
496                                    );
497                                errors.clone()
498                            })?;
499                        }
500                        let entries_obj =
501                            entries_values.iter().fold(HashMap::new(), |mut acc, x| {
502                                let splitted = x.split(':').collect::<Vec<&str>>();
503                                acc.insert(
504                                    splitted.first().unwrap().to_string().trim().to_string(),
505                                    splitted.get(1).unwrap().to_string().trim().to_string(),
506                                );
507                                acc
508                            });
509                        EntriesElement::Object(entries_obj)
510                    };
511                    match entries_trans.get_mut(&attr_index) {
512                        Some(attr_entries_tr) => {
513                            if attr_entries_tr.get(lang).is_none() {
514                                attr_entries_tr.insert(*lang, entries_el);
515                            }
516                        }
517                        None => {
518                            let mut attr_entries_tr: HashMap<Language, EntriesElement> =
519                                HashMap::new();
520                            attr_entries_tr.insert(*lang, entries_el);
521                            entries_trans.insert(attr_index, attr_entries_tr);
522                        }
523                    }
524                }
525            }
526
527            if let Some(information_index) = sheet_column_indicies.get("INFORMATION_INDEX") {
528                if let Some(DataType::String(information_value)) =
529                    sheet.get_value((attr_index, *information_index))
530                {
531                    match information_trans.get_mut(&attr_index) {
532                        Some(attr_info_tr) => {
533                            attr_info_tr.insert(
534                                *lang,
535                                information_value.clone().trim().to_string(),
536                            );
537                        }
538                        None => {
539                            let mut attr_info_tr: HashMap<Language, String> = HashMap::new();
540                            attr_info_tr.insert(
541                                *lang,
542                                information_value.clone().trim().to_string(),
543                            );
544                            information_trans.insert(attr_index, attr_info_tr);
545                        }
546                    }
547                }
548            }
549        }
550    }
551    for (i, mut attribute) in attributes {
552        if let Some(label_tr) = label_trans.get(&i).cloned() {
553            for (lang, label) in label_tr {
554                attribute.set_label(lang, label);
555            }
556        }
557        if let Some(lang_entries_tr) = entries_trans.get(&i).cloned() {
558            for (lang, entries_tr) in lang_entries_tr {
559              attribute.set_entry(lang, entries_tr);
560            }
561        }
562        if let Some(info_tr) = information_trans.get(&i).cloned() {
563            for (lang, info) in info_tr {
564                attribute.set_information(lang, info);
565            }
566        }
567        oca.add_attribute(attribute);
568    }
569    for (key, value_trans) in meta_trans {
570        for (lang, value) in value_trans {
571            oca.add_meta(lang, key.clone(), value.clone());
572        }
573    }
574    // let oca = oca_builder.finalize();
575
576    if errors.is_empty() {
577        Ok(ParsedResult {
578            oca,
579            languages,
580        })
581    } else {
582        Err(errors)
583    }
584}
585
586#[cfg(test)]
587mod tests {
588    use super::*;
589
590    #[test]
591    fn parse_xlsx_file() {
592        let result = parse(
593            format!(
594                "{}/tests/assets/oca_template.xlsx",
595                env!("CARGO_MANIFEST_DIR")
596            )
597        );
598        assert!(result.is_ok());
599        if let Ok(mut parsed) = result {
600            assert_eq!(parsed.languages.len(), 1);
601            let oca_bundle = parsed.oca.generate_bundle();
602            assert_eq!(oca_bundle.capture_base.attributes.len(), 3);
603            assert_eq!(oca_bundle.overlays.len(), 7);
604        }
605    }
606
607    #[test]
608    fn parse_xls_file() {
609        let result = parse(
610            format!(
611                "{}/tests/assets/oca_template.xls",
612                env!("CARGO_MANIFEST_DIR")
613            )
614        );
615        assert!(result.is_ok());
616
617        if let Ok(mut parsed) = result {
618            assert_eq!(parsed.languages.len(), 1);
619            let oca_bundle = parsed.oca.generate_bundle();
620            assert_eq!(oca_bundle.capture_base.attributes.len(), 3);
621            assert_eq!(oca_bundle.overlays.len(), 7);
622        }
623    }
624
625    #[test]
626    fn return_error_when_file_type_is_invalid() {
627        let result = parse(
628            format!(
629                "{}/tests/assets/invalid_format.txt",
630                env!("CARGO_MANIFEST_DIR")
631            )
632        );
633        assert!(result.is_err());
634    }
635}