oca_parser_xls/xls_parser/
oca.rs1use calamine::{open_workbook_auto, DataType, Range, Reader};
2use isolang::Language;
3use oca_ast_semantics::ast::NestedAttrType;
4use oca_bundle_semantics::state::{
5 attribute::Attribute,
6 encoding::Encoding,
7 entries::EntriesElement,
8 entry_codes::EntryCodes as EntryCodesValue,
9 oca::overlay::cardinality::Cardinalitys,
10 oca::overlay::character_encoding::CharacterEncodings,
11 oca::overlay::conformance::Conformances,
12 oca::overlay::entry::Entries,
13 oca::overlay::entry_code::EntryCodes,
14 oca::overlay::format::Formats,
15 oca::overlay::information::Information,
16 oca::overlay::label::Labels,
17 oca::overlay::meta::Metas,
18 oca::OCABox,
19};
20use std::collections::HashMap;
21
22pub struct ParsedResult {
23 pub oca: OCABox,
24 pub languages: Vec<Language>,
25}
26
27const SUPPORTED_TEMPLATES: &str = "^1.0.0";
28
29const SAMPLE_TEMPLATE_MSG: &str = "Template file can be found here: https://github.com/THCLab/oca-parser-xls/blob/main/templates/template.xlsx";
30
31pub fn parse(path: String) -> Result<ParsedResult, Vec<std::string::String>> {
32 let mut errors: Vec<String> = vec![];
33 let mut workbook = open_workbook_auto(path).map_err(|_| {
34 errors.push(
35 "Provided file cannot be parsed. Check if file exists and format is XLS(X)".to_string(),
36 );
37 errors.clone()
38 })?;
39 let mut sheet_names = workbook.sheet_names().to_vec();
40 let mut languages = vec![];
41 sheet_names
42 .retain(|n| n != "READ ME" && n != "README" && n != "Start Here" && n != "Documentation");
43
44 let main_sheet_name = sheet_names.first().ok_or_else(|| {
45 errors.push(format!("Missing sheets. {SAMPLE_TEMPLATE_MSG}"));
46 errors.clone()
47 })?;
48 if !(main_sheet_name.eq("Main") || main_sheet_name.eq("main")) {
49 errors.push(format!(
50 "Provided XLS file does not match template. Missing Main sheet. {SAMPLE_TEMPLATE_MSG}"
51 ));
52 }
53
54 if !errors.is_empty() {
55 return Err(errors);
56 }
57 let main_sheet = workbook.worksheet_range(main_sheet_name).unwrap().unwrap();
58 let sheet_version = main_sheet.get_value((0, 0));
59 if let Some(DataType::String(version_value)) = sheet_version {
60 let version = semver::Version::parse(version_value).unwrap();
61 let req = semver::VersionReq::parse(self::SUPPORTED_TEMPLATES).unwrap();
62 if !req.matches(&version) {
63 errors.push(format!("Provided version of template ({}) is not supported. Please provide template matching {} version.", version_value, self::SUPPORTED_TEMPLATES));
64 }
65 }
66
67 if !errors.is_empty() {
68 return Err(errors);
69 }
70 let translation_sheet_names = sheet_names.split_off(1);
71 let mut translation_sheets: Vec<(Language, _)> = vec![];
72
73 for translation_sheet_name in translation_sheet_names {
74 languages.push(
75 Language::from_639_1(&translation_sheet_name.clone()).unwrap());
76 translation_sheets.push((
77 Language::from_639_1(&translation_sheet_name.clone()).unwrap(),
78 workbook
79 .worksheet_range(&translation_sheet_name.clone())
80 .unwrap()
81 .unwrap(),
82 ));
83 }
84
85 let mut column_indicies: HashMap<&str, u32> = HashMap::new();
86 for row in main_sheet.rows().filter(|&row| {
87 if let DataType::String(v) = &row[0] {
88 v.starts_with("CB") || v.starts_with("OL")
89 } else {
90 false
91 }
92 }) {
93 for (i, value) in row.iter().enumerate() {
94 if let DataType::String(v) = value {
95 if v.starts_with("CB-CL:") {
96 column_indicies.insert("CLASSIFICATION_INDEX", i as u32);
97 } else if v.starts_with("CB-AN:") {
98 column_indicies.insert("ATTR_NAME_INDEX", i as u32);
99 } else if v.starts_with("CB-AT:") {
100 column_indicies.insert("ATTR_TYPE_INDEX", i as u32);
101 } else if v.starts_with("CB-RS:") {
102 column_indicies.insert("REFERENCE_SAI_INDEX", i as u32);
103 } else if v.starts_with("CB-FA:") {
104 column_indicies.insert("FLAGGED_INDEX", i as u32);
105 } else if v.starts_with("OL-CH:") {
106 column_indicies.insert("ENCODING_INDEX", i as u32);
107 } else if v.starts_with("OL-ST:") {
108 column_indicies.insert("STANDARD_INDEX", i as u32);
109 } else if v.starts_with("OL-FT:") {
110 column_indicies.insert("FORMAT_INDEX", i as u32);
111 } else if v.starts_with("OL-EC:") {
112 column_indicies.insert("ENTRY_CODES_INDEX", i as u32);
113 } else if v.starts_with("OL-UT:") {
114 column_indicies.insert("UNIT_INDEX", i as u32);
115 } else if v.starts_with("OL-CC:") {
116 column_indicies.insert("CONDITION_INDEX", i as u32);
117 } else if v.starts_with("OL-CD:") {
118 column_indicies.insert("DEPENDENCIES_INDEX", i as u32);
119 } else if v.starts_with("OL-CR:") {
120 column_indicies.insert("CARDINALITY_INDEX", i as u32);
121 } else if v.starts_with("OL-CN:") {
122 column_indicies.insert("CONFORMANCE_INDEX", i as u32);
123 } else if v.starts_with("OL-AM:") {
124 column_indicies.insert("ATTRIBUTE_MAPPING_INDEX", i as u32);
125 } else if v.starts_with("OL-EM:") {
126 column_indicies.insert("ENTRY_CODE_MAPPING_INDEX", i as u32);
127 }
128 }
129 }
130 }
131
132 if column_indicies.get("ATTR_NAME_INDEX").is_none() {
133 errors.push("Not found column with Attribute Names definiton".to_string());
134 }
135 if column_indicies.get("ATTR_NAME_INDEX").is_none() {
136 errors.push("Not found column with Attribute Names definiton".to_string());
137 }
138 if !errors.is_empty() {
139 return Err(errors);
140 }
141
142 let start: u32 = 3;
143 let end_offset = if sheet_version.is_some() { 1 } else { 3 };
144 let mut end: u32 = main_sheet.height() as u32;
145 for (i, row) in main_sheet.rows().enumerate().rev() {
146 if row
147 .iter()
148 .any(|cell| cell != &DataType::Empty && cell.get_string().unwrap().trim().ne(""))
149 {
150 end = end_offset + i as u32;
151 break;
152 }
153 }
154 let oca_range = (start, end);
155
156 let mut oca = OCABox::new();
157
158 let mut classification = String::new();
159 if let Some(classification_index) = column_indicies.get("CLASSIFICATION_INDEX") {
160 let classification_value = main_sheet.get_value((oca_range.0, *classification_index));
161 if let Some(class) = classification_value {
162 classification = class.to_string().trim().to_string();
163 }
164 }
165 oca.add_classification(classification);
166
167 let mut attributes: Vec<(u32, Attribute)> = vec![];
169 fn parse_row(
170 attr_index: u32,
171 main_sheet: &Range<DataType>,
172 column_indicies: HashMap<&str, u32>,
173 ) -> Result<Attribute, String> {
174 let attribute_name = main_sheet
175 .get_value((attr_index, *column_indicies.get("ATTR_NAME_INDEX").unwrap()))
176 .unwrap()
177 .to_string()
178 .trim()
179 .to_string();
180 if attribute_name.is_empty() {
181 return Err(format!(
182 "Parsing attribute in row {} failed. Attribute name is empty.",
183 attr_index + 1
184 ));
185 }
186 let attribute_type = &format!(
198 r#"{}"#,
199 &main_sheet
200 .get_value((attr_index, *column_indicies.get("ATTR_TYPE_INDEX").unwrap()))
201 .unwrap()
202 )
203 .trim()
204 .to_string();
205 let mut attribute = Attribute::new(attribute_name.clone());
206 attribute.set_attribute_type(
207 serde_json::from_str::<NestedAttrType>(format!("\"{attribute_type}\"").as_str())
208 .map_err(|e| {
209 format!(
210 "Parsing attribute type in row {} ({}) failed. {}",
211 attr_index + 1,
212 attribute_name,
213 e
214 )
215 })?,
216 );
217 if let Some(flagged_index) = column_indicies.get("FLAGGED_INDEX") {
218 if let Some(DataType::String(_value)) =
219 main_sheet.get_value((attr_index, *flagged_index))
220 {
221 attribute.set_flagged();
222 }
223 }
224 if let Some(encoding_index) = column_indicies.get("ENCODING_INDEX") {
225 if let Some(DataType::String(encoding_value)) =
226 main_sheet.get_value((attr_index, *encoding_index))
227 {
228 let encoding =
229 serde_json::from_str::<Encoding>(&format!(r#""{}""#, encoding_value.trim()))
230 .map_err(|e| {
231 format!(
232 "Parsing character encoding in row {} failed. {}",
233 attr_index + 1,
234 e
235 )
236 })?;
237 attribute.set_encoding(encoding);
238 }
239 }
240
241 if let Some(format_index) = column_indicies.get("FORMAT_INDEX") {
242 if let Some(DataType::String(format_value)) =
243 main_sheet.get_value((attr_index, *format_index))
244 {
245 attribute.set_format(format_value.clone().trim().to_string());
246 }
247 }
248
249 if let Some(entry_codes_index) = column_indicies.get("ENTRY_CODES_INDEX") {
260 if let Some(DataType::String(entry_codes_value)) =
261 main_sheet.get_value((attr_index, *entry_codes_index))
262 {
263 if entry_codes_value != &"[SAI]".to_string() {
264 let entry_codes: EntryCodesValue = if entry_codes_value.starts_with("SAI:") {
265 let sai = entry_codes_value.strip_prefix("SAI:").unwrap();
266 EntryCodesValue::Sai(sai.to_string())
267 } else {
268 let codes: Vec<String> = entry_codes_value
269 .trim()
270 .to_string()
271 .split('|')
272 .collect::<Vec<&str>>()
273 .iter()
274 .map(|c| c.to_string().trim().to_string())
275 .collect();
276 EntryCodesValue::Array(codes)
277 };
278 attribute.set_entry_codes(entry_codes);
279 }
280 }
281 }
282
283 if let Some(cardinality_index) = column_indicies.get("CARDINALITY_INDEX") {
308 if let Some(DataType::String(cardinality_value)) =
309 main_sheet.get_value((attr_index, *cardinality_index))
310 {
311 attribute.set_cardinality(cardinality_value.to_string().trim().to_string());
312 }
313 }
314
315 if let Some(conformance_index) = column_indicies.get("CONFORMANCE_INDEX") {
316 if let Some(DataType::String(conformance_value)) =
317 main_sheet.get_value((attr_index, *conformance_index))
318 {
319 attribute.set_conformance(conformance_value.clone().trim().to_string());
320 }
321 }
322
323 Ok(attribute)
377 }
378
379 for attr_index in oca_range.0..oca_range.1 {
380 match parse_row(attr_index, &main_sheet, column_indicies.clone()) {
381 Ok(attribute) => attributes.push((attr_index, attribute)),
382 Err(e) => errors.push(e),
383 }
384 }
385
386 let mut meta_trans: HashMap<String, HashMap<Language, String>> = HashMap::new();
387
388 let mut label_trans: HashMap<u32, HashMap<Language, String>> = HashMap::new();
389 let mut entries_trans: HashMap<u32, HashMap<Language, EntriesElement>> = HashMap::new();
390 let mut information_trans: HashMap<u32, HashMap<Language, String>> = HashMap::new();
391
392 for (lang, sheet) in translation_sheets.iter() {
393 let mut sheet_column_indicies: HashMap<&str, u32> = HashMap::new();
394 for row in sheet.rows().filter(|&row| {
395 if let DataType::String(v) = &row[0] {
396 v.starts_with("CB") || v.starts_with("OL")
397 } else {
398 false
399 }
400 }) {
401 for (i, value) in row.iter().enumerate() {
402 if let DataType::String(v) = value {
403 if v.starts_with("OL-MN:") {
404 sheet_column_indicies.insert("META_NAME_INDEX", i as u32);
405 } else if v.starts_with("OL-MV:") {
406 sheet_column_indicies.insert("META_VALUE_INDEX", i as u32);
407 } else if v.starts_with("CB-AN:") {
408 sheet_column_indicies.insert("ATTR_NAME_INDEX", i as u32);
409 } else if v.starts_with("OL-LA:") {
410 sheet_column_indicies.insert("LABEL_INDEX", i as u32);
411 } else if v.starts_with("OL-EN:") {
412 sheet_column_indicies.insert("ENTRIES_INDEX", i as u32);
413 } else if v.starts_with("OL-IN:") {
414 sheet_column_indicies.insert("INFORMATION_INDEX", i as u32);
415 }
416 }
417 }
418 }
419
420 for attr_index in (oca_range.0)..(sheet.height() as u32) {
421 if let Some(name_index) = sheet_column_indicies.get("META_NAME_INDEX") {
422 if let Some(DataType::String(name_value)) =
423 sheet.get_value((attr_index, *name_index))
424 {
425 if let Some(value_index) = sheet_column_indicies.get("META_VALUE_INDEX") {
426 if let Some(DataType::String(value_value)) =
427 sheet.get_value((attr_index, *value_index))
428 {
429 match meta_trans.get_mut(name_value) {
430 Some(m_trans) => {
431 m_trans.insert(*lang, value_value.to_string());
432 }
433 None => {
434 let mut m_trans = HashMap::new();
435 m_trans.insert(*lang, value_value.to_string());
436 meta_trans.insert(name_value.to_string(), m_trans);
437 }
438 }
439 }
440 }
441 }
442 }
443 }
444
445 for attr_index in (oca_range.0)..(oca_range.1) {
446 if let Some(label_index) = sheet_column_indicies.get("LABEL_INDEX") {
447 if let Some(DataType::String(label_value)) =
448 sheet.get_value((attr_index, *label_index))
449 {
450 match label_trans.get_mut(&attr_index) {
451 Some(attr_label_tr) => {
452 attr_label_tr
453 .insert(*lang, label_value.clone().trim().to_string());
454 }
455 None => {
456 let mut attr_label_tr: HashMap<Language, String> = HashMap::new();
457 attr_label_tr
458 .insert(*lang, label_value.clone().trim().to_string());
459 label_trans.insert(attr_index, attr_label_tr);
460 }
461 }
462 }
463 }
464
465 if let Some(entries_index) = sheet_column_indicies.get("ENTRIES_INDEX") {
466 if let Some(DataType::String(entries_value)) =
467 sheet.get_value((attr_index, *entries_index))
468 {
469 let entries_el: EntriesElement = if entries_value.starts_with("SAI:") {
470 let sai = entries_value
471 .strip_prefix("SAI:")
472 .unwrap()
473 .trim()
474 .to_string();
475 EntriesElement::Sai(sai.to_string())
476 } else {
477 let entries_values = entries_value.split('|').collect::<Vec<&str>>();
478 for (i, entries_value_element) in entries_values.iter().enumerate() {
479 let splitted = entries_value_element.split(':').collect::<Vec<&str>>();
480 let entry_key = splitted.first().unwrap().trim().to_string();
481 if entry_key.is_empty() {
482 errors.push(format!(
483 "Parsing attribute in row {} failed. Invalid Entry Overlay definition for {} language. Missing entry key in position {}",
484 attr_index + 1,
485 lang,
486 i + 1)
487 );
488 return Err(errors);
489 }
490 splitted.get(1).ok_or_else(|| {
491 errors.push(format!(
492 "Parsing attribute in row {} failed. Invalid Entry Overlay definition for {} language. Missing entry value in position {}",
493 attr_index + 1,
494 lang,
495 i + 1)
496 );
497 errors.clone()
498 })?;
499 }
500 let entries_obj =
501 entries_values.iter().fold(HashMap::new(), |mut acc, x| {
502 let splitted = x.split(':').collect::<Vec<&str>>();
503 acc.insert(
504 splitted.first().unwrap().to_string().trim().to_string(),
505 splitted.get(1).unwrap().to_string().trim().to_string(),
506 );
507 acc
508 });
509 EntriesElement::Object(entries_obj)
510 };
511 match entries_trans.get_mut(&attr_index) {
512 Some(attr_entries_tr) => {
513 if attr_entries_tr.get(lang).is_none() {
514 attr_entries_tr.insert(*lang, entries_el);
515 }
516 }
517 None => {
518 let mut attr_entries_tr: HashMap<Language, EntriesElement> =
519 HashMap::new();
520 attr_entries_tr.insert(*lang, entries_el);
521 entries_trans.insert(attr_index, attr_entries_tr);
522 }
523 }
524 }
525 }
526
527 if let Some(information_index) = sheet_column_indicies.get("INFORMATION_INDEX") {
528 if let Some(DataType::String(information_value)) =
529 sheet.get_value((attr_index, *information_index))
530 {
531 match information_trans.get_mut(&attr_index) {
532 Some(attr_info_tr) => {
533 attr_info_tr.insert(
534 *lang,
535 information_value.clone().trim().to_string(),
536 );
537 }
538 None => {
539 let mut attr_info_tr: HashMap<Language, String> = HashMap::new();
540 attr_info_tr.insert(
541 *lang,
542 information_value.clone().trim().to_string(),
543 );
544 information_trans.insert(attr_index, attr_info_tr);
545 }
546 }
547 }
548 }
549 }
550 }
551 for (i, mut attribute) in attributes {
552 if let Some(label_tr) = label_trans.get(&i).cloned() {
553 for (lang, label) in label_tr {
554 attribute.set_label(lang, label);
555 }
556 }
557 if let Some(lang_entries_tr) = entries_trans.get(&i).cloned() {
558 for (lang, entries_tr) in lang_entries_tr {
559 attribute.set_entry(lang, entries_tr);
560 }
561 }
562 if let Some(info_tr) = information_trans.get(&i).cloned() {
563 for (lang, info) in info_tr {
564 attribute.set_information(lang, info);
565 }
566 }
567 oca.add_attribute(attribute);
568 }
569 for (key, value_trans) in meta_trans {
570 for (lang, value) in value_trans {
571 oca.add_meta(lang, key.clone(), value.clone());
572 }
573 }
574 if errors.is_empty() {
577 Ok(ParsedResult {
578 oca,
579 languages,
580 })
581 } else {
582 Err(errors)
583 }
584}
585
586#[cfg(test)]
587mod tests {
588 use super::*;
589
590 #[test]
591 fn parse_xlsx_file() {
592 let result = parse(
593 format!(
594 "{}/tests/assets/oca_template.xlsx",
595 env!("CARGO_MANIFEST_DIR")
596 )
597 );
598 assert!(result.is_ok());
599 if let Ok(mut parsed) = result {
600 assert_eq!(parsed.languages.len(), 1);
601 let oca_bundle = parsed.oca.generate_bundle();
602 assert_eq!(oca_bundle.capture_base.attributes.len(), 3);
603 assert_eq!(oca_bundle.overlays.len(), 7);
604 }
605 }
606
607 #[test]
608 fn parse_xls_file() {
609 let result = parse(
610 format!(
611 "{}/tests/assets/oca_template.xls",
612 env!("CARGO_MANIFEST_DIR")
613 )
614 );
615 assert!(result.is_ok());
616
617 if let Ok(mut parsed) = result {
618 assert_eq!(parsed.languages.len(), 1);
619 let oca_bundle = parsed.oca.generate_bundle();
620 assert_eq!(oca_bundle.capture_base.attributes.len(), 3);
621 assert_eq!(oca_bundle.overlays.len(), 7);
622 }
623 }
624
625 #[test]
626 fn return_error_when_file_type_is_invalid() {
627 let result = parse(
628 format!(
629 "{}/tests/assets/invalid_format.txt",
630 env!("CARGO_MANIFEST_DIR")
631 )
632 );
633 assert!(result.is_err());
634 }
635}