1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
use std::collections::HashMap;
use crate::types::*;

/// Parses a dictionary XML document into a [`Kathoey`] lookup table.
///
/// The token stream is interpreted with a small flag-based state machine:
///
/// * `<lemma>` opens an entry; the end of the entry commits what was collected.
/// * `<l t="...">` carries the lemma's base word. Nested `<g v="...">` elements
///   select the category (`VERB`, `ADJS`, `PRTS`, anything else stays
///   `Lemma::Other`) or, with `femn`, mark the base word itself as the
///   feminine form.
/// * `<f t="...">` carries one word form. Nested `<g>` values decide its fate:
///   `femn` makes it the lemma's feminine form, `femn`/`impr`/`neut` exclude it
///   from the collected forms, and for verbs only forms tagged `masc` are
///   collected.
///
/// For every lemma that has a feminine form, each collected form is mapped to
/// `Fem { fem, lemma }`, where `fem` is the index of the feminine form in
/// `dict`. If a form was already mapped by an earlier lemma, it is overwritten
/// only when the new category compares lower than the stored one.
///
/// Feminine forms are interned into `dict` only for lemmas that actually have
/// one, so `dict` never receives an entry that no `map` value points to merely
/// because a lemma lacked a feminine form.
///
/// # Errors
///
/// Returns the tokenizer's error if `text` cannot be tokenized as XML.
pub fn parse_xml(text: &str) -> anyhow::Result<Kathoey> {
  let mut map: HashMap<String, Fem> = HashMap::new();
  let mut dict: Vec<String> = vec![];
  // Feminine form -> its index in `dict`, so every form is stored only once.
  let mut dict_index: HashMap<&str, usize> = HashMap::new();

  // Which kinds of element are currently open.
  // NOTE(review): unknown elements are tracked with a single flag rather than
  // a depth counter, so nested unknown elements share one "open" state.
  let mut in_lemma = false;
  let mut in_l = false;
  let mut in_g = false;
  let mut in_f = false;
  let mut in_other = false;

  // State of the lemma being read; reset when the lemma is committed.
  let mut category: Lemma = Lemma::Other;
  let mut has_fem = false;
  let mut forms: Vec<&str> = vec![];

  // Most recently seen texts; these are deliberately NOT reset between
  // lemmas and are only meaningful once the matching flag says so.
  let mut base_word: &str = "";
  let mut form: &str = "";
  let mut fem_form: &str = "";

  // State of the `<f>` form being read; reset when the form is closed.
  let mut keep_form = true;
  let mut verb_masc = false;

  for token in xmlparser::Tokenizer::from(text) {
    let t: xmlparser::Token = token?;
    match t {
      xmlparser::Token::ElementStart { local, .. } => {
        match local.as_str() {
          "lemma" => {
            in_lemma = true;
          }
          "l" => {
            in_l = true;
          }
          "g" => {
            in_g = true;
          }
          "f" => {
            in_f = true;
            keep_form = true;
          }
          _ => {
            in_other = true;
          }
        }
      },
      xmlparser::Token::Attribute { local, value, .. } => {
        let name = local.as_str();
        let val = value.as_str();
        if in_l {
          if in_g {
            // Grammeme attached to the lemma itself.
            if name == "v" {
              match val {
                "VERB" => {
                  category = Lemma::Verb;
                }
                "ADJS" => {
                  category = Lemma::Adjs;
                }
                "PRTS" => {
                  category = Lemma::Prts;
                }
                "femn" => {
                  fem_form = base_word;
                  has_fem = true;
                }
                _ => {}
              }
            }
          } else if name == "t" {
            base_word = val;
          }
        } else if in_f {
          if in_g {
            // Grammeme attached to the current word form. The attribute name
            // is intentionally not checked here, only its value.
            match val {
              "femn" => {
                fem_form = form;
                has_fem = true;
                keep_form = false;
              }
              "impr" | "neut" => {
                keep_form = false;
              }
              _ => {}
            }
            if category == Lemma::Verb && val == "masc" {
              verb_masc = true;
            }
          } else if name == "t" {
            form = val;
          }
        }
      },
      xmlparser::Token::ElementEnd { end, .. } => {
        // `>` of an opening tag closes nothing.
        if let xmlparser::ElementEnd::Open = end {
          continue;
        }
        // Closing tags are not matched by name: the innermost open kind is
        // closed first, so the order of these checks matters.
        if in_other {
          in_other = false;
        } else if in_g {
          in_g = false;
        } else if in_f {
          // Verbs contribute only their masculine forms.
          let wanted = if category == Lemma::Verb {
            keep_form && verb_masc
          } else {
            keep_form
          };
          if wanted {
            forms.push(form);
          }
          keep_form = true;
          verb_masc = false;
          in_f = false;
        } else if in_l {
          in_l = false;
        } else if in_lemma {
          if has_fem {
            // Intern the feminine form; a new entry gets the next free slot.
            let fem_index = match dict_index.get(fem_form).copied() {
              Some(i) => i,
              None => {
                let i = dict.len();
                dict_index.insert(fem_form, i);
                dict.push(fem_form.to_string());
                i
              }
            };
            for &w in &forms {
              if let Some(entry) = map.get_mut(w) {
                // Keep whichever category compares lower.
                if category < entry.lemma {
                  entry.fem = fem_index;
                  entry.lemma = category;
                }
              } else {
                map.insert(
                  w.to_string(),
                  Fem {
                    fem: fem_index,
                    lemma: category
                  }
                );
              }
            }
          }
          in_lemma = false;
          category = Lemma::Other;
          has_fem = false;
          forms.clear();
        }
      },
      _ => {}
    }
  }
  Ok(Kathoey {
    dict, map
  })
}