kweepeer 0.1.2

A generic webservice for interactive query expansion; expansion is provided via various modules.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::HashMap;
use tracing::info;

pub mod api;
pub mod apidocs;
pub mod lexer;
pub mod modules;

#[cfg(feature = "analiticcl")]
use modules::analiticcl::{AnaliticclConfig, AnaliticclModule};

#[cfg(feature = "fst")]
use modules::fst::{FstConfig, FstModule};

use modules::lookup::{LookupConfig, LookupModule};

#[cfg(feature = "finalfusion")]
use modules::finalfusion::{FinalFusionConfig, FinalFusionModule};

use modules::Module;

pub use lexer::Term;

/// Maps a term (by its text) to its expansions. Each `TermExpansion` corresponds to one
/// source/module and may itself contain multiple expansions.
pub type TermExpansions = HashMap<String, Vec<TermExpansion>>;

/// Entry point for query expansion: owns the configuration and the expansion modules.
#[derive(Default)]
pub struct QueryExpander {
    /// Module configurations from which `load()` instantiates modules.
    config: Config,
    /// The registered modules, in the order they were added.
    modules: Vec<Box<dyn Module>>,
    /// Set to `true` at the end of a successful `load()`; guards against late changes.
    initialised: bool,
}

/// Deserialisable configuration listing the modules to instantiate, grouped per module kind.
///
/// Because of `#[serde(default)]`, any list missing from the input is left empty.
#[derive(Deserialize, Default)]
#[serde(default)]
pub struct Config {
    /// One entry per Lookup module to create.
    lookup: Vec<LookupConfig>,

    /// One entry per Analiticcl module to create (only with the `analiticcl` feature).
    #[cfg(feature = "analiticcl")]
    analiticcl: Vec<AnaliticclConfig>,

    /// One entry per Fst module to create (only with the `fst` feature).
    #[cfg(feature = "fst")]
    fst: Vec<FstConfig>,

    /// One entry per FinalFusion module to create (only with the `finalfusion` feature).
    #[cfg(feature = "finalfusion")]
    finalfusion: Vec<FinalFusionConfig>,
}

impl QueryExpander {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_config(mut self, config: Config) -> Self {
        self.config = config;
        self
    }

    /// Registers an additional module at the end of the module list.
    ///
    /// # Panics
    ///
    /// Panics when called after `load()` has completed successfully.
    pub fn add_module(&mut self, module: Box<dyn Module>) {
        assert!(!self.initialised, "Can not add modules after load()!");
        self.modules.push(module);
    }

    /// Builder-style variant of `add_module`: adds the module and returns the expander.
    ///
    /// # Panics
    ///
    /// Panics when called after `load()` has completed successfully, as it delegates to
    /// `add_module`.
    pub fn with_module(mut self, module: Box<dyn Module>) -> Self {
        self.add_module(module);
        self
    }

    /// Iterates over all registered modules, in registration order, as trait-object references.
    pub fn modules(&self) -> impl Iterator<Item = &dyn Module> {
        self.modules.iter().map(|boxed| &**boxed)
    }

    /// Instantiates and loads one module for every entry in the configuration, appending each
    /// to the module list (Lookup first, then Fst, Analiticcl and FinalFusion when the
    /// corresponding features are enabled), and finally marks the expander as initialised.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by a module's `load()`. Modules loaded before the
    /// failure stay registered and the expander is not marked as initialised.
    ///
    /// # Panics
    ///
    /// Panics if the expander has already been initialised by an earlier successful call.
    pub fn load(&mut self) -> Result<(), Error> {
        assert!(!self.initialised, "load() can only be called once");

        //MAYBE TODO: we could parallellize the loading for quicker startup time
        for cfg in &self.config.lookup {
            info!("Adding Lookup module {} - {}", cfg.id(), cfg.name());
            let mut lookup = LookupModule::new(cfg.clone());
            lookup.load()?;
            self.modules.push(Box::new(lookup));
        }

        #[cfg(feature = "fst")]
        for cfg in &self.config.fst {
            info!("Adding Fst module {} - {}", cfg.id(), cfg.name());
            let mut fst = FstModule::new(cfg.clone());
            fst.load()?;
            self.modules.push(Box::new(fst));
        }

        #[cfg(feature = "analiticcl")]
        for cfg in &self.config.analiticcl {
            info!("Adding Analiticcl module {} - {}", cfg.id(), cfg.name());
            let mut analiticcl = AnaliticclModule::new(cfg.clone());
            analiticcl.load()?;
            self.modules.push(Box::new(analiticcl));
        }

        #[cfg(feature = "finalfusion")]
        for cfg in &self.config.finalfusion {
            info!("Adding Finalfusion module {} - {}", cfg.id(), cfg.name());
            let mut finalfusion = FinalFusionModule::new(cfg.clone());
            finalfusion.load()?;
            self.modules.push(Box::new(finalfusion));
        }

        info!("All modules loaded");
        self.initialised = true;
        Ok(())
    }

    /// Expands the given terms with all applicable modules and returns a fresh map from term
    /// text to the expansions found. See `expand_query_into` for the module filtering rules.
    ///
    /// # Errors
    ///
    /// Propagates the first error returned by a module.
    pub fn expand_query(
        &self,
        terms: &Vec<Term>,
        params: &QueryParams,
    ) -> Result<TermExpansions, Error> {
        let mut expanded = TermExpansions::new();
        self.expand_query_into(&mut expanded, terms, params)
            .map(|()| expanded)
    }

    pub fn expand_query_into(
        &self,
        terms_map: &mut TermExpansions,
        terms: &Vec<Term>,
        params: &QueryParams,
    ) -> Result<(), Error> {
        let excludemods: Vec<_> = if let Some(mods) = params.get("", "exclude") {
            value_to_str_array(mods)
        } else {
            Vec::new()
        };
        let includemods: Vec<_> = if let Some(mods) = params.get("", "include") {
            value_to_str_array(mods)
        } else {
            Vec::new()
        };
        for module in self.modules() {
            if (excludemods.is_empty() || !excludemods.contains(&module.id()))
                && (includemods.is_empty() || includemods.contains(&module.id()))
            {
                let mut expansion_map = module.expand_query(terms, params)?;
                for term in terms.iter() {
                    terms_map
                        .entry(term.as_str().to_string())
                        .and_modify(|expansions| {
                            if let Some(expansions2) = expansion_map.remove(term.as_str()) {
                                for expansion in expansions2 {
                                    expansions.push(expansion);
                                }
                            }
                        })
                        .or_insert_with(|| {
                            if let Some(expansions2) = expansion_map.remove(term.as_str()) {
                                expansions2
                            } else {
                                vec![]
                            }
                        });
                }
            }
        }
        Ok(())
    }

    /// Resolve a query template by replacing every `{{term}}` placeholder with the disjunction
    /// of the expansions registered for `term` in `terms_map`.
    ///
    /// All text outside placeholders, including text before the first placeholder, is copied
    /// verbatim; a template without any placeholder is returned unchanged. Each source that
    /// contributes at least one new expansion yields a group `("a" OR "b")`, groups are joined
    /// with ` OR `, and an expansion already emitted for the same placeholder is not repeated.
    /// A placeholder whose term is absent from `terms_map` is replaced by nothing.
    ///
    /// You won't really need to call this yourself.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`.
    pub fn resolve_query_template(
        &self,
        query_template: &str,
        terms_map: &TermExpansions,
    ) -> Result<String, Error> {
        let mut query = String::with_capacity(query_template.len());
        // Byte offset of the first character after the most recent `{{`, if a term is open.
        let mut termbegin: Option<usize> = None;
        // Byte offset up to which the template has been copied or substituted.
        let mut copied_until = 0;
        let mut prevc = None;
        let mut expansioncache = std::collections::HashSet::<&str>::new();
        for (i, c) in query_template.char_indices() {
            if c == '{' && prevc == Some('{') {
                termbegin = Some(i + 1);
            }
            if c == '}' && prevc == Some('}') {
                // `take()` also closes the term, so a stray `}}` later on is treated as text.
                if let Some(begin) = termbegin.take() {
                    // Literal text since the previous placeholder (or the template start),
                    // stopping right before the opening `{{`.
                    query.push_str(&query_template[copied_until..begin - 2]);
                    // `i` is the second `}`; the term ends right before the first one.
                    let term = &query_template[begin..i - 1];
                    if let Some(termexpansions) = terms_map.get(term) {
                        expansioncache.clear();
                        for termexpansion in termexpansions {
                            let mut first = true;
                            for expansion in termexpansion.iter() {
                                if !expansioncache.contains(expansion) {
                                    if !first {
                                        query += "\" OR \"";
                                    } else {
                                        if !expansioncache.is_empty() {
                                            query += " OR ";
                                        }
                                        query += "(\"";
                                    }
                                    first = false;
                                    query += expansion;
                                    expansioncache.insert(expansion);
                                }
                            }
                            if !first {
                                query += "\")";
                            }
                        }
                    }
                    copied_until = i + 1;
                }
            }
            prevc = Some(c);
        }
        // Remaining literal text (the whole template when it holds no placeholder).
        query.push_str(&query_template[copied_until..]);
        Ok(query)
    }
}

/// Convert a JSON value into a `Vec<&str>` borrowing from it.
///
/// * An array yields its string elements verbatim; non-string elements are skipped.
/// * A string is read as a comma-separated list: every item is trimmed of surrounding
///   whitespace and empty items are dropped, so `""` yields no items rather than one empty
///   item (which, used as an `include` filter, would match no module at all).
/// * Any other value yields an empty vector.
fn value_to_str_array(input: &Value) -> Vec<&str> {
    match input {
        Value::Array(array) => array
            .iter()
            .filter_map(|value| match value {
                Value::String(s) => Some(s.as_str()),
                _ => None,
            })
            .collect(),
        Value::String(s) => s
            .split(',')
            .map(str::trim)
            .filter(|item| !item.is_empty())
            .collect(),
        _ => Vec::new(),
    }
}

/// The expansions that a single source/module produced for one term.
#[derive(Debug, Serialize, Default, Clone)]
pub struct TermExpansion {
    /// The expanded forms, in the order they were added.
    expansions: Vec<String>,
    /// Scores for the expansions. `add_variant_with_score` pushes one score per expansion,
    /// whereas `add_variant` pushes none, so this may be shorter than `expansions`.
    scores: Vec<f64>,
    /// Id of the module that produced the expansions (set by `with_source`).
    source_id: Option<String>,
    /// Name of the module that produced the expansions (set by `with_source`).
    source_name: Option<String>,
    /// Kind of the module that produced the expansions (set by `with_source`).
    source_type: &'static str,
    /// Optional link associated with the expansions (set by `with_link`).
    link: Option<String>,
}

impl TermExpansion {
    /// Records the given module as the source: copies its id, name and kind (builder pattern).
    pub fn with_source(mut self, module: &impl Module) -> Self {
        self.source_id = Some(module.id().into());
        self.source_name = Some(module.name().into());
        self.source_type = module.kind();
        self
    }

    /// Sets the link (builder pattern).
    pub fn with_link(mut self, link: impl Into<String>) -> Self {
        self.link = Some(link.into());
        self
    }

    /// Replaces all expansions (builder pattern). The scores are left untouched.
    pub fn with_expansions(mut self, expansions: Vec<String>) -> Self {
        self.expansions = expansions;
        self
    }

    /// Replaces all scores (builder pattern). The expansions are left untouched.
    pub fn with_scores(mut self, scores: Vec<f64>) -> Self {
        self.scores = scores;
        self
    }

    /// Appends an expansion together with its score.
    pub fn add_variant_with_score(&mut self, expansion: impl Into<String>, score: f64) {
        self.expansions.push(expansion.into());
        self.scores.push(score);
    }

    /// Appends an expansion without a score; nothing is pushed onto the scores.
    pub fn add_variant(&mut self, expansion: impl Into<String>) {
        self.expansions.push(expansion.into());
    }

    /// Returns the expansions.
    pub fn expansions(&self) -> &Vec<String> {
        &self.expansions
    }

    /// Returns the scores.
    pub fn scores(&self) -> &Vec<f64> {
        &self.scores
    }

    /// Returns the id of the source module, if one was set.
    pub fn source_id(&self) -> Option<&str> {
        self.source_id.as_deref()
    }

    /// Returns the name of the source module, if one was set.
    pub fn source_name(&self) -> Option<&str> {
        self.source_name.as_deref()
    }

    /// Returns the link, if one was set.
    pub fn link(&self) -> Option<&str> {
        self.link.as_deref()
    }

    /// Returns the number of expansions.
    pub fn len(&self) -> usize {
        self.expansions.len()
    }

    /// Returns `true` when there are no expansions.
    pub fn is_empty(&self) -> bool {
        self.expansions.is_empty()
    }

    /// Iterates over the expansions as string slices.
    pub fn iter(&self) -> impl Iterator<Item = &str> {
        self.expansions.iter().map(|x| x.as_str())
    }

    /// Returns the expansions; same as `expansions()`.
    pub fn as_vec(&self) -> &Vec<String> {
        &self.expansions
    }
}

/// A single runtime parameter: a key/value pair scoped to a module id.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct QueryParam {
    /// Id of the module the parameter is meant for; empty for the global scope by convention.
    module_id: String,
    /// Name of the parameter.
    key: String,
    /// Value of the parameter as arbitrary JSON.
    value: Value,
}

impl QueryParam {
    /// Returns the id of the module this parameter is scoped to.
    pub fn module_id(&self) -> &str {
        &self.module_id
    }

    /// Returns the name of the parameter.
    pub fn key(&self) -> &str {
        &self.key
    }

    /// Returns the value of the parameter.
    pub fn value(&self) -> &Value {
        &self.value
    }
}

#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
/// Holds arbitrary parameters passed to queries at runtime when requesting expansion.
/// Every parameter is scoped to a module id; by convention an empty module id denotes the
/// global scope.
// The implementation uses a simple vec to save ourselves HashMap overhead.
pub struct QueryParams(Vec<QueryParam>);

impl QueryParams {
    pub fn new() -> Self {
        Self::default()
    }

    /// Insert a new key and value (builder pattern)
    pub fn with(
        mut self,
        module_id: impl Into<String>,
        key: impl Into<String>,
        value: Value,
    ) -> Self {
        self.insert(module_id, key, value);
        self
    }

    /// Insert a new key and value
    /// By convention, we use an empty module_id for a global scope.
    pub fn insert(&mut self, module_id: impl Into<String>, key: impl Into<String>, value: Value) {
        self.0.push(QueryParam {
            module_id: module_id.into(),
            key: key.into(),
            value,
        });
    }

    /// Check if a key exists. By convention, we use an empty module_id for a global scope.
    pub fn contains(&self, module_id: &str, key: &str) -> bool {
        for param in self.iter_for_module(module_id) {
            if param.key() == key {
                return true;
            }
        }
        false
    }

    /// Iterate over all keys and values
    pub fn iter<'a>(&'a self) -> impl Iterator<Item = &'a QueryParam> {
        self.0.iter()
    }

    /// Iterate over all keys and values
    pub fn iter_for_module<'a>(
        &'a self,
        module_id: &'a str,
    ) -> impl Iterator<Item = &'a QueryParam> {
        self.0
            .iter()
            .filter(move |param| param.module_id() == module_id)
    }

    /// Retrieve a value by key
    /// By convention, we use an empty module_id for a global scope.
    pub fn get<'a>(&'a self, module_id: &'a str, key: &str) -> Option<&'a Value> {
        for param in self.iter_for_module(module_id) {
            if param.key() == key {
                return Some(param.value());
            }
        }
        None
    }
}

impl From<&HashMap<String, String>> for QueryParams {
    fn from(map: &HashMap<String, String>) -> Self {
        let mut result = QueryParams::new();
        for (key, value) in map.iter() {
            let splitkey: Vec<_> = key.splitn(2, '.').collect();
            if splitkey.len() == 1 {
                result.insert("", key, value.to_owned().into());
            } else {
                result.insert(splitkey[0], splitkey[1], value.to_owned().into());
            }
        }
        result
    }
}

/// Errors reported by this crate; each variant carries a human-readable message.
#[derive(Debug, Clone)]
pub enum Error {
    /// Loading failed. I/O errors are converted into this variant.
    LoadError(String),
    /// Expanding a query failed.
    QueryExpandError(String),
}

/// Formats the error as a bracketed category prefix followed by the message.
impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (prefix, message) = match self {
            Self::LoadError(message) => ("[Load error] ", message),
            Self::QueryExpandError(message) => ("[Query expansion error] ", message),
        };
        f.write_str(prefix)?;
        f.write_str(message)
    }
}

/// Converts an I/O error into a `LoadError` carrying the I/O error's display text.
impl From<std::io::Error> for Error {
    fn from(value: std::io::Error) -> Self {
        Self::LoadError(value.to_string())
    }
}

/// Serialises the error as a plain string holding only its message; the variant itself is
/// not encoded.
impl Serialize for Error {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let (Self::LoadError(message) | Self::QueryExpandError(message)) = self;
        serializer.serialize_str(message)
    }
}