elasticsearch_dsl/search/queries/specialized/more_like_this_query.rs
1use crate::search::*;
2use crate::util::*;
3
/// The More Like This Query finds documents that are "like" a given set of documents.
/// In order to do so, MLT selects a set of representative terms of these input documents,
/// forms a query using these terms, executes the query and returns the results.
/// The user controls the input documents, how the terms should be selected and how the query is formed.
///
/// The simplest use case consists of asking for documents that are similar to a provided piece of text.
/// Here, we are asking for all movies that have some text similar to "Once upon a time"
/// in their "title" and in their "description" fields, limiting the number of selected terms to 12.
///
/// A more complicated use case consists of mixing texts with documents already existing in the index.
/// In this case, the syntax to specify a document is similar to the one used in the
/// [Multi GET API](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-multi-get.html).
///
/// Finally, users can mix some texts, a chosen set of documents but also provide documents not necessarily present in the index.
/// To provide documents not present in the index, the syntax is similar to
/// [artificial documents](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-termvectors.html#docs-termvectors-artificial-doc).
///
/// **How it Works**
/// Suppose we wanted to find all documents similar to a given input document. Obviously, the input document
/// itself should be its best match for that type of query. And the reason would be mostly,
/// according to [Lucene scoring formula](https://lucene.apache.org/core/4_9_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html),
/// due to the terms with the highest tf-idf. Therefore, the terms of the input document that have the highest
/// tf-idf are good representatives of that document, and could be used within a disjunctive query (or OR) to retrieve similar documents.
/// The MLT query simply extracts the text from the input document, analyzes it, usually using the same analyzer as the field,
/// then selects the top K terms with highest tf-idf to form a disjunctive query of these terms.
///
/// To create a `more_like_this` query with `like` as a string on title field:
/// ```
/// # use elasticsearch_dsl::queries::*;
/// # use elasticsearch_dsl::queries::params::*;
/// # let query =
/// Query::more_like_this(["test"])
/// .fields(["title"]);
/// ```
/// To create a `more_like_this` query with string and document id fields on title and description with optional fields:
/// ```
/// # use elasticsearch_dsl::queries::*;
/// # use elasticsearch_dsl::queries::params::*;
/// # let query =
/// Query::more_like_this([Like::from(Document::new("123")), Like::from("test")])
/// .fields(["title", "description"])
/// .min_term_freq(1)
/// .max_query_terms(12)
/// .boost(1.2)
/// .name("more_like_this");
/// ```
/// <https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-mlt-query.html>
#[derive(Debug, Clone, PartialEq, Serialize)]
#[serde(remote = "Self")]
pub struct MoreLikeThisQuery {
    /// A list of fields to fetch and analyze the text from.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    fields: Option<Vec<String>>,

    /// Free form text and/or documents the results should be similar to. Required.
    like: Vec<Like>,

    /// Texts/documents whose terms should *not* be selected; same syntax as `like`.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    unlike: Option<Vec<Like>>,

    /// Minimum term frequency below which input-document terms are ignored.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    min_term_freq: Option<i64>,

    /// Maximum number of query terms that will be selected.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    max_query_terms: Option<i64>,

    /// Minimum document frequency below which terms are ignored.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    min_doc_freq: Option<i64>,

    /// Maximum document frequency above which terms are ignored.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    max_doc_freq: Option<i64>,

    /// Minimum word length below which terms are ignored.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    min_word_length: Option<i64>,

    /// Maximum word length above which terms are ignored.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    max_word_length: Option<i64>,

    /// Words considered "uninteresting" and ignored during term selection.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    stop_words: Option<Vec<String>>,

    /// Analyzer used to analyze the free form text.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    analyzer: Option<String>,

    /// Number of selected terms that must match, `minimum_should_match` syntax.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    minimum_should_match: Option<String>,

    /// Whether to fail when a specified field is not of a supported type.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    fail_on_unsupported_field: Option<bool>,

    /// Boost factor applied to each selected term's tf-idf score.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    boost_terms: Option<f64>,

    /// Whether the input documents should also be included in the results.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    include: Option<bool>,

    /// Query-level boost, set via `boost()` (from `add_boost_and_name!`).
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    boost: Option<f32>,

    /// Query name for match tracking, set via `name()` (from `add_boost_and_name!`).
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    _name: Option<String>,
}
104
/// The two accepted shapes for entries of the `like` and `unlike` arrays.
///
/// Serialized `untagged`: a `String` variant serializes as a bare JSON string,
/// a `Document` variant as a JSON object.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
pub enum Like {
    /// Free form text which will be used in the `like`/`unlike` field array
    String(String),

    /// Reference to (or artificial) elasticsearch document used in the `like`/`unlike` field array
    Document(Document),
}
115
116impl From<String> for Like {
117 fn from(value: String) -> Self {
118 Self::String(value)
119 }
120}
121
122impl<'a> From<&'a str> for Like {
123 fn from(value: &'a str) -> Self {
124 Self::String(value.into())
125 }
126}
127
128impl From<Document> for Like {
129 fn from(value: Document) -> Self {
130 Self::Document(value)
131 }
132}
133
/// Document-shaped variant for `like`/`unlike` entries, mirroring the
/// Multi GET / artificial-document syntax. Field names carry the leading
/// underscore so they serialize as the `_id`, `_index`, etc. keys.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct Document {
    /// Document id. Always serialized.
    _id: String,

    /// Index containing the document; required if none is given in the request URI.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    _index: Option<String>,

    /// Primary-shard routing key; required if routing was used during indexing.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    _routing: Option<String>,

    /// `_source` filtering for the fetched document.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    _source: Option<SourceFilter>,

    /// Stored fields to retrieve; skipped when in its default (empty) state.
    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
    _stored_fields: StoredFields,
}
151
152impl Document {
153 /// Creates an instance of [Document](Document)
154 ///
155 /// - `id` - document id as string.
156 pub fn new<T>(id: T) -> Self
157 where
158 T: ToString,
159 {
160 Self {
161 _id: id.to_string(),
162 _stored_fields: Default::default(),
163 _index: None,
164 _routing: None,
165 _source: None,
166 }
167 }
168
169 /// The index that contains the document. Required if no index is specified in the request URI.
170 pub fn index<T>(mut self, index: T) -> Self
171 where
172 T: ToString,
173 {
174 self._index = Some(index.to_string());
175 self
176 }
177
178 /// The key for the primary shard the document resides on. Required if routing is used during indexing.
179 pub fn routing<T>(mut self, routing: T) -> Self
180 where
181 T: ToString,
182 {
183 self._routing = Some(routing.to_string());
184 self
185 }
186
187 /// If `false`, excludes all `_source` fields. Defaults to `true`.
188 pub fn source<T>(mut self, source: T) -> Self
189 where
190 T: Into<SourceFilter>,
191 {
192 self._source = Some(source.into());
193 self
194 }
195
196 /// The stored fields you want to retrieve.
197 pub fn stored_fields<T>(mut self, stored_fields: T) -> Self
198 where
199 T: Into<StoredFields>,
200 {
201 self._stored_fields = stored_fields.into();
202 self
203 }
204}
205
206impl Query {
207 /// Creates an instance of [`MoreLikeThisQuery`]
208 ///
209 /// - `like` - free form text and/or a single or multiple documents.
210 pub fn more_like_this<I>(like: I) -> MoreLikeThisQuery
211 where
212 I: IntoIterator,
213 I::Item: Into<Like>,
214 {
215 MoreLikeThisQuery {
216 like: like.into_iter().map(Into::into).collect(),
217 fields: None,
218 unlike: None,
219 min_term_freq: None,
220 max_query_terms: None,
221 min_doc_freq: None,
222 max_doc_freq: None,
223 min_word_length: None,
224 max_word_length: None,
225 stop_words: None,
226 analyzer: None,
227 minimum_should_match: None,
228 fail_on_unsupported_field: None,
229 boost_terms: None,
230 include: None,
231 boost: None,
232 _name: None,
233 }
234 }
235}
236
237impl MoreLikeThisQuery {
238 /// A list of fields to fetch and analyze the text from.
239 /// Defaults to the index.query.default_field index setting, which has a default value of *.
240 /// The * value matches all fields eligible for `term-level queries`, excluding metadata fields.
241 pub fn fields<I>(mut self, fields: I) -> Self
242 where
243 I: IntoIterator,
244 I::Item: ToString,
245 {
246 self.fields = Some(fields.into_iter().map(|x| x.to_string()).collect());
247 self
248 }
249
250 /// The unlike parameter is used in conjunction with like in order not to select terms found in a chosen set of documents.
251 /// In other words, we could ask for documents like: "Apple", but unlike: "cake crumble tree". The syntax is the same as like.
252 pub fn unlike<I>(mut self, unlike: I) -> Self
253 where
254 I: IntoIterator,
255 I::Item: Into<Like>,
256 {
257 self.unlike = Some(unlike.into_iter().map(Into::into).collect());
258 self
259 }
260
261 /// The maximum number of query terms that will be selected.
262 /// Increasing this value gives greater accuracy at the expense of query execution speed.
263 /// Defaults to 25.
264 pub fn max_query_terms(mut self, max_query_terms: i64) -> Self {
265 self.max_query_terms = Some(max_query_terms);
266 self
267 }
268
269 /// The minimum term frequency below which the terms will be ignored from the input document.
270 /// Defaults to 2.
271 pub fn min_term_freq(mut self, min_term_freq: i64) -> Self {
272 self.min_term_freq = Some(min_term_freq);
273 self
274 }
275
276 /// The minimum document frequency below which the terms will be ignored from the input document.
277 /// Defaults to 5.
278 pub fn min_doc_freq(mut self, min_doc_freq: i64) -> Self {
279 self.min_doc_freq = Some(min_doc_freq);
280 self
281 }
282
283 /// The maximum document frequency above which the terms will be ignored from the input document.
284 /// This could be useful in order to ignore highly frequent words such as stop words.
285 /// Defaults to unbounded (Integer.MAX_VALUE, which is 2^31-1 or 2147483647).
286 pub fn max_doc_freq(mut self, max_doc_freq: i64) -> Self {
287 self.max_doc_freq = Some(max_doc_freq);
288 self
289 }
290
291 /// The minimum word length below which the terms will be ignored. Defaults to 0.
292 pub fn min_word_length(mut self, min_word_length: i64) -> Self {
293 self.min_word_length = Some(min_word_length);
294 self
295 }
296
297 /// The maximum word length above which the terms will be ignored. Defaults to unbounded (0).
298 pub fn max_word_length(mut self, max_word_length: i64) -> Self {
299 self.max_word_length = Some(max_word_length);
300 self
301 }
302
303 /// An array of stop words. Any word in this set is considered "uninteresting" and ignored.
304 /// If the analyzer allows for stop words, you might want to tell MLT to explicitly ignore them,
305 /// as for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
306 pub fn stop_words<T>(mut self, stop_words: T) -> Self
307 where
308 T: IntoIterator,
309 T::Item: ToString,
310 {
311 self.stop_words = Some(stop_words.into_iter().map(|x| x.to_string()).collect());
312 self
313 }
314
315 /// The analyzer that is used to analyze the free form text.
316 /// Defaults to the analyzer associated with the first field in `fields`.
317 pub fn analyzer<T>(mut self, analyzer: T) -> Self
318 where
319 T: ToString,
320 {
321 self.analyzer = Some(analyzer.to_string());
322 self
323 }
324
325 /// After the disjunctive query has been formed, this parameter controls the number of terms that must match.
326 /// The syntax is the same as the `minimum should match`. (Defaults to "30%").
327 pub fn minimum_should_match<T>(mut self, minimum_should_match: T) -> Self
328 where
329 T: ToString,
330 {
331 self.minimum_should_match = Some(minimum_should_match.to_string());
332 self
333 }
334
335 /// Controls whether the query should fail (throw an exception) if any of the specified fields are not of the supported types (text or keyword).
336 /// Set this to false to ignore the field and continue processing. Defaults to true.
337 pub fn fail_on_unsupported_field(mut self, fail_on_unsupported_field: bool) -> Self {
338 self.fail_on_unsupported_field = Some(fail_on_unsupported_field);
339 self
340 }
341
342 /// Each term in the formed query could be further boosted by their tf-idf score. This sets the boost factor to use when using this feature.
343 /// Defaults to deactivated (0). Any other positive value activates terms boosting with the given boost factor.
344 pub fn boost_terms<T>(mut self, boost_terms: T) -> Self
345 where
346 T: Into<f64>,
347 {
348 self.boost_terms = Some(boost_terms.into());
349 self
350 }
351
352 /// Specifies whether the input documents should also be included in the search results returned. Defaults to `false`.
353 pub fn include(mut self, include: bool) -> Self {
354 self.include = Some(include);
355 self
356 }
357
358 add_boost_and_name!();
359}
360
361impl ShouldSkip for MoreLikeThisQuery {
362 fn should_skip(&self) -> bool {
363 self.like.is_empty()
364 }
365}
366
// Wraps serialization under the `"more_like_this"` root key; works together with
// the `#[serde(remote = "Self")]` attribute on `MoreLikeThisQuery`.
serialize_with_root!("more_like_this": MoreLikeThisQuery);
368
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn serialization() {
        // String `like` + single field: only `fields` and `like` should serialize.
        assert_serialize_query(
            Query::more_like_this(["test"]).fields(["title"]),
            json!({
                "more_like_this": {
                    "fields": ["title"],
                    "like": [
                        "test"
                    ]
                }
            }),
        );

        // String `like` with the optional numeric parameters, boost, and name set.
        assert_serialize_query(
            Query::more_like_this(["test"])
                .fields(["title", "description"])
                .min_term_freq(1)
                .max_query_terms(12)
                .boost(1.2)
                .name("more_like_this"),
            json!({
                "more_like_this": {
                    "fields": ["title", "description"],
                    "like": [
                        "test"
                    ],
                    "min_term_freq": 1,
                    "max_query_terms": 12,
                    "boost": 1.2,
                    "_name": "more_like_this"
                }
            }),
        );
        // Document `like`: serializes as an object carrying only `_id`.
        assert_serialize_query(
            Query::more_like_this([Document::new("123")]).fields(["title"]),
            json!({
                "more_like_this": {
                    "fields": ["title"],
                    "like": [
                        {
                            "_id": "123"
                        }
                    ]
                }
            }),
        );

        // Document `like` combined with the optional query parameters.
        assert_serialize_query(
            Query::more_like_this([Document::new("123")])
                .fields(["title", "description"])
                .min_term_freq(1)
                .max_query_terms(12)
                .boost(1.2)
                .name("more_like_this"),
            json!({
                "more_like_this": {
                    "fields": ["title", "description"],
                    "like": [
                        {
                            "_id": "123"
                        }
                    ],
                    "min_term_freq": 1,
                    "max_query_terms": 12,
                    "boost": 1.2,
                    "_name": "more_like_this"
                }
            }),
        );
        // Mixed `like`: untagged enum keeps document objects and bare strings side by side.
        assert_serialize_query(
            Query::more_like_this([Like::from(Document::new("123")), Like::from("test")])
                .fields(["title"]),
            json!({
                "more_like_this": {
                    "fields": ["title"],
                    "like": [
                        {
                            "_id": "123"
                        },
                        "test"
                    ]
                }
            }),
        );

        // Fully-specified document (`_index`, `_routing`, `_source`) mixed with a string.
        assert_serialize_query(
            Query::more_like_this([
                Like::from(
                    Document::new("123")
                        .index("test_index")
                        .routing("test_routing")
                        .source(false),
                ),
                Like::from("test"),
            ])
            .fields(["title", "description"])
            .min_term_freq(1)
            .max_query_terms(12)
            .boost(1.2)
            .name("more_like_this"),
            json!({
                "more_like_this": {
                    "fields": ["title", "description"],
                    "like": [
                        {
                            "_id": "123",
                            "_index": "test_index",
                            "_routing": "test_routing",
                            "_source": false
                        },
                        "test"
                    ],
                    "min_term_freq": 1,
                    "max_query_terms": 12,
                    "boost": 1.2,
                    "_name": "more_like_this"
                }
            }),
        );
    }
}
494}