elasticsearch_dsl/search/queries/specialized/
more_like_this_query.rs

1use crate::search::*;
2use crate::util::*;
3
4/// The More Like This Query finds documents that are "like" a given set of documents.
5/// In order to do so, MLT selects a set of representative terms of these input documents,
6/// forms a query using these terms, executes the query and returns the results.
7/// The user controls the input documents, how the terms should be selected and how the query is formed.
8///
9/// The simplest use case consists of asking for documents that are similar to a provided piece of text.
10/// Here, we are asking for all movies that have some text similar to "Once upon a time"
11/// in their "title" and in their "description" fields, limiting the number of selected terms to 12.
12///
13/// A more complicated use case consists of mixing texts with documents already existing in the index.
14/// In this case, the syntax to specify a document is similar to the one used in the
15/// [Multi GET API](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-multi-get.html).
16///
17/// Finally, users can mix some texts, a chosen set of documents but also provide documents not necessarily present in the index.
18/// To provide documents not present in the index, the syntax is similar to
19/// [artificial documents](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-termvectors.html#docs-termvectors-artificial-doc).
20///
21/// **How it Works**
22/// Suppose we wanted to find all documents similar to a given input document. Obviously, the input document
23/// itself should be its best match for that type of query. And the reason would be mostly,
24/// according to [Lucene scoring formula](https://lucene.apache.org/core/4_9_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html),
25/// due to the terms with the highest tf-idf. Therefore, the terms of the input document that have the highest
26/// tf-idf are good representatives of that document, and could be used within a disjunctive query (or OR) to retrieve similar documents.
27/// The MLT query simply extracts the text from the input document, analyzes it, usually using the same analyzer at the field,
28/// then selects the top K terms with highest tf-idf to form a disjunctive query of these terms.
29///
30/// To create a `more_like_this` query with `like` as a string on title field:
31/// ```
32/// # use elasticsearch_dsl::queries::*;
33/// # use elasticsearch_dsl::queries::params::*;
34/// # let query =
35/// Query::more_like_this(["test"])
36///     .fields(["title"]);
37/// ```
38/// To create a `more_like_this` query with string and document id fields on title and description with optional fields:
39/// ```
40/// # use elasticsearch_dsl::queries::*;
41/// # use elasticsearch_dsl::queries::params::*;
42/// # let query =
43/// Query::more_like_this([Like::from(Document::new("123")), Like::from("test")])
44///     .fields(["title", "description"])
45///     .min_term_freq(1)
46///     .max_query_terms(12)
47///     .boost(1.2)
48///     .name("more_like_this");
49/// ```
50/// <https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-mlt-query.html>
51#[derive(Debug, Clone, PartialEq, Serialize)]
52#[serde(remote = "Self")]
53pub struct MoreLikeThisQuery {
54    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
55    fields: Option<Vec<String>>,
56
57    like: Vec<Like>,
58
59    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
60    unlike: Option<Vec<Like>>,
61
62    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
63    min_term_freq: Option<i64>,
64
65    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
66    max_query_terms: Option<i64>,
67
68    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
69    min_doc_freq: Option<i64>,
70
71    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
72    max_doc_freq: Option<i64>,
73
74    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
75    min_word_length: Option<i64>,
76
77    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
78    max_word_length: Option<i64>,
79
80    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
81    stop_words: Option<Vec<String>>,
82
83    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
84    analyzer: Option<String>,
85
86    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
87    minimum_should_match: Option<String>,
88
89    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
90    fail_on_unsupported_field: Option<bool>,
91
92    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
93    boost_terms: Option<f64>,
94
95    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
96    include: Option<bool>,
97
98    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
99    boost: Option<f32>,
100
101    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
102    _name: Option<String>,
103}
104
105/// Types for `like` and `unlike` fields
106#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
107#[serde(untagged)]
108pub enum Like {
109    /// String/text which will be used in `like` field array
110    String(String),
111
112    /// Struct to describe elasticsearch document which will be used in `like` field array
113    Document(Document),
114}
115
116impl From<String> for Like {
117    fn from(value: String) -> Self {
118        Self::String(value)
119    }
120}
121
122impl<'a> From<&'a str> for Like {
123    fn from(value: &'a str) -> Self {
124        Self::String(value.into())
125    }
126}
127
128impl From<Document> for Like {
129    fn from(value: Document) -> Self {
130        Self::Document(value)
131    }
132}
133
134/// One of `like` and `unlike` types which has like document structure
135#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
136pub struct Document {
137    _id: String,
138
139    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
140    _index: Option<String>,
141
142    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
143    _routing: Option<String>,
144
145    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
146    _source: Option<SourceFilter>,
147
148    #[serde(skip_serializing_if = "ShouldSkip::should_skip")]
149    _stored_fields: StoredFields,
150}
151
152impl Document {
153    /// Creates an instance of [Document](Document)
154    ///
155    /// - `id` - document id as string.
156    pub fn new<T>(id: T) -> Self
157    where
158        T: ToString,
159    {
160        Self {
161            _id: id.to_string(),
162            _stored_fields: Default::default(),
163            _index: None,
164            _routing: None,
165            _source: None,
166        }
167    }
168
169    /// The index that contains the document. Required if no index is specified in the request URI.
170    pub fn index<T>(mut self, index: T) -> Self
171    where
172        T: ToString,
173    {
174        self._index = Some(index.to_string());
175        self
176    }
177
178    /// The key for the primary shard the document resides on. Required if routing is used during indexing.
179    pub fn routing<T>(mut self, routing: T) -> Self
180    where
181        T: ToString,
182    {
183        self._routing = Some(routing.to_string());
184        self
185    }
186
187    /// If `false`, excludes all `_source` fields. Defaults to `true`.
188    pub fn source<T>(mut self, source: T) -> Self
189    where
190        T: Into<SourceFilter>,
191    {
192        self._source = Some(source.into());
193        self
194    }
195
196    /// The stored fields you want to retrieve.
197    pub fn stored_fields<T>(mut self, stored_fields: T) -> Self
198    where
199        T: Into<StoredFields>,
200    {
201        self._stored_fields = stored_fields.into();
202        self
203    }
204}
205
206impl Query {
207    /// Creates an instance of [`MoreLikeThisQuery`]
208    ///
209    /// - `like` - free form text and/or a single or multiple documents.
210    pub fn more_like_this<I>(like: I) -> MoreLikeThisQuery
211    where
212        I: IntoIterator,
213        I::Item: Into<Like>,
214    {
215        MoreLikeThisQuery {
216            like: like.into_iter().map(Into::into).collect(),
217            fields: None,
218            unlike: None,
219            min_term_freq: None,
220            max_query_terms: None,
221            min_doc_freq: None,
222            max_doc_freq: None,
223            min_word_length: None,
224            max_word_length: None,
225            stop_words: None,
226            analyzer: None,
227            minimum_should_match: None,
228            fail_on_unsupported_field: None,
229            boost_terms: None,
230            include: None,
231            boost: None,
232            _name: None,
233        }
234    }
235}
236
237impl MoreLikeThisQuery {
238    /// A list of fields to fetch and analyze the text from.
239    /// Defaults to the index.query.default_field index setting, which has a default value of *.
240    /// The * value matches all fields eligible for `term-level queries`, excluding metadata fields.
241    pub fn fields<I>(mut self, fields: I) -> Self
242    where
243        I: IntoIterator,
244        I::Item: ToString,
245    {
246        self.fields = Some(fields.into_iter().map(|x| x.to_string()).collect());
247        self
248    }
249
250    /// The unlike parameter is used in conjunction with like in order not to select terms found in a chosen set of documents.
251    /// In other words, we could ask for documents like: "Apple", but unlike: "cake crumble tree". The syntax is the same as like.
252    pub fn unlike<I>(mut self, unlike: I) -> Self
253    where
254        I: IntoIterator,
255        I::Item: Into<Like>,
256    {
257        self.unlike = Some(unlike.into_iter().map(Into::into).collect());
258        self
259    }
260
261    /// The maximum number of query terms that will be selected.
262    /// Increasing this value gives greater accuracy at the expense of query execution speed.
263    /// Defaults to 25.
264    pub fn max_query_terms(mut self, max_query_terms: i64) -> Self {
265        self.max_query_terms = Some(max_query_terms);
266        self
267    }
268
269    /// The minimum term frequency below which the terms will be ignored from the input document.
270    /// Defaults to 2.
271    pub fn min_term_freq(mut self, min_term_freq: i64) -> Self {
272        self.min_term_freq = Some(min_term_freq);
273        self
274    }
275
276    /// The minimum document frequency below which the terms will be ignored from the input document.
277    /// Defaults to 5.
278    pub fn min_doc_freq(mut self, min_doc_freq: i64) -> Self {
279        self.min_doc_freq = Some(min_doc_freq);
280        self
281    }
282
283    /// The maximum document frequency above which the terms will be ignored from the input document.
284    /// This could be useful in order to ignore highly frequent words such as stop words.
285    /// Defaults to unbounded (Integer.MAX_VALUE, which is 2^31-1 or 2147483647).
286    pub fn max_doc_freq(mut self, max_doc_freq: i64) -> Self {
287        self.max_doc_freq = Some(max_doc_freq);
288        self
289    }
290
291    /// The minimum word length below which the terms will be ignored. Defaults to 0.
292    pub fn min_word_length(mut self, min_word_length: i64) -> Self {
293        self.min_word_length = Some(min_word_length);
294        self
295    }
296
297    /// The maximum word length above which the terms will be ignored. Defaults to unbounded (0).
298    pub fn max_word_length(mut self, max_word_length: i64) -> Self {
299        self.max_word_length = Some(max_word_length);
300        self
301    }
302
303    /// An array of stop words. Any word in this set is considered "uninteresting" and ignored.
304    /// If the analyzer allows for stop words, you might want to tell MLT to explicitly ignore them,
305    /// as for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
306    pub fn stop_words<T>(mut self, stop_words: T) -> Self
307    where
308        T: IntoIterator,
309        T::Item: ToString,
310    {
311        self.stop_words = Some(stop_words.into_iter().map(|x| x.to_string()).collect());
312        self
313    }
314
315    /// The analyzer that is used to analyze the free form text.
316    /// Defaults to the analyzer associated with the first field in `fields`.
317    pub fn analyzer<T>(mut self, analyzer: T) -> Self
318    where
319        T: ToString,
320    {
321        self.analyzer = Some(analyzer.to_string());
322        self
323    }
324
325    /// After the disjunctive query has been formed, this parameter controls the number of terms that must match.
326    /// The syntax is the same as the `minimum should match`. (Defaults to "30%").
327    pub fn minimum_should_match<T>(mut self, minimum_should_match: T) -> Self
328    where
329        T: ToString,
330    {
331        self.minimum_should_match = Some(minimum_should_match.to_string());
332        self
333    }
334
335    /// Controls whether the query should fail (throw an exception) if any of the specified fields are not of the supported types (text or keyword).
336    /// Set this to false to ignore the field and continue processing. Defaults to true.
337    pub fn fail_on_unsupported_field(mut self, fail_on_unsupported_field: bool) -> Self {
338        self.fail_on_unsupported_field = Some(fail_on_unsupported_field);
339        self
340    }
341
342    /// Each term in the formed query could be further boosted by their tf-idf score. This sets the boost factor to use when using this feature.
343    /// Defaults to deactivated (0). Any other positive value activates terms boosting with the given boost factor.
344    pub fn boost_terms<T>(mut self, boost_terms: T) -> Self
345    where
346        T: Into<f64>,
347    {
348        self.boost_terms = Some(boost_terms.into());
349        self
350    }
351
352    /// Specifies whether the input documents should also be included in the search results returned. Defaults to `false`.
353    pub fn include(mut self, include: bool) -> Self {
354        self.include = Some(include);
355        self
356    }
357
358    add_boost_and_name!();
359}
360
361impl ShouldSkip for MoreLikeThisQuery {
362    fn should_skip(&self) -> bool {
363        self.like.is_empty()
364    }
365}
366
367serialize_with_root!("more_like_this": MoreLikeThisQuery);
368
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the JSON produced for text-only, document-only and mixed `like` inputs,
    /// each with and without the optional parameters.
    #[test]
    fn serialization() {
        // Text `like`, required parameters only.
        let text_query = Query::more_like_this(["test"]).fields(["title"]);
        let text_json = json!({
            "more_like_this": {
                "fields": ["title"],
                "like": ["test"]
            }
        });
        assert_serialize_query(text_query, text_json);

        // Text `like` with optional parameters.
        let text_query_full = Query::more_like_this(["test"])
            .fields(["title", "description"])
            .min_term_freq(1)
            .max_query_terms(12)
            .boost(1.2)
            .name("more_like_this");
        let text_json_full = json!({
            "more_like_this": {
                "fields": ["title", "description"],
                "like": ["test"],
                "min_term_freq": 1,
                "max_query_terms": 12,
                "boost": 1.2,
                "_name": "more_like_this"
            }
        });
        assert_serialize_query(text_query_full, text_json_full);

        // Document `like`, required parameters only.
        let doc_query = Query::more_like_this([Document::new("123")]).fields(["title"]);
        let doc_json = json!({
            "more_like_this": {
                "fields": ["title"],
                "like": [{ "_id": "123" }]
            }
        });
        assert_serialize_query(doc_query, doc_json);

        // Document `like` with optional parameters.
        let doc_query_full = Query::more_like_this([Document::new("123")])
            .fields(["title", "description"])
            .min_term_freq(1)
            .max_query_terms(12)
            .boost(1.2)
            .name("more_like_this");
        let doc_json_full = json!({
            "more_like_this": {
                "fields": ["title", "description"],
                "like": [{ "_id": "123" }],
                "min_term_freq": 1,
                "max_query_terms": 12,
                "boost": 1.2,
                "_name": "more_like_this"
            }
        });
        assert_serialize_query(doc_query_full, doc_json_full);

        // Mixed document and text `like`, required parameters only.
        let mixed_like = [Like::from(Document::new("123")), Like::from("test")];
        let mixed_query = Query::more_like_this(mixed_like).fields(["title"]);
        let mixed_json = json!({
            "more_like_this": {
                "fields": ["title"],
                "like": [{ "_id": "123" }, "test"]
            }
        });
        assert_serialize_query(mixed_query, mixed_json);

        // Mixed `like` with a fully described document and optional parameters.
        let described_document = Document::new("123")
            .index("test_index")
            .routing("test_routing")
            .source(false);
        let mixed_query_full =
            Query::more_like_this([Like::from(described_document), Like::from("test")])
                .fields(["title", "description"])
                .min_term_freq(1)
                .max_query_terms(12)
                .boost(1.2)
                .name("more_like_this");
        let mixed_json_full = json!({
            "more_like_this": {
                "fields": ["title", "description"],
                "like": [
                    {
                        "_id": "123",
                        "_index": "test_index",
                        "_routing": "test_routing",
                        "_source": false
                    },
                    "test"
                ],
                "min_term_freq": 1,
                "max_query_terms": 12,
                "boost": 1.2,
                "_name": "more_like_this"
            }
        });
        assert_serialize_query(mixed_query_full, mixed_json_full);
    }
}