Struct tantivy::collector::TopDocs [−][src]
The TopDocs
collector keeps track of the top K
documents
sorted by their score.
The implementation is based on a BinaryHeap
.
The theoretical complexity for collecting the top K
out of n
documents
is O(n log K)
.
This collector guarantees a stable sorting in case of a tie on the document score. As such, it is suitable to implement pagination.
use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::{Schema, TEXT}; use tantivy::{doc, DocAddress, Index}; let mut schema_builder = Schema::builder(); let title = schema_builder.add_text_field("title", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); index_writer.add_document(doc!(title => "The Name of the Wind")); index_writer.add_document(doc!(title => "The Diary of Muadib")); index_writer.add_document(doc!(title => "A Dairy Cow")); index_writer.add_document(doc!(title => "The Diary of a Young Girl")); assert!(index_writer.commit().is_ok()); let reader = index.reader().unwrap(); let searcher = reader.searcher(); let query_parser = QueryParser::for_index(&index, vec![title]); let query = query_parser.parse_query("diary").unwrap(); let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap(); assert_eq!(top_docs[0].1, DocAddress(0, 1)); assert_eq!(top_docs[1].1, DocAddress(0, 3));
Implementations
impl TopDocs
[src]
pub fn with_limit(limit: usize) -> TopDocs
[src]
Creates a top score collector, with a number of documents equal to "limit".
Panics
The method panics if limit is 0
pub fn and_offset(self, offset: usize) -> TopDocs
[src]
Skip the first "offset" documents when collecting.
This is equivalent to OFFSET
in MySQL or PostgreSQL and start
in
Lucene's TopDocsCollector.
Example
use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::{Schema, TEXT}; use tantivy::{doc, DocAddress, Index}; let mut schema_builder = Schema::builder(); let title = schema_builder.add_text_field("title", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); index_writer.add_document(doc!(title => "The Name of the Wind")); index_writer.add_document(doc!(title => "The Diary of Muadib")); index_writer.add_document(doc!(title => "A Dairy Cow")); index_writer.add_document(doc!(title => "The Diary of a Young Girl")); index_writer.add_document(doc!(title => "The Diary of Lena Mukhina")); assert!(index_writer.commit().is_ok()); let reader = index.reader().unwrap(); let searcher = reader.searcher(); let query_parser = QueryParser::for_index(&index, vec![title]); let query = query_parser.parse_query("diary").unwrap(); let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap(); assert_eq!(top_docs.len(), 2); assert_eq!(top_docs[0].1, DocAddress(0, 4)); assert_eq!(top_docs[1].1, DocAddress(0, 3));
pub fn order_by_u64_field(
self,
field: Field
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>>
[src]
self,
field: Field
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>>
Set top-K to rank documents by a given fast field.
If the field is not a fast field or does not exist, this method returns successfully (it is not aware of any schema). An error will be returned at the moment of search.
If the field is a FAST field but not a u64 field, search will return successfully, but it will return a monotonic u64-representation (i.e. the order is still correct) of the requested field type.
Example
use tantivy::Searcher; use tantivy::collector::TopDocs; use tantivy::schema::Field; /// Searches the document matching the given query, and /// collects the top 10 documents, order by the u64-`field` /// given in argument. fn docs_sorted_by_rating(searcher: &Searcher, query: &dyn Query, rating_field: Field) -> tantivy::Result<Vec<(u64, DocAddress)>> { // This is where we build our topdocs collector // // Note the `rating_field` needs to be a FAST field here. let top_books_by_rating = TopDocs ::with_limit(10) .order_by_u64_field(rating_field); // ... and here are our documents. Note this is a simple vec. // The `u64` in the pair is the value of our fast field for // each documents. // // The vec is sorted decreasingly by `sort_by_field`, and has a // length of 10, or less if not enough documents matched the // query. let resulting_docs: Vec<(u64, DocAddress)> = searcher.search(query, &top_books_by_rating)?; Ok(resulting_docs) }
See also
To comfortably work with u64
s, i64
s, f64
s, or date
s, please refer to
.order_by_fast_field(...) method.
pub fn order_by_fast_field<TFastValue>(
self,
fast_field: Field
) -> impl Collector<Fruit = Vec<(TFastValue, DocAddress)>> where
TFastValue: FastValue + 'static,
[src]
self,
fast_field: Field
) -> impl Collector<Fruit = Vec<(TFastValue, DocAddress)>> where
TFastValue: FastValue + 'static,
Set top-K to rank documents by a given fast field.
If the field is not a fast field, or its field type does not match the generic type, this method does not panic, but an explicit error will be returned at the moment of collection.
Note that this method is generic. The requested fast field type will often be inferred in your code by the Rust compiler.
Implementation-wise, for performance reasons, tantivy will manipulate the u64 representation of your fast field until the last moment.
Example
use tantivy::Searcher; use tantivy::collector::TopDocs; use tantivy::schema::Field; /// Searches the document matching the given query, and /// collects the top 10 documents, order by the u64-`field` /// given in argument. fn docs_sorted_by_revenue(searcher: &Searcher, query: &dyn Query, revenue_field: Field) -> tantivy::Result<Vec<(i64, DocAddress)>> { // This is where we build our topdocs collector // // Note the generics parameter that needs to match the // type `sort_by_field`. revenue_field here is a FAST i64 field. let top_company_by_revenue = TopDocs ::with_limit(2) .order_by_fast_field(revenue_field); // ... and here are our documents. Note this is a simple vec. // The `i64` in the pair is the value of our fast field for // each documents. // // The vec is sorted decreasingly by `sort_by_field`, and has a // length of 10, or less if not enough documents matched the // query. let resulting_docs: Vec<(i64, DocAddress)> = searcher.search(query, &top_company_by_revenue)?; Ok(resulting_docs) }
pub fn tweak_score<TScore, TScoreSegmentTweaker, TScoreTweaker>(
self,
score_tweaker: TScoreTweaker
) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>> where
TScore: 'static + Send + Sync + Clone + PartialOrd,
TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker> + Send + Sync,
[src]
self,
score_tweaker: TScoreTweaker
) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>> where
TScore: 'static + Send + Sync + Clone + PartialOrd,
TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker> + Send + Sync,
Ranks the documents using a custom score.
This method offers a convenient way to tweak or replace
the document's score. As suggested by the prototype you can
manually define your own ScoreTweaker
and pass it as an argument, but there is a much simpler way to
tweak your score: you can use a closure as in the following
example.
Example
Typically, you will want to rely on one or more fast fields,
to alter the original relevance Score
.
For instance, in the following, we assume that we are implementing
an e-commerce website that has a fast field called popularity
that rates how often a product is typically bought by users.
In the following example we will tweak our ranking a bit by boosting popular products a notch.
In a more serious application, this tweaking could involve running a learning-to-rank model over various features.
use tantivy::SegmentReader; use tantivy::collector::TopDocs; use tantivy::schema::Field; fn create_schema() -> Schema { let mut schema_builder = Schema::builder(); schema_builder.add_text_field("product_name", TEXT); schema_builder.add_u64_field("popularity", FAST); schema_builder.build() } fn create_index() -> tantivy::Result<Index> { let schema = create_schema(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; let product_name = index.schema().get_field("product_name").unwrap(); let popularity: Field = index.schema().get_field("popularity").unwrap(); index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64)); index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64)); index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64)); index_writer.commit()?; Ok(index) } let index = create_index().unwrap(); let product_name = index.schema().get_field("product_name").unwrap(); let popularity: Field = index.schema().get_field("popularity").unwrap(); let user_query_str = "diary"; let query_parser = QueryParser::for_index(&index, vec![product_name]); let query = query_parser.parse_query(user_query_str).unwrap(); // This is where we build our collector with our custom score. let top_docs_by_custom_score = TopDocs ::with_limit(10) .tweak_score(move |segment_reader: &SegmentReader| { // The argument is a function that returns our scoring // function. // // The point of this "mother" function is to gather all // of the segment level information we need for scoring. // Typically, fast_fields. // // In our case, we will get a reader for the popularity // fast field. let popularity_reader = segment_reader.fast_fields().u64(popularity).unwrap(); // We can now define our actual scoring function move |doc: DocId, original_score: Score| { let popularity: u64 = popularity_reader.get(doc); // Well.. 
For the sake of the example we use a simple logarithm // function. let popularity_boost_score = ((2u64 + popularity) as Score).log2(); popularity_boost_score * original_score } }); let reader = index.reader().unwrap(); let searcher = reader.searcher(); // ... and here are our documents. Note this is a simple vec. // The `Score` in the pair is our tweaked score. let resulting_docs: Vec<(Score, DocAddress)> = searcher.search(&query, &top_docs_by_custom_score).unwrap();
See also
pub fn custom_score<TScore, TCustomSegmentScorer, TCustomScorer>(
self,
custom_score: TCustomScorer
) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>> where
TScore: 'static + Send + Sync + Clone + PartialOrd,
TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer> + Send + Sync,
[src]
self,
custom_score: TCustomScorer
) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>> where
TScore: 'static + Send + Sync + Clone + PartialOrd,
TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer> + Send + Sync,
Ranks the documents using a custom score.
This method offers a convenient way to use a different score.
As suggested by the prototype you can manually define your
own CustomScorer
and pass it as an argument, but there is a much simpler way to
tweak your score: you can use a closure as in the following
example.
Limitation
This method only makes it possible to compute the score from a given
DocId
, fastfield values for the doc and any information you could
have precomputed beforehand. It does not make it possible for instance
to compute something like TfIdf as it does not have access to the list of query
terms present in the document, nor the term frequencies for the different terms.
It can be used if your search engine relies on a learning-to-rank model for instance, which does not rely on the term frequencies or positions as features.
Example
use tantivy::SegmentReader; use tantivy::collector::TopDocs; use tantivy::schema::Field; let popularity: Field = index.schema().get_field("popularity").unwrap(); let boosted: Field = index.schema().get_field("boosted").unwrap(); // ... // This is where we build our collector with our custom score. let top_docs_by_custom_score = TopDocs ::with_limit(10) .custom_score(move |segment_reader: &SegmentReader| { // The argument is a function that returns our scoring // function. // // The point of this "mother" function is to gather all // of the segment level information we need for scoring. // Typically, fast_fields. // // In our case, we will get a reader for the popularity // fast field and a boosted field. // // We want to get boosted items score, and when we get // a tie, return the item with the highest popularity. // // Note that this is implemented by using a `(u64, u64)` // as a score. let popularity_reader = segment_reader.fast_fields().u64(popularity).unwrap(); let boosted_reader = segment_reader.fast_fields().u64(boosted).unwrap(); // We can now define our actual scoring function move |doc: DocId| { let popularity: u64 = popularity_reader.get(doc); let boosted: u64 = boosted_reader.get(doc); // Score do not have to be `f64` in tantivy. // Here we return a couple to get lexicographical order // for free. (boosted, popularity) } }); // ... and here are our documents. Note this is a simple vec. // The `Score` in the pair is our tweaked score. let resulting_docs: Vec<((u64, u64), DocAddress)> = searcher.search(&*query, &top_docs_by_custom_score)?;
See also
Trait Implementations
impl Collector for TopDocs
[src]
type Fruit = Vec<(Score, DocAddress)>
Fruit
is the type for the result of our collection.
e.g. usize
for the Count
collector. Read more
type Child = TopScoreSegmentCollector
Type of the SegmentCollector
associated to this collector.
fn for_segment(
&self,
segment_local_id: SegmentLocalId,
reader: &SegmentReader
) -> Result<Self::Child>
[src]
&self,
segment_local_id: SegmentLocalId,
reader: &SegmentReader
) -> Result<Self::Child>
fn requires_scoring(&self) -> bool
[src]
fn merge_fruits(
&self,
child_fruits: Vec<Vec<(Score, DocAddress)>>
) -> Result<Self::Fruit>
[src]
&self,
child_fruits: Vec<Vec<(Score, DocAddress)>>
) -> Result<Self::Fruit>
fn collect_segment(
&self,
weight: &dyn Weight,
segment_ord: u32,
reader: &SegmentReader
) -> Result<<Self::Child as SegmentCollector>::Fruit>
[src]
&self,
weight: &dyn Weight,
segment_ord: u32,
reader: &SegmentReader
) -> Result<<Self::Child as SegmentCollector>::Fruit>
impl Debug for TopDocs
[src]
Auto Trait Implementations
impl RefUnwindSafe for TopDocs
[src]
impl Send for TopDocs
[src]
impl Sync for TopDocs
[src]
impl Unpin for TopDocs
[src]
impl UnwindSafe for TopDocs
[src]
Blanket Implementations
impl<T> Any for T where
T: 'static + ?Sized,
[src]
T: 'static + ?Sized,
impl<T> Borrow<T> for T where
T: ?Sized,
[src]
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
[src]
T: ?Sized,
pub fn borrow_mut(&mut self) -> &mut T
[src]
impl<T> Downcast for T where
T: Any,
[src]
T: Any,
pub fn into_any(self: Box<T, Global>) -> Box<dyn Any + 'static, Global>
[src]
pub fn into_any_rc(self: Rc<T>) -> Rc<dyn Any + 'static>
[src]
pub fn as_any(&self) -> &(dyn Any + 'static)
[src]
pub fn as_any_mut(&mut self) -> &mut (dyn Any + 'static)
[src]
impl<T> DowncastSync for T where
T: Send + Sync + Any,
[src]
T: Send + Sync + Any,
impl<T> From<T> for T
[src]
impl<T> Fruit for T where
T: Send + Downcast,
[src]
T: Send + Downcast,
impl<T, U> Into<U> for T where
U: From<T>,
[src]
U: From<T>,
impl<T> Pointable for T
pub const ALIGN: usize
type Init = T
The type for initializers.
pub unsafe fn init(init: <T as Pointable>::Init) -> usize
pub unsafe fn deref<'a>(ptr: usize) -> &'a T
pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T
pub unsafe fn drop(ptr: usize)
impl<T, U> TryFrom<U> for T where
U: Into<T>,
[src]
U: Into<T>,
type Error = Infallible
The type returned in the event of a conversion error.
pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>
[src]
impl<T, U> TryInto<U> for T where
U: TryFrom<T>,
[src]
U: TryFrom<T>,
type Error = <U as TryFrom<T>>::Error
The type returned in the event of a conversion error.
pub fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>
[src]
impl<V, T> VZip<V> for T where
V: MultiLane<T>,
V: MultiLane<T>,