use fastfield_codecs::MonotonicallyMappableToU64;
use itertools::Itertools;

use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
use super::operation::AddOperation;
use crate::core::Segment;
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::indexer::json_term_writer::index_json_values;
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::{
    compute_table_size, serialize_postings, IndexingContext, IndexingPosition,
    PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DatePrecision, DocId, Document, Opstamp, SegmentComponent};

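/// Computes the initial size, in number of buckets, of the hash table used to
/// store the postings of a single indexing thread.
///
/// A third of the per-thread memory budget is reserved for the table; the
/// capacity returned is the largest power of two (capped at 2^19) whose table
/// fits within that bound.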
fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
    let table_memory_upper_bound = per_thread_memory_budget / 3;
    (10..20)
        .map(|power| 1 << power)
        .take_while(|capacity| compute_table_size(*capacity) < table_memory_upper_bound)
        .last()
        .ok_or_else(|| {
            crate::TantivyError::InvalidArgument(format!(
                "per thread memory budget (={per_thread_memory_budget}) is too small. Raise the \
                 memory budget or lower the number of threads."
            ))
        })
}
38
39fn remap_doc_opstamps(
40 opstamps: Vec<Opstamp>,
41 doc_id_mapping_opt: Option<&DocIdMapping>,
42) -> Vec<Opstamp> {
43 if let Some(doc_id_mapping_opt) = doc_id_mapping_opt {
44 doc_id_mapping_opt
45 .iter_old_doc_ids()
46 .map(|doc| opstamps[doc as usize])
47 .collect()
48 } else {
49 opstamps
50 }
51}
52
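/// A `SegmentWriter` is in charge of building a segment index from a set of
/// added documents.
///
/// Postings are accumulated in an in-memory arena, and the segment is only
/// laid out on disk when [`SegmentWriter::finalize`] is called.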
pub struct SegmentWriter {
    pub(crate) max_doc: DocId,
    pub(crate) ctx: IndexingContext,
    pub(crate) per_field_postings_writers: PerFieldPostingsWriter,
    pub(crate) segment_serializer: SegmentSerializer,
    pub(crate) fast_field_writers: FastFieldsWriter,
    pub(crate) fieldnorms_writer: FieldNormsWriter,
    pub(crate) doc_opstamps: Vec<Opstamp>,
    per_field_text_analyzers: Vec<TextAnalyzer>,
    term_buffer: Term,
    schema: Schema,
}

impl SegmentWriter {
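    /// Creates a new `SegmentWriter`.
    ///
    /// - `memory_budget_in_bytes`: size of the memory arena holding the terms
    ///   and postings recorders of this indexing thread. This is what lets the
    ///   caller express the flushing behavior as a memory limit.
    /// - `segment`: the segment being written.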
    pub fn for_segment(
        memory_budget_in_bytes: usize,
        segment: Segment,
    ) -> crate::Result<SegmentWriter> {
        let schema = segment.schema();
        let tokenizer_manager = segment.index().tokenizers().clone();
        let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
        let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
        let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
        let per_field_text_analyzers = schema
            .fields()
            .map(|(_, field_entry): (_, &FieldEntry)| {
                let text_options = match field_entry.field_type() {
                    FieldType::Str(ref text_options) => text_options.get_indexing_options(),
                    FieldType::JsonObject(ref json_object_options) => {
                        json_object_options.get_text_indexing_options()
                    }
                    _ => None,
                };
                text_options
                    .and_then(|text_index_option| {
                        let tokenizer_name = &text_index_option.tokenizer();
                        tokenizer_manager.get(tokenizer_name)
                    })
                    .unwrap_or_default()
            })
            .collect();
        Ok(SegmentWriter {
            max_doc: 0,
            ctx: IndexingContext::new(table_size),
            per_field_postings_writers,
            fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
            segment_serializer,
            fast_field_writers: FastFieldsWriter::from_schema(&schema),
            doc_opstamps: Vec::with_capacity(1_000),
            per_field_text_analyzers,
            term_buffer: Term::with_capacity(16),
            schema,
        })
    }

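    /// Lays the current content of the `SegmentWriter` on disk, remapping doc
    /// ids if the index is sorted.
    ///
    /// `finalize` consumes the `SegmentWriter`, so that it cannot be used
    /// afterwards. It returns the documents' opstamps in serialized order.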
    pub fn finalize(mut self) -> crate::Result<Vec<u64>> {
        self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
        let mapping: Option<DocIdMapping> = self
            .segment_serializer
            .segment()
            .index()
            .settings()
            .sort_by_field
            .clone()
            .map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
            .transpose()?;
        remap_and_write(
            &self.per_field_postings_writers,
            self.ctx,
            self.fast_field_writers,
            &self.fieldnorms_writer,
            &self.schema,
            self.segment_serializer,
            mapping.as_ref(),
        )?;
        let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
        Ok(doc_opstamps)
    }

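    /// Approximate memory usage of the writer in bytes: the indexing arena
    /// plus the fieldnorm, fast field, and store buffers.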
    pub fn mem_usage(&self) -> usize {
        self.ctx.mem_usage()
            + self.fieldnorms_writer.mem_usage()
            + self.fast_field_writers.mem_usage()
            + self.segment_serializer.mem_usage()
    }

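    /// Indexes the values of a document: values are grouped by field, and each
    /// indexed field is tokenized/encoded into terms that are fed to its
    /// postings writer, recording fieldnorms along the way.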
    fn index_document(&mut self, doc: &Document) -> crate::Result<()> {
        let doc_id = self.max_doc;
        let vals_grouped_by_field = doc
            .field_values()
            .iter()
            .sorted_by_key(|el| el.field())
            .group_by(|el| el.field());
        for (field, field_values) in &vals_grouped_by_field {
            let values = field_values.map(|field_value| field_value.value());
            let field_entry = self.schema.get_field_entry(field);
            let make_schema_error = || {
                crate::TantivyError::SchemaError(format!(
                    "Expected a {:?} for field {:?}",
                    field_entry.field_type().value_type(),
                    field_entry.name()
                ))
            };
            if !field_entry.is_indexed() {
                continue;
            }

            let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx);
            let postings_writer: &mut dyn PostingsWriter =
                self.per_field_postings_writers.get_for_field_mut(field);
            term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field);

            match field_entry.field_type() {
                FieldType::Facet(_) => {
                    for value in values {
                        let facet = value.as_facet().ok_or_else(make_schema_error)?;
                        let facet_str = facet.encoded_str();
                        let mut unordered_term_id_opt = None;
                        FacetTokenizer
                            .token_stream(facet_str)
                            .process(&mut |token| {
                                term_buffer.set_text(&token.text);
                                let unordered_term_id =
                                    postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
                                unordered_term_id_opt = Some(unordered_term_id);
                            });
                        if let Some(unordered_term_id) = unordered_term_id_opt {
                            self.fast_field_writers
                                .get_term_id_writer_mut(field)
                                .expect("writer for facet missing")
                                .add_val(unordered_term_id);
                        }
                    }
                }
                FieldType::Str(_) => {
                    let mut indexing_position = IndexingPosition::default();
                    for value in values {
                        let mut token_stream = match value {
                            Value::PreTokStr(tok_str) => {
                                PreTokenizedStream::from(tok_str.clone()).into()
                            }
                            Value::Str(ref text) => {
                                let text_analyzer =
                                    &self.per_field_text_analyzers[field.field_id() as usize];
                                text_analyzer.token_stream(text)
                            }
                            _ => {
                                continue;
                            }
                        };

                        assert!(term_buffer.is_empty());
                        postings_writer.index_text(
                            doc_id,
                            &mut *token_stream,
                            term_buffer,
                            ctx,
                            &mut indexing_position,
                            self.fast_field_writers.get_term_id_writer_mut(field),
                        );
                    }
                    if field_entry.has_fieldnorms() {
                        self.fieldnorms_writer
                            .record(doc_id, field, indexing_position.num_tokens);
                    }
                }
                FieldType::U64(_) => {
                    let mut num_vals = 0;
                    for value in values {
                        num_vals += 1;
                        let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
                        term_buffer.set_u64(u64_val);
                        postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
                    }
                    if field_entry.has_fieldnorms() {
                        self.fieldnorms_writer.record(doc_id, field, num_vals);
                    }
                }
                FieldType::Date(_) => {
                    let mut num_vals = 0;
                    for value in values {
                        num_vals += 1;
                        let date_val = value.as_date().ok_or_else(make_schema_error)?;
                        term_buffer.set_u64(date_val.truncate(DatePrecision::Seconds).to_u64());
                        postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
                    }
                    if field_entry.has_fieldnorms() {
                        self.fieldnorms_writer.record(doc_id, field, num_vals);
                    }
                }
                FieldType::I64(_) => {
                    let mut num_vals = 0;
                    for value in values {
                        num_vals += 1;
                        let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
                        term_buffer.set_i64(i64_val);
                        postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
                    }
                    if field_entry.has_fieldnorms() {
                        self.fieldnorms_writer.record(doc_id, field, num_vals);
                    }
                }
                FieldType::F64(_) => {
                    let mut num_vals = 0;
                    for value in values {
                        num_vals += 1;
                        let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
                        term_buffer.set_f64(f64_val);
                        postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
                    }
                    if field_entry.has_fieldnorms() {
                        self.fieldnorms_writer.record(doc_id, field, num_vals);
                    }
                }
                FieldType::Bool(_) => {
                    let mut num_vals = 0;
                    for value in values {
                        num_vals += 1;
                        let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
                        term_buffer.set_bool(bool_val);
                        postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
                    }
                    if field_entry.has_fieldnorms() {
                        self.fieldnorms_writer.record(doc_id, field, num_vals);
                    }
                }
                FieldType::Bytes(_) => {
                    let mut num_vals = 0;
                    for value in values {
                        num_vals += 1;
                        let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
                        term_buffer.set_bytes(bytes);
                        postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
                    }
                    if field_entry.has_fieldnorms() {
                        self.fieldnorms_writer.record(doc_id, field, num_vals);
                    }
                }
                FieldType::JsonObject(json_options) => {
                    let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
                    let json_values_it =
                        values.map(|value| value.as_json().ok_or_else(make_schema_error));
                    index_json_values(
                        doc_id,
                        json_values_it,
                        text_analyzer,
                        json_options.is_expand_dots_enabled(),
                        term_buffer,
                        postings_writer,
                        ctx,
                    )?;
                }
                FieldType::IpAddr(_) => {
                    let mut num_vals = 0;
                    for value in values {
                        num_vals += 1;
                        let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
                        term_buffer.set_ip_addr(ip_addr);
                        postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
                    }
                    if field_entry.has_fieldnorms() {
                        self.fieldnorms_writer.record(doc_id, field, num_vals);
                    }
                }
            }
        }
        Ok(())
    }

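    /// Indexes a new document.
    ///
    /// As a user, you should rather use `IndexWriter`'s `add_document`.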
    pub fn add_document(&mut self, add_operation: AddOperation) -> crate::Result<()> {
        let doc = add_operation.document;
        self.doc_opstamps.push(add_operation.opstamp);
        self.fast_field_writers.add_document(&doc)?;
        self.index_document(&doc)?;
        let doc_writer = self.segment_serializer.get_store_writer();
        doc_writer.store(&doc, &self.schema)?;
        self.max_doc += 1;
        Ok(())
    }

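    /// Max doc is
    /// - the number of documents in the segment, assuming there are no deletes,
    /// - the maximum document id (including deleted documents) + 1.
    ///
    /// The `SegmentWriter` does not handle deletes, so `max_doc == num_docs`.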
    pub fn max_doc(&self) -> u32 {
        self.max_doc
    }

    #[allow(dead_code)]
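    /// Number of documents in the segment; deleted documents are not counted,
    /// but the `SegmentWriter` does not handle deletes, so this equals
    /// `max_doc`.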
    pub fn num_docs(&self) -> u32 {
        self.max_doc
    }
}

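/// Serializes the content of the `SegmentWriter`: fieldnorms first, then
/// postings and fast fields, and finally, if a `doc_id_map` is provided,
/// rewrites the docstore in the new doc id order.
///
/// Taking the writers by value (rather than making this a method) sidesteps
/// borrow-checker issues when consuming `self` piecewise in `finalize`.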
fn remap_and_write(
    per_field_postings_writers: &PerFieldPostingsWriter,
    ctx: IndexingContext,
    fast_field_writers: FastFieldsWriter,
    fieldnorms_writer: &FieldNormsWriter,
    schema: &Schema,
    mut serializer: SegmentSerializer,
    doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<()> {
    debug!("remap-and-write");
    if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
        fieldnorms_writer.serialize(fieldnorms_serializer, doc_id_map)?;
    }
    let fieldnorm_data = serializer
        .segment()
        .open_read(SegmentComponent::FieldNorms)?;
    let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
    let term_ord_map = serialize_postings(
        ctx,
        per_field_postings_writers,
        fieldnorm_readers,
        doc_id_map,
        schema,
        serializer.get_postings_serializer(),
    )?;
    debug!("fastfield-serialize");
    fast_field_writers.serialize(
        serializer.get_fast_field_serializer(),
        &term_ord_map,
        doc_id_map,
    )?;

    if let Some(doc_id_map) = doc_id_map {
        debug!("resort-docstore");
        let store_write = serializer
            .segment_mut()
            .open_write(SegmentComponent::Store)?;
        let settings = serializer.segment().index().settings();
        let store_writer = StoreWriter::new(
            store_write,
            settings.docstore_compression,
            settings.docstore_blocksize,
            settings.docstore_compress_dedicated_thread,
        )?;
        let old_store_writer = std::mem::replace(&mut serializer.store_writer, store_writer);
        old_store_writer.close()?;
        let store_read = StoreReader::open(
            serializer
                .segment()
                .open_read(SegmentComponent::TempStore)?,
            1, // a one-block cache is enough for this single sequential pass
        )?;
        for old_doc_id in doc_id_map.iter_old_doc_ids() {
            let doc_bytes = store_read.get_document_bytes(old_doc_id)?;
            serializer.get_store_writer().store_bytes(&doc_bytes)?;
        }
    }

    debug!("serializer-close");
    serializer.close()?;

    Ok(())
}

#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::compute_initial_table_size;
    use crate::collector::Count;
    use crate::directory::RamDirectory;
    use crate::indexer::json_term_writer::JsonTermWriter;
    use crate::postings::TermInfo;
    use crate::query::PhraseQuery;
    use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
    use crate::store::{Compressor, StoreReader, StoreWriter};
    use crate::time::format_description::well_known::Rfc3339;
    use crate::time::OffsetDateTime;
    use crate::tokenizer::{PreTokenizedString, Token};
    use crate::{
        DateTime, Directory, DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED,
    };

    #[test]
    fn test_hashmap_size() {
        assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11);
        assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14);
        assert_eq!(compute_initial_table_size(10_000_000).unwrap(), 1 << 17);
        assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19);
        assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19);
    }

    #[test]
    fn test_prepare_for_store() {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("title", TEXT | STORED);
        let schema = schema_builder.build();
        let mut doc = Document::default();
        let pre_tokenized_text = PreTokenizedString {
            text: String::from("A"),
            tokens: vec![Token {
                offset_from: 0,
                offset_to: 1,
                position: 0,
                text: String::from("A"),
                position_length: 1,
            }],
        };

        doc.add_pre_tokenized_text(text_field, pre_tokenized_text);
        doc.add_text(text_field, "title");

        let path = Path::new("store");
        let directory = RamDirectory::create();
        let store_wrt = directory.open_write(path).unwrap();

        let mut store_writer = StoreWriter::new(store_wrt, Compressor::None, 0, false).unwrap();
        store_writer.store(&doc, &schema).unwrap();
        store_writer.close().unwrap();

        let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
        let doc = reader.get(0).unwrap();

        assert_eq!(doc.field_values().len(), 2);
        assert_eq!(doc.field_values()[0].value().as_text(), Some("A"));
        assert_eq!(doc.field_values()[1].value().as_text(), Some("title"));
    }

    #[test]
    fn test_json_indexing() {
        let mut schema_builder = Schema::builder();
        let json_field = schema_builder.add_json_field("json", STORED | TEXT);
        let schema = schema_builder.build();
        let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
            r#"{
            "toto": "titi",
            "float": -0.2,
            "bool": true,
            "unsigned": 1,
            "signed": -2,
            "complexobject": {
                "field.with.dot": 1
            },
            "date": "1985-04-12T23:20:50.52Z",
            "my_arr": [2, 3, {"my_key": "two tokens"}, 4]
        }"#,
        )
        .unwrap();
        let doc = doc!(json_field=>json_val.clone());
        let index = Index::create_in_ram(schema.clone());
        let mut writer = index.writer_for_tests().unwrap();
        writer.add_document(doc).unwrap();
        writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let doc = searcher
            .doc(DocAddress {
                segment_ord: 0u32,
                doc_id: 0u32,
            })
            .unwrap();
        let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
            &schema.to_json(&doc),
        )
        .unwrap()
        .get("json")
        .unwrap()[0]
        .as_object()
        .unwrap()
        .clone();
        assert_eq!(json_val, serdeser_json_val);
        let segment_reader = searcher.segment_reader(0u32);
        let inv_idx = segment_reader.inverted_index(json_field).unwrap();
        let term_dict = inv_idx.terms();

        let mut term = Term::with_type_and_field(Type::Json, json_field);
        let mut term_stream = term_dict.stream().unwrap();

        let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);

        json_term_writer.push_path_segment("bool");
        json_term_writer.set_fast_value(true);
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.pop_path_segment();
        json_term_writer.push_path_segment("complexobject");
        json_term_writer.push_path_segment("field.with.dot");
        json_term_writer.set_fast_value(1u64);
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.pop_path_segment();
        json_term_writer.pop_path_segment();
        json_term_writer.push_path_segment("date");
        json_term_writer.set_fast_value(DateTime::from_utc(
            OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
        ));
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.pop_path_segment();
        json_term_writer.push_path_segment("float");
        json_term_writer.set_fast_value(-0.2f64);
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.pop_path_segment();
        json_term_writer.push_path_segment("my_arr");
        json_term_writer.set_fast_value(2u64);
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.set_fast_value(3u64);
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.set_fast_value(4u64);
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.push_path_segment("my_key");
        json_term_writer.set_str("tokens");
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.set_str("two");
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.pop_path_segment();
        json_term_writer.pop_path_segment();
        json_term_writer.push_path_segment("signed");
        json_term_writer.set_fast_value(-2i64);
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.pop_path_segment();
        json_term_writer.push_path_segment("toto");
        json_term_writer.set_str("titi");
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

        json_term_writer.pop_path_segment();
        json_term_writer.push_path_segment("unsigned");
        json_term_writer.set_fast_value(1u64);
        assert!(term_stream.advance());
        assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
        assert!(!term_stream.advance());
    }

    #[test]
    fn test_json_tokenized_with_position() {
        let mut schema_builder = Schema::builder();
        let json_field = schema_builder.add_json_field("json", STORED | TEXT);
        let schema = schema_builder.build();
        let mut doc = Document::default();
        let json_val: serde_json::Map<String, serde_json::Value> =
            serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap();
        doc.add_json_object(json_field, json_val);
        let index = Index::create_in_ram(schema);
        let mut writer = index.writer_for_tests().unwrap();
        writer.add_document(doc).unwrap();
        writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let segment_reader = searcher.segment_reader(0u32);
        let inv_index = segment_reader.inverted_index(json_field).unwrap();
        let mut term = Term::with_type_and_field(Type::Json, json_field);
        let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
        json_term_writer.push_path_segment("mykey");
        json_term_writer.set_str("token");
        let term_info = inv_index
            .get_term_info(json_term_writer.term())
            .unwrap()
            .unwrap();
        assert_eq!(
            term_info,
            TermInfo {
                doc_freq: 1,
                postings_range: 2..4,
                positions_range: 2..5
            }
        );
        let mut postings = inv_index
            .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
            .unwrap()
            .unwrap();
        assert_eq!(postings.doc(), 0);
        assert_eq!(postings.term_freq(), 2);
        let mut positions = Vec::new();
        postings.positions(&mut positions);
        assert_eq!(&positions[..], &[1, 2]);
        assert_eq!(postings.advance(), TERMINATED);
    }

    #[test]
    fn test_json_raw_no_position() {
        let mut schema_builder = Schema::builder();
        let json_field = schema_builder.add_json_field("json", STRING);
        let schema = schema_builder.build();
        let json_val: serde_json::Map<String, serde_json::Value> =
            serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap();
        let doc = doc!(json_field=>json_val);
        let index = Index::create_in_ram(schema);
        let mut writer = index.writer_for_tests().unwrap();
        writer.add_document(doc).unwrap();
        writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let segment_reader = searcher.segment_reader(0u32);
        let inv_index = segment_reader.inverted_index(json_field).unwrap();
        let mut term = Term::with_type_and_field(Type::Json, json_field);
        let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
        json_term_writer.push_path_segment("mykey");
        json_term_writer.set_str("two tokens");
        let term_info = inv_index
            .get_term_info(json_term_writer.term())
            .unwrap()
            .unwrap();
        assert_eq!(
            term_info,
            TermInfo {
                doc_freq: 1,
                postings_range: 0..1,
                positions_range: 0..0
            }
        );
        let mut postings = inv_index
            .read_postings(&term, IndexRecordOption::WithFreqs)
            .unwrap()
            .unwrap();
        assert_eq!(postings.doc(), 0);
        assert_eq!(postings.term_freq(), 1);
        let mut positions = Vec::new();
        postings.positions(&mut positions);
        assert_eq!(postings.advance(), TERMINATED);
    }

    #[test]
    fn test_position_overlapping_path() {
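        // Token positions must not bleed across the two values sharing the
        // "mykey.field" path: "happy" should be phrase-adjacent to "hello",
        // and not to "nothello".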
724 let mut schema_builder = Schema::builder();
727 let json_field = schema_builder.add_json_field("json", TEXT);
728 let schema = schema_builder.build();
729 let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
730 r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#,
731 )
732 .unwrap();
733 let doc = doc!(json_field=>json_val);
734 let index = Index::create_in_ram(schema);
735 let mut writer = index.writer_for_tests().unwrap();
736 writer.add_document(doc).unwrap();
737 writer.commit().unwrap();
738 let reader = index.reader().unwrap();
739 let searcher = reader.searcher();
740 let mut term = Term::with_type_and_field(Type::Json, json_field);
741 let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
742 json_term_writer.push_path_segment("mykey");
743 json_term_writer.push_path_segment("field");
744 json_term_writer.set_str("hello");
745 let hello_term = json_term_writer.term().clone();
746 json_term_writer.set_str("nothello");
747 let nothello_term = json_term_writer.term().clone();
748 json_term_writer.set_str("happy");
749 let happy_term = json_term_writer.term().clone();
750 let phrase_query = PhraseQuery::new(vec![hello_term, happy_term.clone()]);
751 assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 1);
752 let phrase_query = PhraseQuery::new(vec![nothello_term, happy_term]);
753 assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0);
754 }
755
    #[test]
    fn test_bug_regression_1629_position_when_array_with_a_field_value_that_does_not_contain_any_token(
    ) {
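        // Regression test for bug #1629: a field value that does not produce
        // any token (the empty string below) used to corrupt the position
        // computation of the following values.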
        let mut schema_builder = Schema::builder();
        let text = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let doc = schema
            .parse_document(r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
            .unwrap();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc).unwrap();
        index_writer.commit().unwrap();

        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let seg_reader = searcher.segment_reader(0);
        let inv_index = seg_reader.inverted_index(text).unwrap();
        let term = Term::from_field_text(text, "aaa");
        let mut postings = inv_index
            .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
            .unwrap()
            .unwrap();
        assert_eq!(postings.doc(), 0u32);
        let mut positions = Vec::new();
        postings.positions(&mut positions);
        assert_eq!(positions, &[2, 5]);
    }

    #[test]
    fn test_multiple_field_value_and_long_tokens() {
        let mut schema_builder = Schema::builder();
        let text = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let mut doc = Document::default();
        let tokens = PreTokenizedString {
            text: "roller-coaster".to_string(),
            tokens: vec![Token {
                offset_from: 0,
                offset_to: 14,
                position: 0,
                text: "rollercoaster".to_string(),
                position_length: 2,
            }],
        };
        doc.add_pre_tokenized_text(text, tokens.clone());
        doc.add_pre_tokenized_text(text, tokens);
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc).unwrap();
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let seg_reader = searcher.segment_reader(0);
        let inv_index = seg_reader.inverted_index(text).unwrap();
        let term = Term::from_field_text(text, "rollercoaster");
        let mut postings = inv_index
            .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
            .unwrap()
            .unwrap();
        assert_eq!(postings.doc(), 0u32);
        let mut positions = Vec::new();
        postings.positions(&mut positions);
        assert_eq!(positions, &[0, 3]);
    }

    #[test]
    fn test_last_token_not_ending_last() {
        let mut schema_builder = Schema::builder();
        let text = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let mut doc = Document::default();
        let tokens = PreTokenizedString {
            text: "contrived-example".to_string(),
            tokens: vec![
                Token {
                    offset_from: 0,
                    offset_to: 14,
                    position: 0,
                    text: "long_token".to_string(),
                    position_length: 3,
                },
                Token {
                    offset_from: 0,
                    offset_to: 14,
                    position: 1,
                    text: "short".to_string(),
                    position_length: 1,
                },
            ],
        };
        doc.add_pre_tokenized_text(text, tokens);
        doc.add_text(text, "hello");
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc).unwrap();
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let seg_reader = searcher.segment_reader(0);
        let inv_index = seg_reader.inverted_index(text).unwrap();
        let term = Term::from_field_text(text, "hello");
        let mut postings = inv_index
            .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
            .unwrap()
            .unwrap();
        assert_eq!(postings.doc(), 0u32);
        let mut positions = Vec::new();
        postings.positions(&mut positions);
        assert_eq!(positions, &[4]);
    }
}