List of all items
Structs
- chunking::SlidingWindowChunker
- config::ChunkingStrategy
- config::DenoiserConfig
- config::SamplerConfig
- config::TextRecipe
- config::TripletRecipe
- data::DataRecord
- data::QualityScore
- data::RecordChunk
- data::RecordSection
- data::SampleBatch
- data::SamplePair
- data::SampleTriplet
- data::TextBatch
- data::TextSample
- data::TripletBatch
- heuristics::CapacityTotals
- ingestion::IngestionManager
- ingestion::RecordCache
- ingestion::SourceRefreshStats
- kvp::KvpField
- kvp::KvpPrefixSampler
- kvp::MetaFieldSpec
- kvp::MetaPolicy
- metadata::MetadataKey
- metrics::SourceShare
- metrics::SourceSkew
- preprocessor::backends::denoiser_preprocessor::DenoiserPreprocessor
- sampler::BatchPrefetcher
- sampler::TripletSampler
- source::IndexPermutation
- source::IndexableAdapter
- source::IndexablePager
- source::SourceCursor
- source::SourceSnapshot
- source::backends::csv_source::CsvSource
- source::backends::csv_source::CsvSourceConfig
- source::backends::file_source::FileSource
- source::backends::file_source::FileSourceConfig
- source::backends::in_memory_source::InMemorySource
- source::indexing::file_corpus::FileCorpusIndex
- splits::DeterministicSplitStore
- splits::FileSplitStore
- splits::PersistedSamplerState
- splits::PersistedSplitHashes
- splits::PersistedSplitMeta
- splits::SplitRatios
- tokenizer::WhitespaceTokenizer
Enums
- SamplerError
- config::NegativeStrategy
- config::Selector
- data::ChunkView
- data::PairLabel
- data::SectionRole
- splits::SplitLabel
Traits
- chunking::ChunkingAlgorithm
- preprocessor::TextPreprocessor
- sampler::Sampler
- source::DataSource
- source::IndexableSource
- splits::EpochStateStore
- splits::SamplerStateStore
- splits::SplitStore
- tokenizer::Tokenizer
Functions
- constants::splits::is_reserved_source_id
- hash::derive_epoch_seed
- hash::stable_hash_path
- hash::stable_hash_str
- hash::stable_hash_with
- heuristics::build_derived_text_recipes
- heuristics::estimate_source_split_capacity_from_counts
- heuristics::format_replay_factor
- heuristics::format_u128_with_commas
- heuristics::resolve_text_recipes_for_source
- heuristics::split_counts_for_total
- metadata::build_date_meta_values
- metrics::chunk_distance_relevance_score
- metrics::chunk_proximity_score
- metrics::source_skew
- metrics::window_chunk_distance
- metrics::window_index_proximity
- preprocessor::backends::denoiser_preprocessor::denoise_text
- sampler::chunk_weight
- source::backends::file_source::anchor_context_sections
- source::backends::file_source::default_title_context_triplet_recipes
- source::backends::file_source::taxonomy_from_path
- source::indexing::date_helpers::parse_publication_date_from_folder
- source::indexing::date_helpers::parse_publication_date_from_month_range_folder
- source::indexing::date_helpers::parse_publication_date_from_year_folder
- utils::file_mtime
- utils::file_times
- utils::is_text_file
- utils::make_section
- utils::normalize_inline_whitespace
- utils::platform_newline
- utils::sentences
Type Aliases
- source::backends::file_source::SectionBuilder
- source::backends::file_source::TaxonomyBuilder
- types::CategoryId
- types::GroupKey
- types::HashPart
- types::ItemOrderKey
- types::KvpValue
- types::LogMessage
- types::MetaValue
- types::PathString
- types::RecipeKey
- types::RecordId
- types::Sentence
- types::SourceId
- types::TaxonomyValue
Constants
- constants::cache::FILE_CORPUS_GROUP
- constants::cache::MULTI_SOURCE_DEMO_GROUP
- constants::cache::MULTI_SOURCE_DEMO_STORE_FILENAME
- constants::env_vars::ENV_TRIPLETS_SKIP_LIVE_TESTS
- constants::file_corpus::FILE_INDEX_META_KEY
- constants::file_corpus::FILE_INDEX_PATH_KEY_PREFIX
- constants::file_corpus::FILE_INDEX_READ_BATCH
- constants::file_corpus::FILE_INDEX_STORE_DIR
- constants::file_corpus::SKIP_UNREADABLE_MSG
- constants::heuristics::EFFECTIVE_NEGATIVES_PER_ANCHOR
- constants::heuristics::EFFECTIVE_POSITIVES_PER_ANCHOR
- constants::metadata::METADATA_DELIMITER
- constants::metadata::META_FIELD_DATE
- constants::sampler::ANCHOR_POSITIVE_SWAP_MASK
- constants::sampler::AUTO_INJECTED_LONG_SECTION_CHUNK_PAIR_RECIPE_NAME
- constants::sampler::EPOCH_SEED_OFFSET
- constants::sampler::EXHAUSTION_RETRY_LIMIT
- constants::sampler::NEG_REASON_WRONG_ARTICLE
- constants::sampler::NEG_REASON_WRONG_DATE
- constants::sampler::NEG_REASON_WRONG_QA
- constants::sampler::PREFETCHER_SOURCE_ID
- constants::sampler::PREFETCHER_STOPPED_REASON
- constants::sampler::RECIPE_LABEL_TEXT
- constants::sampler::RECIPE_LABEL_TRIPLETS
- constants::sampler::RECIPE_ORDER_MAX_WEIGHT_MULTIPLIER
- constants::sampler::ROLE_LABEL_ANCHOR
- constants::sampler::ROLE_LABEL_CONTEXT
- constants::sampler::SAME_SELECTOR_PAIR_RETRY_LIMIT
- constants::splits::ALL_SPLITS
- constants::splits::BITCODE_PREFIX
- constants::splits::EPOCH_HASHES_PREFIX
- constants::splits::EPOCH_HASH_RECORD_VERSION
- constants::splits::EPOCH_META_PREFIX
- constants::splits::EPOCH_META_RECORD_VERSION
- constants::splits::EPOCH_RECORD_TOMBSTONE
- constants::splits::EPOCH_STATE_VERSION
- constants::splits::META_KEY
- constants::splits::SAMPLER_STATE_KEY
- constants::splits::SAMPLER_STATE_RECORD_VERSION
- constants::splits::SPLIT_PREFIX
- constants::splits::STORE_VERSION
- source::backends::csv_source::CSV_RECIPE_TEXT_SIMCSE_WRONG_ARTICLE