pub struct PartitionParameters {Show 22 fields
pub coordinates: bool,
pub encoding: Option<String>,
pub extract_image_block_types: Vec<String>,
pub gz_uncompressed_content_type: Option<String>,
pub hi_res_model_name: Option<String>,
pub include_page_breaks: bool,
pub languages: Option<Vec<String>>,
pub output_format: String,
pub skip_infer_table_types: Vec<String>,
pub starting_page_number: Option<i32>,
pub strategy: Strategy,
pub unique_element_ids: bool,
pub xml_keep_tags: bool,
pub chunking_strategy: Option<ChunkingStrategy>,
pub combine_under_n_chars: Option<i32>,
pub include_orig_elements: bool,
pub max_characters: Option<i32>,
pub multipage_sections: bool,
pub new_after_n_chars: Option<i32>,
pub overlap: i32,
pub overlap_all: bool,
pub similarity_threshold: Option<f64>,
}
Fields§
§coordinates: bool
If `True`, return coordinates for each element extracted via OCR. Default: `False`.
encoding: Option<String>
The encoding method used to decode the text input. Default: utf-8
extract_image_block_types: Vec<String>
The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields. Default: [].
gz_uncompressed_content_type: Option<String>
If file is gzipped, use this content type after unzipping.
hi_res_model_name: Option<String>
The name of the inference model used when strategy is hi_res
include_page_breaks: bool
If true, the output will include page breaks if the filetype supports it. Default: false
languages: Option<Vec<String>>
The languages present in the document, for use in partitioning and/or OCR. See the Tesseract documentation for a full list of languages. Default: [].
output_format: String
The format of the response. Supported formats are application/json and text/csv. Default: application/json.
skip_infer_table_types: Vec<String>
The document types for which table extraction should be skipped. Default: [].
starting_page_number: Option<i32>
When a PDF is split into pages before being sent to the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27.
strategy: Strategy
The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto
unique_element_ids: bool
When `True`, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in a database). Otherwise a SHA-256 of element text is used. Default: `False`
xml_keep_tags: bool
If `True`, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to XML documents. Default: false
chunking_strategy: Option<ChunkingStrategy>
Use one of the supported strategies to chunk the returned elements after partitioning. When ‘chunking_strategy’ is not specified, no chunking is performed and any other chunking parameters provided are ignored. Supported strategies: ‘basic’, ‘by_page’, ‘by_similarity’, or ‘by_title’
combine_under_n_chars: Option<i32>
If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500
include_orig_elements: bool
When a chunking strategy is specified, each returned chunk will include the elements consolidated to form that chunk as `.metadata.orig_elements`. Default: true.
max_characters: Option<i32>
If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 500
multipage_sections: bool
If chunking strategy is set, determines if sections can span multiple pages. Default: true
new_after_n_chars: Option<i32>
If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500
overlap: i32
Specifies the length of a string (‘tail’) to be drawn from each chunk and prefixed to the next chunk as a context-preserving mechanism. By default, this only applies to split-chunks where an oversized element is divided into multiple chunks by text-splitting. Default: 0.
overlap_all: bool
When `True`, apply overlap between ‘normal’ chunks formed from whole elements and not subject to text-splitting. Use this with caution as it entails a certain level of ‘pollution’ of otherwise clean semantic chunk boundaries. Default: false.
similarity_threshold: Option<f64>
A value between 0.0 and 1.0 describing the minimum similarity two elements must have to be included in the same chunk. Note that similar elements may be separated to meet chunk-size criteria; this value can only guarantee that two elements with similarity below the threshold will appear in separate chunks.