Struct jieba_rs::TfIdf

source ·
pub struct TfIdf { /* private fields */ }
Expand description

TF-IDF keywords extraction

Requires the tfidf feature to be enabled.

Implementations§

source§

impl TfIdf

Implementation of JiebaKeywordExtract using a TF-IDF dictionary.

This takes the segments produced by Jieba and attempts to extract keywords. Segments are filtered for stopwords and short terms. They are then matched against a loaded dictionary to calculate TF-IDF scores.

source

pub fn new( opt_dict: Option<&mut impl BufRead>, config: KeywordExtractConfig ) -> Self

Creates a TfIdf.

§Examples

New instance with custom idf dictionary.

   use jieba_rs::{TfIdf, KeywordExtractConfig};

   let mut sample_idf = "劳动防护 13.900677652\n\
       生化学 13.900677652\n";
   TfIdf::new(
       Some(&mut sample_idf.as_bytes()),
       KeywordExtractConfig::default());

New instance with module default stop words and no initial IDF dictionary. Dictionary should be loaded later with load_dict() calls.

   use jieba_rs::{TfIdf, KeywordExtractConfig};

   TfIdf::new(
       None::<&mut std::io::Empty>,
       KeywordExtractConfig::default());
source

pub fn load_dict(&mut self, dict: &mut impl BufRead) -> Result<()>

Merges entries from dict into the idf_dict.

   use jieba_rs::{Jieba, KeywordExtract, Keyword, KeywordExtractConfig,
       TfIdf};

   let jieba = Jieba::default();
   let mut init_idf = "生化学 13.900677652\n";

   let mut tfidf = TfIdf::new(
       Some(&mut init_idf.as_bytes()),
       KeywordExtractConfig::default());
   let top_k = tfidf.extract_keywords(&jieba, "生化学不是光化学的,", 3, vec![]);
   assert_eq!(
       top_k,
       vec![
           Keyword { keyword: "不是".to_string(), weight: 4.6335592173333335 },
           Keyword { keyword: "光化学".to_string(), weight: 4.6335592173333335 },
           Keyword { keyword: "生化学".to_string(), weight: 4.6335592173333335 }
       ]
   );

   let mut init_idf = "光化学 99.123456789\n";
   tfidf.load_dict(&mut init_idf.as_bytes());
   let new_top_k = tfidf.extract_keywords(&jieba, "生化学不是光化学的,", 3, vec![]);
   assert_eq!(
       new_top_k,
       vec![
           Keyword { keyword: "不是".to_string(), weight: 33.041152263 },
           Keyword { keyword: "光化学".to_string(), weight: 33.041152263 },
           Keyword { keyword: "生化学".to_string(), weight: 4.6335592173333335 }
       ]
   );
source

pub fn config(&self) -> &KeywordExtractConfig

source

pub fn config_mut(&mut self) -> &mut KeywordExtractConfig

Trait Implementations§

source§

impl Debug for TfIdf

source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
source§

impl Default for TfIdf

TF-IDF keywords extraction.

Requires the tfidf feature to be enabled.

source§

fn default() -> Self

Creates a TfIdf with DEFAULT_STOP_WORDS, the default TF-IDF dictionary, a minimum keyword length of 2 Unicode Scalar Values, and no HMM in segmentation.

source§

impl KeywordExtract for TfIdf

source§

fn extract_keywords( &self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String> ) -> Vec<Keyword>

Uses TF-IDF algorithm to extract the top_k keywords from sentence.

If allowed_pos is not empty, then only terms matching those parts of speech are considered.

§Examples
   use jieba_rs::{Jieba, KeywordExtract, TfIdf};

   let jieba = Jieba::new();
   let keyword_extractor = TfIdf::default();
   let mut top_k = keyword_extractor.extract_keywords(
       &jieba,
       "今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃",
       3,
       vec![],
   );
   assert_eq!(
       top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
       vec!["北京烤鸭", "纽约", "天气"]
   );

   top_k = keyword_extractor.extract_keywords(
       &jieba,
       "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。",
       5,
       vec![],
   );
   assert_eq!(
       top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
       vec!["欧亚", "吉林", "置业", "万元", "增资"]
   );

   top_k = keyword_extractor.extract_keywords(
       &jieba,
       "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。",
       5,
       vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")],
   );
   assert_eq!(
       top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
       vec!["欧亚", "吉林", "置业", "增资", "实现"]
   );

Auto Trait Implementations§

§

impl Freeze for TfIdf

§

impl RefUnwindSafe for TfIdf

§

impl Send for TfIdf

§

impl Sync for TfIdf

§

impl Unpin for TfIdf

§

impl UnwindSafe for TfIdf

Blanket Implementations§

source§

impl<T> Any for T
where T: 'static + ?Sized,

source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
source§

impl<T> Borrow<T> for T
where T: ?Sized,

source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
source§

impl<T> From<T> for T

source§

fn from(t: T) -> T

Returns the argument unchanged.

source§

impl<T, U> Into<U> for T
where U: From<T>,

source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

§

type Error = Infallible

The type returned in the event of a conversion error.
source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.