bosonnlp/
client.rs

1use std::io::{Read, Write};
2use std::iter::FromIterator;
3
4use serde::Serialize;
5use serde::de::DeserializeOwned;
6use serde_json::{self, Value, Map};
7use url::Url;
8use uuid::Uuid;
9use flate2::Compression;
10use flate2::write::GzEncoder;
11use reqwest::Method;
12use reqwest::blocking::Client;
13use reqwest::header::{USER_AGENT, ACCEPT, CONTENT_ENCODING, CONTENT_TYPE};
14
15use errors::*;
16use rep::{Dependency, NamedEntity, Tag, TextCluster, CommentsCluster, ConvertedTime, ClusterContent};
17use task::{ClusterTask, CommentsTask, Task};
18
19
/// Default base URL of the `BosonNLP` API server.
///
/// Used for every client that is not given a custom URL.
const DEFAULT_BOSONNLP_URL: &str = "https://api.bosonnlp.com";
22
23/// [`BosonNLP`](http://bosonnlp.com) REST API 访问的封装
24#[derive(Debug, Clone)]
25pub struct BosonNLP {
26    /// 用于 API 鉴权的 API Token
27    pub token: String,
28    /// 是否压缩大于 10K 的请求体,默认为 true
29    pub compress: bool,
30    /// `BosonNLP` HTTP API 的 URL,默认为 `http://api.bosonnlp.com`
31    bosonnlp_url: String,
32    /// hyper http Client
33    client: Client,
34}
35
36impl Default for BosonNLP {
37    fn default() -> BosonNLP {
38        BosonNLP {
39            token: "".to_string(),
40            compress: true,
41            bosonnlp_url: DEFAULT_BOSONNLP_URL.to_owned(),
42            client: Client::new(),
43        }
44    }
45}
46
47impl BosonNLP {
48    /// 初始化一个新的 `BosonNLP` 实例
49    pub fn new<T: Into<String>>(token: T) -> BosonNLP {
50        BosonNLP {
51            token: token.into(),
52            ..Default::default()
53        }
54    }
55
56    /// 使用自定义参数初始化一个新的 ``BosonNLP`` 实例
57    pub fn with_options<T: Into<String>>(token: T, bosonnlp_url: T, compress: bool) -> BosonNLP {
58        BosonNLP {
59            token: token.into(),
60            compress: compress,
61            bosonnlp_url: bosonnlp_url.into(),
62            ..Default::default()
63        }
64    }
65
66    /// 使用自定义的 reqwest Client 初始化一个新的 ``BosonNLP`` 实例
67    pub fn with_client<T: Into<String>>(token: T, client: Client) -> BosonNLP {
68        BosonNLP {
69            token: token.into(),
70            client: client,
71            ..Default::default()
72        }
73    }
74
75    fn request<D, E>(&self, method: Method, endpoint: &str, params: Vec<(&str, &str)>, data: &E) -> Result<D>
76    where
77        D: DeserializeOwned,
78        E: Serialize,
79    {
80        let url_string = format!("{}{}", self.bosonnlp_url, endpoint);
81        let mut url = Url::parse(&url_string).unwrap();
82        url.query_pairs_mut().extend_pairs(params.into_iter());
83        let mut req = self.client.request(method.clone(), url);
84        req = req.header(
85                USER_AGENT,
86                format!("bosonnlp-rs/{}", env!("CARGO_PKG_VERSION")),
87            )
88            .header(ACCEPT, "application/json")
89            .header("X-Token", self.token.clone());
90        let mut res = if method == Method::POST {
91            let req = req.header(CONTENT_TYPE, "application/json");
92            let body = serde_json::to_vec(data)?;
93            if self.compress && body.len() > 10240 {
94                let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
95                encoder.write_all(&body)?;
96                let compressed = encoder.finish()?;
97                let req = req.header(CONTENT_ENCODING, "gzip");
98                req.body(compressed).send()?
99            } else {
100                req.body(body).send()?
101            }
102        } else {
103            req.send()?
104        };
105        let content_len = res.content_length().unwrap_or(0) as usize;
106        let mut body = String::with_capacity(content_len);
107        res.read_to_string(&mut body)?;
108        let status = res.status();
109        if !status.is_success() {
110            let result: Value = match serde_json::from_str(&body) {
111                Ok(obj) => obj,
112                Err(..) => Value::Object(Map::new()),
113            };
114            let message = match result.get("message") {
115                Some(msg) => msg.as_str().unwrap_or("").to_owned(),
116                None => body,
117            };
118            return Err(
119                Error::Api {
120                    code: status,
121                    reason: message
122                }
123            );
124        }
125        Ok(serde_json::from_str::<D>(&body)?)
126    }
127
128    pub(crate) fn get<D>(&self, endpoint: &str, params: Vec<(&str, &str)>) -> Result<D>
129    where
130        D: DeserializeOwned,
131    {
132        self.request(Method::GET, endpoint, params, &Value::Null)
133    }
134
135    pub(crate) fn post<D, E>(&self, endpoint: &str, params: Vec<(&str, &str)>, data: &E) -> Result<D>
136    where
137        D: DeserializeOwned,
138        E: Serialize,
139    {
140        self.request(Method::POST, endpoint, params, data)
141    }
142
143    /// [情感分析接口](http://docs.bosonnlp.com/sentiment.html)
144    ///
145    /// ``contents``: 需要做情感分析的文本序列
146    ///
147    /// ``model``: 使用不同的语料训练的模型
148    ///
149    /// # 使用示例
150    ///
151    /// ```
152    /// extern crate bosonnlp;
153    ///
154    /// use bosonnlp::BosonNLP;
155    ///
156    /// fn main() {
157    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
158    ///     let rs = nlp.sentiment(&["这家味道还不错"], "food").unwrap();
159    ///     assert_eq!(1, rs.len());
160    /// }
161    /// ```
162    pub fn sentiment<T: AsRef<str>>(&self, contents: &[T], model: &str) -> Result<Vec<(f32, f32)>> {
163        let endpoint = format!("/sentiment/analysis?{}", model);
164        let data = contents.iter().map(|c| c.as_ref()).collect::<Vec<_>>();
165        self.post(&endpoint, vec![], &data)
166    }
167
168    /// [时间转换接口](http://docs.bosonnlp.com/time.html)
169    ///
170    /// ``content``: 需要做时间转换的文本
171    ///
172    /// ``basetime``: 时间描述时的基准时间戳。如果为 ``None`` ,使用服务器当前的GMT+8时间
173    ///
174    /// # 使用示例
175    ///
176    /// ```
177    /// extern crate bosonnlp;
178    ///
179    /// use bosonnlp::BosonNLP;
180    ///
181    /// fn main() {
182    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
183    ///     let time = nlp.convert_time("2013年二月二十八日下午四点三十分二十九秒", None).unwrap();
184    ///     assert_eq!("2013-02-28 16:30:29", &time.timestamp.unwrap());
185    ///     assert_eq!("timestamp", &time.format);
186    /// }
187    /// ```
188    pub fn convert_time<T: AsRef<str>>(&self, content: T, basetime: Option<T>) -> Result<ConvertedTime> {
189        if let Some(base) = basetime {
190            let params = vec![("pattern", content.as_ref()), ("basetime", base.as_ref())];
191            return self.post("/time/analysis", params, &Value::Null);
192        } else {
193            let params = vec![("pattern", content.as_ref())];
194            return self.post("/time/analysis", params, &Value::Null);
195        };
196    }
197
198    /// [新闻分类接口](http://docs.bosonnlp.com/classify.html)
199    ///
200    /// ``contents``: 需要做分类的新闻文本序列
201    ///
202    /// # 使用示例
203    ///
204    /// ```
205    /// extern crate bosonnlp;
206    ///
207    /// use bosonnlp::BosonNLP;
208    ///
209    /// fn main() {
210    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
211    ///     let rs = nlp.classify(&["俄否决安理会谴责叙军战机空袭阿勒颇平民"]).unwrap();
212    ///     assert_eq!(vec![5usize], rs);
213    /// }
214    /// ```
215    pub fn classify<T: AsRef<str>>(&self, contents: &[T]) -> Result<Vec<usize>> {
216        let data = contents.iter().map(|c| c.as_ref()).collect::<Vec<_>>();
217        self.post("/classify/analysis", vec![], &data)
218    }
219
220    /// [语义联想接口](http://docs.bosonnlp.com/suggest.html)
221    ///
222    /// ``word``: 需要做语义联想的词
223    ///
224    /// ``top_k``: 返回结果的条数,最大值可设定为 100
225    ///
226    /// # 使用示例
227    ///
228    /// ```
229    /// extern crate bosonnlp;
230    ///
231    /// use bosonnlp::BosonNLP;
232    ///
233    /// fn main() {
234    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
235    ///     let rs = nlp.suggest("北京", 2).unwrap();
236    ///     assert_eq!(2, rs.len());
237    /// }
238    /// ```
239    pub fn suggest<T: AsRef<str>>(&self, word: T, top_k: usize) -> Result<Vec<(f32, String)>> {
240        self.post(
241            "/suggest/analysis",
242            vec![("top_k", &top_k.to_string())],
243            &word.as_ref(),
244        )
245    }
246
247    /// [关键词提取接口](http://docs.bosonnlp.com/keywords.html)
248    ///
249    /// ``text``: 需要做关键词提取的文本
250    ///
251    /// ``top_k``: 返回结果的条数,最大值可设定为 100
252    ///
253    /// ``segmented``: `text` 是否已经进行了分词,若为 `true` 则不会再对内容进行分词处理
254    ///
255    /// # 使用示例
256    ///
257    /// ```
258    /// extern crate bosonnlp;
259    ///
260    /// use bosonnlp::BosonNLP;
261    ///
262    /// fn main() {
263    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
264    ///     let rs = nlp.keywords("病毒式媒体网站:让新闻迅速蔓延", 2, false).unwrap();
265    ///     assert_eq!(2, rs.len());
266    /// }
267    /// ```
268    pub fn keywords<T: AsRef<str>>(&self, text: T, top_k: usize, segmented: bool) -> Result<Vec<(f32, String)>> {
269        let top_k_str = top_k.to_string();
270        let params = if segmented {
271            vec![("top_k", top_k_str.as_ref()), ("segmented", "1")]
272        } else {
273            vec![("top_k", top_k_str.as_ref())]
274        };
275        self.post("/keywords/analysis", params, &text.as_ref())
276    }
277
278    /// [依存文法分析接口](http://docs.bosonnlp.com/depparser.html)
279    ///
280    /// ``contents``: 需要做依存文法分析的文本序列
281    ///
282    /// # 使用示例
283    ///
284    /// ```
285    /// extern crate bosonnlp;
286    ///
287    /// use bosonnlp::BosonNLP;
288    ///
289    /// fn main() {
290    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
291    ///     let rs = nlp.depparser(&["今天天气好"]).unwrap();
292    ///     assert_eq!(1, rs.len());
293    ///     let dep0 = &rs[0];
294    ///     assert_eq!(vec![2isize, 2isize, -1isize], dep0.head);
295    ///     let rs = nlp.depparser(&["今天天气好", "美好的世界"]).unwrap();
296    ///     assert_eq!(2, rs.len());
297    /// }
298    /// ```
299    pub fn depparser<T: AsRef<str>>(&self, contents: &[T]) -> Result<Vec<Dependency>> {
300        let data = contents.iter().map(|c| c.as_ref()).collect::<Vec<_>>();
301        self.post("/depparser/analysis", vec![], &data)
302    }
303
304    /// [命名实体识别接口](http://docs.bosonnlp.com/ner.html)
305    ///
306    /// ``contents``: 需要做命名实体识别的文本序列
307    ///
308    /// ``sensitivity``: 准确率与召回率之间的平衡。
309    /// 设置成 1 能找到更多的实体,设置成 5 能以更高的精度寻找实体
310    /// 一般设置为 3
311    ///
312    /// ``segmented``: 输入是否已经为分词结果
313    ///
314    /// # 使用示例
315    ///
316    /// ```
317    /// extern crate bosonnlp;
318    ///
319    /// use bosonnlp::BosonNLP;
320    ///
321    /// fn main() {
322    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
323    ///     let rs = nlp.ner(&["成都商报记者 姚永忠"], 2, false).unwrap();
324    ///     assert_eq!(1, rs.len());
325    ///     let rs = nlp.ner(&["成都商报记者 姚永忠", "微软XP操作系统今日正式退休"], 2, false).unwrap();
326    ///     assert_eq!(2, rs.len());
327    /// }
328    /// ```
329    pub fn ner<T: AsRef<str>>(&self, contents: &[T], sensitivity: usize, segmented: bool) -> Result<Vec<NamedEntity>> {
330        let data = contents.iter().map(|c| c.as_ref()).collect::<Vec<_>>();
331        let sensitivity_str = sensitivity.to_string();
332        let params = if segmented {
333            vec![
334                ("sensitivity", sensitivity_str.as_ref()),
335                ("segmented", "1"),
336            ]
337        } else {
338            vec![("sensitivity", sensitivity_str.as_ref())]
339        };
340        self.post("/ner/analysis", params, &data)
341    }
342
343    /// [分词与词性标注接口](http://docs.bosonnlp.com/tag.html)
344    ///
345    /// ``contents``: 需要做分词与词性标注的文本序列
346    ///
347    /// ``space_mode``: 空格保留选项,0-3 有效
348    ///
349    /// ``oov_level``: 枚举强度选项,0-4 有效
350    ///
351    /// ``t2s``: 是否开启繁体转简体
352    ///
353    /// ``special_char_conv``: 是否转化特殊字符,针对回车、Tab 等特殊字符。
354    ///
355    /// # 使用示例
356    ///
357    /// ```
358    /// extern crate bosonnlp;
359    ///
360    /// use bosonnlp::BosonNLP;
361    ///
362    /// fn main() {
363    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
364    ///     let rs = nlp.tag(&["成都商报记者 姚永忠"], 0, 3, false, false).unwrap();
365    ///     assert_eq!(1, rs.len());
366    /// }
367    /// ```
368    pub fn tag<T: AsRef<str>>(
369        &self,
370        contents: &[T],
371        space_mode: usize,
372        oov_level: usize,
373        t2s: bool,
374        special_char_conv: bool,
375    ) -> Result<Vec<Tag>> {
376        let data = contents.iter().map(|c| c.as_ref()).collect::<Vec<_>>();
377        let t2s_str = if t2s { "1" } else { "0" };
378        let special_char_conv_str = if special_char_conv { "1" } else { "0" };
379        let space_mode_str = space_mode.to_string();
380        let oov_level_str = oov_level.to_string();
381        let params = vec![
382            ("space_mode", space_mode_str.as_ref()),
383            ("oov_level", oov_level_str.as_ref()),
384            ("t2s", t2s_str),
385            ("special_char_conv", special_char_conv_str),
386        ];
387        self.post("/tag/analysis", params, &data)
388    }
389
390    /// [新闻摘要接口](http://docs.bosonnlp.com/summary.html)
391    ///
392    /// ``title``: 需要做摘要的新闻标题,如果没有则传入空字符串
393    ///
394    /// ``content``: 需要做摘要的新闻正文
395    ///
396    /// ``word_limit``: 摘要字数限制
397    ///
398    /// ``not_exceed``: 是否严格限制字数
399    ///
400    /// # 使用示例
401    ///
402    /// ```
403    /// extern crate bosonnlp;
404    ///
405    /// use bosonnlp::BosonNLP;
406    ///
407    /// fn main() {
408    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
409    ///     let title = "前优酷土豆技术副总裁黄冬加盟芒果TV任CTO";
410    ///     let content = "腾讯科技讯(刘亚澜)10月22日消息,前优酷土豆技术副总裁黄冬已于日前正式加盟芒果TV,出任CTO一职。";
411    ///     let rs = nlp.summary(title, content, 1.0, false);
412    ///     assert!(rs.is_ok());
413    /// }
414    /// ```
415    pub fn summary<T: Into<String>>(&self, title: T, content: T, word_limit: f32, not_exceed: bool) -> Result<String> {
416        let not_exceed = if not_exceed { 1 } else { 0 };
417        let data = json!({
418            "title": title.into(),
419            "content": content.into(),
420            "percentage": word_limit,
421            "not_exceed": not_exceed
422        });
423        self.post("/summary/analysis", vec![], &data)
424    }
425
426    /// [文本聚类接口](http://docs.bosonnlp.com/cluster.html)
427    ///
428    /// ``task_id``: 唯一的 task_id,话题聚类任务的名字,可由字母和数字组成
429    ///
430    /// ``alpha``: 聚类最大 cluster 大小,一般为 0.8
431    ///
432    /// ``beta``: 聚类平均 cluster 大小,一般为 0.45
433    ///
434    /// ``timeout``: 等待文本聚类任务完成的秒数,一般为 1800 秒
435    ///
436    /// # 使用示例
437    ///
438    /// ```
439    /// extern crate bosonnlp;
440    ///
441    /// use bosonnlp::BosonNLP;
442    ///
443    /// fn main() {
444    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
445    ///     let contents = vec![
446    ///         "今天天气好",
447    ///         "今天天气好",
448    ///         "今天天气不错",
449    ///         "点点楼头细雨",
450    ///         "重重江外平湖",
451    ///         "当年戏马会东徐",
452    ///         "今日凄凉南浦",
453    ///     ];
454    ///     let rs = nlp.cluster(&contents, None, 0.8, 0.45, Some(10)).unwrap();
455    ///     assert_eq!(1, rs.len());
456    /// }
457    /// ```
458    pub fn cluster<T: AsRef<str>>(
459        &self,
460        contents: &[T],
461        task_id: Option<&str>,
462        alpha: f32,
463        beta: f32,
464        timeout: Option<u64>,
465    ) -> Result<Vec<TextCluster>> {
466        let mut task = match task_id {
467            Some(_id) => ClusterTask::new(self, _id),
468            None => {
469                let _id = Uuid::new_v4().to_simple_ref().to_string();
470                ClusterTask::new(self, _id)
471            }
472        };
473        let tasks: Vec<ClusterContent> = Vec::from_iter(contents.iter().map(|c| c.into()));
474        if !task.push(&tasks)? {
475            return Ok(vec![]);
476        }
477        task.analysis(alpha, beta)?;
478        task.wait(timeout)?;
479        let result = task.result()?;
480        task.clear()?;
481        Ok(result)
482    }
483
484    /// [典型意见接口](http://docs.bosonnlp.com/comments.html)
485    ///
486    /// ``task_id``: 唯一的 task_id,典型意见任务的名字,可由字母和数字组成
487    ///
488    /// ``alpha``: 聚类最大 cluster 大小,一般为 0.8
489    ///
490    /// ``beta``: 聚类平均 cluster 大小,一般为 0.45
491    ///
492    /// ``timeout``: 等待典型意见任务完成的秒数,一般为 1800 秒
493    ///
494    /// # 使用示例
495    ///
496    /// ```
497    /// extern crate bosonnlp;
498    ///
499    /// use bosonnlp::BosonNLP;
500    ///
501    /// fn main() {
502    ///     let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
503    ///     let contents = vec![
504    ///         "今天天气好",
505    ///         "今天天气好",
506    ///         "今天天气不错",
507    ///         "点点楼头细雨",
508    ///         "重重江外平湖",
509    ///         "当年戏马会东徐",
510    ///         "今日凄凉南浦",
511    ///         "今天天气好",
512    ///         "今天天气好",
513    ///         "今天天气不错",
514    ///         "点点楼头细雨",
515    ///         "重重江外平湖",
516    ///         "当年戏马会东徐",
517    ///         "今日凄凉南浦",
518    ///     ];
519    ///     let rs = nlp.comments(&contents, None, 0.8, 0.45, Some(10)).unwrap();
520    ///     assert_eq!(4, rs.len());
521    /// }
522    /// ```
523    pub fn comments<T: AsRef<str>>(
524        &self,
525        contents: &[T],
526        task_id: Option<&str>,
527        alpha: f32,
528        beta: f32,
529        timeout: Option<u64>,
530    ) -> Result<Vec<CommentsCluster>> {
531        let mut task = match task_id {
532            Some(_id) => CommentsTask::new(self, _id),
533            None => {
534                let _id = Uuid::new_v4().to_simple_ref().to_string();
535                CommentsTask::new(self, _id)
536            }
537        };
538        let tasks: Vec<ClusterContent> = Vec::from_iter(contents.iter().map(|c| c.into()));
539        if !task.push(&tasks)? {
540            return Ok(vec![]);
541        }
542        task.analysis(alpha, beta)?;
543        task.wait(timeout)?;
544        let result = task.result()?;
545        task.clear()?;
546        Ok(result)
547    }
548}