bosonnlp/client.rs
1use std::io::{Read, Write};
2use std::iter::FromIterator;
3
4use serde::Serialize;
5use serde::de::DeserializeOwned;
6use serde_json::{self, Value, Map};
7use url::Url;
8use uuid::Uuid;
9use flate2::Compression;
10use flate2::write::GzEncoder;
11use reqwest::Method;
12use reqwest::blocking::Client;
13use reqwest::header::{USER_AGENT, ACCEPT, CONTENT_ENCODING, CONTENT_TYPE};
14
15use errors::*;
16use rep::{Dependency, NamedEntity, Tag, TextCluster, CommentsCluster, ConvertedTime, ClusterContent};
17use task::{ClusterTask, CommentsTask, Task};
18
19
/// Default base URL of the `BosonNLP` API server.
const DEFAULT_BOSONNLP_URL: &str = "https://api.bosonnlp.com";
22
23/// [`BosonNLP`](http://bosonnlp.com) REST API 访问的封装
24#[derive(Debug, Clone)]
25pub struct BosonNLP {
26 /// 用于 API 鉴权的 API Token
27 pub token: String,
28 /// 是否压缩大于 10K 的请求体,默认为 true
29 pub compress: bool,
30 /// `BosonNLP` HTTP API 的 URL,默认为 `http://api.bosonnlp.com`
31 bosonnlp_url: String,
32 /// hyper http Client
33 client: Client,
34}
35
36impl Default for BosonNLP {
37 fn default() -> BosonNLP {
38 BosonNLP {
39 token: "".to_string(),
40 compress: true,
41 bosonnlp_url: DEFAULT_BOSONNLP_URL.to_owned(),
42 client: Client::new(),
43 }
44 }
45}
46
47impl BosonNLP {
48 /// 初始化一个新的 `BosonNLP` 实例
49 pub fn new<T: Into<String>>(token: T) -> BosonNLP {
50 BosonNLP {
51 token: token.into(),
52 ..Default::default()
53 }
54 }
55
56 /// 使用自定义参数初始化一个新的 ``BosonNLP`` 实例
57 pub fn with_options<T: Into<String>>(token: T, bosonnlp_url: T, compress: bool) -> BosonNLP {
58 BosonNLP {
59 token: token.into(),
60 compress: compress,
61 bosonnlp_url: bosonnlp_url.into(),
62 ..Default::default()
63 }
64 }
65
66 /// 使用自定义的 reqwest Client 初始化一个新的 ``BosonNLP`` 实例
67 pub fn with_client<T: Into<String>>(token: T, client: Client) -> BosonNLP {
68 BosonNLP {
69 token: token.into(),
70 client: client,
71 ..Default::default()
72 }
73 }
74
75 fn request<D, E>(&self, method: Method, endpoint: &str, params: Vec<(&str, &str)>, data: &E) -> Result<D>
76 where
77 D: DeserializeOwned,
78 E: Serialize,
79 {
80 let url_string = format!("{}{}", self.bosonnlp_url, endpoint);
81 let mut url = Url::parse(&url_string).unwrap();
82 url.query_pairs_mut().extend_pairs(params.into_iter());
83 let mut req = self.client.request(method.clone(), url);
84 req = req.header(
85 USER_AGENT,
86 format!("bosonnlp-rs/{}", env!("CARGO_PKG_VERSION")),
87 )
88 .header(ACCEPT, "application/json")
89 .header("X-Token", self.token.clone());
90 let mut res = if method == Method::POST {
91 let req = req.header(CONTENT_TYPE, "application/json");
92 let body = serde_json::to_vec(data)?;
93 if self.compress && body.len() > 10240 {
94 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
95 encoder.write_all(&body)?;
96 let compressed = encoder.finish()?;
97 let req = req.header(CONTENT_ENCODING, "gzip");
98 req.body(compressed).send()?
99 } else {
100 req.body(body).send()?
101 }
102 } else {
103 req.send()?
104 };
105 let content_len = res.content_length().unwrap_or(0) as usize;
106 let mut body = String::with_capacity(content_len);
107 res.read_to_string(&mut body)?;
108 let status = res.status();
109 if !status.is_success() {
110 let result: Value = match serde_json::from_str(&body) {
111 Ok(obj) => obj,
112 Err(..) => Value::Object(Map::new()),
113 };
114 let message = match result.get("message") {
115 Some(msg) => msg.as_str().unwrap_or("").to_owned(),
116 None => body,
117 };
118 return Err(
119 Error::Api {
120 code: status,
121 reason: message
122 }
123 );
124 }
125 Ok(serde_json::from_str::<D>(&body)?)
126 }
127
128 pub(crate) fn get<D>(&self, endpoint: &str, params: Vec<(&str, &str)>) -> Result<D>
129 where
130 D: DeserializeOwned,
131 {
132 self.request(Method::GET, endpoint, params, &Value::Null)
133 }
134
135 pub(crate) fn post<D, E>(&self, endpoint: &str, params: Vec<(&str, &str)>, data: &E) -> Result<D>
136 where
137 D: DeserializeOwned,
138 E: Serialize,
139 {
140 self.request(Method::POST, endpoint, params, data)
141 }
142
143 /// [情感分析接口](http://docs.bosonnlp.com/sentiment.html)
144 ///
145 /// ``contents``: 需要做情感分析的文本序列
146 ///
147 /// ``model``: 使用不同的语料训练的模型
148 ///
149 /// # 使用示例
150 ///
151 /// ```
152 /// extern crate bosonnlp;
153 ///
154 /// use bosonnlp::BosonNLP;
155 ///
156 /// fn main() {
157 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
158 /// let rs = nlp.sentiment(&["这家味道还不错"], "food").unwrap();
159 /// assert_eq!(1, rs.len());
160 /// }
161 /// ```
162 pub fn sentiment<T: AsRef<str>>(&self, contents: &[T], model: &str) -> Result<Vec<(f32, f32)>> {
163 let endpoint = format!("/sentiment/analysis?{}", model);
164 let data = contents.iter().map(|c| c.as_ref()).collect::<Vec<_>>();
165 self.post(&endpoint, vec![], &data)
166 }
167
168 /// [时间转换接口](http://docs.bosonnlp.com/time.html)
169 ///
170 /// ``content``: 需要做时间转换的文本
171 ///
172 /// ``basetime``: 时间描述时的基准时间戳。如果为 ``None`` ,使用服务器当前的GMT+8时间
173 ///
174 /// # 使用示例
175 ///
176 /// ```
177 /// extern crate bosonnlp;
178 ///
179 /// use bosonnlp::BosonNLP;
180 ///
181 /// fn main() {
182 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
183 /// let time = nlp.convert_time("2013年二月二十八日下午四点三十分二十九秒", None).unwrap();
184 /// assert_eq!("2013-02-28 16:30:29", &time.timestamp.unwrap());
185 /// assert_eq!("timestamp", &time.format);
186 /// }
187 /// ```
188 pub fn convert_time<T: AsRef<str>>(&self, content: T, basetime: Option<T>) -> Result<ConvertedTime> {
189 if let Some(base) = basetime {
190 let params = vec![("pattern", content.as_ref()), ("basetime", base.as_ref())];
191 return self.post("/time/analysis", params, &Value::Null);
192 } else {
193 let params = vec![("pattern", content.as_ref())];
194 return self.post("/time/analysis", params, &Value::Null);
195 };
196 }
197
198 /// [新闻分类接口](http://docs.bosonnlp.com/classify.html)
199 ///
200 /// ``contents``: 需要做分类的新闻文本序列
201 ///
202 /// # 使用示例
203 ///
204 /// ```
205 /// extern crate bosonnlp;
206 ///
207 /// use bosonnlp::BosonNLP;
208 ///
209 /// fn main() {
210 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
211 /// let rs = nlp.classify(&["俄否决安理会谴责叙军战机空袭阿勒颇平民"]).unwrap();
212 /// assert_eq!(vec![5usize], rs);
213 /// }
214 /// ```
215 pub fn classify<T: AsRef<str>>(&self, contents: &[T]) -> Result<Vec<usize>> {
216 let data = contents.iter().map(|c| c.as_ref()).collect::<Vec<_>>();
217 self.post("/classify/analysis", vec![], &data)
218 }
219
220 /// [语义联想接口](http://docs.bosonnlp.com/suggest.html)
221 ///
222 /// ``word``: 需要做语义联想的词
223 ///
224 /// ``top_k``: 返回结果的条数,最大值可设定为 100
225 ///
226 /// # 使用示例
227 ///
228 /// ```
229 /// extern crate bosonnlp;
230 ///
231 /// use bosonnlp::BosonNLP;
232 ///
233 /// fn main() {
234 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
235 /// let rs = nlp.suggest("北京", 2).unwrap();
236 /// assert_eq!(2, rs.len());
237 /// }
238 /// ```
239 pub fn suggest<T: AsRef<str>>(&self, word: T, top_k: usize) -> Result<Vec<(f32, String)>> {
240 self.post(
241 "/suggest/analysis",
242 vec![("top_k", &top_k.to_string())],
243 &word.as_ref(),
244 )
245 }
246
247 /// [关键词提取接口](http://docs.bosonnlp.com/keywords.html)
248 ///
249 /// ``text``: 需要做关键词提取的文本
250 ///
251 /// ``top_k``: 返回结果的条数,最大值可设定为 100
252 ///
253 /// ``segmented``: `text` 是否已经进行了分词,若为 `true` 则不会再对内容进行分词处理
254 ///
255 /// # 使用示例
256 ///
257 /// ```
258 /// extern crate bosonnlp;
259 ///
260 /// use bosonnlp::BosonNLP;
261 ///
262 /// fn main() {
263 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
264 /// let rs = nlp.keywords("病毒式媒体网站:让新闻迅速蔓延", 2, false).unwrap();
265 /// assert_eq!(2, rs.len());
266 /// }
267 /// ```
268 pub fn keywords<T: AsRef<str>>(&self, text: T, top_k: usize, segmented: bool) -> Result<Vec<(f32, String)>> {
269 let top_k_str = top_k.to_string();
270 let params = if segmented {
271 vec![("top_k", top_k_str.as_ref()), ("segmented", "1")]
272 } else {
273 vec![("top_k", top_k_str.as_ref())]
274 };
275 self.post("/keywords/analysis", params, &text.as_ref())
276 }
277
278 /// [依存文法分析接口](http://docs.bosonnlp.com/depparser.html)
279 ///
280 /// ``contents``: 需要做依存文法分析的文本序列
281 ///
282 /// # 使用示例
283 ///
284 /// ```
285 /// extern crate bosonnlp;
286 ///
287 /// use bosonnlp::BosonNLP;
288 ///
289 /// fn main() {
290 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
291 /// let rs = nlp.depparser(&["今天天气好"]).unwrap();
292 /// assert_eq!(1, rs.len());
293 /// let dep0 = &rs[0];
294 /// assert_eq!(vec![2isize, 2isize, -1isize], dep0.head);
295 /// let rs = nlp.depparser(&["今天天气好", "美好的世界"]).unwrap();
296 /// assert_eq!(2, rs.len());
297 /// }
298 /// ```
299 pub fn depparser<T: AsRef<str>>(&self, contents: &[T]) -> Result<Vec<Dependency>> {
300 let data = contents.iter().map(|c| c.as_ref()).collect::<Vec<_>>();
301 self.post("/depparser/analysis", vec![], &data)
302 }
303
304 /// [命名实体识别接口](http://docs.bosonnlp.com/ner.html)
305 ///
306 /// ``contents``: 需要做命名实体识别的文本序列
307 ///
308 /// ``sensitivity``: 准确率与召回率之间的平衡。
309 /// 设置成 1 能找到更多的实体,设置成 5 能以更高的精度寻找实体
310 /// 一般设置为 3
311 ///
312 /// ``segmented``: 输入是否已经为分词结果
313 ///
314 /// # 使用示例
315 ///
316 /// ```
317 /// extern crate bosonnlp;
318 ///
319 /// use bosonnlp::BosonNLP;
320 ///
321 /// fn main() {
322 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
323 /// let rs = nlp.ner(&["成都商报记者 姚永忠"], 2, false).unwrap();
324 /// assert_eq!(1, rs.len());
325 /// let rs = nlp.ner(&["成都商报记者 姚永忠", "微软XP操作系统今日正式退休"], 2, false).unwrap();
326 /// assert_eq!(2, rs.len());
327 /// }
328 /// ```
329 pub fn ner<T: AsRef<str>>(&self, contents: &[T], sensitivity: usize, segmented: bool) -> Result<Vec<NamedEntity>> {
330 let data = contents.iter().map(|c| c.as_ref()).collect::<Vec<_>>();
331 let sensitivity_str = sensitivity.to_string();
332 let params = if segmented {
333 vec![
334 ("sensitivity", sensitivity_str.as_ref()),
335 ("segmented", "1"),
336 ]
337 } else {
338 vec![("sensitivity", sensitivity_str.as_ref())]
339 };
340 self.post("/ner/analysis", params, &data)
341 }
342
343 /// [分词与词性标注接口](http://docs.bosonnlp.com/tag.html)
344 ///
345 /// ``contents``: 需要做分词与词性标注的文本序列
346 ///
347 /// ``space_mode``: 空格保留选项,0-3 有效
348 ///
349 /// ``oov_level``: 枚举强度选项,0-4 有效
350 ///
351 /// ``t2s``: 是否开启繁体转简体
352 ///
353 /// ``special_char_conv``: 是否转化特殊字符,针对回车、Tab 等特殊字符。
354 ///
355 /// # 使用示例
356 ///
357 /// ```
358 /// extern crate bosonnlp;
359 ///
360 /// use bosonnlp::BosonNLP;
361 ///
362 /// fn main() {
363 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
364 /// let rs = nlp.tag(&["成都商报记者 姚永忠"], 0, 3, false, false).unwrap();
365 /// assert_eq!(1, rs.len());
366 /// }
367 /// ```
368 pub fn tag<T: AsRef<str>>(
369 &self,
370 contents: &[T],
371 space_mode: usize,
372 oov_level: usize,
373 t2s: bool,
374 special_char_conv: bool,
375 ) -> Result<Vec<Tag>> {
376 let data = contents.iter().map(|c| c.as_ref()).collect::<Vec<_>>();
377 let t2s_str = if t2s { "1" } else { "0" };
378 let special_char_conv_str = if special_char_conv { "1" } else { "0" };
379 let space_mode_str = space_mode.to_string();
380 let oov_level_str = oov_level.to_string();
381 let params = vec![
382 ("space_mode", space_mode_str.as_ref()),
383 ("oov_level", oov_level_str.as_ref()),
384 ("t2s", t2s_str),
385 ("special_char_conv", special_char_conv_str),
386 ];
387 self.post("/tag/analysis", params, &data)
388 }
389
390 /// [新闻摘要接口](http://docs.bosonnlp.com/summary.html)
391 ///
392 /// ``title``: 需要做摘要的新闻标题,如果没有则传入空字符串
393 ///
394 /// ``content``: 需要做摘要的新闻正文
395 ///
396 /// ``word_limit``: 摘要字数限制
397 ///
398 /// ``not_exceed``: 是否严格限制字数
399 ///
400 /// # 使用示例
401 ///
402 /// ```
403 /// extern crate bosonnlp;
404 ///
405 /// use bosonnlp::BosonNLP;
406 ///
407 /// fn main() {
408 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
409 /// let title = "前优酷土豆技术副总裁黄冬加盟芒果TV任CTO";
410 /// let content = "腾讯科技讯(刘亚澜)10月22日消息,前优酷土豆技术副总裁黄冬已于日前正式加盟芒果TV,出任CTO一职。";
411 /// let rs = nlp.summary(title, content, 1.0, false);
412 /// assert!(rs.is_ok());
413 /// }
414 /// ```
415 pub fn summary<T: Into<String>>(&self, title: T, content: T, word_limit: f32, not_exceed: bool) -> Result<String> {
416 let not_exceed = if not_exceed { 1 } else { 0 };
417 let data = json!({
418 "title": title.into(),
419 "content": content.into(),
420 "percentage": word_limit,
421 "not_exceed": not_exceed
422 });
423 self.post("/summary/analysis", vec![], &data)
424 }
425
426 /// [文本聚类接口](http://docs.bosonnlp.com/cluster.html)
427 ///
428 /// ``task_id``: 唯一的 task_id,话题聚类任务的名字,可由字母和数字组成
429 ///
430 /// ``alpha``: 聚类最大 cluster 大小,一般为 0.8
431 ///
432 /// ``beta``: 聚类平均 cluster 大小,一般为 0.45
433 ///
434 /// ``timeout``: 等待文本聚类任务完成的秒数,一般为 1800 秒
435 ///
436 /// # 使用示例
437 ///
438 /// ```
439 /// extern crate bosonnlp;
440 ///
441 /// use bosonnlp::BosonNLP;
442 ///
443 /// fn main() {
444 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
445 /// let contents = vec![
446 /// "今天天气好",
447 /// "今天天气好",
448 /// "今天天气不错",
449 /// "点点楼头细雨",
450 /// "重重江外平湖",
451 /// "当年戏马会东徐",
452 /// "今日凄凉南浦",
453 /// ];
454 /// let rs = nlp.cluster(&contents, None, 0.8, 0.45, Some(10)).unwrap();
455 /// assert_eq!(1, rs.len());
456 /// }
457 /// ```
458 pub fn cluster<T: AsRef<str>>(
459 &self,
460 contents: &[T],
461 task_id: Option<&str>,
462 alpha: f32,
463 beta: f32,
464 timeout: Option<u64>,
465 ) -> Result<Vec<TextCluster>> {
466 let mut task = match task_id {
467 Some(_id) => ClusterTask::new(self, _id),
468 None => {
469 let _id = Uuid::new_v4().to_simple_ref().to_string();
470 ClusterTask::new(self, _id)
471 }
472 };
473 let tasks: Vec<ClusterContent> = Vec::from_iter(contents.iter().map(|c| c.into()));
474 if !task.push(&tasks)? {
475 return Ok(vec![]);
476 }
477 task.analysis(alpha, beta)?;
478 task.wait(timeout)?;
479 let result = task.result()?;
480 task.clear()?;
481 Ok(result)
482 }
483
484 /// [典型意见接口](http://docs.bosonnlp.com/comments.html)
485 ///
486 /// ``task_id``: 唯一的 task_id,典型意见任务的名字,可由字母和数字组成
487 ///
488 /// ``alpha``: 聚类最大 cluster 大小,一般为 0.8
489 ///
490 /// ``beta``: 聚类平均 cluster 大小,一般为 0.45
491 ///
492 /// ``timeout``: 等待典型意见任务完成的秒数,一般为 1800 秒
493 ///
494 /// # 使用示例
495 ///
496 /// ```
497 /// extern crate bosonnlp;
498 ///
499 /// use bosonnlp::BosonNLP;
500 ///
501 /// fn main() {
502 /// let nlp = BosonNLP::new(env!("BOSON_API_TOKEN"));
503 /// let contents = vec![
504 /// "今天天气好",
505 /// "今天天气好",
506 /// "今天天气不错",
507 /// "点点楼头细雨",
508 /// "重重江外平湖",
509 /// "当年戏马会东徐",
510 /// "今日凄凉南浦",
511 /// "今天天气好",
512 /// "今天天气好",
513 /// "今天天气不错",
514 /// "点点楼头细雨",
515 /// "重重江外平湖",
516 /// "当年戏马会东徐",
517 /// "今日凄凉南浦",
518 /// ];
519 /// let rs = nlp.comments(&contents, None, 0.8, 0.45, Some(10)).unwrap();
520 /// assert_eq!(4, rs.len());
521 /// }
522 /// ```
523 pub fn comments<T: AsRef<str>>(
524 &self,
525 contents: &[T],
526 task_id: Option<&str>,
527 alpha: f32,
528 beta: f32,
529 timeout: Option<u64>,
530 ) -> Result<Vec<CommentsCluster>> {
531 let mut task = match task_id {
532 Some(_id) => CommentsTask::new(self, _id),
533 None => {
534 let _id = Uuid::new_v4().to_simple_ref().to_string();
535 CommentsTask::new(self, _id)
536 }
537 };
538 let tasks: Vec<ClusterContent> = Vec::from_iter(contents.iter().map(|c| c.into()));
539 if !task.push(&tasks)? {
540 return Ok(vec![]);
541 }
542 task.analysis(alpha, beta)?;
543 task.wait(timeout)?;
544 let result = task.result()?;
545 task.clear()?;
546 Ok(result)
547 }
548}