1mod file_platform;
2use file_platform::ReadAtFile;
3
4use std::cmp::Ordering;
5use std::str::from_utf8;
6
7fn read_line_at(f : &ReadAtFile, mut pos : u64)
8 -> Vec<u8>
9{
10 let mut block = vec![];
11 let mut writing_to = 0usize;
12 block.resize(512, 0u8);
13
14 loop
15 {
16 let len = f.read_at(&mut block[writing_to..], pos).unwrap();
17 let truncate = block.iter()
18 .skip(writing_to)
19 .enumerate()
20 .find(|x| *x.1 == b'\n')
21 .map(|x| x.0);
22 if let Some(t) = truncate
23 {
24 block.truncate(writing_to+t);
25 return block;
26 }
27 writing_to += len;
28 pos += len as u64;
29 let newlen = block.len()*2;
30 block.resize(newlen, 0u8);
31 }
32}
33
/// Grammatical category of a sense or of the target of a pointer.
#[derive(Debug, PartialEq)]
pub enum PartOfSpeech {
    /// Decoded from the code `n`.
    Noun,
    /// Decoded from the code `a`.
    Adjective,
    /// Decoded from the code `s`.
    AdjectiveSatellite,
    /// Decoded from the code `v`.
    Verb,
    /// Decoded from the code `r`.
    Adverb,
}

impl PartOfSpeech {
    /// Returns a short abbreviation for this part of speech.
    ///
    /// Both adjective variants share the abbreviation `"adj"`.
    pub fn short(&self) -> &'static str {
        match *self {
            PartOfSpeech::Noun => "n",
            PartOfSpeech::Verb => "v",
            PartOfSpeech::Adverb => "adv",
            PartOfSpeech::Adjective | PartOfSpeech::AdjectiveSatellite => "adj",
        }
    }
}
72
73fn part_of_speech_code_to_part_of_speech(code : &[u8])
74 -> PartOfSpeech
75{
76 match code
77 {
78 b"n" => PartOfSpeech::Noun,
79 b"v" => PartOfSpeech::Verb,
80 b"a" => PartOfSpeech::Adjective,
81 b"s" => PartOfSpeech::AdjectiveSatellite,
82 b"r" => PartOfSpeech::Adverb,
83 _ => panic!("impossible part of speech '{}'", from_utf8(code).unwrap()),
84 }
85}
86
/// Kind of link between two senses, decoded from a pointer symbol.
///
/// Each variant lists the pointer symbol it is decoded from.
#[derive(Debug, PartialEq)]
pub enum Relationship {
    /// Pointer symbol `!`.
    Antonym,
    /// Pointer symbol `@`.
    Hypernym,
    /// Pointer symbol `@i`.
    InstanceHypernym,
    /// Pointer symbol `~`.
    Hyponym,
    /// Pointer symbol `~i`.
    InstanceHyponym,
    /// Pointer symbol `#m`.
    MemberHolonym,
    /// Pointer symbol `#s`.
    SubstanceHolonym,
    /// Pointer symbol `#p`.
    PartHolonym,
    /// Pointer symbol `%m`.
    MemberMeronym,
    /// Pointer symbol `%s`.
    SubstanceMeronym,
    /// Pointer symbol `%p`.
    PartMeronym,
    /// Pointer symbol `=`.
    Attribute,
    /// Pointer symbol `+`.
    DerivationallyRelated,
    /// Pointer symbol `;c`.
    DomainOfTopic,
    /// Pointer symbol `-c`.
    MemberOfTopic,
    /// Pointer symbol `;r`.
    DomainOfRegion,
    /// Pointer symbol `-r`.
    MemberOfRegion,
    /// Pointer symbol `;u`.
    DomainOfUsage,
    /// Pointer symbol `-u`.
    MemberOfUsage,

    /// Pointer symbol `*`.
    Entailment,
    /// Pointer symbol `>`.
    Cause,
    /// Pointer symbol `^`.
    AlsoSee,
    /// Pointer symbol `$`.
    VerbGroup,

    /// Pointer symbol `&`.
    SimilarTo,
    /// Pointer symbol `<`.
    VerbParticiple,

    /// Pointer symbol `\`.
    PertainymOrDerivedFromAdjective,
}

/// Decodes a pointer symbol into the [`Relationship`] it stands for.
///
/// `~i` decodes to [`Relationship::InstanceHyponym`]; it is the inverse of
/// `@i` and must not be reported as an instance hypernym.
///
/// # Panics
///
/// Panics if `code` is not a known pointer symbol. The offending bytes are
/// included in the message (lossily decoded if they are not valid UTF-8).
fn relationship_code_to_relationship(code: &[u8]) -> Relationship {
    match code {
        b"!" => Relationship::Antonym,
        b"@" => Relationship::Hypernym,
        b"@i" => Relationship::InstanceHypernym,
        b"~" => Relationship::Hyponym,
        b"~i" => Relationship::InstanceHyponym,
        b"#m" => Relationship::MemberHolonym,
        b"#s" => Relationship::SubstanceHolonym,
        b"#p" => Relationship::PartHolonym,
        b"%m" => Relationship::MemberMeronym,
        b"%s" => Relationship::SubstanceMeronym,
        b"%p" => Relationship::PartMeronym,
        b"=" => Relationship::Attribute,
        b"+" => Relationship::DerivationallyRelated,
        b";c" => Relationship::DomainOfTopic,
        b"-c" => Relationship::MemberOfTopic,
        b";r" => Relationship::DomainOfRegion,
        b"-r" => Relationship::MemberOfRegion,
        b";u" => Relationship::DomainOfUsage,
        b"-u" => Relationship::MemberOfUsage,
        b"*" => Relationship::Entailment,
        b">" => Relationship::Cause,
        b"^" => Relationship::AlsoSee,
        b"$" => Relationship::VerbGroup,
        b"&" => Relationship::SimilarTo,
        b"<" => Relationship::VerbParticiple,
        b"\\" => Relationship::PertainymOrDerivedFromAdjective,
        _ => panic!(
            "illegal relationship code '{}'",
            String::from_utf8_lossy(code)
        ),
    }
}
182
183
184#[derive(Debug)]
192pub struct Sense<'db>
193{
194 pub part_of_speech : PartOfSpeech,
196 pub gloss : String,
198 pub synonyms : Vec<SenseRef>,
201 pub pointers : Vec<PointerRef<'db>>,
203}
204
205#[derive(Debug)]
210pub struct PointerRef<'db>
211{
212 db : &'db Database,
213 pub relationship : Relationship,
217 pub part_of_speech : PartOfSpeech,
219 offset : u64,
220}
221
222impl<'db> PointerRef<'db>
224{
225 pub fn read(&self) -> Sense<'db>
232 {
233 self.db
234 .dbfile_for_part_of_speech(&self.part_of_speech)
235 .read_sense(self.db, self.offset)
236 }
237}
238
/// A word listed on a sense's data line.
#[derive(Debug)]
pub struct SenseRef {
    /// The word itself.
    pub word: String,
    /// Number stored next to the word on the data line.
    lex_id: u32,
}

// No methods are defined on `SenseRef` yet.
impl SenseRef {}
251
252#[derive(Debug)]
253struct DBFile
254{
255 name : String,
256 index : ReadAtFile,
257 index_size : u64,
258 data : ReadAtFile,
259 part_of_speech : PartOfSpeech,
260}
261
262impl DBFile
263{
264 fn new(
265 part_of_speech : PartOfSpeech,
266 index : &std::path::Path,
267 data : &std::path::Path
268 )
269 -> std::io::Result<DBFile>
270 {
271 let mut index_f = std::fs::File::open(index)?;
272 let data_f = std::fs::File::open(data)?;
273
274 let index_size = std::io::Seek::seek(&mut index_f, std::io::SeekFrom::End(0))?;
275
276 Ok(DBFile
277 {
278 name: index.to_str().unwrap().to_string(),
279 index: ReadAtFile::new(index_f),
280 index_size: index_size,
281 data: ReadAtFile::new(data_f),
282 part_of_speech : part_of_speech,
283 })
284 }
285
286 fn is_found_here(
287 &self,
288 pos : u64,
289 data : &[u8],
290 remaining_word : &[u8]
291 ) -> Ordering
292 {
293 for x in 0..data.len()
294 {
295 if x == remaining_word.len() && data[x] == b' '
296 {
297 return Ordering::Equal;
298 }
299 else if x >= remaining_word.len() || data[x] > remaining_word[x]
300 {
301 return Ordering::Less;
302 }
303 else if data[x] < remaining_word[x]
304 {
305 return Ordering::Greater;
306 }
307 }
308 let block = &mut [0u8;32];
309 let bytes = self.index.read_at(block, pos+data.len() as u64).unwrap();
310 return self.is_found_here(
311 pos + data.len() as u64,
312 &block[0..bytes],
313 &remaining_word[data.len()..]
314 );
315 }
316
317 fn find_position(&self, word : &[u8])
318 -> Option<u64>
319 {
320 let block = &mut [0u8;32];
321
322 let mut end = self.index_size;
323 let mut begin = 0u64;
324 let mut pos = end/2;
325
326 while end-begin > (word.len()+10) as u64
327 {
328 if end-pos < 32
329 {
330 pos = begin;
331 }
332
333 let bytes = self.index.read_at(block, pos).unwrap();
334
335 let block = &block[ 0 .. bytes ];
336
337 if pos == begin
338 {
339 begin += bytes as u64;
340 }
341
342
343 if let Some(newline_offset)
344 = block.iter().enumerate().find(|a| *a.1 == b'\n').map(|x| x.0)
345 {
346 let newline = &block[newline_offset+1..];
347 let current_line_starts_at = pos + newline_offset as u64 + 1;
348 let rel = self.is_found_here(current_line_starts_at, newline, word);
349 match rel
350 {
351 Ordering::Equal => return Some(current_line_starts_at),
352 Ordering::Less =>
353 {
354 end = current_line_starts_at;
355 },
356 Ordering::Greater =>
357 {
358 begin = current_line_starts_at+word.len() as u64;
359 }
360 }
361
362 if begin >= end { break; }
363
364 let newpos = (end-begin)/2 + begin;
365 if newpos == pos
366 {
367 break;
368 }
369 else
370 {
371 pos = newpos;
372 }
373 }
374 else if (pos + bytes as u64) < end
375 {
376 pos += bytes as u64;
377 }
378 else
379 {
380 pos -= std::cmp::min(64, pos);
381 }
382 }
383
384 None
385 }
386
387 fn read_sense<'db>(
388 &self,
389 database : &'db Database,
390 offset : u64
391 ) -> Sense<'db>
392 {
393 let line = read_line_at(&self.data, offset);
394
395 let sections : Vec<_> = line.split(|x| *x == b' ').collect();
396
397 let part_of_speech = part_of_speech_code_to_part_of_speech(sections[2]);
398
399 let mut index = 3;
400
401 let synonyms_cnt =
402 usize::from_str_radix(from_utf8(sections[index]).unwrap(), 16).unwrap();
403 index += 1;
404
405 let mut synonyms = vec!();
406 synonyms.reserve(synonyms_cnt);
407
408 for _sn in 0..synonyms_cnt
409 {
410 synonyms.push(
411 SenseRef
412 {
413 word : from_utf8(sections[index])
414 .unwrap()
415 .chars()
416 .map(|x| if x=='_' { ' ' } else { x })
417 .collect(),
418 lex_id : u32::from_str_radix(from_utf8(sections[index+1]).unwrap(), 16).unwrap(),
419 }
420 );
421 index += 2;
422 }
423 let pointer_count =
424 u32::from_str_radix( from_utf8(sections[index]).unwrap(), 10).unwrap();
425 index+=1;
426
427 let mut pointers = vec!();
428 pointers.reserve(pointer_count as usize);
429
430 for _pointern in 0..pointer_count
431 {
432 let rel = relationship_code_to_relationship(sections[index]);
433 let offset = u64::from_str_radix(
434 from_utf8(sections[index+1]).unwrap(), 10
435 ).unwrap();
436 let part_of_speech = part_of_speech_code_to_part_of_speech(sections[index+2]);
437 let _offset = u64::from_str_radix(
438 from_utf8(sections[index+3]).unwrap(), 16
439 ).unwrap();
440
441 index += 4;
442 pointers.push(
443 PointerRef
444 {
445 db: database,
446 relationship : rel,
447 part_of_speech: part_of_speech,
448 offset : offset,
449 }
450 );
451 }
452
453 if sections[2] == b"v"
454 {
455 let frame_count =
456 usize::from_str_radix(from_utf8(sections[index]).unwrap(), 10).unwrap();
457 index += frame_count + 1;
458 }
459
460 let _ = index;
461
462 let gloss =
463 {
464 let line_utf = from_utf8(&line).unwrap();
465 let gloss = &line_utf[line_utf.find('|').unwrap()+2..];
466 gloss
467 };
468
469 Sense
470 {
471 part_of_speech: part_of_speech,
472 gloss: gloss.to_string(),
473 synonyms: synonyms,
474 pointers: pointers,
475 }
476 }
477
478 fn senses<'db>(&self, database : &'db Database, word : &[u8])
480 -> Option<Vec<Sense<'db>>>
481 {
482 let offset = self.find_position(word);
483 if offset.is_none() { return None; }
484
485 let offset = offset.unwrap();
486 let line = read_line_at(&self.index, offset);
487
488 let line = String::from_utf8(line).unwrap();
489
490 let sections : Vec<&str>= line.split(' ').collect();
491
492 let mut index = 2;
493
494 let synset_cnt : u32 = sections[index].parse().unwrap();
495
496 index += 1;
497 let ptr_symbols_cnt : usize = sections[index].parse().unwrap();
498 index += 1;
499
500 index += ptr_symbols_cnt;
501
502 index += 1; index += 1; let mut senses = vec!();
506 senses.reserve(synset_cnt as usize);
507
508 for synset in 0..synset_cnt
509 {
510 let synset_offset =
511 u64::from_str_radix(sections[index+synset as usize], 10).unwrap();
512 senses.push( self.read_sense( database, synset_offset ) );
513 }
515
516 Some(senses)
517 }
518
519}
520
521
522#[derive(Debug)]
524pub struct Database
525{
526 db_files: Vec<DBFile>,
527}
528
529impl Database
530{
531 pub fn open(path : &std::path::Path)
536 -> std::io::Result<Database>
537 {
538 let mut db = Database { db_files: vec!() };
539
540 for e in std::fs::read_dir(path)?
541 {
542 let entry = e?;
543 let path_buf = entry.path();
544 if path_buf.file_stem().unwrap_or(std::ffi::OsStr::new(""))
545 == std::ffi::OsStr::new("index")
546 {
547 let ex = path_buf.extension().ok_or(std::io::Error::new(
548 std::io::ErrorKind::InvalidData,
549 "file with invalid part of speech".to_string()
550 ))?;
551 let part_of_speech
552 = if ex == "noun"
553 { PartOfSpeech::Noun }
554 else if ex == "verb"
555 { PartOfSpeech::Verb }
556 else if ex == "adv"
557 { PartOfSpeech::Adverb }
558 else if ex == "adj"
559 { PartOfSpeech::Adjective }
560 else
561 {
562 return Err(std::io::Error::new(
563 std::io::ErrorKind::InvalidData,
564 "file with invalid part of speech"
565 ));
566 };
567
568 let mut data_path = path_buf.with_file_name("data");
569 data_path.set_extension(ex);
570 db.db_files.push( DBFile::new(
571 part_of_speech,
572 path_buf.as_path(),
573 data_path.as_path(),
574 )? );
575 }
576 }
577
578 if db.db_files.len() == 0
579 {
580 Err(std::io::Error::new(
581 std::io::ErrorKind::InvalidData,
582 "file with invalid part of speech"
583 ))
584 }
585 else
586 {
587 Ok(db)
588 }
589 }
590
591 fn dbfile_for_part_of_speech(&self, part_of_speech : &PartOfSpeech)
592 -> &DBFile
593 {
594 for ref db in &self.db_files
595 {
596 if db.part_of_speech == *part_of_speech
597 {
598 return db;
599 }
600 }
601 panic!("part of speech file not found {:?}", part_of_speech);
602 }
603
604 pub fn senses(&self, word : &str)
608 -> Vec<Sense>
609 {
610 let mut all = vec!();
611 for w in &self.db_files
612 {
613 if let Some(x) = w.senses(
614 self,
615 word
616 .to_lowercase()
617 .chars()
618 .map(|x| if x==' ' { '_' } else { x })
619 .collect::<String>()
620 .as_bytes()
621 )
622 {
623 all.extend(x);
624 }
625 }
626 all
627 }
628}
629
#[cfg(test)]
mod test {
    /// Smoke test against the database installed under `/usr/share/wordnet`.
    #[test]
    fn test_1() {
        let dict_dir = ::std::path::Path::new("/usr/share/wordnet");
        let wn = ::Database::open(dict_dir).unwrap();

        assert_eq!(18, wn.senses("bank").len());

        let hypernym_count = wn.senses("bank")[2]
            .pointers
            .iter()
            .filter(|pointer| pointer.relationship == ::Relationship::Hypernym)
            .count();
        assert_eq!(1, hypernym_count);

        assert_eq!(13, wn.senses("thrust").len());
        assert_eq!(3, wn.senses("enlightenment").len());
    }
}