wordnet/
lib.rs

1mod file_platform;
2use file_platform::ReadAtFile;
3
4use std::cmp::Ordering;
5use std::str::from_utf8;
6
7fn read_line_at(f : &ReadAtFile, mut pos : u64)
8	-> Vec<u8>
9{
10	let mut block = vec![];
11	let mut writing_to = 0usize;
12	block.resize(512, 0u8);
13
14	loop
15	{
16		let len = f.read_at(&mut block[writing_to..], pos).unwrap();
17		let truncate = block.iter()
18			.skip(writing_to)
19			.enumerate()
20			.find(|x| *x.1 == b'\n')
21			.map(|x| x.0);
22		if let Some(t) = truncate
23		{
24			block.truncate(writing_to+t);
25			return block;
26		}
27		writing_to += len;
28		pos += len as u64;
29		let newlen = block.len()*2;
30		block.resize(newlen, 0u8);
31	}
32}
33
/// The grammatical category of a word sense.
#[derive(Debug, PartialEq)]
pub enum PartOfSpeech
{
	/// A substantive
	Noun,
	/// A word that describes a noun
	Adjective,
	/// A satellite adjective; like `Adjective`, a word that describes a noun
	AdjectiveSatellite,
	/// A word that describes an action
	Verb,
	/// A word that describes a verb
	Adverb,
}
49
50
51/// noun, adjective, verb, adverb
52impl PartOfSpeech
53{
54	/// Returns a short dictionary-like label:
55	///
56	/// * n
57	/// * adj
58	/// * v
59	/// * adv
60	pub fn short(&self) -> &'static str
61	{
62		match *self
63		{
64			PartOfSpeech::Noun => "n",
65			PartOfSpeech::Adjective => "adj",
66			PartOfSpeech::AdjectiveSatellite => "adj",
67			PartOfSpeech::Verb => "v",
68			PartOfSpeech::Adverb => "adv",
69		}
70	}
71}
72
73fn part_of_speech_code_to_part_of_speech(code : &[u8])
74	-> PartOfSpeech
75{
76	match code
77	{
78		b"n" => PartOfSpeech::Noun,
79		b"v" => PartOfSpeech::Verb,
80		b"a" => PartOfSpeech::Adjective,
81		b"s" => PartOfSpeech::AdjectiveSatellite,
82		b"r" => PartOfSpeech::Adverb,
83		_ => panic!("impossible part of speech '{}'", from_utf8(code).unwrap()),
84	}
85}
86
/// The semantic link between one word sense and another.
#[derive(Debug, PartialEq)]
pub enum Relationship
{
	/// A word with the opposite meaning
	Antonym,
	/// A broader form of this word (a *structure* is a hypernym of a *building*)
	Hypernym,
	/// A broader form of this word of which this word is one specific instance
	/// (*The Enlightenment* is a specific instance of a *historic period*)
	InstanceHypernym,
	/// A more specific version of this word (a *courthouse* is a hyponym of a *house*)
	Hyponym,
	/// Something this word is a member of (the *world* is a member of the *solar system*)
	MemberHolonym,
	/// Something this word is made with (*tin* is a substance holonym of *cassiterite*)
	SubstanceHolonym,
	/// Something this word is a part of (*land* is a part holonym of the *world*)
	PartHolonym,
	/// The reverse of `MemberHolonym` (an *air bag* is a member meronym of *car*)
	MemberMeronym,
	/// The reverse of `SubstanceHolonym` (*cassiterite* is a substance meronym of *tin*)
	SubstanceMeronym,
	/// The reverse of `PartHolonym` (a *car* is a part holonym of an *air bag*)
	PartMeronym,
	/// *scientific* is an attribute of *scientific knowledge*
	Attribute,
	/// A word this one is derivationally related to
	/// (the adjective *outward* is related to *outwardness*)
	DerivationallyRelated,
	/// Topic domain link (database pointer symbol `;c`)
	DomainOfTopic,
	/// Member of a topic domain (database pointer symbol `-c`)
	MemberOfTopic,
	/// Region domain link (database pointer symbol `;r`)
	DomainOfRegion,
	/// Member of a region domain (database pointer symbol `-r`)
	MemberOfRegion,
	/// Usage domain link (database pointer symbol `;u`)
	DomainOfUsage,
	/// Member of a usage domain (database pointer symbol `-u`)
	MemberOfUsage,

	/// A verb requires an action to be completed first (to *eat* requires one to *chew*)
	Entailment,
	/// A verb causes another action (to *retire* causes one to *yield*)
	Cause,
	/// "Also see" cross reference (database pointer symbol `^`)
	AlsoSee,
	/// Verb group link (database pointer symbol `$`)
	VerbGroup,

	/// "Similar to" link (database pointer symbol `&`)
	SimilarTo,
	/// Verb participle link (database pointer symbol `<`)
	VerbParticiple,

	/// Pertainym, or "derived from adjective" (database pointer symbol `\`)
	PertainymOrDerivedFromAdjective, // fixme
}
147
148fn relationship_code_to_relationship(code : &[u8])
149	-> Relationship
150{
151	match code
152	{
153		b"!"	=> Relationship::Antonym,
154		b"@"	=> Relationship::Hypernym,
155		b"@i" => Relationship::InstanceHypernym,
156		b"~"	=> Relationship::Hyponym,
157		b"~i" => Relationship::InstanceHypernym,
158		b"#m" => Relationship::MemberHolonym,
159		b"#s" => Relationship::SubstanceHolonym,
160		b"#p" => Relationship::PartHolonym,
161		b"%m" => Relationship::MemberMeronym,
162		b"%s" => Relationship::SubstanceMeronym,
163		b"%p" => Relationship::PartMeronym,
164		b"="	=> Relationship::Attribute,
165		b"+"	=> Relationship::DerivationallyRelated,
166		b";c" => Relationship::DomainOfTopic,
167		b"-c" => Relationship::MemberOfTopic,
168		b";r" => Relationship::DomainOfRegion,
169		b"-r" => Relationship::MemberOfRegion,
170		b";u" => Relationship::DomainOfUsage,
171		b"-u" => Relationship::MemberOfUsage,
172		b"*"	=> Relationship::Entailment,
173		b">"	=> Relationship::Cause,
174		b"^"	=> Relationship::AlsoSee,
175		b"$"	=> Relationship::VerbGroup,
176		b"&"	=> Relationship::SimilarTo,
177		b"<"	=> Relationship::VerbParticiple,
178		b"\\" => Relationship::PertainymOrDerivedFromAdjective,
179		_ => panic!("illegal relationship code")
180	}
181}
182
183
184/// Senses are different definitions or etymologies for a word.
185///
186/// The senses are also arranged by part of speech. For example,
187/// "bank" has many senses, one is a verb that means "to count on something",
188/// another is a noun that refers to the financial institution.
189///
190/// A list of these can be accessed by `senses()`
191#[derive(Debug)]
192pub struct Sense<'db>
193{
194	/// The part of speech that this sense has
195	pub part_of_speech : PartOfSpeech,
196	/// A short dictionary-like text written in prose that describes the word
197	pub gloss : String,
198	/// Ways to write this sense, one of which is
199	/// probably the word you passed to `Database::senses()`
200	pub synonyms : Vec<SenseRef>,
201	/// Words that are somehow related to this sense.
202	pub pointers : Vec<PointerRef<'db>>,
203}
204
205/// Connects a Sense to words that relationship
206///
207/// A PointerRef has not been loaded from the database yet. You
208/// can call `read()` to do that.
209#[derive(Debug)]
210pub struct PointerRef<'db>
211{
212	db : &'db Database,
213	/// The relationship this pointer has
214	/// from the original word to to the sense
215	/// you can read with `read()`
216	pub relationship : Relationship,
217	/// The part of the speech that this new sense has.
218	pub part_of_speech : PartOfSpeech,
219	offset : u64,
220}
221
222///
223impl<'db> PointerRef<'db>
224{
225	/// Read this pointer from the database files.
226	/// This might lead to a Sense that you already have
227	/// seen so be careful to not recurse indefinitely.
228	///
229	/// If you only use look at once `relationship`, then everything
230	/// should be ok
231	pub fn read(&self) -> Sense<'db>
232	{
233		self.db
234			.dbfile_for_part_of_speech(&self.part_of_speech)
235			.read_sense(self.db, self.offset)
236	}
237}
238
/// Refers to the actual text of a word.
#[derive(Debug)]
pub struct SenseRef
{
	/// The word
	pub word: String,
	/// the `lex_id` field stored next to the word in the data file
	lex_id: u32,
}

// No methods yet.
impl SenseRef
{
}
251
252#[derive(Debug)]
253struct DBFile
254{
255	name : String,
256	index : ReadAtFile,
257	index_size : u64,
258	data : ReadAtFile,
259	part_of_speech : PartOfSpeech,
260}
261
262impl DBFile
263{
264	fn new(
265		part_of_speech : PartOfSpeech,
266		index : &std::path::Path,
267		data : &std::path::Path
268	)
269		-> std::io::Result<DBFile>
270	{
271		let mut index_f = std::fs::File::open(index)?;
272		let data_f = std::fs::File::open(data)?;
273
274		let index_size = std::io::Seek::seek(&mut index_f, std::io::SeekFrom::End(0))?;
275
276		Ok(DBFile
277		{
278			name: index.to_str().unwrap().to_string(),
279			index: ReadAtFile::new(index_f),
280			index_size: index_size,
281			data: ReadAtFile::new(data_f),
282			part_of_speech : part_of_speech,
283		})
284	}
285
286	fn is_found_here(
287		&self,
288		pos : u64,
289		data : &[u8],
290		remaining_word : &[u8]
291	) -> Ordering
292	{
293		for x in 0..data.len()
294		{
295			if x == remaining_word.len() && data[x] == b' '
296			{
297				return Ordering::Equal;
298			}
299			else if x >= remaining_word.len() || data[x] > remaining_word[x]
300			{
301				return Ordering::Less;
302			}
303			else if data[x] < remaining_word[x]
304			{
305				return Ordering::Greater;
306			}
307		}
308		let block = &mut [0u8;32];
309		let bytes = self.index.read_at(block, pos+data.len() as u64).unwrap();
310		return self.is_found_here(
311			pos + data.len() as u64,
312			&block[0..bytes],
313			&remaining_word[data.len()..]
314		);
315	}
316
317	fn find_position(&self, word : &[u8])
318		-> Option<u64>
319	{
320		let block = &mut [0u8;32];
321
322		let mut end = self.index_size;
323		let mut begin = 0u64;
324		let mut pos = end/2;
325
326		while end-begin > (word.len()+10) as u64
327		{
328			if end-pos < 32
329			{
330				pos = begin;
331			}
332
333			let bytes = self.index.read_at(block, pos).unwrap();
334
335			let block = &block[ 0 .. bytes ];
336
337			if pos == begin
338			{
339				begin += bytes as u64;
340			}
341
342
343			if let Some(newline_offset)
344				= block.iter().enumerate().find(|a| *a.1 == b'\n').map(|x| x.0)
345			{
346				let newline = &block[newline_offset+1..];
347				let current_line_starts_at = pos + newline_offset as u64 + 1;
348				let rel = self.is_found_here(current_line_starts_at, newline, word);
349				match rel
350				{
351					Ordering::Equal => return Some(current_line_starts_at),
352					Ordering::Less =>
353					{
354						end = current_line_starts_at;
355					},
356					Ordering::Greater =>
357					{
358						begin = current_line_starts_at+word.len() as u64;
359					}
360				}
361
362				if begin >= end { break; }
363
364				let newpos = (end-begin)/2 + begin;
365				if newpos == pos
366				{
367					break;
368				}
369				else
370				{
371					pos = newpos;
372				}
373			}
374			else if (pos + bytes as u64) < end
375			{
376				pos += bytes as u64;
377			}
378			else
379			{
380				pos -= std::cmp::min(64, pos);
381			}
382		}
383
384		None
385	}
386
387	fn read_sense<'db>(
388		&self,
389		database : &'db Database,
390		offset : u64
391	) -> Sense<'db>
392	{
393		let line = read_line_at(&self.data, offset);
394
395		let sections : Vec<_> = line.split(|x| *x == b' ').collect();
396
397		let part_of_speech = part_of_speech_code_to_part_of_speech(sections[2]);
398
399		let mut index = 3;
400
401		let synonyms_cnt =
402			usize::from_str_radix(from_utf8(sections[index]).unwrap(), 16).unwrap();
403		index += 1;
404
405		let mut synonyms = vec!();
406		synonyms.reserve(synonyms_cnt);
407
408		for _sn in 0..synonyms_cnt
409		{
410			synonyms.push(
411				SenseRef
412				{
413					word : from_utf8(sections[index])
414						.unwrap()
415						.chars()
416						.map(|x| if x=='_' { ' ' } else { x })
417						.collect(),
418					lex_id : u32::from_str_radix(from_utf8(sections[index+1]).unwrap(), 16).unwrap(),
419				}
420			);
421			index += 2;
422		}
423		let pointer_count =
424			u32::from_str_radix( from_utf8(sections[index]).unwrap(), 10).unwrap();
425		index+=1;
426
427		let mut pointers = vec!();
428		pointers.reserve(pointer_count as usize);
429
430		for _pointern in 0..pointer_count
431		{
432			let rel = relationship_code_to_relationship(sections[index]);
433			let offset = u64::from_str_radix(
434				from_utf8(sections[index+1]).unwrap(), 10
435			).unwrap();
436			let part_of_speech = part_of_speech_code_to_part_of_speech(sections[index+2]);
437			let _offset = u64::from_str_radix(
438				from_utf8(sections[index+3]).unwrap(), 16
439			).unwrap();
440
441			index += 4;
442			pointers.push(
443				PointerRef
444				{
445					db: database,
446					relationship : rel,
447					part_of_speech: part_of_speech,
448					offset : offset,
449				}
450			);
451		}
452
453		if sections[2] == b"v"
454		{
455			let frame_count =
456				usize::from_str_radix(from_utf8(sections[index]).unwrap(), 10).unwrap();
457			index += frame_count + 1;
458		}
459
460		let _ = index;
461
462		let gloss =
463		{
464			let line_utf = from_utf8(&line).unwrap();
465			let gloss = &line_utf[line_utf.find('|').unwrap()+2..];
466			gloss
467		};
468
469		Sense
470		{
471			part_of_speech: part_of_speech,
472			gloss: gloss.to_string(),
473			synonyms: synonyms,
474			pointers: pointers,
475		}
476	}
477
478	/// Searches for a word and returns a list of all its senses.
479	fn senses<'db>(&self, database : &'db Database, word : &[u8])
480		-> Option<Vec<Sense<'db>>>
481	{
482		let offset = self.find_position(word);
483		if offset.is_none() { return None; }
484
485		let offset = offset.unwrap();
486		let line = read_line_at(&self.index, offset);
487
488		let line = String::from_utf8(line).unwrap();
489
490		let sections : Vec<&str>= line.split(' ').collect();
491
492		let mut index = 2;
493
494		let synset_cnt : u32 = sections[index].parse().unwrap();
495
496		index += 1;
497		let ptr_symbols_cnt : usize = sections[index].parse().unwrap();
498		index += 1;
499
500		index += ptr_symbols_cnt;
501
502		index += 1; // skip sense_cnt
503		index += 1; // skip tagsense_cnt
504
505		let mut senses = vec!();
506		senses.reserve(synset_cnt as usize);
507
508		for synset in 0..synset_cnt
509		{
510			let synset_offset =
511				u64::from_str_radix(sections[index+synset as usize], 10).unwrap();
512			senses.push( self.read_sense( database, synset_offset ) );
513			// self.read_sense( synset_offset );
514		}
515
516		Some(senses)
517	}
518
519}
520
521
522/// Represents a Wordnet database directory
523#[derive(Debug)]
524pub struct Database
525{
526	db_files: Vec<DBFile>,
527}
528
529impl Database
530{
531	/// Open a wordnet database directory (not included)
532	///
533	/// On Debian, these files are present in `/usr/share/wordnet`
534	/// and can be installed from the package `wordnet-base`.
535	pub fn open(path : &std::path::Path)
536		-> std::io::Result<Database>
537	{
538		let mut db = Database { db_files: vec!() };
539
540		for e in std::fs::read_dir(path)?
541		{
542			let entry = e?;
543			let path_buf = entry.path();
544			if path_buf.file_stem().unwrap_or(std::ffi::OsStr::new(""))
545				== std::ffi::OsStr::new("index")
546			{
547				let ex = path_buf.extension().ok_or(std::io::Error::new(
548					std::io::ErrorKind::InvalidData,
549					"file with invalid part of speech".to_string()
550				))?;
551				let part_of_speech
552					= if ex == "noun"
553						{ PartOfSpeech::Noun }
554					else if ex == "verb"
555						{ PartOfSpeech::Verb }
556					else if ex == "adv"
557						{ PartOfSpeech::Adverb }
558					else if ex == "adj"
559						{ PartOfSpeech::Adjective }
560					else
561					{
562						return Err(std::io::Error::new(
563							std::io::ErrorKind::InvalidData,
564							"file with invalid part of speech"
565						));
566					};
567
568				let mut data_path = path_buf.with_file_name("data");
569				data_path.set_extension(ex);
570				db.db_files.push( DBFile::new(
571					part_of_speech,
572					path_buf.as_path(),
573					data_path.as_path(),
574				)? );
575			}
576		}
577
578		if db.db_files.len() == 0
579		{
580			Err(std::io::Error::new(
581				std::io::ErrorKind::InvalidData,
582				"file with invalid part of speech"
583			))
584		}
585		else
586		{
587			Ok(db)
588		}
589	}
590
591	fn dbfile_for_part_of_speech(&self, part_of_speech : &PartOfSpeech)
592		-> &DBFile
593	{
594		for ref db in &self.db_files
595		{
596			if db.part_of_speech == *part_of_speech
597			{
598				return db;
599			}
600		}
601		panic!("part of speech file not found {:?}", part_of_speech);
602	}
603
604	/// find all senses of a word.
605	///
606	/// This search is case-insensitive.
607	pub fn senses(&self, word : &str)
608		-> Vec<Sense>
609	{
610		let mut all = vec!();
611		for w in &self.db_files
612		{
613			if let Some(x) = w.senses(
614				self,
615				word
616					.to_lowercase()
617					.chars()
618					.map(|x| if x==' ' { '_' } else { x })
619					.collect::<String>()
620					.as_bytes()
621			)
622			{
623				all.extend(x);
624			}
625		}
626		all
627	}
628}
629
#[cfg(test)]
mod test
{
	use super::{Database, Relationship};

	/// Checks sense counts for a few words against an installed database.
	/// Requires the WordNet files in `/usr/share/wordnet`.
	#[test]
	fn test_1()
	{
		let dir = ::std::path::Path::new("/usr/share/wordnet");
		let wn = Database::open(&dir).unwrap();

		assert_eq!(18, wn.senses("bank").len());

		let hypernyms = wn.senses("bank")[2].pointers
			.iter()
			.filter(|&x| x.relationship == Relationship::Hypernym)
			.count();
		assert_eq!(1, hypernyms);

		assert_eq!(13, wn.senses("thrust").len());
		assert_eq!(3, wn.senses("enlightenment").len());
	}
}