mod file_platform;
use file_platform::ReadAtFile;
use std::cmp::Ordering;
use std::str::from_utf8;
fn read_line_at(f : &ReadAtFile, mut pos : u64)
-> Vec<u8>
{
let mut block = vec![];
let mut writing_to = 0usize;
block.resize(512, 0u8);
loop
{
let len = f.read_at(&mut block[writing_to..], pos).unwrap();
let truncate = block.iter()
.skip(writing_to)
.enumerate()
.find(|x| *x.1 == b'\n')
.map(|x| x.0);
if let Some(t) = truncate
{
block.truncate(writing_to+t);
return block;
}
writing_to += len;
pos += len as u64;
let newlen = block.len()*2;
block.resize(newlen, 0u8);
}
}
#[derive(Debug)]
#[derive(PartialEq)]
pub enum PartOfSpeech
{
Noun,
Adjective,
AdjectiveSatellite,
Verb,
Adverb,
}
impl PartOfSpeech
{
pub fn short(&self) -> &'static str
{
match *self
{
PartOfSpeech::Noun => "n",
PartOfSpeech::Adjective => "adj",
PartOfSpeech::AdjectiveSatellite => "adj",
PartOfSpeech::Verb => "v",
PartOfSpeech::Adverb => "adv",
}
}
}
fn part_of_speech_code_to_part_of_speech(code : &[u8])
-> PartOfSpeech
{
match code
{
b"n" => PartOfSpeech::Noun,
b"v" => PartOfSpeech::Verb,
b"a" => PartOfSpeech::Adjective,
b"s" => PartOfSpeech::AdjectiveSatellite,
b"r" => PartOfSpeech::Adverb,
_ => panic!("impossible part of speech '{}'", from_utf8(code).unwrap()),
}
}
#[derive(Debug)]
#[derive(PartialEq)]
pub enum Relationship
{
Antonym,
Hypernym,
InstanceHypernym,
Hyponym,
MemberHolonym,
SubstanceHolonym,
PartHolonym,
MemberMeronym,
SubstanceMeronym,
PartMeronym,
Attribute,
DerivationallyRelated,
DomainOfTopic,
MemberOfTopic,
DomainOfRegion,
MemberOfRegion,
DomainOfUsage,
MemberOfUsage,
Entailment,
Cause,
AlsoSee,
VerbGroup,
SimilarTo,
VerbParticiple,
PertainymOrDerivedFromAdjective, }
fn relationship_code_to_relationship(code : &[u8])
-> Relationship
{
match code
{
b"!" => Relationship::Antonym,
b"@" => Relationship::Hypernym,
b"@i" => Relationship::InstanceHypernym,
b"~" => Relationship::Hyponym,
b"~i" => Relationship::InstanceHypernym,
b"#m" => Relationship::MemberHolonym,
b"#s" => Relationship::SubstanceHolonym,
b"#p" => Relationship::PartHolonym,
b"%m" => Relationship::MemberMeronym,
b"%s" => Relationship::SubstanceMeronym,
b"%p" => Relationship::PartMeronym,
b"=" => Relationship::Attribute,
b"+" => Relationship::DerivationallyRelated,
b";c" => Relationship::DomainOfTopic,
b"-c" => Relationship::MemberOfTopic,
b";r" => Relationship::DomainOfRegion,
b"-r" => Relationship::MemberOfRegion,
b";u" => Relationship::DomainOfUsage,
b"-u" => Relationship::MemberOfUsage,
b"*" => Relationship::Entailment,
b">" => Relationship::Cause,
b"^" => Relationship::AlsoSee,
b"$" => Relationship::VerbGroup,
b"&" => Relationship::SimilarTo,
b"<" => Relationship::VerbParticiple,
b"\\" => Relationship::PertainymOrDerivedFromAdjective,
_ => panic!("illegal relationship code")
}
}
#[derive(Debug)]
pub struct Sense<'db>
{
pub part_of_speech : PartOfSpeech,
pub gloss : String,
pub synonyms : Vec<SenseRef>,
pub pointers : Vec<PointerRef<'db>>,
}
#[derive(Debug)]
pub struct PointerRef<'db>
{
db : &'db Database,
pub relationship : Relationship,
pub part_of_speech : PartOfSpeech,
offset : u64,
}
impl<'db> PointerRef<'db>
{
pub fn read(&self) -> Sense<'db>
{
self.db
.dbfile_for_part_of_speech(&self.part_of_speech)
.read_sense(self.db, self.offset)
}
}
#[derive(Debug)]
pub struct SenseRef
{
pub word : String,
lex_id : u32,
}
impl SenseRef
{
}
#[derive(Debug)]
struct DBFile
{
name : String,
index : ReadAtFile,
index_size : u64,
data : ReadAtFile,
part_of_speech : PartOfSpeech,
}
impl DBFile
{
fn new(
part_of_speech : PartOfSpeech,
index : &std::path::Path,
data : &std::path::Path
)
-> std::io::Result<DBFile>
{
let mut index_f = std::fs::File::open(index)?;
let data_f = std::fs::File::open(data)?;
let index_size = std::io::Seek::seek(&mut index_f, std::io::SeekFrom::End(0))?;
Ok(DBFile
{
name: index.to_str().unwrap().to_string(),
index: ReadAtFile::new(index_f),
index_size: index_size,
data: ReadAtFile::new(data_f),
part_of_speech : part_of_speech,
})
}
fn is_found_here(
&self,
pos : u64,
data : &[u8],
remaining_word : &[u8]
) -> Ordering
{
for x in 0..data.len()
{
if x == remaining_word.len() && data[x] == b' '
{
return Ordering::Equal;
}
else if x >= remaining_word.len() || data[x] > remaining_word[x]
{
return Ordering::Less;
}
else if data[x] < remaining_word[x]
{
return Ordering::Greater;
}
}
let block = &mut [0u8;32];
let bytes = self.index.read_at(block, pos+data.len() as u64).unwrap();
return self.is_found_here(
pos + data.len() as u64,
&block[0..bytes],
&remaining_word[data.len()..]
);
}
fn find_position(&self, word : &[u8])
-> Option<u64>
{
let block = &mut [0u8;32];
let mut end = self.index_size;
let mut begin = 0u64;
let mut pos = end/2;
while end-begin > (word.len()+10) as u64
{
if end-pos < 32
{
pos = begin;
}
let bytes = self.index.read_at(block, pos).unwrap();
let block = &block[ 0 .. bytes ];
if pos == begin
{
begin += bytes as u64;
}
if let Some(newline_offset)
= block.iter().enumerate().find(|a| *a.1 == b'\n').map(|x| x.0)
{
let newline = &block[newline_offset+1..];
let current_line_starts_at = pos + newline_offset as u64 + 1;
let rel = self.is_found_here(current_line_starts_at, newline, word);
match rel
{
Ordering::Equal => return Some(current_line_starts_at),
Ordering::Less =>
{
end = current_line_starts_at;
},
Ordering::Greater =>
{
begin = current_line_starts_at+word.len() as u64;
}
}
if begin >= end { break; }
let newpos = (end-begin)/2 + begin;
if newpos == pos
{
break;
}
else
{
pos = newpos;
}
}
else if (pos + bytes as u64) < end
{
pos += bytes as u64;
}
else
{
pos -= std::cmp::min(64, pos);
}
}
None
}
fn read_sense<'db>(
&self,
database : &'db Database,
offset : u64
) -> Sense<'db>
{
let line = read_line_at(&self.data, offset);
let sections : Vec<_> = line.split(|x| *x == b' ').collect();
let part_of_speech = part_of_speech_code_to_part_of_speech(sections[2]);
let mut index = 3;
let synonyms_cnt =
usize::from_str_radix(from_utf8(sections[index]).unwrap(), 16).unwrap();
index += 1;
let mut synonyms = vec!();
synonyms.reserve(synonyms_cnt);
for _sn in 0..synonyms_cnt
{
synonyms.push(
SenseRef
{
word : from_utf8(sections[index])
.unwrap()
.chars()
.map(|x| if x=='_' { ' ' } else { x })
.collect(),
lex_id : u32::from_str_radix(from_utf8(sections[index+1]).unwrap(), 16).unwrap(),
}
);
index += 2;
}
let pointer_count =
u32::from_str_radix( from_utf8(sections[index]).unwrap(), 10).unwrap();
index+=1;
let mut pointers = vec!();
pointers.reserve(pointer_count as usize);
for _pointern in 0..pointer_count
{
let rel = relationship_code_to_relationship(sections[index]);
let offset = u64::from_str_radix(
from_utf8(sections[index+1]).unwrap(), 10
).unwrap();
let part_of_speech = part_of_speech_code_to_part_of_speech(sections[index+2]);
let _offset = u64::from_str_radix(
from_utf8(sections[index+3]).unwrap(), 16
).unwrap();
index += 4;
pointers.push(
PointerRef
{
db: database,
relationship : rel,
part_of_speech: part_of_speech,
offset : offset,
}
);
}
if sections[2] == b"v"
{
let frame_count =
usize::from_str_radix(from_utf8(sections[index]).unwrap(), 10).unwrap();
index += frame_count + 1;
}
let _ = index;
let gloss =
{
let line_utf = from_utf8(&line).unwrap();
let gloss = &line_utf[line_utf.find('|').unwrap()+2..];
gloss
};
Sense
{
part_of_speech: part_of_speech,
gloss: gloss.to_string(),
synonyms: synonyms,
pointers: pointers,
}
}
fn senses<'db>(&self, database : &'db Database, word : &[u8])
-> Option<Vec<Sense<'db>>>
{
let offset = self.find_position(word);
if offset.is_none() { return None; }
let offset = offset.unwrap();
let line = read_line_at(&self.index, offset);
let line = String::from_utf8(line).unwrap();
let sections : Vec<&str>= line.split(' ').collect();
let mut index = 2;
let synset_cnt : u32 = sections[index].parse().unwrap();
index += 1;
let ptr_symbols_cnt : usize = sections[index].parse().unwrap();
index += 1;
index += ptr_symbols_cnt;
index += 1; index += 1; let mut senses = vec!();
senses.reserve(synset_cnt as usize);
for synset in 0..synset_cnt
{
let synset_offset =
u64::from_str_radix(sections[index+synset as usize], 10).unwrap();
senses.push( self.read_sense( database, synset_offset ) );
}
Some(senses)
}
}
#[derive(Debug)]
pub struct Database
{
db_files: Vec<DBFile>,
}
impl Database
{
pub fn open(path : &std::path::Path)
-> std::io::Result<Database>
{
let mut db = Database { db_files: vec!() };
for e in std::fs::read_dir(path)?
{
let entry = e?;
let path_buf = entry.path();
if path_buf.file_stem().unwrap_or(std::ffi::OsStr::new(""))
== std::ffi::OsStr::new("index")
{
let ex = path_buf.extension().ok_or(std::io::Error::new(
std::io::ErrorKind::InvalidData,
"file with invalid part of speech".to_string()
))?;
let part_of_speech
= if ex == "noun"
{ PartOfSpeech::Noun }
else if ex == "verb"
{ PartOfSpeech::Verb }
else if ex == "adv"
{ PartOfSpeech::Adverb }
else if ex == "adj"
{ PartOfSpeech::Adjective }
else
{
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidData,
"file with invalid part of speech"
));
};
let mut data_path = path_buf.with_file_name("data");
data_path.set_extension(ex);
db.db_files.push( DBFile::new(
part_of_speech,
path_buf.as_path(),
data_path.as_path(),
)? );
}
}
if db.db_files.len() == 0
{
Err(std::io::Error::new(
std::io::ErrorKind::InvalidData,
"file with invalid part of speech"
))
}
else
{
Ok(db)
}
}
fn dbfile_for_part_of_speech(&self, part_of_speech : &PartOfSpeech)
-> &DBFile
{
for ref db in &self.db_files
{
if db.part_of_speech == *part_of_speech
{
return db;
}
}
panic!("part of speech file not found {:?}", part_of_speech);
}
pub fn senses(&self, word : &str)
-> Vec<Sense>
{
let mut all = vec!();
for w in &self.db_files
{
if let Some(x) = w.senses(
self,
word
.to_lowercase()
.chars()
.map(|x| if x==' ' { '_' } else { x })
.collect::<String>()
.as_bytes()
)
{
all.extend(x);
}
}
all
}
}
#[cfg(test)]
mod test
{
#[test]
fn test_1()
{
let wn = ::Database::open(&::std::path::Path::new("/usr/share/wordnet")).unwrap();
assert_eq!(18, wn.senses("bank").len());
assert_eq!(
1,
wn.senses("bank")[2].pointers
.iter()
.filter(|&x| x.relationship == ::Relationship::Hypernym)
.count()
);
assert_eq!(13, wn.senses("thrust").len());
assert_eq!(3, wn.senses("enlightenment").len());
}
}