import sys
def get_bits(high, punct):
    """Build the 16-bit membership mask for one 16-code-point row.

    Bit ``i`` of the result is set exactly when code point
    ``high * 16 + i`` is a member of ``punct``.

    Args:
        high: Row index, i.e. the code point divided by 16.
        punct: Container of integer code points supporting ``in``.

    Returns:
        An integer in the range 0..0xFFFF.
    """
    base = high * 16
    # Every offset contributes a distinct power of two, so summing the
    # selected powers is the same as OR-ing them together.
    return sum(1 << offset for offset in range(16) if base + offset in punct)
def main(args):
    """Generate the Rust punctuation lookup tables and write them to stdout.

    The input file named by ``args[1]`` is read line by line as
    semicolon-separated records: field 0 is a hexadecimal code point and
    field 2 is a category code (presumably the Unicode general category of
    a UnicodeData.txt-style file -- confirm against the caller). Code points
    whose category is one of Pc, Pd, Pe, Pf, Pi, Po or Ps are collected.

    The emitted Rust source contains:
      * PUNCT_MASKS_ASCII -- one 16-bit mask per 16-code-point row of the
        ASCII range, built from a fixed list of ASCII punctuation characters
        (not from the input file);
      * PUNCT_TAB / PUNCT_MASKS -- the sorted row indices (code point // 16)
        of non-ASCII rows that contain punctuation, and their bit masks;
      * the lookup functions and their unit tests.

    Args:
        args: Command-line argument list; ``args[1]`` is the input path.

    Raises:
        IndexError: if ``args`` holds no input path.
        IOError/OSError: if the input file cannot be opened.
        ValueError: if a code point field is not valid hexadecimal, or if
            the input contains no punctuation code points at all.
    """
    ascii_punct = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    ascii_set = set(ord(c) for c in ascii_punct)

    punct = set()
    # open() inside a with-block replaces the Python-2-only file() builtin
    # and guarantees that the handle is closed.
    with open(args[1]) as data_file:
        for line in data_file:
            spl = line.split(';')
            # Skip short records (for example a trailing blank line) instead
            # of failing with IndexError on spl[2].
            if len(spl) > 2 and spl[2] in ('Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps'):
                punct.add(int(spl[0], 16))

    # Fail before anything is written rather than midway through the output
    # (max() on an empty set would raise ValueError at the very end).
    if not punct:
        raise ValueError('no punctuation code points found in %s' % args[1])
    max_cp = max(punct)

    pshift = sorted(set(cp // 16 for cp in punct if cp >= 128))
    bits = [get_bits(high, punct) for high in pshift]

    def emit(text):
        # Same output as the Python 2 statement `print text`, but also valid
        # on Python 3.
        sys.stdout.write(text + '\n')

    emit("""// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! CommonMark punctuation set based on spec and Unicode properties.
// Autogenerated by mk_puncttable.py
const PUNCT_MASKS_ASCII: [u16; 8] = [""")
    for x in range(8):
        y = get_bits(x, ascii_set)
        emit(' 0x%04x, // U+%04X...U+%04X' % (y, x * 16, x * 16 + 15))
    emit(""" ];
const PUNCT_TAB: [u16; %i] = [""" % len(pshift))
    for x in pshift:
        emit(' %d, // U+%04X...U+%04X' % (x, x * 16, x * 16 + 15))
    emit(""" ];
const PUNCT_MASKS: [u16; %i] = [""" % len(pshift))
    for x, y in zip(pshift, bits):
        emit(' 0x%04x, // U+%04X...U+%04X' % (y, x * 16, x * 16 + 15))
    # Raw literal: the Rust `\u{...}` escapes below must reach the output
    # verbatim, and in a non-raw Python 3 string they are a syntax error.
    emit(r""" ];
pub fn is_ascii_punctuation(c: u8) -> bool {
c < 128 && (PUNCT_MASKS_ASCII[(c / 16) as usize] & (1 << (c & 15))) != 0
}
pub fn is_punctuation(c: char) -> bool {
let cp = c as u32;
if cp < 128 {return is_ascii_punctuation(cp as u8); }
if cp > 0x%04X { return false; }
let high = (cp / 16) as u16;
match PUNCT_TAB.binary_search(&high) {
Ok(index) => (PUNCT_MASKS[index] & (1 << (cp & 15))) != 0,
_ => false
}
}
#[cfg(test)]
mod tests {
use super::{is_ascii_punctuation, is_punctuation};
#[test]
fn test_ascii() {
assert!(is_ascii_punctuation(b'!'));
assert!(is_ascii_punctuation(b'@'));
assert!(is_ascii_punctuation(b'~'));
assert!(!is_ascii_punctuation(b' '));
assert!(!is_ascii_punctuation(b'0'));
assert!(!is_ascii_punctuation(b'A'));
assert!(!is_ascii_punctuation(0xA1));
}
#[test]
fn test_unicode() {
assert!(is_punctuation('~'));
assert!(!is_punctuation(' '));
assert!(is_punctuation('\u{00A1}'));
assert!(is_punctuation('\u{060C}'));
assert!(is_punctuation('\u{FF65}'));
assert!(is_punctuation('\u{1BC9F}'));
assert!(!is_punctuation('\u{1BCA0}'));
}
}
""" % max_cp)
# Script entry point: run the generator with the process's command-line
# arguments; main() reads the input file path from sys.argv[1].
main(sys.argv)