#include "libtexpdf.h"
#include <ctype.h>
#include <string.h>
#include <math.h>
/* Discard any earlier definitions so the character-class macros below are the ones in effect. */
#ifdef is_space
#undef is_space
#endif
#ifdef is_delim
#undef is_delim
#endif
/*
 * is_space(c): true for blank, tab, form feed, CR, LF and the NUL byte.
 * The argument is evaluated several times; pass only side-effect-free
 * expressions.
 */
#define is_space(c) ((c) == ' ' || (c) == '\t' || (c) == '\f' || \
(c) == '\r' || (c) == '\n' || (c) == '\0')
/*
 * is_delim(c): true for the characters ( / < > [ ] and %.
 * Note that ')' , '{' and '}' are not in this set.
 */
#define is_delim(c) ((c) == '(' || (c) == '/' || \
(c) == '<' || (c) == '>' || \
(c) == '[' || (c) == ']' || \
(c) == '%')
/* PDF_TOKEN_END(p,e): true when p has reached e or points at a space/delimiter. */
#define PDF_TOKEN_END(p,e) ((p) >= (e) || is_space(*(p)) || is_delim(*(p)))
/* istokensep(c): true when c is either a space or a delimiter as defined above. */
#define istokensep(c) (is_space((c)) || is_delim((c)))
/*
 * File-local parser state.  `tainted` starts at 0; it is set to 1 by
 * texpdf_parse_texpdf_tainted_dict() for the duration of a dictionary parse
 * and is consulted by the literal-string parser, which then copies a byte
 * with the high bit set together with the byte that follows it.
 */
static struct {
int tainted;
} parser_state = {
0
};
/*
 * Defined elsewhere.  Used in this file to turn one hexadecimal digit
 * character into its numeric value.
 * NOTE(review): the return value for a non-hex character is not visible here.
 */
extern int xtoi (char ch);
/*
 * Print the beginning of the input range [start, end) through MESG, framed
 * by "-->" and "<--".  At most DUMP_LIMIT characters are shown, and "..."
 * is appended when the output stopped exactly at that limit.
 */
void
texpdf_dump (const char *start, const char *end)
{
#define DUMP_LIMIT 50
  const char *cursor;
  const char *limit = start + DUMP_LIMIT;

  MESG("\nCurrent input buffer is -->");
  for (cursor = start; cursor < end && cursor < limit; cursor++) {
    MESG("%c", *cursor);
  }
  if (cursor == limit) {
    MESG("...");
  }
  MESG("<--\n");
}
/*
 * SAVE(s,e): remember position `s` in a variable named `save`, which must
 * exist in the scope where the macro is expanded.  The `e` argument is not
 * used by the expansion.
 */
#define SAVE(s,e) do {\
save = (s);\
} while (0)
/*
 * DUMP_RESTORE(s,e): dump the input starting at `save` with texpdf_dump()
 * and reset `s` to `save`.
 * NOTE(review): the expansion refers to a variable literally named `end`
 * rather than to the `e` argument -- confirm this is intended.
 */
#define DUMP_RESTORE(s,e) do {\
texpdf_dump(save, end);\
(s) = save;\
} while (0)
/*
 * Advance *start past the remainder of the current line, including its
 * terminator.  A terminator is CR, LF, or CR immediately followed by LF.
 * The cursor never moves beyond `end`.
 */
void
skip_line (const char **start, const char *end)
{
  const char *cur = *start;

  /* Run up to the first end-of-line character (or the end of input). */
  while (cur < end && *cur != '\n' && *cur != '\r') {
    cur++;
  }
  /* Swallow an optional CR followed by an optional LF. */
  if (cur < end && *cur == '\r') {
    cur++;
  }
  if (cur < end && *cur == '\n') {
    cur++;
  }
  *start = cur;
}
/*
 * Advance *start over any mix of white space (as defined by is_space) and
 * '%' comments.  A comment runs to the end of its line and is skipped with
 * skip_line().  Stops at the first other character or at `end`.
 */
void
texpdf_skip_white (const char **start, const char *end)
{
  for (;;) {
    if (*start >= end) {
      return;
    }
    if (**start == '%') {
      skip_line(start, end);
    } else if (is_space(**start)) {
      (*start)++;
    } else {
      return;
    }
  }
}
/*
 * Return a freshly allocated (via NEW), NUL-terminated copy of the bytes in
 * [start, end).  Returns NULL when the range is empty or reversed.
 */
static char *
parsed_string (const char *start, const char *end)
{
  int length = end - start;
  char *copy;

  if (length <= 0) {
    return NULL;
  }
  copy = NEW(length + 1, char);
  memcpy(copy, start, length);
  copy[length] = '\0';
  return copy;
}
/*
 * Skip leading white space, then scan an optional sign, a run of digits and
 * an optional '.' followed by more digits.  Returns the scanned text as a
 * new string (NULL when nothing was scanned) and leaves *start just past
 * the scanned text.
 */
char *
texpdf_parse_number (const char **start, const char *end)
{
  const char *cur;
  char *text;

  texpdf_skip_white(start, end);
  cur = *start;

  if (cur < end && (*cur == '+' || *cur == '-')) {
    cur++;
  }
  /* Integer part. */
  for (; cur < end && isdigit((unsigned char)*cur); cur++)
    ;
  /* Optional fractional part. */
  if (cur < end && *cur == '.') {
    for (cur++; cur < end && isdigit((unsigned char)*cur); cur++)
      ;
  }

  text = parsed_string(*start, cur);
  *start = cur;
  return text;
}
/*
 * Skip leading white space, then scan a run of decimal digits.  Returns the
 * digits as a new string (NULL when there are none) and leaves *start just
 * past them.
 */
char *
texpdf_parse_unsigned (const char **start, const char *end)
{
  const char *stop;
  char *digits;

  texpdf_skip_white(start, end);
  stop = *start;
  while (stop < end && isdigit((unsigned char)*stop)) {
    stop++;
  }

  digits = parsed_string(*start, stop);
  *start = stop;
  return digits;
}
/*
 * Scan the longest prefix of [*start, end) made only of characters found in
 * `valid_chars`.  No white space is skipped first.  Returns the prefix as a
 * new string (NULL when it is empty) and leaves *start just past it.
 */
static char *
texpdf_parse_gen_ident (const char **start, const char *end, const char *valid_chars)
{
  char *ident;
  const char *p;

  for (p = *start; p < end; p++) {
    /*
     * strchr() matches the terminating NUL of valid_chars, so a NUL byte in
     * the input must be rejected explicitly or it would be taken as part of
     * the identifier.
     */
    if (*p == '\0' || !strchr(valid_chars, *p))
      break;
  }
  ident = parsed_string(*start, p);
  *start = p;
  return ident;
}
/*
 * Scan an identifier at *start using the character set below (which
 * contains '=' but not '/').  See texpdf_parse_gen_ident() for the return
 * value and cursor handling.
 */
char *
texpdf_parse_ident (const char **start, const char *end)
{
  static const char ident_chars[] =
    "!\"#$&'*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";
  char *ident = texpdf_parse_gen_ident(start, end, ident_chars);

  return ident;
}
/*
 * Scan a value identifier at *start using the character set below (which
 * contains '/' but not '=').  See texpdf_parse_gen_ident() for the return
 * value and cursor handling.
 */
char *
texpdf_parse_val_ident (const char **start, const char *end)
{
  static const char value_chars[] =
    "!\"#$&'*+,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";
  char *ident = texpdf_parse_gen_ident(start, end, value_chars);

  return ident;
}
/*
 * Scan an optional identifier introduced by '@'.  When *start points at
 * '@', that character is consumed and the identifier that follows is
 * returned (as by texpdf_parse_ident).  Otherwise nothing is consumed and
 * NULL is returned.
 */
char *
texpdf_parse_opt_ident (const char **start, const char *end)
{
  if (*start >= end || **start != '@') {
    return NULL;
  }
  ++(*start);
  return texpdf_parse_ident(start, end);
}
/*
 * Parse a PDF numeric object: an optional sign followed by decimal digits
 * with at most one '.', terminated by a token separator or the end of
 * input.  Leading white space and comments are skipped.
 *
 * On success *pp is moved past the number and the result of
 * texpdf_new_number() is returned.  On failure a warning is issued, NULL is
 * returned and *pp is left untouched.
 */
pdf_obj *
texpdf_parse_pdf_number (const char **pp, const char *endptr)
{
  const char *p;
  double v = 0.0;
  int nddigits = 0, ndigits = 0, sign = 1;
  int has_dot = 0;

  p = *pp;
  texpdf_skip_white(&p, endptr);
  if (p >= endptr ||
      (!isdigit((unsigned char)p[0]) && p[0] != '.' &&
       p[0] != '+' && p[0] != '-')) {
    WARN("Could not find a numeric object.");
    return NULL;
  }

  if (p[0] == '-' || p[0] == '+') {
    /* A sign must be followed by at least one more character. */
    if (p + 1 >= endptr) {
      WARN("Could not find a numeric object.");
      return NULL;
    }
    if (p[0] == '-')
      sign = -1;
    p++;
  }

  while (p < endptr && !istokensep(p[0])) {
    if (p[0] == '.') {
      if (has_dot) {
        /* A second decimal point is not allowed. */
        WARN("Could not find a numeric object.");
        return NULL;
      }
      has_dot = 1;
    } else if (isdigit((unsigned char)p[0])) {
      if (has_dot) {
        v += (p[0] - '0') / pow(10, nddigits + 1);
        nddigits++;
      } else {
        v = v * 10.0 + p[0] - '0';
      }
      ndigits++;
    } else {
      WARN("Could not find a numeric object.");
      return NULL;
    }
    p++;
  }

  /*
   * A bare sign or a bare '.' contains no digits and is not a number; do not
   * silently turn it into 0.
   */
  if (ndigits == 0) {
    WARN("Could not find a numeric object.");
    return NULL;
  }

  *pp = p;
  return texpdf_new_number(sign * v);
}
/*
 * Fetch the next character of a PDF name from *pp and advance *pp.
 *
 * A '#' introduces a two-digit hexadecimal escape.  The caller must ensure
 * *pp < endptr on entry.
 *
 * Returns the character value (0..255), or -1 when a '#' escape is
 * truncated by the end of input (*pp is set to endptr) or is not followed
 * by two hex digits (the three characters are skipped).
 */
static int
pn_getc (const char **pp, const char *endptr)
{
  int ch = 0;
  const char *p;

  p = *pp;
  if (p[0] == '#') {
    if (p + 2 >= endptr) {
      *pp = endptr;
      return -1;
    }
    if (!isxdigit((unsigned char)p[1]) || !isxdigit((unsigned char)p[2])) {
      *pp += 3;
      return -1;
    }
    ch = (xtoi(p[1]) << 4);
    ch += xtoi(p[2]);
    *pp += 3;
  } else {
    /*
     * Read the byte as unsigned: where plain char is signed, a byte >= 0x80
     * would otherwise come back negative and be mistaken for the -1 error
     * value by the caller.
     */
    ch = (unsigned char) p[0];
    *pp += 1;
  }
  return ch;
}
/* Maximum number of bytes kept for a PDF name (overridable at build time). */
#ifndef PDF_NAME_LEN_MAX
#define PDF_NAME_LEN_MAX 128
#endif
/* Maximum number of bytes accepted for a PDF string (overridable at build time). */
#ifndef PDF_STRING_LEN_MAX
#define PDF_STRING_LEN_MAX 65535
#endif
/*
 * Size of the shared string buffer.  Parenthesized so the macro expands
 * correctly inside larger expressions such as 2 * STRING_BUFFER_SIZE.
 */
#define STRING_BUFFER_SIZE (PDF_STRING_LEN_MAX+1)
/* Scratch buffer shared by the string parsers in this file; not reentrant. */
static char sbuf[PDF_STRING_LEN_MAX+1];
/*
 * Parse a PDF name object ("/Name").  Leading white space and comments are
 * skipped; the name runs up to the next token separator or the end of
 * input, with "#xx" escapes decoded by pn_getc().
 *
 * Invalid and NUL characters are dropped with a warning.  At most
 * PDF_NAME_LEN_MAX bytes are kept; the rest of an over-long name is
 * consumed and discarded with a single warning.
 *
 * Returns the result of texpdf_new_name(), or NULL (with a warning) when no
 * '/' is found or the name is empty.  *pp is advanced past what was read.
 */
pdf_obj *
texpdf_parse_pdf_name (const char **pp, const char *endptr)
{
  char name[PDF_NAME_LEN_MAX+1];
  int ch, len = 0;
  int truncated = 0;

  texpdf_skip_white(pp, endptr);
  if (*pp >= endptr || **pp != '/') {
    WARN("Could not find a name object.");
    return NULL;
  }
  (*pp)++;

  while (*pp < endptr && !istokensep(**pp)) {
    ch = pn_getc(pp, endptr);
    if (ch < 0 || ch > 0xff) {
      WARN("Invalid char in PDF name object. (ignored)");
    } else if (ch == 0) {
      WARN("Null char not allowed in PDF name object. (ignored)");
    } else if (len < PDF_NAME_LEN_MAX) {
      /* Bounded by the size of name[], leaving room for the terminator. */
      name[len++] = ch;
    } else if (!truncated) {
      /* Keep consuming the token, but store nothing more. */
      WARN("PDF name length too long. (>= %d bytes, truncated)",
           PDF_NAME_LEN_MAX);
      truncated = 1;
    }
  }

  if (len < 1) {
    WARN("No valid name object found.");
    return NULL;
  }
  name[len] = '\0';
  return texpdf_new_name(name);
}
/*
 * Parse the keyword "true" or "false" after skipping white space.  The
 * keyword must be followed by a token separator or the end of input.
 *
 * On success *pp is moved past the keyword and texpdf_new_boolean(1) or
 * texpdf_new_boolean(0) is returned.  Otherwise a warning is issued and
 * NULL is returned.
 */
pdf_obj *
texpdf_parse_pdf_boolean (const char **pp, const char *endptr)
{
  static const struct {
    const char *word;
    int         length;
    int         value;
  } keywords[] = {
    { "true",  4, 1 },
    { "false", 5, 0 }
  };
  const char *p;
  int k;

  texpdf_skip_white(pp, endptr);
  p = *pp;

  for (k = 0; k < 2; k++) {
    int n = keywords[k].length;

    if (p + n <= endptr && !strncmp(p, keywords[k].word, n)) {
      if (p + n == endptr || istokensep(p[n])) {
        *pp = p + n;
        return texpdf_new_boolean(keywords[k].value);
      }
      /* Keyword matched but is glued to more text: not a boolean. */
      break;
    }
  }

  WARN("Not a boolean object.");
  return NULL;
}
/*
 * Parse the keyword "null" after skipping white space.  The keyword must be
 * followed by a token separator or the end of input.
 *
 * On success *pp is moved past the keyword and texpdf_new_null() is
 * returned.  Otherwise a warning is issued and NULL is returned.
 */
pdf_obj *
texpdf_parse_pdf_null (const char **pp, const char *endptr)
{
  const char *p;

  texpdf_skip_white(pp, endptr);
  p = *pp;

  if (p + 4 <= endptr &&
      (p + 4 == endptr || istokensep(p[4])) &&
      !strncmp(p, "null", 4)) {
    *pp = p + 4;
    return texpdf_new_null();
  }

  WARN("Not a null object.");
  return NULL;
}
/* True for the octal digits '0'..'7'. */
#ifndef isodigit
#define isodigit(c) ((c) >= '0' && (c) <= '7')
#endif
/*
 * Decode one backslash escape inside a literal string.  On entry *pp points
 * at the backslash; on return it points just past the escape sequence.
 *
 * Returns the decoded byte (0..255), or -1 when the escape yields no
 * character: a backslash followed by an end-of-line (line continuation), or
 * a backslash that is the last byte of the input.
 */
static int
ps_getescc (const char **pp, const char *endptr)
{
  int ch, i;
  const char *p;

  p = *pp + 1;
  /*
   * Nothing follows the backslash: do not read past the end of input.
   * Leave the cursor at the end so the caller sees a truncated string.
   */
  if (p >= endptr) {
    *pp = endptr;
    return -1;
  }

  switch (p[0]) {
  case 'n': ch = '\n'; p++; break;
  case 'r': ch = '\r'; p++; break;
  case 't': ch = '\t'; p++; break;
  case 'b': ch = '\b'; p++; break;
  case 'f': ch = '\f'; p++; break;
  /* Backslash + end-of-line: the line break is ignored. */
  case '\n':
    ch = -1;
    p++;
    break;
  case '\r':
    ch = -1;
    p++;
    if (p < endptr && p[0] == '\n')
      p++;
    break;
  default:
    if (p[0] == '\\' ||
        p[0] == '(' || p[0] == ')') {
      ch = p[0];
      p++;
    } else if (isodigit(p[0])) {
      /* Up to three octal digits; the value is reduced to one byte. */
      ch = 0;
      for (i = 0; i < 3 &&
             p < endptr && isodigit(p[0]); i++) {
        ch = (ch << 3) + (p[0] - '0');
        p++;
      }
      ch = (ch & 0xff);
    } else {
      /* Unknown escape: the backslash is dropped, the character kept. */
      ch = ((unsigned char) p[0]);
      p++;
    }
  }

  *pp = p;
  return ch;
}
/*
 * Parse a PDF literal string "( ... )" into the shared buffer sbuf.
 *
 * Nested, balanced parentheses are kept as part of the string, backslash
 * escapes are decoded by ps_getescc(), and a bare CR or CR LF is stored as
 * a single LF.  Unless PDF_PARSE_STRICT is defined, while
 * parser_state.tainted is set a byte with the high bit set is copied
 * together with the byte that follows it, without interpretation.
 *
 * Returns the result of texpdf_new_string() and moves *pp past the closing
 * ')'.  Returns NULL when no '(' is found, when the string is too long, or
 * when it is unbalanced/truncated; *pp is then left untouched.
 */
static pdf_obj *
texpdf_parse_pdf_literal_string (const char **pp, const char *endptr)
{
  int ch, op_count = 0, len = 0;
  const char *p;

  p = *pp;
  texpdf_skip_white(&p, endptr);
  if (p >= endptr || p[0] != '(')
    return NULL;
  p++;

  while (p < endptr) {
    ch = p[0];
    /* An unnested ')' closes the string. */
    if (ch == ')' && op_count < 1)
      break;
#ifndef PDF_PARSE_STRICT
    if (parser_state.tainted) {
      if (p + 1 < endptr && (ch & 0x80)) {
        if (len + 2 >= PDF_STRING_LEN_MAX) {
          /* The cast makes the argument match the %ld conversion. */
          WARN("PDF string length too long. (limit: %ld)",
               (long) PDF_STRING_LEN_MAX);
          return NULL;
        }
        sbuf[len++] = p[0];
        sbuf[len++] = p[1];
        p += 2;
        continue;
      }
    }
#endif
    if (len + 1 >= PDF_STRING_LEN_MAX) {
      /* The cast makes the argument match the %ld conversion. */
      WARN("PDF string length too long. (limit: %ld)",
           (long) PDF_STRING_LEN_MAX);
      return NULL;
    }
    switch (ch) {
    case '\\':
      ch = ps_getescc(&p, endptr);
      /* A negative value means the escape produced no character. */
      if (ch >= 0)
        sbuf[len++] = (ch & 0xff);
      break;
    case '\r':
      /* CR or CR LF inside a string is stored as a single LF. */
      p++;
      if (p < endptr && p[0] == '\n')
        p++;
      sbuf[len++] = '\n';
      break;
    default:
      if (ch == '(')
        op_count++;
      else if (ch == ')')
        op_count--;
      sbuf[len++] = ch;
      p++;
      break;
    }
  }

  if (op_count > 0 ||
      p >= endptr || p[0] != ')') {
    WARN("Unbalanced parens/truncated PDF literal string.");
    return NULL;
  }

  *pp = p + 1;
  return texpdf_new_string(sbuf, len);
}
/*
 * Parse a PDF hexadecimal string "< ... >" into the shared buffer sbuf.
 *
 * White space between digits is skipped.  Digits are combined in pairs; a
 * final unpaired digit supplies the high nibble with a low nibble of 0.
 * Digit values come from xtoi().
 *
 * Returns the result of texpdf_new_string() and moves *pp past the closing
 * '>'.  Returns NULL when no '<' is found, when the input ends before '>',
 * or when the string exceeds PDF_STRING_LEN_MAX bytes; *pp is then left
 * untouched.
 */
static pdf_obj *
texpdf_parse_pdf_hex_string (const char **pp, const char *endptr)
{
  const char *p;
  long len;

  p = *pp;
  texpdf_skip_white(&p, endptr);
  if (p >= endptr || p[0] != '<')
    return NULL;
  p++;

  len = 0;
  while (p < endptr && p[0] != '>' && len < PDF_STRING_LEN_MAX) {
    int ch;

    texpdf_skip_white(&p, endptr);
    if (p >= endptr || p[0] == '>')
      break;
    /* High nibble. */
    ch = (xtoi(p[0]) << 4);
    p++;
    texpdf_skip_white(&p, endptr);
    /* Low nibble, if another digit precedes the closing '>'. */
    if (p < endptr && p[0] != '>') {
      ch += xtoi(p[0]);
      p++;
    }
    sbuf[len++] = (ch & 0xff);
  }

  if (p >= endptr) {
    WARN("Premature end of input hex string.");
    return NULL;
  } else if (p[0] != '>') {
    /* The loop stopped on the length limit; the cast matches %ld. */
    WARN("PDF string length too long. (limit: %ld)",
         (long) PDF_STRING_LEN_MAX);
    return NULL;
  }

  *pp = p + 1;
  return texpdf_new_string(sbuf, len);
}
/*
 * Parse a PDF string object after skipping white space.  A '(' selects the
 * literal-string parser; a '<' followed by '>' or a hex digit selects the
 * hex-string parser.  At least two bytes of input must remain.
 *
 * Returns the parsed string object, or NULL.  A warning is issued here when
 * neither form is recognised.
 */
pdf_obj *
texpdf_parse_pdf_string (const char **pp, const char *endptr)
{
  const char *p;

  texpdf_skip_white(pp, endptr);
  p = *pp;

  if (p + 2 <= endptr) {
    switch (p[0]) {
    case '(':
      return texpdf_parse_pdf_literal_string(pp, endptr);
    case '<':
      if (p[1] == '>' || isxdigit((unsigned char)p[1])) {
        return texpdf_parse_pdf_hex_string(pp, endptr);
      }
      break;
    default:
      break;
    }
  }

  WARN("Could not find a string object.");
  return NULL;
}
/*
 * Parse a dictionary with texpdf_parse_pdf_dict() and no pdf_file.  Unless
 * PDF_PARSE_STRICT is defined, parser_state.tainted is set to 1 for the
 * duration of the parse and reset to 0 afterwards.
 */
pdf_obj *
texpdf_parse_texpdf_tainted_dict (const char **pp, const char *endptr)
{
  pdf_obj *dict;

#ifndef PDF_PARSE_STRICT
  parser_state.tainted = 1;
#endif
  dict = texpdf_parse_pdf_dict(pp, endptr, NULL);
#ifndef PDF_PARSE_STRICT
  parser_state.tainted = 0;
#endif

  return dict;
}
/*
 * Parse a PDF dictionary "<< /Key value ... >>".  Leading white space and
 * comments are skipped.  `pf` is passed through to texpdf_parse_pdf_object()
 * for the values.
 *
 * On success *pp is moved past the closing ">>" and the dictionary is
 * returned.  Returns NULL without a warning when the input does not start
 * with "<<" (or fewer than four bytes remain), and NULL with a warning when
 * a key or value cannot be parsed or the closing ">>" is missing; the
 * partially built dictionary is released and *pp is left untouched.
 */
pdf_obj *
texpdf_parse_pdf_dict (const char **pp, const char *endptr, pdf_file *pf)
{
  pdf_obj *result = NULL;
  const char *p;

  p = *pp;
  texpdf_skip_white(&p, endptr);
  /* "<<" plus ">>" needs at least four bytes. */
  if (p + 4 > endptr ||
      p[0] != '<' || p[1] != '<') {
    return NULL;
  }
  p += 2;

  result = texpdf_new_dict();

  texpdf_skip_white(&p, endptr);
  while (p < endptr && p[0] != '>') {
    pdf_obj *key, *value;

    texpdf_skip_white(&p, endptr);
    key = texpdf_parse_pdf_name(&p, endptr);
    if (!key) {
      WARN("Could not find a key in dictionary object.");
      texpdf_release_obj(result);
      return NULL;
    }

    texpdf_skip_white(&p, endptr);
    value = texpdf_parse_pdf_object(&p, endptr, pf);
    if (!value) {
      /* value is NULL here, so only the key and the dictionary are released. */
      texpdf_release_obj(key);
      texpdf_release_obj(result);
      WARN("Could not find a value in dictionary object.");
      return NULL;
    }
    texpdf_add_dict(result, key, value);

    texpdf_skip_white(&p, endptr);
  }

  if (p + 2 > endptr ||
      p[0] != '>' || p[1] != '>') {
    WARN("Syntax error: Dictionary object ended prematurely.");
    texpdf_release_obj(result);
    return NULL;
  }

  *pp = p + 2;
  return result;
}
/*
 * Parse a PDF array "[ obj obj ... ]".  Leading white space and comments
 * are skipped, and each element is parsed with texpdf_parse_pdf_object()
 * using `pf`.
 *
 * On success *pp is moved past the closing ']' and the array is returned.
 * On any failure a warning is issued, a partially built array is released,
 * NULL is returned and *pp is left untouched.
 */
pdf_obj *
texpdf_parse_pdf_array (const char **pp, const char *endptr, pdf_file *pf)
{
  pdf_obj *array;
  pdf_obj *item;
  const char *cur = *pp;

  texpdf_skip_white(&cur, endptr);
  /* Need room for at least "[" and "]". */
  if (cur + 2 > endptr || cur[0] != '[') {
    WARN("Could not find an array object.");
    return NULL;
  }

  array = texpdf_new_array();
  cur++;

  for (texpdf_skip_white(&cur, endptr);
       cur < endptr && cur[0] != ']';
       texpdf_skip_white(&cur, endptr)) {
    item = texpdf_parse_pdf_object(&cur, endptr, pf);
    if (!item) {
      texpdf_release_obj(array);
      WARN("Could not find a valid object in array object.");
      return NULL;
    }
    texpdf_add_array(array, item);
  }

  if (cur >= endptr || cur[0] != ']') {
    WARN("Array object ended prematurely.");
    texpdf_release_obj(array);
    return NULL;
  }

  *pp = cur + 1;
  return array;
}
/*
 * Parse a stream body ("stream" ... "endstream") that follows the
 * dictionary `dict`.  The data length is taken from the dictionary's
 * "Length" entry (dereferenced with pdf_deref_obj).
 *
 * A new stream object is created, `dict` is merged into its dictionary and
 * the raw data is added.  The stream is created with STREAM_COMPRESS when
 * `dict` has no "Filter" entry and the data is longer than 10 bytes.
 *
 * On success *pp is moved past "endstream" and the stream is returned.
 * Returns NULL (no warning) when the keyword is missing, "Length" is absent,
 * not a number, negative or larger than the remaining input, or when
 * "endstream" does not follow the data; *pp is then left untouched.
 * `dict` itself is not released here.
 */
static pdf_obj *
texpdf_parse_pdf_stream (const char **pp, const char *endptr, pdf_obj *dict)
{
  pdf_obj *result = NULL;
  const char *p;
  pdf_obj *stream_dict;
  long stream_length;

  p = *pp;
  texpdf_skip_white(&p, endptr);
  if (p + 6 > endptr ||
      strncmp(p, "stream", 6)) {
    return NULL;
  }
  p += 6;

  /* The keyword is followed by LF or CR LF; a lone CR is not consumed. */
  if (p < endptr && p[0] == '\n') {
    p++;
  } else if (p + 1 < endptr &&
             (p[0] == '\r' && p[1] == '\n')) {
    p += 2;
  }

  /* Stream length needed */
  {
    pdf_obj *tmp, *tmp2;

    tmp = texpdf_lookup_dict(dict, "Length");
    if (tmp == NULL) {
      return NULL;
    }
    tmp2 = pdf_deref_obj(tmp);
    if (texpdf_obj_typeof(tmp2) != PDF_NUMBER)
      stream_length = -1;
    else {
      stream_length = (long) texpdf_number_value(tmp2);
    }
    texpdf_release_obj(tmp2);
  }

  /*
   * Compare against the remaining byte count instead of forming
   * p + stream_length, which could overflow for a huge Length value.
   */
  if (stream_length < 0 ||
      stream_length > endptr - p)
    return NULL;

  /* Only compress data that carries no filter of its own. */
  {
    pdf_obj *filters;

    filters = texpdf_lookup_dict(dict, "Filter");
    if (!filters && stream_length > 10) {
      result = texpdf_new_stream(STREAM_COMPRESS);
    } else {
      result = texpdf_new_stream(0);
    }
  }

  stream_dict = texpdf_stream_dict(result);
  texpdf_merge_dict(stream_dict, dict);

  texpdf_add_stream(result, p, stream_length);
  p += stream_length;

  /* Optional CR and/or LF, then the mandatory "endstream" keyword. */
  {
    if (p < endptr && p[0] == '\r')
      p++;
    if (p < endptr && p[0] == '\n')
      p++;

    if (p + 9 > endptr ||
        memcmp(p, "endstream", 9)) {
      texpdf_release_obj(result);
      return NULL;
    }
    p += 9;
  }

  *pp = p;
  return result;
}
/*
 * Try to read an indirect reference "<id> <gen> R" from [start, end).
 *
 * Both numbers must consist of decimal digits and be terminated by white
 * space; 'R' must be followed by a token separator or the end of input.
 * `pf` must not be NULL.
 *
 * On success the result of texpdf_new_indirect(pf, id, gen) is returned and
 * *endptr (if endptr is non-NULL) is set just past the 'R'.  Otherwise NULL
 * is returned and *endptr is set to the original `start`.
 */
static pdf_obj *
try_pdf_reference (const char *start, const char *end, const char **endptr, pdf_file *pf)
{
  unsigned long id = 0;
  unsigned short gen = 0;

  ASSERT(pf);

  if (endptr)
    *endptr = start;

  texpdf_skip_white(&start, end);
  /* The shortest reference, "0 0 R", is five bytes long. */
  if (end - start < 5 || !isdigit((unsigned char)*start)) {
    return NULL;
  }

  /* Object number.  The bound is tested before the byte is read. */
  while (start < end && !is_space(*start)) {
    if (!isdigit((unsigned char)*start)) {
      return NULL;
    }
    id = id * 10 + (*start - '0');
    start++;
  }

  texpdf_skip_white(&start, end);
  if (start >= end || !isdigit((unsigned char)*start))
    return NULL;

  /* Generation number.  The bound is tested before the byte is read. */
  while (start < end && !is_space(*start)) {
    if (!isdigit((unsigned char)*start))
      return NULL;
    gen = gen * 10 + (*start - '0');
    start++;
  }

  texpdf_skip_white(&start, end);
  if (start >= end || *start != 'R')
    return NULL;
  start++;
  if (!PDF_TOKEN_END(start, end))
    return NULL;

  if (endptr)
    *endptr = start;

  return texpdf_new_indirect(pf, id, gen);
}
/*
 * Parse one PDF object at *pp, choosing the parser from the first
 * non-white character:
 *
 *   '<'        hex string, or dictionary when followed by another '<'
 *              (a dictionary followed by "stream" becomes a stream object)
 *   '('        literal string        '['   array
 *   '/'        name                  'n'   null
 *   't', 'f'   boolean               '+', '-', '.'   number
 *   digit      indirect reference when `pf` is non-NULL and the text has
 *              the form "id gen R", otherwise a number
 *   '@'        nothing is parsed; NULL is returned without a warning
 *
 * Returns the parsed object, or NULL on failure.  *pp is advanced by the
 * selected parser.
 */
pdf_obj *
texpdf_parse_pdf_object (const char **pp, const char *endptr, pdf_file *pf)
{
  pdf_obj *result = NULL;
  const char *nextptr;

  texpdf_skip_white(pp, endptr);
  if (*pp >= endptr) {
    WARN("Could not find any valid object.");
    return NULL;
  }

  switch (**pp) {
  case '<':
    /*
     * Only look at the second byte when it exists; a lone '<' at the end of
     * the input is handed to the hex-string parser, which reports it.
     */
    if (*pp + 1 >= endptr || *(*pp + 1) != '<') {
      result = texpdf_parse_pdf_hex_string(pp, endptr);
    } else {
      pdf_obj *dict;

      result = texpdf_parse_pdf_dict(pp, endptr, pf);
      texpdf_skip_white(pp, endptr);
      /* "stream" + "endstream" occupy 15 bytes at the very least. */
      if ( result &&
          *pp <= endptr - 15 &&
          !memcmp(*pp, "stream", 6)) {
        dict = result;
        result = texpdf_parse_pdf_stream(pp, endptr, dict);
        texpdf_release_obj(dict);
      }
    }
    break;

  case '(':
    result = texpdf_parse_pdf_string(pp, endptr);
    break;

  case '[':
    result = texpdf_parse_pdf_array(pp, endptr, pf);
    break;

  case '/':
    result = texpdf_parse_pdf_name(pp, endptr);
    break;

  case 'n':
    result = texpdf_parse_pdf_null(pp, endptr);
    break;

  case 't': case 'f':
    result = texpdf_parse_pdf_boolean(pp, endptr);
    break;

  case '+': case '-': case '.':
    result = texpdf_parse_pdf_number(pp, endptr);
    break;

  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    /* "id gen R" takes precedence over a plain number when a file is given. */
    if (pf && (result = try_pdf_reference(*pp, endptr, &nextptr, pf))) {
      *pp = nextptr;
    } else {
      result = texpdf_parse_pdf_number(pp, endptr);
    }
    break;

  case '@':
    /* Not handled here: result stays NULL and nothing is consumed. */
    break;

  default:
    WARN("Unknown PDF object type.");
    result = NULL;
  }

  return result;
}