filters

Lexer.cc

00001 //========================================================================
00002 //
00003 // Lexer.cc
00004 //
00005 // Copyright 1996-2002 Glyph & Cog, LLC
00006 //
00007 //========================================================================
00008 
00009 #include <aconf.h>
00010 
00011 #ifdef USE_GCC_PRAGMAS
00012 #pragma implementation
00013 #endif
00014 
#include <stdlib.h>
#include <stddef.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include "Lexer.h"
#include "Error.h"
00021 
00022 //------------------------------------------------------------------------
00023 
// Character classification table, indexed by byte value (0x00-0xff).
//   1 = white space: NUL, TAB, LF, FF, CR, and space
//   2 = delimiter:   % ( ) / < > [ ] { }
//   0 = regular character
// A '1' in this array means the character is white space.  A '1' or
// '2' means the character ends a name or command.  The table is only
// ever read, so it is declared const.
static const char specialChars[256] = {
  1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
  1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
};
00044 
00045 //------------------------------------------------------------------------
00046 // Lexer
00047 //------------------------------------------------------------------------
00048 
00049 Lexer::Lexer(XRef *xref, Stream *str) {
00050   Object obj;
00051 
00052   curStr.initStream(str);
00053   streams = new Array(xref);
00054   streams->add(curStr.copy(&obj));
00055   strPtr = 0;
00056   freeArray = gTrue;
00057   curStr.streamReset();
00058 }
00059 
00060 Lexer::Lexer(XRef *xref, Object *obj) {
00061   Object obj2;
00062 
00063   if (obj->isStream()) {
00064     streams = new Array(xref);
00065     freeArray = gTrue;
00066     streams->add(obj->copy(&obj2));
00067   } else {
00068     streams = obj->getArray();
00069     freeArray = gFalse;
00070   }
00071   strPtr = 0;
00072   if (streams->getLength() > 0) {
00073     streams->get(strPtr, &curStr);
00074     curStr.streamReset();
00075   }
00076 }
00077 
00078 Lexer::~Lexer() {
00079   if (!curStr.isNone()) {
00080     curStr.streamClose();
00081     curStr.free();
00082   }
00083   if (freeArray) {
00084     delete streams;
00085   }
00086 }
00087 
00088 int Lexer::getChar() {
00089   int c;
00090 
00091   c = EOF;
00092   while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
00093     curStr.streamClose();
00094     curStr.free();
00095     ++strPtr;
00096     if (strPtr < streams->getLength()) {
00097       streams->get(strPtr, &curStr);
00098       curStr.streamReset();
00099     }
00100   }
00101   return c;
00102 }
00103 
00104 int Lexer::lookChar() {
00105   if (curStr.isNone()) {
00106     return EOF;
00107   }
00108   return curStr.streamLookChar();
00109 }
00110 
00111 Object *Lexer::getObj(Object *obj) {
00112   char *p;
00113   int c, c2;
00114   GBool comment, neg, done;
00115   int numParen;
00116   int xi;
00117   double xf, scale;
00118   GString *s;
00119   int n, m;
00120 
00121   // skip whitespace and comments
00122   comment = gFalse;
00123   while (1) {
00124     if ((c = getChar()) == EOF) {
00125       return obj->initEOF();
00126     }
00127     if (comment) {
00128       if (c == '\r' || c == '\n')
00129     comment = gFalse;
00130     } else if (c == '%') {
00131       comment = gTrue;
00132     } else if (specialChars[c] != 1) {
00133       break;
00134     }
00135   }
00136 
00137   // start reading token
00138   switch (c) {
00139 
00140   // number
00141   case '0': case '1': case '2': case '3': case '4':
00142   case '5': case '6': case '7': case '8': case '9':
00143   case '-': case '.':
00144     neg = gFalse;
00145     xi = 0;
00146     if (c == '-') {
00147       neg = gTrue;
00148     } else if (c == '.') {
00149       goto doReal;
00150     } else {
00151       xi = c - '0';
00152     }
00153     while (1) {
00154       c = lookChar();
00155       if (isdigit(c)) {
00156     getChar();
00157     xi = xi * 10 + (c - '0');
00158       } else if (c == '.') {
00159     getChar();
00160     goto doReal;
00161       } else {
00162     break;
00163       }
00164     }
00165     if (neg)
00166       xi = -xi;
00167     obj->initInt(xi);
00168     break;
00169   doReal:
00170     xf = xi;
00171     scale = 0.1;
00172     while (1) {
00173       c = lookChar();
00174       if (!isdigit(c)) {
00175     break;
00176       }
00177       getChar();
00178       xf = xf + scale * (c - '0');
00179       scale *= 0.1;
00180     }
00181     if (neg)
00182       xf = -xf;
00183     obj->initReal(xf);
00184     break;
00185 
00186   // string
00187   case '(':
00188     p = tokBuf;
00189     n = 0;
00190     numParen = 1;
00191     done = gFalse;
00192     s = NULL;
00193     do {
00194       c2 = EOF;
00195       switch (c = getChar()) {
00196 
00197       case EOF:
00198 #if 0
00199       // This breaks some PDF files, e.g., ones from Photoshop.
00200       case '\r':
00201       case '\n':
00202 #endif
00203     error(getPos(), "Unterminated string");
00204     done = gTrue;
00205     break;
00206 
00207       case '(':
00208     ++numParen;
00209     c2 = c;
00210     break;
00211 
00212       case ')':
00213     if (--numParen == 0) {
00214       done = gTrue;
00215     } else {
00216       c2 = c;
00217     }
00218     break;
00219 
00220       case '\\':
00221     switch (c = getChar()) {
00222     case 'n':
00223       c2 = '\n';
00224       break;
00225     case 'r':
00226       c2 = '\r';
00227       break;
00228     case 't':
00229       c2 = '\t';
00230       break;
00231     case 'b':
00232       c2 = '\b';
00233       break;
00234     case 'f':
00235       c2 = '\f';
00236       break;
00237     case '\\':
00238     case '(':
00239     case ')':
00240       c2 = c;
00241       break;
00242     case '0': case '1': case '2': case '3':
00243     case '4': case '5': case '6': case '7':
00244       c2 = c - '0';
00245       c = lookChar();
00246       if (c >= '0' && c <= '7') {
00247         getChar();
00248         c2 = (c2 << 3) + (c - '0');
00249         c = lookChar();
00250         if (c >= '0' && c <= '7') {
00251           getChar();
00252           c2 = (c2 << 3) + (c - '0');
00253         }
00254       }
00255       break;
00256     case '\r':
00257       c = lookChar();
00258       if (c == '\n') {
00259         getChar();
00260       }
00261       break;
00262     case '\n':
00263       break;
00264     case EOF:
00265       error(getPos(), "Unterminated string");
00266       done = gTrue;
00267       break;
00268     default:
00269       c2 = c;
00270       break;
00271     }
00272     break;
00273 
00274       default:
00275     c2 = c;
00276     break;
00277       }
00278 
00279       if (c2 != EOF) {
00280     if (n == tokBufSize) {
00281       if (!s)
00282         s = new GString(tokBuf, tokBufSize);
00283       else
00284         s->append(tokBuf, tokBufSize);
00285       p = tokBuf;
00286       n = 0;
00287     }
00288     *p++ = (char)c2;
00289     ++n;
00290       }
00291     } while (!done);
00292     if (!s)
00293       s = new GString(tokBuf, n);
00294     else
00295       s->append(tokBuf, n);
00296     obj->initString(s);
00297     break;
00298 
00299   // name
00300   case '/':
00301     p = tokBuf;
00302     n = 0;
00303     while ((c = lookChar()) != EOF && !specialChars[c]) {
00304       getChar();
00305       if (c == '#') {
00306     c2 = lookChar();
00307     if (c2 >= '0' && c2 <= '9') {
00308       c = c2 - '0';
00309     } else if (c2 >= 'A' && c2 <= 'F') {
00310       c = c2 - 'A' + 10;
00311     } else if (c2 >= 'a' && c2 <= 'f') {
00312       c = c2 - 'a' + 10;
00313     } else {
00314       goto notEscChar;
00315     }
00316     getChar();
00317     c <<= 4;
00318     c2 = getChar();
00319     if (c2 >= '0' && c2 <= '9') {
00320       c += c2 - '0';
00321     } else if (c2 >= 'A' && c2 <= 'F') {
00322       c += c2 - 'A' + 10;
00323     } else if (c2 >= 'a' && c2 <= 'f') {
00324       c += c2 - 'a' + 10;
00325     } else {
00326       error(getPos(), "Illegal digit in hex char in name");
00327     }
00328       }
00329      notEscChar:
00330       if (++n == tokBufSize) {
00331     error(getPos(), "Name token too long");
00332     break;
00333       }
00334       *p++ = c;
00335     }
00336     *p = '\0';
00337     obj->initName(tokBuf);
00338     break;
00339 
00340   // array punctuation
00341   case '[':
00342   case ']':
00343     tokBuf[0] = c;
00344     tokBuf[1] = '\0';
00345     obj->initCmd(tokBuf);
00346     break;
00347 
00348   // hex string or dict punctuation
00349   case '<':
00350     c = lookChar();
00351 
00352     // dict punctuation
00353     if (c == '<') {
00354       getChar();
00355       tokBuf[0] = tokBuf[1] = '<';
00356       tokBuf[2] = '\0';
00357       obj->initCmd(tokBuf);
00358 
00359     // hex string
00360     } else {
00361       p = tokBuf;
00362       m = n = 0;
00363       c2 = 0;
00364       s = NULL;
00365       while (1) {
00366     c = getChar();
00367     if (c == '>') {
00368       break;
00369     } else if (c == EOF) {
00370       error(getPos(), "Unterminated hex string");
00371       break;
00372     } else if (specialChars[c] != 1) {
00373       c2 = c2 << 4;
00374       if (c >= '0' && c <= '9')
00375         c2 += c - '0';
00376       else if (c >= 'A' && c <= 'F')
00377         c2 += c - 'A' + 10;
00378       else if (c >= 'a' && c <= 'f')
00379         c2 += c - 'a' + 10;
00380       else
00381         error(getPos(), "Illegal character <%02x> in hex string", c);
00382       if (++m == 2) {
00383         if (n == tokBufSize) {
00384           if (!s)
00385         s = new GString(tokBuf, tokBufSize);
00386           else
00387         s->append(tokBuf, tokBufSize);
00388           p = tokBuf;
00389           n = 0;
00390         }
00391         *p++ = (char)c2;
00392         ++n;
00393         c2 = 0;
00394         m = 0;
00395       }
00396     }
00397       }
00398       if (!s)
00399     s = new GString(tokBuf, n);
00400       else
00401     s->append(tokBuf, n);
00402       if (m == 1)
00403     s->append((char)(c2 << 4));
00404       obj->initString(s);
00405     }
00406     break;
00407 
00408   // dict punctuation
00409   case '>':
00410     c = lookChar();
00411     if (c == '>') {
00412       getChar();
00413       tokBuf[0] = tokBuf[1] = '>';
00414       tokBuf[2] = '\0';
00415       obj->initCmd(tokBuf);
00416     } else {
00417       error(getPos(), "Illegal character '>'");
00418       obj->initError();
00419     }
00420     break;
00421 
00422   // error
00423   case ')':
00424   case '{':
00425   case '}':
00426     error(getPos(), "Illegal character '%c'", c);
00427     obj->initError();
00428     break;
00429 
00430   // command
00431   default:
00432     p = tokBuf;
00433     *p++ = c;
00434     n = 1;
00435     while ((c = lookChar()) != EOF && !specialChars[c]) {
00436       getChar();
00437       if (++n == tokBufSize) {
00438     error(getPos(), "Command token too long");
00439     break;
00440       }
00441       *p++ = c;
00442     }
00443     *p = '\0';
00444     if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
00445       obj->initBool(gTrue);
00446     } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
00447       obj->initBool(gFalse);
00448     } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
00449       obj->initNull();
00450     } else {
00451       obj->initCmd(tokBuf);
00452     }
00453     break;
00454   }
00455 
00456   return obj;
00457 }
00458 
00459 void Lexer::skipToNextLine() {
00460   int c;
00461 
00462   while (1) {
00463     c = getChar();
00464     if (c == EOF || c == '\n') {
00465       return;
00466     }
00467     if (c == '\r') {
00468       if ((c = lookChar()) == '\n') {
00469     getChar();
00470       }
00471       return;
00472     }
00473   }
00474 }
KDE Home | KDE Accessibility Home | Description of Access Keys