filters

CharCodeToUnicode.cc

00001 //========================================================================
00002 //
00003 // CharCodeToUnicode.cc
00004 //
00005 // Copyright 2001-2002 Glyph & Cog, LLC
00006 //
00007 //========================================================================
00008 
00009 #include <aconf.h>
00010 
00011 #ifdef USE_GCC_PRAGMAS
00012 #pragma implementation
00013 #endif
00014 
00015 #include <stdio.h>
00016 #include <string.h>
00017 #include "gmem.h"
00018 #include "gfile.h"
00019 #include "GString.h"
00020 #include "Error.h"
00021 #include "GlobalParams.h"
00022 #include "PSTokenizer.h"
00023 #include "CharCodeToUnicode.h"
00024 
00025 //------------------------------------------------------------------------
00026 
00027 #define maxUnicodeString 8
00028 
00029 struct CharCodeToUnicodeString {
00030   CharCode c;
00031   Unicode u[maxUnicodeString];
00032   int len;
00033 };
00034 
00035 //------------------------------------------------------------------------
00036 
// Character source that reads from an in-memory, NUL-terminated string.
//
// data points at a (char *) cursor.  Each call returns the byte under the
// cursor and advances the cursor by one; once the cursor sits on the
// terminating NUL, EOF is returned and the cursor is left where it is.
//
// The byte is returned as an unsigned char converted to int (the same
// convention fgetc() uses).  Reading it through a plain, possibly signed,
// char would turn bytes >= 0x80 into negative values, and 0xFF into -1,
// which is indistinguishable from EOF.
static int getCharFromString(void *data) {
  char **cursor;
  char *p;
  int c;

  cursor = (char **)data;
  p = *cursor;
  if (*p) {
    c = (unsigned char)*p++;
    *cursor = p;
  } else {
    c = EOF;
  }
  return c;
}
00050 
// Character source that reads from a stdio stream.  data is the FILE * to
// read from; the result is whatever fgetc() returns for that stream.
static int getCharFromFile(void *data) {
  FILE *stream = (FILE *)data;

  return fgetc(stream);
}
00054 
00055 //------------------------------------------------------------------------
00056 
00057 CharCodeToUnicode *CharCodeToUnicode::parseCIDToUnicode(GString *collectionA) {
00058   FILE *f;
00059   Unicode *mapA;
00060   CharCode size, mapLenA;
00061   char buf[64];
00062   Unicode u;
00063   CharCodeToUnicode *ctu;
00064 
00065   if (!(f = globalParams->getCIDToUnicodeFile(collectionA))) {
00066     error(-1, "Couldn't find cidToUnicode file for the '%s' collection",
00067       collectionA->getCString());
00068     return NULL;
00069   }
00070 
00071   size = 32768;
00072   mapA = (Unicode *)gmalloc(size * sizeof(Unicode));
00073   mapLenA = 0;
00074 
00075   while (getLine(buf, sizeof(buf), f)) {
00076     if (mapLenA == size) {
00077       size *= 2;
00078       mapA = (Unicode *)grealloc(mapA, size * sizeof(Unicode));
00079     }
00080     if (sscanf(buf, "%x", &u) == 1) {
00081       mapA[mapLenA] = u;
00082     } else {
00083       error(-1, "Bad line (%d) in cidToUnicode file for the '%s' collection",
00084         (int)(mapLenA + 1), collectionA->getCString());
00085       mapA[mapLenA] = 0;
00086     }
00087     ++mapLenA;
00088   }
00089   fclose(f);
00090 
00091   ctu = new CharCodeToUnicode(collectionA->copy(), mapA, mapLenA, gTrue,
00092                   NULL, 0);
00093   gfree(mapA);
00094   return ctu;
00095 }
00096 
00097 CharCodeToUnicode *CharCodeToUnicode::make8BitToUnicode(Unicode *toUnicode) {
00098   return new CharCodeToUnicode(NULL, toUnicode, 256, gTrue, NULL, 0);
00099 }
00100 
00101 CharCodeToUnicode *CharCodeToUnicode::parseCMap(GString *buf, int nBits) {
00102   CharCodeToUnicode *ctu;
00103   char *p;
00104 
00105   ctu = new CharCodeToUnicode(NULL);
00106   p = buf->getCString();
00107   ctu->parseCMap1(&getCharFromString, &p, nBits);
00108   return ctu;
00109 }
00110 
00111 void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data,
00112                    int nBits) {
00113   PSTokenizer *pst;
00114   char tok1[256], tok2[256], tok3[256];
00115   int nDigits, n1, n2, n3;
00116   CharCode oldLen, i;
00117   CharCode code1, code2;
00118   Unicode u;
00119   char uHex[5];
00120   int j;
00121   GString *name;
00122   FILE *f;
00123 
00124   nDigits = nBits / 4;
00125   pst = new PSTokenizer(getCharFunc, data);
00126   pst->getToken(tok1, sizeof(tok1), &n1);
00127   while (pst->getToken(tok2, sizeof(tok2), &n2)) {
00128     if (!strcmp(tok2, "usecmap")) {
00129       if (tok1[0] == '/') {
00130     name = new GString(tok1 + 1);
00131     if ((f = globalParams->findToUnicodeFile(name))) {
00132       parseCMap1(&getCharFromFile, f, nBits);
00133       fclose(f);
00134     } else {
00135       error(-1, "Couldn't find ToUnicode CMap file for '%s'",
00136         name->getCString());
00137     }
00138     delete name;
00139       }
00140       pst->getToken(tok1, sizeof(tok1), &n1);
00141     } else if (!strcmp(tok2, "beginbfchar")) {
00142       while (pst->getToken(tok1, sizeof(tok1), &n1)) {
00143     if (!strcmp(tok1, "endbfchar")) {
00144       break;
00145     }
00146     if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
00147         !strcmp(tok2, "endbfchar")) {
00148       error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00149       break;
00150     }
00151     if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
00152           tok2[0] == '<' && tok2[n2 - 1] == '>')) {
00153       error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00154       continue;
00155     }
00156     tok1[n1 - 1] = tok2[n2 - 1] = '\0';
00157     if (sscanf(tok1 + 1, "%x", &code1) != 1) {
00158       error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00159       continue;
00160     }
00161     if (code1 >= mapLen) {
00162       oldLen = mapLen;
00163       mapLen = (code1 + 256) & ~255;
00164       map = (Unicode *)grealloc(map, mapLen * sizeof(Unicode));
00165       for (i = oldLen; i < mapLen; ++i) {
00166         map[i] = 0;
00167       }
00168     }
00169     if (n2 == 6) {
00170       if (sscanf(tok2 + 1, "%x", &u) != 1) {
00171         error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00172         continue;
00173       }
00174       map[code1] = u;
00175     } else {
00176       map[code1] = 0;
00177       if (sMapLen == sMapSize) {
00178         sMapSize += 8;
00179         sMap = (CharCodeToUnicodeString *)
00180             grealloc(sMap, sMapSize * sizeof(CharCodeToUnicodeString));
00181       }
00182       sMap[sMapLen].c = code1;
00183       sMap[sMapLen].len = (n2 - 2) / 4;
00184       for (j = 0; j < sMap[sMapLen].len && j < maxUnicodeString; ++j) {
00185         strncpy(uHex, tok2 + 1 + j*4, 4);
00186         uHex[4] = '\0';
00187         if (sscanf(uHex, "%x", &sMap[sMapLen].u[j]) != 1) {
00188           error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00189         }
00190       }
00191       ++sMapLen;
00192     }
00193       }
00194       pst->getToken(tok1, sizeof(tok1), &n1);
00195     } else if (!strcmp(tok2, "beginbfrange")) {
00196       while (pst->getToken(tok1, sizeof(tok1), &n1)) {
00197     if (!strcmp(tok1, "endbfrange")) {
00198       break;
00199     }
00200     if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
00201         !strcmp(tok2, "endbfrange") ||
00202         !pst->getToken(tok3, sizeof(tok3), &n3) ||
00203         !strcmp(tok3, "endbfrange")) {
00204       error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00205       break;
00206     }
00207     if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
00208           n2 == 2 + nDigits && tok2[0] == '<' && tok2[n2 - 1] == '>' &&
00209           tok3[0] == '<' && tok3[n3 - 1] == '>')) {
00210       error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00211       continue;
00212     }
00213     tok1[n1 - 1] = tok2[n2 - 1] = tok3[n3 - 1] = '\0';
00214     if (sscanf(tok1 + 1, "%x", &code1) != 1 ||
00215         sscanf(tok2 + 1, "%x", &code2) != 1) {
00216       error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00217       continue;
00218     }
00219     if (code2 >= mapLen) {
00220       oldLen = mapLen;
00221       mapLen = (code2 + 256) & ~255;
00222       map = (Unicode *)grealloc(map, mapLen * sizeof(Unicode));
00223       for (i = oldLen; i < mapLen; ++i) {
00224         map[i] = 0;
00225       }
00226     }
00227     if (n3 == 6) {
00228       if (sscanf(tok3 + 1, "%x", &u) != 1) {
00229         error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00230         continue;
00231       }
00232       for (; code1 <= code2; ++code1) {
00233         map[code1] = u++;
00234       }
00235     } else {
00236       if (sMapLen + (int)(code2 - code1 + 1) > sMapSize) {
00237         sMapSize = (sMapSize + (code2 - code1 + 1) + 7) & ~7;
00238         sMap = (CharCodeToUnicodeString *)
00239             grealloc(sMap, sMapSize * sizeof(CharCodeToUnicodeString));
00240       }
00241       for (i = 0; code1 <= code2; ++code1, ++i) {
00242         map[code1] = 0;
00243         sMap[sMapLen].c = code1;
00244         sMap[sMapLen].len = (n3 - 2) / 4;
00245         for (j = 0; j < sMap[sMapLen].len && j < maxUnicodeString; ++j) {
00246           strncpy(uHex, tok3 + 1 + j*4, 4);
00247           uHex[4] = '\0';
00248           if (sscanf(uHex, "%x", &sMap[sMapLen].u[j]) != 1) {
00249         error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00250           }
00251         }
00252         sMap[sMapLen].u[sMap[sMapLen].len - 1] += i;
00253         ++sMapLen;
00254       }
00255     }
00256       }
00257       pst->getToken(tok1, sizeof(tok1), &n1);
00258     } else {
00259       strcpy(tok1, tok2);
00260     }
00261   }
00262   delete pst;
00263 }
00264 
00265 CharCodeToUnicode::CharCodeToUnicode(GString *collectionA) {
00266   CharCode i;
00267 
00268   collection = collectionA;
00269   mapLen = 256;
00270   map = (Unicode *)gmalloc(mapLen * sizeof(Unicode));
00271   for (i = 0; i < mapLen; ++i) {
00272     map[i] = 0;
00273   }
00274   sMap = NULL;
00275   sMapLen = sMapSize = 0;
00276   refCnt = 1;
00277 }
00278 
00279 CharCodeToUnicode::CharCodeToUnicode(GString *collectionA, Unicode *mapA,
00280                      CharCode mapLenA, GBool copyMap,
00281                      CharCodeToUnicodeString *sMapA,
00282                      int sMapLenA) {
00283   collection = collectionA;
00284   mapLen = mapLenA;
00285   if (copyMap) {
00286     map = (Unicode *)gmalloc(mapLen * sizeof(Unicode));
00287     memcpy(map, mapA, mapLen * sizeof(Unicode));
00288   } else {
00289     map = mapA;
00290   }
00291   sMap = sMapA;
00292   sMapLen = sMapSize = sMapLenA;
00293   refCnt = 1;
00294 }
00295 
00296 CharCodeToUnicode::~CharCodeToUnicode() {
00297   if (collection) {
00298     delete collection;
00299   }
00300   gfree(map);
00301   if (sMap) {
00302     gfree(sMap);
00303   }
00304 }
00305 
00306 void CharCodeToUnicode::incRefCnt() {
00307   ++refCnt;
00308 }
00309 
00310 void CharCodeToUnicode::decRefCnt() {
00311   if (--refCnt == 0) {
00312     delete this;
00313   }
00314 }
00315 
00316 GBool CharCodeToUnicode::match(GString *collectionA) {
00317   return collection && !collection->cmp(collectionA);
00318 }
00319 
00320 int CharCodeToUnicode::mapToUnicode(CharCode c, Unicode *u, int size) {
00321   int i, j;
00322 
00323   if (c >= mapLen) {
00324     return 0;
00325   }
00326   if (map[c]) {
00327     u[0] = map[c];
00328     return 1;
00329   }
00330   for (i = 0; i < sMapLen; ++i) {
00331     if (sMap[i].c == c) {
00332       for (j = 0; j < sMap[i].len && j < size; ++j) {
00333     u[j] = sMap[i].u[j];
00334       }
00335       return j;
00336     }
00337   }
00338   return 0;
00339 }
00340 
00341 //------------------------------------------------------------------------
00342 
00343 CIDToUnicodeCache::CIDToUnicodeCache() {
00344   int i;
00345 
00346   for (i = 0; i < cidToUnicodeCacheSize; ++i) {
00347     cache[i] = NULL;
00348   }
00349 }
00350 
00351 CIDToUnicodeCache::~CIDToUnicodeCache() {
00352   int i;
00353 
00354   for (i = 0; i < cidToUnicodeCacheSize; ++i) {
00355     if (cache[i]) {
00356       cache[i]->decRefCnt();
00357     }
00358   }
00359 }
00360 
00361 CharCodeToUnicode *CIDToUnicodeCache::getCIDToUnicode(GString *collection) {
00362   CharCodeToUnicode *ctu;
00363   int i, j;
00364 
00365   if (cache[0] && cache[0]->match(collection)) {
00366     cache[0]->incRefCnt();
00367     return cache[0];
00368   }
00369   for (i = 1; i < cidToUnicodeCacheSize; ++i) {
00370     if (cache[i] && cache[i]->match(collection)) {
00371       ctu = cache[i];
00372       for (j = i; j >= 1; --j) {
00373     cache[j] = cache[j - 1];
00374       }
00375       cache[0] = ctu;
00376       ctu->incRefCnt();
00377       return ctu;
00378     }
00379   }
00380   if ((ctu = CharCodeToUnicode::parseCIDToUnicode(collection))) {
00381     if (cache[cidToUnicodeCacheSize - 1]) {
00382       cache[cidToUnicodeCacheSize - 1]->decRefCnt();
00383     }
00384     for (j = cidToUnicodeCacheSize - 1; j >= 1; --j) {
00385       cache[j] = cache[j - 1];
00386     }
00387     cache[0] = ctu;
00388     ctu->incRefCnt();
00389     return ctu;
00390   }
00391   return NULL;
00392 }
KDE Home | KDE Accessibility Home | Description of Access Keys