00001
00002
00003
00004
00005
00006
00007
00008
00009 #include <aconf.h>
00010
00011 #ifdef USE_GCC_PRAGMAS
00012 #pragma implementation
00013 #endif
00014
00015 #include <stdio.h>
00016 #include <string.h>
00017 #include "gmem.h"
00018 #include "gfile.h"
00019 #include "GString.h"
00020 #include "Error.h"
00021 #include "GlobalParams.h"
00022 #include "PSTokenizer.h"
00023 #include "CharCodeToUnicode.h"
00024
00025
00026
00027 #define maxUnicodeString 8
00028
00029 struct CharCodeToUnicodeString {
00030 CharCode c;
00031 Unicode u[maxUnicodeString];
00032 int len;
00033 };
00034
00035
00036
// Character source that reads from a NUL-terminated string.
//
// data points to a 'char *' cursor.  Each call returns the byte under
// the cursor and advances the cursor by one; once the terminating NUL is
// reached the cursor stays put and EOF is returned on every call.
//
// The byte is returned as an unsigned char converted to int (the same
// convention as fgetc), so bytes >= 0x80 are never negative and 0xFF can
// not be mistaken for EOF on platforms where plain char is signed.
static int getCharFromString(void *data) {
  char **cursor = (char **)data;
  char *p = *cursor;

  if (!*p) {
    return EOF;
  }
  *cursor = p + 1;
  return (int)(unsigned char)*p;
}
00050
// Character source that reads from a stdio stream: data is the FILE *
// to read from, and the result is whatever fgetc returns for it.
static int getCharFromFile(void *data) {
  FILE *stream = (FILE *)data;
  return fgetc(stream);
}
00054
00055
00056
00057 CharCodeToUnicode *CharCodeToUnicode::parseCIDToUnicode(GString *collectionA) {
00058 FILE *f;
00059 Unicode *mapA;
00060 CharCode size, mapLenA;
00061 char buf[64];
00062 Unicode u;
00063 CharCodeToUnicode *ctu;
00064
00065 if (!(f = globalParams->getCIDToUnicodeFile(collectionA))) {
00066 error(-1, "Couldn't find cidToUnicode file for the '%s' collection",
00067 collectionA->getCString());
00068 return NULL;
00069 }
00070
00071 size = 32768;
00072 mapA = (Unicode *)gmalloc(size * sizeof(Unicode));
00073 mapLenA = 0;
00074
00075 while (getLine(buf, sizeof(buf), f)) {
00076 if (mapLenA == size) {
00077 size *= 2;
00078 mapA = (Unicode *)grealloc(mapA, size * sizeof(Unicode));
00079 }
00080 if (sscanf(buf, "%x", &u) == 1) {
00081 mapA[mapLenA] = u;
00082 } else {
00083 error(-1, "Bad line (%d) in cidToUnicode file for the '%s' collection",
00084 (int)(mapLenA + 1), collectionA->getCString());
00085 mapA[mapLenA] = 0;
00086 }
00087 ++mapLenA;
00088 }
00089 fclose(f);
00090
00091 ctu = new CharCodeToUnicode(collectionA->copy(), mapA, mapLenA, gTrue,
00092 NULL, 0);
00093 gfree(mapA);
00094 return ctu;
00095 }
00096
00097 CharCodeToUnicode *CharCodeToUnicode::make8BitToUnicode(Unicode *toUnicode) {
00098 return new CharCodeToUnicode(NULL, toUnicode, 256, gTrue, NULL, 0);
00099 }
00100
00101 CharCodeToUnicode *CharCodeToUnicode::parseCMap(GString *buf, int nBits) {
00102 CharCodeToUnicode *ctu;
00103 char *p;
00104
00105 ctu = new CharCodeToUnicode(NULL);
00106 p = buf->getCString();
00107 ctu->parseCMap1(&getCharFromString, &p, nBits);
00108 return ctu;
00109 }
00110
00111 void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data,
00112 int nBits) {
00113 PSTokenizer *pst;
00114 char tok1[256], tok2[256], tok3[256];
00115 int nDigits, n1, n2, n3;
00116 CharCode oldLen, i;
00117 CharCode code1, code2;
00118 Unicode u;
00119 char uHex[5];
00120 int j;
00121 GString *name;
00122 FILE *f;
00123
00124 nDigits = nBits / 4;
00125 pst = new PSTokenizer(getCharFunc, data);
00126 pst->getToken(tok1, sizeof(tok1), &n1);
00127 while (pst->getToken(tok2, sizeof(tok2), &n2)) {
00128 if (!strcmp(tok2, "usecmap")) {
00129 if (tok1[0] == '/') {
00130 name = new GString(tok1 + 1);
00131 if ((f = globalParams->findToUnicodeFile(name))) {
00132 parseCMap1(&getCharFromFile, f, nBits);
00133 fclose(f);
00134 } else {
00135 error(-1, "Couldn't find ToUnicode CMap file for '%s'",
00136 name->getCString());
00137 }
00138 delete name;
00139 }
00140 pst->getToken(tok1, sizeof(tok1), &n1);
00141 } else if (!strcmp(tok2, "beginbfchar")) {
00142 while (pst->getToken(tok1, sizeof(tok1), &n1)) {
00143 if (!strcmp(tok1, "endbfchar")) {
00144 break;
00145 }
00146 if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
00147 !strcmp(tok2, "endbfchar")) {
00148 error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00149 break;
00150 }
00151 if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
00152 tok2[0] == '<' && tok2[n2 - 1] == '>')) {
00153 error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00154 continue;
00155 }
00156 tok1[n1 - 1] = tok2[n2 - 1] = '\0';
00157 if (sscanf(tok1 + 1, "%x", &code1) != 1) {
00158 error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00159 continue;
00160 }
00161 if (code1 >= mapLen) {
00162 oldLen = mapLen;
00163 mapLen = (code1 + 256) & ~255;
00164 map = (Unicode *)grealloc(map, mapLen * sizeof(Unicode));
00165 for (i = oldLen; i < mapLen; ++i) {
00166 map[i] = 0;
00167 }
00168 }
00169 if (n2 == 6) {
00170 if (sscanf(tok2 + 1, "%x", &u) != 1) {
00171 error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00172 continue;
00173 }
00174 map[code1] = u;
00175 } else {
00176 map[code1] = 0;
00177 if (sMapLen == sMapSize) {
00178 sMapSize += 8;
00179 sMap = (CharCodeToUnicodeString *)
00180 grealloc(sMap, sMapSize * sizeof(CharCodeToUnicodeString));
00181 }
00182 sMap[sMapLen].c = code1;
00183 sMap[sMapLen].len = (n2 - 2) / 4;
00184 for (j = 0; j < sMap[sMapLen].len && j < maxUnicodeString; ++j) {
00185 strncpy(uHex, tok2 + 1 + j*4, 4);
00186 uHex[4] = '\0';
00187 if (sscanf(uHex, "%x", &sMap[sMapLen].u[j]) != 1) {
00188 error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00189 }
00190 }
00191 ++sMapLen;
00192 }
00193 }
00194 pst->getToken(tok1, sizeof(tok1), &n1);
00195 } else if (!strcmp(tok2, "beginbfrange")) {
00196 while (pst->getToken(tok1, sizeof(tok1), &n1)) {
00197 if (!strcmp(tok1, "endbfrange")) {
00198 break;
00199 }
00200 if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
00201 !strcmp(tok2, "endbfrange") ||
00202 !pst->getToken(tok3, sizeof(tok3), &n3) ||
00203 !strcmp(tok3, "endbfrange")) {
00204 error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00205 break;
00206 }
00207 if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
00208 n2 == 2 + nDigits && tok2[0] == '<' && tok2[n2 - 1] == '>' &&
00209 tok3[0] == '<' && tok3[n3 - 1] == '>')) {
00210 error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00211 continue;
00212 }
00213 tok1[n1 - 1] = tok2[n2 - 1] = tok3[n3 - 1] = '\0';
00214 if (sscanf(tok1 + 1, "%x", &code1) != 1 ||
00215 sscanf(tok2 + 1, "%x", &code2) != 1) {
00216 error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00217 continue;
00218 }
00219 if (code2 >= mapLen) {
00220 oldLen = mapLen;
00221 mapLen = (code2 + 256) & ~255;
00222 map = (Unicode *)grealloc(map, mapLen * sizeof(Unicode));
00223 for (i = oldLen; i < mapLen; ++i) {
00224 map[i] = 0;
00225 }
00226 }
00227 if (n3 == 6) {
00228 if (sscanf(tok3 + 1, "%x", &u) != 1) {
00229 error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00230 continue;
00231 }
00232 for (; code1 <= code2; ++code1) {
00233 map[code1] = u++;
00234 }
00235 } else {
00236 if (sMapLen + (int)(code2 - code1 + 1) > sMapSize) {
00237 sMapSize = (sMapSize + (code2 - code1 + 1) + 7) & ~7;
00238 sMap = (CharCodeToUnicodeString *)
00239 grealloc(sMap, sMapSize * sizeof(CharCodeToUnicodeString));
00240 }
00241 for (i = 0; code1 <= code2; ++code1, ++i) {
00242 map[code1] = 0;
00243 sMap[sMapLen].c = code1;
00244 sMap[sMapLen].len = (n3 - 2) / 4;
00245 for (j = 0; j < sMap[sMapLen].len && j < maxUnicodeString; ++j) {
00246 strncpy(uHex, tok3 + 1 + j*4, 4);
00247 uHex[4] = '\0';
00248 if (sscanf(uHex, "%x", &sMap[sMapLen].u[j]) != 1) {
00249 error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00250 }
00251 }
00252 sMap[sMapLen].u[sMap[sMapLen].len - 1] += i;
00253 ++sMapLen;
00254 }
00255 }
00256 }
00257 pst->getToken(tok1, sizeof(tok1), &n1);
00258 } else {
00259 strcpy(tok1, tok2);
00260 }
00261 }
00262 delete pst;
00263 }
00264
00265 CharCodeToUnicode::CharCodeToUnicode(GString *collectionA) {
00266 CharCode i;
00267
00268 collection = collectionA;
00269 mapLen = 256;
00270 map = (Unicode *)gmalloc(mapLen * sizeof(Unicode));
00271 for (i = 0; i < mapLen; ++i) {
00272 map[i] = 0;
00273 }
00274 sMap = NULL;
00275 sMapLen = sMapSize = 0;
00276 refCnt = 1;
00277 }
00278
00279 CharCodeToUnicode::CharCodeToUnicode(GString *collectionA, Unicode *mapA,
00280 CharCode mapLenA, GBool copyMap,
00281 CharCodeToUnicodeString *sMapA,
00282 int sMapLenA) {
00283 collection = collectionA;
00284 mapLen = mapLenA;
00285 if (copyMap) {
00286 map = (Unicode *)gmalloc(mapLen * sizeof(Unicode));
00287 memcpy(map, mapA, mapLen * sizeof(Unicode));
00288 } else {
00289 map = mapA;
00290 }
00291 sMap = sMapA;
00292 sMapLen = sMapSize = sMapLenA;
00293 refCnt = 1;
00294 }
00295
00296 CharCodeToUnicode::~CharCodeToUnicode() {
00297 if (collection) {
00298 delete collection;
00299 }
00300 gfree(map);
00301 if (sMap) {
00302 gfree(sMap);
00303 }
00304 }
00305
00306 void CharCodeToUnicode::incRefCnt() {
00307 ++refCnt;
00308 }
00309
00310 void CharCodeToUnicode::decRefCnt() {
00311 if (--refCnt == 0) {
00312 delete this;
00313 }
00314 }
00315
00316 GBool CharCodeToUnicode::match(GString *collectionA) {
00317 return collection && !collection->cmp(collectionA);
00318 }
00319
00320 int CharCodeToUnicode::mapToUnicode(CharCode c, Unicode *u, int size) {
00321 int i, j;
00322
00323 if (c >= mapLen) {
00324 return 0;
00325 }
00326 if (map[c]) {
00327 u[0] = map[c];
00328 return 1;
00329 }
00330 for (i = 0; i < sMapLen; ++i) {
00331 if (sMap[i].c == c) {
00332 for (j = 0; j < sMap[i].len && j < size; ++j) {
00333 u[j] = sMap[i].u[j];
00334 }
00335 return j;
00336 }
00337 }
00338 return 0;
00339 }
00340
00341
00342
00343 CIDToUnicodeCache::CIDToUnicodeCache() {
00344 int i;
00345
00346 for (i = 0; i < cidToUnicodeCacheSize; ++i) {
00347 cache[i] = NULL;
00348 }
00349 }
00350
00351 CIDToUnicodeCache::~CIDToUnicodeCache() {
00352 int i;
00353
00354 for (i = 0; i < cidToUnicodeCacheSize; ++i) {
00355 if (cache[i]) {
00356 cache[i]->decRefCnt();
00357 }
00358 }
00359 }
00360
00361 CharCodeToUnicode *CIDToUnicodeCache::getCIDToUnicode(GString *collection) {
00362 CharCodeToUnicode *ctu;
00363 int i, j;
00364
00365 if (cache[0] && cache[0]->match(collection)) {
00366 cache[0]->incRefCnt();
00367 return cache[0];
00368 }
00369 for (i = 1; i < cidToUnicodeCacheSize; ++i) {
00370 if (cache[i] && cache[i]->match(collection)) {
00371 ctu = cache[i];
00372 for (j = i; j >= 1; --j) {
00373 cache[j] = cache[j - 1];
00374 }
00375 cache[0] = ctu;
00376 ctu->incRefCnt();
00377 return ctu;
00378 }
00379 }
00380 if ((ctu = CharCodeToUnicode::parseCIDToUnicode(collection))) {
00381 if (cache[cidToUnicodeCacheSize - 1]) {
00382 cache[cidToUnicodeCacheSize - 1]->decRefCnt();
00383 }
00384 for (j = cidToUnicodeCacheSize - 1; j >= 1; --j) {
00385 cache[j] = cache[j - 1];
00386 }
00387 cache[0] = ctu;
00388 ctu->incRefCnt();
00389 return ctu;
00390 }
00391 return NULL;
00392 }