00001
00002
00003
00004
00005
00006
00007
00008
00009 #include <aconf.h>
00010
00011 #ifdef USE_GCC_PRAGMAS
00012 #pragma implementation
00013 #endif
00014
00015 #include <stdio.h>
00016 #include <stdlib.h>
00017 #include <stddef.h>
00018 #include <math.h>
00019 #include <ctype.h>
00020 #include "GString.h"
00021 #include "gmem.h"
00022 #include "config.h"
00023 #include "Error.h"
00024 #include "GlobalParams.h"
00025 #include "UnicodeMap.h"
00026 #include "GfxState.h"
00027 #include "TextOutputDev.h"
00028
00029 #ifdef MACOS
00030
00031 #include "ICSupport.h"
00032 #endif
00033
00034
00035
// Inter-string gap, as a fraction of the string height, above which
// TextPage::coalesce marks a string with spaceAfter and inserts a space
// character after it in the block text.
00036 #define textOutSpace 0.2
// Gap to the preceding string, as a fraction of the string height, used by
// the column-edge detection in TextPage::coalesce to decide whether a string
// starts after a column-sized gap.
00037 #define textOutColSpace 0.2
00038
00039
00040
// One column edge found by TextPage::coalesce: x is the left coordinate
// (xMin) of the string that started the edge, and y0..y1 is the vertical
// extent (min yMin .. max yMax) of the strings gathered along that edge.
00041 struct TextOutColumnEdge {
00042 double x, y0, y1;
00043 };
00044
00045
00046
00047
00048
00049 TextBlock::TextBlock() {
00050 strings = NULL;
00051 next = NULL;
00052 xyNext = NULL;
00053 text = NULL;
00054 xRight = NULL;
00055 col = NULL;
00056 }
00057
00058 TextBlock::~TextBlock() {
00059 TextString *p1, *p2;
00060
00061 for (p1 = strings; p1; p1 = p2) {
00062 p2 = p1->next;
00063 delete p1;
00064 }
00065 gfree(text);
00066 gfree(xRight);
00067 gfree(col);
00068 }
00069
00070
00071
00072
00073
00074 TextLine::TextLine() {
00075 blocks = NULL;
00076 next = NULL;
00077 }
00078
00079 TextLine::~TextLine() {
00080 TextBlock *p1, *p2;
00081
00082 for (p1 = blocks; p1; p1 = p2) {
00083 p2 = p1->next;
00084 delete p1;
00085 }
00086 }
00087
00088
00089
00090
00091
00092 TextString::TextString(GfxState *state, double x0, double y0,
00093 double fontSize) {
00094 GfxFont *font;
00095 double x, y;
00096
00097 state->transform(x0, y0, &x, &y);
00098 if ((font = state->getFont())) {
00099 yMin = y - font->getAscent() * fontSize;
00100 yMax = y - font->getDescent() * fontSize;
00101 } else {
00102
00103
00104 yMin = y - 0.95 * fontSize;
00105 yMax = y + 0.35 * fontSize;
00106 }
00107 if (yMin == yMax) {
00108
00109
00110 yMin = y;
00111 yMax = y + 1;
00112 }
00113 marked = gFalse;
00114 text = NULL;
00115 xRight = NULL;
00116 len = size = 0;
00117 next = NULL;
00118 }
00119
00120
00121 TextString::~TextString() {
00122 gfree(text);
00123 gfree(xRight);
00124 }
00125
00126 void TextString::addChar(GfxState *, double x, double ,
00127 double dx, double , Unicode u) {
00128 if (len == size) {
00129 size += 16;
00130 text = (Unicode *)grealloc(text, size * sizeof(Unicode));
00131 xRight = (double *)grealloc(xRight, size * sizeof(double));
00132 }
00133 text[len] = u;
00134 if (len == 0) {
00135 xMin = x;
00136 }
00137 xMax = xRight[len] = x + dx;
00138 ++len;
00139 }
00140
00141
00142
00143
00144
00145 TextPage::TextPage(GBool rawOrderA) {
00146 rawOrder = rawOrderA;
00147 curStr = NULL;
00148 fontSize = 0;
00149 xyStrings = NULL;
00150 xyCur1 = xyCur2 = NULL;
00151 lines = NULL;
00152 nest = 0;
00153 nTinyChars = 0;
00154 }
00155
00156 TextPage::~TextPage() {
00157 clear();
00158 }
00159
00160 void TextPage::updateFont(GfxState *state) {
00161 GfxFont *font;
00162 double *fm;
00163 char *name;
00164 int code, mCode, letterCode, anyCode;
00165 double w;
00166
00167
00168 fontSize = state->getTransformedFontSize();
00169 if ((font = state->getFont()) && font->getType() == fontType3) {
00170
00171
00172
00173
00174
00175
00176 mCode = letterCode = anyCode = -1;
00177 for (code = 0; code < 256; ++code) {
00178 name = ((Gfx8BitFont *)font)->getCharName(code);
00179 if (name && name[0] == 'm' && name[1] == '\0') {
00180 mCode = code;
00181 }
00182 if (letterCode < 0 && name && name[1] == '\0' &&
00183 ((name[0] >= 'A' && name[0] <= 'Z') ||
00184 (name[0] >= 'a' && name[0] <= 'z'))) {
00185 letterCode = code;
00186 }
00187 if (anyCode < 0 && name && ((Gfx8BitFont *)font)->getWidth(code) > 0) {
00188 anyCode = code;
00189 }
00190 }
00191 if (mCode >= 0 &&
00192 (w = ((Gfx8BitFont *)font)->getWidth(mCode)) > 0) {
00193
00194 fontSize *= w / 0.6;
00195 } else if (letterCode >= 0 &&
00196 (w = ((Gfx8BitFont *)font)->getWidth(letterCode)) > 0) {
00197
00198 fontSize *= w / 0.5;
00199 } else if (anyCode >= 0 &&
00200 (w = ((Gfx8BitFont *)font)->getWidth(anyCode)) > 0) {
00201
00202 fontSize *= w / 0.5;
00203 }
00204 fm = font->getFontMatrix();
00205 if (fm[0] != 0) {
00206 fontSize *= fabs(fm[3] / fm[0]);
00207 }
00208 }
00209 }
00210
00211 void TextPage::beginString(GfxState *state, double x0, double y0) {
00212
00213
00214 if (curStr) {
00215 ++nest;
00216 return;
00217 }
00218
00219 curStr = new TextString(state, x0, y0, fontSize);
00220 }
00221
00222 void TextPage::addChar(GfxState *state, double x, double y,
00223 double dx, double dy, Unicode *u, int uLen) {
00224 double x1, y1, w1, h1, dx2, dy2;
00225 int n, i;
00226
00227 state->transform(x, y, &x1, &y1);
00228 if (x1 < 0 || x1 > state->getPageWidth() ||
00229 y1 < 0 || y1 > state->getPageHeight()) {
00230 return;
00231 }
00232 state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
00233 0, &dx2, &dy2);
00234 dx -= dx2;
00235 dy -= dy2;
00236 state->transformDelta(dx, dy, &w1, &h1);
00237 if (!globalParams->getTextKeepTinyChars() &&
00238 fabs(w1) < 3 && fabs(h1) < 3) {
00239 if (++nTinyChars > 20000) {
00240 return;
00241 }
00242 }
00243 n = curStr->len;
00244 if (n > 0 && x1 - curStr->xRight[n-1] >
00245 0.1 * (curStr->yMax - curStr->yMin)) {
00246
00247 endString();
00248 beginString(state, x, y);
00249 }
00250 if (uLen == 1 && u[0] == (Unicode)0x20 &&
00251 w1 > 0.5 * (curStr->yMax - curStr->yMin)) {
00252
00253 return;
00254 }
00255 if (uLen != 0) {
00256 w1 /= uLen;
00257 h1 /= uLen;
00258 }
00259 for (i = 0; i < uLen; ++i) {
00260 curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
00261 }
00262 }
00263
00264 void TextPage::endString() {
00265
00266
00267 if (nest > 0) {
00268 --nest;
00269 return;
00270 }
00271
00272 addString(curStr);
00273 curStr = NULL;
00274 }
00275
00276 void TextPage::addString(TextString *str) {
00277 TextString *p1, *p2;
00278
00279
00280
00281 if (str->len == 0) {
00282 delete str;
00283 return;
00284 }
00285
00286
00287 if (rawOrder) {
00288 p1 = xyCur1;
00289 p2 = NULL;
00290 } else if ((!xyCur1 || xyBefore(xyCur1, str)) &&
00291 (!xyCur2 || xyBefore(str, xyCur2))) {
00292 p1 = xyCur1;
00293 p2 = xyCur2;
00294 } else if (xyCur1 && xyBefore(xyCur1, str)) {
00295 for (p1 = xyCur1, p2 = xyCur2; p2; p1 = p2, p2 = p2->next) {
00296 if (xyBefore(str, p2)) {
00297 break;
00298 }
00299 }
00300 xyCur2 = p2;
00301 } else {
00302 for (p1 = NULL, p2 = xyStrings; p2; p1 = p2, p2 = p2->next) {
00303 if (xyBefore(str, p2)) {
00304 break;
00305 }
00306 }
00307 xyCur2 = p2;
00308 }
00309 xyCur1 = str;
00310 if (p1) {
00311 p1->next = str;
00312 } else {
00313 xyStrings = str;
00314 }
00315 str->next = p2;
00316 }
00317
00318 void TextPage::coalesce() {
00319 TextLine *line, *line0;
00320 TextBlock *yxBlocks, *xyBlocks, *blk, *blk0, *blk1, *blk2;
00321 TextString *str0, *str1, *str2, *str3, *str4;
00322 TextString *str1prev, *str2prev, *str3prev;
00323 TextOutColumnEdge *edges;
00324 UnicodeMap *uMap;
00325 GBool isUnicode;
00326 char buf[8];
00327 int edgesLength, edgesSize;
00328 double x, yMin, yMax;
00329 double space, fit1, fit2, h;
00330 int col1, col2, d;
00331 int i, j;
00332
00333 #if 0 //~ for debugging
00334 for (str1 = xyStrings; str1; str1 = str1->next) {
00335 printf("x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
00336 str1->xMin, str1->xMax, str1->yMin, str1->yMax,
00337 (str1->yMax - str1->yMin));
00338 for (i = 0; i < str1->len; ++i) {
00339 fputc(str1->text[i] & 0xff, stdout);
00340 }
00341 printf("'\n");
00342 }
00343 printf("\n------------------------------------------------------------\n\n");
00344 #endif
00345
00346
00347 edges = NULL;
00348 edgesLength = edgesSize = 0;
00349 if (!rawOrder) {
00350 for (str1prev = NULL, str1 = xyStrings;
00351 str1;
00352 str1prev = str1, str1 = str1->next) {
00353 if (str1->marked) {
00354 continue;
00355 }
00356 h = str1->yMax - str1->yMin;
00357 if (str1prev && (str1->xMin - str1prev->xMax) / h < textOutColSpace) {
00358 continue;
00359 }
00360 x = str1->xMin;
00361 yMin = str1->yMin;
00362 yMax = str1->yMax;
00363 for (str2prev = str1, str2 = str1->next;
00364 str2;
00365 str2prev = str2, str2 = str2->next) {
00366 h = str2->yMax - str2->yMin;
00367 if (!str2->marked &&
00368 (str2->xMin - str2prev->xMax) / h > textOutColSpace &&
00369 fabs(str2->xMin - x) < 0.5 &&
00370 str2->yMin - yMax < 0.3 * h &&
00371 yMin - str2->yMax < 0.3 * h) {
00372 break;
00373 }
00374 }
00375 if (str2) {
00376 if (str2->yMin < yMin) {
00377 yMin = str2->yMin;
00378 }
00379 if (str2->yMax > yMax) {
00380 yMax = str2->yMax;
00381 }
00382 str2->marked = gTrue;
00383 for (str3prev = str1, str3 = str1->next;
00384 str3;
00385 str3prev = str3, str3 = str3->next) {
00386 h = str3->yMax - str3->yMin;
00387 if (!str3->marked &&
00388 (str3->xMin - str3prev->xMax) / h > textOutColSpace &&
00389 fabs(str3->xMin - x) < 0.5 &&
00390 str3->yMin - yMax < 0.3 * h &&
00391 yMin - str3->yMax < 0.3 * h) {
00392 break;
00393 }
00394 }
00395 if (str3) {
00396 if (str3->yMin < yMin) {
00397 yMin = str3->yMin;
00398 }
00399 if (str3->yMax > yMax) {
00400 yMax = str3->yMax;
00401 }
00402 str3->marked = gTrue;
00403 do {
00404 for (str2prev = str1, str2 = str1->next;
00405 str2;
00406 str2prev = str2, str2 = str2->next) {
00407 h = str2->yMax - str2->yMin;
00408 if (!str2->marked &&
00409 (str2->xMin - str2prev->xMax) / h > textOutColSpace &&
00410 fabs(str2->xMin - x) < 0.5 &&
00411 str2->yMin - yMax < 0.3 * h &&
00412 yMin - str2->yMax < 0.3 * h) {
00413 if (str2->yMin < yMin) {
00414 yMin = str2->yMin;
00415 }
00416 if (str2->yMax > yMax) {
00417 yMax = str2->yMax;
00418 }
00419 str2->marked = gTrue;
00420 break;
00421 }
00422 }
00423 } while (str2);
00424 if (edgesLength == edgesSize) {
00425 edgesSize = edgesSize ? 2 * edgesSize : 16;
00426 edges = (TextOutColumnEdge *)
00427 grealloc(edges, edgesSize * sizeof(TextOutColumnEdge));
00428 }
00429 edges[edgesLength].x = x;
00430 edges[edgesLength].y0 = yMin;
00431 edges[edgesLength].y1 = yMax;
00432 ++edgesLength;
00433 } else {
00434 str2->marked = gFalse;
00435 }
00436 }
00437 str1->marked = gTrue;
00438 }
00439 }
00440
00441 #if 0 //~ for debugging
00442 printf("column edges:\n");
00443 for (i = 0; i < edgesLength; ++i) {
00444 printf("%d: x=%.2f y0=%.2f y1=%.2f\n",
00445 i, edges[i].x, edges[i].y0, edges[i].y1);
00446 }
00447 printf("\n------------------------------------------------------------\n\n");
00448 #endif
00449
00450
00451 yxBlocks = NULL;
00452 blk1 = blk2 = NULL;
00453 while (xyStrings) {
00454
00455
00456 str0 = xyStrings;
00457 xyStrings = xyStrings->next;
00458 str0->next = NULL;
00459 blk = new TextBlock();
00460 blk->strings = str0;
00461 blk->xMin = str0->xMin;
00462 blk->xMax = str0->xMax;
00463 blk->yMin = str0->yMin;
00464 blk->yMax = str0->yMax;
00465 while (xyStrings) {
00466 str1 = NULL;
00467 str2 = xyStrings;
00468 fit1 = coalesceFit(str0, str2);
00469 if (!rawOrder) {
00470
00471 space = str0->yMax - str0->yMin;
00472 for (str3 = xyStrings, str4 = xyStrings->next;
00473 str4 && str4->xMin - str0->xMax <= space;
00474 str3 = str4, str4 = str4->next) {
00475 fit2 = coalesceFit(str0, str4);
00476 if (fit2 < fit1) {
00477 str1 = str3;
00478 str2 = str4;
00479 fit1 = fit2;
00480 }
00481 }
00482 }
00483 if (fit1 > 1) {
00484
00485 break;
00486 }
00487
00488
00489 if (fit1 > 0.2) {
00490 for (i = 0; i < edgesLength; ++i) {
00491 if (str0->xMax < edges[i].x + 0.5 && edges[i].x - 0.5 < str2->xMin &&
00492 str0->yMin < edges[i].y1 && str0->yMax > edges[i].y0 &&
00493 str2->yMin < edges[i].y1 && str2->yMax > edges[i].y0) {
00494 break;
00495 }
00496 }
00497 if (i < edgesLength) {
00498 break;
00499 }
00500 }
00501
00502 if (str1) {
00503 str1->next = str2->next;
00504 } else {
00505 xyStrings = str2->next;
00506 }
00507 str0->next = str2;
00508 str2->next = NULL;
00509 if (str2->xMax > blk->xMax) {
00510 blk->xMax = str2->xMax;
00511 }
00512 if (str2->yMin < blk->yMin) {
00513 blk->yMin = str2->yMin;
00514 }
00515 if (str2->yMax > blk->yMax) {
00516 blk->yMax = str2->yMax;
00517 }
00518 str0 = str2;
00519 }
00520
00521
00522 if (!rawOrder) {
00523
00524 for (blk1 = NULL, blk2 = yxBlocks;
00525 blk2 && !yxBefore(blk, blk2);
00526 blk1 = blk2, blk2 = blk2->next) ;
00527 }
00528 blk->next = blk2;
00529 if (blk1) {
00530 blk1->next = blk;
00531 } else {
00532 yxBlocks = blk;
00533 }
00534 blk1 = blk;
00535 }
00536
00537 gfree(edges);
00538
00539
00540 xyStrings = NULL;
00541
00542
00543 uMap = globalParams->getTextEncoding();
00544 isUnicode = uMap ? uMap->isUnicode() : gFalse;
00545 for (blk = yxBlocks; blk; blk = blk->next) {
00546 blk->len = 0;
00547 for (str1 = blk->strings; str1; str1 = str1->next) {
00548 blk->len += str1->len;
00549 if (str1->next && str1->next->xMin - str1->xMax >
00550 textOutSpace * (str1->yMax - str1->yMin)) {
00551 str1->spaceAfter = gTrue;
00552 ++blk->len;
00553 } else {
00554 str1->spaceAfter = gFalse;
00555 }
00556 }
00557 blk->text = (Unicode *)gmalloc(blk->len * sizeof(Unicode));
00558 blk->xRight = (double *)gmalloc(blk->len * sizeof(double));
00559 blk->col = (int *)gmalloc(blk->len * sizeof(int));
00560 i = 0;
00561 for (str1 = blk->strings; str1; str1 = str1->next) {
00562 for (j = 0; j < str1->len; ++j) {
00563 blk->text[i] = str1->text[j];
00564 blk->xRight[i] = str1->xRight[j];
00565 ++i;
00566 }
00567 if (str1->spaceAfter) {
00568 blk->text[i] = (Unicode)0x0020;
00569 blk->xRight[i] = str1->next->xMin;
00570 ++i;
00571 }
00572 }
00573 blk->convertedLen = 0;
00574 for (j = 0; j < blk->len; ++j) {
00575 blk->col[j] = blk->convertedLen;
00576 if (isUnicode) {
00577 ++blk->convertedLen;
00578 } else if (uMap) {
00579 blk->convertedLen += uMap->mapUnicode(blk->text[j], buf, sizeof(buf));
00580 }
00581 }
00582 }
00583 if (uMap) {
00584 uMap->decRefCnt();
00585 }
00586
00587 #if 0 //~ for debugging
00588 for (blk = yxBlocks; blk; blk = blk->next) {
00589 printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n",
00590 blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len);
00591 TextString *str;
00592 for (str = blk->strings; str; str = str->next) {
00593 printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f'",
00594 str->xMin, str->xMax, str->yMin, str->yMax,
00595 (str->yMax - str->yMin));
00596 for (i = 0; i < str->len; ++i) {
00597 fputc(str->text[i] & 0xff, stdout);
00598 }
00599 if (str->spaceAfter) {
00600 fputc(' ', stdout);
00601 }
00602 printf("'\n");
00603 }
00604 }
00605 printf("\n------------------------------------------------------------\n\n");
00606 #endif
00607
00608
00609 lines = NULL;
00610 line0 = NULL;
00611 while (yxBlocks) {
00612 blk0 = yxBlocks;
00613 yxBlocks = yxBlocks->next;
00614 blk0->next = NULL;
00615 line = new TextLine();
00616 line->blocks = blk0;
00617 line->yMin = blk0->yMin;
00618 line->yMax = blk0->yMax;
00619 while (yxBlocks) {
00620
00621
00622 h = blk0->yMax - blk0->yMin;
00623 if (yxBlocks->len == blk0->len &&
00624 !memcmp(yxBlocks->text, blk0->text,
00625 yxBlocks->len * sizeof(Unicode)) &&
00626 fabs(yxBlocks->yMin - blk0->yMin) / h < 0.2 &&
00627 fabs(yxBlocks->yMax - blk0->yMax) / h < 0.2 &&
00628 fabs(yxBlocks->xMin - blk0->xMin) / h < 0.2 &&
00629 fabs(yxBlocks->xMax - blk0->xMax) / h < 0.2) {
00630 blk1 = yxBlocks;
00631 yxBlocks = yxBlocks->next;
00632 delete blk1;
00633 continue;
00634 }
00635
00636 if (rawOrder && yxBlocks->yMax < blk0->yMin) {
00637 break;
00638 }
00639 if (yxBlocks->yMin > 0.2*blk0->yMin + 0.8*blk0->yMax ||
00640 yxBlocks->xMin < blk0->xMax) {
00641 break;
00642 }
00643 blk1 = yxBlocks;
00644 yxBlocks = yxBlocks->next;
00645 blk0->next = blk1;
00646 blk1->next = NULL;
00647 if (blk1->yMin < line->yMin) {
00648 line->yMin = blk1->yMin;
00649 }
00650 if (blk1->yMax > line->yMax) {
00651 line->yMax = blk1->yMax;
00652 }
00653 blk0 = blk1;
00654 }
00655 if (line0) {
00656 line0->next = line;
00657 } else {
00658 lines = line;
00659 }
00660 line->next = NULL;
00661 line0 = line;
00662 }
00663
00664
00665
00666 xyBlocks = NULL;
00667 for (line = lines; line; line = line->next) {
00668 for (blk = line->blocks; blk; blk = blk->next) {
00669 for (blk1 = NULL, blk2 = xyBlocks;
00670 blk2 && !xyBefore(blk, blk2);
00671 blk1 = blk2, blk2 = blk2->xyNext) ;
00672 blk->xyNext = blk2;
00673 if (blk1) {
00674 blk1->xyNext = blk;
00675 } else {
00676 xyBlocks = blk;
00677 }
00678 }
00679 }
00680
00681 #if 0 //~ for debugging
00682 for (blk = xyBlocks; blk; blk = blk->xyNext) {
00683 printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n",
00684 blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len);
00685 TextString *str;
00686 for (str = blk->strings; str; str = str->next) {
00687 printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
00688 str->xMin, str->xMax, str->yMin, str->yMax,
00689 (str->yMax - str->yMin));
00690 for (i = 0; i < str->len; ++i) {
00691 fputc(str->text[i] & 0xff, stdout);
00692 }
00693 printf("'\n");
00694 }
00695 }
00696 printf("\n------------------------------------------------------------\n\n");
00697 #endif
00698
00699
00700 for (blk1 = xyBlocks; blk1; blk1 = blk1->xyNext) {
00701 col1 = 0;
00702 for (blk2 = xyBlocks; blk2 != blk1; blk2 = blk2->xyNext) {
00703 if (blk1->xMin >= blk2->xMax) {
00704 d = (int)((blk1->xMin - blk2->xMax) /
00705 (0.4 * (blk1->yMax - blk1->yMin)));
00706 if (d > 4) {
00707 d = 4;
00708 }
00709 col2 = blk2->col[0] + blk2->convertedLen + d;
00710 if (col2 > col1) {
00711 col1 = col2;
00712 }
00713 } else if (blk1->xMin > blk2->xMin) {
00714 for (i = 0; i < blk2->len && blk1->xMin >= blk2->xRight[i]; ++i) ;
00715 col2 = blk2->col[i];
00716 if (col2 > col1) {
00717 col1 = col2;
00718 }
00719 }
00720 }
00721 for (j = 0; j < blk1->len; ++j) {
00722 blk1->col[j] += col1;
00723 }
00724 }
00725
00726 #if 0 //~ for debugging
00727 for (line = lines; line; line = line->next) {
00728 printf("[line]\n");
00729 for (blk = line->blocks; blk; blk = blk->next) {
00730 printf("[block: col=%d, len=%d]\n", blk->col[0], blk->len);
00731 TextString *str;
00732 for (str = blk->strings; str; str = str->next) {
00733 printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
00734 str->xMin, str->xMax, str->yMin, str->yMax,
00735 (str->yMax - str->yMin));
00736 for (i = 0; i < str->len; ++i) {
00737 fputc(str->text[i] & 0xff, stdout);
00738 }
00739 if (str->spaceAfter) {
00740 printf(" [space]\n");
00741 }
00742 printf("'\n");
00743 }
00744 }
00745 }
00746 printf("\n------------------------------------------------------------\n\n");
00747 #endif
00748 }
00749
00750
00751 GBool TextPage::findText(Unicode *s, int len,
00752 GBool top, GBool bottom,
00753 double *xMin, double *yMin,
00754 double *xMax, double *yMax) {
00755 TextLine *line;
00756 TextBlock *blk;
00757 Unicode *p;
00758 Unicode u1, u2;
00759 int m, i, j;
00760 double x0, x1, x;
00761
00762
00763 for (line = lines; line; line = line->next) {
00764 for (blk = line->blocks; blk; blk = blk->next) {
00765
00766
00767 if (!top && (blk->yMax < *yMin ||
00768 (blk->yMin < *yMin && blk->xMax <= *xMin))) {
00769 continue;
00770 }
00771
00772
00773 if (!bottom && (blk->yMin > *yMax ||
00774 (blk->yMax > *yMax && blk->xMin >= *xMax))) {
00775 return gFalse;
00776 }
00777
00778
00779 m = blk->len;
00780 for (i = 0, p = blk->text; i <= m - len; ++i, ++p) {
00781
00782 x0 = (i == 0) ? blk->xMin : blk->xRight[i-1];
00783 x1 = blk->xRight[i];
00784 x = 0.5 * (x0 + x1);
00785
00786
00787 if (!top && blk->yMin < *yMin) {
00788 if (x < *xMin) {
00789 continue;
00790 }
00791 }
00792
00793
00794 if (!bottom && blk->yMax > *yMax) {
00795 if (x > *xMax) {
00796 return gFalse;
00797 }
00798 }
00799
00800
00801 for (j = 0; j < len; ++j) {
00802 #if 1 //~ this lowercases Latin A-Z only -- this will eventually be
00803
00804 if (p[j] >= 0x41 && p[j] <= 0x5a) {
00805 u1 = p[j] + 0x20;
00806 } else {
00807 u1 = p[j];
00808 }
00809 if (s[j] >= 0x41 && s[j] <= 0x5a) {
00810 u2 = s[j] + 0x20;
00811 } else {
00812 u2 = s[j];
00813 }
00814 #endif
00815 if (u1 != u2) {
00816 break;
00817 }
00818 }
00819
00820
00821 if (j == len) {
00822 *xMin = x0;
00823 *xMax = blk->xRight[i + len - 1];
00824 *yMin = blk->yMin;
00825 *yMax = blk->yMax;
00826 return gTrue;
00827 }
00828 }
00829 }
00830 }
00831
00832 return gFalse;
00833 }
00834
00835 GString *TextPage::getText(double xMin, double yMin,
00836 double xMax, double yMax) {
00837 GString *s;
00838 UnicodeMap *uMap;
00839 GBool isUnicode;
00840 char space[8], eol[16], buf[8];
00841 int spaceLen, eolLen, len;
00842 TextLine *line;
00843 TextBlock *blk;
00844 double x0, x1, y;
00845 int firstCol, col, i;
00846 GBool multiLine;
00847
00848 s = new GString();
00849
00850
00851 if (!(uMap = globalParams->getTextEncoding())) {
00852 return s;
00853 }
00854 isUnicode = uMap->isUnicode();
00855 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
00856 eolLen = 0;
00857 switch (globalParams->getTextEOL()) {
00858 case eolUnix:
00859 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
00860 break;
00861 case eolDOS:
00862 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
00863 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
00864 break;
00865 case eolMac:
00866 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
00867 break;
00868 }
00869
00870
00871 multiLine = gFalse;
00872 firstCol = -1;
00873 for (line = lines; line; line = line->next) {
00874 if (line->yMin > yMax) {
00875 break;
00876 }
00877 if (line->yMax < yMin) {
00878 continue;
00879 }
00880
00881 for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ;
00882 if (!blk || blk->xMin > xMax) {
00883 continue;
00884 }
00885
00886 y = 0.5 * (blk->yMin + blk->yMax);
00887 if (y < yMin || y > yMax) {
00888 continue;
00889 }
00890
00891 if (firstCol >= 0) {
00892 multiLine = gTrue;
00893 }
00894
00895 i = 0;
00896 while (1) {
00897 x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
00898 x1 = blk->xRight[i];
00899 if (0.5 * (x0 + x1) > xMin) {
00900 break;
00901 }
00902 ++i;
00903 }
00904 col = blk->col[i];
00905
00906 if (firstCol < 0 || col < firstCol) {
00907 firstCol = col;
00908 }
00909 }
00910
00911
00912 for (line = lines; line; line = line->next) {
00913 if (line->yMin > yMax) {
00914 break;
00915 }
00916 if (line->yMax < yMin) {
00917 continue;
00918 }
00919
00920 for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ;
00921 if (!blk || blk->xMin > xMax) {
00922 continue;
00923 }
00924
00925 y = 0.5 * (blk->yMin + blk->yMax);
00926 if (y < yMin || y > yMax) {
00927 continue;
00928 }
00929
00930 i = 0;
00931 while (1) {
00932 x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
00933 x1 = blk->xRight[i];
00934 if (0.5 * (x0 + x1) > xMin) {
00935 break;
00936 }
00937 ++i;
00938 }
00939
00940 col = firstCol;
00941
00942 do {
00943
00944
00945 for (; col < blk->col[i]; ++col) {
00946 s->append(space, spaceLen);
00947 }
00948
00949
00950 for (; i < blk->len; ++i) {
00951
00952 x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
00953 x1 = blk->xRight[i];
00954 if (0.5 * (x0 + x1) > xMax) {
00955 break;
00956 }
00957
00958 len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf));
00959 s->append(buf, len);
00960 col += isUnicode ? 1 : len;
00961 }
00962 if (i < blk->len) {
00963 break;
00964 }
00965
00966
00967 blk = blk->next;
00968 i = 0;
00969
00970 } while (blk && blk->xMin < xMax);
00971
00972 if (multiLine) {
00973 s->append(eol, eolLen);
00974 }
00975 }
00976
00977 uMap->decRefCnt();
00978
00979 return s;
00980 }
00981
00982 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) {
00983 UnicodeMap *uMap;
00984 char space[8], eol[16], eop[8], buf[8];
00985 int spaceLen, eolLen, eopLen, len;
00986 TextLine *line;
00987 TextBlock *blk;
00988 int col, d, i;
00989
00990
00991 if (!(uMap = globalParams->getTextEncoding())) {
00992 return;
00993 }
00994 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
00995 eolLen = 0;
00996 switch (globalParams->getTextEOL()) {
00997 case eolUnix:
00998 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
00999 break;
01000 case eolDOS:
01001 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
01002 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
01003 break;
01004 case eolMac:
01005 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
01006 break;
01007 }
01008 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
01009
01010
01011 for (line = lines; line; line = line->next) {
01012 col = 0;
01013 for (blk = line->blocks; blk; blk = blk->next) {
01014
01015
01016 if (rawOrder && col == 0) {
01017 col = blk->col[0];
01018 } else {
01019 for (; col < blk->col[0]; ++col) {
01020 (*outputFunc)(outputStream, space, spaceLen);
01021 }
01022 }
01023
01024
01025 for (i = 0; i < blk->len; ++i) {
01026 len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf));
01027 (*outputFunc)(outputStream, buf, len);
01028 }
01029 col += blk->convertedLen;
01030 }
01031
01032
01033 (*outputFunc)(outputStream, eol, eolLen);
01034
01035
01036 if (line->next) {
01037 d = (int)((line->next->yMin - line->yMax) /
01038 (line->blocks->strings->yMax - lines->blocks->strings->yMin)
01039 + 0.5);
01040
01041
01042 if (rawOrder && d > 2) {
01043 d = 2;
01044 } else if (!rawOrder && d > 5) {
01045 d = 5;
01046 }
01047 for (; d > 0; --d) {
01048 (*outputFunc)(outputStream, eol, eolLen);
01049 }
01050 }
01051 }
01052
01053
01054 (*outputFunc)(outputStream, eol, eolLen);
01055 (*outputFunc)(outputStream, eop, eopLen);
01056 (*outputFunc)(outputStream, eol, eolLen);
01057
01058 uMap->decRefCnt();
01059 }
01060
01061
01062
01063 GBool TextPage::xyBefore(TextString *str1, TextString *str2) {
01064 return str1->xMin < str2->xMin ||
01065 (str1->xMin == str2->xMin && str1->yMin < str2->yMin);
01066 }
01067
01068
01069
01070 GBool TextPage::xyBefore(TextBlock *blk1, TextBlock *blk2) {
01071 return blk1->xMin < blk2->xMin ||
01072 (blk1->xMin == blk2->xMin && blk1->yMin < blk2->yMin);
01073 }
01074
01075
01076
01077 GBool TextPage::yxBefore(TextBlock *blk1, TextBlock *blk2) {
01078 double h1, h2, overlap;
01079
01080 h1 = blk1->yMax - blk1->yMin;
01081 h2 = blk2->yMax - blk2->yMin;
01082 overlap = ((blk1->yMax < blk2->yMax ? blk1->yMax : blk2->yMax) -
01083 (blk1->yMin > blk2->yMin ? blk1->yMin : blk2->yMin)) /
01084 (h1 < h2 ? h1 : h2);
01085 if (overlap > 0.6) {
01086 return blk1->xMin < blk2->xMin;
01087 }
01088 return blk1->yMin < blk2->yMin;
01089 }
01090
01091 double TextPage::coalesceFit(TextString *str1, TextString *str2) {
01092 double h1, h2, w1, w2, r, overlap, spacing;
01093
01094 h1 = str1->yMax - str1->yMin;
01095 h2 = str2->yMax - str2->yMin;
01096 w1 = str1->xMax - str1->xMin;
01097 w2 = str2->xMax - str2->xMin;
01098 r = h1 / h2;
01099 if (r < (1.0 / 3.0) || r > 3) {
01100 return 10;
01101 }
01102 overlap = ((str1->yMax < str2->yMax ? str1->yMax : str2->yMax) -
01103 (str1->yMin > str2->yMin ? str1->yMin : str2->yMin)) /
01104 (h1 < h2 ? h1 : h2);
01105 if (overlap < 0.5) {
01106 return 10;
01107 }
01108 spacing = (str2->xMin - str1->xMax) / (h1 > h2 ? h1 : h2);
01109 if (spacing < -0.5) {
01110 return 10;
01111 }
01112
01113
01114 if ((str2->xMin - str1->xMax) / (w1 < w2 ? w1 : w2) < -0.7) {
01115 return 10;
01116 }
01117 return spacing;
01118 }
01119
01120 void TextPage::clear() {
01121 TextLine *p1, *p2;
01122 TextString *s1, *s2;
01123
01124 if (curStr) {
01125 delete curStr;
01126 curStr = NULL;
01127 }
01128 if (lines) {
01129 for (p1 = lines; p1; p1 = p2) {
01130 p2 = p1->next;
01131 delete p1;
01132 }
01133 } else if (xyStrings) {
01134 for (s1 = xyStrings; s1; s1 = s2) {
01135 s2 = s1->next;
01136 delete s1;
01137 }
01138 }
01139 xyStrings = NULL;
01140 xyCur1 = xyCur2 = NULL;
01141 lines = NULL;
01142 nest = 0;
01143 nTinyChars = 0;
01144 }
01145
01146
01147
01148
01149
// TextOutputFunc implementation that writes len bytes of text to the
// stdio stream passed as the opaque stream pointer.  Calls with a NULL
// stream, NULL text or non-positive length write nothing (a negative
// length would otherwise turn into a huge size_t in fwrite).
static void outputToFile(void *stream, char *text, int len) {
  if (!stream || !text || len <= 0) {
    return;
  }
  fwrite(text, 1, (size_t)len, (FILE *)stream);
}
01153
01154 TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) {
01155 text = NULL;
01156 rawOrder = rawOrderA;
01157 ok = gTrue;
01158
01159
01160 needClose = gFalse;
01161 if (fileName) {
01162 if (!strcmp(fileName, "-")) {
01163 outputStream = stdout;
01164 } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
01165 needClose = gTrue;
01166 } else {
01167 error(-1, "Couldn't open text file '%s'", fileName);
01168 ok = gFalse;
01169 return;
01170 }
01171 outputFunc = &outputToFile;
01172 } else {
01173 outputStream = NULL;
01174 }
01175
01176
01177 text = new TextPage(rawOrder);
01178 }
01179
01180 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
01181 GBool rawOrderA) {
01182 outputFunc = func;
01183 outputStream = stream;
01184 needClose = gFalse;
01185 rawOrder = rawOrderA;
01186 text = new TextPage(rawOrder);
01187 ok = gTrue;
01188 }
01189
01190 TextOutputDev::~TextOutputDev() {
01191 if (needClose) {
01192 #ifdef MACOS
01193 ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
01194 #endif
01195 fclose((FILE *)outputStream);
01196 }
01197 if (text) {
01198 delete text;
01199 }
01200 }
01201
01202 void TextOutputDev::startPage(int , GfxState *) {
01203 text->clear();
01204 }
01205
01206 void TextOutputDev::endPage() {
01207 text->coalesce();
01208 if (outputStream) {
01209 text->dump(outputStream, outputFunc);
01210 }
01211 }
01212
01213 void TextOutputDev::updateFont(GfxState *state) {
01214 text->updateFont(state);
01215 }
01216
01217 void TextOutputDev::beginString(GfxState *state, GString *) {
01218 text->beginString(state, state->getCurX(), state->getCurY());
01219 }
01220
01221 void TextOutputDev::endString(GfxState *) {
01222 text->endString();
01223 }
01224
01225 void TextOutputDev::drawChar(GfxState *state, double x, double y,
01226 double dx, double dy,
01227 double , double ,
01228 CharCode , Unicode *u, int uLen) {
01229 text->addChar(state, x, y, dx, dy, u, uLen);
01230 }
01231
01232 GBool TextOutputDev::findText(Unicode *s, int len,
01233 GBool top, GBool bottom,
01234 double *xMin, double *yMin,
01235 double *xMax, double *yMax) {
01236 return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax);
01237 }
01238
01239 GString *TextOutputDev::getText(double xMin, double yMin,
01240 double xMax, double yMax) {
01241 return text->getText(xMin, yMin, xMax, yMax);
01242 }
01243