00001 #include "transform.h"
00002
00003 #include <kdebug.h>
00004
00005
00006 namespace PDFImport
00007 {
00008
00009
00010 static const uint TABLE_SIZE = 0x0100;
00011 static const uint NB_TABLES = 5;
00012 static const uint OFFSET[NB_TABLES] = {
00013 0x00, 0x01, 0x20, 0x21, 0x22
00014 };
00015 static const char TABLE[NB_TABLES][TABLE_SIZE] = {
00016 #define U Unknown
00017 #define P Punctuation
00018 #define S SymbolChar
00019 #define D Digit
00020 #define L Letter
00021
00022 #define H Hyphen
00023 #define B Bullet
00024 #define I SuperScript
00025 #define Y SpecialSymbol
00026 #define G Ligature
00027
00028 #define A Accent
00029 #define E Punctuation_Accent
00030 #define C Letter_CanHaveAccent
00031
00032 #define X LatexSpecial
00033
00034 {
00035 U,U,U,U,U,U,U,U,U,P,P,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00036
00037 P,P,E,S,S,S,S,P,P,P,S,S,E,H,E,S, D,D,D,D,D,D,D,D,D,D,P,P,S,S,S,P,
00038
00039 S,C,L,C,L,C,L,C,C,C,C,C,C,L,C,C, L,L,C,C,C,C,L,C,L,C,C,S,X,S,A,A,
00040
00041 E,C,L,C,L,C,L,C,C,C,C,C,C,L,C,C, L,L,C,C,C,C,L,C,L,C,C,S,S,S,A,U,
00042
00043 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00044
00045 U,U,S,S,S,S,S,S,A,S,U,P,U,H,S,A, A,S,I,I,E,U,U,S,A,I,U,P,S,S,S,P,
00046
00047 L,L,L,L,L,L,C,L,L,L,L,L,L,L,L,L, L,L,L,L,L,L,L,S,C,L,L,L,L,L,L,L,
00048
00049 L,L,L,L,L,L,C,L,L,L,L,L,L,L,L,L, L,L,L,L,L,L,L,S,C,L,L,L,L,L,L,L
00050
00051 },
00052
00053 {
00054 L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,L, L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,
00055 L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,L, L,C,L,L,L,L,L,L,L,L,L,L,L,L,L,L,
00056 L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,L, L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,
00057 L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,L, L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,U,
00058 U,U,U,U,U,U,U,U,U,U,U,U,U,U,S,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00059 L,L,U,U,U,U,U,L,L,S,U,U,U,U,L,L, L,U,U,U,U,L,L,U,U,U,U,U,U,U,U,U,
00060 U,U,S,P,U,U,U,U,U,U,U,U,U,L,L,L, L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,L,
00061 L,L,L,L,L,L,L,L,L,L,L,L,L,L,U,U, L,U,U,U,L,L,U,U,L,L,L,L,L,L,L,L
00062 },
00063
00064 {
00065 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,P,P,P,P,P,P,P,P,
00066 S,S,B,U,U,U,P,U,U,U,U,U,U,U,U,U, S,U,P,P,U,U,U,U,U,Y,Y,U,U,U,U,U,
00067 U,U,U,U,Y,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00068 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00069 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00070 U,U,U,U,U,U,U,U,U,U,U,U,S,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00071 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00072 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U
00073 },
00074
00075 {
00076 U,U,U,G,U,U,U,U,U,G,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00077 U,U,S,U,U,U,U,U,U,U,Y,Y,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00078 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00079 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00080 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, Y,Y,Y,Y,Y,U,U,U,U,U,Y,Y,U,U,U,U,
00081 U,U,U,U,U,U,U,U,U,U,U,U,U,U,Y,U, U,U,U,U,U,Y,U,U,U,U,U,U,U,U,U,U,
00082 U,U,U,U,U,U,U,U,U,U,U,U,U,Y,Y,Y, Y,Y,Y,Y,Y,U,U,U,U,U,U,U,U,U,U,U,
00083 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U
00084 },
00085
00086 {
00087 Y,U,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,U,Y, U,Y,Y,U,U,Y,U,Y,U,Y,Y,U,U,Y,Y,U,
00088 Y,U,U,U,U,U,U,Y,Y,Y,Y,Y,U,U,U,U, U,U,U,U,Y,Y,Y,U,U,U,U,U,Y,U,U,U,
00089 U,Y,U,U,U,Y,U,Y,Y,Y,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,Y,Y,U,U,U,U,U,
00090 Y,Y,Y,U,Y,Y,U,U,U,U,U,U,U,U,Y,Y, Y,Y,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00091 U,U,Y,Y,Y,Y,Y,Y,Y,Y,U,U,U,U,U,U, U,U,U,U,U,Y,U,Y,U,U,U,U,U,U,U,U,
00092 U,U,U,U,Y,Y,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00093 U,U,U,U,U,Y,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,
00094 U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U, U,U,U,U,U,U,U,U,U,U,U,U,U,U,U,U
00095 }
00096
00097 #undef U
00098 #undef P
00099 #undef S
00100 #undef D
00101 #undef L
00102
00103 #undef H
00104 #undef B
00105 #undef I
00106 #undef Y
00107
00108 #undef A
00109 #undef E
00110 #undef C
00111 #undef X
00112 };
00113
00114 CharType type(Unicode u)
00115 {
00116 uint offset = u / TABLE_SIZE;
00117 uint index = u % TABLE_SIZE;
00118 for (uint i=0; i<NB_TABLES; i++) {
00119 if ( offset==OFFSET[i] ) return (CharType)TABLE[i][index];
00120 if ( offset<OFFSET[i] ) break;
00121 }
00122 if ( u>=0xFB00 && u<=0xFB06 ) return Ligature;
00123 return Unknown;
00124 }
00125
00126
00127 static const Unicode LIGATURE_DATA[][MaxLigatureLength+1] = {
00128 { 0xFB00, 0x0066, 0x0066, 0x0000 },
00129 { 0xFB01, 0x0066, 0x0069, 0x0000 },
00130 { 0xFB02, 0x0066, 0x006C, 0x0000 },
00131 { 0xFB03, 0x0066, 0x0066, 0x0069 },
00132 { 0xFB04, 0x0066, 0x0066, 0x006c },
00133
00134
00135 { 0x0000, 0x0000, 0x0000, 0x0000 }
00136 };
00137
00138 uint checkLigature(Unicode u, Unicode res[MaxLigatureLength])
00139 {
00140 if ( type(u)==Unknown ) kdDebug(30516) << "unknown char " << u << endl;
00141 if ( type(u)!=Ligature ) {
00142 res[0] = u;
00143 return 1;
00144 }
00145
00146 uint i = 0;
00147 while ( LIGATURE_DATA[i][0]!=0 ) {
00148 if ( LIGATURE_DATA[i][0]==u ) {
00149 uint k = 0;
00150 for (; k<MaxLigatureLength; k++) {
00151 if ( LIGATURE_DATA[i][k+1]==0 ) break;
00152 res[k] = LIGATURE_DATA[i][k+1];
00153 }
00154 return k;
00155 }
00156 i++;
00157 }
00158 kdDebug(30516) << "undefined ligature !! " << u <<endl;
00159 res[0] = u;
00160 return 1;
00161 }
00162
00163
00164
00165 static const Unicode SUPER_DATA[][2] = {
00166 { 0x00B9, 0x0031 },
00167 { 0x00B2, 0x0032 },
00168 { 0x00B3, 0x0033 },
00169 { 0x0000, 0x0000 }
00170 };
00171
00172 static const Unicode BULLET_DATA[][2] = {
00173 { 0x2022, 0x00B7 },
00174 { 0x0000, 0x0000 }
00175 };
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189 CharType checkSpecial(Unicode u, Unicode &res)
00190 {
00191 CharType t = type(u);
00192
00193
00194 switch (t) {
00195 case Unknown:
00196 kdDebug(30516) << "unknown special " << QString(QChar(u))
00197 << " (" << u << ")" << endl;
00198 break;
00199 case SuperScript: {
00200 uint i = 0;
00201 for (;;) {
00202 if ( SUPER_DATA[i][0]==0 ) {
00203 kdDebug(30516) << "undefined superscript !!" << endl;
00204 break;
00205 }
00206 if ( SUPER_DATA[i][0]==u ) {
00207 res = SUPER_DATA[i][1];
00208 break;
00209 }
00210 i++;
00211 }
00212 break;
00213 }
00214 case Bullet:{
00215 uint i = 0;
00216 for (;;) {
00217 if ( BULLET_DATA[i][0]==0 ) {
00218 kdDebug(30516) << "undefined bullet !!" << endl;
00219 break;
00220 }
00221 if ( BULLET_DATA[i][0]==u ) {
00222 res = BULLET_DATA[i][1];
00223 break;
00224 }
00225 i++;
00226 }
00227 break;
00228 }
00229
00230 default:
00231 break;
00232 }
00233
00234 return t;
00235 }
00236
00237
00238
// Kinds of accent that can be merged with a base letter.
// NoAccent doubles as the terminator value of the accent/combination tables.
enum AccentType {
    NoAccent,
    Grave,
    Acute,
    Circumflex,
    Tilde,
    Diaeresis,
    Degree,
    Macron,
    LowLine,
    Dot,
    Comma,
    DQuote,
    Cedilla
};
00244 struct AccentData {
00245 Unicode u;
00246 AccentType type;
00247 };
00248 static const AccentData ACCENT_DATA[] = {
00249 { 0x0060, Grave },
00250 { 0x00B4, Acute },
00251 { 0x005E, Circumflex },
00252 { 0x007E, Tilde },
00253 { 0x00A8, Diaeresis },
00254 { 0x00B0, Degree },
00255 { 0x00AF, Macron },
00256 { 0x005F, LowLine },
00257 { 0x002E, Dot },
00258
00259 { 0x002C, Comma },
00260 { 0x0022, DQuote },
00261 { 0x00B8, Cedilla },
00262 { 0x0000, NoAccent }
00263 };
00264
00265 struct CombiData {
00266 AccentType type;
00267 Unicode upper, lower;
00268 };
00269 static const CombiData A_DATA[] = {
00270 { Grave, 0x00C0, 0x00E0 },
00271 { Acute, 0x00C1, 0x00E1 },
00272 { Circumflex, 0x00C2, 0x00E2 },
00273 { Tilde, 0x00C3, 0x00E3 },
00274 { Diaeresis, 0x00C4, 0x00E4 },
00275 { Degree, 0x00C5, 0x00E5 },
00276 { Macron, 0x0100, 0x0101 },
00277 { Dot, 0x0226, 0x0227 },
00278 { NoAccent, 0x0000, 0x0000 }
00279 };
00280 static const CombiData C_DATA[] = {
00281 { Acute, 0x0106, 0x0107 },
00282 { Circumflex, 0x0108, 0x0109 },
00283 { Dot, 0x010A, 0x010B },
00284 { Cedilla, 0x00C7, 0x00E7 },
00285 { NoAccent, 0x0000, 0x0000 }
00286 };
00287 static const CombiData E_DATA[] = {
00288 { Grave, 0x00C8, 0x00E8 },
00289 { Acute, 0x00C9, 0x00E9 },
00290 { Circumflex, 0x00CA, 0x00EA },
00291 { Diaeresis, 0x00CB, 0x00EB },
00292 { Macron, 0x0112, 0x0113 },
00293 { Dot, 0x0116, 0x0117 },
00294 { Cedilla, 0x0228, 0x0229 },
00295 { NoAccent, 0x0000, 0x0000 }
00296 };
00297 static const CombiData G_DATA[] = {
00298 { Acute, 0x01F4, 0x01F5 },
00299 { Circumflex, 0x011C, 0x011D },
00300 { Comma, 0x0122, 0x0000 },
00301 { NoAccent, 0x0000, 0x0000 }
00302 };
00303 static const CombiData H_DATA[] = {
00304 { Circumflex, 0x0124, 0x0125 },
00305 { NoAccent, 0x0000, 0x0000 }
00306 };
00307 static const CombiData I_DATA[] = {
00308 { Grave, 0x00CC, 0x0000 },
00309 { Acute, 0x00CD, 0x0000 },
00310 { Circumflex, 0x00CE, 0x0000 },
00311 { Tilde, 0x0128, 0x0000 },
00312 { Diaeresis, 0x00CF, 0x0000 },
00313 { Macron, 0x012A, 0x0000 },
00314 { Dot, 0x0130, 0x0000 },
00315 { Cedilla, 0x012E, 0x0000 },
00316 { NoAccent, 0x0000, 0x0000 }
00317 };
00318 static const CombiData J_DATA[] = {
00319 { Circumflex, 0x0134, 0x0135 },
00320 { NoAccent, 0x0000, 0x0000 }
00321 };
00322 static const CombiData K_DATA[] = {
00323 { Comma, 0x0136, 0x0137 },
00324 { NoAccent, 0x0000, 0x0000 }
00325 };
00326 static const CombiData L_DATA[] = {
00327 { Acute, 0x0139, 0x013A },
00328 { Comma, 0x013B, 0x013C },
00329 { NoAccent, 0x0000, 0x0000 }
00330 };
00331 static const CombiData N_DATA[] = {
00332 { Acute, 0x0143, 0x0144 },
00333 { Comma, 0x0145, 0x0146 },
00334 { Cedilla, 0x00D1, 0x00F1 },
00335 { NoAccent, 0x0000, 0x0000 }
00336 };
00337 static const CombiData O_DATA[] = {
00338 { Grave, 0x00D2, 0x00F2 },
00339 { Acute, 0x00D3, 0x00F3 },
00340 { Circumflex, 0x00D4, 0x00F4 },
00341 { Tilde, 0x00D5, 0x00F5 },
00342 { Diaeresis, 0x00D6, 0x00F6 },
00343 { Macron, 0x014C, 0x014D },
00344 { DQuote, 0x0150, 0x0151 },
00345 { Dot, 0x022E, 0x022F },
00346 { NoAccent, 0x0000, 0x0000 },
00347 };
00348 static const CombiData R_DATA[] = {
00349 { Acute, 0x0154, 0x0155 },
00350 { Comma, 0x0156, 0x0157 },
00351 { NoAccent, 0x0000, 0x0000 }
00352 };
00353 static const CombiData S_DATA[] = {
00354 { Acute, 0x015A, 0x015B },
00355 { Circumflex, 0x015C, 0x015D },
00356 { Comma, 0x0218, 0x0219 },
00357 { Cedilla, 0x015E, 0x015F },
00358 { NoAccent, 0x0000, 0x0000 }
00359 };
00360 static const CombiData T_DATA[] = {
00361 { Comma, 0x021A, 0x021B },
00362 { Cedilla, 0x0162, 0x0163 },
00363 { NoAccent, 0x0000, 0x0000 }
00364 };
00365 static const CombiData U_DATA[] = {
00366 { Grave, 0x00D9, 0x00F9 },
00367 { Acute, 0x00DA, 0x00FA },
00368 { Circumflex, 0x00DB, 0x00FB },
00369 { Tilde, 0x0168, 0x0169 },
00370 { Diaeresis, 0x00DC, 0x00FC },
00371 { Macron, 0x016A, 0x016B },
00372 { Dot, 0x016E, 0x016F },
00373 { DQuote, 0x0170, 0x0171 },
00374 { NoAccent, 0x0000, 0x0000 }
00375 };
00376 static const CombiData W_DATA[] = {
00377 { Circumflex, 0x0174, 0x0175 },
00378 { NoAccent, 0x0000, 0x0000 }
00379 };
00380 static const CombiData Y_DATA[] = {
00381 { Acute, 0x00DD, 0x00FD },
00382 { Circumflex, 0x0176, 0x0177 },
00383 { Diaeresis, 0x0178, 0x00FF },
00384 { Macron, 0x0232, 0x0233 },
00385 { NoAccent, 0x0000, 0x0000 }
00386 };
00387 static const CombiData Z_DATA[] = {
00388 { Acute, 0x0179, 0x017A },
00389 { Dot, 0x017B, 0x017C },
00390 { NoAccent, 0x0000, 0x0000 }
00391 };
00392
00393 static const CombiData *LETTER_DATA[26] = {
00394 A_DATA, 0, C_DATA, 0, E_DATA, 0, G_DATA, H_DATA, I_DATA, J_DATA,
00395 K_DATA, L_DATA, 0, N_DATA, O_DATA, 0, 0, R_DATA, S_DATA, T_DATA,
00396 U_DATA, 0, W_DATA, 0, Y_DATA, Z_DATA
00397 };
00398
00399 static const CombiData AE_DATA[] = {
00400 { Acute, 0x01FC, 0x01FD },
00401 { Macron, 0x01E2, 0x01E3 },
00402 { NoAccent, 0x0000, 0x0000 }
00403 };
00404 static const CombiData NULL_DATA[] = {
00405 { Acute, 0x01FE, 0x01FF },
00406 { NoAccent, 0x0000, 0x0000 }
00407 };
00408 static const CombiData I_LOWER_DATA[] = {
00409 { Grave, 0x0000, 0x00EC },
00410 { Acute, 0x0000, 0x00ED },
00411 { Circumflex, 0x0000, 0x00EE },
00412 { Tilde, 0x0000, 0x0129 },
00413 { Diaeresis, 0x0000, 0x00EF },
00414 { Macron, 0x0000, 0x012B },
00415 { Dot, 0x0000, 0x0045 },
00416 { Cedilla, 0x0000, 0x012F },
00417 { NoAccent, 0x0000, 0x0000 }
00418 };
00419
00420 struct SpecialCombiData {
00421 Unicode upper, lower;
00422 const CombiData *data;
00423 };
00424 static const SpecialCombiData SPECIAL_COMBI_DATA[] = {
00425 { 0x00C6, 0x00E6, AE_DATA },
00426 { 0x00D8, 0x00E8, NULL_DATA },
00427 { 0x0000, 0x0131, I_LOWER_DATA },
00428 { 0x0000, 0x0000, 0 }
00429 };
00430
00431 Unicode checkCombi(Unicode letter, Unicode accent)
00432 {
00433
00434 if ( !isAccent( type(accent) ) ) return 0;
00435 if ( type(letter)!=Letter_CanHaveAccent ) return 0;
00436
00437
00438 uint i = 0;
00439 for (;;) {
00440 if ( ACCENT_DATA[i].u==0 ) return 0;
00441 if ( ACCENT_DATA[i].u==accent ) break;
00442 i++;
00443 }
00444
00445
00446 const CombiData *data = 0;
00447 bool upper = true;
00448 if ( letter>='A' && letter<='Z' )
00449 data = LETTER_DATA[letter-'A'];
00450 else if ( letter>='a' && letter<='z' ) {
00451 data = LETTER_DATA[letter-'a'];
00452 upper = false;
00453 } else {
00454 uint k = 0;
00455 for (;;) {
00456 if ( SPECIAL_COMBI_DATA[k].data==0 ) return 0;
00457 if ( letter==SPECIAL_COMBI_DATA[k].upper ) {
00458 data = SPECIAL_COMBI_DATA[k].data;
00459 break;
00460 } else if ( letter==SPECIAL_COMBI_DATA[k].lower ) {
00461 data = SPECIAL_COMBI_DATA[k].data;
00462 upper = false;
00463 break;
00464 }
00465 k++;
00466 }
00467 }
00468 if ( data==0 ) return 0;
00469
00470
00471 uint l = 0;
00472 while ( data[l].type!=NoAccent ) {
00473 if ( data[l].type==ACCENT_DATA[i].type )
00474 return (upper ? data[l].upper : data[l].lower);
00475 l++;
00476 }
00477 return 0;
00478 }
00479
00480 }