filters

pdftotext.cc

00001 //========================================================================
00002 //
00003 // pdftotext.cc
00004 //
00005 // Copyright 1997-2002 Glyph & Cog, LLC
00006 //
00007 //========================================================================
00008 
00009 #include <aconf.h>
00010 #include <stdio.h>
00011 #include <stdlib.h>
00012 #include <stddef.h>
00013 #include <string.h>
00014 #include "parseargs.h"
00015 #include "GString.h"
00016 #include "gmem.h"
00017 #include "GlobalParams.h"
00018 #include "Object.h"
00019 #include "Stream.h"
00020 #include "Array.h"
00021 #include "Dict.h"
00022 #include "XRef.h"
00023 #include "Catalog.h"
00024 #include "Page.h"
00025 #include "PDFDoc.h"
00026 #include "TextOutputDev.h"
00027 #include "CharTypes.h"
00028 #include "UnicodeMap.h"
00029 #include "Error.h"
00030 #include "config.h"
00031 
00032 static void printInfoString(FILE *f, Dict *infoDict, char *key,
00033                 char *text1, char *text2, UnicodeMap *uMap);
00034 static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt);
00035 
00036 static int firstPage = 1;
00037 static int lastPage = 0;
00038 static GBool rawOrder = gFalse;
00039 static GBool htmlMeta = gFalse;
00040 static char textEncName[128] = "";
00041 static char textEOL[16] = "";
00042 static char ownerPassword[33] = "";
00043 static char userPassword[33] = "";
00044 static GBool quiet = gFalse;
00045 static char cfgFileName[256] = "";
00046 static GBool printVersion = gFalse;
00047 static GBool printHelp = gFalse;
00048 
00049 static ArgDesc argDesc[] = {
00050   {"-f",      argInt,      &firstPage,     0,
00051    "first page to convert"},
00052   {"-l",      argInt,      &lastPage,      0,
00053    "last page to convert"},
00054   {"-raw",    argFlag,     &rawOrder,      0,
00055    "keep strings in content stream order"},
00056   {"-htmlmeta", argFlag,   &htmlMeta,      0,
00057    "generate a simple HTML file, including the meta information"},
00058   {"-enc",    argString,   textEncName,    sizeof(textEncName),
00059    "output text encoding name"},
00060   {"-eol",    argString,   textEOL,        sizeof(textEOL),
00061    "output end-of-line convention (unix, dos, or mac)"},
00062   {"-opw",    argString,   ownerPassword,  sizeof(ownerPassword),
00063    "owner password (for encrypted files)"},
00064   {"-upw",    argString,   userPassword,   sizeof(userPassword),
00065    "user password (for encrypted files)"},
00066   {"-q",      argFlag,     &quiet,         0,
00067    "don't print any messages or errors"},
00068   {"-cfg",        argString,      cfgFileName,    sizeof(cfgFileName),
00069    "configuration file to use in place of .xpdfrc"},
00070   {"-v",      argFlag,     &printVersion,  0,
00071    "print copyright and version info"},
00072   {"-h",      argFlag,     &printHelp,     0,
00073    "print usage information"},
00074   {"-help",   argFlag,     &printHelp,     0,
00075    "print usage information"},
00076   {"--help",  argFlag,     &printHelp,     0,
00077    "print usage information"},
00078   {"-?",      argFlag,     &printHelp,     0,
00079    "print usage information"},
00080   {NULL}
00081 };
00082 
00083 int main(int argc, char *argv[]) {
00084   PDFDoc *doc;
00085   GString *fileName;
00086   GString *textFileName;
00087   GString *ownerPW, *userPW;
00088   TextOutputDev *textOut;
00089   FILE *f;
00090   UnicodeMap *uMap;
00091   Object info;
00092   GBool ok;
00093   char *p;
00094   int exitCode;
00095 
00096   exitCode = 99;
00097 
00098   // parse args
00099   ok = parseArgs(argDesc, &argc, argv);
00100   if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
00101     fprintf(stderr, "pdftotext version %s\n", xpdfVersion);
00102     fprintf(stderr, "%s\n", xpdfCopyright);
00103     if (!printVersion) {
00104       printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
00105     }
00106     goto err0;
00107   }
00108   fileName = new GString(argv[1]);
00109 
00110   // read config file
00111   globalParams = new GlobalParams(cfgFileName);
00112   if (textEncName[0]) {
00113     globalParams->setTextEncoding(textEncName);
00114   }
00115   if (textEOL[0]) {
00116     if (!globalParams->setTextEOL(textEOL)) {
00117       fprintf(stderr, "Bad '-eol' value on command line\n");
00118     }
00119   }
00120   if (quiet) {
00121     globalParams->setErrQuiet(quiet);
00122   }
00123 
00124   // get mapping to output encoding
00125   if (!(uMap = globalParams->getTextEncoding())) {
00126     error(-1, "Couldn't get text encoding");
00127     delete fileName;
00128     goto err1;
00129   }
00130 
00131   // open PDF file
00132   if (ownerPassword[0]) {
00133     ownerPW = new GString(ownerPassword);
00134   } else {
00135     ownerPW = NULL;
00136   }
00137   if (userPassword[0]) {
00138     userPW = new GString(userPassword);
00139   } else {
00140     userPW = NULL;
00141   }
00142   doc = new PDFDoc(fileName, ownerPW, userPW);
00143   if (userPW) {
00144     delete userPW;
00145   }
00146   if (ownerPW) {
00147     delete ownerPW;
00148   }
00149   if (!doc->isOk()) {
00150     exitCode = 1;
00151     goto err2;
00152   }
00153 
00154   // check for copy permission
00155   if (!doc->okToCopy()) {
00156     error(-1, "Copying of text from this document is not allowed.");
00157     exitCode = 3;
00158     goto err2;
00159   }
00160 
00161   // construct text file name
00162   if (argc == 3) {
00163     textFileName = new GString(argv[2]);
00164   } else {
00165     p = fileName->getCString() + fileName->getLength() - 4;
00166     if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
00167       textFileName = new GString(fileName->getCString(),
00168                  fileName->getLength() - 4);
00169     } else {
00170       textFileName = fileName->copy();
00171     }
00172     textFileName->append(htmlMeta ? ".html" : ".txt");
00173   }
00174 
00175   // get page range
00176   if (firstPage < 1) {
00177     firstPage = 1;
00178   }
00179   if (lastPage < 1 || lastPage > doc->getNumPages()) {
00180     lastPage = doc->getNumPages();
00181   }
00182 
00183   // write HTML header
00184   if (htmlMeta) {
00185     if (!textFileName->cmp("-")) {
00186       f = stdout;
00187     } else {
00188       if (!(f = fopen(textFileName->getCString(), "wb"))) {
00189     error(-1, "Couldn't open text file '%s'", textFileName->getCString());
00190     exitCode = 2;
00191     goto err3;
00192       }
00193     }
00194     fputs("<html>\n", f);
00195     fputs("<head>\n", f);
00196     doc->getDocInfo(&info);
00197     if (info.isDict()) {
00198       printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
00199               uMap);
00200       printInfoString(f, info.getDict(), "Subject",
00201               "<meta name=\"Subject\" content=\"", "\">\n", uMap);
00202       printInfoString(f, info.getDict(), "Keywords",
00203               "<meta name=\"Keywords\" content=\"", "\">\n", uMap);
00204       printInfoString(f, info.getDict(), "Author",
00205               "<meta name=\"Author\" content=\"", "\">\n", uMap);
00206       printInfoString(f, info.getDict(), "Creator",
00207               "<meta name=\"Creator\" content=\"", "\">\n", uMap);
00208       printInfoString(f, info.getDict(), "Producer",
00209               "<meta name=\"Producer\" content=\"", "\">\n", uMap);
00210       printInfoDate(f, info.getDict(), "CreationDate",
00211             "<meta name=\"CreationDate\" content=\"\">\n");
00212       printInfoDate(f, info.getDict(), "LastModifiedDate",
00213             "<meta name=\"ModDate\" content=\"\">\n");
00214     }
00215     info.free();
00216     fputs("</head>\n", f);
00217     fputs("<body>\n", f);
00218     fputs("<pre>\n", f);
00219     if (f != stdout) {
00220       fclose(f);
00221     }
00222   }
00223 
00224   // write text file
00225   textOut = new TextOutputDev(textFileName->getCString(), rawOrder, htmlMeta);
00226   if (textOut->isOk()) {
00227     doc->displayPages(textOut, firstPage, lastPage, 72, 0, gFalse);
00228   } else {
00229     delete textOut;
00230     exitCode = 2;
00231     goto err3;
00232   }
00233   delete textOut;
00234 
00235   // write end of HTML file
00236   if (htmlMeta) {
00237     if (!textFileName->cmp("-")) {
00238       f = stdout;
00239     } else {
00240       if (!(f = fopen(textFileName->getCString(), "ab"))) {
00241     error(-1, "Couldn't open text file '%s'", textFileName->getCString());
00242     exitCode = 2;
00243     goto err3;
00244       }
00245     }
00246     fputs("</pre>\n", f);
00247     fputs("</body>\n", f);
00248     fputs("</html>\n", f);
00249     if (f != stdout) {
00250       fclose(f);
00251     }
00252   }
00253 
00254   exitCode = 0;
00255 
00256   // clean up
00257  err3:
00258   delete textFileName;
00259  err2:
00260   delete doc;
00261   uMap->decRefCnt();
00262  err1:
00263   delete globalParams;
00264  err0:
00265 
00266   // check for memory leaks
00267   Object::memCheck(stderr);
00268   gMemReport(stderr);
00269 
00270   return exitCode;
00271 }
00272 
00273 static void printInfoString(FILE *f, Dict *infoDict, char *key,
00274                 char *text1, char *text2, UnicodeMap *uMap) {
00275   Object obj;
00276   GString *s1;
00277   GBool isUnicode;
00278   Unicode u;
00279   char buf[8];
00280   int i, n;
00281 
00282   if (infoDict->lookup(key, &obj)->isString()) {
00283     fputs(text1, f);
00284     s1 = obj.getString();
00285     if ((s1->getChar(0) & 0xff) == 0xfe &&
00286     (s1->getChar(1) & 0xff) == 0xff) {
00287       isUnicode = gTrue;
00288       i = 2;
00289     } else {
00290       isUnicode = gFalse;
00291       i = 0;
00292     }
00293     while (i < obj.getString()->getLength()) {
00294       if (isUnicode) {
00295     u = ((s1->getChar(i) & 0xff) << 8) |
00296         (s1->getChar(i+1) & 0xff);
00297     i += 2;
00298       } else {
00299     u = s1->getChar(i) & 0xff;
00300     ++i;
00301       }
00302       n = uMap->mapUnicode(u, buf, sizeof(buf));
00303       fwrite(buf, 1, n, f);
00304     }
00305     fputs(text2, f);
00306   }
00307   obj.free();
00308 }
00309 
00310 static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt) {
00311   Object obj;
00312   char *s;
00313 
00314   if (infoDict->lookup(key, &obj)->isString()) {
00315     s = obj.getString()->getCString();
00316     if (s[0] == 'D' && s[1] == ':') {
00317       s += 2;
00318     }
00319     fprintf(f, fmt, s);
00320   }
00321   obj.free();
00322 }
KDE Home | KDE Accessibility Home | Description of Access Keys