filters

TextOutputDev.h

00001 //========================================================================
00002 //
00003 // TextOutputDev.h
00004 //
00005 // Copyright 1997-2002 Glyph & Cog, LLC
00006 //
00007 //========================================================================
00008 
00009 #ifndef TEXTOUTPUTDEV_H
00010 #define TEXTOUTPUTDEV_H
00011 
00012 #include <aconf.h>
00013 
00014 #ifdef USE_GCC_PRAGMAS
00015 #pragma interface
00016 #endif
00017 
00018 #include <stdio.h>
00019 #include "gtypes.h"
00020 #include "GfxFont.h"
00021 #include "OutputDev.h"
00022 
00023 class GfxState;
00024 class GString;
00025 class TextBlock;
00026 class TextLine;
00027 
00028 #undef TEXTOUT_DO_SYMBOLS
00029 
00030 //------------------------------------------------------------------------
00031 // Output callback type: gets a caller-supplied <stream> handle, a <text> buffer and a length <len>.
00032 typedef void (*TextOutputFunc)(void *stream, char *text, int len);  // NOTE(review): unit of <len> not stated here
00033 
00034 
00035 //------------------------------------------------------------------------
00036 // TextString
00037 //------------------------------------------------------------------------
00038 namespace PDFImport {   // forward declarations only; needed for the friend declarations in this header
00039     class String;       // declared a friend of TextString
00040     class Page;         // declared a friend of TextString and of TextPage
00041 }
00042 
00043 class TextString {        // a run of Unicode characters together with its bounding box
00044 public:
00045 
00046   // Constructor.  NOTE(review): <x0>,<y0> is presumably the string's start position -- confirm.
00047   TextString(GfxState *state, double x0, double y0,
00048          double fontSize);
00049 
00050 
00051   // Destructor.  Virtual, as is addChar(), so the class can be subclassed.
00052   virtual ~TextString();
00053 
00054   // Add a character <u> to the string.  NOTE(review): <dx>,<dy> presumably the advance -- confirm.
00055   virtual void addChar(GfxState *state, double x, double y,
00056                        double dx, double dy, Unicode u);
00057 
00058 protected:
00059   double xMin, xMax;        // bounding box x coordinates
00060   double yMin, yMax;        // bounding box y coordinates
00061   union {                   // anonymous union: both flags occupy the same storage
00062     GBool marked;       // temporary flag used by coalesce()
00063     GBool spaceAfter;       // insert a space after this string?
00064   };
00065   Unicode *text;        // the text
00066   double *xRight;       // right-hand x coord of each char
00067   int len;          // length of text and xRight
00068   int size;         // size of text and xRight arrays
00069   TextString *next;     // link to another TextString (strings are kept in lists)
00070 
00071   friend class TextPage;            // friends are granted access to the protected members above
00072   friend class TextBlock;
00073   friend class PDFImport::String;
00074   friend class PDFImport::Page;
00075 };
00076 
00077 
00078 //------------------------------------------------------------------------
00079 // TextBlock
00080 //------------------------------------------------------------------------
00081 
00082 class TextBlock {         // data holder for one block of strings; every member is public
00083 public:
00084 
00085   TextBlock();
00086   ~TextBlock();
00087 
00088   double xMin, xMax;        // presumably bounding box x coordinates, as in TextString -- confirm
00089   double yMin, yMax;        // presumably bounding box y coordinates, as in TextString -- confirm
00090   TextString *strings;      // list of strings in the block
00091   TextBlock *next;      // next block in line
00092   TextBlock *xyNext;        // next block on xyBlocks list
00093   Unicode *text;        // Unicode text of the block, including
00094                 //   spaces between strings
00095   double *xRight;       // right-hand x coord of each char
00096   int len;          // total number of Unicode characters
00097   int convertedLen;     // total number of converted characters
00098   int *col;         // starting column number for each
00099                 //   Unicode character
00100 };
00101 
00102 //------------------------------------------------------------------------
00103 // TextLine
00104 //------------------------------------------------------------------------
00105 
00106 class TextLine {          // data holder for one line of text; every member is public
00107 public:
00108 
00109   TextLine();
00110   ~TextLine();
00111 
00112   TextBlock *blocks;        // blocks of this line (TextBlock::next is the "next block in line" link)
00113   TextLine *next;           // presumably the next line (TextPage keeps a list of lines) -- confirm
00114   double yMin, yMax;        // presumably the vertical extent of the line -- confirm
00115 };
00116 
00117 //------------------------------------------------------------------------
00118 // TextPage
00119 //------------------------------------------------------------------------
00120 
00121 class TextPage {          // text of one page: a list of strings plus a list of lines
00122 public:
00123 
00124   // Constructor.  <rawOrderA>: keep strings in content stream order (see rawOrder below).
00125   TextPage(GBool rawOrderA);
00126 
00127   // Destructor.
00128   virtual ~TextPage();
00129 
00130   // Update the current font.
00131   void updateFont(GfxState *state);
00132 
00133 
00134   // Begin a new string.
00135   virtual void beginString(GfxState *state, double x0, double y0);
00136 
00137   // Add a character to the current string.  <u> points to <uLen> Unicode values.
00138   void addChar(GfxState *state, double x, double y,
00139            double dx, double dy, Unicode *u, int uLen);
00140 
00141   // End the current string, sorting it into the list of strings.
00142   virtual void endString();
00143 
00144   // Add a string, sorting it into the list of strings.
00145   virtual void addString(TextString *str);
00146 
00147 
00148   // Coalesce strings that look like parts of the same line.
00149   void coalesce();
00150 
00151   // Find a string.  If <top> is true, starts looking at top of page;
00152   // otherwise starts looking at <xMin>,<yMin>.  If <bottom> is true,
00153   // stops looking at bottom of page; otherwise stops looking at
00154   // <xMax>,<yMax>.  If found, sets the text bounding rectangle (the four
00155   // coordinates are therefore in/out) and returns true; otherwise returns false.
00156   GBool findText(Unicode *s, int len,
00157          GBool top, GBool bottom,
00158          double *xMin, double *yMin,
00159          double *xMax, double *yMax);
00160 
00161   // Get the text which is inside the specified rectangle.  NOTE(review): ownership of the result is not stated.
00162   GString *getText(double xMin, double yMin,
00163            double xMax, double yMax);
00164 
00165   // Dump contents of page to a file (presumably via <outputFunc> on <outputStream> -- confirm).
00166   void dump(void *outputStream, TextOutputFunc outputFunc);
00167 
00168   // Clear the page.
00169   virtual void clear();
00170 
00171 private:
00172   // Ordering predicates and a fit measure; NOTE(review): exact semantics are in the implementation file.
00173   GBool xyBefore(TextString *str1, TextString *str2);
00174   GBool xyBefore(TextBlock *blk1, TextBlock *blk2);
00175   GBool yxBefore(TextBlock *blk1, TextBlock *blk2);
00176   double coalesceFit(TextString *str1, TextString *str2);
00177 
00178   GBool rawOrder;       // keep strings in content stream order
00179 
00180   TextString *curStr;       // currently active string
00181   double fontSize;      // current font size
00182 
00183   TextString *xyStrings;    // strings in x-major order (before
00184                 //   they're sorted into lines)
00185   TextString *xyCur1, *xyCur2;  // cursors for xyStrings list
00186   TextLine *lines;      // list of lines
00187 
00188   int nest;         // current nesting level (for Type 3 fonts)
00189 
00190   int nTinyChars;       // number of "tiny" chars seen so far
00191 
00192   friend class PDFImport::Page;     // granted access to the private members above
00193 };
00194 
00195 //------------------------------------------------------------------------
00196 // TextOutputDev
00197 //------------------------------------------------------------------------
00198 
00199 class TextOutputDev: public OutputDev {   // OutputDev that keeps the current page's text in a TextPage
00200 public:
00201 
00202   // Open a text output file.  If <fileName> is NULL, no file is
00203   // written (this is useful, e.g., for searching text).  If
00204   // <rawOrder> is true, the text is kept in content stream order.
00205   TextOutputDev(char *fileName, GBool rawOrderA, GBool append);  // NOTE(review): <append> is undocumented here
00206 
00207   // Create a TextOutputDev which will write to a generic stream.  If
00208   // <rawOrder> is true, the text is kept in content stream order.
00209   TextOutputDev(TextOutputFunc func, void *stream, GBool rawOrderA);
00210 
00211   // Destructor.
00212   virtual ~TextOutputDev();
00213 
00214   // Check if file was successfully created (returns the <ok> flag).
00215   virtual GBool isOk() { return ok; }
00216 
00217   //---- get info about output device
00218 
00219   // Does this device use upside-down coordinates?
00220   // (Upside-down means (0,0) is the top left corner of the page.)
00221   virtual GBool upsideDown() { return gTrue; }
00222 
00223   // Does this device use drawChar() or drawString()?
00224   virtual GBool useDrawChar() { return gTrue; }
00225 
00226   // Does this device use beginType3Char/endType3Char?  Otherwise,
00227   // text in Type 3 fonts will be drawn with drawChar/drawString.
00228   virtual GBool interpretType3Chars() { return gFalse; }
00229 
00230   // Does this device need non-text content?
00231   virtual GBool needNonText() { return gFalse; }
00232 
00233   //----- initialization and control
00234 
00235   // Start a page.
00236   virtual void startPage(int pageNum, GfxState *state);
00237 
00238   // End a page.
00239   virtual void endPage();
00240 
00241   //----- update text state
00242   virtual void updateFont(GfxState *state);
00243 
00244   //----- text drawing
00245   virtual void beginString(GfxState *state, GString *s);
00246   virtual void endString(GfxState *state);
00247   virtual void drawChar(GfxState *state, double x, double y,
00248             double dx, double dy,
00249             double originX, double originY,
00250             CharCode c, Unicode *u, int uLen);
00251 
00252   //----- path painting (no members are declared in this section)
00253 
00254   //----- special access
00255 
00256   // Find a string.  If <top> is true, starts looking at top of page;
00257   // otherwise starts looking at <xMin>,<yMin>.  If <bottom> is true,
00258   // stops looking at bottom of page; otherwise stops looking at
00259   // <xMax>,<yMax>.  If found, sets the text bounding rectangle (the four
00260   // coordinates are therefore in/out) and returns true; otherwise returns false.
00261   GBool findText(Unicode *s, int len,
00262          GBool top, GBool bottom,
00263          double *xMin, double *yMin,
00264          double *xMax, double *yMax);
00265 
00266   // Get the text which is inside the specified rectangle.  NOTE(review): ownership of the result is not stated.
00267   GString *getText(double xMin, double yMin,
00268            double xMax, double yMax);
00269 
00270 private:
00271 
00272   TextOutputFunc outputFunc;    // output function
00273   void *outputStream;       // output stream
00274   GBool needClose;      // need to close the output file?
00275                 //   (only if outputStream is a FILE*)
00276   TextPage *text;       // text for the current page
00277   GBool rawOrder;       // keep text in content stream order
00278   GBool ok;         // set up ok?
00279 
00280 };
00281 
00282 #endif
KDE Home | KDE Accessibility Home | Description of Access Keys