wvtclstring.cc

00001 /*
00002  * Worldvisions Weaver Software:
00003  *   Copyright (C) 1997-2002 Net Integration Technologies, Inc.
00004  */
00005 #include "wvbackslash.h"
00006 #include "wvbuf.h"
00007 #include "wvstream.h"
00008 #include "wvstring.h"
00009 #include "wvstringmask.h"
00010 #include "wvtclstring.h"
00011 
00012 const WvStringMask WVTCL_NASTY_SPACES(WVTCL_NASTY_SPACES_STR); // mask built from WVTCL_NASTY_SPACES_STR (defined outside this file)
00013 const WvStringMask WVTCL_NASTY_NEWLINES(WVTCL_NASTY_NEWLINES_STR); // mask built from WVTCL_NASTY_NEWLINES_STR (defined outside this file)
00014 const WvStringMask WVTCL_SPLITCHARS(WVTCL_SPLITCHARS_STR); // mask built from WVTCL_SPLITCHARS_STR; presumably the default word separators -- confirm in wvtclstring.h
00015 
00016 static size_t wvtcl_escape(char *dst, const char *s, size_t s_len,
00017                            const WvStringMask &nasties, bool *verbatim = NULL)
00018 {
00019     if (verbatim) *verbatim = false;
00020 
00021     // NULL strings remain such
00022     if (s == NULL)
00023         return 0;
00024     // empty strings are just {}
00025     if (s_len == 0)
00026     {
00027         if (dst)
00028         {
00029             dst[0] = '{';
00030             dst[1] = '}';
00031         }
00032         return 2;
00033     }
00034     
00035     bool backslashify = false, inescape = false;
00036     int len = 0, unprintables = 0, bracecount = 0;
00037     const char *cptr, *cptr_end = s + s_len;
00038     
00039     // figure out which method we need to use: backslashify or embrace.
00040     // also count the number of unprintable characters we'll need to 
00041     // backslashify, if it turns out that's necessary.
00042     for (cptr = s; cptr != cptr_end; cptr++)
00043     {
00044         // Assume we do nothing
00045         if (dst) dst[len] = *cptr;
00046         ++len;
00047 
00048         if (!inescape && *cptr == '{')
00049             bracecount++;
00050         else if (!inescape && *cptr == '}')
00051             bracecount--;
00052         if (bracecount < 0)
00053             backslashify = true;
00054 
00055         bool doit = false;
00056         switch (*cptr)
00057         {
00058         case WVTCL_ALWAYS_NASTY_CASE:
00059             doit = true;
00060             break;
00061         default:
00062             if (nasties[*cptr])
00063                 doit = true;
00064         }
00065         if (doit)
00066             unprintables++;
00067 
00068         if (*cptr == '\\')
00069             inescape = !inescape;
00070         else
00071             inescape = false;
00072     }
00073     
00074     // if the braces aren't balanced, backslashify
00075     if (bracecount != 0 || inescape)
00076         backslashify = true;
00077 
00078     if (!backslashify && !unprintables)
00079     {
00080         if (verbatim) *verbatim = true;
00081         return len; // no work needed!
00082     }
00083     
00084     if (backslashify)
00085     {
00086         if (dst)
00087         {
00088             len = 0;
00089             for (cptr = s; cptr != cptr_end; ++cptr)
00090             {
00091                 bool doit = false;
00092                 switch (*cptr)
00093                 {
00094                 case WVTCL_ALWAYS_NASTY_CASE:
00095                     doit = true;
00096                     break;
00097                 default:
00098                     if (nasties[*cptr])
00099                         doit = true;
00100                 }
00101                 if (doit)
00102                     dst[len++] = '\\';
00103 
00104                 dst[len++] = *cptr;
00105             }
00106             return len;
00107         }
00108         else return len+unprintables;
00109     }
00110     else
00111     {
00112         // the embrace method: just take the string and put braces around it
00113         if (dst)
00114         {
00115             len = 0;
00116             dst[len++] = '{';
00117             for (cptr = s; cptr != cptr_end; ++cptr)
00118                 dst[len++] = *cptr;
00119             dst[len++] = '}';
00120             return len;
00121         }
00122         else return len+2;
00123     }
00124 }
00125 
00126 
00127 WvString wvtcl_escape(WvStringParm s, const WvStringMask &nasties)
00128 {
00129     size_t s_len = s.len();
00130 
00131     bool verbatim;
00132     size_t len = wvtcl_escape(NULL, s, s_len, nasties, &verbatim);
00133     if (verbatim) return s;
00134 
00135     WvString result;
00136     result.setsize(len);
00137     char *e = result.edit();
00138     e += wvtcl_escape(e, s, s_len, nasties);
00139     *e = '\0';
00140     return result;
00141 }
00142 
00143 
// Undo tcl-style escaping of the s_len characters at 's' and return the
// number of characters in the unescaped form.
//
//   dst      - output buffer, or NULL to only compute the resulting
//              length.  No terminating NUL is written.
//   s        - input characters; may be NULL.
//   s_len    - number of input characters.
//   verbatim - if non-NULL, set to true when 's' is NULL (nothing to do)
//              and to false otherwise.
//
// Behaviour:
//   - "{...}" : the outer braces are dropped, the contents copied as-is;
//   - "\"...\"": the outer quotes are dropped and backslashes removed;
//   - otherwise backslashes are removed: "\x" becomes "x", "\\" becomes
//     a single backslash.  A trailing unpaired backslash is dropped.
// A wrapper (braces or quotes) is only recognised when the input is at
// least two characters long, so that the opening and closing characters
// are distinct positions.
static size_t wvtcl_unescape(char *dst, const char *s, size_t s_len,
        bool *verbatim = NULL)
{
    // NULL strings remain themselves
    if (!s)
    {
        if (verbatim) *verbatim = true;
        return 0;
    }

    if (verbatim) *verbatim = false;

    // Without this guard a lone '"' would count as both the opening and
    // the closing quote, pushing 'start' past 'end' below, and an empty
    // input would read s[s_len-1] before the start of the buffer.
    const bool can_be_wrapped = (s_len >= 2);

    // deal with embraced strings by simply removing the braces
    if (can_be_wrapped && s[0] == '{' && s[s_len-1] == '}')
    {
        if (dst) memcpy(dst, &s[1], s_len-2);
        return s_len - 2;
    }

    // deal with quoted strings by ignoring the quotes _and_ unbackslashifying
    const char *start = s, *end = s + s_len;
    if (can_be_wrapped && s[0] == '"' && s[s_len-1] == '"')
    {
        ++start;
        --end;
    }

    // otherwise (and for the inside of quotes), unbackslashify it
    size_t len = 0;
    bool inescape = false;
    for (; start != end; ++start)
    {
        if (*start == '\\' && !inescape)
        {
            // first backslash of a pair: swallow it, keep the next char
            inescape = true;
            continue;
        }

        // either an ordinary character or the escaped one after a backslash
        inescape = false;
        if (dst) dst[len] = *start;
        len++;
    }
    return len;
}
00201 
00202 
00203 WvString wvtcl_unescape(WvStringParm s)
00204 {
00205     size_t s_len = s.len();
00206 
00207     bool verbatim;
00208     size_t len = wvtcl_unescape(NULL, s, s_len, &verbatim);
00209     if (verbatim) return s;
00210 
00211     WvString result;
00212     result.setsize(len+1);
00213     char *e = result.edit();
00214     e += wvtcl_unescape(e, s, s_len);
00215     *e = '\0';
00216     return result;
00217 }
00218 
00219 
00220 WvString wvtcl_encode(WvList<WvString> &l, const WvStringMask &nasties,
00221                       const WvStringMask &splitchars)
00222 {
00223     int size = 0;
00224 
00225     WvList<WvString>::Iter i(l);
00226     int count = 0;
00227     for (i.rewind(); i.next(); )
00228     {
00229         size += wvtcl_escape(NULL, *i, i->len(), nasties);
00230         ++count;
00231     }
00232     
00233     WvString result;
00234     result.setsize(size+(count-1)+1);
00235 
00236     char *p = result.edit();
00237     int j;
00238     for (i.rewind(), j=0; i.next(); ++j)
00239     {
00240         p += wvtcl_escape(p, *i, i->len(), nasties);
00241         if (j < count - 1)
00242             *p++ = splitchars.first();
00243     }
00244     *p = '\0';
00245     
00246     return result;
00247 }
00248 
00249 const size_t WVTCL_GETWORD_NONE (UINT_MAX);
00250 
00251 static size_t wvtcl_getword(char *dst, const char *s, size_t s_len,
00252                             const WvStringMask &splitchars,
00253                             bool do_unescape, size_t *end = NULL)
00254 {
00255     //printf("      used=%d\n", origsize);
00256     if (!s_len) return WVTCL_GETWORD_NONE;
00257 
00258     bool inescape = false, inquote = false, incontinuation = false;
00259     int bracecount = 0;
00260     const char *origend = s + s_len;
00261     const char *sptr, *eptr;
00262 
00263     // skip leading separators
00264     for (sptr = s; sptr != origend; sptr++)
00265     {
00266         if (!splitchars[*sptr])
00267             break;
00268     }
00269 
00270     if (sptr == origend) // nothing left
00271         return WVTCL_GETWORD_NONE;
00272 
00273     // detect initial quote
00274     if (*sptr == '"')
00275     {
00276         inquote = true;
00277         eptr = sptr+1;
00278     }
00279     else
00280         eptr = sptr;
00281     
00282     // loop over string until something satisfactory is found
00283     for (; eptr != origend; eptr++)
00284     {
00285         char ch = *eptr;
00286         
00287         incontinuation = false;
00288         
00289         if (inescape)
00290         {
00291             if (ch == '\n')
00292             {
00293                 // technically we've finished the line-continuation
00294                 // sequence, but we require at least one more character
00295                 // in order to prove that there's a next line somewhere
00296                 // in the buffer.  Otherwise we might stop parsing before
00297                 // we're "really" done if we're given input line-by-line.
00298                 // 
00299                 // A better way to do this would be for getword() to *never*
00300                 // return a string unless it contains a separator character;
00301                 // then we wouldn't need this weird special case.  But it
00302                 // don't work like that; we'll return the last word in the
00303                 // buffer even if it *doesn't* end in a separator character.
00304                 incontinuation = true;
00305             }
00306             inescape = false;
00307         }
00308         else if (ch == '\\')
00309         {
00310             inescape = true;
00311             // now we need a character to complete the escape
00312         }
00313         else // not an escape sequence
00314         {
00315             // detect end of a quoted/unquoted string
00316             if (bracecount == 0)
00317             {
00318                 if (inquote)
00319                 {
00320                     if (ch == '"')
00321                     {
00322                         eptr++;
00323                         break;
00324                     }
00325                 }
00326                 else if (splitchars[ch])
00327                     break;
00328             }
00329             
00330             // match braces
00331             if (!inquote)
00332             {
00333                 if (ch == '{')
00334                     bracecount++;
00335                 else if (bracecount > 0 && ch == '}')
00336                     bracecount--;
00337             }
00338         }
00339     }
00340     
00341     if (bracecount || sptr==eptr || inquote || inescape || incontinuation)
00342         // not there yet...
00343         return WVTCL_GETWORD_NONE;
00344 
00345     //printf("len=%d, unget=%d\n", eptr - sptr, origend - eptr);
00346     if (end) *end = eptr - s;
00347 
00348     if (do_unescape)
00349         return wvtcl_unescape(dst, sptr, eptr-sptr);
00350     else
00351     {
00352         if (dst) memcpy(dst, sptr, eptr-sptr);
00353         return eptr - sptr;
00354     }
00355 }
00356 
00357 
00358 WvString wvtcl_getword(WvBuf &buf, const WvStringMask &splitchars,
00359                        bool do_unescape)
00360 {
00361     int origsize = buf.used();
00362     const char *origptr = (const char *)buf.get(origsize);
00363 
00364     size_t end;
00365     size_t len = wvtcl_getword(NULL, origptr, origsize,
00366             splitchars, do_unescape, &end);
00367     if (len == WVTCL_GETWORD_NONE)
00368     {
00369         buf.unget(origsize);
00370         return WvString::null;
00371     }
00372 
00373     WvString result;
00374     result.setsize(len+1);
00375     char *e = result.edit();
00376     e += wvtcl_getword(e, origptr, origsize, splitchars, do_unescape);
00377     *e = '\0';
00378 
00379     buf.unget(origsize - end);
00380 
00381     return result;
00382 }
00383 
00384 
00385 void wvtcl_decode(WvList<WvString> &l, WvStringParm _s,
00386                   const WvStringMask &splitchars, bool do_unescape)
00387 {
00388     const char *s = _s;
00389     size_t s_len = _s.len();
00390     for (;;)
00391     {
00392         size_t end;
00393         size_t len = wvtcl_getword(NULL, s, s_len,
00394                 splitchars, do_unescape, &end);
00395         if (len == WVTCL_GETWORD_NONE)
00396             break;
00397 
00398         WvString *word = new WvString();
00399         word->setsize(len+1);
00400 
00401         char *e = word->edit();
00402         e += wvtcl_getword(e, s, s_len, splitchars, do_unescape);
00403         *e = '\0';
00404         l.append(word, true);
00405 
00406         s += end;
00407         s_len -= end;
00408     }
00409 }

Generated on Mon Feb 5 10:54:30 2007 for WvStreams by  doxygen 1.5.1