00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "regexp.h"
00023
00024 #include "lexer.h"
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028
00029 using namespace KJS;
00030
00031 RegExp::RegExp(const UString &p, int f)
00032 : pat(p), flgs(f), m_notEmpty(false)
00033 {
00034
00035
00036
00037 UString intern;
00038 if (p.find('\\') >= 0) {
00039 bool escape = false;
00040 for (int i = 0; i < p.size(); ++i) {
00041 UChar c = p[i];
00042 if (escape) {
00043 escape = false;
00044
00045 if (c == 'u' && i + 4 < p.size()) {
00046 int c0 = p[i+1].unicode();
00047 int c1 = p[i+2].unicode();
00048 int c2 = p[i+3].unicode();
00049 int c3 = p[i+4].unicode();
00050 if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) &&
00051 Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) {
00052 c = Lexer::convertUnicode(c0, c1, c2, c3);
00053 intern += UString(&c, 1);
00054 i += 4;
00055 continue;
00056 }
00057 }
00058 intern += UString('\\');
00059 intern += UString(&c, 1);
00060 } else {
00061 if (c == '\\')
00062 escape = true;
00063 else
00064 intern += UString(&c, 1);
00065 }
00066 }
00067 } else {
00068 intern = p;
00069 }
00070
00071 #ifdef HAVE_PCREPOSIX
00072 int pcreflags = 0;
00073 const char *perrormsg;
00074 int errorOffset;
00075
00076 if (flgs & IgnoreCase)
00077 pcreflags |= PCRE_CASELESS;
00078
00079 if (flgs & Multiline)
00080 pcreflags |= PCRE_MULTILINE;
00081
00082 pcregex = pcre_compile(intern.ascii(), pcreflags,
00083 &perrormsg, &errorOffset, NULL);
00084 #ifndef NDEBUG
00085 if (!pcregex)
00086 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
00087 #endif
00088
00089 #ifdef PCRE_INFO_CAPTURECOUNT
00090
00091 int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
00092 if (rc != 0)
00093 #endif
00094 nrSubPatterns = 0;
00095
00096 #else
00097
00098 nrSubPatterns = 0;
00099 int regflags = 0;
00100 #ifdef REG_EXTENDED
00101 regflags |= REG_EXTENDED;
00102 #endif
00103 #ifdef REG_ICASE
00104 if ( f & IgnoreCase )
00105 regflags |= REG_ICASE;
00106 #endif
00107
00108
00109
00110
00111
00112
00113 if (regcomp(&preg, intern.ascii(), regflags) != 0) {
00114
00115 regcomp(&preg, "", regflags);
00116 }
00117 #endif
00118 }
00119
00120 RegExp::~RegExp()
00121 {
00122 #ifdef HAVE_PCREPOSIX
00123 if (pcregex)
00124 pcre_free(pcregex);
00125 #else
00126
00127 regfree(&preg);
00128 #endif
00129 }
00130
00131 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
00132 {
00133 if (i < 0)
00134 i = 0;
00135 if (ovector)
00136 *ovector = 0L;
00137 int dummyPos;
00138 if (!pos)
00139 pos = &dummyPos;
00140 *pos = -1;
00141 if (i > s.size() || s.isNull())
00142 return UString::null;
00143
00144 #ifdef HAVE_PCREPOSIX
00145 CString buffer(s.cstring());
00146 int bufferSize = buffer.size();
00147 int ovecsize = (nrSubPatterns+1)*3;
00148 if (ovector) *ovector = new int[ovecsize];
00149 if (!pcregex)
00150 return UString::null;
00151
00152 if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i,
00153 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0,
00154 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00155 {
00156
00157 if ((flgs & Global) && m_notEmpty && ovector)
00158 {
00159
00160
00161
00162 #ifndef NDEBUG
00163 fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
00164 #endif
00165 m_notEmpty = 0;
00166 if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i+1, 0,
00167 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00168 return UString::null;
00169 }
00170 else
00171 return UString::null;
00172 }
00173
00174
00175
00176 if (!ovector)
00177 return UString::null;
00178 #else
00179 const uint maxMatch = 10;
00180 regmatch_t rmatch[maxMatch];
00181
00182 char *str = strdup(s.ascii());
00183 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
00184 free(str);
00185 return UString::null;
00186 }
00187 free(str);
00188
00189 if (!ovector) {
00190 *pos = rmatch[0].rm_so + i;
00191 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
00192 }
00193
00194
00195 nrSubPatterns = 0;
00196 for(uint j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
00197 nrSubPatterns++;
00198 int ovecsize = (nrSubPatterns+1)*3;
00199 *ovector = new int[ovecsize];
00200 for (uint j = 0; j < nrSubPatterns + 1; j++) {
00201 if (j>maxMatch)
00202 break;
00203 (*ovector)[2*j] = rmatch[j].rm_so + i;
00204 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
00205 }
00206 #endif
00207
00208 *pos = (*ovector)[0];
00209 #ifdef HAVE_PCREPOSIX
00210 if ( *pos == (*ovector)[1] && (flgs & Global) )
00211 {
00212
00213 m_notEmpty=true;
00214 }
00215 #endif
00216 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
00217 }
00218
#if 0 // unused
// Reports whether the pattern matches `s`, searching from the beginning of
// the string.  The second parameter is unnamed and ignored.
bool RegExp::test(const UString &s, int)
{
#ifdef HAVE_PCREPOSIX
  CString subject(s.cstring());
  int offsets[300];

  // A null string never matches.
  if (s.isNull())
    return false;

  // Only an explicit "no match" counts as failure here.
  const int rc = pcre_exec(pcregex, NULL, subject.c_str(), subject.size(), 0,
                           0, offsets, 300);
  return rc != PCRE_ERROR_NOMATCH;

#else

  // regexec() is given a private copy of the subject; no match offsets are
  // requested.
  char *copy = strdup(s.ascii());
  const bool matched = (regexec(&preg, copy, 0, 0, 0) == 0);
  free(copy);

  return matched;
#endif
}
#endif