Package logilab-common-0 :: Package 39 :: Package 0 :: Module textutils
[frames] | no frames]

Source Code for Module logilab-common-0.39.0.textutils

"""Some text manipulation utility functions.

:author:    Logilab
:copyright: 2003-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
:license: General Public License version 2 - http://www.gnu.org/licenses

:group text formatting: normalize_text, normalize_paragraph, pretty_match,\
unquote, colorize_ansi
:group text manipulation: searchall, get_csv
:sort: text formatting, text manipulation

:type ANSI_STYLES: dict(str)
:var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code

:type ANSI_COLORS: dict(str)
:var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code

:type ANSI_PREFIX: str
:var ANSI_PREFIX:
  ANSI terminal code notifying the start of an ANSI escape sequence

:type ANSI_END: str
:var ANSI_END:
  ANSI terminal code notifying the end of an ANSI escape sequence

:type ANSI_RESET: str
:var ANSI_RESET:
  ANSI terminal code resetting format defined by a previous ANSI escape sequence
"""
__docformat__ = "restructuredtext en"
 32   
 33  import re 
 34  from unicodedata import normalize as _uninormalize 
 35  try: 
 36      from os import linesep 
 37  except ImportError: 
 38      linesep = '\n' # gae 
 39   
 40   
 41  MANUAL_UNICODE_MAP = { 
 42      u'\xa1': u'!',    # INVERTED EXCLAMATION MARK 
 43      u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE 
 44      u'\u2044': u'/',  # FRACTION SLASH 
 45      u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE 
 46      u'\xa9': u'(c)',  # COPYRIGHT SIGN 
 47      u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 
 48      u'\xe6': u'ae',   # LATIN SMALL LETTER AE 
 49      u'\xae': u'(r)',  # REGISTERED SIGN 
 50      u'\u0153': u'oe', # LATIN SMALL LIGATURE OE 
 51      u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE 
 52      u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE 
 53      u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE 
 54      u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 
 55      u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S 
 56      } 
 57   
def unormalize(ustring, ignorenonascii=False):
    """replace diacritical characters with their corresponding ascii characters

    Each character is first looked up in `MANUAL_UNICODE_MAP`; otherwise it
    is reduced to the base character of its canonical (NFD) decomposition,
    which is itself looked up in `MANUAL_UNICODE_MAP`.

    :type ustring: unicode
    :param ustring: the string to convert

    :type ignorenonascii: bool
    :param ignorenonascii:
      when true, silently drop the characters which can't be converted
      instead of raising an exception

    :raise ValueError:
      if a character can't be converted and `ignorenonascii` is false

    :rtype: unicode
    :return: the converted string
    """
    res = []
    for letter in ustring:
        replacement = MANUAL_UNICODE_MAP.get(letter)
        if replacement is None:
            # Decompose *before* testing the code point: letters above
            # U+00FF such as U+0161 (s with caron) have a plain ASCII base
            # and must not be rejected.
            base = _uninormalize('NFD', letter)[0]
            if ord(base) >= 2**8:
                if ignorenonascii:
                    continue
                raise ValueError("can't deal with non-ascii based characters")
            # NOTE: base characters in U+0080..U+00FF which have neither a
            # decomposition nor a manual mapping (e.g. U+00D7) are returned
            # unchanged, as they always were.
            replacement = MANUAL_UNICODE_MAP.get(base, base)
        res.append(replacement)
    return u''.join(res)
73
def unquote(string):
    """remove optional quotes (simple or double) from the string

    A leading quote and a trailing quote are removed independently: they
    don't have to be of the same kind, nor to be both present.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # the string is empty at this point when the input was a lone quote
    # character, in which case indexing it would raise IndexError
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
# Paragraph separator: two consecutive line breaks, each either LF or CRLF.
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# Any run of whitespace. Raw strings are used so that `\s` is not an invalid
# string escape sequence.
_NORM_SPACES_RGX = re.compile(r'\s+')
def normalize_text(text, line_len=80, indent='', rest=False):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized but blank
    lines are kept. The indentation string may be used to insert a
    comment (#) or a quoting (>) mark for instance.

    The text is cut into paragraphs on blank lines, each paragraph is
    formatted on its own, and the results are joined back with a line
    holding only the indentation string.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true, paragraphs are formatted with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    format_paragraph = normalize_rest_paragraph if rest else normalize_paragraph
    separator = '%s%s%s' % (linesep, indent, linesep)
    paragraphs = [format_paragraph(paragraph, line_len, indent)
                  for paragraph in _BLANKLINES_RGX.split(text)]
    return separator.join(paragraphs)
124 125
def normalize_paragraph(text, line_len=80, indent=''):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    # every run of whitespace (line breaks included) becomes a single space
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    # the indentation counts in the line length
    width = line_len - len(indent)
    wrapped = []
    while remaining:
        head, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + head)
    return linesep.join(wrapped)
154
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """normalize a ReST text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    Source lines are handled one by one: a line which is too long is split
    in several output lines, but the end of a split line is never merged
    with the following source line.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    width = line_len - len(indent)
    output = []
    for source_line in text.splitlines():
        current = _NORM_SPACES_RGX.sub(' ', source_line.strip())
        while len(current) > width:
            # too long line, need split
            head, tail = splittext(current, width)
            output.append(indent + head)
            # what has been cut off is processed again as a line of its own
            current = tail + ' ' if tail else ''
        if current:
            output.append(indent + current.strip())
    return linesep.join(output)
194
def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The text is cut on the last space found at an index in [1, line_len].
    When there is none, it is cut on the first space found from index
    `line_len` on, or not cut at all if no such space exists.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len:
      maximum size of the first line; a negative value is handled as 0

    :rtype: tuple
    :return:
      a 2-tuple:
      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    # a negative size (e.g. an indentation longer than the requested line
    # length) would otherwise be used as a negative slice index below and
    # silently drop characters
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    pos = text.rfind(' ', 1, line_len + 1)
    if pos == -1:
        # no space before the limit: the first word is too long, keep it whole
        pos = text.find(' ', line_len)
        if pos == -1:
            pos = len(text)
    return text[:pos], text[pos + 1:].strip()
212 213
def get_csv(string, sep=','):
    """return a list of strings from a csv formatted line

    >>> get_csv('a, b, c , 4')
    ['a', 'b', 'c', '4']
    >>> get_csv('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list
    :return:
      the fields of the line with surrounding whitespace removed; fields
      which are empty once stripped are left out
    """
    fields = [field.strip() for field in string.split(sep)]
    return [field for field in fields if field]
# Separators ignored between two values: whitespace and commas.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# An optionally negative number, followed by an optional alphabetic unit.
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
__UNITS_URE = r'[a-zA-Z]+'
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE,__UNITS_URE))

# Multipliers converting a value to bytes.
BYTE_UNITS = {
    "B": 1,
    "KB": 1024,
    "MB": 1024 ** 2,
    "GB": 1024 ** 3,
    "TB": 1024 ** 4,
}

# Multipliers converting a value to seconds.
TIME_UNITS = {
    "ms": 0.001,  # a millisecond is 1/1000 of a second (was 0.0001)
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (eg: "1.5m", {'m': 60} -> 90.0).

    Every value found in the string is multiplied by the factor of its unit
    (if it has one), then all the values are summed.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter:
      used to parse every intermediate value (need __add__), default to
      `final`

    :type final: type
    :param final: used to convert the sum of the values, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit

    :raise KeyError: if a unit found in the string is not a key of `units`

    :return: the sum of the parsed values, converted using `final`
    """
    if inter is None:
        inter = final
    # use the pattern given by the caller: the module-level default was
    # hard-coded here, so the `blank_reg` argument had no effect
    string = blank_reg.sub('', string)
    values = []
    for match in value_reg.finditer(string):
        dic = match.groupdict()
        # .get() since the "unit" group is optional in `value_reg`
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            value *= units[unit]
        values.append(value)
    return final(sum(values))
# Matches one line break: CRLF, a run of one or more CR, or LF. Used by
# `pretty_match` to rewrite line breaks to the platform `linesep`.
_LINE_RGX = re.compile('\r\n|\r+|\n')
def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    Line breaks of the returned string are normalized to `linesep`.

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    start = match.start()
    end = match.end()
    # Look for the line holding the match in the *original* string: the
    # match offsets refer to it, and they are no longer valid once line
    # breaks have been rewritten with a different length (e.g. CRLF -> LF).
    before = after = None
    line_start, line_end = 0, len(string)
    for linebreak in _LINE_RGX.finditer(string):
        if linebreak.end() <= start:
            # last line break located before the match
            before = string[:linebreak.start()]
            line_start = linebreak.end()
        elif linebreak.start() >= end:
            # first line break located after the match
            line_end = linebreak.start()
            after = string[linebreak.end():]
            break
    underline = ' ' * (start - line_start) + underline_char * (end - start)
    result = []
    if before is not None:
        result.append(_LINE_RGX.sub(linesep, before))
    result.append(_LINE_RGX.sub(linesep, string[line_start:line_end]))
    result.append(underline)
    if after is not None:
        result.append(_LINE_RGX.sub(linesep, after))
    return linesep.join(result).rstrip()
# Ansi colorization ###########################################################

# An escape sequence is ANSI_PREFIX + ';'-separated codes + ANSI_END.
ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = '\033[0m'

# style identifier -> ANSI code
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}

# color identifier -> ANSI code
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}
def _get_ansi_code(color=None, style=None):
    """return ansi escape code corresponding to color and style

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor
      a style is given
    """
    # style codes come first, the color code last
    codes = [ANSI_STYLES[effect] for effect in get_csv(style)] if style else []
    if color:
        codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
403
def colorize_ansi(msg, color=None, style=None):
    """colorize message by wrapping it with ansi escape codes

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the ansi escaped string, or `msg` itself when there is no escape
      code to apply
    """
    # nothing asked for: leave the text as is
    if color is None and style is None:
        return msg
    prefix = _get_ansi_code(color, style)
    # an empty code means color and style were both empty: don't wrap
    if not prefix:
        return msg
    return '%s%s%s' % (prefix, msg, ANSI_RESET)
432