Package logilab :: Package common :: Module textutils
[frames] | no frames]

Source Code for Module logilab.common.textutils

  1  # copyright 2003-2010 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  2  # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  3  # 
  4  # This file is part of logilab-common. 
  5  # 
  6  # logilab-common is free software: you can redistribute it and/or modify it under 
  7  # the terms of the GNU Lesser General Public License as published by the Free 
  8  # Software Foundation, either version 2.1 of the License, or (at your option) any 
  9  # later version. 
 10  # 
 11  # logilab-common is distributed in the hope that it will be useful, but WITHOUT 
 12  # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
 13  # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more 
 14  # details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License along 
 17  # with logilab-common.  If not, see <http://www.gnu.org/licenses/>. 
 18  """Some text manipulation utility functions. 
 19   
 20   
 21  :group text formatting: normalize_text, normalize_paragraph, pretty_match,\ 
 22  unquote, colorize_ansi 
 23  :group text manipulation: searchall, splitstrip 
 24  :sort: text formatting, text manipulation 
 25   
 26  :type ANSI_STYLES: dict(str) 
 27  :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code 
 28   
 29  :type ANSI_COLORS: dict(str) 
 30  :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code 
 31   
 32  :type ANSI_PREFIX: str 
 33  :var ANSI_PREFIX: 
 34    ANSI terminal code notifying the start of an ANSI escape sequence 
 35   
 36  :type ANSI_END: str 
 37  :var ANSI_END: 
 38    ANSI terminal code notifying the end of an ANSI escape sequence 
 39   
 40  :type ANSI_RESET: str 
 41  :var ANSI_RESET: 
 42    ANSI terminal code resetting format defined by a previous ANSI escape sequence 
 43  """ 
 44  __docformat__ = "restructuredtext en" 
 45   
 46  import sys 
 47  import re 
 48  import os.path as osp 
 49  from unicodedata import normalize as _uninormalize 
 50  try: 
 51      from os import linesep 
 52  except ImportError: 
 53      linesep = '\n' # gae 
 54   
 55  from logilab.common.deprecation import deprecated 
 56   
 57  MANUAL_UNICODE_MAP = { 
 58      u'\xa1': u'!',    # INVERTED EXCLAMATION MARK 
 59      u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE 
 60      u'\u2044': u'/',  # FRACTION SLASH 
 61      u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE 
 62      u'\xa9': u'(c)',  # COPYRIGHT SIGN 
 63      u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 
 64      u'\xe6': u'ae',   # LATIN SMALL LETTER AE 
 65      u'\xae': u'(r)',  # REGISTERED SIGN 
 66      u'\u0153': u'oe', # LATIN SMALL LIGATURE OE 
 67      u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE 
 68      u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE 
 69      u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE 
 70      u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 
 71      u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S 
 72      } 
 73   
74 -def unormalize(ustring, ignorenonascii=False):
75 """replace diacritical characters with their corresponding ascii characters 76 77 Convert the unicode string to its long normalized form (unicode character 78 will be transform into several characters) and keep the first one only. 79 The normal form KD (NFKD) will apply the compatibility decomposition, i.e. 80 replace all compatibility characters with their equivalents. 81 82 :see: Another project about ASCII transliterations of Unicode text 83 http://pypi.python.org/pypi/Unidecode 84 """ 85 res = [] 86 for letter in ustring[:]: 87 try: 88 replacement = MANUAL_UNICODE_MAP[letter] 89 except KeyError: 90 if ord(letter) >= 2**8: 91 if ignorenonascii: 92 continue 93 raise ValueError("can't deal with non-ascii based characters") 94 replacement = _uninormalize('NFKD', letter)[0] 95 res.append(replacement) 96 return u''.join(res)
97
def unquote(string):
    """remove optional quotes (simple or double) from the string

    At most one leading and one trailing quote character are removed. Both
    ends are handled independently: the quotes do not have to match and a
    quote present on a single side is removed as well.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return:
      the unquoted string (or the input string if it wasn't quoted); empty
      or None input is returned unchanged
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # the string is empty at this point when the input was a lone quote
    # character, hence the emptiness check before indexing
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
# Paragraph separator: two consecutive line breaks, each one LF or CRLF.
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# Any run of whitespace. Raw strings are used so that the backslash
# sequences reach the regex engine instead of being (invalid, for ``\s``)
# string literal escapes.
_NORM_SPACES_RGX = re.compile(r'\s+')
def normalize_text(text, line_len=80, indent='', rest=False):
    """wrap a multi-paragraph text to a maximum line size, optionally
    prefixing every line with an arbitrary indentation string.

    The text is cut into paragraphs on blank lines; each paragraph is
    formatted separately and the paragraphs are joined back with a line
    holding only the indentation string. The indentation string may be used
    to insert a comment (#) or a quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true, paragraphs are formatted with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    format_paragraph = normalize_rest_paragraph if rest else normalize_paragraph
    paragraphs = [format_paragraph(chunk, line_len, indent)
                  for chunk in _BLANKLINES_RGX.split(text)]
    # paragraphs are separated by a line made of the indentation string only
    separator = '%s%s%s' % (linesep, indent, linesep)
    return separator.join(paragraphs)
148 149
def normalize_paragraph(text, line_len=80, indent=''):
    """wrap a single paragraph to a maximum line size, optionally prefixing
    every line with an arbitrary indentation string.

    Every run of whitespace (line jumps included) is first replaced by a
    single space, then the text is cut into lines with `splittext`. The
    indentation string may be used to insert a comment mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    # the indentation string is counted in the line length
    width = line_len - len(indent)
    wrapped = []
    while remaining:
        head, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + head)
    return linesep.join(wrapped)
178
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """normalize a ReST text to display it with a maximum line size and
    optionally arbitrary indentation. The indentation string may be used
    to insert a comment mark for instance.

    Input lines are handled one at a time: each one is stripped, has its
    whitespace runs replaced by a single space and is split with
    `splittext` while it is longer than the available width. Consecutive
    input lines are not merged together, and lines which are empty once
    stripped produce no output line.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    toreport = ''
    lines = []
    # the indentation string is counted in the line length
    line_len = line_len - len(indent)
    for line in text.splitlines():
        # `toreport` is always empty here: it is reset below each time it
        # has been set
        line = toreport + _NORM_SPACES_RGX.sub(' ', line.strip())
        toreport = ''
        while len(line) > line_len:
            # too long line, need split
            line, toreport = splittext(line, line_len)
            lines.append(indent + line)
            if toreport:
                # what remains becomes the line to examine next; the extra
                # trailing space is stripped when the line is finally added
                line = toreport + ' '
                toreport = ''
            else:
                line = ''
        if line:
            lines.append(indent + line.strip())
    return linesep.join(lines)
218 219
def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The split is done on the last space allowing the first part to fit in
    `line_len` characters. When there is no such space (the first word is
    too long), the split is done on the first space found after `line_len`,
    or not at all if the text holds no further space.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len:
      maximum size of the first part; a negative value is handled as 0

    :rtype: tuple
    :return: a 2-uple:

      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    # a negative width would be used as a negative slice index below, giving
    # back a "rest" which never gets shorter
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    # last space keeping the first part within line_len; index 0 is skipped
    # since splitting there would give an empty first part
    pos = text.rfind(' ', 1, line_len + 1)
    if pos == -1:
        # first word too long: overflow up to the next space, if any
        pos = text.find(' ', line_len)
        if pos == -1:
            pos = len(text)
    return text[:pos], text[pos + 1:].strip()
237 238
def splitstrip(string, sep=','):
    """return a list of stripped string by splitting the string given as
    argument on `sep` (',' by default). Empty string are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list of str or unicode
    :return:
      the stripped, non empty fields of the line, in their original order
    """
    # strip each field once only, then filter on the stripped value
    stripped = (word.strip() for word in string.split(sep))
    return [word for word in stripped if word]
# Former name of `splitstrip`, kept as an alias wrapped by the `deprecated`
# helper with the message below (presumably a warning is emitted when it is
# called -- see logilab.common.deprecation to confirm).
get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)
def split_url_or_path(url_or_path):
    """split a string containing either an url of the form
    <scheme>://<path> or a local file system path on its last component.

    Trailing separators are ignored ('/' for an url, `os.path.sep` for a
    path).

    :type url_or_path: str or unicode
    :param url_or_path: the url or path to split

    :rtype: list or tuple
    :return:
      what precedes the last component followed by the last component
      itself: the list given by `rsplit` for an url, the 2-uple given by
      `os.path.split` for a path
    """
    if '://' not in url_or_path:
        local_path = url_or_path.rstrip(osp.sep)
        return osp.split(local_path)
    trimmed_url = url_or_path.rstrip('/')
    return trimmed_url.rsplit('/', 1)
270 271
def text_to_dict(text):
    """parse a multilines text made of simple 'key=value' lines and return
    the matching {'key': 'value'} dict.

    Keys and values are stripped; blank lines and lines starting with '#'
    are skipped. Only the first '=' of a line is used as separator. When a
    key is found several times, its value becomes the list of all the values
    found, in order.

    >>> text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''')
    {'single': '3', 'multiple': ['1', '2']}

    :type text: str or unicode
    :param text: the text to parse (may be empty or None)

    :raise ValueError: if a line to parse holds no '=' character

    :rtype: dict
    :return: the parsed keys and values
    """
    parsed = {}
    if not text:
        return parsed
    for raw_line in text.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith('#'):
            continue
        key, value = [part.strip() for part in entry.split('=', 1)]
        if key not in parsed:
            parsed[key] = value
        elif isinstance(parsed[key], list):
            # key already met at least twice
            parsed[key].append(value)
        else:
            # second occurrence: turn the single value into a list
            parsed[key] = [parsed[key], value]
    return parsed
# Separators ignored between the values: whitespace and commas.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# An optionally negative number, with or without a decimal part.
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
# A unit is a run of ascii letters directly following the number.
__UNITS_URE = r'[a-zA-Z]+'
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE, __UNITS_URE))

# Byte units, expressed in bytes (powers of 1024).
BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# Time units, expressed in seconds.
TIME_UNITS = {
    "ms": 0.001,  # one millisecond is a thousandth of a second
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5min", {'min': 60} -> 90.0).

    Blanks are removed first, then every value found is converted with
    `inter` and multiplied by its unit when it has one. The values are
    summed and the sum is converted with `final`.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units:
      a dict mapping a unit string repr to its value; units are looked up
      lower-cased

    :type inter: type
    :param inter:
      used to parse every intermediate value (which then need to support
      multiplication and addition), default to `final`

    :type final: type
    :param final: used to convert the sum of the values, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit into the string

    :raise KeyError: if a unit found in the string is not defined in `units`

    :return: the sum of the parsed values, as returned by `final`
    """
    if inter is None:
        inter = final
    # use the regexp given by the caller, not the module level default
    string = blank_reg.sub('', string)
    values = []
    for match in value_reg.finditer(string):
        dic = match.groupdict()
        # `.get`: a custom value_reg may have no "unit" group at all
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, units.keys()))
        values.append(value)
    return final(sum(values))
# Line endings replaced by the platform line separator in `pretty_match`.
_LINE_RGX = re.compile('\r\n|\r+|\n')


def pretty_match(match, string, underline_char='^'):
    """return the string with an extra line underlining the match location:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    Line endings of the string are replaced by the platform line separator
    before the positions given by the match object are used.

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location, without trailing whitespace
    """
    match_start, match_end = match.start(), match.end()
    text = _LINE_RGX.sub(linesep, string)
    sep_len = len(linesep)

    pieces = []
    # beginning of the line holding the start of the match
    line_start = text.rfind(linesep, 0, match_start)
    if line_start == -1:
        line_start = 0
    else:
        # everything before that line is given back untouched
        pieces.append(text[:line_start])
        line_start += sep_len

    padding = ' ' * (match_start - line_start)
    marker = padding + underline_char * (match_end - match_start)

    # end of the line holding the end of the match
    line_end = text.find(linesep, match_end)
    if line_end == -1:
        pieces.extend([text[line_start:], marker])
    else:
        pieces.extend([text[line_start:line_end], marker,
                       text[line_end + sep_len:]])
    return linesep.join(pieces).rstrip()
# Ansi colorization ###########################################################

# An escape sequence is ANSI_PREFIX + ';'-separated codes + ANSI_END.
ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = '\033[0m'

# style identifier -> ANSI code
ANSI_STYLES = dict(
    reset="0",
    bold="1",
    italic="3",
    underline="4",
    blink="5",
    inverse="7",
    strike="9",
)

# color identifier -> ANSI code
ANSI_COLORS = dict(
    reset="0",
    black="30",
    red="31",
    green="32",
    yellow="33",
    blue="34",
    magenta="35",
    cyan="36",
    white="37",
)
def _get_ansi_code(color=None, style=None):
    """build the ansi escape code matching the given color and style

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor a
      style is given
    """
    codes = []
    if style:
        codes.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        if color.isdigit():
            # color given as a number: '38;5;<number>' sequence
            codes += ['38', '5', color]
        else:
            codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
476
def colorize_ansi(msg, color=None, style=None):
    """wrap the message with ansi escape codes to colorize it

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the ansi escaped string, or the message itself when no escape code
      applies
    """
    # nothing asked for: leave the text as is
    if color is None and style is None:
        return msg
    prefix = _get_ansi_code(color, style)
    # empty color and style give no escape code: don't wrap the message
    if not prefix:
        return msg
    return '%s%s%s' % (prefix, msg, ANSI_RESET)
# Default colors used by `diff_colorize_ansi` for each kind of diff line.
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}


def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):
    """write the lines of a diff to a stream, colorized with ansi codes

    Lines starting with '--- ' or '+++ ' use the 'separator' color, other
    lines starting with '-' the 'remove' color and other lines starting
    with '+' the 'add' color. Any other line, empty ones included, is
    written unchanged. Nothing is added to the lines: they are expected to
    hold their own line ending.

    :type lines: iterable of str or unicode
    :param lines: the lines of the diff

    :type out: file-like object
    :param out:
      the stream to write to (anything with a `write` method), default to
      the value `sys.stdout` had when this module was imported

    :type style: dict
    :param style:
      color to use for each of the 'separator', 'remove' and 'add' keys
      (see `ANSI_COLORS` for available values), default to `DIFF_STYLE`
    """
    for line in lines:
        # the file header check comes first since these lines start with
        # '-' or '+' as well; slices are used so that an empty line does
        # not raise an IndexError
        if line[:4] in ('--- ', '+++ '):
            line = colorize_ansi(line, style['separator'])
        elif line[:1] == '-':
            line = colorize_ansi(line, style['remove'])
        elif line[:1] == '+':
            line = colorize_ansi(line, style['add'])
        out.write(line)
522