1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 """Some text manipulation utility functions.
19
20
21 :group text formatting: normalize_text, normalize_paragraph, pretty_match,\
22 unquote, colorize_ansi
23 :group text manipulation: searchall, splitstrip
24 :sort: text formatting, text manipulation
25
26 :type ANSI_STYLES: dict(str)
27 :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code
28
29 :type ANSI_COLORS: dict(str)
30 :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code
31
32 :type ANSI_PREFIX: str
33 :var ANSI_PREFIX:
34 ANSI terminal code notifying the start of an ANSI escape sequence
35
36 :type ANSI_END: str
37 :var ANSI_END:
38 ANSI terminal code notifying the end of an ANSI escape sequence
39
40 :type ANSI_RESET: str
41 :var ANSI_RESET:
42 ANSI terminal code resetting format defined by a previous ANSI escape sequence
43 """
44 __docformat__ = "restructuredtext en"
45
46 import sys
47 import re
48 import os.path as osp
49 from unicodedata import normalize as _uninormalize
50 try:
51 from os import linesep
52 except ImportError:
53 linesep = '\n'
54
55 from logilab.common.deprecation import deprecated
56
# Hand-written ASCII replacements, keyed by a single unicode character.
# The diacritics-stripping code below looks a character up here first and
# only falls back to NFKD decomposition when it is absent from this table.
MANUAL_UNICODE_MAP = {
    # punctuation and symbols
    u'\xa1': u'!',      # inverted exclamation mark
    u'\u2044': u'/',    # fraction slash
    u'\xab': u'"',      # left-pointing double angle quotation mark
    u'\xbb': u'"',      # right-pointing double angle quotation mark
    u'\xa9': u'(c)',    # copyright sign
    u'\xae': u'(r)',    # registered sign
    # ligatures
    u'\xc6': u'AE',     # latin capital letter AE
    u'\xe6': u'ae',     # latin small letter ae
    u'\u0152': u'OE',   # latin capital ligature OE
    u'\u0153': u'oe',   # latin small ligature oe
    u'\xdf': u'ss',     # latin small letter sharp s
    # letters carrying a stroke
    u'\xd8': u'O',      # latin capital letter O with stroke
    u'\xf8': u'o',      # latin small letter o with stroke
    u'\u0142': u'l',    # latin small letter l with stroke
}
73
75 """replace diacritical characters with their corresponding ascii characters
76
77 Convert the unicode string to its long normalized form (each unicode character
78 is transformed into several characters) and keep the first one only.
79 The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
80 replace all compatibility characters with their equivalents.
81
82 :see: Another project about ASCII transliterations of Unicode text
83 http://pypi.python.org/pypi/Unidecode
84 """
85 res = []
86 for letter in ustring[:]:
87 try:
88 replacement = MANUAL_UNICODE_MAP[letter]
89 except KeyError:
90 if ord(letter) >= 2**8:
91 if ignorenonascii:
92 continue
93 raise ValueError("can't deal with non-ascii based characters")
94 replacement = _uninormalize('NFKD', letter)[0]
95 res.append(replacement)
96 return u''.join(res)
97
def unquote(string):
    """remove optional quotes (simple or double) from the string

    A leading quote and a trailing quote are removed independently of each
    other: they do not have to match, and a string quoted on one side only
    still loses that quote.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # the string is empty here when the input was a single quote character,
    # in which case there is no last character left to inspect
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
114
115
# Raw strings keep the backslashes for the regex engine instead of relying on
# Python's handling of (possibly invalid) string escape sequences.
# Separator between two paragraphs: two consecutive line ends, each with an
# optional carriage return.
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# Any run of whitespace, collapsed to a single space when normalizing.
_NORM_SPACES_RGX = re.compile(r'\s+')
118
def normalize_text(text, line_len=80, indent='', rest=False):
    """wrap a text made of several paragraphs

    Paragraphs are the chunks of `text` separated by a blank line. Each one
    is wrapped on its own, then the wrapped paragraphs are joined back with
    a separating line that only holds the indentation string. That string
    may be used to insert a comment (#) or a quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true, paragraphs are wrapped with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    wrap_paragraph = normalize_rest_paragraph if rest else normalize_paragraph
    separator = '%s%s%s' % (linesep, indent, linesep)
    return separator.join([wrap_paragraph(paragraph, line_len, indent)
                           for paragraph in _BLANKLINES_RGX.split(text)])
148
149
def normalize_paragraph(text, line_len=80, indent=''):
    """wrap a single paragraph

    Every run of whitespace (line jumps included) is first collapsed to one
    space, then the text is cut into lines with `splittext`. The
    indentation string may be used to insert a comment mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    # the indentation eats into the room available on each line
    width = line_len - len(indent)
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    wrapped = []
    while remaining:
        chunk, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + chunk)
    return linesep.join(wrapped)
178
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """wrap a ReST paragraph, keeping its existing line breaks

    Each input line is handled separately: its whitespace runs are collapsed
    to one space and it is only split (with `splittext`) when it is longer
    than the available width, so short lines are never merged together.
    Lines left empty are dropped. The indentation string may be used to
    insert a comment mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    width = line_len - len(indent)
    carry = ''
    wrapped = []
    for raw_line in text.splitlines():
        current = carry + _NORM_SPACES_RGX.sub(' ', raw_line.strip())
        carry = ''
        while len(current) > width:
            # too long: emit the head and keep working on what is left
            current, carry = splittext(current, width)
            wrapped.append(indent + current)
            if carry:
                current, carry = carry + ' ', ''
            else:
                current = ''
        if current:
            wrapped.append(indent + current.strip())
    return linesep.join(wrapped)
218
219
def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The cut happens on the last space found at or before `line_len`. When
    there is none, the first word is kept whole (the returned line is then
    longer than `line_len`) and the cut happens on the first space after it.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len:
      expected maximum line's length; a negative value is handled like 0

    :rtype: tuple
    :return:
      a 2-tuple:
      * a line <= line_len if possible
      * the rest of the text (stripped) which has to be reported on
        another line
    """
    # a negative length would be used as an index relative to the end of
    # the text and silently drop a character, so clamp it
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    # last space whose index is in [1, line_len]
    pos = text.rfind(' ', 1, line_len + 1)
    if pos == -1:
        # no space to cut on before the limit: use the next one after it,
        # or keep the whole text when there is none
        pos = text.find(' ', line_len)
        if pos == -1:
            pos = len(text)
    return text[:pos], text[pos + 1:].strip()
237
238
def splitstrip(string, sep=','):
    """return a list of stripped string by splitting the string given as
    argument on `sep` (',' by default). Empty string are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list(str or unicode)
    :return:
      the stripped, non empty fields of `string`, in their original order
    """
    # strip each field once, then drop the ones left empty
    stripped = [word.strip() for word in string.split(sep)]
    return [word for word in stripped if word]
259
# Old name of `splitstrip`, kept so that existing callers keep working.
# NOTE(review): `deprecated` comes from logilab.common.deprecation; presumably
# it warns with the message below on each call - confirm against that module.
260 get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)
261
262
264 """split a string containing either a URL of the form <scheme>://<path>
265 or a local file system path into its leading part and its last component
266 """
267 if '://' in url_or_path:
268 return url_or_path.rstrip('/').rsplit('/', 1)
269 return osp.split(url_or_path.rstrip(osp.sep))
270
271
def text_to_dict(text):
    """parse multilines text containing simple 'key=value' lines and return a
    dict of {'key': 'value'}. When the same key is encountered multiple time,
    value is turned into a list containing all values.

    Keys and values are stripped. Empty lines and lines starting with '#'
    are skipped; any other line must contain a '=' (only the first one
    separates the key from the value), else ValueError is raised.

    >>> text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''')
    {'single': '3', 'multiple': ['1', '2']}

    :type text: str or unicode or None
    :param text: the text to parse; an empty dict is returned when it's empty

    :rtype: dict
    :return: the parsed keys, each mapped to a string or a list of strings
    """
    parsed = {}
    if not text:
        return parsed
    for raw_line in text.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith('#'):
            continue
        key, value = [part.strip() for part in entry.split('=', 1)]
        if key not in parsed:
            parsed[key] = value
            continue
        previous = parsed[key]
        try:
            # already a list: this is at least the third occurrence
            previous.append(value)
        except AttributeError:
            # still a plain string: second occurrence, switch to a list
            parsed[key] = [previous, value]
    return parsed
299
300
# Blank characters (whitespace or comma).
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# An optionally negative number, either with a decimal point or made of
# digits optionally prefixed by "0" / "0x".
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
# A unit is a run of ASCII letters.
__UNITS_URE = r'[a-zA-Z]+'
# A value followed by an optional unit, exposed as the "value" and "unit"
# named groups.
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?' % (__VALUE_URE, __UNITS_URE))

# Size units, expressed in bytes.
BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# Duration units, expressed in seconds.
TIME_UNITS = {
    "ms": 0.001,  # one thousandth of a second (was wrongly 0.0001)
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
322
325 """Parse the string applying the units defined in units
326 (e.g.: "1.5m", {'m': 60} -> 90).
327
328 :type string: str or unicode
329 :param string: the string to parse
330
331 :type units: dict (or any object with __getitem__ using basestring key)
332 :param units: a dict mapping a unit string repr to its value
333
334 :type inter: type
335 :param inter: used to parse every intermediate value (the result must support multiplication by a unit and addition)
336
337 :type blank_reg: regexp
338 :param blank_reg: should match every blank char to ignore.
339
340 :type value_reg: regexp with "value" and optional "unit" group
341 :param value_reg: matches a value and its unit in the string
342 """
343 if inter is None:
344 inter = final
345 string = _BLANK_RE.sub('', string)
346 values = []
347 for match in value_reg.finditer(string):
348 dic = match.groupdict()
349
350
351 lit, unit = dic["value"], dic.get("unit")
352 value = inter(lit)
353 if unit is not None:
354 try:
355 value *= units[unit.lower()]
356 except KeyError:
357 raise KeyError('invalid unit %s. valid units are %s' %
358 (unit, units.keys()))
359 values.append(value)
360 return final(sum(values))
361
362
# Any flavour of line end: CR LF, a run of CR, or a single LF.
_LINE_RGX = re.compile('\r\n|\r+|\n')


def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    first, last = match.start(), match.end()
    # every line end is rewritten to the platform one before searching
    normalized = _LINE_RGX.sub(linesep, string)
    eol_len = len(linesep)
    pieces = []
    line_start = normalized.rfind(linesep, 0, first)
    if line_start == -1:
        # the match starts on the very first line
        line_start = 0
    else:
        # keep everything above the matched line, then skip the line end
        pieces.append(normalized[:line_start])
        line_start += eol_len
    marker = ' ' * (first - line_start) + underline_char * (last - first)
    line_end = normalized.find(linesep, last)
    if line_end == -1:
        # the match ends on the last line: nothing follows the underline
        pieces.extend([normalized[line_start:], marker])
    else:
        pieces.extend([normalized[line_start:line_end], marker,
                       normalized[line_end + eol_len:]])
    return linesep.join(pieces).rstrip()
416
417
418
419
# Start and end of an ANSI escape sequence; the codes go in between.
ANSI_PREFIX = '\033['
ANSI_END = 'm'
# Complete sequence resetting the format set by a previous escape sequence.
ANSI_RESET = '\033[0m'

# style identifier -> ANSI terminal code
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}

# color identifier -> ANSI terminal code
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}
443
def _get_ansi_code(color=None, style=None):
    """build the ansi escape code selecting the given color and style

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unknown color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor
      a style is given
    """
    codes = []
    if style:
        codes.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        if color.isdigit():
            # numeric color: select it through the "38;5;<number>" form
            codes += ['38', '5', color]
        else:
            codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
476
def colorize_ansi(msg, color=None, style=None):
    """wrap a message with the ansi escape codes selecting a color and style

    The message is returned untouched when neither a color nor a style is
    requested, or when they produce no escape code.

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unknown color or style identifier is given

    :rtype: str or unicode
    :return: the ansi escaped string
    """
    if color is not None or style is not None:
        prefix = _get_ansi_code(color, style)
        if prefix:
            # always close with a reset so the format doesn't leak further
            return '%s%s%s' % (prefix, msg, ANSI_RESET)
    return msg
505
# Color identifier (a key of ANSI_COLORS) per kind of diff line.
# NOTE(review): the code reading this mapping is outside this view -
# presumably a diff colorizer; confirm before changing the keys.
DIFF_STYLE = {
    'separator': 'cyan',
    'remove': 'red',
    'add': 'green',
}
507
522