Package logilab-common-0 ::
Package 36 ::
Package 1 ::
Module textutils
|
|
1 """Some text manipulation utility functions.
2
3 :author: Logilab
4 :copyright: 2003-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
5 :contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
6 :license: General Public License version 2 - http://www.gnu.org/licenses
7
8 :group text formatting: normalize_text, normalize_paragraph, pretty_match,\
9 unquote, colorize_ansi
10 :group text manipulation: searchall, get_csv
11 :sort: text formatting, text manipulation
12
13 :type ANSI_STYLES: dict(str)
14 :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code
15
16 :type ANSI_COLORS: dict(str)
17 :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code
18
19 :type ANSI_PREFIX: str
20 :var ANSI_PREFIX:
21 ANSI terminal code notifying the start of an ANSI escape sequence
22
23 :type ANSI_END: str
24 :var ANSI_END:
25 ANSI terminal code notifying the end of an ANSI escape sequence
26
27 :type ANSI_RESET: str
28 :var ANSI_RESET:
29 ANSI terminal code resetting format defined by a previous ANSI escape sequence
30 """
31 __docformat__ = "restructuredtext en"
32
33 import re
34 from unicodedata import normalize as _uninormalize
35 try:
36 from os import linesep
37 except ImportError:
38 linesep = '\n'
39
40
# Hand-written replacements, looked up before any Unicode decomposition is
# attempted when stripping diacritics from a string.
MANUAL_UNICODE_MAP = {
    u'\xa1': u'!',       # inverted exclamation mark
    u'\u0142': u'l',     # latin small letter l with stroke
    u'\u2044': u'/',     # fraction slash
    u'\xc6': u'AE',      # latin capital letter AE
    u'\xa9': u'(c)',     # copyright sign
    u'\xab': u'"',       # left-pointing double angle quotation mark
    u'\xe6': u'ae',      # latin small letter ae
    u'\xae': u'(r)',     # registered sign
    u'\u0153': u'oe',    # latin small ligature oe
    u'\u0152': u'OE',    # latin capital ligature OE
    u'\xd8': u'O',       # latin capital letter O with stroke
    u'\xf8': u'o',       # latin small letter o with stroke
    u'\xbb': u'"',       # right-pointing double angle quotation mark
    u'\xdf': u'ss',      # latin small letter sharp s
}
57
# NOTE(review): the ``def`` line was missing from the listing this block was
# recovered from.  The parameter names come from the body; the function name
# and the ``ignorenonascii=False`` default are reconstructed -- confirm
# against the upstream module.
def unormalize(ustring, ignorenonascii=False):
    """Replace diacritical characters with their corresponding ascii
    characters.

    Each character is first looked up in `MANUAL_UNICODE_MAP`; when it is not
    listed there, it is replaced by the first code point of its NFD
    decomposition.

    :type ustring: unicode
    :param ustring: the text to convert

    :type ignorenonascii: bool
    :param ignorenonascii:
      when true, characters with a code point of 256 or above that have no
      manual replacement are dropped instead of raising an error

    :raise ValueError:
      if such a character is met and `ignorenonascii` is false

    :rtype: unicode
    :return: the converted text
    """
    res = []
    for letter in ustring:
        try:
            replacement = MANUAL_UNICODE_MAP[letter]
        except KeyError:
            if ord(letter) >= 2 ** 8:
                if ignorenonascii:
                    continue
                raise ValueError("can't deal with non-ascii based characters")
            # keep only the base character, dropping combining marks
            replacement = _uninormalize('NFD', letter)[0]
        res.append(replacement)
    return u''.join(res)
73
def unquote(string):
    """Remove optional quotes (simple or double) from the string.

    A leading quote and a trailing quote are removed independently of each
    other; they do not have to match.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # The string may be empty by now (the input was a lone quote character),
    # in which case indexing its last character would raise IndexError.
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
90
91
# Paragraph separator: two consecutive line breaks (LF or CRLF).
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# Any run of whitespace; raw string so that ``\s`` is not an invalid
# string-literal escape.
_NORM_SPACES_RGX = re.compile(r'\s+')
94
def normalize_text(text, line_len=80, indent='', rest=False):
    """Normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized but blank
    lines are kept. The indentation string may be used to insert a
    comment (#) or a quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true, each paragraph is processed with
      `normalize_rest_paragraph` instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    if rest:
        normp = normalize_rest_paragraph
    else:
        normp = normalize_paragraph
    # Paragraphs are delimited by blank lines and normalized one by one.
    paragraphs = [normp(paragraph, line_len, indent)
                  for paragraph in _BLANKLINES_RGX.split(text)]
    # The blank line put back between paragraphs carries the indentation too.
    return ('%s%s%s' % (linesep, indent, linesep)).join(paragraphs)
124
125
def normalize_paragraph(text, line_len=80, indent=''):
    """Normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    text = _NORM_SPACES_RGX.sub(' ', text)
    # Width left once the indentation is accounted for.  It is clamped to 0
    # because a negative width makes the split return the whole text as the
    # remainder, so the loop below would never terminate.
    line_len = max(line_len - len(indent), 0)
    lines = []
    while text:
        aline, text = splittext(text.strip(), line_len)
        lines.append(indent + aline)
    return linesep.join(lines)
154
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """Normalize a ReST text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    Each line of the input is wrapped on its own: consecutive input lines
    are never merged together.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    lines = []
    # Width left once the indentation is accounted for.  It is clamped to 0
    # because a negative width makes the split return the whole line as the
    # remainder, so the inner loop would never terminate.
    line_len = max(line_len - len(indent), 0)
    for line in text.splitlines():
        line = _NORM_SPACES_RGX.sub(' ', line.strip())
        while len(line) > line_len:
            # too long line, need split
            line, remainder = splittext(line, line_len)
            lines.append(indent + line)
            # Continue with what is left; the trailing space is stripped
            # again before the final append below.
            line = (remainder + ' ') if remainder else ''
        if line:
            lines.append(indent + line.strip())
    return linesep.join(lines)
194
def splittext(text, line_len):
    """Split the given text on space according to the given max line size.

    The split is made on the last space found at or before `line_len`. When
    there is none, it is made on the first space found after `line_len`, or
    not at all if no space is left.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len: expected maximum line's length; negative values are
      handled as 0

    :rtype: tuple
    :return: a 2-uple:
      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    # A negative length would be used as a negative slice index below, making
    # the "rest" overlap the returned line (callers looping until the rest is
    # empty would then never terminate).
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    # search backward for the last space fitting in the line
    pos = min(len(text) - 1, line_len)
    while pos > 0 and text[pos] != ' ':
        pos -= 1
    if pos == 0:
        # no usable space before the limit: search forward for the next one
        pos = min(len(text), line_len)
        while len(text) > pos and text[pos] != ' ':
            pos += 1
    return text[:pos], text[pos + 1:].strip()
212
213
def get_csv(string, sep=','):
    """Return the list of stripped, non-empty fields of a csv formatted line.

    >>> get_csv('a, b, c , 4')
    ['a', 'b', 'c', '4']
    >>> get_csv('a')
    ['a']

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list
    :return:
      the fields of the line, without surrounding whitespace; fields which
      are empty once stripped are dropped
    """
    stripped = (word.strip() for word in string.split(sep))
    return [word for word in stripped if word]
233
# Characters ignored between values: whitespace and commas.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# A numeric literal: optional minus sign, then either digits with a decimal
# point or plain digits.
# NOTE(review): an optional "0"/"0x" prefix is accepted but only decimal
# digits may follow it -- confirm whether hexadecimal input was intended.
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
# A unit name: one or more ASCII letters.
__UNITS_URE = r'[a-zA-Z]+'
# A value followed by an optional unit, exposed as the named groups "value"
# and "unit".
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'
                       % (__VALUE_URE, __UNITS_URE))
239
# Multipliers converting a size expressed in the given unit into bytes
# (powers of 1024).
BYTE_UNITS = {
    "B": 1,
    "KB": 1024,
    "MB": 1024 ** 2,
    "GB": 1024 ** 3,
    "TB": 1024 ** 4,
}
247
# Multipliers converting a duration expressed in the given unit into seconds.
TIME_UNITS = {
    "ms": 0.001,  # one millisecond is a thousandth of a second
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
255
# NOTE(review): the two-line ``def`` header was missing from the listing this
# block was recovered from.  Parameter names come from the docstring and the
# body; the function name, the parameter order and the ``final=float``
# default are reconstructed -- confirm against the upstream module.
def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (eg: "1.5min" with {'min': 60} -> 90.0).

    Every value found in the string is converted with `inter`, multiplied by
    the factor of its unit when it has one, and all the results are summed.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter:
      used to parse every intermediate value (the results are multiplied by
      the unit factors and summed); default to `final`

    :type final: type
    :param final: used to convert the summed value before returning it

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its optional unit

    :raise KeyError: if a unit found in the string is not a key of `units`

    :return: the sum of the parsed values, converted with `final`
    """
    if inter is None:
        inter = final
    # honour the caller's pattern rather than the module default
    string = blank_reg.sub('', string)
    values = []
    for match in value_reg.finditer(string):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            value *= units[unit]
        values.append(value)
    return final(sum(values))
293
# Any line break: CRLF, a run of CR, or LF.
_LINE_RGX = re.compile(r'\r\n|\r+|\n')


def pretty_match(match, string, underline_char='^'):
    """Return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^

    Line breaks of `string` are first replaced by the platform line
    separator. The match offsets are then applied to that converted string,
    so the underline is only aligned when the conversion does not change the
    length of the text preceding the match.

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    start = match.start()
    end = match.end()
    string = _LINE_RGX.sub(linesep, string)
    # locate the beginning of the line holding the start of the match
    start_line_pos = string.rfind(linesep, 0, start)
    if start_line_pos == -1:
        start_line_pos = 0
        result = []
    else:
        # keep everything before that line untouched
        result = [string[:start_line_pos]]
        start_line_pos += len(linesep)
    offset = start - start_line_pos
    underline = ' ' * offset + underline_char * (end - start)
    # locate the end of the line holding the end of the match
    end_line_pos = string.find(linesep, end)
    if end_line_pos == -1:
        result.append(string[start_line_pos:])
        result.append(underline)
    else:
        trailing = string[end_line_pos + len(linesep):]
        result.append(string[start_line_pos:end_line_pos])
        result.append(underline)
        result.append(trailing)
    return linesep.join(result).rstrip()
347
348
349
350
# Start and end of an ANSI escape sequence, and the sequence resetting any
# formatting set by a previous one.
ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = '\033[0m'
# Style identifier -> ANSI terminal code.
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}
# Color identifier -> ANSI terminal code.
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}
374
375
# NOTE(review): the ``def`` line was missing from the listing this block was
# recovered from; the ``None`` defaults are reconstructed from the documented
# "str or None" types -- confirm against the upstream module.
def _get_ansi_code(color=None, style=None):
    """Return ansi escape code corresponding to color and style.

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor a
      style is given
    """
    ansi_code = []
    if style:
        # style codes come first, in the order they were given
        for effect in get_csv(style):
            ansi_code.append(ANSI_STYLES[effect])
    if color:
        ansi_code.append(ANSI_COLORS[color])
    if ansi_code:
        return ANSI_PREFIX + ';'.join(ansi_code) + ANSI_END
    return ''
403
# NOTE(review): the ``def`` line was missing from the listing this block was
# recovered from; the ``None`` defaults are reconstructed from the documented
# "str or None" types -- confirm against the upstream module.
def colorize_ansi(msg, color=None, style=None):
    """Colorize message by wrapping it with ansi escape codes.

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the ansi escaped string, or `msg` unchanged when no escape code applies
    """
    # If both color and style are not defined, then leave the text as is
    if color is None and style is None:
        return msg
    escape_code = _get_ansi_code(color, style)
    # If invalid (or empty) color and style, don't wrap the message
    if escape_code:
        return '%s%s%s' % (escape_code, msg, ANSI_RESET)
    return msg
432