Class | ActiveSupport::Multibyte::Handlers::UTF8Handler |
In: |
vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
|
Parent: | Object |
UTF8Handler implements Unicode-aware operations for strings. These operations are used by the Chars proxy when $KCODE is set to 'UTF8'.
HANGUL_SBASE | = | 0xAC00 | Hangul character boundaries and properties | |
HANGUL_LBASE | = | 0x1100 | ||
HANGUL_VBASE | = | 0x1161 | ||
HANGUL_TBASE | = | 0x11A7 | ||
HANGUL_LCOUNT | = | 19 | ||
HANGUL_VCOUNT | = | 21 | ||
HANGUL_TCOUNT | = | 28 | ||
HANGUL_NCOUNT | = | HANGUL_VCOUNT * HANGUL_TCOUNT | ||
HANGUL_SCOUNT | = | 11172 | ||
HANGUL_SLAST | = | HANGUL_SBASE + HANGUL_SCOUNT | ||
HANGUL_JAMO_FIRST | = | 0x1100 | ||
HANGUL_JAMO_LAST | = | 0x11FF | ||
UNICODE_WHITESPACE | = | [ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D> 0x0020, # White_Space # Zs SPACE 0x0085, # White_Space # Cc <control-0085> 0x00A0, # White_Space # Zs NO-BREAK SPACE 0x1680, # White_Space # Zs OGHAM SPACE MARK 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE 0x2028, # White_Space # Zl LINE SEPARATOR 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE ].flatten.freeze | All the unicode whitespace | |
UNICODE_LEADERS_AND_TRAILERS | = | UNICODE_WHITESPACE + [65279] | The BOM (byte order mark, U+FEFF) can also be seen as whitespace: it's a non-rendering character used to distinguish between little and big endian. Byte order is not an issue in UTF-8, so the BOM must be ignored. | |
UTF8_PAT | = | /\A(?: [\x00-\x7f] | [\xc2-\xdf] [\x80-\xbf] | \xe0 [\xa0-\xbf] [\x80-\xbf] | [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] )*\z/xn | Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) | |
UNICODE_TRAILERS_PAT | = | /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ | ||
UNICODE_LEADERS_PAT | = | /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ | ||
UCD | = | UnicodeDatabase.new | Unicode database |
size | -> | length |
slice | -> | [] |
Works just like the indexed replace method on string, except instead of byte offsets you specify character offsets.
Example:
  s = "Müller"
  s.chars[2] = "e" # Replace character with offset 2
  s # => "Müeler"

  s = "Müller"
  s.chars[1, 2] = "ö" # Replace 2 characters at character offset 1
  s # => "Möler"
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 156 156: def []=(str, *args) 157: replace_by = args.pop 158: # Indexed replace with regular expressions already works 159: return str[*args] = replace_by if args.first.is_a?(Regexp) 160: result = u_unpack(str) 161: if args[0].is_a?(Fixnum) 162: raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length 163: min = args[0] 164: max = args[1].nil? ? min : (min + args[1] - 1) 165: range = Range.new(min, max) 166: replace_by = [replace_by].pack('U') if replace_by.is_a?(Fixnum) 167: elsif args.first.is_a?(Range) 168: raise RangeError, "#{args[0]} out of range" if args[0].min >= result.length 169: range = args[0] 170: else 171: needle = args[0].to_s 172: min = index(str, needle) 173: max = min + length(needle) - 1 174: range = Range.new(min, max) 175: end 176: result[range] = u_unpack(replace_by) 177: str.replace(result.pack('U*')) 178: end
Returns a copy of str with the first character converted to uppercase and the remainder to lowercase
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 274 274: def capitalize(str) 275: upcase(slice(str, 0..0)) + downcase(slice(str, 1..-1) || '') 276: end
Works just like String#center, only integer specifies characters instead of bytes.
Example:
  "¾ cup".chars.center(8).to_s
  # => " ¾ cup  "
  "¾ cup".chars.center(8, " ").to_s # Use non-breaking whitespace
  # => " ¾ cup  "
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 215 215: def center(str, integer, padstr=' ') 216: justify(str, integer, :center, padstr) 217: end
Perform composition on the characters in the string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 312 312: def compose(str) 313: compose_codepoints u_unpack(str).pack('U*') 314: end
Checks if the string is valid UTF8.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 342 342: def consumes?(str) 343: # Unpack is a little bit faster than regular expressions 344: begin 345: str.unpack('U*') 346: true 347: rescue ArgumentError 348: false 349: end 350: end
Perform decomposition on the characters in the string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 307 307: def decompose(str) 308: decompose_codepoints(:canonical, u_unpack(str)).pack('U*') 309: end
Convert characters in the string to lowercase
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 271 271: def downcase(str); to_case :lowercase_mapping, str; end
Returns the number of grapheme clusters in the string. This method is very likely to be moved or renamed in future versions.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 354 354: def g_length(str) 355: g_unpack(str).length 356: end
Returns the position of the passed argument in the string, counting in codepoints
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 139 139: def index(str, *args) 140: bidx = str.index(*args) 141: bidx ? (u_unpack(str.slice(0...bidx)).size) : nil 142: end
Inserts the passed string at specified codepoint offsets
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 129 129: def insert(str, offset, fragment) 130: str.replace( 131: u_unpack(str).insert( 132: offset, 133: u_unpack(fragment) 134: ).flatten.pack('U*') 135: ) 136: end
Works just like String#ljust, only integer specifies characters instead of bytes.
Example:
  "¾ cup".chars.ljust(8).to_s
  # => "¾ cup   "
  "¾ cup".chars.ljust(8, " ").to_s # Use non-breaking whitespace
  # => "¾ cup   "
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 202 202: def ljust(str, integer, padstr=' ') 203: justify(str, integer, :left, padstr) 204: end
Returns the KC normalization of the string by default. NFKC is considered the best normalization form for passing strings to databases and validations.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 289 289: def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM) 290: # See http://www.unicode.org/reports/tr15, Table 1 291: codepoints = u_unpack(str) 292: case form 293: when :d 294: reorder_characters(decompose_codepoints(:canonical, codepoints)) 295: when :c 296: compose_codepoints reorder_characters(decompose_codepoints(:canonical, codepoints)) 297: when :kd 298: reorder_characters(decompose_codepoints(:compatability, codepoints)) 299: when :kc 300: compose_codepoints reorder_characters(decompose_codepoints(:compatability, codepoints)) 301: else 302: raise ArgumentError, "#{form} is not a valid normalization variant", caller 303: end.pack('U*') 304: end
Reverses codepoints in the string.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 241 241: def reverse(str) 242: u_unpack(str).reverse.pack('U*') 243: end
Works just like String#rjust, only integer specifies characters instead of bytes.
Example:
  "¾ cup".chars.rjust(8).to_s
  # => "   ¾ cup"
  "¾ cup".chars.rjust(8, " ").to_s # Use non-breaking whitespace
  # => "   ¾ cup"
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 189 189: def rjust(str, integer, padstr=' ') 190: justify(str, integer, :right, padstr) 191: end
Returns the number of codepoints in the string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 235 235: def size(str) 236: u_unpack(str).size 237: end
Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that character.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 247 247: def slice(str, *args) 248: if args.size > 2 249: raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native 250: elsif (args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp))) 251: raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native 252: elsif (args.size == 2 && !args[1].is_a?(Numeric)) 253: raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native 254: elsif args[0].kind_of? Range 255: cps = u_unpack(str).slice(*args) 256: cps.nil? ? nil : cps.pack('U*') 257: elsif args[0].kind_of? Regexp 258: str.slice(*args) 259: elsif args.size == 1 && args[0].kind_of?(Numeric) 260: u_unpack(str)[args[0]] 261: else 262: u_unpack(str).slice(*args).pack('U*') 263: end 264: end
Removes leading and trailing Unicode whitespace (and any byte order mark) from the string.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 230 230: def strip(str) 231: str.gsub(UNICODE_LEADERS_PAT, '').gsub(UNICODE_TRAILERS_PAT, '') 232: end
Replaces all non-UTF-8 bytes with their ISO-8859-1 or CP1252 equivalents, resulting in a valid UTF-8 string.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 359 359: def tidy_bytes(str) 360: str.split(//u).map do |c| 361: if !UTF8_PAT.match(c) 362: n = c.unpack('C')[0] 363: n < 128 ? n.chr : 364: n < 160 ? [UCD.cp1252[n] || n].pack('U') : 365: n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr 366: else 367: c 368: end 369: end.join 370: end
Used to translate an offset from bytes to characters, for instance one received from a regular expression match
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 321 321: def translate_offset(str, byte_offset) 322: return nil if byte_offset.nil? 323: return 0 if str == '' 324: chunk = str[0..byte_offset] 325: begin 326: begin 327: chunk.unpack('U*').length - 1 328: rescue ArgumentError => e 329: chunk = str[0..(byte_offset+=1)] 330: # Stop retrying at the end of the string 331: raise e unless byte_offset < chunk.length 332: # We damaged a character, retry 333: retry 334: end 335: # Catch the ArgumentError so we can throw our own 336: rescue ArgumentError 337: raise EncodingError.new('malformed UTF-8 character') 338: end 339: end
Convert characters in the string to uppercase
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 268 268: def upcase(str); to_case :uppercase_mapping, str; end
Compose decomposed characters to the composed form
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 503 503: def compose_codepoints(codepoints) 504: pos = 0 505: eoa = codepoints.length - 1 506: starter_pos = 0 507: starter_char = codepoints[0] 508: previous_combining_class = -1 509: while pos < eoa 510: pos += 1 511: lindex = starter_char - HANGUL_LBASE 512: # -- Hangul 513: if 0 <= lindex and lindex < HANGUL_LCOUNT 514: vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1 515: if 0 <= vindex and vindex < HANGUL_VCOUNT 516: tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1 517: if 0 <= tindex and tindex < HANGUL_TCOUNT 518: j = starter_pos + 2 519: eoa -= 2 520: else 521: tindex = 0 522: j = starter_pos + 1 523: eoa -= 1 524: end 525: codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE 526: end 527: starter_pos += 1 528: starter_char = codepoints[starter_pos] 529: # -- Other characters 530: else 531: current_char = codepoints[pos] 532: current = UCD[current_char] 533: if current.combining_class > previous_combining_class 534: if ref = UCD.composition_map[starter_char] 535: composition = ref[current_char] 536: else 537: composition = nil 538: end 539: unless composition.nil? 540: codepoints[starter_pos] = composition 541: starter_char = composition 542: codepoints.delete_at pos 543: eoa -= 1 544: pos -= 1 545: previous_combining_class = -1 546: else 547: previous_combining_class = current.combining_class 548: end 549: else 550: previous_combining_class = current.combining_class 551: end 552: if current.combining_class == 0 553: starter_pos = pos 554: starter_char = codepoints[pos] 555: end 556: end 557: end 558: codepoints 559: end
Decompose composed characters to the decomposed form
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 482 482: def decompose_codepoints(type, codepoints) 483: codepoints.inject([]) do |decomposed, cp| 484: # if it's a hangul syllable starter character 485: if HANGUL_SBASE <= cp and cp < HANGUL_SLAST 486: sindex = cp - HANGUL_SBASE 487: ncp = [] # new codepoints 488: ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT 489: ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT 490: tindex = sindex % HANGUL_TCOUNT 491: ncp << (HANGUL_TBASE + tindex) unless tindex == 0 492: decomposed.concat ncp 493: # if the codepoint is decomposable in with the current decomposition type 494: elsif (ncp = UCD[cp].decomp_mapping) and (!UCD[cp].decomp_type || type == :compatability) 495: decomposed.concat decompose_codepoints(type, ncp.dup) 496: else 497: decomposed << cp 498: end 499: end 500: end
Unpack the string at grapheme boundaries instead of codepoint boundaries
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 390 390: def g_unpack(str) 391: codepoints = u_unpack(str) 392: unpacked = [] 393: pos = 0 394: marker = 0 395: eoc = codepoints.length 396: while(pos < eoc) 397: pos += 1 398: previous = codepoints[pos-1] 399: current = codepoints[pos] 400: if ( 401: # CR X LF 402: one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or 403: # L X (L|V|LV|LVT) 404: two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or 405: # (LV|V) X (V|T) 406: three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or 407: # (LVT|T) X (T) 408: four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or 409: # X Extend 410: five = (UCD.boundary[:extend] === current) 411: ) 412: else 413: unpacked << codepoints[marker..pos-1] 414: marker = pos 415: end 416: end 417: unpacked 418: end
Detect whether the codepoint is in a certain character class. Primarily used by the grapheme cluster support.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 376 376: def in_char_class?(codepoint, classes) 377: classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false 378: end
Justifies a string in a certain way. Valid values for way are :right, :left and :center. It is primarily used as a helper method by rjust, ljust and center.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 427 427: def justify(str, integer, way, padstr=' ') 428: raise ArgumentError, "zero width padding" if padstr.length == 0 429: padsize = integer - size(str) 430: padsize = padsize > 0 ? padsize : 0 431: case way 432: when :right 433: str.dup.insert(0, padding(padsize, padstr)) 434: when :left 435: str.dup.insert(-1, padding(padsize, padstr)) 436: when :center 437: lpad = padding((padsize / 2.0).floor, padstr) 438: rpad = padding((padsize / 2.0).ceil, padstr) 439: str.dup.insert(0, lpad).insert(-1, rpad) 440: end 441: end
Generates a padding string of a certain size.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 444 444: def padding(padsize, padstr=' ') 445: if padsize != 0 446: slice(padstr * ((padsize / size(padstr)) + 1), 0, padsize) 447: else 448: '' 449: end 450: end
Re-order codepoints so the string becomes canonical
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 466 466: def reorder_characters(codepoints) 467: length = codepoints.length- 1 468: pos = 0 469: while pos < length do 470: cp1, cp2 = UCD[codepoints[pos]], UCD[codepoints[pos+1]] 471: if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0) 472: codepoints[pos..pos+1] = cp2.code, cp1.code 473: pos += (pos > 0 ? -1 : 1) 474: else 475: pos += 1 476: end 477: end 478: codepoints 479: end
Convert characters to a different case
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 453 453: def to_case(way, str) 454: u_unpack(str).map do |codepoint| 455: cp = UCD[codepoint] 456: unless cp.nil? 457: ncp = cp.send(way) 458: ncp > 0 ? ncp : codepoint 459: else 460: codepoint 461: end 462: end.pack('U*') 463: end