module Addressable::IDNA

Constants

ACE_MAX_LENGTH
ACE_PREFIX
COMPOSITION_TABLE
HANGUL_LBASE
HANGUL_LCOUNT
HANGUL_NCOUNT
HANGUL_SBASE
HANGUL_SCOUNT
HANGUL_TBASE
HANGUL_TCOUNT
HANGUL_VBASE
HANGUL_VCOUNT
PUNYCODE_BASE
PUNYCODE_DAMP
PUNYCODE_DELIMITER
PUNYCODE_INITIAL_BIAS
PUNYCODE_INITIAL_N
PUNYCODE_MAXINT
PUNYCODE_PRINT_ASCII
PUNYCODE_SKEW
PUNYCODE_TMAX
PUNYCODE_TMIN
UNICODE_DATA

This is a sparse Unicode table. Codepoints without entries are assumed to have the value: [0, 0, nil, nil, nil, nil, nil]

UNICODE_DATA_CANONICAL
UNICODE_DATA_COMBINING_CLASS
UNICODE_DATA_COMPATIBILITY
UNICODE_DATA_EXCLUSION
UNICODE_DATA_LOWERCASE
UNICODE_DATA_TITLECASE
UNICODE_DATA_UPPERCASE
UNICODE_MAX_LENGTH
UNICODE_TABLE

This module is loosely based on idn_actionmailer by Mick Staugaard, the unicode library by Yoshida Masato, and the punycode implementation by Kazuhiro Nishiyama. Most of the code was copied verbatim, with some reformatting and some translation from C.

Without their code to work from as a base, we’d all still be relying on the presence of libidn, which nobody ever seems to have installed.

Original sources: github.com/staugaard/idn_actionmailer, www.yoshidam.net/Ruby.html#unicode, and rubyforge.org/frs/?group_id=2550

UTF8_REGEX
UTF8_REGEX_MULTIBYTE

Public Class Methods

to_ascii(value) click to toggle source
# File lib/addressable/idna/native.rb, line 36
# Converts a dotted name to its ASCII form, one label at a time.
#
# The value is coerced with +to_s+ and split on "." (keeping trailing empty
# labels). Each label is handled independently:
# * empty labels stay empty,
# * labels of 64 or more characters are passed through untouched,
# * everything else goes through IDN::Idna.toASCII with ALLOW_UNASSIGNED.
#
# @param value [#to_s] The name to convert.
# @return [String] The labels re-joined with ".".
def self.to_ascii(value)
  labels = value.to_s.split('.', -1)
  converted = labels.map do |label|
    length = label.size
    next '' if length.zero?
    next label if length >= 64

    IDN::Idna.toASCII(label, IDN::Idna::ALLOW_UNASSIGNED)
  end
  converted.join('.')
end
to_unicode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 48
# Converts a dotted name to its Unicode form, one label at a time.
#
# The value is coerced with +to_s+ and split on "." (keeping trailing empty
# labels). Each label is handled independently:
# * empty labels stay empty,
# * labels of 64 or more characters are passed through untouched,
# * everything else goes through IDN::Idna.toUnicode with ALLOW_UNASSIGNED.
#
# @param value [#to_s] The name to convert.
# @return [String] The labels re-joined with ".".
def self.to_unicode(value)
  labels = value.to_s.split('.', -1)
  converted = labels.map do |label|
    length = label.size
    next '' if length.zero?
    next label if length >= 64

    IDN::Idna.toUnicode(label, IDN::Idna::ALLOW_UNASSIGNED)
  end
  converted.join('.')
end
unicode_normalize_kc(value) click to toggle source
# File lib/addressable/idna/native.rb, line 32
# Delegates normalization to IDN::Stringprep.nfkc_normalize.
#
# @param value [#to_s] The text to normalize; coerced with +to_s+ first.
# @return Whatever IDN::Stringprep.nfkc_normalize returns for that string.
def self.unicode_normalize_kc(value)
  text = value.to_s
  IDN::Stringprep.nfkc_normalize(text)
end

Private Class Methods

lookup_unicode_combining_class(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 282
# Looks up the combining class stored for a codepoint in UNICODE_DATA.
#
# @param codepoint [Integer] Key into the UNICODE_DATA table.
# @return The stored combining class, or 0 when the codepoint has no entry
#   or its entry has no combining class value.
def self.lookup_unicode_combining_class(codepoint)
  entry = UNICODE_DATA[codepoint]
  return 0 unless entry

  entry[UNICODE_DATA_COMBINING_CLASS] || 0
end
lookup_unicode_compatibility(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 290
# Looks up the compatibility field stored for a codepoint in UNICODE_DATA.
#
# @param codepoint [Integer] Key into the UNICODE_DATA table.
# @return The stored compatibility value, or nil when the codepoint has no
#   entry in the table.
def self.lookup_unicode_compatibility(codepoint)
  entry = UNICODE_DATA[codepoint]
  return nil unless entry

  entry[UNICODE_DATA_COMPATIBILITY]
end
lookup_unicode_composition(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 305
# Fetches the COMPOSITION_TABLE entry stored under the given key.
#
# @param unpacked The lookup key (unicode_compose_pair passes the array
#   that ucs4_to_utf8 filled for the two codepoints).
# @return The table entry, or whatever the table yields for a missing key.
def self.lookup_unicode_composition(unpacked)
  COMPOSITION_TABLE[unpacked]
end
lookup_unicode_lowercase(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 297
# Looks up the lowercase mapping stored for a codepoint in UNICODE_DATA.
#
# @param codepoint [Integer] Key into the UNICODE_DATA table.
# @return The stored lowercase value, or the codepoint itself when the table
#   has no entry or no lowercase value for it.
def self.lookup_unicode_lowercase(codepoint)
  entry = UNICODE_DATA[codepoint]
  return codepoint unless entry

  entry[UNICODE_DATA_LOWERCASE] || codepoint
end
punycode_adapt(delta, numpoints, firsttime) click to toggle source

Bias adaptation method

# File lib/addressable/idna/pure.rb, line 660
# Punycode bias adaptation step.
#
# @param delta [Integer] The delta that was just encoded or decoded.
# @param numpoints [Integer] Divisor used to scale the damped delta.
# @param firsttime [Boolean] When truthy the delta is divided by
#   PUNYCODE_DAMP; otherwise it is halved.
# @return [Integer] The new bias.
def self.punycode_adapt(delta, numpoints, firsttime)
  scaled =
    if firsttime
      delta / PUNYCODE_DAMP
    else
      delta >> 1 # shift is equivalent to dividing by two
    end
  scaled += scaled / numpoints

  span = PUNYCODE_BASE - PUNYCODE_TMIN
  ceiling = (span * PUNYCODE_TMAX) / 2

  # Repeatedly shrink the delta, accumulating one BASE per reduction.
  offset = 0
  until scaled <= ceiling
    scaled /= span
    offset += PUNYCODE_BASE
  end

  offset + (span + 1) * scaled / (scaled + PUNYCODE_SKEW)
end
punycode_basic?(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 628
# Tells whether a codepoint is below 0x80.
#
# @param codepoint [Integer] The codepoint to test.
# @return [Boolean] true when the codepoint is less than 128.
def self.punycode_basic?(codepoint)
  codepoint < 128
end
punycode_decode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 28
# Delegates Punycode decoding to IDN::Punycode.decode.
#
# @param value [#to_s] The encoded text; coerced with +to_s+ first.
# @return Whatever IDN::Punycode.decode returns for that string.
def self.punycode_decode(value)
  encoded = value.to_s
  IDN::Punycode.decode(encoded)
end
punycode_decode_digit(codepoint) click to toggle source

Returns the numeric value of a basic codepoint (for use in representing integers) in the range 0 to base - 1, or PUNYCODE_BASE if codepoint does not represent a value.

# File lib/addressable/idna/pure.rb, line 646
# Returns the numeric value of a basic codepoint used as a Punycode digit.
#
# Letters map to 0..25 (case-insensitively) and the digits "0".."9" map to
# 26..35. The earlier checks of the form `codepoint - 48 < 10` only work
# with unsigned arithmetic; with Ruby's signed integers they accepted
# codepoints below each range (e.g. "-" yielded 23 and "[" yielded -6).
# Explicit ranges are used instead so every other codepoint is rejected.
#
# @param codepoint [Integer] The codepoint to interpret.
# @return [Integer] A value in 0..35, or PUNYCODE_BASE when the codepoint
#   does not represent a digit.
def self.punycode_decode_digit(codepoint)
  case codepoint
  when 48..57 then codepoint - 22  # "0".."9" => 26..35
  when 65..90 then codepoint - 65  # "A".."Z" => 0..25
  when 97..122 then codepoint - 97 # "a".."z" => 0..25
  else PUNYCODE_BASE
  end
end
punycode_delimiter?(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 633
# Tells whether a codepoint equals PUNYCODE_DELIMITER.
#
# @param codepoint [Integer] The codepoint to test.
# @return [Boolean] true when the codepoint is the delimiter.
def self.punycode_delimiter?(codepoint)
  PUNYCODE_DELIMITER == codepoint
end
punycode_encode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 24
# Delegates Punycode encoding to IDN::Punycode.encode.
#
# @param value [#to_s] The text to encode; coerced with +to_s+ first.
# @return Whatever IDN::Punycode.encode returns for that string.
def self.punycode_encode(value)
  text = value.to_s
  IDN::Punycode.encode(text)
end
punycode_encode_digit(d) click to toggle source
# File lib/addressable/idna/pure.rb, line 638
# Maps a Punycode digit value to the codepoint that represents it.
#
# Values below 26 are offset by 97 (so 0..25 land on "a".."z"); all other
# values are offset by 22 (so 26..35 land on "0".."9").
#
# @param d [Integer] The digit value.
# @return [Integer] The codepoint for that digit.
def self.punycode_encode_digit(d)
  d < 26 ? d + 97 : d + 22
end
ucs4_to_utf8(char, buffer) click to toggle source
# File lib/addressable/idna/pure.rb, line 188
# Appends the UTF-8 style byte values for one codepoint to a buffer.
#
# The encoding length is chosen by magnitude: one byte below 0x80, then
# two to six bytes for values below 0x800, 0x10000, 0x200000, 0x4000000
# and 0x80000000. Values of 0x80000000 or more append nothing.
#
# @param char [Integer] The codepoint to encode.
# @param buffer [#<<] Receives each byte value via +<<+.
# @return The buffer when bytes were appended (assuming +<<+ returns its
#   receiver, as Array and String do), or nil when nothing was appended.
def self.ucs4_to_utf8(char, buffer)
  continuation_count =
    if char < 0x80 then 0
    elsif char < 0x800 then 1
    elsif char < 0x10000 then 2
    elsif char < 0x200000 then 3
    elsif char < 0x4000000 then 4
    elsif char < 0x80000000 then 5
    end
  return nil if continuation_count.nil?
  return buffer << char if continuation_count.zero?

  # Lead byte marker: 0xC0, 0xE0, 0xF0, 0xF8, 0xFC for 1..5 continuations.
  lead_marker = (0xFF << (7 - continuation_count)) & 0xFF
  buffer << (char >> (6 * continuation_count) | lead_marker)

  # Remaining bytes carry six bits each, most significant group first.
  (continuation_count - 1).downto(0) do |group|
    buffer << (char >> (6 * group) & 63 | 128)
  end
  buffer
end
unicode_compose(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 139
# Composes adjacent codepoints into their combined forms.
#
# Walks the list once, keeping a current "starter". Each following
# codepoint is offered to unicode_compose_pair together with the starter;
# on success the composite becomes the new starter, otherwise the starter
# is emitted and the current codepoint takes its place.
#
# Composition is only attempted while the starter has combining class 0.
# That check is refreshed every time the starter is replaced; previously it
# was computed once from the first codepoint, so an input that began with a
# combining mark was never composed at all.
#
# @param unpacked [Array<Integer>] Codepoints to compose.
# @return [Array<Integer>] The composed codepoints. An empty input is
#   returned as-is.
def self.unicode_compose(unpacked)
  return unpacked if unpacked.empty?

  composed = []
  starter = unpacked[0]
  starter_is_base = lookup_unicode_combining_class(starter) == 0

  unpacked[1..-1].each do |ch|
    composite = starter_is_base ? unicode_compose_pair(starter, ch) : nil

    if composite.nil?
      composed << starter
      starter = ch
      # The new starter may itself be a combining mark; re-evaluate it.
      starter_is_base = lookup_unicode_combining_class(ch) == 0
    else
      starter = composite
    end
  end

  composed << starter
end
unicode_compose_pair(ch_one, ch_two) click to toggle source
# File lib/addressable/idna/pure.rb, line 164
# Attempts to compose two codepoints into one.
#
# Hangul is composed arithmetically: a leading consonant followed by a vowel
# yields an LV syllable, and an LV syllable followed by a trailing consonant
# yields an LVT syllable. Any other pair is looked up in the composition
# table, keyed by the bytes ucs4_to_utf8 produces for both codepoints.
#
# The trailing-consonant index must be strictly greater than zero:
# HANGUL_TBASE itself stands for "no trailing consonant", so treating it as
# a valid second codepoint would silently drop it from the text.
#
# @param ch_one [Integer] The first codepoint.
# @param ch_two [Integer] The codepoint that follows it.
# @return The composite codepoint, or whatever
#   lookup_unicode_composition returns when the pair does not compose.
def self.unicode_compose_pair(ch_one, ch_two)
  if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT &&
      ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT
    # Hangul L + V
    l_index = ch_one - HANGUL_LBASE
    v_index = ch_two - HANGUL_VBASE
    return HANGUL_SBASE + (l_index * HANGUL_VCOUNT + v_index) * HANGUL_TCOUNT
  end

  s_index = ch_one - HANGUL_SBASE
  t_index = ch_two - HANGUL_TBASE
  if s_index >= 0 && s_index < HANGUL_SCOUNT &&
      (s_index % HANGUL_TCOUNT).zero? &&
      t_index > 0 && t_index < HANGUL_TCOUNT
    # Hangul LV + T
    return ch_one + t_index
  end

  utf8_bytes = []
  ucs4_to_utf8(ch_one, utf8_bytes)
  ucs4_to_utf8(ch_two, utf8_bytes)

  lookup_unicode_composition(utf8_bytes)
end
unicode_decompose(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 244
# Expands every codepoint into its decomposed form.
#
# Codepoints inside the Hangul syllable block are split with
# unicode_decompose_hangul. For any other codepoint the compatibility
# mapping is looked up; when one exists it is unpacked and decomposed
# recursively, otherwise the codepoint is kept unchanged.
#
# @param unpacked [Array<Integer>] Codepoints to decompose.
# @return [Array<Integer>] A new array holding the decomposed codepoints.
def self.unicode_decompose(unpacked)
  unpacked.each_with_object([]) do |codepoint, expanded|
    if codepoint >= HANGUL_SBASE && codepoint < HANGUL_SBASE + HANGUL_SCOUNT
      leading, vowel, trailing = unicode_decompose_hangul(codepoint)
      expanded << leading
      expanded.push(vowel) if vowel
      expanded.push(trailing) if trailing
      next
    end

    mapping = lookup_unicode_compatibility(codepoint)
    if mapping
      expanded.concat(unicode_decompose(mapping.unpack("U*")))
    else
      expanded << codepoint
    end
  end
end
unicode_decompose_hangul(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 265
# Splits a Hangul syllable into its jamo components.
#
# @param codepoint [Integer] The codepoint to split.
# @return [Array] Three elements: leading consonant, vowel, and trailing
#   consonant. The trailing slot is nil when the syllable has none. For a
#   codepoint outside the syllable block the result is
#   [codepoint, nil, nil].
def self.unicode_decompose_hangul(codepoint)
  syllable_index = codepoint - HANGUL_SBASE
  outside_block = syllable_index < 0 || syllable_index >= HANGUL_SCOUNT
  return [codepoint, nil, nil] if outside_block

  leading = HANGUL_LBASE + syllable_index / HANGUL_NCOUNT
  vowel = HANGUL_VBASE + (syllable_index % HANGUL_NCOUNT) / HANGUL_TCOUNT
  trailing_index = syllable_index % HANGUL_TCOUNT
  # Index 0 means the syllable has no trailing consonant.
  trailing = trailing_index.zero? ? nil : HANGUL_TBASE + trailing_index

  [leading, vowel, trailing]
end
unicode_downcase(input) click to toggle source

Unicode aware downcase method.

@api private

@param [String] input The input string.

@return [String] The downcased result.

# File lib/addressable/idna/pure.rb, line 131
# Unicode-aware downcase: maps every codepoint of the input through
# lookup_unicode_lowercase and repacks the result.
#
# @api private
# @param input [String, #to_s] The text to downcase; non-strings are
#   coerced with +to_s+.
# @return [String] The downcased result.
def self.unicode_downcase(input)
  text = input.is_a?(String) ? input : input.to_s
  lowered = text.unpack("U*").map { |codepoint| lookup_unicode_lowercase(codepoint) }
  lowered.pack("U*")
end
unicode_sort_canonical(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 220
# Reorders combining marks so that adjacent marks appear in ascending
# combining-class order.
#
# Works on a copy of the input. Neighbouring codepoints are swapped when
# both have a non-zero combining class and the earlier one's class is
# greater; after a swap the scan steps back one position so the moved
# codepoint is compared with its new predecessor.
#
# @param unpacked [Array<Integer>] Codepoints to reorder.
# @return [Array<Integer>] A reordered copy; the argument is not modified.
def self.unicode_sort_canonical(unpacked)
  ordered = unpacked.dup
  count = ordered.length
  return ordered if count < 2

  index = 1
  while index < count
    previous = ordered[index - 1]
    current = ordered[index]
    previous_cc = lookup_unicode_combining_class(previous)
    current_cc = lookup_unicode_combining_class(current)

    out_of_order = current_cc != 0 && previous_cc != 0 && previous_cc > current_cc
    if out_of_order
      ordered[index - 1], ordered[index] = current, previous
      index -= 1 if index > 1
    else
      index += 1
    end
  end

  ordered
end