class String

Some useful additions to the String class. Copyright © 2010-16, Jacques Distler. All rights reserved. Licensed under a triple GPL/MPL/LGPL License.

Constants

UTF8_REGEX

Public Instance Methods

as_bytes() click to toggle source
# File lib/itex_stringsupport.rb, line 34
# Retags this string, in place, as raw bytes (ASCII-8BIT). The bytes
# themselves are not altered. Returns the receiver itself, not a copy.
def as_bytes
  self.force_encoding('ASCII-8BIT')
end
as_utf8() click to toggle source
# File lib/itex_stringsupport.rb, line 52
# Retags this string, in place, as UTF-8. The bytes themselves are not
# altered or validated. Returns the receiver itself, not a copy.
def as_utf8
  self.force_encoding('UTF-8')
end
check_ncrs() click to toggle source
# File lib/itex_stringsupport.rb, line 81
# Removes numeric character references (NCRs) that do not name an acceptable
# character, leaving every other part of the string untouched.
#
# Hexadecimal references (&#x...; / &#X...;) are screened first, then decimal
# ones (&#...;). A reference is kept verbatim when its code point, packed as
# UTF-8 and viewed as bytes, matches UTF8_REGEX; otherwise it is replaced by
# the empty string. References whose value Array#pack('U*') refuses to encode
# (it raises RangeError for them) are treated as invalid and removed too,
# instead of letting the exception escape.
#
# Returns a new string; the receiver is not modified.
def check_ncrs
  # Yields +ncr+ back when +codepoint+ is acceptable, '' otherwise.
  screen = lambda do |ncr, codepoint|
    begin
      [codepoint].pack('U*').as_bytes =~ UTF8_REGEX ? ncr : ''
    rescue RangeError
      # Value too large to pack (e.g. &#x80000000;): it can never be a
      # valid character, so drop the reference rather than crash.
      ''
    end
  end

  hex_screened = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| screen.call(m, $1.hex) }
  hex_screened.gsub(/&#(\d+);/) { |m| screen.call(m, $1.to_i) }
end
is_utf8? → boolean click to toggle source

Check whether a string is valid utf-8

returns true if the sequence of bytes in string is valid utf-8

# File lib/itex_stringsupport.rb, line 107
# Reports whether the bytes of this string form valid UTF-8, judged by
# UTF8_REGEX after check_ncrs has been applied.
#
# The check runs on a byte-tagged copy, so the receiver's encoding tag is
# left alone and a string tagged UTF-8 that contains invalid bytes yields a
# falsy result instead of raising ArgumentError from the regexp substitution
# inside check_ncrs.
#
# Returns the value of `=~` against UTF8_REGEX: an Integer match position
# (truthy) when the bytes are accepted, nil otherwise.
def is_utf8?
  # Screen NCRs on raw bytes. Note: check_ncrs deletes unacceptable NCRs
  # (it does not expand them), so they do not make the string fail here.
  text = dup.as_bytes.check_ncrs.as_bytes

  # You might think this is faster, but it isn't
  #pieces = self.split(/&#[xX]([a-fA-F0-9]+);/)
  #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')}
  #pieces = pieces.join.split(/&#(\d+);/)
  #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')}
  #text = pieces.join

  # Ensure the resulting string of bytes is valid utf-8.
  text =~ UTF8_REGEX
end
num_chars() click to toggle source
# File lib/itex_stringsupport.rb, line 16
# Number of characters in the string, as counted by String#size under the
# string's current encoding.
def num_chars
  size
end
purify() click to toggle source
# File lib/itex_stringsupport.rb, line 70
# Returns a cleaned, UTF-8-tagged copy of the string; the receiver is left
# untouched.
#
# Two passes are made:
# 1. check_ncrs strips unacceptable numeric character references. It runs on
#    a byte-tagged copy so that a receiver tagged UTF-8 but containing
#    invalid bytes does not raise ArgumentError in the regexp substitution.
# 2. The result is split into characters and only those whose bytes match
#    UTF8_REGEX are kept.
def purify
  screened = dup.as_bytes.check_ncrs.as_utf8
  # String#chars hands back each undecodable byte as its own element, so the
  # filter below discards exactly the unacceptable pieces.
  screened.chars.map { |c| c.as_bytes }.grep(UTF8_REGEX).join.as_utf8
end
to_ncr → string click to toggle source

Converts XHTML+MathML named entities in string to Numeric Character References

# File lib/itex_stringsupport.rb, line 2263
# Returns a copy of the string in which every named entity of the form
# &name; (name made of ASCII letters and digits) is replaced by whatever
# convert_to_ncr returns for the bare name. The receiver is not modified.
def to_ncr
  gsub(/&([a-zA-Z0-9]+);/) { Regexp.last_match(1).convert_to_ncr }
end
to_ncr! → str or nil click to toggle source

Converts XHTML+MathML named entities in string to Numeric Character References

Substitution is done in-place.

# File lib/itex_stringsupport.rb, line 2274
# In-place variant of the entity-to-NCR conversion: every &name; (name made
# of ASCII letters and digits) is replaced by convert_to_ncr of the bare
# name. Returns the receiver, or nil when nothing was substituted (the
# String#gsub! convention).
def to_ncr!
  gsub!(/&([a-zA-Z0-9]+);/) { Regexp.last_match(1).convert_to_ncr }
end
to_utf8 → string click to toggle source

Converts XHTML+MathML named entities in string to UTF-8

# File lib/itex_stringsupport.rb, line 2284
# Returns a copy of the string in which every named entity of the form
# &name; (name made of ASCII letters and digits) is replaced by whatever
# convert_to_utf8 returns for the bare name. The receiver is not modified.
def to_utf8
  # You might think this is faster, but it isn't
  # pieces = self.split(/&([a-zA-Z0-9]+);/)
  # 1.step(pieces.length-1, 2) {|i| pieces[i].convert_to_utf8}
  # pieces.join
  gsub(/&([a-zA-Z0-9]+);/) { Regexp.last_match(1).convert_to_utf8 }
end
to_utf8! → str or nil click to toggle source

Converts XHTML+MathML named entities in string to UTF-8

Substitution is done in-place.
# File lib/itex_stringsupport.rb, line 2301
# In-place variant of the entity-to-UTF-8 conversion: every &name; (name
# made of ASCII letters and digits) is replaced by convert_to_utf8 of the
# bare name. Returns the receiver, or nil when nothing was substituted (the
# String#gsub! convention).
def to_utf8!
  gsub!(/&([a-zA-Z0-9]+);/) { Regexp.last_match(1).convert_to_utf8 }
end