# XSD4R - Charset handling library.
# Copyright (C) 2000-2007 NAKAMURA, Hiroshi <nahi@ruby-lang.org>.

# This program is copyrighted free software by NAKAMURA, Hiroshi.  You can
# redistribute it and/or modify it under the same terms of Ruby's license;
# either the dual license version in 2003, or any later version.


module XSD
  # Charset bookkeeping for XSD4R.
  #
  # Encodings are identified by $KCODE-style names ('NONE', 'EUC', 'SJIS',
  # 'UTF8', 'X_ISO_8859_1', 'X_UNKNOWN').  CharsetMap translates those names
  # to charset labels such as 'utf-8', EncodingConvertMap holds the available
  # converters, and the is_* predicates check whether the bytes of a string
  # are well-formed in a given encoding.
  module Charset
    # Ruby 1.8 publishes the interpreter encoding through $KCODE.  On Ruby 1.9
    # and later that variable is nil, which used to leave the internal
    # encoding unset (xml_encoding_label then failed on nil.upcase).  Fall
    # back to 'UTF8', the same value init picks when a converter library
    # is available.
    @internal_encoding =
      if defined?(::Encoding)
        'UTF8'
      else
        $KCODE
      end

    class XSDError < StandardError; end
    class CharsetError < XSDError; end
    class UnknownCharsetError < CharsetError; end
    class CharsetConversionError < CharsetError; end

    ###
    ## Maps
    #

    # Converter registry: [from_name, to_name] => object responding to
    # call(str) and returning the converted string.
    EncodingConvertMap = {}

    # Registers every converter that can be provided on this interpreter.
    #
    # The UTF8 <-> X_ISO_8859_1 pair is always registered.  Japanese
    # converters come from xsd/iconvcharset when it loads; otherwise nkf and
    # uconv are each tried on their own.  Loading iconv or uconv also switches
    # the internal encoding to 'UTF8'.  A missing library is not an error.
    def Charset.init
      EncodingConvertMap[['UTF8', 'X_ISO_8859_1']] =
        lambda { |str| str.unpack('U*').pack('C*') }
      EncodingConvertMap[['X_ISO_8859_1', 'UTF8']] =
        lambda { |str| str.unpack('C*').pack('U*') }
      begin
        require 'xsd/iconvcharset'
        @internal_encoding = 'UTF8'
        # Windows-like platforms use the cp932 name for Shift_JIS.
        windows_like = (/(mswin|bccwin|mingw|cygwin|emx)/ =~ RUBY_PLATFORM)
        sjtag = windows_like ? 'cp932' : 'shift_jis'
        # safe_iconv is called as (to, from, str).
        EncodingConvertMap[['UTF8', 'EUC']] =
          lambda { |str| IconvCharset.safe_iconv("euc-jp", "utf-8", str) }
        EncodingConvertMap[['EUC', 'UTF8']] =
          lambda { |str| IconvCharset.safe_iconv("utf-8", "euc-jp", str) }
        EncodingConvertMap[['EUC', 'SJIS']] =
          lambda { |str| IconvCharset.safe_iconv(sjtag, "euc-jp", str) }
        EncodingConvertMap[['UTF8', 'SJIS']] =
          lambda { |str| IconvCharset.safe_iconv(sjtag, "utf-8", str) }
        EncodingConvertMap[['SJIS', 'UTF8']] =
          lambda { |str| IconvCharset.safe_iconv("utf-8", sjtag, str) }
        EncodingConvertMap[['SJIS', 'EUC']] =
          lambda { |str| IconvCharset.safe_iconv("euc-jp", sjtag, str) }
      rescue LoadError
        begin
          require 'nkf'
          EncodingConvertMap[['EUC', 'SJIS']] =
            lambda { |str| NKF.nkf('-sXm0', str) }
          EncodingConvertMap[['SJIS', 'EUC']] =
            lambda { |str| NKF.nkf('-eXm0', str) }
        rescue LoadError
          # nkf is optional.
        end

        begin
          require 'uconv'
          @internal_encoding = 'UTF8'
          EncodingConvertMap[['UTF8', 'EUC']] = Uconv.method(:u8toeuc)
          EncodingConvertMap[['UTF8', 'SJIS']] = Uconv.method(:u8tosjis)
          EncodingConvertMap[['EUC', 'UTF8']] = Uconv.method(:euctou8)
          EncodingConvertMap[['SJIS', 'UTF8']] = Uconv.method(:sjistou8)
        rescue LoadError
          # uconv is optional.
        end
      end
    end
    self.init

    # Encoding name => charset label.
    CharsetMap = {
      'NONE' => 'us-ascii',
      'EUC' => 'euc-jp',
      'SJIS' => 'shift_jis',
      'UTF8' => 'utf-8',
      'X_ISO_8859_1' => 'iso-8859-1',
      'X_UNKNOWN' => nil,
    }

    # Memo of charset label => encoding name, filled by charset_str.
    CharsetStrCache = {}

    ###
    ## handlers
    #

    # Returns the current internal encoding name.
    def Charset.encoding
      @internal_encoding
    end

    # Sets the internal encoding name; the change is reported through warn
    # when $DEBUG is set.
    def Charset.encoding=(encoding)
      warn("xsd charset is set to #{encoding}") if $DEBUG
      @internal_encoding = encoding
    end

    # Returns the charset label of the internal encoding.
    def Charset.xml_encoding_label
      charset_label(@internal_encoding)
    end

    # Converts +str+ from the internal encoding to the charset labelled
    # +charset+.  Raises CharsetConversionError when no converter exists.
    def Charset.encoding_from_internal(str, charset)
      encoding_conv(str, @internal_encoding, charset_str(charset))
    end

    # Converts +str+ from the charset labelled +charset+ to the internal
    # encoding.  Raises CharsetConversionError when no converter exists.
    def Charset.encoding_to_internal(str, charset)
      encoding_conv(str, charset_str(charset), @internal_encoding)
    end

    # Converts +str+ between two encoding names.
    #
    # +str+ is returned untouched when both names are equal or either one is
    # 'NONE'.  Raises CharsetConversionError when EncodingConvertMap has no
    # entry for the pair.
    def Charset.encoding_conv(str, enc_from, enc_to)
      return str if enc_from == enc_to || enc_from == 'NONE' || enc_to == 'NONE'

      converter = EncodingConvertMap[[enc_from, enc_to]]
      unless converter
        raise CharsetConversionError,
          "Converter not found: #{enc_from} -> #{enc_to}"
      end
      converter.call(str)
    end

    # Maps an encoding name (case-insensitive) to its charset label; nil when
    # the name is not in CharsetMap.
    def Charset.charset_label(encoding)
      CharsetMap[encoding.upcase]
    end

    # Maps a charset label (case-insensitive) to its encoding name, or
    # 'X_UNKNOWN' when the label is not in CharsetMap.  Results are memoised
    # in CharsetStrCache.
    def Charset.charset_str(label)
      CharsetStrCache[label] ||= begin
        wanted = label.downcase
        # Hash#key replaced Hash#index; use whichever this Ruby provides.
        name = CharsetMap.respond_to?(:key) ? CharsetMap.key(wanted) : CharsetMap.index(wanted)
        name || 'X_UNKNOWN'
      end
    end

    # Builds a byte-oriented regexp.  The historical three-argument
    # Regexp.new form is rejected by newer Rubies, so prefer the
    # Regexp::NOENCODING option and keep the old call only where that
    # constant does not exist (Ruby 1.8).
    binary_regexp = lambda do |source|
      if defined?(Regexp::NOENCODING)
        Regexp.new(source, Regexp::NOENCODING)
      else
        Regexp.new(source, nil, 'NONE')
      end
    end

    # us_ascii = '[\x00-\x7F]'
    us_ascii = '[\x9\xa\xd\x20-\x7F]'	# XML 1.0 restricted.
    USASCIIRegexp = binary_regexp.call("\\A#{us_ascii}*\\z")

    twobytes_euc = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'
    threebytes_euc = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'
    character_euc = "(?:#{us_ascii}|#{twobytes_euc}|#{threebytes_euc})"
    EUCRegexp = binary_regexp.call("\\A#{character_euc}*\\z")

    # onebyte_sjis = '[\x00-\x7F\xA1-\xDF]'
    onebyte_sjis = '[\x9\xa\xd\x20-\x7F\xA1-\xDF]'	# XML 1.0 restricted.
    twobytes_sjis = '(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])'
    character_sjis = "(?:#{onebyte_sjis}|#{twobytes_sjis})"
    SJISRegexp = binary_regexp.call("\\A#{character_sjis}*\\z")

    # These patterns only check lead/trail byte ranges.
    # 0xxxxxxx
    # 110yyyyy 10xxxxxx
    twobytes_utf8 = '(?:[\xC0-\xDF][\x80-\xBF])'
    # 1110zzzz 10yyyyyy 10xxxxxx
    threebytes_utf8 = '(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])'
    # 11110uuu 10uuuzzz 10yyyyyy 10xxxxxx
    fourbytes_utf8 = '(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])'
    character_utf8 =
      "(?:#{us_ascii}|#{twobytes_utf8}|#{threebytes_utf8}|#{fourbytes_utf8})"
    UTF8Regexp = binary_regexp.call("\\A#{character_utf8}*\\z")

    # Returns +str+ in a form the byte-oriented regexps can be matched
    # against.  Encoding-aware strings are copied and tagged as binary so
    # that matching never fails because of the string's own encoding tag or
    # because it holds invalid bytes; anything else is passed through.
    def Charset.binary_view(str)
      return str unless str.respond_to?(:force_encoding)
      return str if str.encoding == Encoding::ASCII_8BIT

      str.dup.force_encoding(Encoding::ASCII_8BIT)
    end
    private_class_method :binary_view

    # Returns 0 when +str+ consists only of XML 1.0 US-ASCII characters
    # (tab, LF, CR, 0x20-0x7F), nil otherwise.
    def Charset.is_us_ascii(str)
      USASCIIRegexp =~ binary_view(str)
    end

    # Returns 0 when the bytes of +str+ match UTF8Regexp, nil otherwise.
    def Charset.is_utf8(str)
      UTF8Regexp =~ binary_view(str)
    end

    # Returns 0 when the bytes of +str+ match EUCRegexp, nil otherwise.
    def Charset.is_euc(str)
      EUCRegexp =~ binary_view(str)
    end

    # Returns 0 when the bytes of +str+ match SJISRegexp, nil otherwise.
    def Charset.is_sjis(str)
      SJISRegexp =~ binary_view(str)
    end

    # Checks +str+ against the encoding named +code+ (the internal encoding
    # by default) and returns the result of the matching is_* predicate.
    # Raises UnknownCharsetError for any name other than 'NONE', 'UTF8',
    # 'EUC' or 'SJIS'.
    def Charset.is_ces(str, code = @internal_encoding)
      case code
      when 'NONE'
        is_us_ascii(str)
      when 'UTF8'
        is_utf8(str)
      when 'EUC'
        is_euc(str)
      when 'SJIS'
        is_sjis(str)
      else
        raise UnknownCharsetError, "Unknown charset: #{code}"
      end
    end
  end
end