Welcome to the "trac"-ing site of soap4r!
[soap4r] [httpclient] [openpgp4u] [pkcs1] [logger] [csv] [vtr]

root/branches/1_5/lib/xsd/charset.rb

Revision 2012, 5.3 kB (checked in by nahi, 1 year ago)
  • renamed XSD::Charset method name; from/to_xml -> to/from_internal
  • Property svn:eol-style set to native
  • Property svn:keywords set to author date id revision
Line 
1 # XSD4R - Charset handling library.
2 # Copyright (C) 2000-2007  NAKAMURA, Hiroshi <nahi@ruby-lang.org>.
3
4 # This program is copyrighted free software by NAKAMURA, Hiroshi.  You can
5 # redistribute it and/or modify it under the same terms of Ruby's license;
6 # either the dual license version in 2003, or any later version.
7
8
module XSD


# Charset bookkeeping for XSD4R: tracks the "internal" encoding used for
# strings, converts between encodings through a table of converter
# procs, and offers byte-pattern checks for US-ASCII / UTF-8 / EUC-JP /
# Shift_JIS data.
#
# Encodings are identified by the legacy $KCODE-style names 'NONE',
# 'UTF8', 'EUC' and 'SJIS' (plus 'X_ISO_8859_1' and 'X_UNKNOWN');
# CharsetMap translates them to and from XML charset labels.
module Charset
  # $KCODE is no longer effective on Ruby 1.9+ (it reads as nil), which
  # used to leave the internal encoding unset.  Where the Encoding class
  # exists, default to UTF8; on Ruby 1.8 keep honouring $KCODE.
  @internal_encoding = defined?(::Encoding) ? 'UTF8' : $KCODE

  class XSDError < StandardError; end
  class CharsetError < XSDError; end
  class UnknownCharsetError < CharsetError; end
  class CharsetConversionError < CharsetError; end

public

  ###
  ## Maps
  #

  # [from, to] pair of encoding names => callable converting a String.
  EncodingConvertMap = {}

  # Builds a converter proc based on String#encode (Ruby 1.9+).
  # The input is re-tagged as +from_name+ (on a copy) and transcoded to
  # +to_name+.  Any EncodingError (invalid or unmappable bytes) is
  # re-raised as CharsetConversionError so callers only ever see this
  # module's own error hierarchy.
  def Charset.native_converter(from_name, to_name)
    Proc.new { |str|
      begin
        str.dup.force_encoding(from_name).encode(to_name)
      rescue EncodingError => e
        raise CharsetConversionError,
          "Conversion failed: #{from_name} -> #{to_name}: #{e.message}"
      end
    }
  end

  # Registers String#encode based converters for every UTF8/EUC/SJIS
  # pair that has no converter yet.  Converters installed earlier (iconv,
  # nkf, uconv) are never overwritten.
  def Charset.install_native_converters(sjtag)
    names = { 'UTF8' => 'utf-8', 'EUC' => 'euc-jp', 'SJIS' => sjtag }
    names.each do |from, from_name|
      names.each do |to, to_name|
        next if from == to
        EncodingConvertMap[[from, to]] ||= native_converter(from_name, to_name)
      end
    end
  end

  private_class_method :native_converter, :install_native_converters

  # Populates EncodingConvertMap.  Called once when this file is loaded.
  #
  # Converter sources, in order of preference:
  # 1. xsd/iconvcharset, if it can be required;
  # 2. otherwise nkf (EUC <-> SJIS) and uconv (UTF8 <-> EUC/SJIS), each
  #    only if it can be required;
  # 3. String#encode, where available, for any pair still missing.
  # Loading iconvcharset or uconv also switches the internal encoding to
  # 'UTF8'.
  def Charset.init
    EncodingConvertMap[['UTF8', 'X_ISO_8859_1']] =
      Proc.new { |str| str.unpack('U*').pack('C*') }
    EncodingConvertMap[['X_ISO_8859_1', 'UTF8']] =
      Proc.new { |str| str.unpack('C*').pack('U*') }

    # Windows-like platforms use the cp932 flavour of Shift_JIS.
    sjtag = (/(mswin|bccwin|mingw|cygwin|emx)/ =~ RUBY_PLATFORM) ? 'cp932' :
      'shift_jis'

    begin
      require 'xsd/iconvcharset'
      @internal_encoding = 'UTF8'
      EncodingConvertMap[['UTF8', 'EUC' ]] =
        Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "utf-8", str) }
      EncodingConvertMap[['EUC' , 'UTF8']] =
        Proc.new { |str| IconvCharset.safe_iconv("utf-8", "euc-jp", str) }
      EncodingConvertMap[['EUC' , 'SJIS']] =
        Proc.new { |str| IconvCharset.safe_iconv(sjtag, "euc-jp", str) }
      EncodingConvertMap[['UTF8', 'SJIS']] =
        Proc.new { |str| IconvCharset.safe_iconv(sjtag, "utf-8", str) }
      EncodingConvertMap[['SJIS', 'UTF8']] =
        Proc.new { |str| IconvCharset.safe_iconv("utf-8", sjtag, str) }
      EncodingConvertMap[['SJIS', 'EUC' ]] =
        Proc.new { |str| IconvCharset.safe_iconv("euc-jp", sjtag, str) }
    rescue LoadError
      begin
        require 'nkf'
        EncodingConvertMap[['EUC' , 'SJIS']] =
          Proc.new { |str| NKF.nkf('-sXm0', str) }
        EncodingConvertMap[['SJIS', 'EUC' ]] =
          Proc.new { |str| NKF.nkf('-eXm0', str) }
      rescue LoadError
        # nkf is optional; EUC <-> SJIS may be covered below instead.
      end

      begin
        require 'uconv'
        @internal_encoding = 'UTF8'
        EncodingConvertMap[['UTF8', 'EUC' ]] = Uconv.method(:u8toeuc)
        EncodingConvertMap[['UTF8', 'SJIS']] = Uconv.method(:u8tosjis)
        EncodingConvertMap[['EUC' , 'UTF8']] = Uconv.method(:euctou8)
        EncodingConvertMap[['SJIS', 'UTF8']] = Uconv.method(:sjistou8)
      rescue LoadError
        # uconv is optional as well.
      end
    end

    # Without this fallback a Ruby lacking iconv/uconv has no way to
    # convert between UTF8, EUC and SJIS at all.
    install_native_converters(sjtag) if ''.respond_to?(:encode)
  end
  self.init

  # Internal encoding name => XML charset label.
  CharsetMap = {
    'NONE' => 'us-ascii',
    'EUC' => 'euc-jp',
    'SJIS' => 'shift_jis',
    'UTF8' => 'utf-8',
    'X_ISO_8859_1' => 'iso-8859-1',
    'X_UNKNOWN' => nil,
  }

  # Memo for charset_str: label as given by the caller => encoding name.
  CharsetStrCache = {}


  ###
  ## handlers
  #

  # Returns the current internal encoding name (e.g. 'UTF8').
  def Charset.encoding
    @internal_encoding
  end

  # Sets the internal encoding name; logs the change when $DEBUG is set.
  def Charset.encoding=(encoding)
    warn("xsd charset is set to #{encoding}") if $DEBUG
    @internal_encoding = encoding
  end

  # Returns the XML charset label of the internal encoding.
  def Charset.xml_encoding_label
    charset_label(@internal_encoding)
  end

  # Converts +str+ from the internal encoding to the encoding named by
  # the charset label +charset+ (e.g. 'euc-jp').
  def Charset.encoding_from_internal(str, charset)
    encoding_conv(str, @internal_encoding, charset_str(charset))
  end

  # Converts +str+ from the encoding named by the charset label
  # +charset+ to the internal encoding.
  def Charset.encoding_to_internal(str, charset)
    encoding_conv(str, charset_str(charset), @internal_encoding)
  end

  # Converts +str+ between two internal encoding names.
  # Returns +str+ itself when both names are equal or either is 'NONE'.
  # Raises CharsetConversionError when no converter is registered for
  # the pair.
  def Charset.encoding_conv(str, enc_from, enc_to)
    if enc_from == enc_to || enc_from == 'NONE' || enc_to == 'NONE'
      str
    elsif converter = EncodingConvertMap[[enc_from, enc_to]]
      converter.call(str)
    else
      raise CharsetConversionError,
        "Converter not found: #{enc_from} -> #{enc_to}"
    end
  end

  # Maps an internal encoding name (any case) to its XML charset label,
  # or nil when the name is unknown.
  def Charset.charset_label(encoding)
    CharsetMap[encoding.upcase]
  end

  # Maps an XML charset label (any case) to the internal encoding name,
  # or 'X_UNKNOWN' when the label is not listed in CharsetMap.
  def Charset.charset_str(label)
    if CharsetMap.respond_to?(:key)
      CharsetStrCache[label] ||= CharsetMap.key(label.downcase) || 'X_UNKNOWN'
    else
      # Ruby 1.8: Hash#key does not exist yet, Hash#index does the same.
      CharsetStrCache[label] ||= CharsetMap.index(label.downcase) || 'X_UNKNOWN'
    end
  end

  ###
  ## Byte-pattern checks
  #

  # Compiles +source+ as a byte-oriented (encoding-less) regexp.
  # The kcode third argument of Regexp.new is only used on Rubies that
  # lack Regexp::NOENCODING; newer Rubies have removed that argument.
  def Charset.binary_regexp(source)
    if defined?(::Regexp::NOENCODING)
      Regexp.new(source, Regexp::NOENCODING)
    else
      Regexp.new(source, nil, 'NONE')
    end
  end

  # Returns a binary-tagged copy of +str+ so the byte regexps can be
  # applied whatever encoding the string claims to be in; a string whose
  # bytes are invalid for its own tag would otherwise make the match
  # raise.  Objects without encoding support are returned unchanged.
  def Charset.binary_view(str)
    if str.respond_to?(:b)
      str.b
    elsif str.respond_to?(:force_encoding)
      str.dup.force_encoding('ASCII-8BIT')
    else
      str
    end
  end

  private_class_method :binary_regexp, :binary_view

  # us_ascii = '[\x00-\x7F]'
  us_ascii = '[\x9\xa\xd\x20-\x7F]'     # XML 1.0 restricted.
  USASCIIRegexp = binary_regexp("\\A#{us_ascii}*\\z")

  twobytes_euc = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'
  threebytes_euc = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'
  character_euc = "(?:#{us_ascii}|#{twobytes_euc}|#{threebytes_euc})"
  EUCRegexp = binary_regexp("\\A#{character_euc}*\\z")

  # onebyte_sjis = '[\x00-\x7F\xA1-\xDF]'
  onebyte_sjis = '[\x9\xa\xd\x20-\x7F\xA1-\xDF]'        # XML 1.0 restricted.
  twobytes_sjis = '(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])'
  character_sjis = "(?:#{onebyte_sjis}|#{twobytes_sjis})"
  SJISRegexp = binary_regexp("\\A#{character_sjis}*\\z")

  # 0xxxxxxx
  # 110yyyyy 10xxxxxx
  twobytes_utf8 = '(?:[\xC0-\xDF][\x80-\xBF])'
  # 1110zzzz 10yyyyyy 10xxxxxx
  threebytes_utf8 = '(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])'
  # 11110uuu 10uuuzzz 10yyyyyy 10xxxxxx
  fourbytes_utf8 = '(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])'
  character_utf8 =
    "(?:#{us_ascii}|#{twobytes_utf8}|#{threebytes_utf8}|#{fourbytes_utf8})"
  UTF8Regexp = binary_regexp("\\A#{character_utf8}*\\z")

  # Each predicate returns 0 (the match position) when every byte of
  # +str+ fits the pattern, and nil otherwise.

  # True-ish when +str+ is XML 1.0 restricted US-ASCII.
  def Charset.is_us_ascii(str)
    USASCIIRegexp =~ binary_view(str)
  end

  # True-ish when the bytes of +str+ have the shape of UTF-8 sequences.
  def Charset.is_utf8(str)
    UTF8Regexp =~ binary_view(str)
  end

  # True-ish when the bytes of +str+ have the shape of EUC-JP sequences.
  def Charset.is_euc(str)
    EUCRegexp =~ binary_view(str)
  end

  # True-ish when the bytes of +str+ have the shape of Shift_JIS
  # sequences.
  def Charset.is_sjis(str)
    SJISRegexp =~ binary_view(str)
  end

  # Checks +str+ against the pattern for encoding name +code+, which
  # defaults to the internal encoding.
  # Raises UnknownCharsetError for a name other than 'NONE', 'UTF8',
  # 'EUC' or 'SJIS'.
  def Charset.is_ces(str, code = @internal_encoding)
    case code
    when 'NONE'
      is_us_ascii(str)
    when 'UTF8'
      is_utf8(str)
    when 'EUC'
      is_euc(str)
    when 'SJIS'
      is_sjis(str)
    else
      raise UnknownCharsetError, "Unknown charset: #{code}"
    end
  end
end


end
Note: See TracBrowser for help on using the browser.