本文主要介绍 Java XML UTF-8 乱码问题:出现下面的 XML 解析异常的根本原因是什么?表层原因应该是 UTF-8 解析过程中遇到了中文乱码。希望对大家解决编程问题提供一定的参考价值,需要的开发者可以跟随小编一起学习。
https://blog.csdn.net/chenyanbo/article/details/6866941
事实上XML解析器尝试使用UTF-8编码来进行解析,但是解析到中文的时候,发现编码不符合UTF-8编码的规则了,然后就报错了,这也是为什么我删除中文之后就正确了的原因。
当解析 XML 时,如果 XML 声明的编码为 UTF-8,或者没有声明编码,解析器都会采用 UTF-8 解码。
具体就是用到下面那个类,用于读取输入流。
我们知道,utf8编码每个字符,所用的字节数是不一样的,也就是不定长编码。
这种编码方式最主要还是用于网络传输:在传输过程中可以很明确地知道哪几个字节构成一个字符,无论中间被截断还是有字节缺失,都可以分辨出来。下面的代码就是在读取字节的过程中,根据 UTF-8 编码的特点,校验所读取的字节是否符合编码规则。
当不符合编码规则的时候,就会抛出异常。
具体规则介绍如下,其实就是utf8的编码规则。
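多字节字符的首字节前 n 位为 1、第 n+1 位为 0(n ≥ 2),说明包含本字节在内的 n 个字节构成一个字符;其后的 n-1 个字节均以 10 开头。例如三字节字符的形式为 1110xxxx 10xxxxxx 10xxxxxx。
单字节字符(ASCII)的最高位为 0。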
更为详细的请参看
Unicode 和 UTF-8 有何区别?www.zhihu.com
21 package com.sun.org.apache.xerces.internal.impl.io;
22
23 import java.io.InputStream;
24 import java.io.IOException;
25 import java.io.Reader;
26
27 import java.util.Locale;
28 import com.sun.org.apache.xerces.internal.util.MessageFormatter;
29 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
30
31 import com.sun.xml.internal.stream.util.BufferAllocator;
32 import com.sun.xml.internal.stream.util.ThreadLocalBufferAllocator;
33
34 /**
35 *
A UTF-8 reader.
36 *
37 * @xerces.internal
38 *
39 * @author Andy Clark, IBM
40 *
41 */
42 public class UTF8Reader
43 extends Reader {
44
45 //
46 // Constants
47 //
48
49 /** Default byte buffer size (2048). */
50 public static final int DEFAULT_BUFFER_SIZE = 2048;
51
52 // debugging
53
54 /** Debug read. */
55 private static final boolean DEBUG_READ = false;
56
57 //
58 // Data
59 //
60
61 /** Input stream. */
62 protected InputStream fInputStream;
63
64 /** Byte buffer. */
65 protected byte[] fBuffer;
66
67 /** Offset into buffer. */
68 protected int fOffset;
69
70 /** Surrogate character. */
71 private int fSurrogate = -1;
72
73 // message formatter; used to produce localized
74 // exception messages
75 private MessageFormatter fFormatter = null;
76
77 //Locale to use for messages
78 private Locale fLocale = null;
79
80 //
81 // Constructors
82 //
83
/**
 * Creates a UTF-8 reader over the given input stream, delegating to the
 * four-argument constructor with {@link #DEFAULT_BUFFER_SIZE}, a newly
 * created {@code XMLMessageFormatter} and {@code Locale.getDefault()}.
 * Primarily for testing.
 *
 * @param inputStream The input stream to decode.
 */
public UTF8Reader(InputStream inputStream) {
    this(inputStream,
         DEFAULT_BUFFER_SIZE,
         new XMLMessageFormatter(),
         Locale.getDefault());
} // <init>(InputStream)
93
/**
 * Creates a UTF-8 reader over the given input stream, delegating to the
 * four-argument constructor with {@link #DEFAULT_BUFFER_SIZE} as the
 * buffer size.
 *
 * @param inputStream      The input stream to decode.
 * @param messageFormatter The formatter stored for producing error messages.
 * @param locale           The locale stored for producing error messages.
 */
public UTF8Reader(InputStream inputStream,
                  MessageFormatter messageFormatter,
                  Locale locale) {
    this(inputStream,
         DEFAULT_BUFFER_SIZE,
         messageFormatter,
         locale);
} // <init>(InputStream, MessageFormatter, Locale)
106
107 /**
108 * Constructs a UTF-8 reader from the specified input stream,
109 * buffer size and MessageFormatter.
110 *
111 * @param inputStream The input stream.
112 * @param size The initial buffer size.
113 * @param messageFormatter the formatter for localizing/formatting errors.
114 * @param locale the Locale to use for messages
115 */
116 public UTF8Reader(InputStream inputStream, int size,
117 MessageFormatter messageFormatter, Locale locale) {
118 fInputStream = inputStream;
119 BufferAllocator ba = ThreadLocalBufferAllocator.getBufferAllocator();
120 fBuffer = ba.getByteBuffer(size);
121 if (fBuffer == null) {
122 fBuffer = new byte[size];
123 }
124 fFormatter = messageFormatter;
125 fLocale = locale;
126 } // (InputStream, int, MessageFormatter, Locale)
127
128 //
129 // Reader methods
130 //
131
132 /**
133 * Read a single character. This method will block until a character is
134 * available, an I/O error occurs, or the end of the stream is reached.
135 *
136 *
Subclasses that intend to support efficient single-character input
137 * should override this method.
138 *
139 * @return The character read, as an integer in the range 0 to 16383
140 * (0x00-0xffff), or -1 if the end of the stream has
141 * been reached
142 *
143 * @exception IOException If an I/O error occurs
144 */
145 public int read() throws IOException {
146
147 // decode character
148 int c = fSurrogate;
149 if (fSurrogate == -1) {
150 // NOTE: We use the index into the buffer if there are remaining
151 // bytes from the last block read. -Ac
152 int index = 0;
153
154 // get first byte
155 int b0 = index == fOffset
156 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
157 if (b0 == -1) {
158 return -1;
159 }
160
161 // UTF-8: [0xxx xxxx]
162 // Unicode: [0000 0000] [0xxx xxxx]
163 if (b0 < 0x80) {
164 c = (char)b0;
165 }
166
167 // UTF-8: [110y yyyy] [10xx xxxx]
168 // Unicode: [0000 0yyy] [yyxx xxxx]
169 else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
170 int b1 = index == fOffset
171 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
172 if (b1 == -1) {
173 expectedByte(2, 2);
174 }
175 if ((b1 & 0xC0) != 0x80) {
176 invalidByte(2, 2, b1);
177 }
178 c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
179 }
180
181 // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
182 // Unicode: [zzzz yyyy] [yyxx xxxx]
183 else if ((b0 & 0xF0) == 0xE0) {
184 int b1 = index == fOffset
185 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
186 if (b1 == -1) {
187 expectedByte(2, 3);
188 }
189 if ((b1 & 0xC0) != 0x80
190 || (b0 == 0xED && b1 >= 0xA0)
191 || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
192 invalidByte(2, 3, b1);
193 }
194 int b2 = index == fOffset
195 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
196 if (b2 == -1) {
197 expectedByte(3, 3);
198 }
199 if ((b2 & 0xC0) != 0x80) {
200 invalidByte(3, 3, b2);
201 }
202 c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
203 (b2 & 0x003F);
204 }
205
206 // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
207 // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
208 // [1101 11yy] [yyxx xxxx] (low surrogate)
209 // * uuuuu = wwww + 1
210 else if ((b0 & 0xF8) == 0xF0) {
211 int b1 = index == fOffset
212 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
213 if (b1 == -1) {
214 expectedByte(2, 4);
215 }
216 if ((b1 & 0xC0) != 0x80
217 || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
218 invalidByte(2, 3, b1);
219 }
220 int b2 = index == fOffset
221 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
222 if (b2 == -1) {
223 expectedByte(3, 4);
224 }
225 if ((b2 & 0xC0) != 0x80) {
226 invalidByte(3, 3, b2);
227 }
228 int b3 = index == fOffset
229 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
230 if (b3 == -1) {
231 expectedByte(4, 4);
232 }
233 if ((b3 & 0xC0) != 0x80) {
234 invalidByte(4, 4, b3);
235 }
236 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
237 if (uuuuu > 0x10) {
238 invalidSurrogate(uuuuu);
239 }
240 int wwww = uuuuu - 1;
241 int hs = 0xD800 |
242 ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
243 ((b2 >> 4) & 0x0003);
244 int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
245 c = hs;
246 fSurrogate = ls;
247 }
248
249 // error
250 else {
251 invalidByte(1, 1, b0);
252 }
253 }
254
255 // use surrogate
256 else {
257 fSurrogate = -1;
258 }
259
260 // return character
261 if (DEBUG_READ) {
262 System.out.println("read(): 0x"+Integer.toHexString(c));
263 }
264 return c;
265
266 } // read():int
267
这篇关于 Java XML UTF-8 乱码(出现上述 XML 解析异常的根本原因,以及 UTF-8 解析过程中遇到中文乱码这一表层原因)的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!