Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
MimeUtility |
|
| 9.25;9,25 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | package org.apache.commons.fileupload.util.mime; | |
18 | ||
19 | import java.io.ByteArrayOutputStream; | |
20 | import java.io.IOException; | |
21 | import java.io.UnsupportedEncodingException; | |
22 | import java.util.HashMap; | |
23 | import java.util.Locale; | |
24 | import java.util.Map; | |
25 | ||
26 | /** | |
27 | * Utility class to decode MIME texts. | |
28 | * | |
29 | * @since 1.3 | |
30 | */ | |
31 | public final class MimeUtility { | |
32 | ||
33 | /** | |
34 | * The {@code US-ASCII} charset identifier constant. | |
35 | */ | |
36 | private static final String US_ASCII_CHARSET = "US-ASCII"; | |
37 | ||
38 | /** | |
39 | * The marker to indicate text is encoded with BASE64 algorithm. | |
40 | */ | |
41 | private static final String BASE64_ENCODING_MARKER = "B"; | |
42 | ||
43 | /** | |
44 | * The marker to indicate text is encoded with QuotedPrintable algorithm. | |
45 | */ | |
46 | private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q"; | |
47 | ||
48 | /** | |
49 | * If the text contains any encoded tokens, those tokens will be marked with "=?". | |
50 | */ | |
51 | private static final String ENCODED_TOKEN_MARKER = "=?"; | |
52 | ||
53 | /** | |
54 | * If the text contains any encoded tokens, those tokens will terminate with "=?". | |
55 | */ | |
56 | private static final String ENCODED_TOKEN_FINISHER = "?="; | |
57 | ||
58 | /** | |
59 | * The linear whitespace chars sequence. | |
60 | */ | |
61 | private static final String LINEAR_WHITESPACE = " \t\r\n"; | |
62 | ||
63 | /** | |
64 | * Mappings between MIME and Java charset. | |
65 | */ | |
66 | 1 | private static final Map<String, String> MIME2JAVA = new HashMap<String, String>(); |
67 | ||
68 | static { | |
69 | 1 | MIME2JAVA.put("iso-2022-cn", "ISO2022CN"); |
70 | 1 | MIME2JAVA.put("iso-2022-kr", "ISO2022KR"); |
71 | 1 | MIME2JAVA.put("utf-8", "UTF8"); |
72 | 1 | MIME2JAVA.put("utf8", "UTF8"); |
73 | 1 | MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP"); |
74 | 1 | MIME2JAVA.put("ja_jp.eucjp", "EUCJIS"); |
75 | 1 | MIME2JAVA.put("euc-kr", "KSC5601"); |
76 | 1 | MIME2JAVA.put("euckr", "KSC5601"); |
77 | 1 | MIME2JAVA.put("us-ascii", "ISO-8859-1"); |
78 | 1 | MIME2JAVA.put("x-us-ascii", "ISO-8859-1"); |
79 | 1 | } |
80 | ||
81 | /** | |
82 | * Hidden constructor, this class must not be instantiated. | |
83 | */ | |
84 | 0 | private MimeUtility() { |
85 | // do nothing | |
86 | 0 | } |
87 | ||
88 | /** | |
89 | * Decode a string of text obtained from a mail header into | |
90 | * its proper form. The text generally will consist of a | |
91 | * string of tokens, some of which may be encoded using | |
92 | * base64 encoding. | |
93 | * | |
94 | * @param text The text to decode. | |
95 | * | |
96 | * @return The decoded text string. | |
97 | * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported. | |
98 | */ | |
99 | public static String decodeText(String text) throws UnsupportedEncodingException { | |
100 | // if the text contains any encoded tokens, those tokens will be marked with "=?". If the | |
101 | // source string doesn't contain that sequent, no decoding is required. | |
102 | 6481 | if (text.indexOf(ENCODED_TOKEN_MARKER) < 0) { |
103 | 6475 | return text; |
104 | } | |
105 | ||
106 | 6 | int offset = 0; |
107 | 6 | int endOffset = text.length(); |
108 | ||
109 | 6 | int startWhiteSpace = -1; |
110 | 6 | int endWhiteSpace = -1; |
111 | ||
112 | 6 | StringBuilder decodedText = new StringBuilder(text.length()); |
113 | ||
114 | 6 | boolean previousTokenEncoded = false; |
115 | ||
116 | 19 | while (offset < endOffset) { |
117 | 14 | char ch = text.charAt(offset); |
118 | ||
119 | // is this a whitespace character? | |
120 | 14 | if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found |
121 | 5 | startWhiteSpace = offset; |
122 | 19 | while (offset < endOffset) { |
123 | // step over the white space characters. | |
124 | 17 | ch = text.charAt(offset); |
125 | 17 | if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found |
126 | 14 | offset++; |
127 | } else { | |
128 | // record the location of the first non lwsp and drop down to process the | |
129 | // token characters. | |
130 | 3 | endWhiteSpace = offset; |
131 | 3 | break; |
132 | } | |
133 | } | |
134 | } else { | |
135 | // we have a word token. We need to scan over the word and then try to parse it. | |
136 | 9 | int wordStart = offset; |
137 | ||
138 | 416 | while (offset < endOffset) { |
139 | // step over the non white space characters. | |
140 | 412 | ch = text.charAt(offset); |
141 | 412 | if (LINEAR_WHITESPACE.indexOf(ch) == -1) { // not white space |
142 | 407 | offset++; |
143 | } else { | |
144 | break; | |
145 | } | |
146 | ||
147 | //NB: Trailing whitespace on these header strings will just be discarded. | |
148 | } | |
149 | // pull out the word token. | |
150 | 9 | String word = text.substring(wordStart, offset); |
151 | // is the token encoded? decode the word | |
152 | 9 | if (word.startsWith(ENCODED_TOKEN_MARKER)) { |
153 | try { | |
154 | // if this gives a parsing failure, treat it like a non-encoded word. | |
155 | 9 | String decodedWord = decodeWord(word); |
156 | ||
157 | // are any whitespace characters significant? Append 'em if we've got 'em. | |
158 | 8 | if (!previousTokenEncoded && startWhiteSpace != -1) { |
159 | 0 | decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); |
160 | 0 | startWhiteSpace = -1; |
161 | } | |
162 | // this is definitely a decoded token. | |
163 | 8 | previousTokenEncoded = true; |
164 | // and add this to the text. | |
165 | 8 | decodedText.append(decodedWord); |
166 | // we continue parsing from here...we allow parsing errors to fall through | |
167 | // and get handled as normal text. | |
168 | 8 | continue; |
169 | ||
170 | 0 | } catch (ParseException e) { |
171 | // just ignore it, skip to next word | |
172 | } | |
173 | } | |
174 | // this is a normal token, so it doesn't matter what the previous token was. Add the white space | |
175 | // if we have it. | |
176 | 0 | if (startWhiteSpace != -1) { |
177 | 0 | decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); |
178 | 0 | startWhiteSpace = -1; |
179 | } | |
180 | // this is not a decoded token. | |
181 | 0 | previousTokenEncoded = false; |
182 | 0 | decodedText.append(word); |
183 | } | |
184 | 5 | } |
185 | ||
186 | 5 | return decodedText.toString(); |
187 | } | |
188 | ||
189 | /** | |
190 | * Parse a string using the RFC 2047 rules for an "encoded-word" | |
191 | * type. This encoding has the syntax: | |
192 | * | |
193 | * encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" | |
194 | * | |
195 | * @param word The possibly encoded word value. | |
196 | * | |
197 | * @return The decoded word. | |
198 | * @throws ParseException | |
199 | * @throws UnsupportedEncodingException | |
200 | */ | |
201 | private static String decodeWord(String word) throws ParseException, UnsupportedEncodingException { | |
202 | // encoded words start with the characters "=?". If this not an encoded word, we throw a | |
203 | // ParseException for the caller. | |
204 | ||
205 | 9 | if (!word.startsWith(ENCODED_TOKEN_MARKER)) { |
206 | 0 | throw new ParseException("Invalid RFC 2047 encoded-word: " + word); |
207 | } | |
208 | ||
209 | 9 | int charsetPos = word.indexOf('?', 2); |
210 | 9 | if (charsetPos == -1) { |
211 | 0 | throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word); |
212 | } | |
213 | ||
214 | // pull out the character set information (this is the MIME name at this point). | |
215 | 9 | String charset = word.substring(2, charsetPos).toLowerCase(); |
216 | ||
217 | // now pull out the encoding token the same way. | |
218 | 9 | int encodingPos = word.indexOf('?', charsetPos + 1); |
219 | 9 | if (encodingPos == -1) { |
220 | 0 | throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word); |
221 | } | |
222 | ||
223 | 9 | String encoding = word.substring(charsetPos + 1, encodingPos); |
224 | ||
225 | // and finally the encoded text. | |
226 | 9 | int encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1); |
227 | 9 | if (encodedTextPos == -1) { |
228 | 0 | throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word); |
229 | } | |
230 | ||
231 | 9 | String encodedText = word.substring(encodingPos + 1, encodedTextPos); |
232 | ||
233 | // seems a bit silly to encode a null string, but easy to deal with. | |
234 | 9 | if (encodedText.length() == 0) { |
235 | 0 | return ""; |
236 | } | |
237 | ||
238 | try { | |
239 | // the decoder writes directly to an output stream. | |
240 | 9 | ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length()); |
241 | ||
242 | 9 | byte[] encodedData = encodedText.getBytes(US_ASCII_CHARSET); |
243 | ||
244 | // Base64 encoded? | |
245 | 9 | if (encoding.equals(BASE64_ENCODING_MARKER)) { |
246 | 8 | Base64Decoder.decode(encodedData, out); |
247 | 1 | } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable. |
248 | 1 | QuotedPrintableDecoder.decode(encodedData, out); |
249 | } else { | |
250 | 0 | throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding); |
251 | } | |
252 | // get the decoded byte data and convert into a string. | |
253 | 8 | byte[] decodedData = out.toByteArray(); |
254 | 8 | return new String(decodedData, javaCharset(charset)); |
255 | 1 | } catch (IOException e) { |
256 | 1 | throw new UnsupportedEncodingException("Invalid RFC 2047 encoding"); |
257 | } | |
258 | } | |
259 | ||
260 | /** | |
261 | * Translate a MIME standard character set name into the Java | |
262 | * equivalent. | |
263 | * | |
264 | * @param charset The MIME standard name. | |
265 | * | |
266 | * @return The Java equivalent for this name. | |
267 | */ | |
268 | private static String javaCharset(String charset) { | |
269 | // nothing in, nothing out. | |
270 | 8 | if (charset == null) { |
271 | 0 | return null; |
272 | } | |
273 | ||
274 | 8 | String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH)); |
275 | // if there is no mapping, then the original name is used. Many of the MIME character set | |
276 | // names map directly back into Java. The reverse isn't necessarily true. | |
277 | 8 | if (mappedCharset == null) { |
278 | 6 | return charset; |
279 | } | |
280 | 2 | return mappedCharset; |
281 | } | |
282 | ||
283 | } |