Coverage Report - org.apache.commons.fileupload.util.mime.MimeUtility
 
Classes in this File Line Coverage Branch Coverage Complexity
MimeUtility
80%
70/87
72%
29/40
9,25
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 package org.apache.commons.fileupload.util.mime;
 18  
 
 19  
 import java.io.ByteArrayOutputStream;
 20  
 import java.io.IOException;
 21  
 import java.io.UnsupportedEncodingException;
 22  
 import java.util.HashMap;
 23  
 import java.util.Locale;
 24  
 import java.util.Map;
 25  
 
 26  
 /**
 27  
  * Utility class to decode MIME texts.
 28  
  *
 29  
  * @since 1.3
 30  
  */
 31  
 public final class MimeUtility {
 32  
 
 33  
     /**
 34  
      * The {@code US-ASCII} charset identifier constant.
 35  
      */
 36  
     private static final String US_ASCII_CHARSET = "US-ASCII";
 37  
 
 38  
     /**
 39  
      * The marker to indicate text is encoded with BASE64 algorithm.
 40  
      */
 41  
     private static final String BASE64_ENCODING_MARKER = "B";
 42  
 
 43  
     /**
 44  
      * The marker to indicate text is encoded with QuotedPrintable algorithm.
 45  
      */
 46  
     private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q";
 47  
 
 48  
     /**
 49  
      * If the text contains any encoded tokens, those tokens will be marked with "=?".
 50  
      */
 51  
     private static final String ENCODED_TOKEN_MARKER = "=?";
 52  
 
 53  
     /**
 54  
      * If the text contains any encoded tokens, those tokens will terminate with "?=".
 55  
      */
 56  
     private static final String ENCODED_TOKEN_FINISHER = "?=";
 57  
 
 58  
     /**
 59  
      * The linear whitespace chars sequence.
 60  
      */
 61  
     private static final String LINEAR_WHITESPACE = " \t\r\n";
 62  
 
 63  
     /**
 64  
      * Mappings between MIME and Java charset.
 65  
      */
 66  1
     private static final Map<String, String> MIME2JAVA = new HashMap<String, String>();
 67  
 
 68  
     static {
 69  1
         MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
 70  1
         MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
 71  1
         MIME2JAVA.put("utf-8", "UTF8");
 72  1
         MIME2JAVA.put("utf8", "UTF8");
 73  1
         MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
 74  1
         MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
 75  1
         MIME2JAVA.put("euc-kr", "KSC5601");
 76  1
         MIME2JAVA.put("euckr", "KSC5601");
 77  1
         MIME2JAVA.put("us-ascii", "ISO-8859-1");
 78  1
         MIME2JAVA.put("x-us-ascii", "ISO-8859-1");
 79  1
     }
 80  
 
 81  
     /**
 82  
      * Hidden constructor, this class must not be instantiated.
 83  
      */
 84  0
     private MimeUtility() {
 85  
         // do nothing
 86  0
     }
 87  
 
 88  
     /**
 89  
      * Decode a string of text obtained from a mail header into
 90  
      * its proper form.  The text generally will consist of a
 91  
      * string of tokens, some of which may be encoded using
 92  
      * base64 encoding.
 93  
      *
 94  
      * @param text   The text to decode.
 95  
      *
 96  
      * @return The decoded text string.
 97  
      * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
 98  
      */
 99  
     public static String decodeText(String text) throws UnsupportedEncodingException {
 100  
         // if the text contains any encoded tokens, those tokens will be marked with "=?".  If the
 101  
         // source string doesn't contain that sequent, no decoding is required.
 102  6481
         if (text.indexOf(ENCODED_TOKEN_MARKER) < 0) {
 103  6475
             return text;
 104  
         }
 105  
 
 106  6
         int offset = 0;
 107  6
         int endOffset = text.length();
 108  
 
 109  6
         int startWhiteSpace = -1;
 110  6
         int endWhiteSpace = -1;
 111  
 
 112  6
         StringBuilder decodedText = new StringBuilder(text.length());
 113  
 
 114  6
         boolean previousTokenEncoded = false;
 115  
 
 116  19
         while (offset < endOffset) {
 117  14
             char ch = text.charAt(offset);
 118  
 
 119  
             // is this a whitespace character?
 120  14
             if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
 121  5
                 startWhiteSpace = offset;
 122  19
                 while (offset < endOffset) {
 123  
                     // step over the white space characters.
 124  17
                     ch = text.charAt(offset);
 125  17
                     if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
 126  14
                         offset++;
 127  
                     } else {
 128  
                         // record the location of the first non lwsp and drop down to process the
 129  
                         // token characters.
 130  3
                         endWhiteSpace = offset;
 131  3
                         break;
 132  
                     }
 133  
                 }
 134  
             } else {
 135  
                 // we have a word token.  We need to scan over the word and then try to parse it.
 136  9
                 int wordStart = offset;
 137  
 
 138  416
                 while (offset < endOffset) {
 139  
                     // step over the non white space characters.
 140  412
                     ch = text.charAt(offset);
 141  412
                     if (LINEAR_WHITESPACE.indexOf(ch) == -1) { // not white space
 142  407
                         offset++;
 143  
                     } else {
 144  
                         break;
 145  
                     }
 146  
 
 147  
                     //NB:  Trailing whitespace on these header strings will just be discarded.
 148  
                 }
 149  
                 // pull out the word token.
 150  9
                 String word = text.substring(wordStart, offset);
 151  
                 // is the token encoded?  decode the word
 152  9
                 if (word.startsWith(ENCODED_TOKEN_MARKER)) {
 153  
                     try {
 154  
                         // if this gives a parsing failure, treat it like a non-encoded word.
 155  9
                         String decodedWord = decodeWord(word);
 156  
 
 157  
                         // are any whitespace characters significant?  Append 'em if we've got 'em.
 158  8
                         if (!previousTokenEncoded && startWhiteSpace != -1) {
 159  0
                             decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
 160  0
                             startWhiteSpace = -1;
 161  
                         }
 162  
                         // this is definitely a decoded token.
 163  8
                         previousTokenEncoded = true;
 164  
                         // and add this to the text.
 165  8
                         decodedText.append(decodedWord);
 166  
                         // we continue parsing from here...we allow parsing errors to fall through
 167  
                         // and get handled as normal text.
 168  8
                         continue;
 169  
 
 170  0
                     } catch (ParseException e) {
 171  
                         // just ignore it, skip to next word
 172  
                     }
 173  
                 }
 174  
                 // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
 175  
                 // if we have it.
 176  0
                 if (startWhiteSpace != -1) {
 177  0
                     decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
 178  0
                     startWhiteSpace = -1;
 179  
                 }
 180  
                 // this is not a decoded token.
 181  0
                 previousTokenEncoded = false;
 182  0
                 decodedText.append(word);
 183  
             }
 184  5
         }
 185  
 
 186  5
         return decodedText.toString();
 187  
     }
 188  
 
 189  
     /**
 190  
      * Parse a string using the RFC 2047 rules for an "encoded-word"
 191  
      * type.  This encoding has the syntax:
 192  
      *
 193  
      * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
 194  
      *
 195  
      * @param word   The possibly encoded word value.
 196  
      *
 197  
      * @return The decoded word.
 198  
      * @throws ParseException
 199  
      * @throws UnsupportedEncodingException
 200  
      */
 201  
     private static String decodeWord(String word) throws ParseException, UnsupportedEncodingException {
 202  
         // encoded words start with the characters "=?".  If this not an encoded word, we throw a
 203  
         // ParseException for the caller.
 204  
 
 205  9
         if (!word.startsWith(ENCODED_TOKEN_MARKER)) {
 206  0
             throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
 207  
         }
 208  
 
 209  9
         int charsetPos = word.indexOf('?', 2);
 210  9
         if (charsetPos == -1) {
 211  0
             throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
 212  
         }
 213  
 
 214  
         // pull out the character set information (this is the MIME name at this point).
 215  9
         String charset = word.substring(2, charsetPos).toLowerCase();
 216  
 
 217  
         // now pull out the encoding token the same way.
 218  9
         int encodingPos = word.indexOf('?', charsetPos + 1);
 219  9
         if (encodingPos == -1) {
 220  0
             throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
 221  
         }
 222  
 
 223  9
         String encoding = word.substring(charsetPos + 1, encodingPos);
 224  
 
 225  
         // and finally the encoded text.
 226  9
         int encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1);
 227  9
         if (encodedTextPos == -1) {
 228  0
             throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
 229  
         }
 230  
 
 231  9
         String encodedText = word.substring(encodingPos + 1, encodedTextPos);
 232  
 
 233  
         // seems a bit silly to encode a null string, but easy to deal with.
 234  9
         if (encodedText.length() == 0) {
 235  0
             return "";
 236  
         }
 237  
 
 238  
         try {
 239  
             // the decoder writes directly to an output stream.
 240  9
             ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());
 241  
 
 242  9
             byte[] encodedData = encodedText.getBytes(US_ASCII_CHARSET);
 243  
 
 244  
             // Base64 encoded?
 245  9
             if (encoding.equals(BASE64_ENCODING_MARKER)) {
 246  8
                 Base64Decoder.decode(encodedData, out);
 247  1
             } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable.
 248  1
                 QuotedPrintableDecoder.decode(encodedData, out);
 249  
             } else {
 250  0
                 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
 251  
             }
 252  
             // get the decoded byte data and convert into a string.
 253  8
             byte[] decodedData = out.toByteArray();
 254  8
             return new String(decodedData, javaCharset(charset));
 255  1
         } catch (IOException e) {
 256  1
             throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
 257  
         }
 258  
     }
 259  
 
 260  
     /**
 261  
      * Translate a MIME standard character set name into the Java
 262  
      * equivalent.
 263  
      *
 264  
      * @param charset The MIME standard name.
 265  
      *
 266  
      * @return The Java equivalent for this name.
 267  
      */
 268  
     private static String javaCharset(String charset) {
 269  
         // nothing in, nothing out.
 270  8
         if (charset == null) {
 271  0
             return null;
 272  
         }
 273  
 
 274  8
         String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH));
 275  
         // if there is no mapping, then the original name is used.  Many of the MIME character set
 276  
         // names map directly back into Java.  The reverse isn't necessarily true.
 277  8
         if (mappedCharset == null) {
 278  6
             return charset;
 279  
         }
 280  2
         return mappedCharset;
 281  
     }
 282  
 
 283  
 }