001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text; 018 019import java.io.UnsupportedEncodingException; 020import java.util.Arrays; 021import java.util.Collection; 022import java.util.Collections; 023import java.util.HashMap; 024import java.util.Iterator; 025import java.util.LinkedHashMap; 026import java.util.LinkedHashSet; 027import java.util.Map; 028import java.util.Map.Entry; 029import java.util.Objects; 030import java.util.Set; 031 032import org.apache.commons.lang3.ArrayUtils; 033import org.apache.commons.lang3.StringUtils; 034 035/** 036 * <p> 037 * Convert from one alphabet to another, with the possibility of leaving certain 038 * characters unencoded. 039 * </p> 040 * 041 * <p> 042 * The target and do not encode languages must be in the Unicode BMP, but the 043 * source language does not. 044 * </p> 045 * 046 * <p> 047 * The encoding will all be of a fixed length, except for the 'do not encode' 048 * chars, which will be of length 1 049 * </p> 050 * 051 * <h2>Sample usage</h2> 052 * 053 * <pre> 054 * Character[] originals; // a, b, c, d 055 * Character[] encoding; // 0, 1, d 056 * Character[] doNotEncode; // d 057 * 058 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals, 059 * encoding, doNotEncode); 060 * 061 * ac.encode("a"); // 00 062 * ac.encode("b"); // 01 063 * ac.encode("c"); // 0d 064 * ac.encode("d"); // d 065 * ac.encode("abcd"); // 00010dd 066 * </pre> 067 * 068 * <p> 069 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not 070 * change internal state. 071 * </p> 072 * 073 * @since 1.0 074 * 075 */ 076public final class AlphabetConverter { 077 078 /** 079 * Original string to be encoded. 080 */ 081 private final Map<Integer, String> originalToEncoded; 082 /** 083 * Encoding alphabet. 084 */ 085 private final Map<String, String> encodedToOriginal; 086 /** 087 * Length of the encoded letter. 088 */ 089 private final int encodedLetterLength; 090 /** 091 * Arrow constant, used for converting the object into a string. 092 */ 093 private static final String ARROW = " -> "; 094 095 /** 096 * Hidden constructor for alphabet converter. Used by static helper methods. 097 * 098 * @param originalToEncoded original string to be encoded 099 * @param encodedToOriginal encoding alphabet 100 * @param encodedLetterLength length of the encoded letter 101 */ 102 private AlphabetConverter(final Map<Integer, String> originalToEncoded, 103 final Map<String, String> encodedToOriginal, 104 final int encodedLetterLength) { 105 106 this.originalToEncoded = originalToEncoded; 107 this.encodedToOriginal = encodedToOriginal; 108 this.encodedLetterLength = encodedLetterLength; 109 } 110 111 /** 112 * Encode a given string. 113 * 114 * @param original the string to be encoded 115 * @return The encoded string, {@code null} if the given string is null 116 * @throws UnsupportedEncodingException if chars that are not supported are 117 * encountered 118 */ 119 public String encode(final String original) 120 throws UnsupportedEncodingException { 121 if (original == null) { 122 return null; 123 } 124 125 final StringBuilder sb = new StringBuilder(); 126 127 for (int i = 0; i < original.length();) { 128 final int codepoint = original.codePointAt(i); 129 130 final String nextLetter = originalToEncoded.get(codepoint); 131 132 if (nextLetter == null) { 133 throw new UnsupportedEncodingException( 134 "Couldn't find encoding for '" 135 + codePointToString(codepoint) 136 + "' in " 137 + original 138 ); 139 } 140 141 sb.append(nextLetter); 142 143 i += Character.charCount(codepoint); 144 } 145 146 return sb.toString(); 147 } 148 149 /** 150 * Decode a given string. 151 * 152 * @param encoded a string that has been encoded using this 153 * AlphabetConverter 154 * @return The decoded string, {@code null} if the given string is null 155 * @throws UnsupportedEncodingException if unexpected characters that 156 * cannot be handled are encountered 157 */ 158 public String decode(final String encoded) 159 throws UnsupportedEncodingException { 160 if (encoded == null) { 161 return null; 162 } 163 164 final StringBuilder result = new StringBuilder(); 165 166 for (int j = 0; j < encoded.length();) { 167 final int i = encoded.codePointAt(j); 168 final String s = codePointToString(i); 169 170 if (s.equals(originalToEncoded.get(i))) { 171 result.append(s); 172 j++; // because we do not encode in Unicode extended the 173 // length of each encoded char is 1 174 } else { 175 if (j + encodedLetterLength > encoded.length()) { 176 throw new UnsupportedEncodingException("Unexpected end " 177 + "of string while decoding " + encoded); 178 } 179 final String nextGroup = encoded.substring(j, 180 j + encodedLetterLength); 181 final String next = encodedToOriginal.get(nextGroup); 182 if (next == null) { 183 throw new UnsupportedEncodingException( 184 "Unexpected string without decoding (" 185 + nextGroup + ") in " + encoded); 186 } 187 result.append(next); 188 j += encodedLetterLength; 189 } 190 } 191 192 return result.toString(); 193 } 194 195 /** 196 * Get the length of characters in the encoded alphabet that are necessary 197 * for each character in the original 198 * alphabet. 199 * 200 * @return The length of the encoded char 201 */ 202 public int getEncodedCharLength() { 203 return encodedLetterLength; 204 } 205 206 /** 207 * Get the mapping from integer code point of source language to encoded 208 * string. Use to reconstruct converter from 209 * serialized map. 210 * 211 * @return The original map 212 */ 213 public Map<Integer, String> getOriginalToEncoded() { 214 return Collections.unmodifiableMap(originalToEncoded); 215 } 216 217 /** 218 * Recursive method used when creating encoder/decoder. 219 * 220 * @param level at which point it should add a single encoding 221 * @param currentEncoding current encoding 222 * @param encoding letters encoding 223 * @param originals original values 224 * @param doNotEncodeMap map of values that should not be encoded 225 */ 226 @SuppressWarnings("PMD") 227 private void addSingleEncoding(final int level, 228 final String currentEncoding, 229 final Collection<Integer> encoding, 230 final Iterator<Integer> originals, 231 final Map<Integer, String> doNotEncodeMap) { 232 233 if (level > 0) { 234 for (final int encodingLetter : encoding) { 235 if (originals.hasNext()) { 236 237 // this skips the doNotEncode chars if they are in the 238 // leftmost place 239 if (level != encodedLetterLength 240 || !doNotEncodeMap.containsKey(encodingLetter)) { 241 addSingleEncoding(level - 1, 242 currentEncoding 243 + codePointToString(encodingLetter), 244 encoding, 245 originals, 246 doNotEncodeMap 247 ); 248 } 249 } else { 250 return; // done encoding all the original alphabet 251 } 252 } 253 } else { 254 Integer next = originals.next(); 255 256 while (doNotEncodeMap.containsKey(next)) { 257 final String originalLetterAsString = codePointToString(next); 258 259 originalToEncoded.put(next, originalLetterAsString); 260 encodedToOriginal.put(originalLetterAsString, 261 originalLetterAsString); 262 263 if (!originals.hasNext()) { 264 return; 265 } 266 267 next = originals.next(); 268 } 269 270 final String originalLetterAsString = codePointToString(next); 271 272 originalToEncoded.put(next, currentEncoding); 273 encodedToOriginal.put(currentEncoding, originalLetterAsString); 274 } 275 } 276 277 @Override 278 public String toString() { 279 final StringBuilder sb = new StringBuilder(); 280 281 for (final Entry<Integer, String> entry 282 : originalToEncoded.entrySet()) { 283 sb.append(codePointToString(entry.getKey())) 284 .append(ARROW) 285 .append(entry.getValue()).append(System.lineSeparator()); 286 } 287 288 return sb.toString(); 289 } 290 291 @Override 292 public boolean equals(final Object obj) { 293 if (obj == null) { 294 return false; 295 } 296 if (obj == this) { 297 return true; 298 } 299 if (!(obj instanceof AlphabetConverter)) { 300 return false; 301 } 302 final AlphabetConverter other = (AlphabetConverter) obj; 303 return originalToEncoded.equals(other.originalToEncoded) 304 && encodedToOriginal.equals(other.encodedToOriginal) 305 && encodedLetterLength == other.encodedLetterLength; 306 } 307 308 @Override 309 public int hashCode() { 310 return Objects.hash(originalToEncoded, 311 encodedToOriginal, 312 encodedLetterLength); 313 } 314 315 // -- static methods 316 317 /** 318 * Create a new converter from a map. 319 * 320 * @param originalToEncoded a map returned from getOriginalToEncoded() 321 * @return The reconstructed AlphabetConverter 322 * @see AlphabetConverter#getOriginalToEncoded() 323 */ 324 public static AlphabetConverter createConverterFromMap( 325 final Map<Integer, String> originalToEncoded) { 326 final Map<Integer, String> unmodifiableOriginalToEncoded = 327 Collections.unmodifiableMap(originalToEncoded); 328 final Map<String, String> encodedToOriginal = new LinkedHashMap<>(); 329 330 int encodedLetterLength = 1; 331 332 for (final Entry<Integer, String> e 333 : unmodifiableOriginalToEncoded.entrySet()) { 334 final String originalAsString = codePointToString(e.getKey()); 335 encodedToOriginal.put(e.getValue(), originalAsString); 336 337 if (e.getValue().length() > encodedLetterLength) { 338 encodedLetterLength = e.getValue().length(); 339 } 340 } 341 342 return new AlphabetConverter(unmodifiableOriginalToEncoded, 343 encodedToOriginal, 344 encodedLetterLength); 345 } 346 347 /** 348 * Create an alphabet converter, for converting from the original alphabet, 349 * to the encoded alphabet, while leaving the characters in 350 * <em>doNotEncode</em> as they are (if possible). 351 * 352 * <p>Duplicate letters in either original or encoding will be ignored.</p> 353 * 354 * @param original an array of chars representing the original alphabet 355 * @param encoding an array of chars representing the alphabet to be used 356 * for encoding 357 * @param doNotEncode an array of chars to be encoded using the original 358 * alphabet - every char here must appear in 359 * both the previous params 360 * @return The AlphabetConverter 361 * @throws IllegalArgumentException if an AlphabetConverter cannot be 362 * constructed 363 */ 364 public static AlphabetConverter createConverterFromChars( 365 final Character[] original, 366 final Character[] encoding, 367 final Character[] doNotEncode) { 368 return AlphabetConverter.createConverter( 369 convertCharsToIntegers(original), 370 convertCharsToIntegers(encoding), 371 convertCharsToIntegers(doNotEncode)); 372 } 373 374 /** 375 * Convert characters to integers. 376 * 377 * @param chars array of characters 378 * @return an equivalent array of integers 379 */ 380 private static Integer[] convertCharsToIntegers(final Character[] chars) { 381 if (ArrayUtils.isEmpty(chars)) { 382 return ArrayUtils.EMPTY_INTEGER_OBJECT_ARRAY; 383 } 384 final Integer[] integers = new Integer[chars.length]; 385 for (int i = 0; i < chars.length; i++) { 386 integers[i] = (int) chars[i]; 387 } 388 return integers; 389 } 390 391 /** 392 * Create an alphabet converter, for converting from the original alphabet, 393 * to the encoded alphabet, while leaving 394 * the characters in <em>doNotEncode</em> as they are (if possible). 395 * 396 * <p>Duplicate letters in either original or encoding will be ignored.</p> 397 * 398 * @param original an array of ints representing the original alphabet in 399 * codepoints 400 * @param encoding an array of ints representing the alphabet to be used for 401 * encoding, in codepoints 402 * @param doNotEncode an array of ints representing the chars to be encoded 403 * using the original alphabet - every char 404 * here must appear in both the previous params 405 * @return The AlphabetConverter 406 * @throws IllegalArgumentException if an AlphabetConverter cannot be 407 * constructed 408 */ 409 public static AlphabetConverter createConverter( 410 final Integer[] original, 411 final Integer[] encoding, 412 final Integer[] doNotEncode) { 413 final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original)); 414 final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding)); 415 final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode)); 416 417 final Map<Integer, String> originalToEncoded = new LinkedHashMap<>(); 418 final Map<String, String> encodedToOriginal = new LinkedHashMap<>(); 419 final Map<Integer, String> doNotEncodeMap = new HashMap<>(); 420 421 int encodedLetterLength; 422 423 for (final int i : doNotEncodeCopy) { 424 if (!originalCopy.contains(i)) { 425 throw new IllegalArgumentException( 426 "Can not use 'do not encode' list because original " 427 + "alphabet does not contain '" 428 + codePointToString(i) + "'"); 429 } 430 431 if (!encodingCopy.contains(i)) { 432 throw new IllegalArgumentException( 433 "Can not use 'do not encode' list because encoding alphabet does not contain '" 434 + codePointToString(i) + "'"); 435 } 436 437 doNotEncodeMap.put(i, codePointToString(i)); 438 } 439 440 if (encodingCopy.size() >= originalCopy.size()) { 441 encodedLetterLength = 1; 442 443 final Iterator<Integer> it = encodingCopy.iterator(); 444 445 for (final int originalLetter : originalCopy) { 446 final String originalLetterAsString = 447 codePointToString(originalLetter); 448 449 if (doNotEncodeMap.containsKey(originalLetter)) { 450 originalToEncoded.put(originalLetter, 451 originalLetterAsString); 452 encodedToOriginal.put(originalLetterAsString, 453 originalLetterAsString); 454 } else { 455 Integer next = it.next(); 456 457 while (doNotEncodeCopy.contains(next)) { 458 next = it.next(); 459 } 460 461 final String encodedLetter = codePointToString(next); 462 463 originalToEncoded.put(originalLetter, encodedLetter); 464 encodedToOriginal.put(encodedLetter, 465 originalLetterAsString); 466 } 467 } 468 469 return new AlphabetConverter(originalToEncoded, 470 encodedToOriginal, 471 encodedLetterLength); 472 473 } else if (encodingCopy.size() - doNotEncodeCopy.size() < 2) { 474 throw new IllegalArgumentException( 475 "Must have at least two encoding characters (excluding " 476 + "those in the 'do not encode' list), but has " 477 + (encodingCopy.size() - doNotEncodeCopy.size())); 478 } else { 479 // we start with one which is our minimum, and because we do the 480 // first division outside the loop 481 int lettersSoFar = 1; 482 483 // the first division takes into account that the doNotEncode 484 // letters can't be in the leftmost place 485 int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size()) 486 / (encodingCopy.size() - doNotEncodeCopy.size()); 487 488 while (lettersLeft / encodingCopy.size() >= 1) { 489 lettersLeft = lettersLeft / encodingCopy.size(); 490 lettersSoFar++; 491 } 492 493 encodedLetterLength = lettersSoFar + 1; 494 495 final AlphabetConverter ac = 496 new AlphabetConverter(originalToEncoded, 497 encodedToOriginal, 498 encodedLetterLength); 499 500 ac.addSingleEncoding(encodedLetterLength, 501 StringUtils.EMPTY, 502 encodingCopy, 503 originalCopy.iterator(), 504 doNotEncodeMap); 505 506 return ac; 507 } 508 } 509 510 /** 511 * Create new String that contains just the given code point. 512 * 513 * @param i code point 514 * @return a new string with the new code point 515 * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html" 516 */ 517 private static String codePointToString(final int i) { 518 if (Character.charCount(i) == 1) { 519 return String.valueOf((char) i); 520 } 521 return new String(Character.toChars(i)); 522 } 523}