001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.io.UnsupportedEncodingException;
020import java.util.Arrays;
021import java.util.Collection;
022import java.util.Collections;
023import java.util.HashMap;
024import java.util.Iterator;
025import java.util.LinkedHashMap;
026import java.util.LinkedHashSet;
027import java.util.Map;
028import java.util.Map.Entry;
029import java.util.Objects;
030import java.util.Set;
031
032import org.apache.commons.lang3.ArrayUtils;
033import org.apache.commons.lang3.StringUtils;
034
/**
 * <p>
 * Convert from one alphabet to another, with the possibility of leaving certain
 * characters unencoded.
 * </p>
 *
 * <p>
 * The target and 'do not encode' alphabets must be in the Unicode BMP, but the
 * source alphabet does not have to be.
 * </p>
 *
 * <p>
 * The encodings will all be of a fixed length, except for the 'do not encode'
 * chars, which will be of length 1.
 * </p>
 *
 * <h2>Sample usage</h2>
 *
 * <pre>
 * Character[] originals;   // a, b, c, d
 * Character[] encoding;    // 0, 1, d
 * Character[] doNotEncode; // d
 *
 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
 * encoding, doNotEncode);
 *
 * ac.encode("a");    // 00
 * ac.encode("b");    // 01
 * ac.encode("c");    // 0d
 * ac.encode("d");    // d
 * ac.encode("abcd"); // 00010dd
 * </pre>
 *
 * <p>
 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not
 * change internal state.
 * </p>
 *
 * @since 1.0
 *
 */
public final class AlphabetConverter {

    /**
     * Arrow constant, used for converting the object into a string.
     */
    private static final String ARROW = " -> ";

    /**
     * Maps each code point of the original alphabet to its encoded form.
     */
    private final Map<Integer, String> originalToEncoded;

    /**
     * Maps each encoded form back to the original letter (as a string).
     */
    private final Map<String, String> encodedToOriginal;

    /**
     * Length, in chars, of the encoded form of a letter that is actually
     * encoded (letters that are passed through unchanged are not counted).
     */
    private final int encodedLetterLength;

    /**
     * Hidden constructor for alphabet converter. Used by static helper methods.
     *
     * @param originalToEncoded mapping from original code point to encoded
     *                          string
     * @param encodedToOriginal mapping from encoded string to original letter
     * @param encodedLetterLength length of the encoded letter
     */
    private AlphabetConverter(final Map<Integer, String> originalToEncoded,
                              final Map<String, String> encodedToOriginal,
                              final int encodedLetterLength) {

        this.originalToEncoded = originalToEncoded;
        this.encodedToOriginal = encodedToOriginal;
        this.encodedLetterLength = encodedLetterLength;
    }

    /**
     * Encode a given string.
     *
     * @param original the string to be encoded
     * @return The encoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if chars that are not supported are
     *                                      encountered
     */
    public String encode(final String original)
            throws UnsupportedEncodingException {
        if (original == null) {
            return null;
        }

        final StringBuilder sb = new StringBuilder();

        // walk the input by code point, not by char, because the original
        // alphabet may contain supplementary characters
        for (int i = 0; i < original.length();) {
            final int codepoint = original.codePointAt(i);

            final String nextLetter = originalToEncoded.get(codepoint);

            if (nextLetter == null) {
                throw new UnsupportedEncodingException(
                        "Couldn't find encoding for '"
                                + codePointToString(codepoint)
                                + "' in "
                                + original
                );
            }

            sb.append(nextLetter);

            i += Character.charCount(codepoint);
        }

        return sb.toString();
    }

    /**
     * Decode a given string.
     *
     * @param encoded a string that has been encoded using this
     *                AlphabetConverter
     * @return The decoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if unexpected characters that
     *                                      cannot be handled are encountered
     */
    public String decode(final String encoded)
            throws UnsupportedEncodingException {
        if (encoded == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder();

        for (int j = 0; j < encoded.length();) {
            final int i = encoded.codePointAt(j);
            final String s = codePointToString(i);

            if (s.equals(originalToEncoded.get(i))) {
                // a letter that maps to itself was passed through unchanged
                result.append(s);
                // advance by the number of chars the code point occupies, so
                // that a supplementary pass-through code point is consumed
                // whole instead of leaving its low surrogate behind
                j += s.length();
            } else {
                if (j + encodedLetterLength > encoded.length()) {
                    throw new UnsupportedEncodingException("Unexpected end "
                            + "of string while decoding " + encoded);
                }
                final String nextGroup = encoded.substring(j,
                        j + encodedLetterLength);
                final String next = encodedToOriginal.get(nextGroup);
                if (next == null) {
                    throw new UnsupportedEncodingException(
                            "Unexpected string without decoding ("
                                    + nextGroup + ") in " + encoded);
                }
                result.append(next);
                j += encodedLetterLength;
            }
        }

        return result.toString();
    }

    /**
     * Get the length of characters in the encoded alphabet that are necessary
     * for each character in the original
     * alphabet.
     *
     * @return The length of the encoded char
     */
    public int getEncodedCharLength() {
        return encodedLetterLength;
    }

    /**
     * Get the mapping from integer code point of source language to encoded
     * string. Use to reconstruct converter from
     * serialized map.
     *
     * @return The original map, as an unmodifiable view
     */
    public Map<Integer, String> getOriginalToEncoded() {
        return Collections.unmodifiableMap(originalToEncoded);
    }

    /**
     * Recursive method used when creating encoder/decoder. Each level above
     * zero appends one encoding letter to the prefix; level zero assigns the
     * finished prefix to the next original letter that has to be encoded.
     *
     * @param level at which point it should add a single encoding
     * @param currentEncoding current encoding
     * @param encoding letters encoding
     * @param originals original values
     * @param doNotEncodeMap map of values that should not be encoded
     */
    @SuppressWarnings("PMD")
    private void addSingleEncoding(final int level,
                                   final String currentEncoding,
                                   final Collection<Integer> encoding,
                                   final Iterator<Integer> originals,
                                   final Map<Integer, String> doNotEncodeMap) {

        if (level > 0) {
            for (final int encodingLetter : encoding) {
                if (originals.hasNext()) {

                    // this skips the doNotEncode chars if they are in the
                    // leftmost place
                    if (level != encodedLetterLength
                            || !doNotEncodeMap.containsKey(encodingLetter)) {
                        addSingleEncoding(level - 1,
                                currentEncoding
                                        + codePointToString(encodingLetter),
                                encoding,
                                originals,
                                doNotEncodeMap
                        );
                    }
                } else {
                    return; // done encoding all the original alphabet
                }
            }
        } else {
            Integer next = originals.next();

            // letters that must not be encoded map to themselves and do not
            // use up the current encoding
            while (doNotEncodeMap.containsKey(next)) {
                final String originalLetterAsString = codePointToString(next);

                originalToEncoded.put(next, originalLetterAsString);
                encodedToOriginal.put(originalLetterAsString,
                        originalLetterAsString);

                if (!originals.hasNext()) {
                    return;
                }

                next = originals.next();
            }

            final String originalLetterAsString = codePointToString(next);

            originalToEncoded.put(next, currentEncoding);
            encodedToOriginal.put(currentEncoding, originalLetterAsString);
        }
    }

    /**
     * Lists every mapping as {@code original -> encoded}, one per line.
     *
     * @return the mappings of this converter as text
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();

        for (final Entry<Integer, String> entry
                : originalToEncoded.entrySet()) {
            sb.append(codePointToString(entry.getKey()))
                    .append(ARROW)
                    .append(entry.getValue()).append(System.lineSeparator());
        }

        return sb.toString();
    }

    /**
     * Two converters are equal when both of their mappings and their encoded
     * letter length are equal.
     *
     * @param obj the object to compare with
     * @return {@code true} if {@code obj} is an equal AlphabetConverter
     */
    @Override
    public boolean equals(final Object obj) {
        if (obj == null) {
            return false;
        }
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof AlphabetConverter)) {
            return false;
        }
        final AlphabetConverter other = (AlphabetConverter) obj;
        return originalToEncoded.equals(other.originalToEncoded)
                && encodedToOriginal.equals(other.encodedToOriginal)
                && encodedLetterLength == other.encodedLetterLength;
    }

    @Override
    public int hashCode() {
        return Objects.hash(originalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    // -- static methods

    /**
     * Create a new converter from a map. The map is copied, so changing it
     * afterwards has no effect on the returned converter.
     *
     * @param originalToEncoded a map returned from getOriginalToEncoded()
     * @return The reconstructed AlphabetConverter
     * @see AlphabetConverter#getOriginalToEncoded()
     */
    public static AlphabetConverter createConverterFromMap(
            final Map<Integer, String> originalToEncoded) {
        // Snapshot the caller's map: an unmodifiable wrapper alone is only a
        // view, so later changes by the caller would alter this converter and
        // leave it out of sync with the reverse mapping built below.
        final Map<Integer, String> originalToEncodedCopy =
                new LinkedHashMap<>(originalToEncoded);
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

        int encodedLetterLength = 1;

        for (final Entry<Integer, String> e
                : originalToEncodedCopy.entrySet()) {
            final String originalAsString = codePointToString(e.getKey());
            encodedToOriginal.put(e.getValue(), originalAsString);

            // the longest value is the length of an encoded letter
            if (e.getValue().length() > encodedLetterLength) {
                encodedLetterLength = e.getValue().length();
            }
        }

        return new AlphabetConverter(
                Collections.unmodifiableMap(originalToEncodedCopy),
                encodedToOriginal,
                encodedLetterLength);
    }

    /**
     * Create an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving the characters in
     * <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of chars representing the original alphabet
     * @param encoding an array of chars representing the alphabet to be used
     *                 for encoding
     * @param doNotEncode an array of chars to be encoded using the original
     *                    alphabet - every char here must appear in
     *                    both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverterFromChars(
            final Character[] original,
            final Character[] encoding,
            final Character[] doNotEncode) {
        return AlphabetConverter.createConverter(
                convertCharsToIntegers(original),
                convertCharsToIntegers(encoding),
                convertCharsToIntegers(doNotEncode));
    }

    /**
     * Convert characters to integers.
     *
     * @param chars array of characters, may be {@code null}
     * @return an equivalent array of integers, empty if {@code chars} is
     *         {@code null} or empty
     */
    private static Integer[] convertCharsToIntegers(final Character[] chars) {
        if (chars == null || chars.length == 0) {
            return new Integer[0];
        }
        final Integer[] integers = new Integer[chars.length];
        for (int i = 0; i < chars.length; i++) {
            integers[i] = (int) chars[i];
        }
        return integers;
    }

    /**
     * Create an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving
     * the characters in <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of ints representing the original alphabet in
     *                 codepoints
     * @param encoding an array of ints representing the alphabet to be used for
     *                 encoding, in codepoints
     * @param doNotEncode an array of ints representing the chars to be encoded
     *                    using the original alphabet - every char
     *                    here must appear in both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                   constructed
     */
    public static AlphabetConverter createConverter(
            final Integer[] original,
            final Integer[] encoding,
            final Integer[] doNotEncode) {
        // linked sets drop duplicates while keeping the caller's order
        final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original));
        final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding));
        final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode));

        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
        final Map<Integer, String> doNotEncodeMap = new HashMap<>();

        int encodedLetterLength;

        for (final int i : doNotEncodeCopy) {
            if (!originalCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because original "
                                + "alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            if (!encodingCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            doNotEncodeMap.put(i, codePointToString(i));
        }

        if (encodingCopy.size() >= originalCopy.size()) {
            // enough encoding letters for a one-to-one mapping
            encodedLetterLength = 1;

            final Iterator<Integer> it = encodingCopy.iterator();

            for (final int originalLetter : originalCopy) {
                final String originalLetterAsString =
                        codePointToString(originalLetter);

                if (doNotEncodeMap.containsKey(originalLetter)) {
                    originalToEncoded.put(originalLetter,
                            originalLetterAsString);
                    encodedToOriginal.put(originalLetterAsString,
                            originalLetterAsString);
                } else {
                    Integer next = it.next();

                    // 'do not encode' letters are reserved for themselves
                    while (doNotEncodeCopy.contains(next)) {
                        next = it.next();
                    }

                    final String encodedLetter = codePointToString(next);

                    originalToEncoded.put(originalLetter, encodedLetter);
                    encodedToOriginal.put(encodedLetter,
                            originalLetterAsString);
                }
            }

            return new AlphabetConverter(originalToEncoded,
                    encodedToOriginal,
                    encodedLetterLength);

        } else if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
            throw new IllegalArgumentException(
                    "Must have at least two encoding characters (excluding "
                            + "those in the 'do not encode' list), but has "
                            + (encodingCopy.size() - doNotEncodeCopy.size()));
        } else {
            // we start with one which is our minimum, and because we do the
            // first division outside the loop
            int lettersSoFar = 1;

            // the first division takes into account that the doNotEncode
            // letters can't be in the leftmost place
            int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
                    / (encodingCopy.size() - doNotEncodeCopy.size());

            while (lettersLeft / encodingCopy.size() >= 1) {
                lettersLeft = lettersLeft / encodingCopy.size();
                lettersSoFar++;
            }

            encodedLetterLength = lettersSoFar + 1;

            final AlphabetConverter ac =
                    new AlphabetConverter(originalToEncoded,
                            encodedToOriginal,
                            encodedLetterLength);

            // fills the two (still empty) maps held by the new converter
            ac.addSingleEncoding(encodedLetterLength,
                    "",
                    encodingCopy,
                    originalCopy.iterator(),
                    doNotEncodeMap);

            return ac;
        }
    }

    /**
     * Create new String that contains just the given code point.
     *
     * @param i code point
     * @return a new string with the new code point
     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
     */
    private static String codePointToString(final int i) {
        if (Character.charCount(i) == 1) {
            return String.valueOf((char) i);
        }
        return new String(Character.toChars(i));
    }
}