/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.matcher.StringMatcher;
import org.apache.commons.text.matcher.StringMatcherFactory;

/**
 * Tokenizes a string based on delimiters (separators), supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims to do a similar job to
 * {@link java.util.StringTokenizer StringTokenizer}, however, it offers much more control and flexibility, including
 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
 * <p>
 * The input String is split into a number of <i>tokens</i>. Each token is separated from the next by a
 * <i>delimiter</i>. One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes. The <i>quote</i> matcher specifies the quote character(s). A quote may be
 * escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming. The <i>trimmer</i> matcher
 * specifies these characters. One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there may be invalid characters. The <i>ignored</i> matcher specifies these
 * characters to be removed. One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 *
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 * <caption>StringTokenizer properties and options</caption>
 * <tr>
 * <th>Property</th>
 * <th>Type</th>
 * <th>Default</th>
 * </tr>
 * <tr>
 * <td>delim</td>
 * <td>CharSetMatcher</td>
 * <td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 * <td>quote</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>ignore</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>emptyTokenAsNull</td>
 * <td>boolean</td>
 * <td>false</td>
 * </tr>
 * <tr>
 * <td>ignoreEmptyTokens</td>
 * <td>boolean</td>
 * <td>true</td>
 * </tr>
 * </table>
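 *
 * <p>
 * A usage sketch (the input string is purely illustrative); inside a quoted section a quote is escaped by doubling
 * it:
 *
 * <pre>
 * StringTokenizer tok = StringTokenizer.getCSVInstance("a, \"b, c\", \"x \"\"quoted\"\" y\"");
 * while (tok.hasNext()) {
 *     System.out.println(tok.next()); // prints: a | b, c | x "quoted" y
 * }
 * </pre>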
 *
 * @since 1.3
 */
public class StringTokenizer implements ListIterator<String>, Cloneable {

    /** Comma separated values tokenizer internal variable. */
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Tab separated values tokenizer internal variable. */
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
    /** The quote matcher. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
    /** The ignored matcher. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
    /** The trimmer matcher. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    // -----------------------------------------------------------------------

    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getCSVClone() {
        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings. The default for CSV processing is to
     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
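     * <p>
     * A minimal sketch (the input line is illustrative):
     *
     * <pre>{@code
     * StringTokenizer tok = StringTokenizer.getCSVInstance();
     * tok.reset("a, \"b, c\", d");
     * List<String> tokens = tok.getTokenList(); // ["a", "b, c", "d"]
     * }</pre>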
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
     * The default for CSV processing is to trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final String input) {
        final StringTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
     * The default for CSV processing is to trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final char[] input) {
        final StringTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getTSVClone() {
        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing is to
     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing is to
     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
     *
     * @param input
     *            the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final String input) {
        final StringTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing is to
     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
     *
     * @param input
     *            the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final char[] input) {
        final StringTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    // -----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to
     * tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StringTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
     *
     * @param input
     *            the string which is to be parsed
     */
    public StringTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
     *
     * @param input
     *            the character array which is to be parsed, cloned internally
     */
    public StringTokenizer(final char[] input) {
        super();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input
     *            the character array which is to be parsed, cloned internally
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input
     *            the character array which is to be parsed, cloned internally
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the character array which is to be parsed, cloned internally
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the character array which is to be parsed, cloned internally
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the character array which is to be parsed, cloned internally
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    // -----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return The number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
     * {@link NoSuchElementException} when no tokens remain.
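     * <p>
     * A sketch of a null-terminated loop (using the default whitespace splitting):
     *
     * <pre>{@code
     * StringTokenizer tok = new StringTokenizer("a b c");
     * String token;
     * while ((token = tok.nextToken()) != null) {
     *     System.out.println(token);
     * }
     * }</pre>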
     *
     * @return The next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return The previous sequential token, or null when there is no previous token
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return The tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return The tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        Collections.addAll(list, tokens);

        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StringTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
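     * <p>
     * A sketch of reuse across several lines (the {@code lines} collection is hypothetical):
     *
     * <pre>{@code
     * StringTokenizer tok = new StringTokenizer("", ';', '"');
     * for (String line : lines) {
     *     List<String> fields = tok.reset(line).getTokenList();
     *     // process fields...
     * }
     * }</pre>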
     *
     * @param input
     *            the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse. In this manner you can re-use a tokenizer
     * with the same settings on multiple input lines.
     *
     * @param input
     *            the new character array to tokenize, cloned internally, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input.clone();
        } else {
            this.chars = null;
        }
        return this;
    }

    // ListIterator
    // -----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return The next String token
     * @throws NoSuchElementException
     *             if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return The next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return The previous token
     * @throws NoSuchElementException
     *             if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return The previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    // -----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not, does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method will be called automatically by other
     * (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
     * strings. It is also possible to filter the results.
     * <p>
     * {@code StringTokenizer} will always pass a zero offset and a count equal to the length of the array to this
     * method, however a subclass may pass other values, or even an entirely different array.
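     * <p>
     * A minimal subclass sketch that post-processes the results (the upper-casing rule is purely illustrative):
     *
     * <pre>{@code
     * StringTokenizer tok = new StringTokenizer("a,b,c", ',') {
     *     protected List<String> tokenize(char[] srcChars, int offset, int count) {
     *         List<String> tokens = new ArrayList<>(super.tokenize(srcChars, offset, count));
     *         tokens.replaceAll(String::toUpperCase);
     *         return tokens;
     *     }
     * };
     * // tok.getTokenList() yields ["A", "B", "C"]
     * }</pre>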
     *
     * @param srcChars
     *            the character array being tokenized, may be null
     * @param offset
     *            the start position within the character array, must be valid
     * @param count
     *            the number of characters to tokenize, must be valid
     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final TextStringBuilder buf = new TextStringBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list
     *            the list to add to
     * @param tok
     *            the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
     *         string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList) {
        // skip all leading ignored and trimmed characters, unless they are the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end
     *         of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote already matched in readNextToken().
     *
     * @param srcChars
     *            the character array being tokenized
     * @param pos
     *            the position to check for a quote
     * @param len
     *            the length of the character array being tokenized
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    // -----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return The delimiter matcher in use
     */
    public StringMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
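     * <p>
     * A sketch splitting on either of two characters (assuming {@code charSetMatcher} from
     * {@link StringMatcherFactory}):
     *
     * <pre>{@code
     * new StringTokenizer("a;b|c")
     *         .setDelimiterMatcher(StringMatcherFactory.INSTANCE.charSetMatcher(';', '|'))
     *         .getTokenList(); // ["a", "b", "c"]
     * }</pre>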
     *
     * @param delim
     *            the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim
     *            the delimiter character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim
     *            the delimiter string to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
    }

    // Quote
    // -----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
     * default is not to use a quote character at all (see {@link #getCSVInstance()} for double-quote handling).
     *
     * @return The quote matcher in use
     */
    public StringMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     *
     * @param quote
     *            the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
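     * <p>
     * A sketch where the delimiter appears inside quoted data:
     *
     * <pre>{@code
     * new StringTokenizer("'a,b',c", ',')
     *         .setQuoteChar('\'')
     *         .getTokenList(); // ["a,b", "c"]
     * }</pre>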
     *
     * @param quote
     *            the quote character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
    }

    // Ignored
    // -----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
     * is not to ignore anything.
     *
     * @return The ignored matcher in use
     */
    public StringMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region.
     *
     * @param ignored
     *            the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is within a quoted region.
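     * <p>
     * A sketch ignoring carriage returns:
     *
     * <pre>{@code
     * new StringTokenizer("a,b\r,c", ',')
     *         .setIgnoredChar('\r')
     *         .getTokenList(); // ["a", "b", "c"]
     * }</pre>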
     *
     * @param ignored
     *            the ignored character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
    }

    // Trimmer
    // -----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
     * value is not to trim anything.
     *
     * @return The trimmer matcher in use
     */
    public StringMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
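     * <p>
     * A sketch trimming whitespace around each token:
     *
     * <pre>{@code
     * new StringTokenizer(" a , b ", ',')
     *         .setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher())
     *         .getTokenList(); // ["a", "b"]
     * }</pre>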
     *
     * @param trimmer
     *            the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null. The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
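     * <p>
     * A sketch combining this with {@link #setIgnoreEmptyTokens(boolean)}:
     *
     * <pre>{@code
     * new StringTokenizer("a,,b", ',')
     *         .setIgnoreEmptyTokens(false)
     *         .setEmptyTokenAsNull(true)
     *         .getTokenList(); // ["a", null, "b"]
     * }</pre>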
     *
     * @param emptyAsNull
     *            whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens. The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
     *
     * @param ignoreEmptyTokens
     *            whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return The string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    // -----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list. If a {@link CloneNotSupportedException} is caught, this method returns {@code null}.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException
     *             if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StringTokenizer cloned = (StringTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    // -----------------------------------------------------------------------
    /**
     * Returns a string representation of this tokenizer, including the parsed tokens if tokenization has already been
     * done.
     *
     * @return The string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StringTokenizer[not tokenized yet]";
        }
        return "StringTokenizer" + getTokenList();
    }

}