/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.matcher.StringMatcher;
import org.apache.commons.text.matcher.StringMatcherFactory;

/**
 * Tokenizes a string based on delimiters (separators), with support for quoting and ignored characters.
 * <p>
 * This class can split a String into many smaller strings. It aims to do a similar job to
 * {@link java.util.StringTokenizer StringTokenizer}, but it offers much more control and flexibility, including
 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
 * <p>
 * The input String is split into a number of <i>tokens</i>. Each token is separated from the next by a
 * <i>delimiter</i>. One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes. The <i>quote</i> matcher specifies the quote character(s). A quote may be
 * escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming. The <i>trimmer</i> matcher
 * specifies these characters. One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters. The <i>ignored</i> matcher specifies
 * these characters to be removed. One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
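 * <p>
 * As a rough usage sketch (the input values and expected results below are purely illustrative), a tokenizer is
 * typically configured and then iterated or drained into a list:
 * <pre>{@code
 * // split on ';', treat '"' as the quote character
 * StringTokenizer tokenizer = new StringTokenizer("a;\"b;c\";d", ';', '"');
 * List<String> tokens = tokenizer.getTokenList();   // ["a", "b;c", "d"]
 *
 * // or use the preconfigured CSV flavour and supply the text later
 * StringTokenizer csv = StringTokenizer.getCSVInstance();
 * csv.reset("x, y, z");                             // ["x", "y", "z"] after trimming
 * }</pre>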
 *
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 * <caption>StringTokenizer properties and options</caption>
 * <tr>
 * <th>Property</th>
 * <th>Type</th>
 * <th>Default</th>
 * </tr>
 * <tr>
 * <td>delim</td>
 * <td>CharSetMatcher</td>
 * <td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 * <td>quote</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>ignore</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>emptyTokenAsNull</td>
 * <td>boolean</td>
 * <td>false</td>
 * </tr>
 * <tr>
 * <td>ignoreEmptyTokens</td>
 * <td>boolean</td>
 * <td>true</td>
 * </tr>
 * </table>
 *
 * @since 1.3
 */
public class StringTokenizer implements ListIterator<String>, Cloneable {

    /** Comma separated values tokenizer internal variable. */
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Tab separated values tokenizer internal variable. */
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
    /** The quote matcher. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
    /** The ignored matcher. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
    /** The trimmer matcher. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    // -----------------------------------------------------------------------

    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getCSVClone() {
        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings. The default for CSV processing is to
     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
     * The default for CSV processing is to trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final String input) {
        final StringTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
     * The default for CSV processing is to trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final char[] input) {
        final StringTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getTSVClone() {
        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing is to
     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing is to
     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
     *
     * @param input
     *            the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final String input) {
        final StringTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing is to
     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
     *
     * @param input
     *            the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final char[] input) {
        final StringTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    // -----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed as per StringTokenizer,
     * but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StringTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed as per StringTokenizer.
     *
     * @param input
     *            the string which is to be parsed
     */
    public StringTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline, carriage return and form feed as per StringTokenizer.
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     */
    public StringTokenizer(final char[] input) {
        super();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
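     * <p>
     * A minimal, hypothetical example (the array content and delimiter below are illustrative only):
     * <pre>{@code
     * char[] data = "a:b:c".toCharArray();
     * StringTokenizer tok = new StringTokenizer(data, ':');
     * // tok.getTokenList() would yield ["a", "b", "c"]
     * }</pre>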
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    // -----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return The number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
     * {@link NoSuchElementException} when no tokens remain.
     *
     * @return The next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return The previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return The tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
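     * <p>
     * For example (illustrative values only), draining a CSV-style tokenizer might look like:
     * <pre>{@code
     * List<String> parts = StringTokenizer.getCSVInstance("a, b, c").getTokenList();
     * // parts is an independent, modifiable list: ["a", "b", "c"]
     * }</pre>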
     *
     * @return The tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        Collections.addAll(list, tokens);

        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StringTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
     *
     * @param input
     *            the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
     *
     * @param input
     *            the new character array to tokenize (the array is cloned), null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input.clone();
        } else {
            this.chars = null;
        }
        return this;
    }

    // ListIterator
    // -----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return The next String token
     * @throws NoSuchElementException
     *             if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return The next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return The previous token
     * @throws NoSuchElementException
     *             if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return The previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    // -----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not, performs it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method will be called automatically by other
     * (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
     * strings. It is also possible to filter the results.
     * <p>
     * {@code StringTokenizer} will always pass a zero offset and a count equal to the length of the array to this
     * method, however a subclass may pass other values, or even an entirely different array.
     *
     * @param srcChars
     *            the character array being tokenized, may be null
     * @param offset
     *            the start position within the character array, must be valid
     * @param count
     *            the number of characters to tokenize, must be valid
     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final TextStringBuilder buf = new TextStringBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list
     *            the list to add to
     * @param tok
     *            the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of the field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
     *         string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of the field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if the
     *         end of the string is found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
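                // For example, with ',' as delimiter and '"' as quote, the input a,"b""c",d
                // would yield the middle token b"c: the doubled quote collapses to a single one.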
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote already matched in readNextToken().
     *
     * @param srcChars
     *            the character array being tokenized
     * @param pos
     *            the position to check for a quote
     * @param len
     *            the length of the character array being tokenized
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    // -----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return The delimiter matcher in use
     */
    public StringMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim
     *            the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
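     * <p>
     * Setters return {@code this}, so configuration can be chained; a hypothetical sketch (values illustrative only):
     * <pre>{@code
     * StringTokenizer tok = new StringTokenizer("x|'a|b'|y")
     *         .setDelimiterChar('|')
     *         .setQuoteChar('\'');
     * // tokens: ["x", "a|b", "y"]
     * }</pre>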
     *
     * @param delim
     *            the delimiter character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim
     *            the delimiter string to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
    }

    // Quote
    // -----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
     * default is not to use quoting.
     *
     * @return The quote matcher in use
     */
    public StringMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     *
     * @param quote
     *            the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     *
     * @param quote
     *            the quote character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
    }

    // Ignored
    // -----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
     * is not to ignore anything.
     *
     * @return The ignored matcher in use
     */
    public StringMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region.
     *
     * @param ignored
     *            the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is within a quoted region.
     *
     * @param ignored
     *            the ignored character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
    }

    // Trimmer
    // -----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return The trimmer matcher in use
     */
    public StringMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
     *
     * @param trimmer
     *            the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null. The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
     *
     * @param emptyAsNull
     *            whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens. The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
     *
     * @param ignoreEmptyTokens
     *            whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return The string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    // -----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list. If a {@link CloneNotSupportedException} is caught, {@code null} is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException
     *             if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StringTokenizer cloned = (StringTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets a String representation of this tokenizer, including the list of parsed tokens if tokenization has
     * already been performed.
     *
     * @return The string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StringTokenizer[not tokenized yet]";
        }
        return "StringTokenizer" + getTokenList();
    }

}