001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.validator.routines; 018 019import java.io.Serializable; 020import java.net.URI; 021import java.net.URISyntaxException; 022import java.util.Collections; 023import java.util.HashSet; 024import java.util.Locale; 025import java.util.Set; 026import java.util.regex.Matcher; 027import java.util.regex.Pattern; 028 029/** 030 * <p><b>URL Validation</b> routines.</p> 031 * Behavior of validation is modified by passing in options: 032 * <ul> 033 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path 034 * component.</li> 035 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is 036 * included then fragments are flagged as illegal.</li> 037 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are 038 * considered valid schemes. Enabling this option will let any scheme pass validation.</li> 039 * </ul> 040 * 041 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02, 042 * http://javascript.internet.com. However, this validation now bears little resemblance 043 * to the php original.</p> 044 * <pre> 045 * Example of usage: 046 * Construct a UrlValidator with valid schemes of "http", and "https". 047 * 048 * String[] schemes = {"http","https"}. 049 * UrlValidator urlValidator = new UrlValidator(schemes); 050 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 051 * System.out.println("url is valid"); 052 * } else { 053 * System.out.println("url is invalid"); 054 * } 055 * 056 * prints "url is invalid" 057 * If instead the default constructor is used. 058 * 059 * UrlValidator urlValidator = new UrlValidator(); 060 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 061 * System.out.println("url is valid"); 062 * } else { 063 * System.out.println("url is invalid"); 064 * } 065 * 066 * prints out "url is valid" 067 * </pre> 068 * 069 * @see 070 * <a href="http://www.ietf.org/rfc/rfc2396.txt"> 071 * Uniform Resource Identifiers (URI): Generic Syntax 072 * </a> 073 * 074 * @version $Revision$ 075 * @since Validator 1.4 076 */ 077public class UrlValidator implements Serializable { 078 079 private static final long serialVersionUID = 7557161713937335013L; 080 081 private static final int MAX_UNSIGNED_16_BIT_INT = 0xFFFF; // port max 082 083 /** 084 * Allows all validly formatted schemes to pass validation instead of 085 * supplying a set of valid schemes. 086 */ 087 public static final long ALLOW_ALL_SCHEMES = 1 << 0; 088 089 /** 090 * Allow two slashes in the path component of the URL. 091 */ 092 public static final long ALLOW_2_SLASHES = 1 << 1; 093 094 /** 095 * Enabling this options disallows any URL fragments. 096 */ 097 public static final long NO_FRAGMENTS = 1 << 2; 098 099 /** 100 * Allow local URLs, such as http://localhost/ or http://machine/ . 101 * This enables a broad-brush check, for complex local machine name 102 * validation requirements you should create your validator with 103 * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)}) 104 */ 105 public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber 106 107 /** 108 * Protocol scheme (e.g. http, ftp, https). 109 */ 110 private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*"; 111 private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX); 112 113 // Drop numeric, and "+-." for now 114 // TODO does not allow for optional userinfo. 115 // Validation of character set is done by isValidAuthority 116 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6 117 // Allow for IPv4 mapped addresses: ::FFF:123.123.123.123 118 private static final String IPV6_REGEX = "::FFFF:(?:\\d{1,3}\\.){3}\\d{1,3}|[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix 119 120 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) 121 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 122 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 123 // We assume that password has the same valid chars as user info 124 private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]"; 125 // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching 126 private static final String USERINFO_FIELD_REGEX = 127 USERINFO_CHARS_REGEX + "+" + // At least one character for the name 128 "(?::" + USERINFO_CHARS_REGEX + "*)?@"; // colon and password may be absent 129 private static final String AUTHORITY_REGEX = 130 "(?:\\[("+IPV6_REGEX+")\\]|(?:(?:"+USERINFO_FIELD_REGEX+")?([" + AUTHORITY_CHARS_REGEX + "]*)))(?::(\\d*))?(.*)?"; 131 // 1 e.g. user:pass@ 2 3 4 132 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX); 133 134 private static final int PARSE_AUTHORITY_IPV6 = 1; 135 136 private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present 137 138 private static final int PARSE_AUTHORITY_PORT = 3; // excludes leading colon 139 140 /** 141 * Should always be empty. The code currently allows spaces. 142 */ 143 private static final int PARSE_AUTHORITY_EXTRA = 4; 144 145 private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$"; 146 private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX); 147 148 private static final String QUERY_REGEX = "^(\\S*)$"; 149 private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX); 150 151 /** 152 * Holds the set of current validation options. 153 */ 154 private final long options; 155 156 /** 157 * The set of schemes that are allowed to be in a URL. 158 */ 159 private final Set<String> allowedSchemes; // Must be lower-case 160 161 /** 162 * Regular expressions used to manually validate authorities if IANA 163 * domain name validation isn't desired. 164 */ 165 private final RegexValidator authorityValidator; 166 167 /** 168 * If no schemes are provided, default to this set. 169 */ 170 private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case 171 172 /** 173 * Singleton instance of this class with default schemes and options. 174 */ 175 private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator(); 176 177 /** 178 * Returns the singleton instance of this class with default schemes and options. 179 * @return singleton instance with default schemes and options 180 */ 181 public static UrlValidator getInstance() { 182 return DEFAULT_URL_VALIDATOR; 183 } 184 185 private final DomainValidator domainValidator; 186 187 /** 188 * Create a UrlValidator with default properties. 189 */ 190 public UrlValidator() { 191 this(null); 192 } 193 194 /** 195 * Behavior of validation is modified by passing in several strings options: 196 * @param schemes Pass in one or more url schemes to consider valid, passing in 197 * a null will default to "http,https,ftp" being valid. 198 * If a non-null schemes is specified then all valid schemes must 199 * be specified. Setting the ALLOW_ALL_SCHEMES option will 200 * ignore the contents of schemes. 201 */ 202 public UrlValidator(String[] schemes) { 203 this(schemes, 0L); 204 } 205 206 /** 207 * Initialize a UrlValidator with the given validation options. 208 * @param options The options should be set using the public constants declared in 209 * this class. To set multiple options you simply add them together. For example, 210 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 211 */ 212 public UrlValidator(long options) { 213 this(null, null, options); 214 } 215 216 /** 217 * Behavior of validation is modified by passing in options: 218 * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 219 * @param options The options should be set using the public constants declared in 220 * this class. To set multiple options you simply add them together. For example, 221 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 222 */ 223 public UrlValidator(String[] schemes, long options) { 224 this(schemes, null, options); 225 } 226 227 /** 228 * Initialize a UrlValidator with the given validation options. 229 * @param authorityValidator Regular expression validator used to validate the authority part 230 * This allows the user to override the standard set of domains. 231 * @param options Validation options. Set using the public constants of this class. 232 * To set multiple options, simply add them together: 233 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p> 234 * enables both of those options. 235 */ 236 public UrlValidator(RegexValidator authorityValidator, long options) { 237 this(null, authorityValidator, options); 238 } 239 240 /** 241 * Customizable constructor. Validation behavior is modifed by passing in options. 242 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 243 * @param authorityValidator Regular expression validator used to validate the authority part 244 * @param options Validation options. Set using the public constants of this class. 245 * To set multiple options, simply add them together: 246 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p> 247 * enables both of those options. 248 */ 249 public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options) { 250 this(schemes, authorityValidator, options, DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS, options))); 251 } 252 253 /** 254 * Customizable constructor. Validation behavior is modifed by passing in options. 255 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 256 * @param authorityValidator Regular expression validator used to validate the authority part 257 * @param options Validation options. Set using the public constants of this class. 258 * To set multiple options, simply add them together: 259 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p> 260 * enables both of those options. 261 * @param domainValidator the DomainValidator to use; must agree with ALLOW_LOCAL_URLS setting 262 * @since 1.7 263 */ 264 public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options, DomainValidator domainValidator) { 265 this.options = options; 266 if (domainValidator == null) { 267 throw new IllegalArgumentException("DomainValidator must not be null"); 268 } 269 if (domainValidator.isAllowLocal() != ((options & ALLOW_LOCAL_URLS) > 0)){ 270 throw new IllegalArgumentException("DomainValidator disagrees with ALLOW_LOCAL_URLS setting"); 271 } 272 this.domainValidator = domainValidator; 273 274 if (isOn(ALLOW_ALL_SCHEMES)) { 275 allowedSchemes = Collections.emptySet(); 276 } else { 277 if (schemes == null) { 278 schemes = DEFAULT_SCHEMES; 279 } 280 allowedSchemes = new HashSet<>(schemes.length); 281 for(int i=0; i < schemes.length; i++) { 282 allowedSchemes.add(schemes[i].toLowerCase(Locale.ENGLISH)); 283 } 284 } 285 286 this.authorityValidator = authorityValidator; 287 } 288 289 /** 290 * <p>Checks if a field has a valid url address.</p> 291 * 292 * Note that the method calls #isValidAuthority() 293 * which checks that the domain is valid. 294 * 295 * @param value The value validation is being performed on. A <code>null</code> 296 * value is considered invalid. 297 * @return true if the url is valid. 298 */ 299 public boolean isValid(String value) { 300 if (value == null) { 301 return false; 302 } 303 304 URI uri; // ensure value is a valid URI 305 try { 306 uri = new URI(value); 307 } catch (URISyntaxException e) { 308 return false; 309 } 310 // OK, perfom additional validation 311 312 String scheme = uri.getScheme(); 313 if (!isValidScheme(scheme)) { 314 return false; 315 } 316 317 String authority = uri.getRawAuthority(); 318 if ("file".equals(scheme) && (authority == null || "".equals(authority))) {// Special case - file: allows an empty authority 319 return true; // this is a local file - nothing more to do here 320 } else if ("file".equals(scheme) && authority != null && authority.contains(":")) { 321 return false; 322 } else { 323 // Validate the authority 324 if (!isValidAuthority(authority)) { 325 return false; 326 } 327 } 328 329 if (!isValidPath(uri.getRawPath())) { 330 return false; 331 } 332 333 if (!isValidQuery(uri.getRawQuery())) { 334 return false; 335 } 336 337 if (!isValidFragment(uri.getRawFragment())) { 338 return false; 339 } 340 341 return true; 342 } 343 344 /** 345 * Validate scheme. If schemes[] was initialized to a non null, 346 * then only those schemes are allowed. 347 * Otherwise the default schemes are "http", "https", "ftp". 348 * Matching is case-blind. 349 * @param scheme The scheme to validate. A <code>null</code> value is considered 350 * invalid. 351 * @return true if valid. 352 */ 353 protected boolean isValidScheme(String scheme) { 354 if (scheme == null) { 355 return false; 356 } 357 358 if (!SCHEME_PATTERN.matcher(scheme).matches()) { 359 return false; 360 } 361 362 if (isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) { 363 return false; 364 } 365 366 return true; 367 } 368 369 /** 370 * Returns true if the authority is properly formatted. An authority is the combination 371 * of hostname and port. A <code>null</code> authority value is considered invalid. 372 * Note: this implementation validates the domain unless a RegexValidator was provided. 373 * If a RegexValidator was supplied and it matches, then the authority is regarded 374 * as valid with no further checks, otherwise the method checks against the 375 * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS) 376 * @param authority Authority value to validate, alllows IDN 377 * @return true if authority (hostname and port) is valid. 378 */ 379 protected boolean isValidAuthority(String authority) { 380 if (authority == null) { 381 return false; 382 } 383 384 // check manual authority validation if specified 385 if (authorityValidator != null && authorityValidator.isValid(authority)) { 386 return true; 387 } 388 // convert to ASCII if possible 389 final String authorityASCII = DomainValidator.unicodeToASCII(authority); 390 391 Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII); 392 if (!authorityMatcher.matches()) { 393 return false; 394 } 395 396 // We have to process IPV6 separately because that is parsed in a different group 397 String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6); 398 if (ipv6 != null) { 399 InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance(); 400 if (!inetAddressValidator.isValidInet6Address(ipv6)) { 401 return false; 402 } 403 } else { 404 String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP); 405 // check if authority is hostname or IP address: 406 // try a hostname first since that's much more likely 407 if (!this.domainValidator.isValid(hostLocation)) { 408 // try an IPv4 address 409 InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance(); 410 if (!inetAddressValidator.isValidInet4Address(hostLocation)) { 411 // isn't IPv4, so the URL is invalid 412 return false; 413 } 414 } 415 String port = authorityMatcher.group(PARSE_AUTHORITY_PORT); 416 if (port != null && port.length() > 0) { 417 try { 418 int iPort = Integer.parseInt(port); 419 if (iPort < 0 || iPort > MAX_UNSIGNED_16_BIT_INT) { 420 return false; 421 } 422 } catch (NumberFormatException nfe) { 423 return false; // this can happen for big numbers 424 } 425 } 426 } 427 428 String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA); 429 if (extra != null && extra.trim().length() > 0){ 430 return false; 431 } 432 433 return true; 434 } 435 436 /** 437 * Returns true if the path is valid. A <code>null</code> value is considered invalid. 438 * @param path Path value to validate. 439 * @return true if path is valid. 440 */ 441 protected boolean isValidPath(String path) { 442 if (path == null) { 443 return false; 444 } 445 446 if (!PATH_PATTERN.matcher(path).matches()) { 447 return false; 448 } 449 450 try { 451 // Don't omit host otherwise leading path may be taken as host if it starts with // 452 URI uri = new URI(null,"localhost",path,null); 453 String norm = uri.normalize().getPath(); 454 if (norm.startsWith("/../") // Trying to go via the parent dir 455 || norm.equals("/..")) { // Trying to go to the parent dir 456 return false; 457 } 458 } catch (URISyntaxException e) { 459 return false; 460 } 461 462 int slash2Count = countToken("//", path); 463 if (isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) { 464 return false; 465 } 466 467 return true; 468 } 469 470 /** 471 * Returns true if the query is null or it's a properly formatted query string. 472 * @param query Query value to validate. 473 * @return true if query is valid. 474 */ 475 protected boolean isValidQuery(String query) { 476 if (query == null) { 477 return true; 478 } 479 480 return QUERY_PATTERN.matcher(query).matches(); 481 } 482 483 /** 484 * Returns true if the given fragment is null or fragments are allowed. 485 * @param fragment Fragment value to validate. 486 * @return true if fragment is valid. 487 */ 488 protected boolean isValidFragment(String fragment) { 489 if (fragment == null) { 490 return true; 491 } 492 493 return isOff(NO_FRAGMENTS); 494 } 495 496 /** 497 * Returns the number of times the token appears in the target. 498 * @param token Token value to be counted. 499 * @param target Target value to count tokens in. 500 * @return the number of tokens. 501 */ 502 protected int countToken(String token, String target) { 503 int tokenIndex = 0; 504 int count = 0; 505 while (tokenIndex != -1) { 506 tokenIndex = target.indexOf(token, tokenIndex); 507 if (tokenIndex > -1) { 508 tokenIndex++; 509 count++; 510 } 511 } 512 return count; 513 } 514 515 /** 516 * Tests whether the given flag is on. If the flag is not a power of 2 517 * (ie. 3) this tests whether the combination of flags is on. 518 * 519 * @param flag Flag value to check. 520 * 521 * @return whether the specified flag value is on. 522 */ 523 private boolean isOn(long flag) { 524 return (options & flag) > 0; 525 } 526 527 /** 528 * Tests whether the given flag is on. If the flag is not a power of 2 529 * (e.g. 3) this tests whether the combination of flags is on. 530 * 531 * @param flag Flag value to check. 532 * @param options what to check 533 * 534 * @return whether the specified flag value is on. 535 */ 536 private static boolean isOn(long flag, long options) { 537 return (options & flag) > 0; 538 } 539 540 /** 541 * Tests whether the given flag is off. If the flag is not a power of 2 542 * (ie. 3) this tests whether the combination of flags is off. 543 * 544 * @param flag Flag value to check. 545 * 546 * @return whether the specified flag value is off. 547 */ 548 private boolean isOff(long flag) { 549 return (options & flag) == 0; 550 } 551 552}