001package org.jsoup.safety; 002 003/* 004 Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/rgrove/sanitize/, which inspired 005 this whitelist configuration, and the initial defaults. 006 */ 007 008import org.jsoup.helper.Validate; 009import org.jsoup.nodes.Attribute; 010import org.jsoup.nodes.Attributes; 011import org.jsoup.nodes.Element; 012 013import java.util.HashMap; 014import java.util.HashSet; 015import java.util.Map; 016import java.util.Set; 017 018import static org.jsoup.internal.Normalizer.lowerCase; 019 020 021/** 022 Whitelists define what HTML (elements and attributes) to allow through the cleaner. Everything else is removed. 023 <p> 024 Start with one of the defaults: 025 </p> 026 <ul> 027 <li>{@link #none} 028 <li>{@link #simpleText} 029 <li>{@link #basic} 030 <li>{@link #basicWithImages} 031 <li>{@link #relaxed} 032 </ul> 033 <p> 034 If you need to allow more through (please be careful!), tweak a base whitelist with: 035 </p> 036 <ul> 037 <li>{@link #addTags} 038 <li>{@link #addAttributes} 039 <li>{@link #addEnforcedAttribute} 040 <li>{@link #addProtocols} 041 </ul> 042 <p> 043 You can remove any setting from an existing whitelist with: 044 </p> 045 <ul> 046 <li>{@link #removeTags} 047 <li>{@link #removeAttributes} 048 <li>{@link #removeEnforcedAttribute} 049 <li>{@link #removeProtocols} 050 </ul> 051 052 <p> 053 The cleaner and these whitelists assume that you want to clean a <code>body</code> fragment of HTML (to add user 054 supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, either wrap the 055 document HTML around the cleaned body HTML, or create a whitelist that allows <code>html</code> and <code>head</code> 056 elements as appropriate. 057 </p> 058 <p> 059 If you are going to extend a whitelist, please be very careful. Make sure you understand what attributes may lead to 060 XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See 061 http://ha.ckers.org/xss.html for some XSS attack examples. 062 </p> 063 064 @author Jonathan Hedley 065 */ 066public class Whitelist { 067 private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span] 068 private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag. 069 private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values 070 private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes 071 private boolean preserveRelativeLinks; // option to preserve relative links 072 073 /** 074 This whitelist allows only text nodes: all HTML will be stripped. 075 076 @return whitelist 077 */ 078 public static Whitelist none() { 079 return new Whitelist(); 080 } 081 082 /** 083 This whitelist allows only simple text formatting: <code>b, em, i, strong, u</code>. All other HTML (tags and 084 attributes) will be removed. 085 086 @return whitelist 087 */ 088 public static Whitelist simpleText() { 089 return new Whitelist() 090 .addTags("b", "em", "i", "strong", "u") 091 ; 092 } 093 094 /** 095 <p> 096 This whitelist allows a fuller range of text nodes: <code>a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, 097 ol, p, pre, q, small, span, strike, strong, sub, sup, u, ul</code>, and appropriate attributes. 098 </p> 099 <p> 100 Links (<code>a</code> elements) can point to <code>http, https, ftp, mailto</code>, and have an enforced 101 <code>rel=nofollow</code> attribute. 102 </p> 103 <p> 104 Does not allow images. 105 </p> 106 107 @return whitelist 108 */ 109 public static Whitelist basic() { 110 return new Whitelist() 111 .addTags( 112 "a", "b", "blockquote", "br", "cite", "code", "dd", "dl", "dt", "em", 113 "i", "li", "ol", "p", "pre", "q", "small", "span", "strike", "strong", "sub", 114 "sup", "u", "ul") 115 116 .addAttributes("a", "href") 117 .addAttributes("blockquote", "cite") 118 .addAttributes("q", "cite") 119 120 .addProtocols("a", "href", "ftp", "http", "https", "mailto") 121 .addProtocols("blockquote", "cite", "http", "https") 122 .addProtocols("cite", "cite", "http", "https") 123 124 .addEnforcedAttribute("a", "rel", "nofollow") 125 ; 126 127 } 128 129 /** 130 This whitelist allows the same text tags as {@link #basic}, and also allows <code>img</code> tags, with appropriate 131 attributes, with <code>src</code> pointing to <code>http</code> or <code>https</code>. 132 133 @return whitelist 134 */ 135 public static Whitelist basicWithImages() { 136 return basic() 137 .addTags("img") 138 .addAttributes("img", "align", "alt", "height", "src", "title", "width") 139 .addProtocols("img", "src", "http", "https") 140 ; 141 } 142 143 /** 144 This whitelist allows a full range of text and structural body HTML: <code>a, b, blockquote, br, caption, cite, 145 code, col, colgroup, dd, div, dl, dt, em, h1, h2, h3, h4, h5, h6, i, img, li, ol, p, pre, q, small, span, strike, strong, sub, 146 sup, table, tbody, td, tfoot, th, thead, tr, u, ul</code> 147 <p> 148 Links do not have an enforced <code>rel=nofollow</code> attribute, but you can add that if desired. 149 </p> 150 151 @return whitelist 152 */ 153 public static Whitelist relaxed() { 154 return new Whitelist() 155 .addTags( 156 "a", "b", "blockquote", "br", "caption", "cite", "code", "col", 157 "colgroup", "dd", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6", 158 "i", "img", "li", "ol", "p", "pre", "q", "small", "span", "strike", "strong", 159 "sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", 160 "ul") 161 162 .addAttributes("a", "href", "title") 163 .addAttributes("blockquote", "cite") 164 .addAttributes("col", "span", "width") 165 .addAttributes("colgroup", "span", "width") 166 .addAttributes("img", "align", "alt", "height", "src", "title", "width") 167 .addAttributes("ol", "start", "type") 168 .addAttributes("q", "cite") 169 .addAttributes("table", "summary", "width") 170 .addAttributes("td", "abbr", "axis", "colspan", "rowspan", "width") 171 .addAttributes( 172 "th", "abbr", "axis", "colspan", "rowspan", "scope", 173 "width") 174 .addAttributes("ul", "type") 175 176 .addProtocols("a", "href", "ftp", "http", "https", "mailto") 177 .addProtocols("blockquote", "cite", "http", "https") 178 .addProtocols("cite", "cite", "http", "https") 179 .addProtocols("img", "src", "http", "https") 180 .addProtocols("q", "cite", "http", "https") 181 ; 182 } 183 184 /** 185 Create a new, empty whitelist. Generally it will be better to start with a default prepared whitelist instead. 186 187 @see #basic() 188 @see #basicWithImages() 189 @see #simpleText() 190 @see #relaxed() 191 */ 192 public Whitelist() { 193 tagNames = new HashSet<>(); 194 attributes = new HashMap<>(); 195 enforcedAttributes = new HashMap<>(); 196 protocols = new HashMap<>(); 197 preserveRelativeLinks = false; 198 } 199 200 /** 201 Add a list of allowed elements to a whitelist. (If a tag is not allowed, it will be removed from the HTML.) 202 203 @param tags tag names to allow 204 @return this (for chaining) 205 */ 206 public Whitelist addTags(String... tags) { 207 Validate.notNull(tags); 208 209 for (String tagName : tags) { 210 Validate.notEmpty(tagName); 211 tagNames.add(TagName.valueOf(tagName)); 212 } 213 return this; 214 } 215 216 /** 217 Remove a list of allowed elements from a whitelist. (If a tag is not allowed, it will be removed from the HTML.) 218 219 @param tags tag names to disallow 220 @return this (for chaining) 221 */ 222 public Whitelist removeTags(String... tags) { 223 Validate.notNull(tags); 224 225 for(String tag: tags) { 226 Validate.notEmpty(tag); 227 TagName tagName = TagName.valueOf(tag); 228 229 if(tagNames.remove(tagName)) { // Only look in sub-maps if tag was allowed 230 attributes.remove(tagName); 231 enforcedAttributes.remove(tagName); 232 protocols.remove(tagName); 233 } 234 } 235 return this; 236 } 237 238 /** 239 Add a list of allowed attributes to a tag. (If an attribute is not allowed on an element, it will be removed.) 240 <p> 241 E.g.: <code>addAttributes("a", "href", "class")</code> allows <code>href</code> and <code>class</code> attributes 242 on <code>a</code> tags. 243 </p> 244 <p> 245 To make an attribute valid for <b>all tags</b>, use the pseudo tag <code>:all</code>, e.g. 246 <code>addAttributes(":all", "class")</code>. 247 </p> 248 249 @param tag The tag the attributes are for. The tag will be added to the allowed tag list if necessary. 250 @param attributes List of valid attributes for the tag 251 @return this (for chaining) 252 */ 253 public Whitelist addAttributes(String tag, String... attributes) { 254 Validate.notEmpty(tag); 255 Validate.notNull(attributes); 256 Validate.isTrue(attributes.length > 0, "No attribute names supplied."); 257 258 TagName tagName = TagName.valueOf(tag); 259 if (!tagNames.contains(tagName)) 260 tagNames.add(tagName); 261 Set<AttributeKey> attributeSet = new HashSet<>(); 262 for (String key : attributes) { 263 Validate.notEmpty(key); 264 attributeSet.add(AttributeKey.valueOf(key)); 265 } 266 if (this.attributes.containsKey(tagName)) { 267 Set<AttributeKey> currentSet = this.attributes.get(tagName); 268 currentSet.addAll(attributeSet); 269 } else { 270 this.attributes.put(tagName, attributeSet); 271 } 272 return this; 273 } 274 275 /** 276 Remove a list of allowed attributes from a tag. (If an attribute is not allowed on an element, it will be removed.) 277 <p> 278 E.g.: <code>removeAttributes("a", "href", "class")</code> disallows <code>href</code> and <code>class</code> 279 attributes on <code>a</code> tags. 280 </p> 281 <p> 282 To make an attribute invalid for <b>all tags</b>, use the pseudo tag <code>:all</code>, e.g. 283 <code>removeAttributes(":all", "class")</code>. 284 </p> 285 286 @param tag The tag the attributes are for. 287 @param attributes List of invalid attributes for the tag 288 @return this (for chaining) 289 */ 290 public Whitelist removeAttributes(String tag, String... attributes) { 291 Validate.notEmpty(tag); 292 Validate.notNull(attributes); 293 Validate.isTrue(attributes.length > 0, "No attribute names supplied."); 294 295 TagName tagName = TagName.valueOf(tag); 296 Set<AttributeKey> attributeSet = new HashSet<>(); 297 for (String key : attributes) { 298 Validate.notEmpty(key); 299 attributeSet.add(AttributeKey.valueOf(key)); 300 } 301 if(tagNames.contains(tagName) && this.attributes.containsKey(tagName)) { // Only look in sub-maps if tag was allowed 302 Set<AttributeKey> currentSet = this.attributes.get(tagName); 303 currentSet.removeAll(attributeSet); 304 305 if(currentSet.isEmpty()) // Remove tag from attribute map if no attributes are allowed for tag 306 this.attributes.remove(tagName); 307 } 308 if(tag.equals(":all")) // Attribute needs to be removed from all individually set tags 309 for(TagName name: this.attributes.keySet()) { 310 Set<AttributeKey> currentSet = this.attributes.get(name); 311 currentSet.removeAll(attributeSet); 312 313 if(currentSet.isEmpty()) // Remove tag from attribute map if no attributes are allowed for tag 314 this.attributes.remove(name); 315 } 316 return this; 317 } 318 319 /** 320 Add an enforced attribute to a tag. An enforced attribute will always be added to the element. If the element 321 already has the attribute set, it will be overridden with this value. 322 <p> 323 E.g.: <code>addEnforcedAttribute("a", "rel", "nofollow")</code> will make all <code>a</code> tags output as 324 <code><a href="..." rel="nofollow"></code> 325 </p> 326 327 @param tag The tag the enforced attribute is for. The tag will be added to the allowed tag list if necessary. 328 @param attribute The attribute name 329 @param value The enforced attribute value 330 @return this (for chaining) 331 */ 332 public Whitelist addEnforcedAttribute(String tag, String attribute, String value) { 333 Validate.notEmpty(tag); 334 Validate.notEmpty(attribute); 335 Validate.notEmpty(value); 336 337 TagName tagName = TagName.valueOf(tag); 338 if (!tagNames.contains(tagName)) 339 tagNames.add(tagName); 340 AttributeKey attrKey = AttributeKey.valueOf(attribute); 341 AttributeValue attrVal = AttributeValue.valueOf(value); 342 343 if (enforcedAttributes.containsKey(tagName)) { 344 enforcedAttributes.get(tagName).put(attrKey, attrVal); 345 } else { 346 Map<AttributeKey, AttributeValue> attrMap = new HashMap<>(); 347 attrMap.put(attrKey, attrVal); 348 enforcedAttributes.put(tagName, attrMap); 349 } 350 return this; 351 } 352 353 /** 354 Remove a previously configured enforced attribute from a tag. 355 356 @param tag The tag the enforced attribute is for. 357 @param attribute The attribute name 358 @return this (for chaining) 359 */ 360 public Whitelist removeEnforcedAttribute(String tag, String attribute) { 361 Validate.notEmpty(tag); 362 Validate.notEmpty(attribute); 363 364 TagName tagName = TagName.valueOf(tag); 365 if(tagNames.contains(tagName) && enforcedAttributes.containsKey(tagName)) { 366 AttributeKey attrKey = AttributeKey.valueOf(attribute); 367 Map<AttributeKey, AttributeValue> attrMap = enforcedAttributes.get(tagName); 368 attrMap.remove(attrKey); 369 370 if(attrMap.isEmpty()) // Remove tag from enforced attribute map if no enforced attributes are present 371 enforcedAttributes.remove(tagName); 372 } 373 return this; 374 } 375 376 /** 377 * Configure this Whitelist to preserve relative links in an element's URL attribute, or convert them to absolute 378 * links. By default, this is <b>false</b>: URLs will be made absolute (e.g. start with an allowed protocol, like 379 * e.g. {@code http://}. 380 * <p> 381 * Note that when handling relative links, the input document must have an appropriate {@code base URI} set when 382 * parsing, so that the link's protocol can be confirmed. Regardless of the setting of the {@code preserve relative 383 * links} option, the link must be resolvable against the base URI to an allowed protocol; otherwise the attribute 384 * will be removed. 385 * </p> 386 * 387 * @param preserve {@code true} to allow relative links, {@code false} (default) to deny 388 * @return this Whitelist, for chaining. 389 * @see #addProtocols 390 */ 391 public Whitelist preserveRelativeLinks(boolean preserve) { 392 preserveRelativeLinks = preserve; 393 return this; 394 } 395 396 /** 397 Add allowed URL protocols for an element's URL attribute. This restricts the possible values of the attribute to 398 URLs with the defined protocol. 399 <p> 400 E.g.: <code>addProtocols("a", "href", "ftp", "http", "https")</code> 401 </p> 402 <p> 403 To allow a link to an in-page URL anchor (i.e. <code><a href="#anchor"></code>, add a <code>#</code>:<br> 404 E.g.: <code>addProtocols("a", "href", "#")</code> 405 </p> 406 407 @param tag Tag the URL protocol is for 408 @param attribute Attribute name 409 @param protocols List of valid protocols 410 @return this, for chaining 411 */ 412 public Whitelist addProtocols(String tag, String attribute, String... protocols) { 413 Validate.notEmpty(tag); 414 Validate.notEmpty(attribute); 415 Validate.notNull(protocols); 416 417 TagName tagName = TagName.valueOf(tag); 418 AttributeKey attrKey = AttributeKey.valueOf(attribute); 419 Map<AttributeKey, Set<Protocol>> attrMap; 420 Set<Protocol> protSet; 421 422 if (this.protocols.containsKey(tagName)) { 423 attrMap = this.protocols.get(tagName); 424 } else { 425 attrMap = new HashMap<>(); 426 this.protocols.put(tagName, attrMap); 427 } 428 if (attrMap.containsKey(attrKey)) { 429 protSet = attrMap.get(attrKey); 430 } else { 431 protSet = new HashSet<>(); 432 attrMap.put(attrKey, protSet); 433 } 434 for (String protocol : protocols) { 435 Validate.notEmpty(protocol); 436 Protocol prot = Protocol.valueOf(protocol); 437 protSet.add(prot); 438 } 439 return this; 440 } 441 442 /** 443 Remove allowed URL protocols for an element's URL attribute. If you remove all protocols for an attribute, that 444 attribute will allow any protocol. 445 <p> 446 E.g.: <code>removeProtocols("a", "href", "ftp")</code> 447 </p> 448 449 @param tag Tag the URL protocol is for 450 @param attribute Attribute name 451 @param removeProtocols List of invalid protocols 452 @return this, for chaining 453 */ 454 public Whitelist removeProtocols(String tag, String attribute, String... removeProtocols) { 455 Validate.notEmpty(tag); 456 Validate.notEmpty(attribute); 457 Validate.notNull(removeProtocols); 458 459 TagName tagName = TagName.valueOf(tag); 460 AttributeKey attr = AttributeKey.valueOf(attribute); 461 462 // make sure that what we're removing actually exists; otherwise can open the tag to any data and that can 463 // be surprising 464 Validate.isTrue(protocols.containsKey(tagName), "Cannot remove a protocol that is not set."); 465 Map<AttributeKey, Set<Protocol>> tagProtocols = protocols.get(tagName); 466 Validate.isTrue(tagProtocols.containsKey(attr), "Cannot remove a protocol that is not set."); 467 468 Set<Protocol> attrProtocols = tagProtocols.get(attr); 469 for (String protocol : removeProtocols) { 470 Validate.notEmpty(protocol); 471 attrProtocols.remove(Protocol.valueOf(protocol)); 472 } 473 474 if (attrProtocols.isEmpty()) { // Remove protocol set if empty 475 tagProtocols.remove(attr); 476 if (tagProtocols.isEmpty()) // Remove entry for tag if empty 477 protocols.remove(tagName); 478 } 479 return this; 480 } 481 482 /** 483 * Test if the supplied tag is allowed by this whitelist 484 * @param tag test tag 485 * @return true if allowed 486 */ 487 protected boolean isSafeTag(String tag) { 488 return tagNames.contains(TagName.valueOf(tag)); 489 } 490 491 /** 492 * Test if the supplied attribute is allowed by this whitelist for this tag 493 * @param tagName tag to consider allowing the attribute in 494 * @param el element under test, to confirm protocol 495 * @param attr attribute under test 496 * @return true if allowed 497 */ 498 protected boolean isSafeAttribute(String tagName, Element el, Attribute attr) { 499 TagName tag = TagName.valueOf(tagName); 500 AttributeKey key = AttributeKey.valueOf(attr.getKey()); 501 502 Set<AttributeKey> okSet = attributes.get(tag); 503 if (okSet != null && okSet.contains(key)) { 504 if (protocols.containsKey(tag)) { 505 Map<AttributeKey, Set<Protocol>> attrProts = protocols.get(tag); 506 // ok if not defined protocol; otherwise test 507 return !attrProts.containsKey(key) || testValidProtocol(el, attr, attrProts.get(key)); 508 } else { // attribute found, no protocols defined, so OK 509 return true; 510 } 511 } 512 // might be an enforced attribute? 513 Map<AttributeKey, AttributeValue> enforcedSet = enforcedAttributes.get(tag); 514 if (enforcedSet != null) { 515 Attributes expect = getEnforcedAttributes(tagName); 516 String attrKey = attr.getKey(); 517 if (expect.hasKeyIgnoreCase(attrKey)) { 518 return expect.getIgnoreCase(attrKey).equals(attr.getValue()); 519 } 520 } 521 // no attributes defined for tag, try :all tag 522 return !tagName.equals(":all") && isSafeAttribute(":all", el, attr); 523 } 524 525 private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) { 526 // try to resolve relative urls to abs, and optionally update the attribute so output html has abs. 527 // rels without a baseuri get removed 528 String value = el.absUrl(attr.getKey()); 529 if (value.length() == 0) 530 value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols 531 if (!preserveRelativeLinks) 532 attr.setValue(value); 533 534 for (Protocol protocol : protocols) { 535 String prot = protocol.toString(); 536 537 if (prot.equals("#")) { // allows anchor links 538 if (isValidAnchor(value)) { 539 return true; 540 } else { 541 continue; 542 } 543 } 544 545 prot += ":"; 546 547 if (lowerCase(value).startsWith(prot)) { 548 return true; 549 } 550 } 551 return false; 552 } 553 554 private boolean isValidAnchor(String value) { 555 return value.startsWith("#") && !value.matches(".*\\s.*"); 556 } 557 558 Attributes getEnforcedAttributes(String tagName) { 559 Attributes attrs = new Attributes(); 560 TagName tag = TagName.valueOf(tagName); 561 if (enforcedAttributes.containsKey(tag)) { 562 Map<AttributeKey, AttributeValue> keyVals = enforcedAttributes.get(tag); 563 for (Map.Entry<AttributeKey, AttributeValue> entry : keyVals.entrySet()) { 564 attrs.put(entry.getKey().toString(), entry.getValue().toString()); 565 } 566 } 567 return attrs; 568 } 569 570 // named types for config. All just hold strings, but here for my sanity. 571 572 static class TagName extends TypedValue { 573 TagName(String value) { 574 super(value); 575 } 576 577 static TagName valueOf(String value) { 578 return new TagName(value); 579 } 580 } 581 582 static class AttributeKey extends TypedValue { 583 AttributeKey(String value) { 584 super(value); 585 } 586 587 static AttributeKey valueOf(String value) { 588 return new AttributeKey(value); 589 } 590 } 591 592 static class AttributeValue extends TypedValue { 593 AttributeValue(String value) { 594 super(value); 595 } 596 597 static AttributeValue valueOf(String value) { 598 return new AttributeValue(value); 599 } 600 } 601 602 static class Protocol extends TypedValue { 603 Protocol(String value) { 604 super(value); 605 } 606 607 static Protocol valueOf(String value) { 608 return new Protocol(value); 609 } 610 } 611 612 abstract static class TypedValue { 613 private String value; 614 615 TypedValue(String value) { 616 Validate.notNull(value); 617 this.value = value; 618 } 619 620 @Override 621 public int hashCode() { 622 final int prime = 31; 623 int result = 1; 624 result = prime * result + ((value == null) ? 0 : value.hashCode()); 625 return result; 626 } 627 628 @Override 629 public boolean equals(Object obj) { 630 if (this == obj) return true; 631 if (obj == null) return false; 632 if (getClass() != obj.getClass()) return false; 633 TypedValue other = (TypedValue) obj; 634 if (value == null) { 635 if (other.value != null) return false; 636 } else if (!value.equals(other.value)) return false; 637 return true; 638 } 639 640 @Override 641 public String toString() { 642 return value; 643 } 644 } 645} 646