package org.jsoup.safety;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.ParseErrorList;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import java.util.List;


/**
 The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
 that you are expecting; no junk, and no cross-site scripting attacks!
 <p>
 The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain
 HTML that is allowed by the whitelist.
 </p>
 <p>
 It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
 canned white-lists only allow body contained tags.
 </p>
 <p>
 Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
 </p>
 */
public class Cleaner {
    private final Whitelist whitelist;

    /**
     Create a new cleaner, that sanitizes documents using the supplied whitelist.
     @param whitelist white-list to clean with; must not be null
     */
    public Cleaner(Whitelist whitelist) {
        Validate.notNull(whitelist);
        this.whitelist = whitelist;
    }

    /**
     Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist.
     The original document is not modified. Only elements from the dirty document's <code>body</code> are used.
     @param dirtyDocument Untrusted base document to clean; must not be null.
     @return cleaned document.
     */
    public Document clean(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);

        Document clean = Document.createShell(dirtyDocument.baseUri());
        Element dirtyBody = dirtyDocument.body();
        if (dirtyBody != null) { // frameset documents won't have a body. the clean doc will have empty body.
            copySafeNodes(dirtyBody, clean.body());
        }

        return clean;
    }

    /**
     Determines if the input document <b>body</b> is valid, against the whitelist. It is considered valid if all the tags
     and attributes in the input HTML are allowed by the whitelist, and that there is no content in the <code>head</code>.
     <p>
     This method can be used as a validator for user input. An invalid document will still be cleaned successfully
     using the {@link #clean(Document)} method. If using as a validator, it is recommended to still clean the document
     to ensure enforced attributes are set correctly, and that the output is tidied.
     </p>
     <p>
     A document without a <code>body</code> contributes no discarded nodes (mirroring {@link #clean(Document)}), and a
     document without a <code>head</code> is treated as having no head content.
     </p>
     @param dirtyDocument document to test; must not be null
     @return true if no tags or attributes need to be removed; false if they do
     */
    public boolean isValid(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);

        Document clean = Document.createShell(dirtyDocument.baseUri());

        // Same guard as clean(): a document without a body has nothing to copy, hence nothing to discard.
        Element dirtyBody = dirtyDocument.body();
        int numDiscarded = dirtyBody != null ? copySafeNodes(dirtyBody, clean.body()) : 0;

        // because we only look at the body, but we start from a shell, make sure there's nothing in the head
        Element dirtyHead = dirtyDocument.head();
        boolean headIsEmpty = dirtyHead == null || dirtyHead.childNodes().size() == 0;

        return numDiscarded == 0 && headIsEmpty;
    }

    /**
     Determines if the supplied body HTML fragment is valid, against the whitelist. The fragment is parsed in the context
     of an empty shell document's <code>body</code>, using a parse error list created with
     {@code ParseErrorList.tracking(1)}, and the parsed nodes are then run through the same whitelist copy used by
     {@link #clean(Document)}.
     <p>
     The fragment is considered valid only if no nodes or attributes were discarded by the whitelist <b>and</b> the
     parse error list is empty after parsing.
     </p>
     @param bodyHtml HTML fragment to test; must not be null
     @return true if nothing needs to be removed and no parse error was recorded; false otherwise
     */
    public boolean isValidBodyHtml(String bodyHtml) {
        Validate.notNull(bodyHtml);

        Document clean = Document.createShell("");
        Document dirty = Document.createShell("");
        ParseErrorList errorList = ParseErrorList.tracking(1);
        List<Node> nodes = Parser.parseFragment(bodyHtml, dirty.body(), "", errorList);
        dirty.body().insertChildren(0, nodes);
        int numDiscarded = copySafeNodes(dirty.body(), clean.body());
        return numDiscarded == 0 && errorList.size() == 0;
    }

    /**
     Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
     Keeps a running count of the nodes and attributes that were not copied.
     */
    private final class CleaningVisitor implements NodeVisitor {
        private int numDiscarded = 0;
        private final Element root;
        private Element destination; // current element to append nodes to

        private CleaningVisitor(Element root, Element destination) {
            this.root = root;
            this.destination = destination;
        }

        /**
         Called when a source node is first reached. Safe elements are re-created (with their safe attributes) under
         the current destination, which then becomes the new destination; text is always copied; data nodes are copied
         only when their parent's tag is safe; everything else is counted as discarded.
         */
        public void head(Node source, int depth) {
            if (source instanceof Element) {
                Element sourceEl = (Element) source;

                if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
                    ElementMeta meta = createSafeElement(sourceEl);
                    Element destChild = meta.el;
                    destination.appendChild(destChild);

                    numDiscarded += meta.numAttribsDiscarded;
                    destination = destChild; // descend: children of this element are appended to its copy
                } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
                    numDiscarded++;
                }
            } else if (source instanceof TextNode) {
                TextNode sourceText = (TextNode) source;
                TextNode destText = new TextNode(sourceText.getWholeText());
                destination.appendChild(destText);
            } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) {
                DataNode sourceData = (DataNode) source;
                DataNode destData = new DataNode(sourceData.getWholeData());
                destination.appendChild(destData);
            } else { // else, we don't care about comments, xml proc instructions, etc
                numDiscarded++;
            }
        }

        /**
         Called when leaving a source node. If the node was a safe element, head() descended into its copy, so step the
         destination back up to the parent.
         */
        public void tail(Node source, int depth) {
            if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {
                destination = destination.parent(); // would have descended, so pop destination stack
            }
        }
    }

    /**
     Traverses {@code source} with a {@link CleaningVisitor}, copying whitelisted content into {@code dest}.
     @param source root element to read from
     @param dest element that receives the safe copies
     @return the number of nodes and attributes that were discarded
     */
    private int copySafeNodes(Element source, Element dest) {
        CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
        NodeTraversor.traverse(cleaningVisitor, source);
        return cleaningVisitor.numDiscarded;
    }

    /**
     Builds a new element with the same tag name and base URI as {@code sourceEl}, carrying only the attributes the
     whitelist reports as safe. The whitelist's enforced attributes for the tag are added after the copied ones.
     @param sourceEl element to copy from
     @return the new element, together with the count of source attributes that were not copied
     */
    private ElementMeta createSafeElement(Element sourceEl) {
        String sourceTag = sourceEl.tagName();
        Attributes destAttrs = new Attributes();
        Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
        int numDiscarded = 0;

        Attributes sourceAttrs = sourceEl.attributes();
        for (Attribute sourceAttr : sourceAttrs) {
            if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) {
                destAttrs.put(sourceAttr);
            } else {
                numDiscarded++;
            }
        }
        Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
        destAttrs.addAll(enforcedAttrs);

        return new ElementMeta(dest, numDiscarded);
    }

    /**
     Simple holder pairing a newly created safe element with the number of attributes dropped while creating it.
     */
    private static class ElementMeta {
        final Element el;
        final int numAttribsDiscarded;

        ElementMeta(Element el, int numAttribsDiscarded) {
            this.el = el;
            this.numAttribsDiscarded = numAttribsDiscarded;
        }
    }

}