package org.jsoup.parser;

import org.jsoup.helper.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Entities;

import java.util.Arrays;

/**
 * Reads the input stream into tokens.
 * <p>
 * The tokeniser repeatedly asks the current {@link TokeniserState} to read from the input until a token has been
 * emitted. Character data emitted between two non-character tokens is buffered and handed out as one character token.
 * </p>
 */
final class Tokeniser {
    static final char replacementChar = '\uFFFD'; // replaces null character
    // characters that, when found directly after an ampersand, mean there is no character reference to consume
    private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'};

    static {
        // sorted once up front, as the array is handed to reader.matchesAnySorted(...)
        Arrays.sort(notCharRefCharsSorted);
    }

    private final CharacterReader reader; // html input
    private final ParseErrorList errors; // errors found while tokenising

    private TokeniserState state = TokeniserState.Data; // current tokenisation state
    private Token emitPending; // the token we are about to emit on next read
    private boolean isEmitPending = false;
    private String charsString = null; // characters pending an emit. Will fall to charsBuilder if more than one
    private final StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
    StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>

    Token.Tag tagPending; // tag we are building up
    Token.StartTag startPending = new Token.StartTag();
    Token.EndTag endPending = new Token.EndTag();
    Token.Character charPending = new Token.Character();
    Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
    Token.Comment commentPending = new Token.Comment(); // comment building up
    private String lastStartTag; // the last start tag emitted, to test appropriate end tag

    // holders reused between calls to consumeCharacterReference, to not have to keep creating arrays
    private final int[] codepointHolder = new int[1];
    private final int[] multipointHolder = new int[2];

    /**
     * Creates a tokeniser over the supplied input.
     *
     * @param reader the html input to tokenise
     * @param errors the list that parse errors found while tokenising are added to
     */
    Tokeniser(CharacterReader reader, ParseErrorList errors) {
        this.reader = reader;
        this.errors = errors;
    }

    /**
     * Reads the next token. Drives the current state until a non-character token has been emitted, then returns any
     * buffered character data first; the non-character token itself is returned by a following call.
     *
     * @return the next token. Character data is returned via the shared {@link #charPending} instance.
     */
    Token read() {
        while (!isEmitPending) {
            state.read(this, reader);
        }

        // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read:
        if (charsBuilder.length() > 0) {
            String str = charsBuilder.toString();
            charsBuilder.setLength(0);
            charsString = null;
            return charPending.data(str);
        } else if (charsString != null) {
            Token token = charPending.data(charsString);
            charsString = null;
            return token;
        } else {
            isEmitPending = false;
            return emitPending;
        }
    }

    /**
     * Queues a (non-character) token to be returned by {@link #read()}. Start tags are remembered as the last start
     * tag; end tags that carry attributes are reported as a parse error.
     *
     * @param token the token to emit; fails validation if a previously emitted token has not been read yet
     */
    void emit(Token token) {
        Validate.isFalse(isEmitPending, "There is an unread token pending!");

        emitPending = token;
        isEmitPending = true;

        if (token.type == Token.TokenType.StartTag) {
            Token.StartTag startTag = (Token.StartTag) token;
            lastStartTag = startTag.tagName;
        } else if (token.type == Token.TokenType.EndTag) {
            Token.EndTag endTag = (Token.EndTag) token;
            if (endTag.attributes != null) {
                error("Attributes incorrectly present on end tag");
            }
        }
    }

    /**
     * Buffers character data for emission as a single character token.
     *
     * @param str the character data to buffer
     */
    void emit(final String str) {
        // buffer strings up until last string token found, to emit only one token for a run of character refs etc.
        // does not set isEmitPending; read checks that
        if (charsString == null) {
            charsString = str;
        } else {
            if (charsBuilder.length() == 0) { // switching to string builder as more than one emit before read
                charsBuilder.append(charsString);
            }
            charsBuilder.append(str);
        }
    }

    /**
     * Buffers the characters for emission; see {@link #emit(String)}.
     *
     * @param chars the characters to buffer
     */
    void emit(char[] chars) {
        emit(String.valueOf(chars));
    }

    /**
     * Buffers the codepoints, converted to a string, for emission; see {@link #emit(String)}.
     *
     * @param codepoints the Unicode codepoints to buffer
     */
    void emit(int[] codepoints) {
        emit(new String(codepoints, 0, codepoints.length));
    }

    /**
     * Buffers a single character for emission; see {@link #emit(String)}.
     *
     * @param c the character to buffer
     */
    void emit(char c) {
        emit(String.valueOf(c));
    }

    /**
     * @return the current tokenisation state
     */
    TokeniserState getState() {
        return state;
    }

    /**
     * Switches to the given tokenisation state without touching the reader.
     *
     * @param state the state to switch to
     */
    void transition(TokeniserState state) {
        this.state = state;
    }

    /**
     * Advances the reader, then switches to the given tokenisation state.
     *
     * @param state the state to switch to
     */
    void advanceTransition(TokeniserState state) {
        reader.advance();
        this.state = state;
    }

    /**
     * Tries to consume a character reference (numeric, like {@code #169;} or {@code #xA9;}, or named, like
     * {@code amp;}) at the reader's current position. The reader is rewound to where it started whenever
     * {@code null} is returned after input had been consumed.
     *
     * @param additionalAllowedCharacter a character that, if it is the current character, means no reference is
     *     consumed; may be {@code null}
     * @param inAttribute whether the reference is inside an attribute value; a named reference directly followed by
     *     a letter, a digit, or one of {@code = - _} is then not treated as a reference
     * @return {@code null} if no character reference was consumed; otherwise an array holding the one or two
     *     codepoints of the reference. The returned array is a holder shared between calls, so its contents must be
     *     used before the next call to this method.
     */
    int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
        if (reader.isEmpty()) {
            return null;
        }
        if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
            return null;
        }
        if (reader.matchesAnySorted(notCharRefCharsSorted)) {
            return null;
        }

        final int[] codeRef = codepointHolder;
        reader.mark();
        if (reader.matchConsume("#")) { // numbered
            boolean isHexMode = reader.matchConsumeIgnoreCase("X");
            String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
            if (numRef.length() == 0) { // didn't match anything
                characterReferenceError("numeric reference with no numerals");
                reader.rewindToMark();
                return null;
            }
            if (!reader.matchConsume(";")) {
                characterReferenceError("missing semicolon"); // missing semi
            }
            int charval = -1;
            try {
                int base = isHexMode ? 16 : 10;
                charval = Integer.parseInt(numRef, base);
            } catch (NumberFormatException ignored) {
                // not parseable as an int (e.g. too many numerals): charval stays at -1, which is reported as out of
                // range and replaced just below
            }
            if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
                // unparseable, a surrogate, or beyond the last Unicode codepoint
                characterReferenceError("character outside of valid range");
                codeRef[0] = replacementChar;
                return codeRef;
            } else {
                // todo: implement number replacement table
                // todo: check for extra illegal unicode points as parse errors
                codeRef[0] = charval;
                return codeRef;
            }
        } else { // named
            // get as many letters as possible, and look for matching entities.
            String nameRef = reader.consumeLetterThenDigitSequence();
            boolean looksLegit = reader.matches(';');
            // found if a base named entity without a ;, or an extended entity with the ;.
            boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));

            if (!found) {
                reader.rewindToMark();
                if (looksLegit) { // named with semicolon
                    characterReferenceError(String.format("invalid named reference '%s'", nameRef));
                }
                return null;
            }
            if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
                // don't want that to match
                reader.rewindToMark();
                return null;
            }
            if (!reader.matchConsume(";")) {
                characterReferenceError("missing semicolon"); // missing semi
            }
            int numChars = Entities.codepointsForName(nameRef, multipointHolder);
            if (numChars == 1) {
                codeRef[0] = multipointHolder[0];
                return codeRef;
            } else if (numChars == 2) {
                return multipointHolder;
            } else {
                // NOTE(review): Validate.fail presumably throws, making the return below unreachable - confirm
                Validate.fail("Unexpected characters returned for " + nameRef);
                return multipointHolder;
            }
        }
    }

    /**
     * Resets the reusable start or end tag token and makes it the pending tag.
     *
     * @param start {@code true} to begin a start tag, {@code false} to begin an end tag
     * @return the tag token now pending
     */
    Token.Tag createTagPending(boolean start) {
        tagPending = start ? startPending.reset() : endPending.reset();
        return tagPending;
    }

    /**
     * Finalises the pending tag and emits it.
     */
    void emitTagPending() {
        tagPending.finaliseTag();
        emit(tagPending);
    }

    /**
     * Resets the reusable comment token, ready to build up a new comment.
     */
    void createCommentPending() {
        commentPending.reset();
    }

    /**
     * Emits the pending comment token.
     */
    void emitCommentPending() {
        emit(commentPending);
    }

    /**
     * Resets the reusable doctype token, ready to build up a new doctype.
     */
    void createDoctypePending() {
        doctypePending.reset();
    }

    /**
     * Emits the pending doctype token.
     */
    void emitDoctypePending() {
        emit(doctypePending);
    }

    /**
     * Resets {@link #dataBuffer} for reuse as the temporary buffer.
     */
    void createTempBuffer() {
        Token.reset(dataBuffer);
    }

    /**
     * @return {@code true} if a start tag has been emitted and the pending tag's name equals the last emitted start
     *     tag's name, ignoring case
     */
    boolean isAppropriateEndTagToken() {
        return lastStartTag != null && tagPending.name().equalsIgnoreCase(lastStartTag);
    }

    /**
     * @return the name of the last start tag emitted, or {@code null} if none has been emitted yet
     */
    String appropriateEndTagName() {
        return lastStartTag; // could be null
    }

    /**
     * Records an "unexpected character" parse error at the reader's current position, if the error list can still
     * accept errors.
     *
     * @param state the state the tokeniser was in when the unexpected character was found
     */
    void error(TokeniserState state) {
        if (errors.canAddError()) {
            errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state));
        }
    }

    /**
     * Records an "unexpected end of file" parse error at the reader's current position, if the error list can still
     * accept errors.
     *
     * @param state the state the tokeniser was in when the end of the input was reached
     */
    void eofError(TokeniserState state) {
        if (errors.canAddError()) {
            errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state));
        }
    }

    /**
     * Records an "invalid character reference" parse error at the reader's current position, if the error list can
     * still accept errors.
     *
     * @param message detail on what was wrong with the reference
     */
    private void characterReferenceError(String message) {
        if (errors.canAddError()) {
            errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message));
        }
    }

    /**
     * Records a parse error with the given message at the reader's current position, if the error list can still
     * accept errors.
     *
     * @param errorMsg the error message
     */
    void error(String errorMsg) {
        if (errors.canAddError()) {
            errors.add(new ParseError(reader.pos(), errorMsg));
        }
    }

    /**
     * @return always {@code true} for now, as namespaces are not implemented
     */
    boolean currentNodeInHtmlNS() {
        // todo: implement namespaces correctly
        return true;
        // Element currentNode = currentNode();
        // return currentNode != null && currentNode.namespace().equals("HTML");
    }

    /**
     * Utility method to consume reader and unescape entities found within.
     * @param inAttribute if the text to be unescaped is in an attribute
     * @return unescaped string from reader
     */
    String unescapeEntities(boolean inAttribute) {
        StringBuilder builder = StringUtil.stringBuilder();
        while (!reader.isEmpty()) {
            builder.append(reader.consumeTo('&'));
            if (reader.matches('&')) {
                reader.consume();
                int[] c = consumeCharacterReference(null, inAttribute);
                if (c == null || c.length == 0) {
                    // not a character reference: keep the ampersand as literal text
                    builder.append('&');
                } else {
                    builder.appendCodePoint(c[0]);
                    if (c.length == 2) {
                        builder.appendCodePoint(c[1]);
                    }
                }
            }
        }
        return builder.toString();
    }
}