Source code

001package org.jsoup.parser;
002
003import org.jsoup.helper.StringUtil;
004import org.jsoup.helper.Validate;
005import org.jsoup.nodes.Entities;
006
007import java.util.Arrays;
008
009/**
010 * Readers the input stream into tokens.
011 */
012final class Tokeniser {
013    static final char replacementChar = '\uFFFD'; // replaces null character
014    private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'};
015
016    static {
017        Arrays.sort(notCharRefCharsSorted);
018    }
019
020    private final CharacterReader reader; // html input
021    private final ParseErrorList errors; // errors found while tokenising
022
023    private TokeniserState state = TokeniserState.Data; // current tokenisation state
024    private Token emitPending; // the token we are about to emit on next read
025    private boolean isEmitPending = false;
026    private String charsString = null; // characters pending an emit. Will fall to charsBuilder if more than one
027    private StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
028    StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>
029
030    Token.Tag tagPending; // tag we are building up
031    Token.StartTag startPending = new Token.StartTag();
032    Token.EndTag endPending = new Token.EndTag();
033    Token.Character charPending = new Token.Character();
034    Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
035    Token.Comment commentPending = new Token.Comment(); // comment building up
036    private String lastStartTag; // the last start tag emitted, to test appropriate end tag
037
038    Tokeniser(CharacterReader reader, ParseErrorList errors) {
039        this.reader = reader;
040        this.errors = errors;
041    }
042
043    Token read() {
044        while (!isEmitPending)
045            state.read(this, reader);
046
047        // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read:
048        if (charsBuilder.length() > 0) {
049            String str = charsBuilder.toString();
050            charsBuilder.delete(0, charsBuilder.length());
051            charsString = null;
052            return charPending.data(str);
053        } else if (charsString != null) {
054            Token token = charPending.data(charsString);
055            charsString = null;
056            return token;
057        } else {
058            isEmitPending = false;
059            return emitPending;
060        }
061    }
062
063    void emit(Token token) {
064        Validate.isFalse(isEmitPending, "There is an unread token pending!");
065
066        emitPending = token;
067        isEmitPending = true;
068
069        if (token.type == Token.TokenType.StartTag) {
070            Token.StartTag startTag = (Token.StartTag) token;
071            lastStartTag = startTag.tagName;
072        } else if (token.type == Token.TokenType.EndTag) {
073            Token.EndTag endTag = (Token.EndTag) token;
074            if (endTag.attributes != null)
075                error("Attributes incorrectly present on end tag");
076        }
077    }
078
079    void emit(final String str) {
080        // buffer strings up until last string token found, to emit only one token for a run of character refs etc.
081        // does not set isEmitPending; read checks that
082        if (charsString == null) {
083            charsString = str;
084        }
085        else {
086            if (charsBuilder.length() == 0) { // switching to string builder as more than one emit before read
087                charsBuilder.append(charsString);
088            }
089            charsBuilder.append(str);
090        }
091    }
092
093    void emit(char[] chars) {
094        emit(String.valueOf(chars));
095    }
096
097    void emit(int[] codepoints) {
098        emit(new String(codepoints, 0, codepoints.length));
099    }
100
101    void emit(char c) {
102        emit(String.valueOf(c));
103    }
104
105    TokeniserState getState() {
106        return state;
107    }
108
109    void transition(TokeniserState state) {
110        this.state = state;
111    }
112
113    void advanceTransition(TokeniserState state) {
114        reader.advance();
115        this.state = state;
116    }
117
118    final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays
119    final private int[] multipointHolder = new int[2];
120    int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
121        if (reader.isEmpty())
122            return null;
123        if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
124            return null;
125        if (reader.matchesAnySorted(notCharRefCharsSorted))
126            return null;
127
128        final int[] codeRef = codepointHolder;
129        reader.mark();
130        if (reader.matchConsume("#")) { // numbered
131            boolean isHexMode = reader.matchConsumeIgnoreCase("X");
132            String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
133            if (numRef.length() == 0) { // didn't match anything
134                characterReferenceError("numeric reference with no numerals");
135                reader.rewindToMark();
136                return null;
137            }
138            if (!reader.matchConsume(";"))
139                characterReferenceError("missing semicolon"); // missing semi
140            int charval = -1;
141            try {
142                int base = isHexMode ? 16 : 10;
143                charval = Integer.valueOf(numRef, base);
144            } catch (NumberFormatException ignored) {
145            } // skip
146            if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
147                characterReferenceError("character outside of valid range");
148                codeRef[0] = replacementChar;
149                return codeRef;
150            } else {
151                // todo: implement number replacement table
152                // todo: check for extra illegal unicode points as parse errors
153                codeRef[0] = charval;
154                return codeRef;
155            }
156        } else { // named
157            // get as many letters as possible, and look for matching entities.
158            String nameRef = reader.consumeLetterThenDigitSequence();
159            boolean looksLegit = reader.matches(';');
160            // found if a base named entity without a ;, or an extended entity with the ;.
161            boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));
162
163            if (!found) {
164                reader.rewindToMark();
165                if (looksLegit) // named with semicolon
166                    characterReferenceError(String.format("invalid named referenece '%s'", nameRef));
167                return null;
168            }
169            if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
170                // don't want that to match
171                reader.rewindToMark();
172                return null;
173            }
174            if (!reader.matchConsume(";"))
175                characterReferenceError("missing semicolon"); // missing semi
176            int numChars = Entities.codepointsForName(nameRef, multipointHolder);
177            if (numChars == 1) {
178                codeRef[0] = multipointHolder[0];
179                return codeRef;
180            } else if (numChars ==2) {
181                return multipointHolder;
182            } else {
183                Validate.fail("Unexpected characters returned for " + nameRef);
184                return multipointHolder;
185            }
186        }
187    }
188
189    Token.Tag createTagPending(boolean start) {
190        tagPending = start ? startPending.reset() : endPending.reset();
191        return tagPending;
192    }
193
194    void emitTagPending() {
195        tagPending.finaliseTag();
196        emit(tagPending);
197    }
198
199    void createCommentPending() {
200        commentPending.reset();
201    }
202
203    void emitCommentPending() {
204        emit(commentPending);
205    }
206
207    void createDoctypePending() {
208        doctypePending.reset();
209    }
210
211    void emitDoctypePending() {
212        emit(doctypePending);
213    }
214
215    void createTempBuffer() {
216        Token.reset(dataBuffer);
217    }
218
219    boolean isAppropriateEndTagToken() {
220        return lastStartTag != null && tagPending.name().equalsIgnoreCase(lastStartTag);
221    }
222
223    String appropriateEndTagName() {
224        return lastStartTag; // could be null
225    }
226
227    void error(TokeniserState state) {
228        if (errors.canAddError())
229            errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state));
230    }
231
232    void eofError(TokeniserState state) {
233        if (errors.canAddError())
234            errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state));
235    }
236
237    private void characterReferenceError(String message) {
238        if (errors.canAddError())
239            errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message));
240    }
241
242    void error(String errorMsg) {
243        if (errors.canAddError())
244            errors.add(new ParseError(reader.pos(), errorMsg));
245    }
246
247    boolean currentNodeInHtmlNS() {
248        // todo: implement namespaces correctly
249        return true;
250        // Element currentNode = currentNode();
251        // return currentNode != null && currentNode.namespace().equals("HTML");
252    }
253
254    /**
255     * Utility method to consume reader and unescape entities found within.
256     * @param inAttribute if the text to be unescaped is in an attribute
257     * @return unescaped string from reader
258     */
259    String unescapeEntities(boolean inAttribute) {
260        StringBuilder builder = StringUtil.stringBuilder();
261        while (!reader.isEmpty()) {
262            builder.append(reader.consumeTo('&'));
263            if (reader.matches('&')) {
264                reader.consume();
265                int[] c = consumeCharacterReference(null, inAttribute);
266                if (c == null || c.length==0)
267                    builder.append('&');
268                else {
269                    builder.appendCodePoint(c[0]);
270                    if (c.length == 2)
271                        builder.appendCodePoint(c[1]);
272                }
273
274            }
275        }
276        return builder.toString();
277    }
278}