001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.nodes.Attributes; 005 006import static org.jsoup.internal.Normalizer.lowerCase; 007 008/** 009 * Parse tokens for the Tokeniser. 010 */ 011abstract class Token { 012 TokenType type; 013 014 private Token() { 015 } 016 017 String tokenType() { 018 return this.getClass().getSimpleName(); 019 } 020 021 /** 022 * Reset the data represent by this token, for reuse. Prevents the need to create transfer objects for every 023 * piece of data, which immediately get GCed. 024 */ 025 abstract Token reset(); 026 027 static void reset(StringBuilder sb) { 028 if (sb != null) { 029 sb.delete(0, sb.length()); 030 } 031 } 032 033 static final class Doctype extends Token { 034 final StringBuilder name = new StringBuilder(); 035 String pubSysKey = null; 036 final StringBuilder publicIdentifier = new StringBuilder(); 037 final StringBuilder systemIdentifier = new StringBuilder(); 038 boolean forceQuirks = false; 039 040 Doctype() { 041 type = TokenType.Doctype; 042 } 043 044 @Override 045 Token reset() { 046 reset(name); 047 pubSysKey = null; 048 reset(publicIdentifier); 049 reset(systemIdentifier); 050 forceQuirks = false; 051 return this; 052 } 053 054 String getName() { 055 return name.toString(); 056 } 057 058 String getPubSysKey() { 059 return pubSysKey; 060 } 061 062 String getPublicIdentifier() { 063 return publicIdentifier.toString(); 064 } 065 066 public String getSystemIdentifier() { 067 return systemIdentifier.toString(); 068 } 069 070 public boolean isForceQuirks() { 071 return forceQuirks; 072 } 073 } 074 075 static abstract class Tag extends Token { 076 protected String tagName; 077 protected String normalName; // lc version of tag name, for case insensitive tree build 078 private String pendingAttributeName; // attribute names are generally caught in one hop, not accumulated 079 private StringBuilder pendingAttributeValue = new StringBuilder(); // but values are accumulated, from e.g. & in hrefs 080 private String pendingAttributeValueS; // try to get attr vals in one shot, vs Builder 081 private boolean hasEmptyAttributeValue = false; // distinguish boolean attribute from empty string value 082 private boolean hasPendingAttributeValue = false; 083 boolean selfClosing = false; 084 Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used). 085 086 @Override 087 Tag reset() { 088 tagName = null; 089 normalName = null; 090 pendingAttributeName = null; 091 reset(pendingAttributeValue); 092 pendingAttributeValueS = null; 093 hasEmptyAttributeValue = false; 094 hasPendingAttributeValue = false; 095 selfClosing = false; 096 attributes = null; 097 return this; 098 } 099 100 final void newAttribute() { 101 if (attributes == null) 102 attributes = new Attributes(); 103 104 if (pendingAttributeName != null) { 105 // the tokeniser has skipped whitespace control chars, but trimming could collapse to empty for other control codes, so verify here 106 pendingAttributeName = pendingAttributeName.trim(); 107 if (pendingAttributeName.length() > 0) { 108 String value; 109 if (hasPendingAttributeValue) 110 value = pendingAttributeValue.length() > 0 ? pendingAttributeValue.toString() : pendingAttributeValueS; 111 else if (hasEmptyAttributeValue) 112 value = ""; 113 else 114 value = null; 115 attributes.put(pendingAttributeName, value); 116 } 117 } 118 pendingAttributeName = null; 119 hasEmptyAttributeValue = false; 120 hasPendingAttributeValue = false; 121 reset(pendingAttributeValue); 122 pendingAttributeValueS = null; 123 } 124 125 final void finaliseTag() { 126 // finalises for emit 127 if (pendingAttributeName != null) { 128 // todo: check if attribute name exists; if so, drop and error 129 newAttribute(); 130 } 131 } 132 133 final String name() { // preserves case, for input into Tag.valueOf (which may drop case) 134 Validate.isFalse(tagName == null || tagName.length() == 0); 135 return tagName; 136 } 137 138 final String normalName() { // loses case, used in tree building for working out where in tree it should go 139 return normalName; 140 } 141 142 final Tag name(String name) { 143 tagName = name; 144 normalName = lowerCase(name); 145 return this; 146 } 147 148 final boolean isSelfClosing() { 149 return selfClosing; 150 } 151 152 @SuppressWarnings({"TypeMayBeWeakened"}) 153 final Attributes getAttributes() { 154 return attributes; 155 } 156 157 // these appenders are rarely hit in not null state-- caused by null chars. 158 final void appendTagName(String append) { 159 tagName = tagName == null ? append : tagName.concat(append); 160 normalName = lowerCase(tagName); 161 } 162 163 final void appendTagName(char append) { 164 appendTagName(String.valueOf(append)); 165 } 166 167 final void appendAttributeName(String append) { 168 pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append); 169 } 170 171 final void appendAttributeName(char append) { 172 appendAttributeName(String.valueOf(append)); 173 } 174 175 final void appendAttributeValue(String append) { 176 ensureAttributeValue(); 177 if (pendingAttributeValue.length() == 0) { 178 pendingAttributeValueS = append; 179 } else { 180 pendingAttributeValue.append(append); 181 } 182 } 183 184 final void appendAttributeValue(char append) { 185 ensureAttributeValue(); 186 pendingAttributeValue.append(append); 187 } 188 189 final void appendAttributeValue(char[] append) { 190 ensureAttributeValue(); 191 pendingAttributeValue.append(append); 192 } 193 194 final void appendAttributeValue(int[] appendCodepoints) { 195 ensureAttributeValue(); 196 for (int codepoint : appendCodepoints) { 197 pendingAttributeValue.appendCodePoint(codepoint); 198 } 199 } 200 201 final void setEmptyAttributeValue() { 202 hasEmptyAttributeValue = true; 203 } 204 205 private void ensureAttributeValue() { 206 hasPendingAttributeValue = true; 207 // if on second hit, we'll need to move to the builder 208 if (pendingAttributeValueS != null) { 209 pendingAttributeValue.append(pendingAttributeValueS); 210 pendingAttributeValueS = null; 211 } 212 } 213 } 214 215 final static class StartTag extends Tag { 216 StartTag() { 217 super(); 218 attributes = new Attributes(); 219 type = TokenType.StartTag; 220 } 221 222 @Override 223 Tag reset() { 224 super.reset(); 225 attributes = new Attributes(); 226 // todo - would prefer these to be null, but need to check Element assertions 227 return this; 228 } 229 230 StartTag nameAttr(String name, Attributes attributes) { 231 this.tagName = name; 232 this.attributes = attributes; 233 normalName = lowerCase(tagName); 234 return this; 235 } 236 237 @Override 238 public String toString() { 239 if (attributes != null && attributes.size() > 0) 240 return "<" + name() + " " + attributes.toString() + ">"; 241 else 242 return "<" + name() + ">"; 243 } 244 } 245 246 final static class EndTag extends Tag{ 247 EndTag() { 248 super(); 249 type = TokenType.EndTag; 250 } 251 252 @Override 253 public String toString() { 254 return "</" + name() + ">"; 255 } 256 } 257 258 final static class Comment extends Token { 259 final StringBuilder data = new StringBuilder(); 260 boolean bogus = false; 261 262 @Override 263 Token reset() { 264 reset(data); 265 bogus = false; 266 return this; 267 } 268 269 Comment() { 270 type = TokenType.Comment; 271 } 272 273 String getData() { 274 return data.toString(); 275 } 276 277 @Override 278 public String toString() { 279 return "<!--" + getData() + "-->"; 280 } 281 } 282 283 final static class Character extends Token { 284 private String data; 285 286 Character() { 287 super(); 288 type = TokenType.Character; 289 } 290 291 @Override 292 Token reset() { 293 data = null; 294 return this; 295 } 296 297 Character data(String data) { 298 this.data = data; 299 return this; 300 } 301 302 String getData() { 303 return data; 304 } 305 306 @Override 307 public String toString() { 308 return getData(); 309 } 310 } 311 312 final static class EOF extends Token { 313 EOF() { 314 type = Token.TokenType.EOF; 315 } 316 317 @Override 318 Token reset() { 319 return this; 320 } 321 } 322 323 final boolean isDoctype() { 324 return type == TokenType.Doctype; 325 } 326 327 final Doctype asDoctype() { 328 return (Doctype) this; 329 } 330 331 final boolean isStartTag() { 332 return type == TokenType.StartTag; 333 } 334 335 final StartTag asStartTag() { 336 return (StartTag) this; 337 } 338 339 final boolean isEndTag() { 340 return type == TokenType.EndTag; 341 } 342 343 final EndTag asEndTag() { 344 return (EndTag) this; 345 } 346 347 final boolean isComment() { 348 return type == TokenType.Comment; 349 } 350 351 final Comment asComment() { 352 return (Comment) this; 353 } 354 355 final boolean isCharacter() { 356 return type == TokenType.Character; 357 } 358 359 final Character asCharacter() { 360 return (Character) this; 361 } 362 363 final boolean isEOF() { 364 return type == TokenType.EOF; 365 } 366 367 enum TokenType { 368 Doctype, 369 StartTag, 370 EndTag, 371 Comment, 372 Character, 373 EOF 374 } 375}