001package org.jsoup.parser; 002 003import org.jsoup.helper.StringUtil; 004import org.jsoup.helper.Validate; 005 006/** 007 * A character queue with parsing helpers. 008 * 009 * @author Jonathan Hedley 010 */ 011public class TokenQueue { 012 private String queue; 013 private int pos = 0; 014 015 private static final char ESC = '\\'; // escape char for chomp balanced. 016 017 /** 018 Create a new TokenQueue. 019 @param data string of data to back queue. 020 */ 021 public TokenQueue(String data) { 022 Validate.notNull(data); 023 queue = data; 024 } 025 026 /** 027 * Is the queue empty? 028 * @return true if no data left in queue. 029 */ 030 public boolean isEmpty() { 031 return remainingLength() == 0; 032 } 033 034 private int remainingLength() { 035 return queue.length() - pos; 036 } 037 038 /** 039 * Retrieves but does not remove the first character from the queue. 040 * @return First character, or 0 if empty. 041 */ 042 public char peek() { 043 return isEmpty() ? 0 : queue.charAt(pos); 044 } 045 046 /** 047 Add a character to the start of the queue (will be the next character retrieved). 048 @param c character to add 049 */ 050 public void addFirst(Character c) { 051 addFirst(c.toString()); 052 } 053 054 /** 055 Add a string to the start of the queue. 056 @param seq string to add. 057 */ 058 public void addFirst(String seq) { 059 // not very performant, but an edge case 060 queue = seq + queue.substring(pos); 061 pos = 0; 062 } 063 064 /** 065 * Tests if the next characters on the queue match the sequence. Case insensitive. 066 * @param seq String to check queue for. 067 * @return true if the next characters match. 068 */ 069 public boolean matches(String seq) { 070 return queue.regionMatches(true, pos, seq, 0, seq.length()); 071 } 072 073 /** 074 * Case sensitive match test. 075 * @param seq string to case sensitively check for 076 * @return true if matched, false if not 077 */ 078 public boolean matchesCS(String seq) { 079 return queue.startsWith(seq, pos); 080 } 081 082 083 /** 084 Tests if the next characters match any of the sequences. Case insensitive. 085 @param seq list of strings to case insensitively check for 086 @return true of any matched, false if none did 087 */ 088 public boolean matchesAny(String... seq) { 089 for (String s : seq) { 090 if (matches(s)) 091 return true; 092 } 093 return false; 094 } 095 096 public boolean matchesAny(char... seq) { 097 if (isEmpty()) 098 return false; 099 100 for (char c: seq) { 101 if (queue.charAt(pos) == c) 102 return true; 103 } 104 return false; 105 } 106 107 public boolean matchesStartTag() { 108 // micro opt for matching "<x" 109 return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character.isLetter(queue.charAt(pos+1))); 110 } 111 112 /** 113 * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the 114 * queue. 115 * @param seq String to search for, and if found, remove from queue. 116 * @return true if found and removed, false if not found. 117 */ 118 public boolean matchChomp(String seq) { 119 if (matches(seq)) { 120 pos += seq.length(); 121 return true; 122 } else { 123 return false; 124 } 125 } 126 127 /** 128 Tests if queue starts with a whitespace character. 129 @return if starts with whitespace 130 */ 131 public boolean matchesWhitespace() { 132 return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos)); 133 } 134 135 /** 136 Test if the queue matches a word character (letter or digit). 137 @return if matches a word character 138 */ 139 public boolean matchesWord() { 140 return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); 141 } 142 143 /** 144 * Drops the next character off the queue. 145 */ 146 public void advance() { 147 if (!isEmpty()) pos++; 148 } 149 150 /** 151 * Consume one character off queue. 152 * @return first character on queue. 153 */ 154 public char consume() { 155 return queue.charAt(pos++); 156 } 157 158 /** 159 * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will 160 * throw an illegal state exception -- but you should be running match() against that condition. 161 <p> 162 Case insensitive. 163 * @param seq sequence to remove from head of queue. 164 */ 165 public void consume(String seq) { 166 if (!matches(seq)) 167 throw new IllegalStateException("Queue did not match expected sequence"); 168 int len = seq.length(); 169 if (len > remainingLength()) 170 throw new IllegalStateException("Queue not long enough to consume sequence"); 171 172 pos += len; 173 } 174 175 /** 176 * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. 177 * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b> 178 * @return The matched data consumed from queue. 179 */ 180 public String consumeTo(String seq) { 181 int offset = queue.indexOf(seq, pos); 182 if (offset != -1) { 183 String consumed = queue.substring(pos, offset); 184 pos += consumed.length(); 185 return consumed; 186 } else { 187 return remainder(); 188 } 189 } 190 191 public String consumeToIgnoreCase(String seq) { 192 int start = pos; 193 String first = seq.substring(0, 1); 194 boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of 195 while (!isEmpty()) { 196 if (matches(seq)) 197 break; 198 199 if (canScan) { 200 int skip = queue.indexOf(first, pos) - pos; 201 if (skip == 0) // this char is the skip char, but not match, so force advance of pos 202 pos++; 203 else if (skip < 0) // no chance of finding, grab to end 204 pos = queue.length(); 205 else 206 pos += skip; 207 } 208 else 209 pos++; 210 } 211 212 return queue.substring(start, pos); 213 } 214 215 /** 216 Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. 217 @param seq any number of terminators to consume to. <b>Case insensitive.</b> 218 @return consumed string 219 */ 220 // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this 221 // is is a case sensitive time... 222 public String consumeToAny(String... seq) { 223 int start = pos; 224 while (!isEmpty() && !matchesAny(seq)) { 225 pos++; 226 } 227 228 return queue.substring(start, pos); 229 } 230 231 /** 232 * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it). 233 * <p> 234 * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go 235 * isEmpty() == true). 236 * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b> 237 * @return Data matched from queue. 238 */ 239 public String chompTo(String seq) { 240 String data = consumeTo(seq); 241 matchChomp(seq); 242 return data; 243 } 244 245 public String chompToIgnoreCase(String seq) { 246 String data = consumeToIgnoreCase(seq); // case insensitive scan 247 matchChomp(seq); 248 return data; 249 } 250 251 /** 252 * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", 253 * and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \). Those escapes will be left 254 * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for 255 * contains text strings; use unescape for that. 256 * @param open opener 257 * @param close closer 258 * @return data matched from the queue 259 */ 260 public String chompBalanced(char open, char close) { 261 int start = -1; 262 int end = -1; 263 int depth = 0; 264 char last = 0; 265 boolean inQuote = false; 266 267 do { 268 if (isEmpty()) break; 269 Character c = consume(); 270 if (last == 0 || last != ESC) { 271 if ((c.equals('\'') || c.equals('"')) && c != open) 272 inQuote = !inQuote; 273 if (inQuote) 274 continue; 275 if (c.equals(open)) { 276 depth++; 277 if (start == -1) 278 start = pos; 279 } 280 else if (c.equals(close)) 281 depth--; 282 } 283 284 if (depth > 0 && last != 0) 285 end = pos; // don't include the outer match pair in the return 286 last = c; 287 } while (depth > 0); 288 final String out = (end >= 0) ? queue.substring(start, end) : ""; 289 if (depth > 0) {// ran out of queue before seeing enough ) 290 Validate.fail("Did not find balanced marker at '" + out + "'"); 291 } 292 return out; 293 } 294 295 /** 296 * Unescape a \ escaped string. 297 * @param in backslash escaped string 298 * @return unescaped string 299 */ 300 public static String unescape(String in) { 301 StringBuilder out = StringUtil.stringBuilder(); 302 char last = 0; 303 for (char c : in.toCharArray()) { 304 if (c == ESC) { 305 if (last != 0 && last == ESC) 306 out.append(c); 307 } 308 else 309 out.append(c); 310 last = c; 311 } 312 return out.toString(); 313 } 314 315 /** 316 * Pulls the next run of whitespace characters of the queue. 317 * @return Whether consuming whitespace or not 318 */ 319 public boolean consumeWhitespace() { 320 boolean seen = false; 321 while (matchesWhitespace()) { 322 pos++; 323 seen = true; 324 } 325 return seen; 326 } 327 328 /** 329 * Retrieves the next run of word type (letter or digit) off the queue. 330 * @return String of word characters from queue, or empty string if none. 331 */ 332 public String consumeWord() { 333 int start = pos; 334 while (matchesWord()) 335 pos++; 336 return queue.substring(start, pos); 337 } 338 339 /** 340 * Consume an tag name off the queue (word or :, _, -) 341 * 342 * @return tag name 343 */ 344 public String consumeTagName() { 345 int start = pos; 346 while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) 347 pos++; 348 349 return queue.substring(start, pos); 350 } 351 352 /** 353 * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects). 354 * 355 * @return tag name 356 */ 357 public String consumeElementSelector() { 358 int start = pos; 359 while (!isEmpty() && (matchesWord() || matchesAny("*|","|", "_", "-"))) 360 pos++; 361 362 return queue.substring(start, pos); 363 } 364 365 /** 366 Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _) 367 http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier 368 @return identifier 369 */ 370 public String consumeCssIdentifier() { 371 int start = pos; 372 while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) 373 pos++; 374 375 return queue.substring(start, pos); 376 } 377 378 /** 379 Consume an attribute key off the queue (letter, digit, -, _, :") 380 @return attribute key 381 */ 382 public String consumeAttributeKey() { 383 int start = pos; 384 while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) 385 pos++; 386 387 return queue.substring(start, pos); 388 } 389 390 /** 391 Consume and return whatever is left on the queue. 392 @return remained of queue. 393 */ 394 public String remainder() { 395 final String remainder = queue.substring(pos, queue.length()); 396 pos = queue.length(); 397 return remainder; 398 } 399 400 @Override 401 public String toString() { 402 return queue.substring(pos); 403 } 404}