001/* 002 * Copyright 2006 - 2013 003 * Stefan Balev <stefan.balev@graphstream-project.org> 004 * Julien Baudry <julien.baudry@graphstream-project.org> 005 * Antoine Dutot <antoine.dutot@graphstream-project.org> 006 * Yoann Pigné <yoann.pigne@graphstream-project.org> 007 * Guilhelm Savin <guilhelm.savin@graphstream-project.org> 008 * 009 * This file is part of GraphStream <http://graphstream-project.org>. 010 * 011 * GraphStream is a library whose purpose is to handle static or dynamic 012 * graph, create them from scratch, file or any source and display them. 013 * 014 * This program is free software distributed under the terms of two licenses, the 015 * CeCILL-C license that fits European law, and the GNU Lesser General Public 016 * License. You can use, modify and/ or redistribute the software under the terms 017 * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following 018 * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by 019 * the Free Software Foundation, either version 3 of the License, or (at your 020 * option) any later version. 021 * 022 * This program is distributed in the hope that it will be useful, but WITHOUT ANY 023 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 024 * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 025 * 026 * You should have received a copy of the GNU Lesser General Public License 027 * along with this program. If not, see <http://www.gnu.org/licenses/>. 028 * 029 * The fact that you are presently reading this means that you have had 030 * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms. 031 */ 032package org.graphstream.algorithm.generator; 033 034import java.io.BufferedReader; 035import java.io.IOException; 036import java.io.InputStream; 037import java.io.InputStreamReader; 038import java.net.HttpURLConnection; 039import java.net.URI; 040import java.net.URISyntaxException; 041import java.net.URLConnection; 042import java.util.HashSet; 043import java.util.LinkedList; 044import java.util.concurrent.locks.ReentrantLock; 045import java.util.regex.Matcher; 046import java.util.regex.Pattern; 047 048import org.graphstream.graph.Edge; 049import org.graphstream.graph.Node; 050 051/** 052 * Generate a graph using the web. Some urls are given to start and the 053 * generator will extract links on these pages. Each url is a node and there is 054 * an edge between two urls when one has a link to the other. Links are 055 * extracted using the "href" attribute of html elements. 056 * 057 */ 058public class URLGenerator extends BaseGenerator { 059 060 public static enum Mode { 061 HOST, PATH, FULL 062 } 063 064 private static String REGEX = "href=\"([^\"]*)\""; 065 066 protected HashSet<String> urls; 067 protected LinkedList<String> stepUrls; 068 protected HashSet<String> newUrls; 069 protected Pattern hrefPattern; 070 protected Mode mode; 071 protected int threads = 2; 072 protected String nodeWeight = "weight"; 073 protected String edgeWeight = "weight"; 074 protected LinkedList<URLFilter> filters; 075 protected double step; 076 protected boolean printProgress; 077 protected int depthLimit; 078 protected final ReentrantLock lock; 079 080 public URLGenerator(String... startFrom) { 081 urls = new HashSet<String>(); 082 stepUrls = new LinkedList<String>(); 083 newUrls = new HashSet<String>(); 084 hrefPattern = Pattern.compile(REGEX); 085 mode = Mode.HOST; 086 filters = new LinkedList<URLFilter>(); 087 directed = false; 088 step = 0; 089 printProgress = false; 090 depthLimit = 0; 091 lock = new ReentrantLock(); 092 093 declineMatchingURL("^(javascript:|mailto:|#).*"); 094 declineMatchingURL(".*[.](avi|tar|gz|zip|mp3|mpg|jpg|jpeg|png|ogg|flv|ico|svg)$"); 095 096 setUseInternalGraph(true); 097 098 if (startFrom != null) { 099 for (int i = 0; i < startFrom.length; i++) { 100 stepUrls.add(startFrom[i]); 101 } 102 } 103 } 104 105 /* 106 * (non-Javadoc) 107 * 108 * @see org.graphstream.algorithm.generator.Generator#begin() 109 */ 110 public void begin() { 111 } 112 113 /* 114 * (non-Javadoc) 115 * 116 * @see org.graphstream.algorithm.generator.Generator#nextEvents() 117 */ 118 public boolean nextEvents() { 119 sendStepBegins(sourceId, step); 120 sendGraphAttributeChanged(sourceId, "urls.parsed", null, urls.size()); 121 sendGraphAttributeChanged(sourceId, "urls.remaining", null, 122 stepUrls.size()); 123 124 if (printProgress) 125 progress(); 126 127 for (String url : stepUrls) { 128 try { 129 addNodeURL(url); 130 } catch (URISyntaxException e) { 131 e.printStackTrace(); 132 } 133 } 134 135 urls.addAll(stepUrls); 136 newUrls.clear(); 137 138 if (threads > 1) 139 nextEventsThreaded(); 140 else { 141 for (String url : stepUrls) { 142 try { 143 parseUrl(url); 144 } catch (IOException e) { 145 System.err.printf("Failed to parse \"%s\" : %s\n", url, 146 e.getMessage()); 147 } 148 } 149 } 150 151 stepUrls.clear(); 152 stepUrls.addAll(newUrls); 153 step++; 154 155 return newUrls.size() > 0; 156 } 157 158 /** 159 * Add an url to process. 160 * 161 * @param url 162 * a new url 163 */ 164 public void addURL(String url) { 165 stepUrls.add(url); 166 } 167 168 /** 169 * Create directed edges. 170 * 171 * @param on 172 * true to create directed edges 173 */ 174 public void setDirected(boolean on) { 175 setDirectedEdges(on, false); 176 } 177 178 /** 179 * Set the attribute key used to store weight of nodes. Whenever a node is 180 * reached, its weight is increased by one. 181 * 182 * @param attribute 183 * attribute key of the weight of nodes 184 */ 185 public void setNodeWeightAttribute(String attribute) { 186 this.nodeWeight = attribute; 187 } 188 189 /** 190 * Set the attribute key used to store weight of edges. Whenever an edge is 191 * reached, its weight is increased by one. 192 * 193 * @param attribute 194 * attribute key of the weight of edges 195 */ 196 public void setEdgeWeightAttribute(String attribute) { 197 this.edgeWeight = attribute; 198 } 199 200 /** 201 * Set the way that url are converted to node id. When mode is Mode.FULL, 202 * then the id is the raw url. With Mode.PATH, the query of the url is 203 * truncated so the url http://host/path?what=xxx will be converted as 204 * http://host/path. With Mode.HOST, the url is converted to the host name 205 * so the url http://host/path will be converted as http://host. 206 * 207 * @param mode 208 * mode specifying how to convert url to have node id 209 */ 210 public void setMode(Mode mode) { 211 this.mode = mode; 212 } 213 214 /** 215 * Set the amount of threads used to parse urls. Threads are created in the 216 * {@link #nextEvents()} step. At the end of this method, all working thread 217 * have stop. 218 * 219 * @param count 220 * amount of threads 221 */ 222 public void setThreadCount(int count) { 223 this.threads = count; 224 } 225 226 /** 227 * Set the maximum steps before stop. If 0 or less, limit is disabled. 228 * 229 * @param depthLimit 230 */ 231 public void setDepthLimit(int depthLimit) { 232 this.depthLimit = depthLimit; 233 } 234 235 public void enableProgression(boolean on) { 236 printProgress = on; 237 } 238 239 /** 240 * Can be used to filter url. Url not matching this regex will be discarded. 241 * 242 * @param regex 243 */ 244 public void acceptOnlyMatchingURL(final String regex) { 245 URLFilter f = new URLFilter() { 246 public boolean accept(String url) { 247 if (url.matches(regex)) 248 return true; 249 250 return false; 251 } 252 }; 253 254 filters.add(f); 255 } 256 257 /** 258 * Can be used to filter url. Url matching this regex will be discarded. 259 * 260 * @param regex 261 */ 262 public void declineMatchingURL(final String regex) { 263 URLFilter f = new URLFilter() { 264 public boolean accept(String url) { 265 if (!url.matches(regex)) 266 return true; 267 268 return false; 269 } 270 }; 271 272 filters.add(f); 273 } 274 275 /** 276 * Can be used to filter url according to the host. Note that several calls 277 * to this method may lead to discard all url. All hosts should be gived in 278 * a single call. 279 * 280 * @param hosts 281 * list of accepted hosts 282 */ 283 public void addHostFilter(String... hosts) { 284 if (hosts != null) { 285 StringBuilder b = new StringBuilder( 286 "^(\\w+:)?(//)?([\\w-\\d]+[.])?("); 287 b.append(hosts[0]); 288 289 for (int i = 1; i < hosts.length; i++) 290 b.append("|").append(hosts[i]); 291 292 b.append(").*"); 293 294 acceptOnlyMatchingURL(b.toString()); 295 } 296 } 297 298 protected void nextEventsThreaded() { 299 int t = Math.min(threads, stepUrls.size()); 300 int byThreads = stepUrls.size() / t; 301 302 LinkedList<Worker> workers = new LinkedList<Worker>(); 303 LinkedList<Thread> workersThreads = new LinkedList<Thread>(); 304 305 for (int i = 0; i < t; i++) { 306 int start = i * byThreads; 307 int stop = (i + 1) * byThreads; 308 309 if (i == t - 1) 310 stop += stepUrls.size() % t; 311 312 Worker w = new Worker(start, stop, stepUrls); 313 Thread u = new Thread(w); 314 315 u.start(); 316 317 workers.add(w); 318 workersThreads.add(u); 319 } 320 321 for (int i = 0; i < t; i++) { 322 try { 323 workersThreads.get(i).join(); 324 } catch (InterruptedException e) { 325 e.printStackTrace(); 326 } 327 } 328 } 329 330 protected boolean isValid(String url) { 331 for (int i = 0; i < filters.size(); i++) { 332 if (!filters.get(i).accept(url)) 333 return false; 334 } 335 336 return true; 337 } 338 339 /** 340 * Parse an url and add all extracted links in a specified set. 341 * 342 * @param url 343 * the url to parse 344 * @param newUrls 345 * the set where extracted links will be added 346 * @throws IOException 347 */ 348 protected void parseUrl(String url) throws IOException { 349 URI uri; 350 URLConnection conn; 351 InputStream stream; 352 BufferedReader reader; 353 HashSet<String> localUrls = new HashSet<String>(); 354 355 if (!isValid(url)) 356 return; 357 358 try { 359 uri = new URI(url); 360 } catch (URISyntaxException e1) { 361 throw new IOException(e1); 362 } 363 364 if (uri.getHost() == null) { 365 System.err.printf("skip invalid uri : '%s'\n", url); 366 return; 367 } 368 369 if (!uri.isAbsolute()) { 370 System.err.printf("skip non-absolute uri : '%s'\n", url); 371 return; 372 } 373 374 conn = uri.toURL().openConnection(); 375 conn.setConnectTimeout(1000); 376 conn.setReadTimeout(1000); 377 conn.connect(); 378 379 if (conn.getContentType() == null 380 || !conn.getContentType().startsWith("text/html")) 381 return; 382 383 stream = conn.getInputStream(); 384 reader = new BufferedReader(new InputStreamReader(stream)); 385 386 while (reader.ready()) { 387 String line = reader.readLine(); 388 389 if (line == null) 390 continue; 391 392 Matcher m = hrefPattern.matcher(line); 393 394 while (m.find()) { 395 String href = m.group(1); 396 397 if (href == null || href.length() == 0) 398 continue; 399 400 href = href.trim(); 401 402 if (href.charAt(0) == '/') 403 href = String.format("%s://%s%s", uri.getScheme(), 404 uri.getHost(), href); 405 406 if (href.charAt(0) == '.') 407 href = String.format("%s%s", url, href); 408 409 if (!isValid(href)) 410 continue; 411 412 try { 413 if (depthLimit == 0 || step < depthLimit) { 414 synchronizedOperation(href, null); 415 synchronizedOperation(url, href); 416 } else { 417 if (urls.contains(href)) 418 synchronizedOperation(url, href); 419 } 420 } catch (URISyntaxException e) { 421 throw new IOException(e); 422 } 423 424 if (!urls.contains(href) 425 && (depthLimit == 0 || step < depthLimit)) 426 localUrls.add(href); 427 } 428 } 429 430 lock.lock(); 431 432 try { 433 newUrls.addAll(localUrls); 434 } finally { 435 lock.unlock(); 436 } 437 438 localUrls.clear(); 439 localUrls = null; 440 441 try { 442 if (conn.getDoOutput()) 443 conn.getOutputStream().close(); 444 reader.close(); 445 stream.close(); 446 } catch (IOException e) { 447 // Do not throw this exception 448 } 449 450 if (conn instanceof HttpURLConnection) 451 ((HttpURLConnection) conn).disconnect(); 452 } 453 454 protected String getNodeId(String url) throws URISyntaxException { 455 String nodeId = url; 456 URI uri = new URI(url); 457 458 switch (mode) { 459 case HOST: 460 nodeId = String.format("%s://%s", uri.getScheme(), uri.getHost()); 461 break; 462 case PATH: 463 nodeId = String.format("%s://%s%s", uri.getScheme(), uri.getHost(), 464 uri.getPath()); 465 break; 466 case FULL: 467 nodeId = String.format("%s://%s%s%s", uri.getScheme(), uri 468 .getHost(), uri.getPath(), uri.getQuery() == null ? "" 469 : uri.getQuery()); 470 break; 471 } 472 473 return nodeId; 474 } 475 476 protected String getNodeLabel(String url) throws URISyntaxException { 477 return url; 478 } 479 480 protected String getEdgeId(String nodeId1, String nodeId2) { 481 if (directed || nodeId1.compareTo(nodeId2) < 0) 482 return String.format("%s > %s", nodeId1, nodeId2); 483 484 return String.format("%s > %s", nodeId2, nodeId1); 485 } 486 487 protected synchronized void synchronizedOperation(String url1, String url2) 488 throws URISyntaxException { 489 if (url2 == null) 490 addNodeURL(url1); 491 else 492 connect(url1, url2); 493 } 494 495 protected void addNodeURL(String url) throws URISyntaxException { 496 String nodeId = getNodeId(url); 497 498 // urls.add(url); 499 500 if (internalGraph.getNode(nodeId) == null) { 501 addNode(nodeId); 502 sendNodeAttributeAdded(sourceId, nodeId, "label", getNodeLabel(url)); 503 // System.out.printf("> new url '%s' --> '%s'\n", url, nodeId); 504 } 505 506 Node n = internalGraph.getNode(nodeId); 507 double w; 508 509 if (n.hasNumber(nodeWeight)) 510 w = n.getNumber(nodeWeight); 511 else 512 w = 0; 513 514 n.setAttribute(nodeWeight, w + 1); 515 sendNodeAttributeChanged(sourceId, nodeId, nodeWeight, null, w + 1); 516 } 517 518 protected void connect(String url1, String url2) throws URISyntaxException { 519 String src, trg, eid; 520 521 src = getNodeId(url1); 522 trg = getNodeId(url2); 523 524 if (internalGraph.getNode(src) == null) 525 addNode(src); 526 527 if (internalGraph.getNode(trg) == null) 528 addNode(trg); 529 530 if (!src.equals(trg)) { 531 eid = getEdgeId(src, trg); 532 533 if (internalGraph.getEdge(eid) == null) 534 addEdge(eid, src, trg); 535 536 Edge e = internalGraph.getEdge(eid); 537 double w; 538 539 if (e.hasNumber(edgeWeight)) 540 w = e.getNumber(edgeWeight); 541 else 542 w = 0; 543 544 e.setAttribute(edgeWeight, w + 1); 545 sendEdgeAttributeChanged(sourceId, eid, edgeWeight, null, w + 1); 546 } 547 } 548 549 protected void progress() { 550 System.out.printf("\033[s\033[K%d urls parsed, %d remaining\033[u", 551 urls.size(), stepUrls.size()); 552 } 553 554 /* 555 * Private class used to distribute url parsing. 556 */ 557 private class Worker implements Runnable { 558 int start, stop; 559 LinkedList<String> urls; 560 561 public Worker(int start, int stop, LinkedList<String> urls) { 562 this.start = start; 563 this.stop = stop; 564 this.urls = urls; 565 } 566 567 public void run() { 568 for (int i = start; i < stop; i++) { 569 try { 570 parseUrl(urls.get(i)); 571 } catch (IOException e) { 572 System.err.printf("Failed to parse \"%s\" : %s\n", 573 urls.get(i), e.getMessage()); 574 } 575 } 576 } 577 } 578 579 /** 580 * Defines url filter. 581 */ 582 public static interface URLFilter { 583 /** 584 * Called by the generator to know if the specified url can be accepted 585 * by this filter. If a filter return false, then the url is discarded. 586 * 587 * @param url 588 * the url to check if it can be accepted 589 * @return true if the url is accepted 590 */ 591 boolean accept(String url); 592 } 593}